def filter_motifs(opts, h5_files):
    """
    Samples N=<N_reads> reads and looks for evidence of methylation
    in all the motifs in the query space.

    The temp directory ``opts.tmp`` is recreated from scratch, the pickled
    dictionary of control IPD values (``opts.control_pkl_name``) is loaded
    and summarised in the log, data loading is launched for every file in
    ``h5_files`` and the motif filter is run. ``opts.tmp`` is removed again
    before returning.

    Args:
        opts: Options object. This function reads ``tmp``,
            ``control_pkl_name``, ``N_reads``, ``motifs``, ``bi_motifs``
            and ``h5_type``; it is also handed to ``mbin.mbinRunner`` and
            ``FilterRunner``.
        h5_files: Sequence of input files; each one is passed to
            ``mbinRunner.launch_data_loader``.
    """
    if os.path.exists(opts.tmp):
        shutil.rmtree(opts.tmp)
    os.mkdir(opts.tmp)

    ##########################################################
    # Check which motifs are in the dictionary of control IPDs
    ##########################################################
    logging.info("")
    logging.info(
        "Assessing the motifs for which we have control IPD values...")

    # The control pickle is written in binary mode, so it is read back in
    # binary mode too, and the handle is closed deterministically.
    # NOTE: pickle.load executes arbitrary code from the file; only point
    # opts.control_pkl_name at trusted, locally generated data.
    with open(opts.control_pkl_name, "rb") as pkl_fh:
        controls_d = pickle.load(pkl_fh)
    all_m = list(controls_d.keys())

    logging.info(" -- Contiguous (e.g. CATG):")
    # Length of the part before the first "-" for every motif without an
    # "N"; computed once instead of re-scanning all motifs per length.
    contiguous_lengths = [
        len(m.split("-")[0]) for m in all_m if "N" not in m
    ]
    for motif_len in range(1, 20):
        hits = contiguous_lengths.count(motif_len)
        if hits > 0:
            logging.info(" Found %s motifs of length %s" % (hits, motif_len))

    logging.info(" -- Bipartite (e.g. ACGNNNNNCTT):")
    bi_hits = len([m for m in all_m if "N" in m])
    logging.info(" Found %s motifs" % bi_hits)
    logging.info("Found %s total motifs" % len(all_m))
    logging.info("")

    mbinRunner = mbin.mbinRunner(opts)

    ##################################################
    # Launch analysis of <N_reads> for motif filtering
    ##################################################
    for i, h5_file in enumerate(h5_files):
        logging.info(
            "Creating %s barcodes (%s motifs) from %s..."
            % (opts.N_reads, (len(opts.motifs) + len(opts.bi_motifs)),
               h5_file))
        mbinRunner.launch_data_loader(h5_file, opts.N_reads, i, opts)

    # NOTE(review): the source this block was reconstructed from had lost
    # its indentation. The steps below are placed after the loading loop
    # because they combine data "across all movies" and FilterRunner takes
    # the full h5_files list -- confirm against the upstream file.
    if opts.h5_type == "bas":
        # Combine subread data across multiple movies
        logging.info("Combining subread data across all movies...")
        mbinRunner.combine_subread_data_across_bas_movies()
        logging.info("Done.")

        # Combine movie-merged subreads to get read-level barcodes
        logging.info("Combining subreads to get read-level barcodes...")
        mbinRunner.bas_combine_subreads_for_read_level(tmp_run=True)
        logging.info("Done.")

    filter_runner = FilterRunner(opts, h5_files)
    filter_runner.run(mbinRunner)

    logging.info("Cleaning up temp files from motif filtering...")
    shutil.rmtree(opts.tmp)
def extract_controls(opts, control_aln_fn):
    """
    Build the dictionary of control IPD values and pickle it to disk.

    Motif sets are built with ``motif_tools.build_motif_sets`` and stored
    on ``opts``. The control alignments in ``control_aln_fn`` are then
    scanned, loaded and analysed through ``ControlRunner`` and
    ``mbin.mbinRunner``. The resulting per-motif control dictionary is
    written to ``opts.control_pkl_name`` and the ``opts.control_tmp``
    directory is removed.

    Args:
        opts: Options object. ``motifs`` and ``bi_motifs`` are set on it;
            ``control_tmp``, ``N_reads`` and ``control_pkl_name`` are read.
        control_aln_fn: Control (WGA) alignment file passed to
            ``ControlRunner`` and to ``launch_data_loader``.
    """
    controls = ControlRunner(control_aln_fn, opts)
    mbinRunner = mbin.mbinRunner(opts)

    # Pulling the IPD data for each motif from the WGA cmp.h5 file
    motifs, bi_motifs = motif_tools.build_motif_sets(opts)
    opts.motifs = motifs
    opts.bi_motifs = bi_motifs

    logging.info("")
    logging.info(
        "Preparing to create new control data in %s" % opts.control_tmp)
    controls.goto_control_output_dir()

    # scan_WGA_aligns() returns the options object that is used from here
    # on, so the local name is rebound to its return value.
    opts = controls.scan_WGA_aligns()

    filter_N_reads = opts.N_reads
    mbinRunner.launch_data_loader(control_aln_fn, filter_N_reads, 1, opts)

    controls.analyze_WGA_reads()
    logging.info("Done.")
    logging.info("")

    # Building dictionary of mean control IPD values for each motif
    logging.info("Building dictionary of control values for all motifs...")
    logging.info(" * Initial build requires significant time and memory.")
    controls.combine_control_data_from_contigs()
    control_means = controls.build_control_IPD_dict(motifs, bi_motifs)

    controls.return_to_orig_dir()

    logging.info("")
    logging.info("Cleaning up temp files from control data processing...")
    shutil.rmtree(opts.control_tmp)

    # Controls are loaded into control_means, now pickle them for easy
    # passing between parallel processes. The context manager guarantees
    # the file is flushed and closed before anything else reads it.
    with open(opts.control_pkl_name, "wb") as pkl_fh:
        pickle.dump(control_means, pkl_fh)
def build_profiles(opts, h5_files, motifs, motifs_fn):
    """
    Build methylation profiles (barcodes) for the given motifs.

    The temp directory ``opts.tmp`` is recreated, data loading is launched
    for every file in ``h5_files``, the subread data is combined to the
    read, contig and (optionally) bin level depending on ``opts.h5_type``,
    the output feature files are written, and ``opts.tmp`` is removed.

    Args:
        opts: Options object. ``motifs_file``, ``motifs`` and ``bi_motifs``
            are overwritten on it; ``tmp``, ``N_reads``, ``h5_type``,
            ``h5_labels``, ``procs``, ``cross_cov_bins``, ``sam`` and
            ``aligned_read_barcodes`` are read.
        h5_files: Sequence of input files; each one is passed to
            ``mbinRunner.launch_data_loader``.
        motifs: Collection of motifs to profile (stored as ``opts.motifs``).
        motifs_fn: Motifs file name (stored as ``opts.motifs_file``).

    NOTE(review): the source this block was reconstructed from had lost its
    indentation. The nesting of the "cmp", "bas", ``opts.sam`` and
    ``opts.cross_cov_bins`` sections below was inferred from the data flow
    and should be confirmed against the upstream file.
    """
    if os.path.exists(opts.tmp):
        shutil.rmtree(opts.tmp)
    os.mkdir(opts.tmp)

    opts.motifs_file = motifs_fn
    opts.motifs = motifs
    opts.bi_motifs = None

    logging.info("Building methylation profiles using %s motifs..."
                 % len(opts.motifs))

    # Sweep any plain files out of the temp directory.
    for fn in glob.glob(os.path.join(opts.tmp, "*")):
        os.remove(fn)

    mbinRunner = mbin.mbinRunner(opts)
    contig_names_fn = os.path.join(opts.tmp, mbinRunner.fns["contig_names"])

    ##################################################
    # Launch analysis of <N_reads> for motif filtering
    ##################################################
    for i, h5_file in enumerate(h5_files):
        logging.info("Creating %s barcodes (%s motifs) from %s..."
                     % (opts.N_reads, len(opts.motifs), h5_file))
        mbinRunner.launch_data_loader(h5_file, opts.N_reads, i, opts)

        if opts.h5_type == "cmp":
            logging.info(
                "Combining subread-level barcodes to get read-level barcodes from each contig..."
            )
            contig_labels_fns = glob.glob(
                os.path.join(opts.tmp, "*_labels.tmp"))
            # A real list (not a lazy map object) so len() works on both
            # Python 2 and Python 3.
            contigs = [
                os.path.basename(labels_fn).split("_labels.tmp")[0]
                for labels_fn in contig_labels_fns
            ]
            n_labelled = len(contigs)
            # The per-contig index gets its own name so it cannot clobber
            # the outer loop index ``i``.
            args = [
                (h5_file, contig, opts.tmp, opts.h5_labels, contig_idx,
                 n_labelled)
                for contig_idx, contig in enumerate(contigs)
            ]
            mbin.launch_pool(opts.procs, combine_subreads_for_read_level,
                             args)

            logging.info("Combining read-level barcodes from all contigs...")
            mbinRunner.combine_read_level_barcodes_across_contigs()
            logging.info("Done.")

            logging.info(
                "Creating contig-level barcodes (%s motifs) from %s..."
                % (len(opts.motifs), h5_file))
            mbinRunner.combine_subreads_for_contig_level(h5_file)
            logging.info("Done.")
            # The count itself is not used below; the read is kept so a
            # missing or unreadable contig-names file still fails here.
            n_contigs = len(
                np.loadtxt(contig_names_fn, dtype="str", ndmin=1))

            if opts.cross_cov_bins is not None:
                logging.info(
                    "Creating bin-level barcodes (%s motifs) using %s..."
                    % (len(opts.motifs), opts.cross_cov_bins))
                mbinRunner.combine_contigs_for_bin_level()
                logging.info("Done.")

    if opts.h5_type == "bas":
        # Combine subread data across multiple movies
        logging.info("Combining subread data across all movies...")
        mbinRunner.combine_subread_data_across_bas_movies()
        logging.info("Done.")

        # Combine movie-merged subreads to get read-level barcodes
        logging.info("Combining subreads to get read-level barcodes...")
        mbinRunner.bas_combine_subreads_for_read_level()
        logging.info("Done.")

        if opts.sam is not None:
            logging.info("Writing read-contig assignments based on %s..."
                         % opts.sam)
            mbinRunner.get_read_refs_from_SAM()
            logging.info("Done.")

            for i, h5_file in enumerate(h5_files):
                logging.info(
                    "Creating contig-level barcodes (%s motifs) from %s..."
                    % (len(opts.motifs), h5_file))
                mbinRunner.combine_subreads_for_contig_level(h5_file)
                logging.info("Done.")
            # Kept for the same fail-early reason as in the "cmp" branch.
            n_contigs = len(
                np.loadtxt(contig_names_fn, dtype="str", ndmin=1))

    logging.info("Writing output files:")
    if opts.h5_type == "cmp":
        write_contig_features(mbinRunner, opts)
        if opts.aligned_read_barcodes:
            write_aligned_read_features(mbinRunner, opts)
    elif opts.h5_type == "bas":
        write_unaligned_read_features(mbinRunner, opts)

    logging.info("Cleaning up temp files from methylation profiling...")
    shutil.rmtree(opts.tmp)

    logging.info("Pipeline finished.")