Exemplo n.º 1
0
def filter_motifs(opts, h5_files):
    """
    Sample N=<N_reads> reads and look for evidence of methylation
    in all the motifs in the query space.

    Args:
        opts: Run options object. Attributes read here: ``tmp``,
            ``control_pkl_name``, ``N_reads``, ``motifs``, ``bi_motifs``
            and ``h5_type``.
        h5_files: Sequence of input file names; each one is handed to the
            data loader together with its position in the sequence.

    Side effects:
        ``opts.tmp`` is deleted (if present) and recreated at the start,
        and removed again once filtering has finished.
    """
    if os.path.exists(opts.tmp):
        shutil.rmtree(opts.tmp)
    os.mkdir(opts.tmp)

    ##########################################################
    # Check which motifs are in the dictionary of control IPDs
    ##########################################################
    logging.info("")
    logging.info(
        "Assessing the motifs for which we have control IPD values...")
    # The control file is written in binary mode, so it must be read in
    # binary mode too; the context manager guarantees the handle is closed.
    # NOTE(security): pickle can execute arbitrary code while loading --
    # only point control_pkl_name at files produced by a trusted run.
    with open(opts.control_pkl_name, "rb") as pkl_file:
        controls_d = pickle.load(pkl_file)
    all_m = list(controls_d)

    logging.info(" -- Contiguous (e.g. CATG):")
    for l in range(1, 20):
        # Length is taken from the part of the key before the first "-";
        # keys containing "N" are counted as bipartite below instead.
        hits = sum(
            1 for m in all_m if len(m.split("-")[0]) == l and "N" not in m)
        if hits > 0:
            logging.info("    Found %s motifs of length %s", hits, l)

    logging.info(" -- Bipartite (e.g. ACGNNNNNCTT):")
    bi_hits = sum(1 for m in all_m if "N" in m)
    logging.info("    Found %s motifs", bi_hits)
    logging.info("Found %s total motifs", len(all_m))
    logging.info("")

    mbinRunner = mbin.mbinRunner(opts)
    ##################################################
    # Launch analysis of <N_reads> for motif filtering
    ##################################################
    for i, h5_file in enumerate(h5_files):
        logging.info("Creating %s barcodes (%s motifs) from %s...",
                     opts.N_reads,
                     len(opts.motifs) + len(opts.bi_motifs), h5_file)
        mbinRunner.launch_data_loader(h5_file, opts.N_reads, i, opts)

    if opts.h5_type == "bas":
        # Combine subread data across multiple movies
        logging.info("Combining subread data across all movies...")
        mbinRunner.combine_subread_data_across_bas_movies()
        logging.info("Done.")
        # Combine movie-merged subreads to get read-level barcodes
        logging.info("Combining subreads to get read-level barcodes...")
        mbinRunner.bas_combine_subreads_for_read_level(tmp_run=True)
        logging.info("Done.")

    filter_runner = FilterRunner(opts, h5_files)
    filter_runner.run(mbinRunner)

    logging.info("Cleaning up temp files from motif filtering...")
    shutil.rmtree(opts.tmp)
Exemplo n.º 2
0
def extract_controls(opts, control_aln_fn):
    """
    Build the dictionary of control IPD values for all motifs from the
    WGA alignment file and pickle it to ``opts.control_pkl_name``.

    Args:
        opts: Run options object. ``motifs`` and ``bi_motifs`` are set on it
            here; ``control_tmp``, ``N_reads`` and ``control_pkl_name`` are
            read from it.
        control_aln_fn: File name of the control (WGA) alignments.

    Side effects:
        Works inside the control output directory while the data is being
        processed, removes ``opts.control_tmp`` afterwards, and writes the
        pickle file so the controls can be passed between processes.
    """
    controls = ControlRunner(control_aln_fn, opts)
    mbinRunner = mbin.mbinRunner(opts)

    # Pulling the IPD data for each motif from the WGA cmp.h5 file
    motifs, bi_motifs = motif_tools.build_motif_sets(opts)
    opts.motifs = motifs
    opts.bi_motifs = bi_motifs

    logging.info("")
    logging.info("Preparing to create new control data in %s",
                 opts.control_tmp)
    controls.goto_control_output_dir()

    # scan_WGA_aligns() hands back the options object to use from here on.
    opts = controls.scan_WGA_aligns()

    mbinRunner.launch_data_loader(control_aln_fn, opts.N_reads, 1, opts)

    controls.analyze_WGA_reads()
    logging.info("Done.")
    logging.info("")

    # Building dictionary of mean control IPD values for each motif
    logging.info("Building dictionary of control values for all motifs...")
    logging.info("   * Initial build requires significant time and memory.")
    controls.combine_control_data_from_contigs()

    control_means = controls.build_control_IPD_dict(motifs, bi_motifs)
    controls.return_to_orig_dir()

    logging.info("")
    logging.info("Cleaning up temp files from control data processing...")
    shutil.rmtree(opts.control_tmp)

    # Controls are loaded into control_means, now pickle them for easy
    # passing between parallel processes. The context manager closes (and
    # therefore flushes) the file before anyone else tries to read it.
    with open(opts.control_pkl_name, "wb") as pkl_file:
        pickle.dump(control_means, pkl_file)
Exemplo n.º 3
0
def build_profiles(opts, h5_files, motifs, motifs_fn):
    """
    Build methylation profiles (barcodes) for the supplied motifs and
    write the output feature files.

    Args:
        opts: Run options object. ``motifs_file``, ``motifs`` and
            ``bi_motifs`` are overwritten here; ``tmp``, ``N_reads``,
            ``h5_type``, ``h5_labels``, ``procs``, ``cross_cov_bins``,
            ``sam`` and ``aligned_read_barcodes`` are read.
        h5_files: Sequence of input file names to process.
        motifs: Collection of motifs to profile (stored on ``opts.motifs``).
        motifs_fn: Motifs file name (stored on ``opts.motifs_file``).

    Side effects:
        ``opts.tmp`` is deleted (if present) and recreated at the start,
        and removed again after the output files have been written.
    """
    if os.path.exists(opts.tmp):
        shutil.rmtree(opts.tmp)
    os.mkdir(opts.tmp)

    opts.motifs_file = motifs_fn
    opts.motifs = motifs
    opts.bi_motifs = None

    logging.info("Building methylation profiles using %s motifs...",
                 len(opts.motifs))
    for fn in glob.glob(os.path.join(opts.tmp, "*")):
        os.remove(fn)

    mbinRunner = mbin.mbinRunner(opts)
    ##################################################
    # Launch analysis of <N_reads> for motif filtering
    ##################################################
    for i, h5_file in enumerate(h5_files):
        logging.info("Creating %s barcodes (%s motifs) from %s...",
                     opts.N_reads, len(opts.motifs), h5_file)
        mbinRunner.launch_data_loader(h5_file, opts.N_reads, i, opts)

        if opts.h5_type == "cmp":
            logging.info(
                "Combining subread-level barcodes to get read-level barcodes from each contig..."
            )
            contig_labels_fns = glob.glob(
                os.path.join(opts.tmp, "*_labels.tmp"))
            # A real list (not a lazy map object) so len() works on
            # Python 3 as well as Python 2.
            contigs = [
                os.path.basename(labels_fn).split("_labels.tmp")[0]
                for labels_fn in contig_labels_fns
            ]
            n_labelled = len(contigs)
            # Use a dedicated index name: on Python 2 a comprehension
            # variable leaks and would clobber the outer loop's ``i``.
            args = [(h5_file, contig, opts.tmp, opts.h5_labels, contig_idx,
                     n_labelled) for contig_idx, contig in enumerate(contigs)]
            mbin.launch_pool(opts.procs, combine_subreads_for_read_level,
                             args)

            logging.info("Combining read-level barcodes from all contigs...")
            mbinRunner.combine_read_level_barcodes_across_contigs()
            logging.info("Done.")

            logging.info(
                "Creating contig-level barcodes (%s motifs) from %s...",
                len(opts.motifs), h5_file)
            mbinRunner.combine_subreads_for_contig_level(h5_file)
            logging.info("Done.")
            # NOTE(review): n_contigs is not used afterwards. The read is
            # kept so a missing/unreadable contig-names file still fails
            # here rather than later -- confirm whether it can be dropped.
            n_contigs = len(
                np.loadtxt(os.path.join(opts.tmp,
                                        mbinRunner.fns["contig_names"]),
                           dtype="str",
                           ndmin=1))

            if opts.cross_cov_bins is not None:
                logging.info(
                    "Creating bin-level barcodes (%s motifs) using %s...",
                    len(opts.motifs), opts.cross_cov_bins)
                mbinRunner.combine_contigs_for_bin_level()
                logging.info("Done.")

    if opts.h5_type == "bas":
        # Combine subread data across multiple movies
        logging.info("Combining subread data across all movies...")
        mbinRunner.combine_subread_data_across_bas_movies()
        logging.info("Done.")
        # Combine movie-merged subreads to get read-level barcodes
        logging.info("Combining subreads to get read-level barcodes...")
        mbinRunner.bas_combine_subreads_for_read_level()
        logging.info("Done.")

        if opts.sam is not None:
            logging.info("Writing read-contig assignments based on %s...",
                         opts.sam)
            mbinRunner.get_read_refs_from_SAM()
            logging.info("Done.")
            for h5_file in h5_files:
                logging.info(
                    "Creating contig-level barcodes (%s motifs) from %s...",
                    len(opts.motifs), h5_file)
                mbinRunner.combine_subreads_for_contig_level(h5_file)
                logging.info("Done.")
            # NOTE(review): unused result, kept for the same fail-early
            # reason as in the "cmp" branch above -- confirm.
            n_contigs = len(
                np.loadtxt(os.path.join(opts.tmp,
                                        mbinRunner.fns["contig_names"]),
                           dtype="str",
                           ndmin=1))

    logging.info("Writing output files:")

    if opts.h5_type == "cmp":
        write_contig_features(mbinRunner, opts)
        if opts.aligned_read_barcodes:
            write_aligned_read_features(mbinRunner, opts)
    elif opts.h5_type == "bas":
        write_unaligned_read_features(mbinRunner, opts)

    logging.info("Cleaning up temp files from methylation profiling...")
    shutil.rmtree(opts.tmp)
    logging.info("Pipeline finished.")