Example 1
def split_bed_to_chrom_bed_parallel(bed_files, out_dir, parallel=12):
    """split a list of bed files into chromosome bed files, in parallel
    """
    # put commands in queue
    split_queue = setup_multiprocessing_queue()
    for bed_file in bed_files:
        prefix = os.path.basename(bed_file).split(".narrowPeak")[0].split(
            ".bed")[0]
        split_args = [out_dir, bed_file, prefix]
        split_queue.put([split_bed_to_chrom_bed, split_args])

    # run the queue
    run_in_parallel(split_queue, parallel=parallel, wait=True)

    return None
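
A minimal usage sketch, assuming split_bed_to_chrom_bed_parallel and its helpers (setup_multiprocessing_queue, run_in_parallel, split_bed_to_chrom_bed) are in scope; the paths below are placeholders:

import glob
import os

# hypothetical inputs and output directory (placeholders)
bed_files = glob.glob("peaks/*.narrowPeak.gz")
out_dir = "by_chrom"
os.makedirs(out_dir, exist_ok=True)

# each input file is split into per-chromosome BED files under out_dir
split_bed_to_chrom_bed_parallel(bed_files, out_dir, parallel=8)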
Example 2
def run_crawl(cr_job):
    cr_agent = cr_job.crawl_agent
    url_tuples = cr_job.url_tuples

    # only copy the variables that'll be used by the agent. Parallelization requires picklable variables. 
    cfg_keys = ['fc_fontdebug', 'post_visit_func', 'timeout', 'binary_path',
                'use_mitm_proxy', 'mitm_proxy_logs', 'cmd_line_options', 'main_js',
                'casper_client_js', 'screenshot', 'job_dir', 'index_html_log',
                'type', 'crawl_id']
    cfg_dict = {k: cr_agent.__dict__[k] for k in cfg_keys if k in cr_agent.__dict__}

    worker = partial(crawl_worker, cfg_dict)
    
    parallelize.run_in_parallel(url_tuples, worker, cr_job.max_parallel_procs)
    
    lp.close_index_html(cr_job.index_html_log)
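
The pattern worth noting here is that only picklable attributes are copied into a plain dict and bound to the worker with functools.partial, so the worker can be shipped to child processes. A self-contained sketch of that pattern (all names below are illustrative, not from the crawler code):

import os
from functools import partial
from multiprocessing import Pool

def visit_one(cfg, url):
    # stand-in for crawl_worker: uses only the plain-dict config
    return (url, cfg["timeout"])

class Agent(object):
    def __init__(self):
        self.timeout = 30
        self.log_fh = open(os.devnull, "w")  # not picklable; must be left out of cfg

if __name__ == "__main__":
    agent = Agent()
    # copy only the attributes the worker needs, skipping unpicklable ones
    cfg = {k: agent.__dict__[k] for k in ["timeout"] if k in agent.__dict__}
    worker = partial(visit_one, cfg)
    with Pool(2) as pool:
        print(pool.map(worker, ["http://a.example", "http://b.example"]))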
Example 3
def bin_regions_parallel(bed_files,
                         out_dir,
                         chromsizes,
                         bin_size=200,
                         stride=50,
                         final_length=1000,
                         parallel=12):
    """bin in parallel
    """
    split_queue = setup_multiprocessing_queue()

    for bed_file in bed_files:
        prefix = os.path.basename(bed_file).split(".narrowPeak")[0].split(
            ".bed")[0]
        split_args = [
            bed_file, "{}/{}".format(out_dir, prefix), bin_size, stride,
            final_length, chromsizes, "naive"
        ]
        split_queue.put([bin_regions_sharded, split_args])

    # run the queue
    run_in_parallel(split_queue, parallel=parallel, wait=True)

    return None
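
A hypothetical call, chaining from the per-chromosome files produced above; the paths are placeholders, and chromsizes is assumed to be whatever the underlying bin_regions_sharded helper expects (e.g. a chrom.sizes file path):

import glob

chrom_files = glob.glob("by_chrom/*.bed.gz")
bin_regions_parallel(
    chrom_files,
    "bin-200.stride-50",      # output directory (placeholder)
    "hg38.chrom.sizes",       # assumed chromsizes argument
    bin_size=200,
    stride=50,
    final_length=1000,
    parallel=8)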
Example 4
def parse_crawl_logs(path, no_of_procs=16):
    files = fu.gen_find_files("*.txt", path)
    log_worker = partial(parse_crawl_log, dump_fun=dump_json_and_html)
    parallelize.run_in_parallel(files, log_worker, no_of_procs)
    wl_log.info("Worker processes are finished, will generate index")
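
A hypothetical invocation; the directory is a placeholder and is simply where fu.gen_find_files will look for *.txt crawl logs. Note that dump_fun is fixed to dump_json_and_html via functools.partial before the files are fanned out to the worker processes:

parse_crawl_logs("/data/crawl_logs", no_of_procs=8)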
Example 5
def generate_h5_datasets(
        positives_bed_file,
        ref_fasta,
        chromsizes,
        label_files,
        signal_files,
        prefix,
        work_dir,
        bin_size=200,
        stride=50,
        final_length=1000,
        superset_bed_file=None,
        reverse_complemented=False,
        genome_wide=False,
        parallel=24,
        tmp_dir=".",
        normalize_signals=False):
    """generate a full h5 dataset
    """
    # first select negatives
    training_negatives_bed_file, genomewide_negatives_bed_file = setup_negatives(
        positives_bed_file,
        superset_bed_file,
        chromsizes,
        bin_size=bin_size,
        stride=stride,
        genome_wide=genome_wide,
        tmp_dir=tmp_dir)

    # collect the bed files
    if genome_wide:
        all_bed_files = [
            positives_bed_file,
            training_negatives_bed_file,
            genomewide_negatives_bed_file]
    else:
        all_bed_files = [
            positives_bed_file,
            training_negatives_bed_file]

    # split to chromosomes
    chrom_dir = "{}/by_chrom".format(tmp_dir)
    os.system("mkdir -p {}".format(chrom_dir))
    split_bed_to_chrom_bed_parallel(
        all_bed_files, chrom_dir, parallel=parallel)

    # split to equally sized bin groups
    chrom_files = glob.glob("{}/*.bed.gz".format(chrom_dir))
    bin_dir = "{}/bin-{}.stride-{}".format(tmp_dir, bin_size, stride)
    os.system("mkdir -p {}".format(bin_dir))
    bin_regions_parallel(
        chrom_files, bin_dir, chromsizes, bin_size=bin_size, stride=stride, parallel=parallel)

    # grab all of these and process in parallel
    h5_dir = "{}/h5".format(work_dir)
    os.system("mkdir -p {}".format(h5_dir))
    chrom_bed_files = glob.glob("{}/*.filt.bed.gz".format(bin_dir))
    logging.info("Found {} bed files".format(len(chrom_bed_files)))
    h5_queue = setup_multiprocessing_queue()
    for bed_file in chrom_bed_files:
        prefix = os.path.basename(bed_file).split(".bed")[0].split(".narrowPeak")[0]
        h5_file = "{}/{}.h5".format(h5_dir, prefix)
        if os.path.isfile(h5_file):
            continue
        parallel_tmp_dir = "{}/{}_tmp".format(tmp_dir, prefix)
        process_args = [
            bed_file,
            ref_fasta,
            chromsizes,
            h5_file,
            label_files,
            signal_files,
            bin_size,
            stride,
            final_length,
            reverse_complemented,
            "features",
            parallel_tmp_dir]
        h5_queue.put([setup_h5_dataset, process_args])

    # run the queue
    run_in_parallel(h5_queue, parallel=parallel, wait=True)

    # also tag each file with the chromosome and positives, negatives, etc
    h5_dir = "{}/h5".format(work_dir)
    h5_files = glob.glob("{}/*h5".format(h5_dir))
    for h5_file in h5_files:
        chrom = os.path.basename(h5_file).split(".")[-4]
        example_type = os.path.basename(h5_file).split(".")[-5]
        if example_type == "master":
            example_type = "positives"
        with h5py.File(h5_file, "a") as hf:
            hf["/"].attrs[_CHROM_TAG] = [chrom]
            hf["/"].attrs[_EXAMPLE_TYPE_TAG] = example_type
    
    return None
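
A hypothetical end-to-end invocation, assuming the function and its helpers are importable from their module; every path and filename below is a placeholder:

generate_h5_datasets(
    "peaks/positives.master.bed.gz",      # positives_bed_file
    "genome/hg38.fa",                     # ref_fasta
    "genome/hg38.chrom.sizes",            # chromsizes
    ["labels/tf_chipseq.narrowPeak.gz"],  # label_files
    ["signals/atac.pooled.bw"],           # signal_files
    "my_dataset",                         # prefix
    "work",                               # work_dir
    superset_bed_file="peaks/union.bed.gz",
    genome_wide=False,
    parallel=12,
    tmp_dir="work/tmp")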