Example #1
def run_main(config, config_file, fc_dir, run_info_yaml):
    work_dir = os.getcwd()
    fc_name, fc_date = get_flowcell_info(fc_dir)

    if run_info_yaml and os.path.exists(run_info_yaml):
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        log.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)
    fastq_dir = get_fastq_dir(fc_dir)
    run_items = _add_multiplex_across_lanes(run_info["details"], fastq_dir, fc_name)
    align_dir = os.path.join(work_dir, "alignments")

    # process each flowcell lane
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_lane,
                       ((i, fastq_dir, fc_name, fc_date, align_dir, config, config_file)
                        for i in run_items)):
            pass
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(align_dir,
            fastq_dir, work_dir, fc_name, fc_date, run_items)
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_sample, ((name, sample_fastq[name], sample_info[name],
                                         bam_files, work_dir, config, config_file)
                                        for name, bam_files in sample_files)):
            pass
    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
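
For orientation, a thin wrapper like the main functions later on this page is what typically feeds run_main: it loads the YAML system config and passes the flowcell directory through. A minimal sketch, assuming a post_process.yaml containing the galaxy_url, galaxy_api_key and algorithm: num_cores keys the example reads (the file paths here are made up):

import yaml

def main(config_file, fc_dir, run_info_yaml=None):
    # Load the system configuration used throughout run_main.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    run_main(config, config_file, fc_dir, run_info_yaml)

if __name__ == "__main__":
    main("post_process.yaml", "/data/110106_FC612P1AAXX",
         run_info_yaml="run_info.yaml")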
Example #2
def _generate_fastq(fc_dir, config, compress_fastq):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    postprocess_dir = config.get("postprocess_dir", "")
    if postprocess_dir:
        fastq_dir = os.path.join(postprocess_dir, os.path.basename(fc_dir), "fastq")

    if fastq_dir != fc_dir:  # and not os.path.exists(fastq_dir):

        with utils.chdir(basecall_dir):
            lanes = sorted(list(set([f.split("_")[1] for f in
                glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name,
                  ",".join(lanes)]
            if postprocess_dir:
                cl += ["-o", fastq_dir]
            if compress_fastq:
                cl += ["--gzip"]

            logger2.debug("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)

    return fastq_dir
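
The lane list above comes straight from qseq filenames. A small illustration of that parsing, assuming the usual Illumina s_<lane>_<read>_<tile>_qseq.txt naming (filenames invented for the demo):

# Hypothetical qseq files found in the basecall directory.
fnames = ["s_1_1_0001_qseq.txt", "s_1_2_0001_qseq.txt",
          "s_2_1_0001_qseq.txt", "s_2_1_0002_qseq.txt"]
# The second underscore-separated field is the lane; dedupe and sort.
lanes = sorted(set(f.split("_")[1] for f in fnames))
print(lanes)  # ['1', '2']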
Example #3
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir), config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "align": align_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir
    }
    run_parallel = parallel_runner(run_module, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("recalibrate_sample", samples)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("process_sample", samples)
    samples = run_parallel("generate_bigwig", samples,
                           {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
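
Each stage here is a run_parallel(name, items) call that fans the named step out over the items and collects the results. For intuition only, a serial stand-in with the same call shape (a sketch, not the bcbio.distributed implementation; registry is an assumed mapping of step names to functions):

def make_serial_runner(registry):
    # registry: dict mapping step names to plain Python functions.
    def run_parallel(name, items, extras=None):
        # extras (e.g. {"programs": [...]}) is accepted but unused here.
        return [registry[name](*args) for args in items]
    return run_parallel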
Example #4
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """

    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"],
                                           dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        for xs in pipeline.run(config, config_file, run_parallel, dirs, pipeline_items):
            assert len(xs) == 1
            upload.from_sample(xs[0])
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #5
def _generate_fastq(fc_dir, config, compress_fastq):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    postprocess_dir = config.get("postprocess_dir", "")
    if postprocess_dir:
        fastq_dir = os.path.join(postprocess_dir, os.path.basename(fc_dir), "fastq")

    if fastq_dir != fc_dir:  # and not os.path.exists(fastq_dir):

        with utils.chdir(basecall_dir):
            lanes = sorted(list(set([f.split("_")[1] for f in
                glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name,
                  ",".join(lanes)]
            if postprocess_dir:
                cl += ["-o", fastq_dir]
            if compress_fastq:
                cl += ["--gzip"]

            logger2.debug("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)

    return fastq_dir
Example #6
def run_main(config, config_file, fc_dir, run_info_yaml):
    work_dir = os.getcwd()
    align_dir = os.path.join(work_dir, "alignments")

    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = _get_run_info(fc_name, fc_date, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)

    # process each flowcell lane
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = _run_parallel("process_lane", lanes, dirs, config)
    _run_parallel("process_alignment", lane_items, dirs, config)
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = \
                  organize_samples(dirs, fc_name, fc_date, run_items)
    samples = ((n, sample_fastq[n], sample_info[n], bam_files, dirs, config, config_file)
               for n, bam_files in sample_files)
    _run_parallel("process_sample", samples, dirs, config)

    write_metrics(run_info, fc_name, fc_date, dirs)
Example #7
def select_upload_files(base, bc_id, fc_dir, analysis_dir):
    """Select fastq, bam alignment and summary files for upload to Galaxy.
    """
    fastq_dir = analysis_dir if bc_id else get_fastq_dir(fc_dir)
    for fname in glob.glob(os.path.join(fastq_dir, "%s_*fastq.txt" % base)):
        yield (fname, os.path.basename(fname))
    for summary_file in glob.glob(
            os.path.join(analysis_dir, "%s-*summary.pdf" % base)):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for bam_file in glob.glob(
            os.path.join(analysis_dir, "%s-*sort-dup.bam" % base)):
        yield (bam_file, _name_with_ext(bam_file, ".bam"))
    for wig_file in glob.glob(
            os.path.join(analysis_dir, "%s-*sort.bigwig" % base)):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload any recalibrated BAM files used for SNP calling
    found_recal = False
    for bam_file in glob.glob(
            os.path.join(analysis_dir,
                         "%s-*gatkrecal-realign-sort.bam" % base)):
        found_recal = True
        yield (bam_file, _name_with_ext(bam_file, "-gatkrecal-realign.bam"))
    if not found_recal:
        for bam_file in glob.glob(
                os.path.join(analysis_dir, "%s-*gatkrecal.bam" % base)):
            yield (bam_file, _name_with_ext(bam_file, "-gatkrecal.bam"))
    # Genotype files produced by SNP calling
    for snp_file in glob.glob(
            os.path.join(analysis_dir, "%s-*snp-filter.vcf" % base)):
        yield (snp_file, _name_with_ext(snp_file, "-snp-filter.vcf"))
    # Effect information on SNPs
    for snp_file in glob.glob(
            os.path.join(analysis_dir, "%s-*snp-filter-effects.tsv" % base)):
        yield (snp_file, _name_with_ext(snp_file, "-snp-effects.tsv"))
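
Because select_upload_files is a generator of (local_path, upload_name) pairs, callers simply drain it. A usage sketch with made-up directories:

for local_path, upload_name in select_upload_files(
        "7_110106_FC612P1AAXX", "1", "/data/flowcell", "/data/analysis"):
    print("would upload %s as %s" % (local_path, upload_name))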
Example #8
File: main.py Project: kevyin/bcbb
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    align_dir = os.path.join(work_dir, "alignments")
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    ## process each flowcell lane
    #run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    #lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    #lane_items = run_parallel("process_lane", lanes)

    logger.info(">>> Parse lane")
    lane_items = parse_lane(run_info["details"], fc_name, fc_date, dirs, config)

    # for item in lane_items:
    #     utils.prettyprint_dict(item)

    logger.info(">>> Process alignment")
    align_items = run_parallel("process_alignment", lane_items)

    ## process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    logger.info(">>> Merge samples")
    samples = run_parallel("merge_sample", samples)
    logger.info(">>> Recalibrate samples")
    samples = run_parallel("recalibrate_sample", samples)
    logger.info(">>> Realign samples")
    samples = parallel_realign_sample(samples, run_parallel)
    logger.info(">>> Variant calling")
    samples = parallel_variantcall(samples, run_parallel)
    logger.info(">>> Postprocess variants")
    samples = run_parallel("postprocess_variants", samples)
    logger.info(">>> Combine multiple callers")
    samples = combine_multiple_callers(samples)
    logger.info(">>> Detect structural variants")
    samples = run_parallel("detect_sv", samples)
    logger.info(">>> Combine calls")
    samples = run_parallel("combine_calls", samples)
    logger.info(">>> Process samples")
    run_parallel("process_sample", samples)
    logger.info(">>> Generate bigwig")
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    logger.info(">>> Writing project summary")
    write_project_summary(samples)
    logger.info(">>> Writing metrics")
    write_metrics(run_info, fc_name, fc_date, dirs)
    logger.info(">>> Done")
Example #9
def select_upload_files(lane, fc_dir, analysis_dir):
    """Select fastq, bam alignment and summary files for upload to Galaxy.
    """
    # fastq, summary and alignment file
    for fname in glob.glob(os.path.join(get_fastq_dir(fc_dir),
            "%s_*_fastq.txt" % lane)):
        yield (fname, os.path.basename(fname))
    for summary_file in glob.glob(os.path.join(analysis_dir,
            "%s_*-summary.pdf" % lane)):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for bam_file in glob.glob(os.path.join(analysis_dir,
            "%s_*-sort-dup.bam" % lane)):
        yield (bam_file, _name_with_ext(bam_file, ".bam"))
    for wig_file in glob.glob(os.path.join(analysis_dir,
            "%s_*-sort.bigwig" % lane)):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload any recalibrated BAM files used for SNP calling
    found_recal = False
    for bam_file in glob.glob(os.path.join(analysis_dir,
            "%s_*-gatkrecal-realign-sort.bam" % lane)):
        found_recal = True
        yield (bam_file, _name_with_ext(bam_file, "-gatkrecal-realign.bam"))
    if not found_recal:
        for bam_file in glob.glob(os.path.join(analysis_dir,
                "%s_*-gatkrecal.bam" % lane)):
            yield (bam_file, _name_with_ext(bam_file, "-gatkrecal.bam"))
    # Genotype files produced by SNP calling
    for snp_file in glob.glob(os.path.join(analysis_dir,
            "%s_*-snp-filter.vcf" % lane)):
        yield (snp_file, _name_with_ext(snp_file, "-snp-filter.vcf"))
Example #10
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir), config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "align": align_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir,
    }
    run_parallel = parallel_runner(run_module, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("recalibrate_sample", samples)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("process_sample", samples)
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #11
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_items = run_info.organize(dirs, config, run_info_yaml)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    lane_items = lane.process_all_lanes(run_items, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, run_parallel, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, run_parallel, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
Example #12
def main(config_file, fc_dir):
    work_dir = os.getcwd()
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    #print "Generating fastq files"
    #all_lanes = [i['lane'] for i in run_info["details"]]
    #short_fc_name = "%s_%s" % (fc_date, fc_name)
    #fastq_dir = generate_fastq(fc_dir, short_fc_name, all_lanes)
    if config["algorithm"]["num_cores"] > 1:
        pool = Pool(config["algorithm"]["num_cores"])
        try:
            pool.map(_process_wrapper,
                    ((i, fastq_dir, fc_name, fc_date, config, config_file)
                        for i in run_info["details"]))
        except:
            pool.terminate()
            raise
    else:
        map(_process_wrapper,
            ((i, fastq_dir, fc_name, fc_date, config, config_file)
                for i in run_info["details"]))
    write_metrics(run_info, work_dir, fc_dir, fastq_dir)
Example #13
def select_upload_files(base, bc_id, fc_dir, analysis_dir):
    """Select fastq, bam alignment and summary files for upload to Galaxy.
    """
    fastq_dir = analysis_dir if bc_id else get_fastq_dir(fc_dir)
    for fname in glob.glob(os.path.join(fastq_dir, "%s_*fastq.txt" % base)):
        yield (fname, os.path.basename(fname))
    for summary_file in glob.glob(os.path.join(analysis_dir,
            "%s-*summary.pdf" % base)):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for bam_file in glob.glob(os.path.join(analysis_dir,
            "%s-*sort-dup.bam" % base)):
        yield (bam_file, _name_with_ext(bam_file, ".bam"))
    for wig_file in glob.glob(os.path.join(analysis_dir,
            "%s-*sort.bigwig" % base)):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload any recalibrated BAM files used for SNP calling
    found_recal = False
    for bam_file in glob.glob(os.path.join(analysis_dir,
            "%s-*gatkrecal-realign-sort.bam" % base)):
        found_recal = True
        yield (bam_file, _name_with_ext(bam_file, "-gatkrecal-realign.bam"))
    if not found_recal:
        for bam_file in glob.glob(os.path.join(analysis_dir,
                "%s-*gatkrecal.bam" % base)):
            yield (bam_file, _name_with_ext(bam_file, "-gatkrecal.bam"))
    # Genotype files produced by SNP calling
    for snp_file in glob.glob(os.path.join(analysis_dir,
            "%s-*snp-filter.vcf" % base)):
        yield (snp_file, _name_with_ext(snp_file, "-snp-filter.vcf"))
    # Effect information on SNPs
    for snp_file in glob.glob(os.path.join(analysis_dir,
            "%s-*snp-filter-effects.tsv" % base)):
        yield (snp_file, _name_with_ext(snp_file, "-snp-effects.tsv"))
Example #14
def select_upload_files(base, bc_id, fc_dir, analysis_dir):
    """Select fastq, bam alignment and summary files for upload to Galaxy.
    """
    # if we have barcodes, update our search name and get local fastq files
    if bc_id:
        fastq_dir = os.path.join(analysis_dir, "%s_barcode" % base)
        base = "%s_%s" % (base, bc_id)
    # otherwise, use the original fastq files
    else:
        fastq_dir = get_fastq_dir(fc_dir)
    # fastq, summary and alignment file
    for fname in glob.glob(os.path.join(fastq_dir, "%s*_fastq.txt" % base)):
        yield (fname, os.path.basename(fname))
    for summary_file in glob.glob(os.path.join(analysis_dir, "%s*-summary.pdf" % base)):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for bam_file in glob.glob(os.path.join(analysis_dir, "%s*-sort-dup.bam" % base)):
        yield (bam_file, _name_with_ext(bam_file, ".bam"))
    for wig_file in glob.glob(os.path.join(analysis_dir, "%s*-sort.bigwig" % base)):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload any recalibrated BAM files used for SNP calling
    found_recal = False
    for bam_file in glob.glob(os.path.join(analysis_dir, "%s*-gatkrecal-realign-sort.bam" % base)):
        found_recal = True
        yield (bam_file, _name_with_ext(bam_file, "-gatkrecal-realign.bam"))
    if not found_recal:
        for bam_file in glob.glob(os.path.join(analysis_dir, "%s*-gatkrecal.bam" % base)):
            yield (bam_file, _name_with_ext(bam_file, "-gatkrecal.bam"))
    # Genotype files produced by SNP calling
    for snp_file in glob.glob(os.path.join(analysis_dir, "%s*-snp-filter.vcf" % base)):
        yield (snp_file, _name_with_ext(snp_file, "-snp-filter.vcf"))
Example #15
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_lanes_with_pipelines(samples)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
Example #16
def select_upload_files(base, bc_id, fc_dir, analysis_dir, config, fname_out=None):
    """Select fastq, bam alignment and summary files for upload to Galaxy.
    """
    def _name_with_ext(orig_file, ext):
        """Return a normalized filename without internal processing names.

        Use the specified output filename when given, allowing
        configurable output file names.
        """
        if fname_out is None:
            base = os.path.basename(orig_file).split("-")[0]
        else:
            base = fname_out
        for extra in ["_trim"]:
            if base.endswith(extra):
                base = base[:-len(extra)]
        return "%s%s" % (base, ext)

    base_glob = _dir_glob(base, analysis_dir)
    # Configurable upload of fastq files -- BAM provide same information, compacted
    if config["algorithm"].get("upload_fastq", True):
        # look for fastq files in a barcode directory or the main fastq directory
        bc_base = base.rsplit("_", 1)[0] if bc_id else base
        bc_dir = os.path.join(analysis_dir, "%s_barcode" % bc_base)
        fastq_glob = "%s_*fastq.txt" % base
        found_fastq = False
        for fname in glob.glob(os.path.join(bc_dir, fastq_glob)):
            found_fastq = True
            yield (fname, os.path.basename(fname))
        if not found_fastq:
            fastq_dir = get_fastq_dir(fc_dir)
            for fname in glob.glob(os.path.join(fastq_dir, fastq_glob)):
                yield (fname, os.path.basename(fname))
    for summary_file in base_glob("summary.pdf"):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for wig_file in base_glob(".bigwig"):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload BAM files, preferring recalibrated and realigned files
    found_bam = False
    for orig_ext, new_ext in [("gatkrecal-realign-dup.bam", "-gatkrecal-realign.bam"),
                              ("gatkrecal-realign.bam", "-gatkrecal-realign.bam"),
                              ("gatkrecal.bam", "-gatkrecal.bam"),
                              ("sort-dup.bam", ".bam"),
                              ("sort.bam", ".bam")]:
        if not found_bam:
            for bam_file in base_glob(orig_ext):
                yield (bam_file, _name_with_ext(bam_file, new_ext))
                found_bam = True
    # Genotype files produced by SNP calling
    found = False
    for orig_ext, new_ext in [("variants-combined-annotated.vcf", "-variants.vcf"),
                              ("variants-*-annotated.vcf", "-variants.vcf")]:
        if not found:
            for snp_file in base_glob(orig_ext):
                yield (snp_file, _name_with_ext(snp_file, new_ext))
                found = True
    # Effect information on SNPs
    for snp_file in base_glob("variants-*-effects.tsv"):
        yield (snp_file, _name_with_ext(snp_file, "-variants-effects.tsv"))
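
The inner _name_with_ext strips the pipeline's processing suffixes from the final names. Its logic, re-created standalone for illustration (same behavior as the closure above with fname_out=None):

import os

def name_with_ext(orig_file, ext, fname_out=None):
    # Normalize the filename exactly as the closure above does.
    if fname_out is None:
        base = os.path.basename(orig_file).split("-")[0]
    else:
        base = fname_out
    for extra in ["_trim"]:
        if base.endswith(extra):
            base = base[:-len(extra)]
    return "%s%s" % (base, ext)

print(name_with_ext("/analysis/7_trim-gatkrecal-realign-dup.bam",
                    "-gatkrecal-realign.bam"))
# -> 7-gatkrecal-realign.bam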
Example #17
def _generate_fastq(fc_dir):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    if fastq_dir != fc_dir and not os.path.exists(fastq_dir):
        with utils.chdir(os.path.split(fastq_dir)[0]):
            lanes = sorted(list(set([f.split("_")[1] for f in
                glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name,
                    ",".join(lanes)]
            subprocess.check_call(cl)
    return fastq_dir
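
All of these examples depend on get_fastq_dir, whose implementation is not shown on this page. A plausible minimal sketch, assuming the conventional Data/Intensities/BaseCalls layout of bcbb flowcell directories (the real helper may probe several candidate locations):

import os

def get_fastq_dir(fc_dir):
    # Assumed layout; treat this as illustrative, not the bcbb source.
    return os.path.join(fc_dir, "Data", "Intensities", "BaseCalls", "fastq")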
Example #18
File: main.py Project: jme9/wabio
def run_main(config,
             config_file,
             work_dir,
             parallel,
             fc_dir=None,
             run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir
    }
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #19
def _normalize_files(item, fc_dir):
    """Ensure the files argument is a list of absolute file names.
    Handles BAM, single and paired end fastq.
    """
    files = item.get("files")
    if files:
        if isinstance(files, basestring):
            files = [files]
        if fc_dir:
            fastq_dir = get_fastq_dir(fc_dir)
            files = [
                x if os.path.isabs(x) else os.path.normpath(
                    os.path.join(fastq_dir, x)) for x in files
            ]
        item["files"] = files
    return item
Example #20
def _normalize_files(item, fc_dir):
    """Ensure the files argument is a list of absolute file names.
    Handles BAM, single and paired end fastq.
    """
    files = item.get("files")
    if files:
        if isinstance(files, basestring):
            files = [files]
        if fc_dir:
            fastq_dir = get_fastq_dir(fc_dir)
        else:
            fastq_dir = os.getcwd()
        files = [x if os.path.isabs(x) else os.path.normpath(os.path.join(fastq_dir, x))
                 for x in files]
        item["files"] = files
    return item
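
A quick check of the behavior: relative entries resolve against the fastq directory (or the current directory when no flowcell is given) while absolute entries pass through unchanged. Note the basestring check makes this Python 2 code; the paths below are invented:

item = {"files": ["sample_1.fastq", "/abs/sample_2.fastq"]}
normalized = _normalize_files(item, None)  # no flowcell: resolve against cwd
print(normalized["files"])
# e.g. ['/current/dir/sample_1.fastq', '/abs/sample_2.fastq']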
Example #21
def _generate_fastq(fc_dir, config):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    if fastq_dir != fc_dir and not os.path.exists(fastq_dir):
        log.info("Generating fastq files for %s" % fc_dir)
        with utils.chdir(basecall_dir):
            lanes = sorted(
                list(set([f.split("_")[1] for f in glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name, ",".join(lanes)]
            log.info("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)
            log.info("Qseq to fastq conversion completed.")
    return fastq_dir
Example #22
def _generate_fastq(fc_dir, config):
    """Generate fastq files for the current flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    if fastq_dir != fc_dir and not os.path.exists(fastq_dir):
        log.info("Generating fastq files for %s" % fc_dir)
        with utils.chdir(basecall_dir):
            lanes = sorted(list(set([f.split("_")[1] for f in
                glob.glob("*qseq.txt")])))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name,
                    ",".join(lanes)]
            log.info("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)
            log.info("Qseq to fastq conversion completed.")
    return fastq_dir
Example #23
def main(config_file, fc_dir):
    work_dir = os.getcwd()
    config = load_config(config_file)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    if config["algorithm"]["num_cores"] > 1:
        pool = Pool(config["algorithm"]["num_cores"])
        try:
            pool.map(_process_wrapper,
                    ((i, fastq_dir, fc_name, fc_date, config, config_file)
                        for i in run_info["details"]))
        except:
            pool.terminate()
            raise
    else:
        map(_process_wrapper,
            ((i, fastq_dir, fc_name, fc_date, config, config_file)
                for i in run_info["details"]))
Example #24
def main(config_file, fc_dir, run_info_yaml=None):
    work_dir = os.getcwd()
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    fc_name, fc_date = get_flowcell_info(fc_dir)
    if run_info_yaml:
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name)
    run_items = _add_multiplex_to_control(run_info["details"])
    fastq_dir = get_fastq_dir(fc_dir)
    align_dir = os.path.join(work_dir, "alignments")

    # process each flowcell lane
    pool = (Pool(config["algorithm"]["num_cores"])
            if config["algorithm"]["num_cores"] > 1 else None)
    map_fn = pool.map if pool else map
    try:
        map_fn(_process_lane_wrapper,
                ((i, fastq_dir, fc_name, fc_date, align_dir, config, config_file)
                    for i in run_items))
    except:
        if pool:
            pool.terminate()
        raise
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(align_dir,
            fastq_dir, work_dir, fc_name, fc_date, run_items)
    try:
        map_fn(_process_sample_wrapper,
          ((name, sample_fastq[name], sample_info[name], bam_files, work_dir,
              config, config_file) for name, bam_files in sample_files))
    except:
        if pool:
            pool.terminate()
        raise
    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
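
The pool.terminate() in the except blocks is the standard multiprocessing teardown: kill the workers as soon as one raises rather than leaking child processes. The pattern in isolation, with a toy task:

from multiprocessing import Pool

def work(x):
    if x == 3:
        raise ValueError("boom")
    return x * x

if __name__ == "__main__":
    pool = Pool(2)
    try:
        print(pool.map(work, range(5)))
        pool.close()
    except ValueError as err:
        pool.terminate()  # stop workers immediately instead of leaking them
        print("worker failed: %s" % err)
    finally:
        pool.join()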
Example #25
def run_main(config,
             config_file,
             work_dir,
             parallel,
             fc_dir=None,
             run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """

    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir
    }
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = lane.process_all_lanes(lanes, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        pipeline_items = _add_provenance(pipeline_items, dirs, config)
        for xs in pipeline.run(config, config_file, run_parallel, dirs,
                               pipeline_items):
            assert len(xs) == 1
            upload.from_sample(xs[0])
    qcsummary.write_metrics(run_info, fc_name, fc_date, dirs)
Example #26
def select_upload_files(base, bc_id, fc_dir, analysis_dir, config):
    """Select fastq, bam alignment and summary files for upload to Galaxy.
    """
    base_glob = _dir_glob(base, analysis_dir)
    # Configurable upload of fastq files -- BAM provide same information, compacted
    if config["algorithm"].get("upload_fastq", True):
        # look for fastq files in a barcode directory or the main fastq directory
        bc_base = base.rsplit("_", 1)[0] if bc_id else base
        bc_dir = os.path.join(analysis_dir, "%s_barcode" % bc_base)
        fastq_glob = "%s_*fastq.txt" % base
        found_fastq = False
        for fname in glob.glob(os.path.join(bc_dir, fastq_glob)):
            found_fastq = True
            yield (fname, os.path.basename(fname))
        if not found_fastq:
            fastq_dir = get_fastq_dir(fc_dir)
            for fname in glob.glob(os.path.join(fastq_dir, fastq_glob)):
                yield (fname, os.path.basename(fname))
    for summary_file in base_glob("summary.pdf"):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for wig_file in base_glob(".bigwig"):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload BAM files, preferring recalibrated and realigned files
    found_recal = False
    for bam_file in base_glob("gatkrecal-realign-dup.bam"):
        found_recal = True
        yield (bam_file, _name_with_ext(bam_file, "-gatkrecal-realign.bam"))
    if not found_recal:
        for bam_file in base_glob("gatkrecal.bam"):
            found_recal = True
            yield (bam_file, _name_with_ext(bam_file, "-gatkrecal.bam"))
    if not found_recal:
        for bam_file in base_glob("sort-dup.bam"):
            yield (bam_file, _name_with_ext(bam_file, ".bam"))
    # Genotype files produced by SNP calling
    for snp_file in base_glob("variants-combined-annotated.vcf"):
        yield (snp_file, _name_with_ext(snp_file, "-variants.vcf"))
    # Effect information on SNPs
    for snp_file in base_glob("variants-combined-effects.tsv"):
        yield (snp_file, _name_with_ext(snp_file, "-variants-effects.tsv"))
Example #27
def select_upload_files(base, bc_id, fc_dir, analysis_dir, config):
    """Select fastq, bam alignment and summary files for upload to Galaxy.
    """
    base_glob = _dir_glob(base, analysis_dir)
    # Configurable upload of fastq files -- BAM provide same information, compacted
    if config["algorithm"].get("upload_fastq", True):
        # look for fastq files in a barcode directory or the main fastq directory
        bc_base = base.rsplit("_", 1)[0] if bc_id else base
        bc_dir = os.path.join(analysis_dir, "%s_barcode" % bc_base)
        fastq_glob = "%s_*fastq.txt" % base
        found_fastq = False
        for fname in glob.glob(os.path.join(bc_dir, fastq_glob)):
            found_fastq = True
            yield (fname, os.path.basename(fname))
        if not found_fastq:
            fastq_dir = get_fastq_dir(fc_dir)
            for fname in glob.glob(os.path.join(fastq_dir, fastq_glob)):
                yield (fname, os.path.basename(fname))
    for summary_file in base_glob("summary.pdf"):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for bam_file in base_glob("sort-dup.bam"):
        yield (bam_file, _name_with_ext(bam_file, ".bam"))
    for wig_file in base_glob("sort.bigwig"):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload any recalibrated BAM files used for SNP calling
    found_recal = False
    for bam_file in base_glob("gatkrecal-realign-sort.bam"):
        found_recal = True
        yield (bam_file, _name_with_ext(bam_file, "-gatkrecal-realign.bam"))
    if not found_recal:
        for bam_file in base_glob("gatkrecal.bam"):
            yield (bam_file, _name_with_ext(bam_file, "-gatkrecal.bam"))
    # Genotype files produced by SNP calling
    for snp_file in base_glob("snp-filter.vcf"):
        yield (snp_file, _name_with_ext(snp_file, "-snp-filter.vcf"))
    # Effect information on SNPs
    for snp_file in base_glob("snp-filter-effects.tsv"):
        yield (snp_file, _name_with_ext(snp_file, "-snp-effects.tsv"))
Example #28
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #29
def run_main(config, config_file, fc_dir, run_info_yaml):
    work_dir = os.getcwd()
    fc_name, fc_date = get_flowcell_info(fc_dir)

    if run_info_yaml and os.path.exists(run_info_yaml):
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" %
                 run_info_yaml)
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        log.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'],
                                     config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)
    fastq_dir = get_fastq_dir(fc_dir)
    run_items = _add_multiplex_across_lanes(run_info["details"], fastq_dir,
                                            fc_name)
    align_dir = os.path.join(work_dir, "alignments")

    # process each flowcell lane
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(
                process_lane,
            ((i, fastq_dir, fc_name, fc_date, align_dir, config, config_file)
             for i in run_items)):
            pass
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        align_dir, fastq_dir, work_dir, fc_name, fc_date, run_items)
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_sample,
                       ((name, sample_fastq[name], sample_info[name],
                         bam_files, work_dir, config, config_file)
                        for name, bam_files in sample_files)):
            pass
    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
Example #30
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    _record_sw_versions(config, os.path.join(work_dir, "bcbb_software_versions.txt"))
    prog = RecordProgress(work_dir)
    to_compress = set()
    prog.progress("analysis_start")

    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)

    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}

    run_parallel = parallel_runner(run_module, dirs, config, config_file)
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)

    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    _add_to_compress(to_compress, lane_items, 'lane_items')
    prog.dummy()
    prog.progress("process_lane")

    # Remove spiked in controls, contaminants etc.
    lane_items = run_parallel("remove_contaminants", lane_items)
    _add_to_compress(to_compress, lane_items, 'lane_items')
    prog.dummy()
    prog.progress("remove_contaminants")
    align_items = run_parallel("process_alignment", lane_items)
    _add_to_compress(to_compress, align_items, 'align_items')
    prog.dummy()
    prog.progress("process_alignment")

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("merge_sample")
    samples = run_parallel("mark_duplicates_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("mark_duplicates_sample")
    run_parallel("screen_sample_contaminants", samples)
    prog.dummy()
    prog.progress("screen_sample_contaminants")
    samples = run_parallel("recalibrate_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("recalibrate_sample")
    samples = parallel_realign_sample(samples, run_parallel)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("realign_sample")
    samples = parallel_variantcall(samples, run_parallel)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("variantcall")
    samples = run_parallel("detect_sv", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("detect_sv")
    samples = run_parallel("process_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("process_sample")
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("generate_bigwig")
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
    prog.dummy()
    prog.progress("write_metrics")

    # Compress all files in to_compress
    if config['algorithm'].get('compress_files', True):
        sizes = run_parallel("compress_files", [[[cf]] for cf in to_compress])
        before = sum([s[0] for s in sizes])
        after = sum([s[1] for s in sizes])
        logger.info("Space used by the files before compressing (in bytes): " \
                     + str(before))
        logger.info("Space used by the files after compressing (in bytes): " \
                     + str(after))
        logger.info("Saved space (in bytes): " + str(before - after))
Example #31
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):

    _record_sw_versions(config, os.path.join(work_dir, "bcbb_software_versions.txt"))
    prog = utils.RecordProgress(work_dir)
    to_compress = set()
    prog.progress("analysis_start")

    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)

    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}

    run_parallel = parallel_runner(run_module, dirs, config, config_file)
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)

    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    for f in lane_items[0][0:2]:
        to_compress.add(f)
    prog.progress("process_lane")

    # upload the sequencing report to Google Docs
    # will skip this for now and rely on external mechanism for uploading this data
    #gdocs_indicator = os.path.join(work_dir, "gdocs_report_complete.txt")
    #if not os.path.exists(gdocs_indicator) \
    #and queue_report(fc_date, fc_name, os.path.abspath(run_info_yaml), dirs, config, config_file):
    #    utils.touch_file(gdocs_indicator)

    # Remove spiked in controls, contaminants etc.
    lane_items = run_parallel("remove_contaminants", lane_items)
    for f in lane_items[0][0:2]:
        to_compress.add(f)
    prog.progress("remove_contaminants")
    align_items = run_parallel("process_alignment", lane_items)
    for f in align_items[0]['fastq']:
        to_compress.add(f)
    prog.progress("process_alignment")

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    to_compress.add(samples[0][0]['fastq1'])
    to_compress.add(samples[0][0]['fastq2'])
    prog.progress("merge_sample")
    samples = run_parallel("mark_duplicates_sample", samples)
    to_compress.add(samples[0][0]['fastq1'])
    to_compress.add(samples[0][0]['fastq2'])
    prog.progress("mark_duplicates_sample")
    run_parallel("screen_sample_contaminants", samples)
    prog.progress("screen_sample_contaminants")
    samples = run_parallel("recalibrate_sample", samples)
    prog.progress("recalibrate_sample")
    samples = parallel_realign_sample(samples, run_parallel)
    prog.progress("realign_sample")
    samples = parallel_variantcall(samples, run_parallel)
    prog.progress("variantcall")
    samples = run_parallel("detect_sv", samples)
    prog.progress("detect_sv")
    samples = run_parallel("process_sample", samples)
    prog.progress("process_sample")
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    prog.progress("generate_bigwig")
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
    prog.progress("write_metrics")
    # Write statusdb metrics
    # will skip this for now and rely on external mechanism for uploading this data
    #report_to_statusdb(fc_name, fc_date, run_info_yaml, dirs, config)

    # Compress all files in to_compress
    if config['algorithm'].get('compress_files', True):
        (before, after) = utils.compress_files(to_compress)
        logger.info("Space used by the files before compressing (in bytes): %s" % before)
        logger.info("Space used by the files after compressing (in bytes): %s" % after)
        logger.info("Saved space (in bytes): %s" % (before - after))
Example #32
def select_upload_files(base,
                        bc_id,
                        fc_dir,
                        analysis_dir,
                        config,
                        fname_out=None):
    """Select fastq, bam alignment and summary files for upload to Galaxy.
    """
    def _name_with_ext(orig_file, ext):
        """Return a normalized filename without internal processing names.

        Use the specified output filename when given, allowing
        configurable output file names.
        """
        if fname_out is None:
            base = os.path.basename(orig_file).split("-")[0]
        else:
            base = fname_out
        for extra in ["_trim"]:
            if base.endswith(extra):
                base = base[:-len(extra)]
        return "%s%s" % (base, ext)

    base_glob = _dir_glob(base, analysis_dir)
    # Configurable upload of fastq files -- BAM provide same information, compacted
    if config["algorithm"].get("upload_fastq", True):
        # look for fastq files in a barcode directory or the main fastq directory
        bc_base = base.rsplit("_", 1)[0] if bc_id else base
        bc_dir = os.path.join(analysis_dir, "%s_barcode" % bc_base)
        fastq_glob = "%s_*fastq.txt" % base
        found_fastq = False
        for fname in glob.glob(os.path.join(bc_dir, fastq_glob)):
            found_fastq = True
            yield (fname, os.path.basename(fname))
        if not found_fastq:
            fastq_dir = get_fastq_dir(fc_dir)
            for fname in glob.glob(os.path.join(fastq_dir, fastq_glob)):
                yield (fname, os.path.basename(fname))
    for summary_file in base_glob("summary.pdf"):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for wig_file in base_glob(".bigwig"):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload BAM files, preferring recalibrated and realigned files
    found_bam = False
    for orig_ext, new_ext in [
        ("gatkrecal-realign-dup.bam", "-gatkrecal-realign.bam"),
        ("gatkrecal-realign.bam", "-gatkrecal-realign.bam"),
        ("gatkrecal.bam", "-gatkrecal.bam"), ("sort-dup.bam", ".bam"),
        ("sort.bam", ".bam")
    ]:
        if not found_bam:
            for bam_file in base_glob(orig_ext):
                yield (bam_file, _name_with_ext(bam_file, new_ext))
                found_bam = True
    # Genotype files produced by SNP calling
    found = False
    for orig_ext, new_ext in [("variants-combined-annotated.vcf",
                               "-variants.vcf"),
                              ("variants-*-annotated.vcf", "-variants.vcf")]:
        if not found:
            for snp_file in base_glob(orig_ext):
                yield (snp_file, _name_with_ext(snp_file, new_ext))
                found = True
    # Effect information on SNPs
    for snp_file in base_glob("variants-*-effects.tsv"):
        yield (snp_file, _name_with_ext(snp_file, "-variants-effects.tsv"))