Example #1
def calculate_coverage_use_grid(cnf, samples, output_dirpath):
    assert len(samples) > 0

    sambamba = get_system_path(cnf,
                               join(get_ext_tools_dirname(), 'sambamba'),
                               is_critical=True)

    chr_len_fpath = get_chr_len_fpath(cnf)
    jobs_to_wait = []

    for sample in samples:
        sample_output_dirpath = join(output_dirpath, sample.name)
        safe_mkdir(sample_output_dirpath)

    for chrom in chromosomes:
        info('Processing chromosome ' + chrom)
        avg_cov_output_fpath = join(output_dirpath, chrom + '.txt.gz')
        sample_output_fpaths = [
            join(output_dirpath, sample.name, chrom + '.txt.gz')
            for sample in samples
        ]

        sample_names = ','.join(sample.name for sample in samples)
        chrom_bams = []

        for sample in samples:
            if not verify_file(sample.bam):
                err('BAM for ' + sample.name + ' does not exist!')
                continue
            output_bam_fpath = join(
                cnf.work_dir,
                basename(sample.name) + '_' + str(chrom) + '.bam')
            cmdline = '{sambamba} slice {sample.bam} {chrom}'.format(
                **locals())
            call(cnf, cmdline, output_fpath=output_bam_fpath)
            if verify_file(output_bam_fpath):
                chrom_bams.append(output_bam_fpath)

        bam_fpaths = ','.join(chrom_bams)

        if cnf.reuse_intermediate and verify_file(avg_cov_output_fpath, silent=True) and \
                all(verify_file(output_fpath, silent=True) for output_fpath in sample_output_fpaths):
            info(avg_cov_output_fpath + ' exists, reusing')
        else:
            j = _submit_region_cov(cnf, cnf.work_dir, chrom, bam_fpaths,
                                   sample_names, output_dirpath, chr_len_fpath)
            if j and not j.is_done:
                jobs_to_wait.append(j)
            info()

        if len(jobs_to_wait) >= cnf.threads:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
            jobs_to_wait = []
        elif not jobs_to_wait:
            info('No jobs to submit.')
    if jobs_to_wait:
        wait_for_jobs(cnf, jobs_to_wait)
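
A minimal usage sketch, assuming `cnf` is the pipeline's configured context object; the `Sample` stub and the file paths below are hypothetical, for illustration only:

class Sample(object):
    # Hypothetical stand-in for the pipeline's sample objects, which
    # only need `name` and `bam` attributes here.
    def __init__(self, name, bam):
        self.name = name
        self.bam = bam

samples = [Sample('sampleA', '/data/sampleA.bam'),
           Sample('sampleB', '/data/sampleB.bam')]
calculate_coverage_use_grid(cnf, samples, '/output/coverage')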

Example #2
def run_sambamba_use_grid(cnf, infos_by_key, mut_bed_fpath):
    sambamba_output_by_experiment = dict()
    not_submitted_experiments = infos_by_key.values()
    while not_submitted_experiments:
        jobs_to_wait = []
        submitted_experiments = []
        reused_experiments = []

        for (group, uniq_key), e in infos_by_key.iteritems():
            if e not in not_submitted_experiments:
                continue
            sambamba_output_fpath = join(cnf.work_dir,
                                         uniq_key + '__mutations.bed')
            sambamba_output_by_experiment[e] = sambamba_output_fpath

            if cnf.reuse_intermediate and verify_file(sambamba_output_fpath,
                                                      silent=True):
                info(sambamba_output_fpath + ' exists, reusing')
                reused_experiments.append(e)
                continue
            else:
                if not e.sample.bam:
                    err('Sample ' + e.sample.name + ' in ' + str(group) +
                        ', ' + str(uniq_key) + ' has no BAM')
                    continue
                j = sambamba_depth(cnf,
                                   mut_bed_fpath,
                                   e.sample.bam,
                                   output_fpath=sambamba_output_fpath,
                                   only_depth=True,
                                   silent=True,
                                   use_grid=True)
                submitted_experiments.append(e)

                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    break
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        not_submitted_experiments = [
            e for e in not_submitted_experiments
            if e not in submitted_experiments and e not in reused_experiments
        ]

    return sambamba_output_by_experiment
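
This function shows the throttle-and-drain pattern that recurs throughout these examples: submit at most `cnf.threads` jobs, wait for the batch, then loop until every item is either submitted or reused. A stripped-down sketch of the same control flow, with hypothetical `submit` and `wait_all` callables standing in for the grid helpers:

def throttled_submit(cnf, items, submit, wait_all):
    # Submit jobs in batches of at most cnf.threads, draining each
    # batch before submitting more. `submit` returns a job object
    # with an `is_done` flag; `wait_all` blocks on a list of jobs.
    pending = list(items)
    while pending:
        jobs, started = [], []
        for item in pending:
            job = submit(item)
            started.append(item)
            if not job.is_done:
                jobs.append(job)
            if len(jobs) >= cnf.threads:
                break
        if jobs:
            wait_all(cnf, jobs)
        pending = [i for i in pending if i not in started]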
Example #3
def __get_mapped_reads(cnf, samples, bam_by_sample, output_fpath):
    if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
        info(output_fpath + ' exists, reusing')
        return output_fpath, samples

    mapped_reads_by_sample = OrderedDict()

    job_by_sample = dict()
    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0
    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []
        for s in not_submitted_samples:
            with with_cnf(cnf, work_dir=join(cnf.work_dir, s.name)) as cnf:
                safe_mkdir(cnf.work_dir)
                # if verify_file(s.targetcov_json_fpath, silent=True):
                #     info('Parsing targetSeq output ' + s.targetcov_json_fpath)
                #     with open(s.targetcov_json_fpath) as f:
                #         data = load(f, object_pairs_hook=OrderedDict)
                #     cov_report = SampleReport.load(data, s)
                #     mapped_reads = next(rec.value for rec in cov_report.records if rec.metric.name == 'Mapped reads')
                #     info(s.name + ': ')
                #     info('  Mapped reads: ' + str(mapped_reads))
                #     mapped_reads_by_sample[s.name] = mapped_reads
                #     reused_samples.append(s)
                #     continue
                #
                # else:
                if s.name not in bam_by_sample:
                    err('No BAM for ' + s.name + ', not running Seq2C')
                    return None, None

                info('Submitting a sambamba job to get mapped read numbers')
                bam_fpath = bam_by_sample[s.name]
                j = number_of_mapped_reads(cnf, bam_fpath, dedup=True, use_grid=True, sample_name=s.name)
                job_by_sample[s.name] = j
                submitted_samples.append(s)
                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    not_submitted_samples = [_s for _s in not_submitted_samples if
                                             _s not in submitted_samples and
                                             _s not in reused_samples]

                    if not_submitted_samples:
                        info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting for them to '
                             'finish before submitting ' + str(len(not_submitted_samples)) + ' more')
                    else:
                        info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                    info()
                    break
                info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished processing ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_file(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if 'work_dir' in j.__dict__ and isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples if
                                 s not in submitted_samples and
                                 s not in reused_samples]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()

    # wait_for_jobs(cnf, job_by_sample.values())
    for s_name, j in job_by_sample.items():
        if j and j.is_done and not j.is_failed:
            with open(j.output_fpath) as f:
                mapped_reads = int(f.read().strip())
                info(s_name + ': ')
                info('  Mapped reads: ' + str(mapped_reads))
                mapped_reads_by_sample[s_name] = mapped_reads
        else:
            err('ERROR: ' + s_name + ' could not get mapped reads, log saved to ' + j.log_fpath)

    with open(output_fpath, 'w') as f:
        for sample_name, mapped_reads in mapped_reads_by_sample.items():
            f.write(sample_name + '\t' + str(mapped_reads) + '\n')

    verify_file(output_fpath, is_critical=True)
    successful_samples = [s for s in samples if s.name in mapped_reads_by_sample]
    info('Samples processed: ' + str(len(samples)) + ', successfully: ' + str(len(successful_samples)))
    return output_fpath, successful_samples
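
The file written above is a plain two-column table (sample name, tab, mapped read count), so a downstream step can load it back in a few lines; a sketch, assuming the `output_fpath` returned above:

mapped_reads_by_sample = {}
with open(output_fpath) as f:
    for line in f:
        # Each line: '<sample_name>\t<mapped_read_count>'
        sample_name, count = line.rstrip('\n').split('\t')
        mapped_reads_by_sample[sample_name] = int(count)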
Example #4
def __seq2c_coverage(cnf, samples, bams_by_sample, bed_fpath, is_wgs, output_fpath):
    if cnf.reuse_intermediate and verify_file(output_fpath, silent=True):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    jobs_by_sample = dict()
    depth_output_by_sample = dict()
    seq2cov_output_by_sample = dict()
    seq2c_work_dirpath = join(cnf.work_dir, source.seq2c_name)
    safe_mkdir(seq2c_work_dirpath)
    info()

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0
    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []

        for s in not_submitted_samples:
            info('*' * 50)
            info(s.name + ':')
            with with_cnf(cnf, work_dir=join(cnf.work_dir, s.name)) as cnf:
                safe_mkdir(cnf.work_dir)
                seq2cov_output_by_sample[s.name] = join(seq2c_work_dirpath, s.name + '.seq2cov.txt')

                if not cnf.reuse_intermediate and isfile(seq2cov_output_by_sample[s.name]):
                    os.remove(seq2cov_output_by_sample[s.name])

                if cnf.reuse_intermediate and verify_file(seq2cov_output_by_sample[s.name], silent=True):
                    info(seq2cov_output_by_sample[s.name] + ' exists, reusing')
                    reused_samples.append(s)
                    continue

                elif verify_file(s.targetcov_detailed_tsv, silent=True):
                    info(s.name + ': using targetcov detailed output for Seq2C coverage')
                    targetcov_details_to_seq2cov(cnf, s.targetcov_detailed_tsv, seq2cov_output_by_sample[s.name], s.name, is_wgs=is_wgs)
                    reused_samples.append(s)
                    continue

                else:
                    info(s.name + ': ' + s.targetcov_detailed_tsv + ' does not exist: submitting sambamba depth')
                    bam_fpath = bams_by_sample[s.name]
                    depth_output = join(seq2c_work_dirpath, s.name + '_depth' + '.txt')
                    depth_output_by_sample[s.name] = depth_output
                    if cnf.reuse_intermediate and verify_file(depth_output, silent=True):
                        info(depth_output + ' exists, reusing')
                        reused_samples.append(s)
                        continue
                    else:
                        j = sambamba_depth(cnf, bed_fpath, bam_fpath, depth_output, use_grid=True, sample_name=s.name)
                        jobs_by_sample[s.name] = j
                        submitted_samples.append(s)

                        if not j.is_done:
                            jobs_to_wait.append(j)

                        if len(jobs_to_wait) >= cnf.threads:
                            not_submitted_samples = [_s for _s in not_submitted_samples if
                                                     _s not in submitted_samples and
                                                     _s not in reused_samples]

                            if not_submitted_samples:
                                info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting for them to '
                                     'finish before submitting ' + str(len(not_submitted_samples)) + ' more')
                            else:
                                info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                            info()
                            break
                        info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished processing ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_file(j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if 'work_dir' in j.__dict__ and isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [s for s in not_submitted_samples if
                                 s not in submitted_samples and
                                 s not in reused_samples]
        info()
        info('*' * 50)

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))

    # wait_for_jobs(cnf, jobs_by_sample.values())
    for s_name, seq2cov_output_fpath in seq2cov_output_by_sample.items():
        if not isfile(seq2cov_output_fpath):
            if verify_file(depth_output_by_sample[s_name], is_critical=True, description='depth_output_by_sample for ' + s_name):
                info(s_name + ': summarizing bedcoverage output ' + depth_output_by_sample[s_name])
                bed_col_num = count_bed_cols(bed_fpath)
                sambamba_depth_to_seq2cov(cnf, depth_output_by_sample[s_name], seq2cov_output_by_sample[s_name], s_name, bed_col_num)

            # script = get_script_cmdline(cnf, 'python', join('tools', 'bed_processing', 'find_ave_cov_for_regions.py'),
            #                             is_critical=True)
            # bedcov_hist_fpath = depth_output_by_sample[s_name]
            # cmdline = '{script} {bedcov_hist_fpath} {s_name} {bed_col_num}'.format(**locals())
            # j = submit_job(cnf, cmdline, s_name + '_bedcov_2_seq2cov', output_fpath=seq2cov_output_by_sample[s_name])
            # sum_jobs_by_sample[s_name] = j

    # sum_jobs_by_sample = dict()
    # info('* Submitting seq2cov output *')
    # for s_name, j in jobs_by_sample.items():
    #     if not verify_file(seq2cov_output_by_sample[s_name], silent=True):
    #         info(s_name + ': summarizing bedcoverage output ' + depth_output_by_sample[s_name])
    #
    #         script = get_script_cmdline(cnf, 'python', join('tools', 'bed_processing', 'find_ave_cov_for_regions.py'),
    #                                     is_critical=True)
    #         bedcov_hist_fpath = depth_output_by_sample[s_name]
    #         bed_col_num = count_bed_cols(seq2c_bed)
    #         cmdline = '{script} {bedcov_hist_fpath} {s_name} {bed_col_num}'.format(**locals())
    #         j = submit_job(cnf, cmdline, s_name + '_bedcov_2_seq2cov', output_fpath=seq2cov_output_by_sample[s_name])
    #         sum_jobs_by_sample[s_name] = j
    #
    # wait_for_jobs(cnf, sum_jobs_by_sample.values())

    info()
    info('Done')
    info('*' * 50)
    info()
    info('Combining seq2cov output')
    with open(output_fpath, 'w') as out:
        for i, s in enumerate(samples):
            verify_file(seq2cov_output_by_sample[s.name], description='seq2cov_output for ' + s.name, is_critical=True)
            with open(seq2cov_output_by_sample[s.name]) as inp:
                for l in inp:
                    out.write(l)

    verify_file(output_fpath, description='combined seq2cov output', is_critical=True)
    info('Saved combined seq2cov output to ' + output_fpath)
    info()
    return output_fpath
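
The per-sample branch above chooses among three coverage sources in priority order. Condensed into a hypothetical helper with the same decision logic (`verify_file` is the checker used throughout these examples):

def pick_seq2cov_source(cnf, s, seq2cov_fpath):
    # Priority: reuse an existing seq2cov file, else convert the
    # targetcov detailed output, else submit sambamba depth on the BAM.
    if cnf.reuse_intermediate and verify_file(seq2cov_fpath, silent=True):
        return 'reuse'
    if verify_file(s.targetcov_detailed_tsv, silent=True):
        return 'targetcov'
    return 'sambamba_depth'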
Example #5
def run_targqc(cnf,
               output_dir,
               samples,
               target_bed,
               features_bed,
               genes_fpath=None):
    max_threads = cnf.threads
    threads_per_sample = 1  # max(max_threads / len(samples), 1)
    summary_threads = min(len(samples), max_threads)
    info('Number of threads to run summary: ' + str(summary_threads))

    jobs_to_wait = []
    if not cnf.only_summary:
        original_target_bed = target_bed
        features_bed, features_no_genes_bed, target_bed, seq2c_bed = prepare_beds(
            cnf, features_bed, target_bed)
        gene_keys_set, gene_keys_list, target_bed, features_bed, features_no_genes_bed = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)
        if not genes_fpath:
            genes_fpath = join(cnf.work_dir, 'genes.txt')
            with open(genes_fpath, 'w') as f:
                f.write('\n'.join(g + '\t' + c for g, c in gene_keys_list))

        info('*' * 70)
        info()

        step = _prep_steps(cnf, threads_per_sample, summary_threads, samples,
                           target_bed, original_target_bed, features_bed,
                           features_no_genes_bed, genes_fpath)

        summary_wait_for_steps = []

        for sample in samples:
            info('Processing ' + basename(sample.name))
            input_params = ''
            if sample.bam:
                input_params = ' --bam ' + sample.bam
            elif sample.l_fpath and sample.r_fpath:
                input_params = ' -1 ' + sample.l_fpath + ' -2 ' + sample.r_fpath
            if cnf.downsampled and sample.fastqc_dirpath:
                input_params += ' --downsampled --fastqc-dirpath ' + sample.fastqc_dirpath

            j = _submit_job(cnf,
                            step,
                            sample.name,
                            threads=threads_per_sample,
                            input_params=input_params,
                            targqc_dirpath=sample.targqc_dirpath)
            jobs_to_wait.append(j)
            summary_wait_for_steps.append(step.job_name(sample.name))

            info('Done ' + basename(sample.name))
            info()

    wait_for_jobs(cnf, jobs_to_wait)

    info('Making targqc summary')
    return summarize_targqc(cnf,
                            summary_threads,
                            output_dir,
                            samples,
                            bed_fpath=target_bed,
                            features_fpath=features_bed)
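
A hypothetical call, assuming `cnf` and the sample list are prepared by the caller; the paths are placeholders:

summary = run_targqc(cnf,
                     output_dir='/output/targqc',
                     samples=samples,
                     target_bed='/data/target.bed',
                     features_bed='/data/features.bed')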
Example #6
def _annotate(cnf, samples):
    varannotate_cmdl = (get_script_cmdline(
        cnf, 'python', join('scripts', 'post', 'varannotate.py')) +
                        ' --sys-cnf ' + cnf.sys_cnf + ' --run-cnf ' +
                        cnf.run_cnf + ' --project-name ' + cnf.project_name +
                        (' --reuse ' if cnf.reuse_intermediate else '') +
                        ' --log-dir -' + ' --genome ' + cnf.genome.name +
                        (' --no-check ' if cnf.no_check else '') +
                        (' --qc' if cnf.qc else ' --no-qc') +
                        ((' --caller ' + cnf.caller) if cnf.caller else ''))

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []
        for sample in not_submitted_samples:
            if not sample.varannotate_dirpath:
                sample.varannotate_dirpath = join(sample.dirpath,
                                                  source.varannotate_name)
            if not sample.anno_vcf_fpath:
                sample.anno_vcf_fpath = join(
                    sample.varannotate_dirpath,
                    add_suffix(basename(sample.vcf), 'anno'))
            output_fpath = sample.anno_vcf_fpath
            if not output_fpath.endswith('.gz'):
                output_fpath += '.gz'
            debug('Checking ' + output_fpath)
            if cnf.reuse_intermediate and isfile(output_fpath) and verify_vcf(
                    output_fpath):
                info('Annotated results ' + output_fpath + ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            work_dir = join(cnf.work_dir,
                            source.varannotate_name + '_' + sample.name)
            j = submit_job(
                cnf,
                cmdline=varannotate_cmdl + ' --vcf ' + sample.vcf + ' -o ' +
                sample.varannotate_dirpath + ' -s ' + sample.name +
                ' --work-dir ' + work_dir + ' --output-file ' + output_fpath,
                job_name='VA_' + cnf.project_name + '_' + sample.name,
                output_fpath=output_fpath,
                stdout_to_outputfile=False,
                work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)
            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [
                    s for s in not_submitted_samples
                    if s not in submitted_samples and s not in reused_samples
                ]

                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) +
                         ' jobs, waiting for them to finish before '
                         'submitting ' + str(len(not_submitted_samples)) + ' more')
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break
            info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No annotation jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished annotating ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_vcf(
                    j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but j.work_dir ' + j.work_dir +
                        ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [
            s for s in not_submitted_samples
            if s not in submitted_samples and s not in reused_samples
        ]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()
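
For each sample the expected output path is derived from the input VCF name. A hypothetical walk-through, assuming `add_suffix` inserts a suffix before the file extension as its name suggests:

vcf_fname = basename('/project/S1/S1.vcf')            # 'S1.vcf'
anno_vcf_fpath = join(sample.varannotate_dirpath,
                      add_suffix(vcf_fname, 'anno'))  # '.../S1.anno.vcf'
output_fpath = anno_vcf_fpath + '.gz'                 # path checked for reuse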
Example #7
def _filter(cnf, samples, variants_fpath, variants_fname):
    # if cohort_mode:
    #     info('Running vcf2txt.pl in cohort mode')
    #     vcf2txt = get_script_cmdline(cnf, 'perl', 'vcf2txt', is_critical=True)
    #     vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in samples}
    #     cmdline = vcf2txt + ' ' + make_vcf2txt_cmdl_params(cnf, vcf_fpath_by_sample)
    #     res = run_vcf2txt_with_retries(cnf, cmdline, variants_fpath)
    #     if not res:
    #         critical('Error: vcf2txt.pl crashed')

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    cohort_freqs_fpath = None
    # if cnf.variant_filtering.max_ratio_vardict2mut < 1.0:
    #     cohort_freqs_fpath = join(cnf.work_dir, 'cohort_freqs.tsv')
    #     info('*' * 70)
    #     info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio_vardict2mut) + ', counting freqs in cohort')
    #     # cnf.variant_filtering.max_ratio < 1.0 or \
    #     # cnf.fraction < 1.0
    #     cohort_freqs_fpath = count_cohort_freqs(cnf, samples, cohort_freqs_fpath, max_ratio=cnf.variant_filtering.max_ratio_vardict2mut)
    #     info('*' * 70)
    # info()

    not_submitted_samples = samples
    while not_submitted_samples:
        reused_samples = []
        jobs_to_wait = []
        submitted_samples = []
        for sample in not_submitted_samples:
            output_dirpath = sample.varfilter_dirpath = join(
                sample.dirpath, source.varfilter_name)
            output_fpath = sample.variants_fpath = join(
                sample.varfilter_dirpath, variants_fname)
            pass_output_fpath = add_suffix(sample.variants_fpath,
                                           variant_filtering.mut_pass_suffix)

            if cnf.reuse_intermediate and check_filtering_results(output_fpath) \
                    and check_filtering_results(pass_output_fpath):
                info('Filtered results ' + output_fpath + ' and ' +
                     pass_output_fpath + ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            varfilter_py = 'varfilter'
            work_dir = join(cnf.work_dir, 'filt_' + sample.name)
            if not cnf.genome.dbsnp_multi_mafs:
                critical(
                    'Error: dbsnp_multi_mafs is not specified in the config ' +
                    cnf.sys_cnf)
            cmdl = (
                '{varfilter_py}' +
                ((' --sys-cnf ' + cnf.sys_cnf) if not cnf.filt_cnf else '') +
                ((' --run-cnf ' + cnf.run_cnf) if not cnf.filt_cnf else '') +
                ((' --filt-cnf ' + cnf.filt_cnf) if cnf.filt_cnf else '') +
                ' --vcf {sample.anno_vcf_fpath}' + ' --sample {sample.name}' +
                ' -o {output_dirpath}' +
                ' --output-file {sample.variants_fpath}' + ' --project-name ' +
                cnf.project_name + ' --genome {cnf.genome.name}' +
                ' --work-dir {work_dir}' + ' --debug ' +
                (' --cohort-freqs {cohort_freqs_fpath}' if cohort_freqs_fpath
                 else '') + (' --reuse ' if cnf.reuse_intermediate else '') +
                ((' --caller ' + cnf.caller) if cnf.caller else '') +
                (' --qc' if cnf.qc else ' --no-qc') +
                (' --no-tsv' if not cnf.tsv else '') + ' --dbsnp-multi-mafs ' +
                adjust_path(cnf.genome.dbsnp_multi_mafs)).format(**locals())
            with with_cnf(cnf, reuse_intermediate=False):
                j = submit_job(cnf,
                               cmdl,
                               job_name='_filt_' + sample.name,
                               output_fpath=pass_output_fpath,
                               stdout_to_outputfile=False,
                               work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)
            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [
                    s for s in not_submitted_samples
                    if s not in submitted_samples and s not in reused_samples
                ]
                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) +
                         ' jobs, waiting for them to finish before '
                         'submitting ' + str(len(not_submitted_samples)) + ' more')
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break
            info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No filtering jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished filtering ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_file(
                    j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed and not cnf.debug:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but ' + j.work_dir + ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [
            s for s in not_submitted_samples
            if s not in submitted_samples and s not in reused_samples
        ]
    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()

    info('Combining results...')
    vcf2txt_fpaths = [s.variants_fpath for s in samples]
    variants_fpath, pass_variants_fpath = combine_results(
        cnf, samples, vcf2txt_fpaths, variants_fpath)

    if cnf.qc:
        _summarize_varqc(cnf,
                         cnf.output_dir,
                         samples,
                         cnf.project_name,
                         post_filter=True)

    return variants_fpath, pass_variants_fpath
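
A hypothetical invocation, assuming annotation has already populated `anno_vcf_fpath` on each sample; the output file name is a placeholder:

variants_fpath, pass_variants_fpath = _filter(
    cnf, samples,
    variants_fpath=join(cnf.output_dir, 'variants.txt'),
    variants_fname='variants.txt')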
Example #8
def make_fastqc_reports(cnf, fastq_fpaths, output_dir):
    # if isdir(fastqc_dirpath):
    #     if isdir(fastqc_dirpath + '.bak'):
    #         try:
    #             shutil.rmtree(fastqc_dirpath + '.bak')
    #         except OSError:
    #             pass
    #     if not isdir(fastqc_dirpath + '.bak'):
    #         os.rename(fastqc_dirpath, fastqc_dirpath + '.bak')
    # if isdir(fastqc_dirpath):
    #     err('Could not run and combine fastqc because it already exists and could not be moved to fastqc.bak')
    #     return None

    fastqc = get_system_path(cnf, 'fastqc')
    if not fastqc:
        err('FastQC is not found, cannot make reports')
        return None

    else:
        safe_mkdir(output_dir)

        fqc_samples = []
        fastqc_jobs = []
        for fastq_fpath in fastq_fpaths:
            s = FQC_Sample(name=splitext_plus(basename(fastq_fpath))[0],
                           fastq_fpath=fastq_fpath)
            fqc_samples.append(s)
            info('Added sample ' + s.name)

        for fqc_s in fqc_samples:
            if cnf.reuse_intermediate and verify_file(fqc_s.fastqc_html_fpath,
                                                      silent=True):
                info(fqc_s.fastqc_html_fpath + ' exists, reusing')
            else:
                fastqc_jobs.append(
                    run_fastqc(cnf, fqc_s.fastq_fpath, fqc_s.name, output_dir))
            info()

        wait_for_jobs(cnf, fastqc_jobs)

        fastqc_jobs = []
        # while True:
        for fqc_s in fqc_samples:
            fqc_s.fastqc_html_fpath = find_fastqc_html(output_dir, fqc_s.name)
        not_done_fqc = [
            fqc_s for fqc_s in fqc_samples
            if not verify_file(fqc_s.fastqc_html_fpath,
                               description='FastQC html not found for ' +
                               fqc_s.name)
        ]
        # if not not_done_fqc:
        #     info('')
        #     info('Every FastQC job is done, moving on.')
        #     info('-' * 70)
        #     break
        # else:
        #     info('')
        #     info('Some FastQC jobs are not done (' + ', '.join(f.name for f in not_done_fqc) + '). Retrying them.')
        #     info('')
        #     for fqc_s in not_done_fqc:
        #         fastqc_jobs.append(run_fastqc(cnf, fqc_s.fastq_fpath, fqc_s.name, output_dir))
        #     wait_for_jobs(cnf, fastqc_jobs)

        for fqc_s in fqc_samples:
            sample_fastqc_dirpath = join(output_dir, fqc_s.name + '_fastqc')
            if isfile(sample_fastqc_dirpath + '.zip'):
                try:
                    os.remove(sample_fastqc_dirpath + '.zip')
                except OSError:
                    pass

        comb_fastqc_fpath = join(output_dir, 'fastqc.html')
        write_fastqc_combo_report(cnf, comb_fastqc_fpath, fqc_samples)
        verify_file(comb_fastqc_fpath, is_critical=True)
        info('Combined FastQC saved to ' + comb_fastqc_fpath)
        return comb_fastqc_fpath
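
A hypothetical invocation with a pair of FASTQ files (paths are placeholders):

fastq_fpaths = ['/data/S1_R1.fastq.gz',
                '/data/S1_R2.fastq.gz']
comb_fastqc_fpath = make_fastqc_reports(cnf, fastq_fpaths, '/output/fastqc')
if comb_fastqc_fpath:
    info('Combined report: ' + comb_fastqc_fpath)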
Example #9
def split_bam_files_use_grid(cnf, samples, combined_vcf_fpath,
                             exac_features_fpath):
    samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=False)
    samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=True)

    vcfs_by_chrom = dict()
    tabix = get_system_path(cnf, 'tabix')
    for chrom in chromosomes:
        vcf_fpath = join(cnf.work_dir, str(chrom) + '.vcf')
        cmdline = '{tabix} -h {combined_vcf_fpath} {chrom} > {vcf_fpath}'.format(
            **locals())
        call(cnf, cmdline)
        if verify_file(vcf_fpath):
            vcfs_by_chrom[chrom] = vcf_fpath

    output_dirpath = join(cnf.output_dir, 'combined_bams', cnf.project_name)
    safe_mkdir(output_dirpath)
    not_submitted_chroms = vcfs_by_chrom.keys()
    sample_names = ','.join(sample.name for sample in samples)
    sample_bams = ','.join(sample.bam for sample in samples)
    while not_submitted_chroms:
        jobs_to_wait = []
        submitted_chroms = []
        reused_chroms = []

        for chrom, vcf_fpath in vcfs_by_chrom.iteritems():
            if chrom not in not_submitted_chroms:
                continue
            output_fpaths = [
                join(
                    output_dirpath,
                    chrom.replace('chr', '') + '-' +
                    sample.name.replace('-', '_') + '.bam')
                for sample in samples
            ]
            if cnf.reuse_intermediate and all(
                    verify_file(output_fpath, silent=True)
                    for output_fpath in output_fpaths):
                info('BAM files for chromosome ' + chrom + ' exist, reusing')
                reused_chroms.append(chrom)
                continue
            else:
                # if exac_venv_pythonpath:  # to avoid compatibility problems with pysam and tabix
                #     cmdline = exac_venv_pythonpath + ' ' + get_system_path(cnf,
                #                                                             join('tools', 'split_bams_by_variants.py'))
                # else:
                cmdline = get_script_cmdline(cnf,
                                             'python',
                                             join('tools',
                                                  'split_bams_by_variants.py'),
                                             is_critical=True)
                cmdline += (
                    ' --chr {chrom} --vcf {vcf_fpath} --samples {sample_names} '
                    +
                    '--bams {sample_bams} -o {output_dirpath} --work-dir {cnf.work_dir} '
                    + '-g {cnf.genome.name} ').format(**locals())
                if cnf.reuse_intermediate:
                    cmdline += ' --reuse'
                if exac_features_fpath and verify_file(exac_features_fpath):
                    cmdline += ' --features ' + exac_features_fpath
                j = submit_job(cnf, cmdline, chrom + '_split')
                info()
                submitted_chroms.append(chrom)

                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    break
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        not_submitted_chroms = [
            chrom for chrom in not_submitted_chroms
            if chrom not in submitted_chroms and chrom not in reused_chroms
        ]
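
The per-chromosome output BAMs follow a fixed naming scheme: the 'chr' prefix is stripped and dashes in sample names become underscores. A hypothetical helper capturing the same rule:

def split_bam_fpath(output_dirpath, chrom, sample_name):
    # e.g. chrom='chr7', sample_name='TCGA-AB-1234'
    # -> '<output_dirpath>/7-TCGA_AB_1234.bam'
    return join(output_dirpath,
                chrom.replace('chr', '') + '-' +
                sample_name.replace('-', '_') + '.bam')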
Example #10
def dedup_and_sort_bams_use_grid(cnf, samples, do_sort=False):
    not_submitted_samples = list(samples)
    done_samples = []
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []

        for sample in not_submitted_samples:
            if do_sort:
                output_bam_fpath = join(cnf.work_dir,
                                        sample.name + '.dedup.sorted.bam')
            else:
                output_bam_fpath = join(cnf.work_dir,
                                        sample.name + '.dedup.bam')

            if cnf.reuse_intermediate and verify_file(output_bam_fpath,
                                                      silent=True):
                info(output_bam_fpath + ' exists, reusing')
                sample.bam = output_bam_fpath
                done_samples.append(sample)
                reused_samples.append(sample)
                continue
            else:
                if do_sort:
                    cmdline = 'sort {sample.bam} -o {output_bam_fpath}'.format(
                        **locals())
                    j = call_sambamba(cnf,
                                      cmdline,
                                      output_fpath=output_bam_fpath,
                                      bam_fpath=sample.bam,
                                      use_grid=True,
                                      command_name='sort',
                                      sample_name=sample.name,
                                      stdout_to_outputfile=False)
                else:
                    cmdline = 'view -f bam -F "not duplicate and not failed_quality_control" {sample.bam}'.format(
                        **locals())
                    j = call_sambamba(cnf,
                                      cmdline,
                                      output_fpath=output_bam_fpath,
                                      bam_fpath=sample.bam,
                                      use_grid=True,
                                      command_name='dedup',
                                      sample_name=sample.name)
                info()
                sample.bam = output_bam_fpath
                done_samples.append(sample)
                submitted_samples.append(sample)

                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    break
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        not_submitted_samples = [
            sample for sample in not_submitted_samples
            if sample not in submitted_samples and sample not in reused_samples
        ]
    return done_samples
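
As Example #9 shows, the function is typically called twice: a dedup pass, then a sort pass over the deduplicated BAMs; each call repoints `sample.bam` at the newly written file:

samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=False)  # dedup
samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=True)   # then sort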