def _submit_job(cnf,
                step,
                sample_name='',
                wait_for_steps=None,
                threads=1,
                is_critical=True,
                **kwargs):
    tool_cmdline = get_system_path(cnf,
                                   step.interpreter,
                                   step.script,
                                   is_critical=is_critical)
    if not tool_cmdline:
        return False

    kwargs['sample_name'] = sample_name
    cmdline = tool_cmdline + ' ' + step.param_line.format(**kwargs)

    info(step.name)

    job = submit_job(cnf,
                     cmdline,
                     job_name=step.job_name(sample_name),
                     wait_for_steps=wait_for_steps,
                     threads=threads)

    info()
    return job
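
A minimal usage sketch for _submit_job. The Step class below is hypothetical (the real pipeline defines its step objects elsewhere); it only models the attributes this function relies on: interpreter, script, param_line, and a job_name() method. Extra keyword arguments are substituted into param_line via str.format.

class Step:
    """Hypothetical stand-in for the pipeline's step objects."""
    def __init__(self, name, interpreter, script, param_line):
        self.name = name
        self.interpreter = interpreter
        self.script = script
        self.param_line = param_line

    def job_name(self, sample_name):
        return self.name + '_' + sample_name

# 'cnf' is the pipeline config object; 'bam' is consumed by param_line.
step = Step('VarQC', 'python', join('scripts', 'varqc.py'),
            '--sample {sample_name} --bam {bam}')
j = _submit_job(cnf, step, sample_name='sample1', bam='sample1.bam')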
Example #2
def call_sambamba(cnf,
                  cmdl,
                  bam_fpath,
                  output_fpath=None,
                  sambamba=None,
                  use_grid=False,
                  command_name='',
                  sample_name=None,
                  silent=False,
                  stdout_to_outputfile=True):
    sambamba = sambamba or get_system_path(
        cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    sample_name = sample_name or basename(bam_fpath).split('.')[0]
    if use_grid:
        grid_sambamba = get_script_cmdline(
            cnf, 'python', join('tools', 'bed_processing', 'sambamba.py'))
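        # Assumption: wrap embedded double quotes in a __QUOTE__ sentinel so
        # that quoted sambamba filter expressions survive the extra shell
        # layer of grid submission (restored on the worker side, presumably
        # by tools/bed_processing/sambamba.py).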
        cmdl = cmdl.replace(' "', ' \'\"__QUOTE__')
        cmdl = cmdl.replace('" ', '__QUOTE__\"\' ')
        grid_cmdl = grid_sambamba + ' ' + bam_fpath + ' ' + sambamba + ' ' + cmdl
        job_name = command_name + '_' + sample_name
        j = submit_job(cnf,
                       grid_cmdl,
                       job_name=job_name,
                       output_fpath=output_fpath,
                       stdout_to_outputfile=stdout_to_outputfile)
        info()
        return j
    else:
        index_bam(cnf, bam_fpath, sambamba=sambamba)
        cmdl = sambamba + ' ' + cmdl
        stderr_dump = []
        res = call(cnf,
                   cmdl,
                   output_fpath=output_fpath,
                   exit_on_error=False,
                   stderr_dump=stderr_dump,
                   stdout_to_outputfile=stdout_to_outputfile,
                   silent=silent,
                   print_stderr=not silent)
        if not res:
            for l in stderr_dump:
                if 'sambamba-view: BAM index file (.bai) must be provided' in l:
                    if isfile(bam_fpath + '.bai'):
                        info('Removing .bai and re-indexing...')
                        os.remove(bam_fpath + '.bai')
                    index_bam(cnf, bam_fpath, sambamba)
                    res = call(cnf, cmdl, output_fpath=output_fpath,
                               stdout_to_outputfile=stdout_to_outputfile)
        return res
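
A usage sketch with hypothetical paths; cnf is the pipeline config object. With use_grid=False the command runs locally and, since stdout_to_outputfile defaults to True, sambamba's stdout is redirected into output_fpath.

# Count reads that are not unmapped; the filter string is sambamba's own syntax.
res = call_sambamba(cnf,
                    cmdl='view -c -F "not unmapped" /data/sample1.bam',
                    bam_fpath='/data/sample1.bam',
                    output_fpath='/data/sample1.mapped_count.txt',
                    command_name='count_mapped')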
Example #3
def run_fastqc(cnf,
               fastq_fpath,
               output_basename,
               fastqc_dirpath,
               need_downsample=True):
    fastqc = get_system_path(cnf, 'fastqc', is_critical=True)
    java = get_system_path(cnf, 'java', is_critical=True)
    tmp_dirpath = join(cnf.work_dir, 'FastQC_' + output_basename + '_tmp')
    safe_mkdir(tmp_dirpath)
    cmdline_l = '{fastqc} --dir {tmp_dirpath} --extract -o {fastqc_dirpath} -f fastq -j {java} {fastq_fpath}'.format(
        **locals())
    j = submit_job(cnf,
                   cmdline_l,
                   'FastQC_' + output_basename,
                   run_on_chara=True,
                   stdout_to_outputfile=False)
    # output_fpath=join(fastqc_dirpath, output_basename + '_fastqc', 'fastqc_report.html'))
    return j
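
A hypothetical invocation (paths assumed). The returned job can be passed to wait_for_jobs, as in the later examples.

j = run_fastqc(cnf, 'sample1_R1.fastq.gz', 'sample1_R1',
               join(cnf.output_dir, 'fastqc'))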
Example #4
def _submit_region_cov(cnf, work_dir, chrom, bam_fpaths, sample_names,
                       output_dirpath, chr_len_fpath):
    if not bam_fpaths or not sample_names:
        return None

    cmdline = get_script_cmdline(cnf,
                                 'python',
                                 join('tools', 'get_region_coverage.py'),
                                 is_critical=True)
    cmdline += (' --chr ' + chrom + ' --bams ' + bam_fpaths + ' --samples ' +
                sample_names + ' -o ' + output_dirpath + ' -g ' +
                chr_len_fpath + ' --work-dir ' + work_dir)
    if cnf.bed:
        cmdline += ' --bed ' + cnf.bed
    return submit_job(
        cnf, cmdline,
        chrom + '_coverage_' + ('project' if
                                (',' in sample_names) else sample_names))
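
A usage sketch with hypothetical paths. Note that bam_fpaths and sample_names are comma-joined strings rather than lists, and the job name ends in '_project' whenever more than one sample is passed.

j = _submit_region_cov(cnf,
                       work_dir=cnf.work_dir,
                       chrom='chr21',
                       bam_fpaths='s1.bam,s2.bam',
                       sample_names='s1,s2',
                       output_dirpath=cnf.output_dir,
                       chr_len_fpath='hg19.chrom.sizes')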
Example #5
def index_bam_grid(cnf, bam_fpath, sambamba=None):
    indexed_bam = bam_fpath + '.bai'
    if not isfile(indexed_bam) or getctime(indexed_bam) < getctime(bam_fpath):
        info('Indexing BAM, writing ' + indexed_bam + '...')
        sambamba = sambamba or get_system_path(
            cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
        cmdline = '{sambamba} index {bam_fpath}'.format(**locals())
        j = submit_job(cnf,
                       cmdline,
                       basename(bam_fpath) + '_index',
                       output_fpath=indexed_bam,
                       stdout_to_outputfile=False)
        info()
        return j
    else:
        debug('An up-to-date .bai index already exists.')
        return None
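
A usage sketch: submit the indexing job and block until it finishes. A None return means an up-to-date index already exists; wait_for_jobs is the same helper used in the examples below.

j = index_bam_grid(cnf, '/data/sample1.bam')
if j is not None:
    wait_for_jobs(cnf, [j])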
Example #6
def _annotate(cnf, samples):
    varannotate_cmdl = (get_script_cmdline(
        cnf, 'python', join('scripts', 'post', 'varannotate.py')) +
                        ' --sys-cnf ' + cnf.sys_cnf + ' --run-cnf ' +
                        cnf.run_cnf + ' --project-name ' + cnf.project_name +
                        (' --reuse ' if cnf.reuse_intermediate else '') +
                        ' --log-dir -' + ' --genome ' + cnf.genome.name +
                        (' --no-check ' if cnf.no_check else '') +
                        (' --qc' if cnf.qc else ' --no-qc') +
                        ((' --caller ' + cnf.caller) if cnf.caller else ''))

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []
        for sample in not_submitted_samples:
            if not sample.varannotate_dirpath:
                sample.varannotate_dirpath = join(sample.dirpath,
                                                  source.varannotate_name)
            if not sample.anno_vcf_fpath:
                sample.anno_vcf_fpath = join(
                    sample.varannotate_dirpath,
                    add_suffix(basename(sample.vcf), 'anno'))
            output_fpath = sample.anno_vcf_fpath
            if not output_fpath.endswith('.gz'):
                output_fpath += '.gz'
            debug('Checking ' + output_fpath)
            if cnf.reuse_intermediate and isfile(output_fpath) and verify_vcf(
                    output_fpath):
                info('Annotated results ' + output_fpath + ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            work_dir = join(cnf.work_dir,
                            source.varannotate_name + '_' + sample.name)
            j = submit_job(
                cnf,
                cmdline=varannotate_cmdl + ' --vcf ' + sample.vcf + ' -o ' +
                sample.varannotate_dirpath + ' -s ' + sample.name +
                ' --work-dir ' + work_dir + ' --output-file ' + output_fpath,
                job_name='VA_' + cnf.project_name + '_' + sample.name,
                output_fpath=output_fpath,
                stdout_to_outputfile=False,
                work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)
            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [
                    s for s in not_submitted_samples
                    if s not in submitted_samples and s not in reused_samples
                ]

                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) +
                         ' jobs, waiting for them to finish before '
                         'submitting the remaining ' +
                         str(len(not_submitted_samples)))
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break
            info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No annotation jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished ' + str(len(jobs_to_wait)) + ' annotation jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_vcf(
                    j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but j.work_dir ' + j.work_dir +
                        ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [
            s for s in not_submitted_samples
            if s not in submitted_samples and s not in reused_samples
        ]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()
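
The loop above follows a throttled-submission pattern: submit at most cnf.threads jobs, wait for that batch, then continue with the samples that were neither submitted nor reused. A distilled, self-contained sketch of the pattern, with hypothetical submit/wait_all callables standing in for submit_job/wait_for_jobs:

def run_throttled(items, submit, wait_all, max_parallel):
    # Drain the queue in batches of max_parallel, waiting for each
    # batch to finish before submitting the next, as _annotate() does.
    pending = list(items)
    while pending:
        batch, pending = pending[:max_parallel], pending[max_parallel:]
        wait_all([submit(item) for item in batch])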
Example #7
def _filter(cnf, samples, variants_fpath, variants_fname):
    # if cohort_mode:
    #     info('Running vcf2txt.pl in cohort mode')
    #     vcf2txt = get_script_cmdline(cnf, 'perl', 'vcf2txt', is_critical=True)
    #     vcf_fpath_by_sample = {s.name: s.anno_vcf_fpath for s in samples}
    #     cmdline = vcf2txt + ' ' + make_vcf2txt_cmdl_params(cnf, vcf_fpath_by_sample)
    #     res = run_vcf2txt_with_retries(cnf, cmdline, variants_fpath)
    #     if not res:
    #         critical('Error: vcf2txt.pl crashed')

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    cohort_freqs_fpath = None
    # if cnf.variant_filtering.max_ratio_vardict2mut < 1.0:
    #     cohort_freqs_fpath = join(cnf.work_dir, 'cohort_freqs.tsv')
    #     info('*' * 70)
    #     info('Max ratio set to ' + str(cnf.variant_filtering.max_ratio_vardict2mut) + ', counting freqs in cohort')
    #     # cnf.variant_filtering.max_ratio < 1.0 or \
    #     # cnf.fraction < 1.0
    #     cohort_freqs_fpath = count_cohort_freqs(cnf, samples, cohort_freqs_fpath, max_ratio=cnf.variant_filtering.max_ratio_vardict2mut)
    #     info('*' * 70)
    # info()

    not_submitted_samples = samples
    while not_submitted_samples:
        reused_samples = []
        jobs_to_wait = []
        submitted_samples = []
        for sample in not_submitted_samples:
            output_dirpath = sample.varfilter_dirpath = join(
                sample.dirpath, source.varfilter_name)
            output_fpath = sample.variants_fpath = join(
                sample.varfilter_dirpath, variants_fname)
            pass_output_fpath = add_suffix(sample.variants_fpath,
                                           variant_filtering.mut_pass_suffix)

            if cnf.reuse_intermediate and check_filtering_results(output_fpath) \
                    and check_filtering_results(pass_output_fpath):
                info('Filtered results ' + output_fpath + ' and ' +
                     pass_output_fpath + ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            varfilter_py = 'varfilter'
            work_dir = join(cnf.work_dir, 'filt_' + sample.name)
            if not cnf.genome.dbsnp_multi_mafs:
                critical(
                    'Error: dbsnp_multi_mafs is not specified in the config ' +
                    cnf.sys_cnf)
            cmdl = (
                '{varfilter_py}' +
                ((' --sys-cnf ' + cnf.sys_cnf) if not cnf.filt_cnf else '') +
                ((' --run-cnf ' + cnf.run_cnf) if not cnf.filt_cnf else '') +
                ((' --filt-cnf ' + cnf.filt_cnf) if cnf.filt_cnf else '') +
                ' --vcf {sample.anno_vcf_fpath}' + ' --sample {sample.name}' +
                ' -o {output_dirpath}' +
                ' --output-file {sample.variants_fpath}' + ' --project-name ' +
                cnf.project_name + ' --genome {cnf.genome.name}' +
                ' --work-dir {work_dir}' + ' --debug ' +
                (' --cohort-freqs {cohort_freqs_fpath}' if cohort_freqs_fpath
                 else '') + (' --reuse ' if cnf.reuse_intermediate else '') +
                ((' --caller ' + cnf.caller) if cnf.caller else '') +
                (' --qc' if cnf.qc else ' --no-qc') +
                (' --no-tsv' if not cnf.tsv else '') + ' --dbsnp-multi-mafs ' +
                adjust_path(cnf.genome.dbsnp_multi_mafs)).format(**locals())
            with with_cnf(cnf, reuse_intermediate=False):
                j = submit_job(cnf,
                               cmdl,
                               job_name='_filt_' + sample.name,
                               output_fpath=pass_output_fpath,
                               stdout_to_outputfile=False,
                               work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)
            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [
                    s for s in not_submitted_samples
                    if s not in submitted_samples and s not in reused_samples
                ]
                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) +
                         ' jobs, waiting for them to finish before '
                         'submitting the remaining ' +
                         str(len(not_submitted_samples)))
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break
            info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No filtering jobs to submit.')
        info('')
        info('-' * 70)
        info('Finished ' + str(len(jobs_to_wait)) + ' filtering jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_file(
                    j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed and not cnf.debug:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but ' + j.work_dir + ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [
            s for s in not_submitted_samples
            if s not in submitted_samples and s not in reused_samples
        ]
    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()

    info('Combining results...')
    vcf2txt_fpaths = [s.variants_fpath for s in samples]
    variants_fpath, pass_variants_fpath = combine_results(
        cnf, samples, vcf2txt_fpaths, variants_fpath)

    if cnf.qc:
        _summarize_varqc(cnf,
                         cnf.output_dir,
                         samples,
                         cnf.project_name,
                         post_filter=True)

    return variants_fpath, pass_variants_fpath
Example #8
def split_bam_files_use_grid(cnf, samples, combined_vcf_fpath,
                             exac_features_fpath):
    samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=False)
    samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=True)

    vcfs_by_chrom = dict()
    tabix = get_system_path(cnf, 'tabix')
    for chrom in chromosomes:
        vcf_fpath = join(cnf.work_dir, str(chrom) + '.vcf')
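        # Note: tabix -h expects a bgzipped, tabix-indexed VCF; it extracts
        # the records of one chromosome together with the header.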
        cmdline = '{tabix} -h {combined_vcf_fpath} {chrom} > {vcf_fpath}'.format(
            **locals())
        call(cnf, cmdline)
        if verify_file(vcf_fpath):
            vcfs_by_chrom[chrom] = vcf_fpath

    output_dirpath = join(cnf.output_dir, 'combined_bams', cnf.project_name)
    safe_mkdir(output_dirpath)
    not_submitted_chroms = list(vcfs_by_chrom.keys())
    sample_names = ','.join(sample.name for sample in samples)
    sample_bams = ','.join(sample.bam for sample in samples)
    while not_submitted_chroms:
        jobs_to_wait = []
        submitted_chroms = []
        reused_chroms = []

        for chrom, vcf_fpath in vcfs_by_chrom.items():
            if chrom not in not_submitted_chroms:
                continue
            output_fpaths = [
                join(output_dirpath,
                     chrom.replace('chr', '') + '-' +
                     sample.name.replace('-', '_') + '.bam')
                for sample in samples
            ]
            if cnf.reuse_intermediate and all(
                    verify_file(output_fpath, silent=True)
                    for output_fpath in output_fpaths):
                info('BAM files for ' + chrom + ' chromosome exist, reusing')
                reused_chroms.append(chrom)
                continue
            else:
                # if exac_venv_pythonpath:  # to avoid compatibility problems with pysam and tabix
                #     cmdline = exac_venv_pythonpath + ' ' + get_system_path(cnf,
                #                                                             join('tools', 'split_bams_by_variants.py'))
                # else:
                cmdline = get_script_cmdline(cnf,
                                             'python',
                                             join('tools',
                                                  'split_bams_by_variants.py'),
                                             is_critical=True)
                cmdline += (
                    ' --chr {chrom} --vcf {vcf_fpath} --samples {sample_names} '
                    '--bams {sample_bams} -o {output_dirpath} '
                    '--work-dir {cnf.work_dir} -g {cnf.genome.name} '
                ).format(**locals())
                if cnf.reuse_intermediate:
                    cmdline += ' --reuse'
                if exac_features_fpath and verify_file(exac_features_fpath):
                    cmdline += ' --features ' + exac_features_fpath
                j = submit_job(cnf, cmdline, chrom + '_split')
                info()
                submitted_chroms.append(chrom)

                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    break
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        not_submitted_chroms = [
            chrom for chrom in not_submitted_chroms
            if chrom not in submitted_chroms and chrom not in reused_chroms
        ]
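
A usage sketch with hypothetical inputs; as noted above, the combined VCF must be bgzipped and tabix-indexed for the per-chromosome extraction.

split_bam_files_use_grid(cnf, samples,
                         combined_vcf_fpath='cohort.vcf.gz',
                         exac_features_fpath=None)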