Example #1
def _read_vcf_records_per_bed_region_and_clip_vcf(cnf, vcf_fpath, bed_fpath,
                                                  region_type, sample):
    info()
    info('Intersecting VCF ' + vcf_fpath + ' using BED ' + bed_fpath)

    bed_columns_num = count_bed_cols(bed_fpath)

    vcf_bed_intersect = join(
        cnf.work_dir,
        splitext(basename(vcf_fpath))[0] + '_' + region_type +
        '_vcf_bed.intersect')
    bedtools = get_system_path(cnf, 'bedtools')
    if not cnf.reuse_intermediate or not verify_file(
            vcf_bed_intersect, silent=True, is_critical=False):
        cmdline = '{bedtools} intersect -header -a {vcf_fpath} -b {bed_fpath} -wo'.format(
            **locals())
        res = call(cnf,
                   cmdline,
                   output_fpath=vcf_bed_intersect,
                   max_number_of_tries=1,
                   exit_on_error=False)
        if not res:
            return None, None, None, None

    regions_in_order = []
    regions_set = set()
    vars_by_region = defaultdict(dict)
    var_by_site = dict()

    clipped_vcf_fpath = intermediate_fname(cnf,
                                           splitext(basename(vcf_fpath))[0],
                                           '_' + region_type + '_clip')

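    # Parse the `bedtools intersect -wo` output: the first 8 columns are the
    # VCF record, the BED columns follow, and the last column is the overlap
    # length in bp.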
    with open(vcf_bed_intersect) as f, open(clipped_vcf_fpath,
                                            'w') as clip_vcf:
        for l in f:
            l = l.strip()
            if not l or l.startswith('#'):
                clip_vcf.write(l + '\n')
                continue
            fs = l.split('\t')
            chrom, pos, id_, ref, alt, qual, filt, info_fields = fs[:8]
            chrom_b, start_b, end_b, symbol, strand, feature, biotype = None, None, None, None, None, None, None
            if bed_columns_num >= 8:
                chrom_b, start_b, end_b, symbol, _, strand, feature, biotype, _ = fs[
                    -(bed_columns_num + 1):][:9]
            elif bed_columns_num >= 4:
                chrom_b, start_b, end_b, symbol, _ = fs[-(bed_columns_num +
                                                          1):][:5]
            assert chrom == chrom_b, l
            r = chrom, id_, start_b, end_b, symbol, strand, feature, biotype
            if r not in regions_set:
                regions_set.add(r)
                regions_in_order.append(r)

            cls = None
            if '=Hotspot' in info_fields: cls = 'Hotspot'
            if '=Deleterious' in info_fields: cls = 'Deleterious'
            if cls:
                var = Variant(chrom, pos, ref, alt, cls)
                vars_by_region[r][(chrom, pos, ref, alt)] = var
                var_by_site[(chrom, pos, ref, alt)] = var
                clip_vcf.write('\t'.join(
                    [chrom, pos, id_, ref, alt, qual, filt, info_fields]) +
                               '\n')

    clipped_gz_vcf_fpath = bgzip_and_tabix(cnf,
                                           clipped_vcf_fpath,
                                           max_number_of_tries=1,
                                           exit_on_error=False)

    return clipped_gz_vcf_fpath, regions_in_order, vars_by_region, var_by_site
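Note: the Variant class used above is not defined in this snippet. A minimal stand-in, assuming it only needs to carry the five values passed in (a hypothetical sketch, not the project's actual class):

from collections import namedtuple

# Hypothetical minimal Variant; the real class may carry more fields and methods.
Variant = namedtuple('Variant', ['chrom', 'pos', 'ref', 'alt', 'cls'])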
Example #2
def _run_multisample_qualimap(cnf, output_dir, samples, targqc_full_report):
    """ 1. Generates Qualimap2 plots and put into plots_dirpath
        2. Adds records to targqc_full_report.plots
    """
    plots_dirpath = join(output_dir, 'plots')
    if cnf.reuse_intermediate and verify_dir(plots_dirpath) and [
            f for f in listdir(plots_dirpath) if not f.startswith('.')
    ]:
        info('Qualimap multisample plots exist - ' + plots_dirpath +
             ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if len(
            [s.qualimap_html_fpath
             for s in samples if s.qualimap_html_fpath]) > 0:
            qualimap = get_system_path(cnf,
                                       interpreter_or_name=None,
                                       name='qualimap')

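            # multi-bamqc mode is only available in the full Qualimap 2 build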
            if qualimap is not None and get_qualimap_type(qualimap) == 'full':
                qualimap_output_dir = join(cnf.work_dir,
                                           'qualimap_multi_bamqc')

                _correct_qualimap_genome_results(cnf, samples)
                _correct_qualimap_insert_size_histogram(cnf, samples)

                safe_mkdir(qualimap_output_dir)
                rows = []
                for sample in samples:
                    if sample.qualimap_html_fpath:
                        rows += [[sample.name, sample.qualimap_html_fpath]]

                data_fpath = write_tsv_rows(
                    rows,
                    join(qualimap_output_dir,
                         'qualimap_results_by_sample.tsv'))
                qualimap_plots_dirpath = join(qualimap_output_dir,
                                              'images_multisampleBamQcReport')
                cmdline = '{qualimap} multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(
                    **locals())
                res = call(cnf,
                           cmdline,
                           exit_on_error=False,
                           return_err_code=True,
                           env_vars=dict(DISPLAY=None),
                           output_fpath=qualimap_plots_dirpath,
                           output_is_dir=True)
                if res is None or not verify_dir(qualimap_plots_dirpath):
                    warn(
                        'Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.'
                    )
                    return None
                else:
                    if exists(plots_dirpath):
                        shutil.rmtree(plots_dirpath)
                    shutil.move(qualimap_plots_dirpath, plots_dirpath)
            else:
                warn(
                    'Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.'
                )
                return None

    targqc_full_report.plots = []
    for plot_fpath in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fpath)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))
Example #3
def _fix_bam_for_picard(cnf, bam_fpath):
    def __process_problem_read_aligns(read_aligns):
        # each alignment: 0:NAME 1:FLAG 2:CHR 3:COORD 4:MAPQUAL 5:CIGAR 6:MATE_CHR 7:MATE_COORD TLEN SEQ ...
        def __get_key(align):
            return align.split('\t')[2] + '@' + align.split('\t')[3]

        def __get_mate_key(align):
            # mate key: RNEXT ('=' means same chromosome as RNAME) + '@' + PNEXT
            fields = align.split('\t')
            return (fields[6] if fields[6] != '=' else fields[2]) + '@' + fields[7]

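        # index alignments by their CHR@COORD key so that each alignment's
        # mate can be looked up directly by its RNEXT@PNEXT key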
        chr_coord = OrderedDict()
        for align in read_aligns:
            key = __get_key(align)
            if key not in chr_coord:
                chr_coord[key] = []
            chr_coord[key].append(align)
        correct_pairs = []
        for align in read_aligns:
            mate_key = __get_mate_key(align)
            if mate_key in chr_coord:
                for pair_align in chr_coord[mate_key]:
                    if read_aligns.index(pair_align) <= read_aligns.index(
                            align):
                        continue
                    if __get_mate_key(pair_align) == __get_key(align):
                        correct_pairs.append((align, pair_align))
        if not correct_pairs:
            return []
        if len(correct_pairs) > 1:
            # sort by the total mapping quality of both alignments (cast to int:
            # concatenating the string fields would compare lexicographically)
            correct_pairs.sort(key=lambda pair: int(pair[0].split('\t')[4]) +
                               int(pair[1].split('\t')[4]),
                               reverse=True)
        return [correct_pairs[0][0], correct_pairs[0][1]]

    samtools = get_system_path(cnf, 'samtools')
    try:
        import pysam
        without_pysam = False
    except ImportError:
        without_pysam = True

    # find reads presented more than twice in input BAM
    if without_pysam:
        qname_sorted_sam_fpath = intermediate_fname(
            cnf, bam_fpath, 'qname_sorted')[:-len('bam')] + 'sam'
        # queryname sorting; output is SAM
        cmdline = '{samtools} view {bam_fpath} | sort '.format(**locals())
        call(cnf, cmdline, qname_sorted_sam_fpath)
        qname_sorted_file = open(qname_sorted_sam_fpath, 'r')
    else:
        qname_sorted_bam_fpath = intermediate_fname(cnf, bam_fpath,
                                                    'qname_sorted')
        # queryname sorting (-n), to stdout (-o), 'prefix' is not used; output is BAM
        cmdline = '{samtools} sort -n -o {bam_fpath} prefix'.format(**locals())
        call(cnf, cmdline, qname_sorted_bam_fpath)
        qname_sorted_file = pysam.Samfile(qname_sorted_bam_fpath, 'rb')
    problem_reads = dict()
    cur_read_aligns = []
    for line in qname_sorted_file:
        line = str(line)
        if cur_read_aligns:
            if line.split('\t')[0] != cur_read_aligns[0].split('\t')[0]:
                if len(cur_read_aligns) > 2:
                    problem_reads[cur_read_aligns[0].split('\t')
                                  [0]] = cur_read_aligns
                cur_read_aligns = []
        cur_read_aligns.append(line)
    if len(cur_read_aligns) > 2:
        problem_reads[cur_read_aligns[0].split('\t')[0]] = cur_read_aligns
    qname_sorted_file.close()

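    # for each problematic read, keep only the mutually-matching pair with
    # the highest total mapping quality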
    for read_id, read_aligns in problem_reads.items():
        problem_reads[read_id] = __process_problem_read_aligns(read_aligns)

    # correct input BAM
    fixed_bam_fpath = intermediate_fname(cnf, bam_fpath, 'fixed_for_picard')
    fixed_sam_fpath = fixed_bam_fpath[:-len('bam')] + 'sam'
    if without_pysam:
        sam_fpath = intermediate_fname(cnf, bam_fpath,
                                       'tmp')[:-len('bam')] + 'sam'
        cmdline = '{samtools} view -h {bam_fpath}'.format(**locals())
        call(cnf, cmdline, sam_fpath)
        input_file = open(sam_fpath, 'r')
        fixed_file = open(fixed_sam_fpath, 'w')
    else:
        input_file = pysam.Samfile(bam_fpath, 'rb')
        fixed_file = pysam.Samfile(fixed_bam_fpath, 'wb', template=input_file)
    for line in input_file:
        if without_pysam and line.startswith('@'):  # header
            fixed_file.write(line)
            continue
        read_name = str(line).split('\t')[0]
        if read_name in problem_reads and str(
                line) not in problem_reads[read_name]:
            continue
        fixed_file.write(line)
    input_file.close()
    fixed_file.close()
    if without_pysam:
        cmdline = '{samtools} view -bS {fixed_sam_fpath}'.format(**locals())
        call(cnf, cmdline, fixed_bam_fpath)

    return fixed_bam_fpath
Example #4
def intersect_regions(cnf, bcbio_structures, all_regions, min_samples):
    all_regions_fname = 'all_regions.bed'
    all_regions_bed_fpath = join(
        cnf.output_dir,
        add_suffix(all_regions_fname, str(cnf.min_depth))
        if cnf.min_depth else all_regions_fname)

    with open(all_regions_bed_fpath, 'w') as out:
        if not cnf.min_depth:
            out.write(
                '## Coverage threshold Nx is 10x for cell line and 100x for plasma\n'
            )
        else:
            out.write('## Coverage threshold Nx is ' + str(cnf.min_depth) +
                      'x\n')
        out.write('\t'.join([
            '#Chr', 'Start', 'End', 'Size', 'Gene', 'Depth<Nx',
            'SamplesSharingSameFeature'
        ]) + '\n')
        for region in all_regions:
            out.write('\t'.join([str(val) for val in region]) + '\n')

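    # region tuple -> {overlapping file name: [overlap lengths in bp]}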
    regions_overlaps = defaultdict(lambda: defaultdict(list))
    regions = []
    if cnf.tricky_regions:
        intersection_fpath = _intersect_with_tricky_regions(
            cnf, all_regions_bed_fpath, 'samples')
    else:
        bed_fpath = cnf.bed
        intersection_fpath = join(
            cnf.work_dir,
            splitext(basename(all_regions_bed_fpath))[0] + '_bed.intersect')
        bedtools = get_system_path(cnf, 'bedtools')
        if not cnf.reuse_intermediate or not verify_file(
                intersection_fpath, silent=True, is_critical=False):
            cmdline = '{bedtools} intersect -header -a {all_regions_bed_fpath} -b {bed_fpath} -wo'.format(
                **locals())
            res = call(cnf,
                       cmdline,
                       output_fpath=intersection_fpath,
                       max_number_of_tries=1,
                       exit_on_error=False)
            if not res:
                return None

    with open(intersection_fpath) as f:
        for l in f:
            l = l.strip()
            if not l or l.startswith('#'):
                continue
            fs = l.split('\t')
            chrom, start, end, size, symbol, pct_depth, num_samples = fs[:7]
            overlap_bps = int(fs[-1])
            r = (chrom, start, end, size, symbol, pct_depth, num_samples)
            if cnf.tricky_regions:
                filename = tricky_regions_fnames_d[basename(
                    fs[7]).split('.')[0]]
                regions_overlaps[r][filename].append(overlap_bps)
            else:
                regions_overlaps[r][basename(cnf.bed)].append(overlap_bps)
    for r in all_regions:
        if r in regions_overlaps:
            chrom, start, end, size, symbol, pct_depth, num_samples = r
            overlaps_txt = ', '.join(
                fname + ': %.0f' %
                (sum(regions_overlaps[r][fname]) / float(size) * 100) + '%'
                for fname in regions_overlaps[r])
            r = list(r)
            r.append(overlaps_txt)
        else:
            r = list(r)
            r.append('')
        regions.append(r)
    os.remove(intersection_fpath)
    return regions
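Note: tricky_regions_fnames_d is a module-level lookup that is not shown here; it maps a tricky-regions BED file stem (basename(fs[7]).split('.')[0]) to a display name. A hypothetical shape:

# Hypothetical entries; the real mapping depends on the tricky-regions
# BED files shipped with the pipeline.
tricky_regions_fnames_d = {
    'low_complexity': 'Low complexity regions',
    'bad_promoters': 'Bad promoters',
}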
Example #5
def main():
    cnf, output_dir, fastq_fpaths = proc_opts()

    targqc_dirpath = output_dir

    fastqs_by_sample = find_fastq_pairs(fastq_fpaths)
    samples = []
    for sname, (l, r) in fastqs_by_sample.items():
        s = source.TargQC_Sample(sname, join(cnf.output_dir, sname))
        s.l_fpath = l
        s.r_fpath = r
        samples.append(s)

    threads = len(samples)
    info('Found ' + str(len(samples)) + ' samples.')
    if len(samples) == 0:
        critical('ERROR: No fastq pairs found.')
    info()

    # samples = [source.TargQC_Sample(
    #     s.name,
    #     dirpath=join(targqc_dirpath, s.name),
    #     bed=cnf.bed) for s in fastq_fpaths]

    if cnf.downsample_to == 0:
        lefts = [s.l_fpath for s in samples]
        rights = [s.r_fpath for s in samples]
    else:
        if cnf.downsample_to is None:
            downsample_to = int(5e5)
        else:
            downsample_to = cnf.downsample_to

        info('Downsampling the reads to ' + str(downsample_to))
        lefts, rights = downsample_fastq(cnf, samples, downsample_to)

    bam_by_sample = OrderedDict()
    sambamba = get_system_path(cnf,
                               join(get_ext_tools_dirname(), 'sambamba'),
                               is_critical=True)
    bwa = get_system_path(cnf, 'bwa')
    bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
    if sambamba and bwa and bammarkduplicates:
        info()
        info('Aligning reads to the reference')
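        # one joblib worker per sample; each call gets its own config copy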
        bam_fpaths = Parallel(n_jobs=threads)(
            delayed(align)(CallCnf(cnf.__dict__), s, l, r, sambamba, bwa,
                           bammarkduplicates, cnf.genome.bwa, cnf.is_pcr)
            for s, l, r in zip(samples, lefts, rights))
        for sample, bam_fpath in zip(samples, bam_fpaths):
            if verify_bam(bam_fpath):
                bam_by_sample[sample.name] = bam_fpath
            else:
                err('Sample ' + sample.name + ' was not aligned successfully.')
        if not bam_by_sample:
            err('ERROR: No sample was aligned.')
        else:
            info()
            cnf.work_dir = join(cnf.work_dir, source.targqc_name)
            safe_mkdir(cnf.work_dir)
            info('Making TargQC reports for BAMs from reads')
            safe_mkdir(targqc_dirpath)
            run_targqc(cnf, bam_by_sample, cnf.bed, targqc_dirpath)
            cnf.work_dir = dirname(cnf.work_dir)
            info('Done TargQC')
    info()
    info('*' * 70)
Example #6
def make_fastqc_reports(cnf, fastq_fpaths, output_dir):
    # if isdir(fastqc_dirpath):
    #     if isdir(fastqc_dirpath + '.bak'):
    #         try:
    #             shutil.rmtree(fastqc_dirpath + '.bak')
    #         except OSError:
    #             pass
    #     if not isdir(fastqc_dirpath + '.bak'):
    #         os.rename(fastqc_dirpath, fastqc_dirpath + '.bak')
    # if isdir(fastqc_dirpath):
    #     err('Could not run and combine fastqc because it already exists and could not be moved to fastqc.bak')
    #     return None

    fastqc = get_system_path(cnf, 'fastqc')
    if not fastqc:
        err('FastQC is not found, cannot make reports')
        return None

    else:
        safe_mkdir(output_dir)

        fqc_samples = []
        fastqc_jobs = []
        for fastq_fpath in fastq_fpaths:
            s = FQC_Sample(name=splitext_plus(basename(fastq_fpath))[0],
                           fastq_fpath=fastq_fpath)
            fqc_samples.append(s)
            info('Added sample ' + s.name)

        for fqc_s in fqc_samples:
            if cnf.reuse_intermediate and verify_file(fqc_s.fastqc_html_fpath,
                                                      silent=True):
                info(fqc_s.fastqc_html_fpath + ' exists, reusing')
            else:
                fastqc_jobs.append(
                    run_fastqc(cnf, fqc_s.fastq_fpath, fqc_s.name, output_dir))
            info()

        wait_for_jobs(cnf, fastqc_jobs)

        fastqc_jobs = []
        # while True:
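        # FastQC names its output after the input file; locate the generated
        # HTML report for each sample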
        for fqc_s in fqc_samples:
            fqc_s.fastqc_html_fpath = find_fastqc_html(output_dir, fqc_s.name)
        not_done_fqc = [
            fqc_s for fqc_s in fqc_samples
            if not verify_file(fqc_s.fastqc_html_fpath,
                               description='FastQC html for ' + fqc_s.name)
        ]
        # if not not_done_fqc:
        #     info('')
        #     info('Every FastQC job is done, moving on.')
        #     info('-' * 70)
        #     break
        # else:
        #     info('')
        #     info('Some FastQC jobs are not done (' + ', '.join(f.name for f in not_done_fqc) + '). Retrying them.')
        #     info('')
        #     for fqc_s in not_done_fqc:
        #         fastqc_jobs.append(run_fastqc(cnf, fqc_s.fastq_fpath, fqc_s.name, output_dir))
        #     wait_for_jobs(cnf, fastqc_jobs)

        for fqc_s in fqc_samples:
            sample_fastqc_dirpath = join(output_dir, fqc_s.name + '_fastqc')
            if isfile(sample_fastqc_dirpath + '.zip'):
                try:
                    os.remove(sample_fastqc_dirpath + '.zip')
                except OSError:
                    pass

        comb_fastqc_fpath = join(output_dir, 'fastqc.html')
        write_fastqc_combo_report(cnf, comb_fastqc_fpath, fqc_samples)
        verify_file(comb_fastqc_fpath, is_critical=True)
        info('Combined FastQC saved to ' + comb_fastqc_fpath)
        return comb_fastqc_fpath
Example #7
def split_bam_files_use_grid(cnf, samples, combined_vcf_fpath,
                             exac_features_fpath):
    samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=False)
    samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=True)

    vcfs_by_chrom = dict()
    tabix = get_system_path(cnf, 'tabix')
    for chrom in chromosomes:
        vcf_fpath = join(cnf.work_dir, str(chrom) + '.vcf')
        cmdline = '{tabix} -h {combined_vcf_fpath} {chrom} > {vcf_fpath}'.format(
            **locals())
        call(cnf, cmdline)
        if verify_file(vcf_fpath):
            vcfs_by_chrom[chrom] = vcf_fpath

    output_dirpath = join(cnf.output_dir, 'combined_bams', cnf.project_name)
    safe_mkdir(output_dirpath)
    not_submitted_chroms = vcfs_by_chrom.keys()
    sample_names = ','.join(sample.name for sample in samples)
    sample_bams = ','.join(sample.bam for sample in samples)
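    # submit at most cnf.threads jobs per round; loop until every chromosome
    # has either been submitted or reused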
    while not_submitted_chroms:
        jobs_to_wait = []
        submitted_chroms = []
        reused_chroms = []

        for chrom, vcf_fpath in vcfs_by_chrom.iteritems():
            if chrom not in not_submitted_chroms:
                continue
            output_fpaths = [
                join(
                    output_dirpath,
                    chrom.replace('chr', '') + '-' +
                    sample.name.replace('-', '_') + '.bam')
                for sample in samples
            ]
            if cnf.reuse_intermediate and all(
                    verify_file(output_fpath, silent=True)
                    for output_fpath in output_fpaths):
                info('BAM files for ' + chrom + ' chromosome exist, reusing')
                reused_chroms.append(chrom)
                continue
            else:
                # if exac_venv_pythonpath:  # to avoid compatibility problems with pysam and tabix
                #     cmdline = exac_venv_pythonpath + ' ' + get_system_path(cnf,
                #                                                             join('tools', 'split_bams_by_variants.py'))
                # else:
                cmdline = get_script_cmdline(cnf,
                                             'python',
                                             join('tools',
                                                  'split_bams_by_variants.py'),
                                             is_critical=True)
                cmdline += (
                    ' --chr {chrom} --vcf {vcf_fpath} --samples {sample_names} '
                    +
                    '--bams {sample_bams} -o {output_dirpath} --work-dir {cnf.work_dir} '
                    + '-g {cnf.genome.name} ').format(**locals())
                if cnf.reuse_intermediate:
                    cmdline += ' --reuse'
                if exac_features_fpath and verify_file(exac_features_fpath):
                    cmdline += ' --features ' + exac_features_fpath
                j = submit_job(cnf, cmdline, chrom + '_split')
                info()
                submitted_chroms.append(chrom)

                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    break
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        not_submitted_chroms = [
            chrom for chrom in not_submitted_chroms
            if chrom not in submitted_chroms and chrom not in reused_chroms
        ]
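Note: the module-level chromosomes list iterated above is not shown. For an hg19-style genome it is presumably something like this (hypothetical; the real list depends on cnf.genome):

# Hypothetical definition. Python 2: range() returns a list, so the
# concatenation below works.
chromosomes = ['chr' + str(c) for c in range(1, 23) + ['X', 'Y']]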
Example #8
def _snpeff(cnf, input_fpath):
    if 'snpeff' not in cnf.annotation or 'snpeff' not in cnf.genome:
        return None, None, None

    step_greetings('SnpEff')

    output_fpath = intermediate_fname(cnf, input_fpath, 'snpEff')
    stats_fpath = join(
        cnf.work_dir, cnf.sample + (('-' + cnf.caller) if cnf.caller else '') +
        '.snpEff_summary.csv')

    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath, stats_fpath, splitext(
            stats_fpath)[0] + '.genes.txt'

    snpeff = get_java_tool_cmdline(cnf, 'snpeff')

    ref_name = cnf.genome.snpeff.reference or cnf.genome.name
    if ref_name.startswith('hg19') or ref_name.startswith('GRCh37'):
        ref_name = 'GRCh37.75'
    if ref_name.startswith('hg38'): ref_name = 'GRCh38.82'

    opts = ''
    if cnf.annotation.snpeff.cancer: opts += ' -cancer'

    assert cnf.transcripts_fpath, 'Transcript for annotation must be specified!'
    verify_file(cnf.transcripts_fpath,
                'Transcripts for snpEff -onlyTr',
                is_critical=True)
    opts += ' -onlyTr ' + cnf.transcripts_fpath + ' '

    db_path = adjust_system_path(cnf.genome.snpeff.data)
    if db_path:
        opts += ' -dataDir ' + db_path
    elif cnf.resources.snpeff.config:
        conf = get_system_path(cnf, cnf.resources.snpeff.config)
        if conf:
            opts += ' -c ' + conf + ' '
        else:
            err('Cannot find snpEff config file ' +
                str(cnf.resources.snpeff.config))

    if cnf.annotation.snpeff.extra_options:
        opts += ' ' + cnf.annotation.snpeff.extra_options

    if not cnf.no_check:
        info('Removing previous snpEff annotations...')
        res = remove_prev_eff_annotation(cnf, input_fpath)
        if not res:
            err('Could not remove previous snpEff annotations')
            return None, None, None
        input_fpath = res

    snpeff_type = get_snpeff_type(snpeff)
    if snpeff_type == "old":
        opts += ' -stats ' + stats_fpath + ' -csvStats'
    else:
        opts += ' -csvStats ' + stats_fpath

    cmdline = '{snpeff} eff {opts} -noLog -i vcf -o vcf {ref_name} {input_fpath}'.format(
        **locals())

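    # snpEff sometimes fails with a transient OSError; retry a few times,
    # pausing a minute between attempts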
    res = None
    for i in range(1, 20):
        try:
            res = call_subprocess(cnf,
                                  cmdline,
                                  input_fpath,
                                  output_fpath,
                                  exit_on_error=False,
                                  stdout_to_outputfile=True,
                                  overwrite=True)
        except OSError:
            import traceback, time
            err(traceback.format_exc())
            warn()
            info('Waiting 1 minute')
            time.sleep(60)
            info('Rerunning ' + str(i))
        else:
            break

    output_fpath = verify_vcf(output_fpath, is_critical=True)

    snpeff_summary_html_fpath = 'snpEff_summary.html'
    if isfile(snpeff_summary_html_fpath):
        info('SnpEff created ' + snpeff_summary_html_fpath +
             ' in the cwd, removing it...')
        try:
            os.remove(snpeff_summary_html_fpath)
        except OSError:
            pass

    if res:
        return output_fpath, stats_fpath, splitext(
            stats_fpath)[0] + '.genes.txt'
    else:
        return None, None, None
Example #9
def _snpsift_annotate(cnf, vcf_conf, dbname, input_fpath):
    if not vcf_conf:
        err('No database for ' + dbname + ', skipping.')
        return None

    step_greetings('Annotating with ' + dbname)

    output_fpath = intermediate_fname(cnf, input_fpath, dbname)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')
    java = get_system_path(cnf, 'java')
    info('Java version:')
    call(cnf, java + ' -version')
    info()

    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = vcf_conf.get('path')
        if not db_path:
            err('Please provide a path to ' + dbname +
                ' in the "genomes" section of the system config. The config is: '
                + str(cnf['genome']))
            return
        verify_file(db_path, is_critical=True)

    annotations = vcf_conf.get('annotations')

    if not cnf.no_check:
        info('Removing previous annotations...')

        def delete_annos(rec):
            for anno in annotations:
                if anno in rec.INFO:
                    del rec.INFO[anno]
            return rec

        if annotations:
            input_fpath = iterate_vcf(cnf,
                                      input_fpath,
                                      delete_annos,
                                      suffix='d')

    anno_line = ''
    if annotations:
        anno_line = '-info ' + ','.join(annotations)

    cmdline = '{executable} annotate -v {anno_line} {db_path} {input_fpath}'.format(
        **locals())
    output_fpath = call_subprocess(cnf,
                                   cmdline,
                                   input_fpath,
                                   output_fpath,
                                   stdout_to_outputfile=True,
                                   exit_on_error=False,
                                   overwrite=True)
    if not output_fpath:
        err('Error: snpsift returned ' + str(output_fpath) + ' for ' + dbname)
        return output_fpath
    verify_vcf(output_fpath, is_critical=True)
    # f = open(output_fpath)
    # l = f.readline()
    # if 'Cannot allocate memory' in l:
    #     f.close()
    #     f = open(output_fpath)
    #     contents = f.read()
    #     critical('SnpSift failed with memory issue:\n' + contents)
    #     f.close()
    #     return None

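    # SnpSift can emit malformed ##INFO headers and leave spaces in data
    # fields; normalize both before downstream tools parse the VCF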
    if not cnf.no_check:
        info_pattern = re.compile(
            r'''\#\#INFO=<
            ID=(?P<id>[^,]+),\s*
            Number=(?P<number>-?\d+|\.|[AG]),\s*
            Type=(?P<type>Integer|Float|Flag|Character|String),\s*
            Description="(?P<desc>[^"]*)"
            >''', re.VERBOSE)

        def _fix_after_snpsift(line, i, ctx):
            if not line.startswith('#'):
                if not ctx['met_CHROM']:
                    return None
                line = line.replace(' ', '_')
                assert ' ' not in line

            # elif line.startswith('##INFO=<ID=om'):
            #     line = line.replace(' ', '')

            elif not ctx['met_CHROM'] and line.startswith('#CHROM'):
                ctx['met_CHROM'] = True

            elif line.startswith('##INFO'):
                m = info_pattern.match(line)
                if m:
                    line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
                        m.group('id'), m.group('number'), m.group('type'),
                        m.group('desc'))
            return line

        output_fpath = iterate_file(cnf,
                                    output_fpath,
                                    _fix_after_snpsift,
                                    suffix='fx',
                                    ctx=dict(met_CHROM=False))

    return verify_vcf(output_fpath, is_critical=True)
Example #10
def run_annotators(cnf, vcf_fpath, bam_fpath):
    original_vcf = cnf.vcf

    db_section_by_name = OrderedDict(
        (dbname, cnf.annotation[dbname])
        for dbname in ['dbsnp', 'clinvar', 'cosmic', 'oncomine']
        if dbname in cnf.annotation
        and not cnf.annotation[dbname].get('skip-annotation'))

    # if not cnf.no_check:
    #     to_delete_id_ref = []
    #     if 'dbsnp' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as rs*')
    #         to_delete_id_ref.append('rs')
    #     if 'cosmic' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as COS*')
    #         to_delete_id_ref.append('COS')
    #
    #     def delete_ids(rec):  # deleting existing dbsnp and cosmic ID annotations
    #         if rec.ID:
    #             if isinstance(rec.ID, basestring):
    #                 if any(rec.ID.startswith(pref) for pref in to_delete_id_ref):
    #                     rec.ID = None
    #             else:
    #                 rec.ID = [id_ for id_ in rec.ID if not any(id_.startswith(pref) for pref in to_delete_id_ref)]
    #
    #         if not rec.FILTER:
    #             rec.FILTER = 'PASS'
    #
    #         return rec
    #
    #     info('Removing previous rs* and COS* IDs')
    #     vcf_fpath = iterate_vcf(cnf, vcf_fpath, delete_ids, suffix='delID')

    bcftools = get_system_path(cnf, 'bcftools')

    if not vcf_fpath.endswith('.gz') or not file_exists(vcf_fpath + '.tbi'):
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

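    # strip existing variant IDs so the dbSNP/COSMIC annotation below can
    # repopulate them (see ',=ID' in the -c columns)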
    cmdl = '{bcftools} annotate --remove ID {vcf_fpath}'
    res = call(cnf,
               cmdl.format(**locals()),
               output_fpath=add_suffix(rm_gz_ext(vcf_fpath), 'rmid'))
    if res:
        vcf_fpath = res
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    for dbname, dbconf in db_section_by_name.items() + cnf.annotation.get(
            'custom_vcfs', dict()).items():
        step_greetings('Annotating using ' + dbname)
        annotations = ','.join('INFO/' + a for a in dbconf.get('annotations'))
        if dbname in ('cosmic', 'dbsnp'):
            annotations += ',=ID'
        db_fpath = get_db_path(cnf, dbconf, dbname)
        if db_fpath:
            cmdl = '{bcftools} annotate -a ' + db_fpath + ' -c ' + annotations + ' {vcf_fpath}'
            res = call(cnf,
                       cmdl.format(**locals()),
                       output_fpath=add_suffix(rm_gz_ext(vcf_fpath), dbname))
            if res:
                vcf_fpath = res
                vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    verify_vcf(vcf_fpath, is_critical=True)

    if 'dbnsfp' in cnf.annotation:
        res = _snpsift_db_nsfp(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    if 'snpeff' in cnf.annotation:
        res, summary_fpath, genes_fpath = _snpeff(cnf, vcf_fpath)
        if res:
            vcf_fpath = res
            verify_vcf(vcf_fpath, is_critical=True)
            final_summary_fpath = join(cnf.output_dir, basename(summary_fpath))
            final_genes_fpath = join(cnf.output_dir, basename(genes_fpath))
            if isfile(final_summary_fpath): os.remove(final_summary_fpath)
            if isfile(final_genes_fpath): os.remove(final_genes_fpath)
            if file_exists(summary_fpath):
                shutil.move(summary_fpath, final_summary_fpath)
            if file_exists(genes_fpath):
                shutil.move(genes_fpath, final_genes_fpath)

    if 'tracks' in cnf.annotation and cnf.annotation['tracks']:
        track_fpaths = []
        for track_name in cnf.annotation['tracks']:
            if isfile(track_name) and verify_file(track_name):
                track_fpaths.append(track_name)
            else:
                if 'tracks' in cnf['genome'] and cnf['genome'][
                        'tracks'] and track_name in cnf['genome']['tracks']:
                    track_fpath = cnf['genome']['tracks'][track_name]
                    if verify_file(track_fpath):
                        track_fpaths.append(track_fpath)
        for track_fpath in track_fpaths:
            res = _tracks(cnf, track_fpath, vcf_fpath)
            if res:
                vcf_fpath = res

    step_greetings('Intersection with database VCFs...')
    if 'intersect_with' in cnf.annotation:
        for key, db_fpath in cnf.annotation['intersect_with'].items():
            res = intersect_vcf(cnf,
                                input_fpath=vcf_fpath,
                                db_fpath=db_fpath,
                                key=key)
            if res:
                vcf_fpath = res

    if 'mongo' in cnf.annotation:
        res = _mongo(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    return vcf_fpath