def _read_vcf_records_per_bed_region_and_clip_vcf(cnf, vcf_fpath, bed_fpath, region_type, sample):
    info()
    info('Intersecting VCF ' + vcf_fpath + ' using BED ' + bed_fpath)

    vcf_columns_num = count_bed_cols(vcf_fpath)
    bed_columns_num = count_bed_cols(bed_fpath)

    vcf_bed_intersect = join(cnf.work_dir, splitext(basename(vcf_fpath))[0] + '_' + region_type + '_vcf_bed.intersect')
    bedtools = get_system_path(cnf, 'bedtools')
    if not cnf.reuse_intermediate or not verify_file(vcf_bed_intersect, silent=True, is_critical=False):
        cmdline = '{bedtools} intersect -header -a {vcf_fpath} -b {bed_fpath} -wo'.format(**locals())
        res = call(cnf, cmdline, output_fpath=vcf_bed_intersect, max_number_of_tries=1, exit_on_error=False)
        if not res:
            return None, None, None, None

    regions_in_order = []
    regions_set = set()
    vars_by_region = defaultdict(dict)
    var_by_site = dict()

    clipped_vcf_fpath = intermediate_fname(cnf, splitext(basename(vcf_fpath))[0], '_' + region_type + '_clip')

    with open(vcf_bed_intersect) as f, open(clipped_vcf_fpath, 'w') as clip_vcf:
        for l in f:
            l = l.strip()
            if not l or l.startswith('#'):
                clip_vcf.write(l + '\n')
                continue
            fs = l.split('\t')
            chrom, pos, id_, ref, alt, qual, filt, info_fields = fs[:8]
            chrom_b, start_b, end_b, symbol, strand, feature, biotype = None, None, None, None, None, None, None
            if bed_columns_num >= 8:
                chrom_b, start_b, end_b, symbol, _, strand, feature, biotype, _ = fs[-(bed_columns_num + 1):][:9]
            elif bed_columns_num >= 4:
                chrom_b, start_b, end_b, symbol, _ = fs[-(bed_columns_num + 1):][:5]
            assert chrom == chrom_b, l
            r = chrom, id_, start_b, end_b, symbol, strand, feature, biotype
            if r not in regions_set:
                regions_set.add(r)
                regions_in_order.append(r)

            cls = None
            if '=Hotspot' in info_fields:
                cls = 'Hotspot'
            if '=Deleterious' in info_fields:
                cls = 'Deleterious'
            if cls:
                var = Variant(chrom, pos, ref, alt, cls)
                vars_by_region[r][(chrom, pos, ref, alt)] = var
                var_by_site[(chrom, pos, ref, alt)] = var
                clip_vcf.write('\t'.join([chrom, pos, id_, ref, alt, qual, filt, info_fields]) + '\n')

    clipped_gz_vcf_fpath = bgzip_and_tabix(cnf, clipped_vcf_fpath, max_number_of_tries=1, exit_on_error=False)

    return clipped_gz_vcf_fpath, regions_in_order, vars_by_region, var_by_site

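# Hedged usage sketch for the function above. `cnf` is the pipeline config
# object used throughout this module; the VCF/BED paths are hypothetical
# placeholders, not files from this repo:
#
#   clipped_vcf, regions, vars_by_region, var_by_site = \
#       _read_vcf_records_per_bed_region_and_clip_vcf(
#           cnf, '/path/to/sample.vcf.gz', '/path/to/target.bed', 'target', sample)
#   if clipped_vcf is None:
#       err('bedtools intersect failed')  # all four return values are None here
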
def _run_multisample_qualimap(cnf, output_dir, samples, targqc_full_report):
    """ 1. Generates Qualimap2 plots and puts them into plots_dirpath
        2. Adds records to targqc_full_report.plots
    """
    plots_dirpath = join(output_dir, 'plots')
    if cnf.reuse_intermediate and verify_dir(plots_dirpath) and \
            [f for f in listdir(plots_dirpath) if not f.startswith('.')]:
        info('Qualimap multisample plots exist - ' + plots_dirpath + ', reusing...')
    else:
        # Qualimap2 run for multi-sample plots
        if len([s.qualimap_html_fpath for s in samples if s.qualimap_html_fpath]) > 0:
            qualimap = get_system_path(cnf, interpreter_or_name=None, name='qualimap')

            if qualimap is not None and get_qualimap_type(qualimap) == 'full':
                qualimap_output_dir = join(cnf.work_dir, 'qualimap_multi_bamqc')

                _correct_qualimap_genome_results(cnf, samples)
                _correct_qualimap_insert_size_histogram(cnf, samples)

                safe_mkdir(qualimap_output_dir)
                rows = []
                for sample in samples:
                    if sample.qualimap_html_fpath:
                        rows += [[sample.name, sample.qualimap_html_fpath]]

                data_fpath = write_tsv_rows(rows, join(qualimap_output_dir, 'qualimap_results_by_sample.tsv'))
                qualimap_plots_dirpath = join(qualimap_output_dir, 'images_multisampleBamQcReport')
                cmdline = '{qualimap} multi-bamqc --data {data_fpath} -outdir {qualimap_output_dir}'.format(**locals())
                res = call(cnf, cmdline, exit_on_error=False, return_err_code=True,
                           env_vars=dict(DISPLAY=None),
                           output_fpath=qualimap_plots_dirpath, output_is_dir=True)
                if res is None or not verify_dir(qualimap_plots_dirpath):
                    warn('Warning: Qualimap for multi-sample analysis failed to finish. TargQC will not contain plots.')
                    return None
                else:
                    if exists(plots_dirpath):
                        shutil.rmtree(plots_dirpath)
                    shutil.move(qualimap_plots_dirpath, plots_dirpath)
            else:
                warn('Warning: Qualimap for multi-sample analysis was not found. TargQC will not contain plots.')
                return None

    targqc_full_report.plots = []
    for plot_fpath in listdir(plots_dirpath):
        plot_fpath = join(plots_dirpath, plot_fpath)
        if verify_file(plot_fpath) and plot_fpath.endswith('.png'):
            targqc_full_report.plots.append(relpath(plot_fpath, output_dir))

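# Hedged usage sketch: `samples` are objects exposing .name and
# .qualimap_html_fpath (set by earlier per-sample Qualimap runs), and
# `full_report` is assumed to be a report object with a `.plots` list,
# as used in the function above:
#
#   _run_multisample_qualimap(cnf, targqc_dirpath, samples, full_report)
#   # on success, full_report.plots holds .png paths relative to targqc_dirpath
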
def _fix_bam_for_picard(cnf, bam_fpath):
    def __process_problem_read_aligns(read_aligns):
        # each alignment: 0:NAME 1:FLAG 2:CHR 3:COORD 4:MAPQUAL 5:CIGAR 6:MATE_CHR 7:MATE_COORD TLEN SEQ ...
        def __get_key(align):
            return align.split('\t')[2] + '@' + align.split('\t')[3]

        def __get_mate_key(align):
            return (align.split('\t')[6] if align.split('\t')[2] != '=' else align.split('\t')[2]) \
                   + '@' + align.split('\t')[7]

        chr_coord = OrderedDict()
        for align in read_aligns:
            key = __get_key(align)
            if key not in chr_coord:
                chr_coord[key] = []
            chr_coord[key].append(align)
        correct_pairs = []
        for align in read_aligns:
            mate_key = __get_mate_key(align)
            if mate_key in chr_coord:
                for pair_align in chr_coord[mate_key]:
                    if read_aligns.index(pair_align) <= read_aligns.index(align):
                        continue
                    if __get_mate_key(pair_align) == __get_key(align):
                        correct_pairs.append((align, pair_align))
        if not correct_pairs:
            return []
        if len(correct_pairs) > 1:
            # sort by sum of mapping qualities of both alignments
            # (MAPQ columns are strings, so they must be cast to int before summing)
            correct_pairs.sort(key=lambda pair: int(pair[0].split('\t')[4]) + int(pair[1].split('\t')[4]),
                               reverse=True)
        return [correct_pairs[0][0], correct_pairs[0][1]]

    samtools = get_system_path(cnf, 'samtools')
    try:
        import pysam
        without_pysam = False
    except ImportError:
        without_pysam = True

    # find reads present more than twice in the input BAM
    if without_pysam:
        qname_sorted_sam_fpath = intermediate_fname(cnf, bam_fpath, 'qname_sorted')[:-len('bam')] + 'sam'
        # queryname sorting; output is SAM
        cmdline = '{samtools} view {bam_fpath} | sort '.format(**locals())
        call(cnf, cmdline, qname_sorted_sam_fpath)
        qname_sorted_file = open(qname_sorted_sam_fpath, 'r')
    else:
        qname_sorted_bam_fpath = intermediate_fname(cnf, bam_fpath, 'qname_sorted')
        # queryname sorting (-n), to stdout (-o), 'prefix' is not used; output is BAM
        cmdline = '{samtools} sort -n -o {bam_fpath} prefix'.format(**locals())
        call(cnf, cmdline, qname_sorted_bam_fpath)
        qname_sorted_file = pysam.Samfile(qname_sorted_bam_fpath, 'rb')

    problem_reads = dict()
    cur_read_aligns = []
    for line in qname_sorted_file:
        line = str(line)
        if cur_read_aligns:
            if line.split('\t')[0] != cur_read_aligns[0].split('\t')[0]:
                if len(cur_read_aligns) > 2:
                    problem_reads[cur_read_aligns[0].split('\t')[0]] = cur_read_aligns
                cur_read_aligns = []
        flag = int(line.split('\t')[1])
        cur_read_aligns.append(line)
    if len(cur_read_aligns) > 2:
        problem_reads[cur_read_aligns[0].split('\t')[0]] = cur_read_aligns
    qname_sorted_file.close()

    for read_id, read_aligns in problem_reads.items():
        problem_reads[read_id] = __process_problem_read_aligns(read_aligns)

    # correct the input BAM
    fixed_bam_fpath = intermediate_fname(cnf, bam_fpath, 'fixed_for_picard')
    fixed_sam_fpath = fixed_bam_fpath[:-len('bam')] + 'sam'
    if without_pysam:
        sam_fpath = intermediate_fname(cnf, bam_fpath, 'tmp')[:-len('bam')] + 'sam'
        cmdline = '{samtools} view -h {bam_fpath}'.format(**locals())
        call(cnf, cmdline, sam_fpath)
        input_file = open(sam_fpath, 'r')
        fixed_file = open(fixed_sam_fpath, 'w')
    else:
        input_file = pysam.Samfile(bam_fpath, 'rb')
        fixed_file = pysam.Samfile(fixed_bam_fpath, 'wb', template=input_file)
    for line in input_file:
        if without_pysam and line.startswith('@'):  # header
            fixed_file.write(line)
            continue
        read_name = str(line).split('\t')[0]
        if read_name in problem_reads and str(line) not in problem_reads[read_name]:
            continue
        fixed_file.write(line)
    input_file.close()
    fixed_file.close()
    if without_pysam:
        cmdline = '{samtools} view -bS {fixed_sam_fpath}'.format(**locals())
        call(cnf, cmdline, fixed_bam_fpath)

    return fixed_bam_fpath

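# Toy illustration of the pair-picking rule in __process_problem_read_aligns
# above (simplified, hypothetical SAM-like lines with RNEXT written explicitly
# rather than '='; only columns 2-4 and 6-7 matter for the keying):
#
#   a1 = 'r1\t99\tchr1\t100\t60\t100M\tchr1\t200\t...'
#   a2 = 'r1\t147\tchr1\t200\t60\t100M\tchr1\t100\t...'
#   a3 = 'r1\t2209\tchr2\t500\t0\t100M\tchr1\t999\t...'
#   # a1 and a2 point at each other (chr@coord keys match both ways), a3 does
#   # not, so only the consistent mate pair is kept:
#   __process_problem_read_aligns([a1, a2, a3])  # -> [a1, a2]
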
def intersect_regions(cnf, bcbio_structures, all_regions, min_samples):
    all_regions_fname = 'all_regions.bed'
    all_regions_bed_fpath = join(
        cnf.output_dir,
        add_suffix(all_regions_fname, str(cnf.min_depth)) if cnf.min_depth else all_regions_fname)

    with open(all_regions_bed_fpath, 'w') as out:
        if not cnf.min_depth:
            out.write('## Coverage threshold Nx is 10x for cell line and 100x for plasma\n')
        else:
            out.write('## Coverage threshold Nx is ' + str(cnf.min_depth) + 'x\n')
        out.write('\t'.join(['#Chr', 'Start', 'End', 'Size', 'Gene', 'Depth<Nx', 'SamplesSharingSameFeature']) + '\n')
        for region in all_regions:
            out.write('\t'.join([str(val) for val in region]) + '\n')

    regions_overlaps = defaultdict(lambda: defaultdict(list))
    regions = []
    if cnf.tricky_regions:
        intersection_fpath = _intersect_with_tricky_regions(cnf, all_regions_bed_fpath, 'samples')
    else:
        bed_fpath = cnf.bed
        intersection_fpath = join(cnf.work_dir, splitext(basename(all_regions_bed_fpath))[0] + '_bed.intersect')
        bedtools = get_system_path(cnf, 'bedtools')
        if not cnf.reuse_intermediate or not verify_file(intersection_fpath, silent=True, is_critical=False):
            cmdline = '{bedtools} intersect -header -a {all_regions_bed_fpath} -b {bed_fpath} -wo'.format(**locals())
            res = call(cnf, cmdline, output_fpath=intersection_fpath, max_number_of_tries=1, exit_on_error=False)
            if not res:
                return None

    with open(intersection_fpath) as f:
        for l in f:
            l = l.strip()
            if not l or l.startswith('#'):
                continue
            fs = l.split('\t')
            chrom, start, end, size, symbol, pct_depth, num_samples = fs[:7]
            overlap_bps = int(fs[-1])
            r = (chrom, start, end, size, symbol, pct_depth, num_samples)
            if cnf.tricky_regions:
                filename = tricky_regions_fnames_d[basename(fs[7]).split('.')[0]]
                regions_overlaps[r][filename].append(overlap_bps)
            else:
                regions_overlaps[r][basename(cnf.bed)].append(overlap_bps)

    for r in all_regions:
        if r in regions_overlaps:
            chrom, start, end, size, symbol, pct_depth, num_samples = r
            overlaps_txt = ', '.join(
                fname + ': %.0f' % (sum(regions_overlaps[r][fname]) / float(size) * 100) + '%'
                for fname in regions_overlaps[r])
            r = list(r)
            r.append(overlaps_txt)
        else:
            r = list(r)
            r.append('')
        regions.append(r)
    os.remove(intersection_fpath)
    return regions

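# Hedged sketch of the row shape intersect_regions returns: each region is the
# original 7-column tuple plus one overlaps-summary column (values hypothetical):
#
#   ['chr1', '100', '200', '100', 'TP53', '12.5', '3', 'target.bed: 87%']
#
# The last column is '' for regions that had no overlap with the BED (or
# tricky-regions) files.
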
def main():
    cnf, output_dir, fastq_fpaths = proc_opts()
    targqc_dirpath = output_dir

    fastqs_by_sample = find_fastq_pairs(fastq_fpaths)
    samples = []
    for sname, (l, r) in fastqs_by_sample.items():
        s = source.TargQC_Sample(sname, join(cnf.output_dir, sname))
        s.l_fpath = l
        s.r_fpath = r
        samples.append(s)
    threads = len(samples)

    info('Found ' + str(len(samples)) + ' samples.')
    if len(samples) == 0:
        critical('ERROR: No fastq pairs found.')
    info()

    # samples = [source.TargQC_Sample(
    #     s.name,
    #     dirpath=join(targqc_dirpath, s.name),
    #     bed=cnf.bed) for s in fastq_fpaths]

    if cnf.downsample_to == 0:
        lefts = [s.l_fpath for s in samples]
        rights = [s.r_fpath for s in samples]
    else:
        if cnf.downsample_to is None:
            downsample_to = int(5e5)
        else:
            downsample_to = cnf.downsample_to
        info('Downsampling the reads to ' + str(downsample_to))
        lefts, rights = downsample_fastq(cnf, samples, downsample_to)

    bam_by_sample = OrderedDict()
    sambamba = get_system_path(cnf, join(get_ext_tools_dirname(), 'sambamba'), is_critical=True)
    bwa = get_system_path(cnf, 'bwa')
    bammarkduplicates = get_system_path(cnf, 'bammarkduplicates')
    if sambamba and bwa and bammarkduplicates:
        info()
        info('Aligning reads to the reference')
        bam_fpaths = Parallel(n_jobs=threads)(
            delayed(align)(CallCnf(cnf.__dict__), s, l, r,
                           sambamba, bwa, bammarkduplicates,
                           cnf.genome.bwa, cnf.is_pcr)
            for s, l, r in zip(samples, lefts, rights))

        for sample, bam_fpath in zip(samples, bam_fpaths):
            if verify_bam(bam_fpath):
                bam_by_sample[sample.name] = bam_fpath
            else:
                err('Sample ' + sample.name + ' was not aligned successfully.')

        if not bam_by_sample:
            err('ERROR: No sample was aligned.')
        else:
            info()
            cnf.work_dir = join(cnf.work_dir, source.targqc_name)
            safe_mkdir(cnf.work_dir)
            info('Making TargQC reports for BAMs from reads')
            safe_mkdir(targqc_dirpath)
            run_targqc(cnf, bam_by_sample, cnf.bed, targqc_dirpath)
            cnf.work_dir = dirname(cnf.work_dir)
            info('Done TargQC')

    info()
    info('*' * 70)

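# Hedged sketch of the per-sample flow driven by main() above (names taken
# from this module, paths hypothetical): FASTQ pairs -> optional downsampling
# (default 5e5 reads when --downsample-to is unset, skipped when it is 0)
# -> parallel bwa alignment via align() -> TargQC over the resulting BAMs.
#
#   fastqs_by_sample = find_fastq_pairs(['S1_R1.fq.gz', 'S1_R2.fq.gz'])
#   # assumed shape: {'S1': ('S1_R1.fq.gz', 'S1_R2.fq.gz')}
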
def make_fastqc_reports(cnf, fastq_fpaths, output_dir):
    # if isdir(fastqc_dirpath):
    #     if isdir(fastqc_dirpath + '.bak'):
    #         try:
    #             shutil.rmtree(fastqc_dirpath + '.bak')
    #         except OSError:
    #             pass
    #     if not isdir(fastqc_dirpath + '.bak'):
    #         os.rename(fastqc_dirpath, fastqc_dirpath + '.bak')
    # if isdir(fastqc_dirpath):
    #     err('Could not run and combine fastqc because it already exists and could not be moved to fastqc.bak')
    #     return None

    fastqc = get_system_path(cnf, 'fastqc')

    if not fastqc:
        err('FastQC is not found, cannot make reports')
        return None
    else:
        safe_mkdir(output_dir)

        fqc_samples = []
        fastqc_jobs = []
        for fastq_fpath in fastq_fpaths:
            s = FQC_Sample(name=splitext_plus(basename(fastq_fpath))[0], fastq_fpath=fastq_fpath)
            fqc_samples.append(s)
            info('Added sample ' + s.name)

        for fqc_s in fqc_samples:
            if cnf.reuse_intermediate and verify_file(fqc_s.fastqc_html_fpath, silent=True):
                info(fqc_s.fastqc_html_fpath + ' exists, reusing')
            else:
                fastqc_jobs.append(run_fastqc(cnf, fqc_s.fastq_fpath, fqc_s.name, output_dir))
                info()

        wait_for_jobs(cnf, fastqc_jobs)

        fastqc_jobs = []
        # while True:
        for fqc_s in fqc_samples:
            fqc_s.fastqc_html_fpath = find_fastqc_html(output_dir, fqc_s.name)
        not_done_fqc = [fqc_s for fqc_s in fqc_samples
                        if not verify_file(fqc_s.fastqc_html_fpath,
                                           description='FastQC html not found for ' + fqc_s.name)]
        # if not not_done_fqc:
        #     info('')
        #     info('Every FastQC job is done, moving on.')
        #     info('-' * 70)
        #     break
        # else:
        #     info('')
        #     info('Some FastQC jobs are not done (' + ', '.join(f.name for f in not_done_fqc) + '). Retrying them.')
        #     info('')
        #     for fqc_s in not_done_fqc:
        #         fastqc_jobs.append(run_fastqc(cnf, fqc_s.fastq_fpath, fqc_s.name, output_dir))
        #     wait_for_jobs(cnf, fastqc_jobs)

        for fqc_s in fqc_samples:
            sample_fastqc_dirpath = join(output_dir, fqc_s.name + '_fastqc')
            if isfile(sample_fastqc_dirpath + '.zip'):
                try:
                    os.remove(sample_fastqc_dirpath + '.zip')
                except OSError:
                    pass

        comb_fastqc_fpath = join(output_dir, 'fastqc.html')
        write_fastqc_combo_report(cnf, comb_fastqc_fpath, fqc_samples)
        verify_file(comb_fastqc_fpath, is_critical=True)
        info('Combined FastQC saved to ' + comb_fastqc_fpath)
        return comb_fastqc_fpath

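# Hedged usage sketch (FASTQ paths hypothetical):
#
#   comb_html = make_fastqc_reports(
#       cnf, ['/data/S1_R1.fq.gz', '/data/S1_R2.fq.gz'], join(output_dir, 'fastqc'))
#   # -> <output_dir>/fastqc/fastqc.html, combining the per-sample FastQC reports
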
def split_bam_files_use_grid(cnf, samples, combined_vcf_fpath, exac_features_fpath):
    samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=False)
    samples = dedup_and_sort_bams_use_grid(cnf, samples, do_sort=True)

    vcfs_by_chrom = dict()
    tabix = get_system_path(cnf, 'tabix')
    for chrom in chromosomes:
        vcf_fpath = join(cnf.work_dir, str(chrom) + '.vcf')
        cmdline = '{tabix} -h {combined_vcf_fpath} {chrom} > {vcf_fpath}'.format(**locals())
        call(cnf, cmdline)
        if verify_file(vcf_fpath):
            vcfs_by_chrom[chrom] = vcf_fpath

    output_dirpath = join(cnf.output_dir, 'combined_bams', cnf.project_name)
    safe_mkdir(output_dirpath)
    not_submitted_chroms = vcfs_by_chrom.keys()
    sample_names = ','.join(sample.name for sample in samples)
    sample_bams = ','.join(sample.bam for sample in samples)
    while not_submitted_chroms:
        jobs_to_wait = []
        submitted_chroms = []
        reused_chroms = []

        for chrom, vcf_fpath in vcfs_by_chrom.iteritems():
            if chrom not in not_submitted_chroms:
                continue
            output_fpaths = [join(output_dirpath,
                                  chrom.replace('chr', '') + '-' + sample.name.replace('-', '_') + '.bam')
                             for sample in samples]
            if cnf.reuse_intermediate and all(verify_file(output_fpath, silent=True)
                                              for output_fpath in output_fpaths):
                info('BAM files for ' + chrom + ' chromosome exist, reusing')
                reused_chroms.append(chrom)
                continue
            else:
                # if exac_venv_pythonpath:  # to avoid compatibility problems with pysam and tabix
                #     cmdline = exac_venv_pythonpath + ' ' + get_system_path(cnf,
                #         join('tools', 'split_bams_by_variants.py'))
                # else:
                cmdline = get_script_cmdline(cnf, 'python', join('tools', 'split_bams_by_variants.py'),
                                             is_critical=True)
                cmdline += (' --chr {chrom} --vcf {vcf_fpath} --samples {sample_names} ' +
                            '--bams {sample_bams} -o {output_dirpath} --work-dir {cnf.work_dir} ' +
                            '-g {cnf.genome.name} ').format(**locals())
                if cnf.reuse_intermediate:
                    cmdline += ' --reuse'
                if exac_features_fpath and verify_file(exac_features_fpath):
                    cmdline += ' --features ' + exac_features_fpath
                j = submit_job(cnf, cmdline, chrom + '_split')
                info()
                submitted_chroms.append(chrom)

                if not j.is_done:
                    jobs_to_wait.append(j)
                if len(jobs_to_wait) >= cnf.threads:
                    break
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No jobs to submit.')
        not_submitted_chroms = [chrom for chrom in not_submitted_chroms
                                if chrom not in submitted_chroms and chrom not in reused_chroms]

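# Hedged sketch of the per-chromosome worker command assembled above
# (sample names and paths hypothetical; --reuse and --features are appended
# only when cnf.reuse_intermediate / exac_features_fpath are set):
#
#   python tools/split_bams_by_variants.py --chr chr7 --vcf work/chr7.vcf \
#       --samples S1,S2 --bams S1.bam,S2.bam -o combined_bams/proj \
#       --work-dir work -g hg19 --reuse --features exac_features.bed
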
def _snpeff(cnf, input_fpath):
    if 'snpeff' not in cnf.annotation or 'snpeff' not in cnf.genome:
        return None, None, None

    step_greetings('SnpEff')

    output_fpath = intermediate_fname(cnf, input_fpath, 'snpEff')
    stats_fpath = join(cnf.work_dir, cnf.sample + (('-' + cnf.caller) if cnf.caller else '') + '.snpEff_summary.csv')
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath, stats_fpath, splitext(stats_fpath)[0] + '.genes.txt'

    snpeff = get_java_tool_cmdline(cnf, 'snpeff')

    ref_name = cnf.genome.snpeff.reference or cnf.genome.name
    if ref_name.startswith('hg19') or ref_name.startswith('GRCh37'):
        ref_name = 'GRCh37.75'
    if ref_name.startswith('hg38'):
        ref_name = 'GRCh38.82'

    opts = ''
    if cnf.annotation.snpeff.cancer:
        opts += ' -cancer'

    assert cnf.transcripts_fpath, 'Transcripts for annotation must be specified!'
    verify_file(cnf.transcripts_fpath, 'Transcripts for snpEff -onlyTr', is_critical=True)
    opts += ' -onlyTr ' + cnf.transcripts_fpath + ' '

    db_path = adjust_system_path(cnf.genome.snpeff.data)
    if db_path:
        opts += ' -dataDir ' + db_path
    elif cnf.resources.snpeff.config:
        conf = get_system_path(cnf, cnf.resources.snpeff.config)
        if conf:
            opts += ' -c ' + conf + ' '
        else:
            err('Cannot find snpEff config file ' + str(cnf.resources.snpeff.config))

    if cnf.annotation.snpeff.extra_options:
        opts += ' ' + cnf.annotation.snpeff.extra_options

    if not cnf.no_check:
        info('Removing previous snpEff annotations...')
        res = remove_prev_eff_annotation(cnf, input_fpath)
        if not res:
            err('Could not remove previous snpEff annotations')
            return None, None, None
        input_fpath = res

    snpeff_type = get_snpeff_type(snpeff)
    if snpeff_type == "old":
        opts += ' -stats ' + stats_fpath + ' -csvStats'
    else:
        opts += ' -csvStats ' + stats_fpath

    cmdline = '{snpeff} eff {opts} -noLog -i vcf -o vcf {ref_name} {input_fpath}'.format(**locals())

    for i in range(1, 20):
        try:
            res = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                  exit_on_error=False, stdout_to_outputfile=True, overwrite=True)
        except OSError:
            import traceback, time
            err(traceback.format_exc())
            warn()
            info('Waiting 1 minute')
            time.sleep(60)
            info('Rerunning ' + str(i))
        else:
            break

    output_fpath = verify_vcf(output_fpath, is_critical=True)

    snpeff_summary_html_fpath = 'snpEff_summary.html'
    if isfile(snpeff_summary_html_fpath):
        info('SnpEff created ' + snpeff_summary_html_fpath + ' in the cwd, removing it...')
        try:
            os.remove(snpeff_summary_html_fpath)
        except OSError:
            pass

    if res:
        return output_fpath, stats_fpath, splitext(stats_fpath)[0] + '.genes.txt'
    else:
        return None, None, None

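# Hedged example of the snpEff command line _snpeff builds (all paths
# hypothetical; the -csvStats form shown is the non-"old" snpEff branch):
#
#   snpEff eff -cancer -onlyTr transcripts.txt -dataDir /db/snpeff \
#       -csvStats sample.snpEff_summary.csv -noLog -i vcf -o vcf GRCh37.75 in.vcf
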
def _snpsift_annotate(cnf, vcf_conf, dbname, input_fpath):
    if not vcf_conf:
        err('No database for ' + dbname + ', skipping.')
        return None

    step_greetings('Annotating with ' + dbname)

    output_fpath = intermediate_fname(cnf, input_fpath, dbname)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')
    java = get_system_path(cnf, 'java')
    info('Java version:')
    call(cnf, java + ' -version')
    info()

    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = vcf_conf.get('path')
        if not db_path:
            err('Please provide a path to ' + dbname + ' in the "genomes" section of the system config. '
                'The config is: ' + str(cnf['genome']))
            return
        verify_file(db_path, is_critical=True)

    annotations = vcf_conf.get('annotations')

    if not cnf.no_check:
        info('Removing previous annotations...')

        def delete_annos(rec):
            for anno in annotations:
                if anno in rec.INFO:
                    del rec.INFO[anno]
            return rec

        if annotations:
            input_fpath = iterate_vcf(cnf, input_fpath, delete_annos, suffix='d')

    anno_line = ''
    if annotations:
        anno_line = '-info ' + ','.join(annotations)

    cmdline = '{executable} annotate -v {anno_line} {db_path} {input_fpath}'.format(**locals())
    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True, exit_on_error=False, overwrite=True)
    if not output_fpath:
        err('Error: SnpSift failed to produce output for ' + dbname)
        return output_fpath
    verify_vcf(output_fpath, is_critical=True)

    # f = open(output_fpath)
    # l = f.readline()
    # if 'Cannot allocate memory' in l:
    #     f.close()
    #     f = open(output_fpath)
    #     contents = f.read()
    #     critical('SnpSift failed with memory issue:\n' + contents)
    #     f.close()
    #     return None

    if not cnf.no_check:
        info_pattern = re.compile(r'''\#\#INFO=<
            ID=(?P<id>[^,]+),\s*
            Number=(?P<number>-?\d+|\.|[AG]),\s*
            Type=(?P<type>Integer|Float|Flag|Character|String),\s*
            Description="(?P<desc>[^"]*)"
            >''', re.VERBOSE)

        def _fix_after_snpsift(line, i, ctx):
            if not line.startswith('#'):
                if not ctx['met_CHROM']:
                    return None
                line = line.replace(' ', '_')
                assert ' ' not in line
            # elif line.startswith('##INFO=<ID=om'):
            #     line = line.replace(' ', '')
            elif not ctx['met_CHROM'] and line.startswith('#CHROM'):
                ctx['met_CHROM'] = True
            elif line.startswith('##INFO'):
                m = info_pattern.match(line)
                if m:
                    line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
                        m.group('id'), m.group('number'), m.group('type'), m.group('desc'))
            return line

        output_fpath = iterate_file(cnf, output_fpath, _fix_after_snpsift, suffix='fx', ctx=dict(met_CHROM=False))

    return verify_vcf(output_fpath, is_critical=True)

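# Hedged example of the SnpSift command this builds (database path and INFO
# field names hypothetical; stdout is redirected to the output VCF via
# stdout_to_outputfile=True):
#
#   SnpSift annotate -v -info CAF,COMMON dbsnp.vcf.gz input.vcf > output.vcf
#
# _fix_after_snpsift then normalizes ##INFO header lines and replaces spaces
# with underscores in data lines so downstream VCF parsers do not choke.
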
def run_annotators(cnf, vcf_fpath, bam_fpath):
    original_vcf = cnf.vcf

    db_section_by_name = OrderedDict(
        (dbname, cnf.annotation[dbname])
        for dbname in ['dbsnp', 'clinvar', 'cosmic', 'oncomine']
        if dbname in cnf.annotation
        and not cnf.annotation[dbname].get('skip-annotation'))

    # if not cnf.no_check:
    #     to_delete_id_ref = []
    #     if 'dbsnp' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as rs*')
    #         to_delete_id_ref.append('rs')
    #     if 'cosmic' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as COS*')
    #         to_delete_id_ref.append('COS')
    #
    #     def delete_ids(rec):  # deleting existing dbsnp and cosmic ID annotations
    #         if rec.ID:
    #             if isinstance(rec.ID, basestring):
    #                 if any(rec.ID.startswith(pref) for pref in to_delete_id_ref):
    #                     rec.ID = None
    #             else:
    #                 rec.ID = [id_ for id_ in rec.ID if not any(id_.startswith(pref) for pref in to_delete_id_ref)]
    #
    #         if not rec.FILTER:
    #             rec.FILTER = 'PASS'
    #
    #         return rec
    #
    #     info('Removing previous rs* and COS* IDs')
    #     vcf_fpath = iterate_vcf(cnf, vcf_fpath, delete_ids, suffix='delID')

    bcftools = get_system_path(cnf, 'bcftools')

    if not vcf_fpath.endswith('.gz') or not file_exists(vcf_fpath + '.tbi'):
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    cmdl = '{bcftools} annotate --remove ID {vcf_fpath}'
    res = call(cnf, cmdl.format(**locals()), output_fpath=add_suffix(rm_gz_ext(vcf_fpath), 'rmid'))
    if res:
        vcf_fpath = res
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    for dbname, dbconf in db_section_by_name.items() + cnf.annotation.get('custom_vcfs', dict()).items():
        step_greetings('Annotating using ' + dbname)
        annotations = ','.join('INFO/' + a for a in dbconf.get('annotations'))
        if dbname in ('cosmic', 'dbsnp'):
            annotations += ',=ID'
        db_fpath = get_db_path(cnf, dbconf, dbname)
        if db_fpath:
            cmdl = '{bcftools} annotate -a ' + db_fpath + ' -c ' + annotations + ' {vcf_fpath}'
            res = call(cnf, cmdl.format(**locals()), output_fpath=add_suffix(rm_gz_ext(vcf_fpath), dbname))
            if res:
                vcf_fpath = res
                vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    verify_vcf(vcf_fpath, is_critical=True)

    if 'dbnsfp' in cnf.annotation:
        res = _snpsift_db_nsfp(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    if 'snpeff' in cnf.annotation:
        res, summary_fpath, genes_fpath = _snpeff(cnf, vcf_fpath)
        if res:
            vcf_fpath = res
            verify_vcf(vcf_fpath, is_critical=True)
            final_summary_fpath = join(cnf.output_dir, basename(summary_fpath))
            final_genes_fpath = join(cnf.output_dir, basename(genes_fpath))
            if isfile(final_summary_fpath):
                os.remove(final_summary_fpath)
            if isfile(final_genes_fpath):
                os.remove(final_genes_fpath)
            if file_exists(summary_fpath):
                shutil.move(summary_fpath, final_summary_fpath)
            if file_exists(genes_fpath):
                shutil.move(genes_fpath, final_genes_fpath)

    if 'tracks' in cnf.annotation and cnf.annotation['tracks']:
        track_fpaths = []
        for track_name in cnf.annotation['tracks']:
            if isfile(track_name) and verify_file(track_name):
                track_fpaths.append(track_name)
            else:
                if 'tracks' in cnf['genome'] and cnf['genome']['tracks'] and track_name in cnf['genome']['tracks']:
                    track_fpath = cnf['genome']['tracks'][track_name]
                    if verify_file(track_fpath):
                        track_fpaths.append(track_fpath)
        for track_fpath in track_fpaths:
            res = _tracks(cnf, track_fpath, vcf_fpath)
            if res:
                vcf_fpath = res

    step_greetings('Intersection with database VCFs...')
    if 'intersect_with' in cnf.annotation:
        for key, db_fpath in cnf.annotation['intersect_with'].items():
            res = intersect_vcf(cnf, input_fpath=vcf_fpath, db_fpath=db_fpath, key=key)
            if res:
                vcf_fpath = res

    if 'mongo' in cnf.annotation:
        res = _mongo(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    return vcf_fpath

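# Hedged sketch of the bcftools step above for one database (database path
# and INFO field names hypothetical; the -c column list mirrors what the loop
# assembles, including ',=ID' for cosmic/dbsnp):
#
#   bcftools annotate -a cosmic.vcf.gz -c INFO/AA,INFO/CDS,=ID input.vcf.gz
#
# Each successful step replaces vcf_fpath with the freshly bgzipped and
# tabixed result, so annotations accumulate across dbSNP, ClinVar, COSMIC,
# dbNSFP, snpEff, tracks, and the intersect_with databases.
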