def total_merge_bed(cnf, bed_fpath):
    """Merge the regions of a BED file into a ``total_merged`` intermediate file.

    ``bedops --merge`` is used when ``get_system_path`` returns a path for
    ``bedops``; otherwise the command is built around ``bedtools merge -i``.

    :param cnf: run configuration, forwarded to the helper calls.
    :param bed_fpath: path to the BED file to merge.
    :return: path of the intermediate file the merge output is written to.
    """
    tool = get_system_path(cnf, 'bedops')
    if tool:
        cmdline = '{bedops} --merge {bed_fpath}'.format(
            bedops=tool, bed_fpath=bed_fpath)
    else:
        tool = get_system_path(cnf, 'bedtools')
        cmdline = '{bedtools} merge -i {bed_fpath}'.format(
            bedtools=tool, bed_fpath=bed_fpath)

    # Both tools share the same output location and invocation.
    merged_fpath = intermediate_fname(cnf, bed_fpath, 'total_merged')
    call(cnf, cmdline, merged_fpath)
    return merged_fpath
def annotate_target(cnf, target_bed):
    """Annotate a target BED by running the external ``annotate_bed.py`` script.

    :param cnf: run configuration; ``cnf.genome.bed_annotation_features``,
        ``cnf.genome.name`` and ``cnf.work_dir`` are read here.
    :param target_bed: path to the BED file to annotate.
    :return: the ``ann`` intermediate path when no annotation features are
        configured or the file can be reused; otherwise the result of
        ``remove_comments`` applied to the annotated file.
    """
    annotated_fpath = intermediate_fname(cnf, target_bed, 'ann')

    if not cnf.genome.bed_annotation_features:
        return annotated_fpath

    if can_reuse(annotated_fpath, target_bed):
        info(annotated_fpath + ' exists, reusing')
        return annotated_fpath

    # The verified path is not used below; the call is kept for its
    # is_critical check of the configured features file.
    verify_bed(
        cnf.genome.bed_annotation_features,
        is_critical=True,
        description='bed_annotation_features in system config')

    script_path = which('annotate_bed.py')
    if not script_path:
        critical(
            'Error: annotate_bed.py not found in PATH, please install TargQC.')

    template = ('{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} -g {cnf.genome.name} '
                '-o {output_fpath} --canonical')
    cmdline = template.format(
        annotate_bed_py=script_path,
        target_bed=target_bed,
        cnf=cnf,
        output_fpath=annotated_fpath)

    # The script writes to -o itself, so stdout is not redirected to the file.
    call(cnf, cmdline, annotated_fpath, stdout_to_outputfile=False)
    return remove_comments(cnf, annotated_fpath)
def _rename_fields(cnf, inp_tsv_fpath, field_map):
    """Rewrite the header line of a TSV file using ``field_map``.

    Header names are split on whitespace; a name is replaced when
    ``field_map`` holds a truthy value for it and kept otherwise. All other
    lines are copied unchanged.

    :param cnf: run configuration; ``cnf.get('keep_intermediate')`` and
        ``cnf.work_dir`` are read here.
    :param inp_tsv_fpath: path to the TSV file whose header is renamed.
    :param field_map: mapping from old field names to new ones.
    :return: the ``renamed`` intermediate path when ``keep_intermediate`` is
        set, otherwise ``inp_tsv_fpath`` (rewritten in place).
    """
    keep_intermediate = cnf.get('keep_intermediate')
    if keep_intermediate:
        step_greetings('Renaming fields.')

    with open(inp_tsv_fpath) as src:
        header = src.readline()
    renamed_header = '\t'.join(
        field_map.get(name) or name for name in header.split())

    if keep_intermediate:
        out_tsv_fpath = intermediate_fname(cnf, inp_tsv_fpath, 'renamed')
    else:
        out_tsv_fpath = inp_tsv_fpath

    with file_transaction(cnf.work_dir, out_tsv_fpath) as tx_out_fpath:
        with open(tx_out_fpath, 'w') as out:
            out.write(renamed_header + '\n')
            with open(inp_tsv_fpath) as src:
                next(src, None)  # skip the original header line
                for row in src:
                    out.write(row)

    if keep_intermediate:
        return out_tsv_fpath

    # NOTE(review): in this branch out_tsv_fpath is inp_tsv_fpath, so source
    # and destination of the move are the same path.
    shutil.move(out_tsv_fpath, inp_tsv_fpath)
    return inp_tsv_fpath
def _tracks(cnf, track_fpath, input_fpath):
    """Annotate a VCF with a track file via ``vcfannotate``.

    The INFO key added by ``vcfannotate -k`` is the track file's base name
    (``splitext_plus`` of its ``basename``). After annotation, every value of
    that key is rewritten to ``TRUE`` (non-empty value) or ``FALSE`` (empty
    value).

    :param cnf: run configuration; ``cnf.reuse_intermediate`` is read here.
    :param track_fpath: path to the track file passed to ``vcfannotate -b``.
    :param input_fpath: path to the VCF to annotate.
    :return: ``None`` when the track file fails ``verify_file`` or the
        ``vcfannotate`` executable is not found; the ``call_subprocess``
        result when it fails ``verify_vcf``; otherwise the result of
        ``verify_vcf(..., is_critical=True)`` on the post-processed file.
    """
    if not verify_file(track_fpath):
        return None

    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found, you probably need to specify path in system_config, or '
            'run load bcbio: . /group/ngs/bin/bcbio-prod.sh"')
        return None

    assert input_fpath
    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(
        toolpath=toolpath, track_fpath=track_fpath,
        field_name=field_name, input_fpath=input_fpath)

    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True, overwrite=True)
    if not verify_vcf(output_fpath):
        err('Error: tracks resulted ' + str(output_fpath) + ' for ' + track_fpath)
        return output_fpath

    def proc_line(line, i):
        """Set the track's INFO value to TRUE or FALSE on a record line."""
        if field_name not in line or line.startswith('#'):
            return line

        fields = line.split('\t')
        info_items = []
        for attr in fields[7].split(';'):
            # maxsplit=1: a value that itself contains '=' must stay attached
            # to its key instead of being dropped when the pair is re-joined.
            pair = attr.split('=', 1)
            if pair[0] == field_name and len(pair) > 1:
                pair = [pair[0], 'TRUE' if pair[1] else 'FALSE']
            info_items.append('='.join(pair))
        fields[7] = ';'.join(info_items)
        return '\t'.join(fields)

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')
    return verify_vcf(output_fpath, is_critical=True)
def get_padded_bed_file(cnf, bed, genome, padding):
    """Run ``bedtools slop`` on a BED file and return the ``padded`` output path.

    :param cnf: run configuration, forwarded to the helper calls.
    :param bed: path to the input BED file (``-i``).
    :param genome: genome file passed to ``bedtools slop -g``.
    :param padding: value passed to ``bedtools slop -b``.
    :return: path of the ``padded`` intermediate file.
    """
    info('Making bed file for padded regions...')

    slop_template = '{bedtools} slop -i {bed} -g {genome} -b {padding}'
    cmdline = slop_template.format(
        bedtools=get_system_path(cnf, 'bedtools'),
        bed=bed,
        genome=genome,
        padding=padding)

    padded_fpath = intermediate_fname(cnf, bed, 'padded')
    call(cnf, cmdline, padded_fpath)
    return padded_fpath
def sort_bed(cnf, input_bed_fpath, output_bed_fpath=None):
    """Sort the regions of a BED file and write them to a new file.

    Regions are ordered by (index of the chromosome in the sequence returned
    by ``get_chr_lengths_from_seq(cnf.genome.seq)``, start, end, remaining
    fields). Chromosomes missing from that sequence get index ``-1``. Blank
    lines are dropped; lines starting with ``#`` are written as they are
    encountered, ahead of the sorted regions.

    :param cnf: run configuration; ``cnf.genome.seq``,
        ``cnf.reuse_intermediate`` and ``cnf.work_dir`` are read here.
    :param input_bed_fpath: path to the BED file to sort.
    :param output_bed_fpath: optional output path; defaults to the ``sorted``
        intermediate file of the input.
    :return: path of the sorted BED file.
    """
    input_bed_fpath = verify_bed(input_bed_fpath)
    if output_bed_fpath:
        output_bed_fpath = adjust_path(output_bed_fpath)
    else:
        output_bed_fpath = intermediate_fname(cnf, input_bed_fpath, 'sorted')

    class Region(SortableByChrom):
        def __init__(self, chrom, start, end, other_fields, chrom_ref_order):
            SortableByChrom.__init__(self, chrom, chrom_ref_order)
            self.start = start
            self.end = end
            self.chrom_ref_order = chrom_ref_order
            self.other_fields = tuple(other_fields)

        def get_key(self):
            return self.chrom_ref_order, self.start, self.end, self.other_fields

    regions = []
    chr_order = {}
    for idx, (chrom_name, _) in enumerate(get_chr_lengths_from_seq(cnf.genome.seq)):
        chr_order[chrom_name] = idx

    info('Sorting regions in ' + input_bed_fpath)
    if cnf.reuse_intermediate and isfile(output_bed_fpath) and verify_bed(
            output_bed_fpath):
        info(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    with open(input_bed_fpath) as inp:
        with file_transaction(cnf.work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for raw_line in inp:
                    stripped = raw_line.strip()
                    if not stripped:
                        continue
                    if stripped.startswith('#'):
                        # comment lines keep their input order and position
                        out.write(raw_line)
                        continue

                    tokens = stripped.split('\t')
                    chrom = tokens[0]
                    regions.append(Region(
                        chrom, int(tokens[1]), int(tokens[2]), tokens[3:],
                        chr_order.get(chrom, -1)))

                regions.sort(key=lambda reg: reg.get_key())
                for reg in regions:
                    row = [reg.chrom, str(reg.start), str(reg.end)]
                    row += reg.other_fields
                    out.write('\t'.join(row) + '\n')

    info('Sorted ' + str(len(regions)) + ' regions, saved to ' +
         output_bed_fpath + '\n')
    return output_bed_fpath
def vcf_one_per_line(cnf, vcf_fpath):
    """Pipe a VCF through ``ext_tools/vcfOnePerLine.pl``.

    :param cnf: run configuration, forwarded to the helper calls.
    :param vcf_fpath: path to the VCF fed to the script on stdin.
    :return: path of the ``opl`` intermediate file. ``critical`` is called
        when that file fails ``verify_file``.
    """
    info('Converting VCF to one-effect-per-line...')

    opl_fpath = intermediate_fname(cnf, vcf_fpath, 'opl')
    script_cmdline = get_script_cmdline(
        cnf, 'perl', join('ext_tools', 'vcfOnePerLine.pl'))
    call(cnf, script_cmdline, opl_fpath,
         stdin_fpath=vcf_fpath, exit_on_error=False)
    info()

    if verify_file(opl_fpath):
        return opl_fpath

    critical('Error: vcf_one_per_line didn\'t generate output file.')
    return opl_fpath
def _clip_vcf_by_bed(cnf, vcf_fpath, bed_fpath):
    """Intersect a VCF with a BED file and compress/index the result.

    Runs ``bedtools intersect -header -a <vcf> -b <bed>`` into the ``clip``
    intermediate file, then hands that file to ``bgzip_and_tabix``.

    :param cnf: run configuration, forwarded to the helper calls.
    :param vcf_fpath: path to the VCF to clip.
    :param bed_fpath: path to the BED file defining the regions.
    :return: whatever ``bgzip_and_tabix`` returns for the clipped VCF.
    """
    info('Clipping VCF ' + vcf_fpath + ' using BED ' + bed_fpath)

    bedtools = get_system_path(cnf, 'bedtools')
    clipped_vcf_fpath = intermediate_fname(cnf, vcf_fpath, 'clip')

    intersect_template = '{bedtools} intersect -header -a {vcf_fpath} -b {bed_fpath}'
    call(cnf,
         intersect_template.format(
             bedtools=bedtools, vcf_fpath=vcf_fpath, bed_fpath=bed_fpath),
         output_fpath=clipped_vcf_fpath)

    return bgzip_and_tabix(cnf, clipped_vcf_fpath)
def group_and_merge_regions_by_gene(cnf, bed_fpath, keep_genes=False):
    """Run ``group_and_merge_by_gene.py`` on a BED file.

    :param cnf: run configuration, forwarded to the helper calls.
    :param bed_fpath: path to the BED file to process.
    :param keep_genes: when false, the script output is piped through
        ``grep -vw Gene`` before being written.
    :return: path of the ``grp_mrg`` intermediate file.
    """
    grouped_fpath = intermediate_fname(cnf, bed_fpath, 'grp_mrg')

    script_cmd = get_system_path(
        cnf, 'python',
        join('tools', 'bed_processing', 'group_and_merge_by_gene.py'))

    pipeline = ['{group_merge_bed_py} {bed_fpath}'.format(
        group_merge_bed_py=script_cmd, bed_fpath=bed_fpath)]
    if not keep_genes:
        pipeline.append('grep -vw Gene')

    call(cnf, ' | '.join(pipeline), grouped_fpath)
    return grouped_fpath
def fix_vcf_sample_name(cnf, sample_name, vcf_fpath, output_fpath=None):
    """Rewrite the sample name of a VCF and compress/index the result.

    On the ``#CHROM`` header line the 10th column is replaced with
    ``sample_name``; on record lines every ``SAMPLE=...`` entry of the INFO
    column (8th) is replaced with ``SAMPLE=<sample_name>``. All other lines
    pass through unchanged.

    :param cnf: run configuration, forwarded to the helper calls.
    :param sample_name: the sample name to write.
    :param vcf_fpath: path to the VCF to fix.
    :param output_fpath: optional output path; defaults to the ``sample``
        intermediate file of the input.
    :return: whatever ``bgzip_and_tabix`` returns for the fixed VCF.
    """
    output_fpath = output_fpath or intermediate_fname(cnf, vcf_fpath, 'sample')
    sample_tag = 'SAMPLE=' + sample_name

    def fix_sample_name(l, i):
        # Work on the line without its terminator so that replacing the last
        # field cannot swallow a trailing newline; it is re-attached below.
        body = l.rstrip('\r\n')
        eol = l[len(body):]

        if body.startswith('#CHROM'):
            fs = body.split('\t')
            # Headers without a sample column (fewer than 10 fields) have
            # nothing to rename and are left untouched.
            if len(fs) > 9:
                fs[9] = sample_name
                return '\t'.join(fs) + eol
        elif not body.startswith('#'):
            fs = body.split('\t')
            if len(fs) > 7:
                fs[7] = ';'.join(
                    sample_tag if kv.startswith('SAMPLE=') else kv
                    for kv in fs[7].split(';'))
                # Re-joining the full field list avoids appending a spurious
                # trailing tab to records that have exactly 8 columns.
                return '\t'.join(fs) + eol
        return l

    fixed_vcf = iterate_file(cnf, vcf_fpath, fix_sample_name,
                             output_fpath=output_fpath)
    return bgzip_and_tabix(cnf, fixed_vcf)
def split_bed_by_chrom(cnf, bed_fpath):
    """Split a BED file into one intermediate file per chromosome.

    The chromosome is the first tab-separated field of each stripped line.
    Blank lines are skipped. A chromosome that reappears after other
    chromosomes is appended to the file already started for it in this call.

    :param cnf: run configuration, forwarded to ``intermediate_fname``.
    :param bed_fpath: path to the BED file to split.
    :return: dict mapping chromosome name to the path of its BED file.
    """
    info('Splitting the BED file ' + bed_fpath + ' by chromosome: ', ending='')
    bed_fpath_by_chrom = dict()
    cur_chr_f = None
    cur_chr = None

    try:
        with open(bed_fpath) as f:
            for l in f:
                stripped = l.strip()
                if not stripped:
                    # ''.split('\t') yields [''], which would otherwise be
                    # treated as a chromosome named ''.
                    continue

                chrom = stripped.split('\t')[0]
                if chrom != cur_chr:
                    if cur_chr_f is not None:
                        cur_chr_f.close()
                        cur_chr_f = None
                    if cur_chr:
                        info(str(cur_chr), ending=', ', print_date=False)
                    cur_chr = chrom

                    if chrom in bed_fpath_by_chrom:
                        # Unsorted input: append instead of truncating the
                        # records written for this chromosome earlier.
                        cur_chr_f = open(bed_fpath_by_chrom[chrom], 'a')
                    else:
                        cur_chr_fpath = intermediate_fname(cnf, bed_fpath, chrom)
                        cur_chr_f = open(cur_chr_fpath, 'w')
                        bed_fpath_by_chrom[chrom] = cur_chr_fpath

                cur_chr_f.write(l)
    finally:
        # Make sure the last per-chromosome file is flushed and closed before
        # callers read the returned paths, even when an error interrupts.
        if cur_chr_f is not None:
            cur_chr_f.close()

    info('Done.', print_date=False)
    return bed_fpath_by_chrom
def remove_dups_picard(cnf, bam_fpath):
    """Remove duplicate reads from a BAM file with Picard ``MarkDuplicates``.

    When the first run does not return the expected output path, the BAM is
    passed through ``_fix_bam_for_picard`` and Picard is run once more on the
    fixed file.

    :param cnf: run configuration; ``cnf.work_dir`` holds the metrics file.
    :param bam_fpath: path to the BAM file to deduplicate.
    :return: path of the ``pcd_dedup`` intermediate BAM, or ``None`` when
        both Picard runs fail.
    """
    picard = get_system_path(cnf, 'java', 'picard')
    if not picard:
        critical('No picard in the system')

    info('Running picard dedup for "' + basename(bam_fpath) + '"')

    dup_metrics_txt = join(cnf.work_dir, 'picard_dup_metrics.txt')
    output_fpath = intermediate_fname(cnf, bam_fpath, 'pcd_dedup')

    cmdline = '{picard} MarkDuplicates' \
              ' I={bam_fpath}' \
              ' O={output_fpath}' \
              ' METRICS_FILE={dup_metrics_txt}' \
              ' REMOVE_DUPLICATES=True' \
              ' VALIDATION_STRINGENCY=LENIENT'

    def run_picard(input_bam_fpath):
        # Picard writes O= itself, so stdout is not redirected to the output.
        return call(
            cnf,
            cmdline.format(picard=picard, bam_fpath=input_bam_fpath,
                           output_fpath=output_fpath,
                           dup_metrics_txt=dup_metrics_txt),
            output_fpath=output_fpath,
            stdout_to_outputfile=False,
            exit_on_error=False)

    res = run_picard(bam_fpath)
    if res != output_fpath:  # error occurred, try to correct BAM and restart
        warn('Picard deduplication failed for "' + basename(bam_fpath) +
             '". Fixing BAM and restarting Picard...')
        bam_fpath = _fix_bam_for_picard(cnf, bam_fpath)
        res = run_picard(bam_fpath)

    if res != output_fpath:
        return None

    dup_rate = _parse_picard_dup_report(dup_metrics_txt)
    # The None check must come first: ordering None against a float raises
    # TypeError on Python 3.
    assert dup_rate is None or dup_rate <= 1.0, str(dup_rate)
    info('Duplication rate (picard): ' + str(dup_rate))
    return output_fpath
def _snpsift_db_nsfp(cnf, input_fpath):
    """Annotate a VCF with ``snpsift dbnsfp``.

    :param cnf: run configuration; the ``dbnsfp`` entries of
        ``cnf.annotation`` and ``cnf.genome`` and ``cnf.reuse_intermediate``
        are read here.
    :param input_fpath: path to the VCF to annotate.
    :return: ``None`` when ``dbnsfp`` is missing from either config section,
        the database file fails ``verify_file``, or the subprocess call
        returns a falsy value; otherwise the annotated VCF path.
    """
    if not ('dbnsfp' in cnf.annotation and 'dbnsfp' in cnf.genome):
        return None

    step_greetings('DB SNFP')

    output_fpath = intermediate_fname(cnf, input_fpath, 'db_nsfp')
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-len('.gz')]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    snpsift_cmd = get_java_tool_cmdline(cnf, 'snpsift')

    db_path = cnf['genome']['dbnsfp']
    if not verify_file(db_path, 'DB NSFP file'):
        err('DB NSFP file is incorrect. Skipping.')
        return None

    annotations = cnf.annotation['dbnsfp'].get('annotations') or []
    if annotations:
        ann_line = '-f ' + ','.join(annotations)
    else:
        ann_line = ''

    template = '{executable} dbnsfp {ann_line} -v -db {db_path} ' \
               '{input_fpath}'
    cmdline = template.format(
        executable=snpsift_cmd, ann_line=ann_line,
        db_path=db_path, input_fpath=input_fpath)

    succeeded = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                stdout_to_outputfile=True,
                                exit_on_error=False, overwrite=True)
    if not succeeded:
        return None
    return verify_vcf(output_fpath, is_critical=True)
def get_bedgraph_coverage(cnf, bam_fpath, chr_len_fpath=None, output_fpath=None,
                          bed_fpath=None, exit_on_error=True):
    """Produce a bedGraph coverage file for a BAM via ``bedtools genomecov``.

    The BAM is deduplicated (unless the dedup intermediate already verifies),
    indexed, converted to BED, sorted, optionally intersected with
    ``bed_fpath``, and finally passed to ``bedtools genomecov -bg -split``.

    :param cnf: run configuration; ``cnf.work_dir`` is read here.
    :param bam_fpath: path to the input BAM.
    :param chr_len_fpath: chromosome lengths file; defaults to
        ``get_chr_len_fpath(cnf)``.
    :param output_fpath: bedGraph output path; defaults to the BAM path with
        its extension replaced by ``.bedgraph``.
    :param bed_fpath: optional BED file to intersect the reads with.
    :param exit_on_error: forwarded to ``call`` for the genomecov run.
    :return: the bedGraph path, or ``None`` when the reads BED is empty or
        the (intersected) BED fails ``verify_file``.
    """
    if not chr_len_fpath:
        chr_len_fpath = get_chr_len_fpath(cnf)

    dedup_bam = intermediate_fname(cnf, bam_fpath, source.dedup_bam)
    if verify_bam(dedup_bam, silent=True):
        info(dedup_bam + ' exists')
    else:
        info('Deduplicating bam file ' + bam_fpath)
        remove_dups(cnf, bam_fpath, dedup_bam)
    index_bam(cnf, dedup_bam)

    reads_bed_fpath = bam_to_bed(cnf, dedup_bam, to_gzip=False)
    if getsize(reads_bed_fpath) <= 0:
        info('No coverage for ' + bam_fpath + ', skipping.')
        return None

    in_bed_fpath = sort_bed_by_alphabet(cnf, reads_bed_fpath,
                                        chr_len_fpath=chr_len_fpath)
    if bed_fpath:
        in_bed_fpath = intersect_bed(cnf, in_bed_fpath, bed_fpath)

    if not verify_file(in_bed_fpath, silent=True):
        info('No coverage in ' + in_bed_fpath)
        return None

    if output_fpath:
        bedgraph_fpath = output_fpath
    else:
        bedgraph_fpath = '%s.bedgraph' % splitext(bam_fpath)[0]

    with file_transaction(cnf.work_dir, bedgraph_fpath) as tx_fpath:
        genomecov_template = \
            '{bedtools} genomecov -bg -split -g {chr_len_fpath} -i {in_bed_fpath}'
        cmdl = genomecov_template.format(
            bedtools=get_system_path(cnf, 'bedtools'),
            chr_len_fpath=chr_len_fpath,
            in_bed_fpath=in_bed_fpath)
        call(cnf, cmdl, exit_on_error=exit_on_error, output_fpath=tx_fpath)

    return bedgraph_fpath
def _mongo(cnf, input_fpath):
    """Run the ``VCFStore.jar`` annotation module on a VCF.

    :param cnf: run configuration; ``cnf.annotation`` and
        ``cnf.project_name`` are read here.
    :param input_fpath: path to the VCF passed as ``-inputFile``.
    :return: path of the ``mongo`` intermediate file, or ``None`` when
        ``mongo`` is not in ``cnf.annotation`` or the subprocess call returns
        a falsy value.
    """
    step_greetings('Annotating from Mongo')

    if 'mongo' not in cnf.annotation:
        return None

    jar_cmdline = get_java_tool_cmdline(
        cnf, join('ext_tools', 'mongo_loader', 'VCFStore.jar'))
    output_fpath = intermediate_fname(cnf, input_fpath, 'mongo')

    template = ('{executable} -module annotation -inputFile {input_fpath} '
                '-outputFile {output_fpath} -project {project_name} ')
    cmdline = template.format(
        executable=jar_cmdline,
        input_fpath=input_fpath,
        output_fpath=output_fpath,
        project_name=cnf.project_name)

    # The tool writes -outputFile itself, so stdout is not captured.
    succeeded = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                stdout_to_outputfile=False,
                                exit_on_error=False)
    return output_fpath if succeeded else None
def _read_vcf_records_per_bed_region_and_clip_vcf(cnf, vcf_fpath, bed_fpath, region_type, sample):
    """Intersect a VCF with a BED file and collect regions and classified variants.

    Runs ``bedtools intersect -header -a <vcf> -b <bed> -wo`` (unless a
    reusable intersection file exists), then parses the result line by line.

    :param cnf: run configuration; ``cnf.work_dir`` and
        ``cnf.reuse_intermediate`` are read here.
    :param vcf_fpath: path to the VCF (``-a``).
    :param bed_fpath: path to the BED file (``-b``).
    :param region_type: string used in the intermediate file names.
    :param sample: not referenced in this function.
    :return: tuple ``(clipped_gz_vcf_fpath, regions_in_order, vars_by_region,
        var_by_site)``; four ``None`` values when the bedtools call returns a
        falsy result.
    """
    info()
    info('Intersecting VCF ' + vcf_fpath + ' using BED ' + bed_fpath)

    # vcf_columns_num is computed but not read below
    vcf_columns_num = count_bed_cols(vcf_fpath)
    bed_columns_num = count_bed_cols(bed_fpath)

    vcf_bed_intersect = join(
        cnf.work_dir,
        splitext(basename(vcf_fpath))[0] + '_' + region_type + '_vcf_bed.intersect')
    bedtools = get_system_path(cnf, 'bedtools')
    if not cnf.reuse_intermediate or not verify_file(
            vcf_bed_intersect, silent=True, is_critical=False):
        cmdline = '{bedtools} intersect -header -a {vcf_fpath} -b {bed_fpath} -wo'.format(
            **locals())
        res = call(cnf, cmdline, output_fpath=vcf_bed_intersect,
                   max_number_of_tries=1, exit_on_error=False)
        if not res:
            return None, None, None, None

    # regions_set gives fast membership tests; regions_in_order keeps the
    # first-seen order of the same region tuples
    regions_in_order = []
    regions_set = set()
    vars_by_region = defaultdict(dict)
    var_by_site = dict()

    clipped_vcf_fpath = intermediate_fname(cnf, splitext(basename(vcf_fpath))[0],
                                           '_' + region_type + '_clip')

    with open(vcf_bed_intersect) as f, open(clipped_vcf_fpath, 'w') as clip_vcf:
        for l in f:
            l = l.strip()
            if not l or l.startswith('#'):
                # header and blank lines are copied to the clipped VCF as is
                clip_vcf.write(l + '\n')
                continue
            fs = l.split('\t')
            # first 8 fields are the VCF columns
            chrom, pos, id_, ref, alt, qual, filt, info_fields = fs[:8]
            chrom_b, start_b, end_b, symbol, strand, feature, biotype = None, None, None, None, None, None, None
            # the BED columns are taken from the tail of the line:
            # bed_columns_num fields plus one trailing field
            if bed_columns_num >= 8:
                chrom_b, start_b, end_b, symbol, _, strand, feature, biotype, _ = fs[
                    -(bed_columns_num + 1):][:9]
            elif bed_columns_num >= 4:
                chrom_b, start_b, end_b, symbol, _ = fs[-(bed_columns_num + 1):][:5]
            # NOTE(review): with fewer than 4 BED columns chrom_b stays None
            # and this assertion fails — confirm such input never gets here
            assert chrom == chrom_b, l

            r = chrom, id_, start_b, end_b, symbol, strand, feature, biotype
            if r not in regions_set:
                regions_set.add(r)
                regions_in_order.append(r)

            # 'Deleterious' takes precedence when both markers are present
            cls = None
            if '=Hotspot' in info_fields:
                cls = 'Hotspot'
            if '=Deleterious' in info_fields:
                cls = 'Deleterious'
            if cls:
                var = Variant(chrom, pos, ref, alt, cls)
                vars_by_region[r][(chrom, pos, ref, alt)] = var
                var_by_site[(chrom, pos, ref, alt)] = var
                # NOTE(review): as laid out here, only classified records are
                # written to the clipped VCF (first 8 columns) — confirm
                clip_vcf.write('\t'.join(
                    [chrom, pos, id_, ref,
                     alt, qual, filt, info_fields]) + '\n')

    clipped_gz_vcf_fpath = bgzip_and_tabix(cnf, clipped_vcf_fpath,
                                           max_number_of_tries=1, exit_on_error=False)
    return clipped_gz_vcf_fpath, regions_in_order, vars_by_region, var_by_site
def _fix_bam_for_picard(cnf, bam_fpath):
    """Write a copy of a BAM in which over-represented reads are reduced to one pair.

    Reads whose name occurs more than twice in the queryname-sorted input are
    "problem reads". For each of them the mutually consistent mate pair with
    the highest summed mapping quality is kept, and the read's other
    alignments are dropped (all of them when no consistent pair exists).
    ``pysam`` is used when it can be imported; otherwise the work is done on
    SAM text produced by ``samtools view``.

    :param cnf: run configuration, forwarded to the helper calls.
    :param bam_fpath: path to the BAM file to fix.
    :return: path of the ``fixed_for_picard`` intermediate BAM.
    """
    def __process_problem_read_aligns(read_aligns):
        # each alignment: 0:NAME 1:FLAG 2:CHR 3:COORD 4:MAPQUAL 5:CIGAR 6:MATE_CHR 7:MATE_COORD TLEN SEQ ...
        def __get_key(align):
            fields = align.split('\t')
            return fields[2] + '@' + fields[3]

        def __get_mate_key(align):
            fields = align.split('\t')
            # '=' in the MATE_CHR column means "same reference as this
            # alignment", so the key must be built from CHR in that case.
            mate_chr = fields[2] if fields[6] == '=' else fields[6]
            return mate_chr + '@' + fields[7]

        def __get_pair_mapq(pair):
            # numeric sum; concatenating the MAPQ strings would sort
            # lexicographically
            return int(pair[0].split('\t')[4]) + int(pair[1].split('\t')[4])

        chr_coord = OrderedDict()
        for align in read_aligns:
            chr_coord.setdefault(__get_key(align), []).append(align)

        correct_pairs = []
        for align in read_aligns:
            mate_key = __get_mate_key(align)
            if mate_key in chr_coord:
                for pair_align in chr_coord[mate_key]:
                    # consider each pair once: the mate must come later
                    if read_aligns.index(pair_align) <= read_aligns.index(align):
                        continue
                    if __get_mate_key(pair_align) == __get_key(align):
                        correct_pairs.append((align, pair_align))

        if not correct_pairs:
            return []
        if len(correct_pairs) > 1:
            # sort by sum of mapping quality of both alignments
            correct_pairs.sort(key=__get_pair_mapq, reverse=True)
        return [correct_pairs[0][0], correct_pairs[0][1]]

    samtools = get_system_path(cnf, 'samtools')
    try:
        import pysam
        without_pysam = False
    except ImportError:
        without_pysam = True

    # find reads presented more than twice in input BAM
    if without_pysam:
        qname_sorted_sam_fpath = intermediate_fname(
            cnf, bam_fpath, 'qname_sorted')[:-len('bam')] + 'sam'
        # queryname sorting; output is SAM
        cmdline = '{samtools} view {bam_fpath} | sort '.format(
            samtools=samtools, bam_fpath=bam_fpath)
        call(cnf, cmdline, qname_sorted_sam_fpath)
        qname_sorted_file = open(qname_sorted_sam_fpath, 'r')
    else:
        qname_sorted_bam_fpath = intermediate_fname(cnf, bam_fpath, 'qname_sorted')
        # queryname sorting (-n), to stdout (-o), 'prefix' is not used; output is BAM
        cmdline = '{samtools} sort -n -o {bam_fpath} prefix'.format(
            samtools=samtools, bam_fpath=bam_fpath)
        call(cnf, cmdline, qname_sorted_bam_fpath)
        qname_sorted_file = pysam.Samfile(qname_sorted_bam_fpath, 'rb')

    problem_reads = dict()
    cur_read_aligns = []
    for line in qname_sorted_file:
        line = str(line)
        if cur_read_aligns:
            if line.split('\t')[0] != cur_read_aligns[0].split('\t')[0]:
                # read name changed: flush the finished group
                if len(cur_read_aligns) > 2:
                    problem_reads[cur_read_aligns[0].split('\t')[0]] = cur_read_aligns
                cur_read_aligns = []
        # the value is not used below; int() still rejects a non-numeric FLAG
        flag = int(line.split('\t')[1])
        cur_read_aligns.append(line)
    if len(cur_read_aligns) > 2:
        problem_reads[cur_read_aligns[0].split('\t')[0]] = cur_read_aligns
    qname_sorted_file.close()

    for read_id, read_aligns in problem_reads.items():
        problem_reads[read_id] = __process_problem_read_aligns(read_aligns)

    # correct input BAM
    fixed_bam_fpath = intermediate_fname(cnf, bam_fpath, 'fixed_for_picard')
    fixed_sam_fpath = fixed_bam_fpath[:-len('bam')] + 'sam'
    if without_pysam:
        sam_fpath = intermediate_fname(cnf, bam_fpath, 'tmp')[:-len('bam')] + 'sam'
        cmdline = '{samtools} view -h {bam_fpath}'.format(
            samtools=samtools, bam_fpath=bam_fpath)
        call(cnf, cmdline, sam_fpath)
        input_file = open(sam_fpath, 'r')
        fixed_file = open(fixed_sam_fpath, 'w')
    else:
        input_file = pysam.Samfile(bam_fpath, 'rb')
        fixed_file = pysam.Samfile(fixed_bam_fpath, 'wb', template=input_file)

    for line in input_file:
        if without_pysam and line.startswith('@'):  # header
            fixed_file.write(line)
            continue
        read_name = str(line).split('\t')[0]
        # drop alignments of problem reads that were not selected above
        if read_name in problem_reads and str(
                line) not in problem_reads[read_name]:
            continue
        fixed_file.write(line)
    input_file.close()
    fixed_file.close()

    if without_pysam:
        # convert the fixed SAM text back to BAM
        cmdline = '{samtools} view -bS {fixed_sam_fpath}'.format(
            samtools=samtools, fixed_sam_fpath=fixed_sam_fpath)
        call(cnf, cmdline, fixed_bam_fpath)
    return fixed_bam_fpath
def _snpeff(cnf, input_fpath):
    """Annotate a VCF with snpEff.

    :param cnf: run configuration; the ``snpeff`` entries of
        ``cnf.annotation``, ``cnf.genome`` and ``cnf.resources``, plus
        ``cnf.work_dir``, ``cnf.sample``, ``cnf.caller``,
        ``cnf.transcripts_fpath``, ``cnf.reuse_intermediate`` and
        ``cnf.no_check`` are read here.
    :param input_fpath: path to the VCF to annotate.
    :return: tuple ``(annotated VCF path, stats CSV path, genes TXT path)``,
        or three ``None`` values when snpEff is not configured, previous
        annotations could not be removed, or the run result is falsy.
    """
    if 'snpeff' not in cnf.annotation or 'snpeff' not in cnf.genome:
        return None, None, None

    step_greetings('SnpEff')

    output_fpath = intermediate_fname(cnf, input_fpath, 'snpEff')
    stats_fpath = join(
        cnf.work_dir,
        cnf.sample + (('-' + cnf.caller) if cnf.caller else '') + '.snpEff_summary.csv')

    # the output is written uncompressed, so drop a '.gz' suffix
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath, stats_fpath, splitext(
            stats_fpath)[0] + '.genes.txt'

    snpeff = get_java_tool_cmdline(cnf, 'snpeff')

    # map genome names to the snpEff database names used on the command line
    ref_name = cnf.genome.snpeff.reference or cnf.genome.name
    if ref_name.startswith('hg19') or ref_name.startswith('GRCh37'):
        ref_name = 'GRCh37.75'
    if ref_name.startswith('hg38'):
        ref_name = 'GRCh38.82'

    opts = ''
    if cnf.annotation.snpeff.cancer:
        opts += ' -cancer'

    assert cnf.transcripts_fpath, 'Transcript for annotation must be specified!'
    verify_file(cnf.transcripts_fpath, 'Transcripts for snpEff -onlyTr',
                is_critical=True)
    opts += ' -onlyTr ' + cnf.transcripts_fpath + ' '

    # an explicit data directory wins over a config file
    db_path = adjust_system_path(cnf.genome.snpeff.data)
    if db_path:
        opts += ' -dataDir ' + db_path
    elif cnf.resources.snpeff.config:
        conf = get_system_path(cnf, cnf.resources.snpeff.config)
        if conf:
            opts += ' -c ' + conf + ' '
        else:
            err('Cannot find snpEff config file ' +
                str(cnf.resources.snpeff.config))

    # NOTE(review): this appends an empty string, so extra_options currently
    # has no effect on the command line — confirm whether that is intended
    if cnf.annotation.snpeff.extra_options:
        opts += ''

    if not cnf.no_check:
        info('Removing previous snpEff annotations...')
        res = remove_prev_eff_annotation(cnf, input_fpath)
        if not res:
            err('Could not remove preivous snpEff annotations')
            return None, None, None
        input_fpath = res

    # the stats options are spelled differently for the "old" snpEff type
    snpeff_type = get_snpeff_type(snpeff)
    if snpeff_type == "old":
        opts += ' -stats ' + stats_fpath + ' -csvStats'
    else:
        opts += ' -csvStats ' + stats_fpath

    cmdline = '{snpeff} eff {opts} -noLog -i vcf -o vcf {ref_name} {input_fpath}'.format(
        **locals())

    # retry on OSError, waiting a minute between attempts
    # NOTE(review): if every attempt raises OSError, `res` keeps the value
    # assigned above (or is unbound when cnf.no_check is set) — confirm
    for i in range(1, 20):
        try:
            res = call_subprocess(cnf, cmdline,
                                  input_fpath, output_fpath,
                                  exit_on_error=False,
                                  stdout_to_outputfile=True, overwrite=True)
        except OSError:
            import traceback, time
            err(traceback.format_exc())
            warn()
            info('Waiting 1 minute')
            time.sleep(60)
            info('Rerunning ' + str(i))
        else:
            break

    output_fpath = verify_vcf(output_fpath, is_critical=True)

    # remove the HTML summary if one appeared in the current directory
    snpeff_summary_html_fpath = 'snpEff_summary.html'
    if isfile(snpeff_summary_html_fpath):
        info('SnpEff created ' + snpeff_summary_html_fpath +
             ' in the cwd, removing it...')
        try:
            os.remove(snpeff_summary_html_fpath)
        except OSError:
            pass

    if res:
        return output_fpath, stats_fpath, splitext(
            stats_fpath)[0] + '.genes.txt'
    else:
        return None, None, None
def _snpsift_annotate(cnf, vcf_conf, dbname, input_fpath):
    """Annotate a VCF from a database VCF using ``snpsift annotate``.

    :param cnf: run configuration; ``cnf['genome']``,
        ``cnf.reuse_intermediate`` and ``cnf.no_check`` are read here.
    :param vcf_conf: mapping for this database; its ``path`` and
        ``annotations`` entries are read.
    :param dbname: database name; used as the intermediate file suffix and
        as the key into ``cnf['genome']``.
    :param input_fpath: path to the VCF to annotate.
    :return: ``None`` when ``vcf_conf`` is falsy or no database path is
        configured; the falsy ``call_subprocess`` result on failure;
        otherwise the result of ``verify_vcf(..., is_critical=True)``.
    """
    if not vcf_conf:
        err('No database for ' + dbname + ', skipping.')
        return None

    step_greetings('Annotating with ' + dbname)

    output_fpath = intermediate_fname(cnf, input_fpath, dbname)
    # the output is written uncompressed, so drop a '.gz' suffix
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')
    java = get_system_path(cnf, 'java')
    info('Java version:')
    call(cnf, java + ' -version')
    info()

    # the genome section takes precedence over the 'path' of vcf_conf
    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = vcf_conf.get('path')
        if not db_path:
            err('Please, provide a path to ' + dbname +
                ' in the "genomes" section in the system config. The config is: ' +
                str(cnf['genome']))
            return
    verify_file(db_path, is_critical=True)

    annotations = vcf_conf.get('annotations')

    if not cnf.no_check:
        info('Removing previous annotations...')

        def delete_annos(rec):
            # drop the INFO keys that are about to be re-annotated
            for anno in annotations:
                if anno in rec.INFO:
                    del rec.INFO[anno]
            return rec

        if annotations:
            input_fpath = iterate_vcf(cnf, input_fpath, delete_annos, suffix='d')

    anno_line = ''
    if annotations:
        anno_line = '-info ' + ','.join(annotations)

    cmdline = '{executable} annotate -v {anno_line} {db_path} {input_fpath}'.format(
        **locals())
    output_fpath = call_subprocess(cnf, cmdline, input_fpath, output_fpath,
                                   stdout_to_outputfile=True, exit_on_error=False,
                                   overwrite=True)
    if not output_fpath:
        err('Error: snpsift resulted ' + str(output_fpath) + ' for ' + dbname)
        return output_fpath
    verify_vcf(output_fpath, is_critical=True)
    # f = open(output_fpath)
    # l = f.readline()
    # if 'Cannot allocate memory' in l:
    #     f.close()
    #     f = open(output_fpath)
    #     contents = f.read()
    #     critical('SnpSift failed with memory issue:\n' + contents)
    #     f.close()
    #     return None

    if not cnf.no_check:
        # verbose-mode pattern for a ##INFO header line; whitespace inside
        # the pattern is ignored under re.VERBOSE
        info_pattern = re.compile(
            r'''\#\#INFO=<
                ID=(?P<id>[^,]+),\s*
                Number=(?P<number>-?\d+|\.|[AG]),\s*
                Type=(?P<type>Integer|Float|Flag|Character|String),\s*
                Description="(?P<desc>[^"]*)"
                >''', re.VERBOSE)

        def _fix_after_snpsift(line, i, ctx):
            if not line.startswith('#'):
                # non-header lines seen before the #CHROM line are dropped
                if not ctx['met_CHROM']:
                    return None
                line = line.replace(' ', '_')
                assert ' ' not in line
            # elif line.startswith('##INFO=<ID=om'):
            #     line = line.replace(' ', '')
            elif not ctx['met_CHROM'] and line.startswith('#CHROM'):
                ctx['met_CHROM'] = True
            elif line.startswith('##INFO'):
                # rebuild matching ##INFO lines in a fixed layout
                m = info_pattern.match(line)
                if m:
                    line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
                        m.group('id'), m.group('number'), m.group('type'),
                        m.group('desc'))
            return line

        output_fpath = iterate_file(cnf, output_fpath, _fix_after_snpsift,
                                    suffix='fx', ctx=dict(met_CHROM=False))

    return verify_vcf(output_fpath, is_critical=True)
def prepare_beds(cnf, features_bed=None, target_bed=None, seq2c_bed=None):
    """Verify and pre-process the features, target and Seq2C BED files.

    :param cnf: run configuration; ``cnf.reannotate``, ``cnf.genome.name``,
        ``cnf.genome.cds`` and ``cnf.sys_cnf`` are read here.
    :param features_bed: optional path to the features BED.
    :param target_bed: optional path to the target BED.
    :param seq2c_bed: optional path to the Seq2C BED.
    :return: tuple ``(features_bed, features_no_genes_bed, target_bed,
        seq2c_bed)`` of processed paths; ``features_no_genes_bed`` is
        ``None`` when no features BED was given.
    """
    if features_bed is None and target_bed is None:
        warn(
            'No input target BED, and no features BED in the system config specified. Not making detailed per-gene reports.'
        )
        # return None, None, None, None

    if target_bed:
        target_bed = verify_bed(target_bed, is_critical=True)

    if seq2c_bed:
        seq2c_bed = verify_bed(seq2c_bed, is_critical=True)

    if features_bed:
        features_bed = verify_bed(features_bed, is_critical=True)

    # if features_bed and target_bed and abspath(features_bed) == abspath(target_bed):
    #     warn('Same file used for exons and amplicons: ' + features_bed)

    # Features
    features_no_genes_bed = None
    if features_bed:
        # info()
        # info('Merging regions within genes...')
        # exons_bed = group_and_merge_regions_by_gene(cnf, exons_bed, keep_genes=True)
        #
        # info()
        # info('Sorting exons by (chrom, gene name, start)')
        # exons_bed = sort_bed(cnf, exons_bed)

        info()
        info(
            'Filtering the features bed file to have only non-gene and no-transcript records...'
        )
        features_no_genes_bed = intermediate_fname(cnf, features_bed, 'no_genes')
        call(cnf, 'grep -vw Gene ' + features_bed + ' | grep -vw Transcript',
             output_fpath=features_no_genes_bed)

    # remember the verified target path: it is compared with seq2c_bed below
    ori_target_bed_path = target_bed
    if target_bed:
        info()
        info('Remove comments in target...')
        target_bed = remove_comments(cnf, target_bed)

        info()
        info('Cut -f1,2,3,4 target...')
        target_bed = cut(cnf, target_bed, 4)

        info()
        info('Sorting target...')
        target_bed = sort_bed(cnf, target_bed)

        cols = count_bed_cols(target_bed)
        if cnf.reannotate or cols < 4:
            info()
            if not features_bed:
                critical(
                    str(cols) +
                    ' columns (less than 4), and no features to annotate regions '
                    '(please make sure you have set the "features" key in the corresponding genome section '
                    '(' + cnf.genome.name + ') in ' + cnf.sys_cnf)
            info(
                'cnf.reannotate is ' + str(cnf.reannotate) +
                ', and cols in the target BED is ' + str(cols) +
                '. Annotating target with the gene names from the "features" file '
                + features_bed + '...')
            target_bed = annotate_target(cnf, target_bed)

    def remove_no_anno(l, i):
        # drop lines whose 4th column is '.'
        if l.split('\t')[3].strip() == '.':
            return None
        else:
            return l

    # Seq2C BED: derive it from the target when none was given or when it is
    # the same file as the target; otherwise process it separately; with
    # neither available, use cnf.genome.cds
    if not seq2c_bed and target_bed or seq2c_bed and seq2c_bed == ori_target_bed_path:
        info('Seq2C bed: remove regions with no gene annotation')
        seq2c_bed = target_bed
        seq2c_bed = iterate_file(cnf, seq2c_bed, remove_no_anno, suffix='filt')

    elif seq2c_bed:
        info()
        info('Remove comments in seq2c bed...')
        seq2c_bed = remove_comments(cnf, seq2c_bed)

        info()
        info('Sorting seq2c bed...')
        seq2c_bed = sort_bed(cnf, seq2c_bed)

        cols = count_bed_cols(seq2c_bed)
        if cols < 4:
            info()
            info('Number columns in SV bed is ' + str(cols) +
                 '. Annotating amplicons with gene names...')
            seq2c_bed = annotate_target(cnf, seq2c_bed)
        elif 8 > cols > 4:
            seq2c_bed = cut(cnf, seq2c_bed, 4)
        elif cols > 8:
            seq2c_bed = cut(cnf, seq2c_bed, 8)
        info('Filtering non-annotated entries in seq2c bed')
        seq2c_bed = iterate_file(cnf, seq2c_bed, remove_no_anno, suffix='filt')

    else:
        seq2c_bed = verify_bed(cnf.genome.cds)

    if target_bed:
        info()
        # info('Merging amplicons...')
        # target_bed = group_and_merge_regions_by_gene(cnf, target_bed, keep_genes=False)

        info('Sorting target by (chrom, gene name, start)')
        target_bed = sort_bed(cnf, target_bed)

    return features_bed, features_no_genes_bed, target_bed, seq2c_bed
def cut(cnf, fpath, col_num):
    """Keep the first ``col_num`` columns of a file using the ``cut`` utility.

    :param cnf: run configuration, forwarded to the helper calls.
    :param fpath: path to the tab-separated input file.
    :param col_num: number of leading columns to keep.
    :return: path of the ``cut`` intermediate file.
    """
    cut_fpath = intermediate_fname(cnf, fpath, 'cut')

    # columns are listed explicitly: 1,2,...,col_num
    column_list = ','.join(str(col) for col in range(1, col_num + 1))
    call(cnf, 'cut -f' + column_list + ' ' + fpath, cut_fpath)
    return cut_fpath