def gen_report(vcf):
    """Write a prioritized-impact indel report for a VEP-annotated VCF.

    The sample name is the VCF basename up to its first '.'.  The report goes
    to '<sample>.indels.vep.prioritized_impact.report.xls' in the current
    directory; progress is logged to LOGS/<sample>.indels.vep_priority.report.log.

    :param vcf: path to a VCF whose header declares an ANN INFO field and whose
        records carry MINCOV, ALTCOV and COVRATIO INFO values
    :return: 0 after every record has been passed to output_highest_impact
    """
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.indels.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    vcf_in = VariantFile(vcf)
    # annotation sub-fields to report; each value is replaced below by the field's
    # position within a '|'-delimited ANN entry
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0, 'VARIANT_CLASS': 0}
    # the ANN header description lists the sub-field names after a fixed VEP preamble
    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.lstrip('"')
    desc_string = desc_string.rstrip('"')
    desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
    # enumerate works on Python 2 and 3 alike (the original relied on xrange)
    for i, field in enumerate(desc_string.split('|')):
        if field in desired:
            desired[field] = i
    # context manager guarantees the report is flushed and closed even if a record fails
    with open(parts[0] + '.indels.vep.prioritized_impact.report.xls', 'w') as out:
        out.write('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact'
                  '\tbiotype\tcodon_change\tamino_acid_change\talt_cov\tnon_alt_cov\tvaf\n')
        for record in vcf_in.fetch():
            # NOTE(review): alt_ct is read from MINCOV and non_alt_ct from ALTCOV - confirm the naming is intended
            (chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf) = (record.contig, str(record.pos), record.ref,
                                                               record.alts[0], str(record.info['MINCOV']),
                                                               str(record.info['ALTCOV']),
                                                               str(record.info['COVRATIO']))
            # one list of sub-fields per comma-separated annotation
            ann_list = [_.split('|') for _ in record.info['ANN'].split(',')]
            output_highest_impact(chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf, ann_list, desired, out)
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def novosort_merge_pe(config_file, sample_list):
    """Merge each listed sample's BAMs with a SLURM novosort job, or symlink when only one BAM exists.

    For every sample the working directory is changed to
    <project_dir><project>/<align>/<sample>/BAMS/ before the job is submitted
    or the symlink is created.

    :param config_file: config handed to parse_config for tool paths and job settings
    :param sample_list: path to a text file with one sample name per line
    :return: 0 after all samples have been handled
    """
    # context manager closes the sample list even if a sample raises part-way through
    with open(sample_list, 'r') as fh:
        (novosort, java_tool, picard_tool, project, project_dir, align, threads, ram, novo_merge_rmdup_slurm) \
            = parse_config(config_file)
        for sample in fh:
            sample = sample.rstrip('\n')
            if not sample:
                # a blank line would otherwise be treated as an empty sample name
                continue
            loc = '../LOGS/' + sample + '.novosort_merge.log'
            job_loc = sample + '.novosort_merge.log'
            (bam_list, n) = list_bam(project, align, sample)
            bam_string = " ".join(bam_list)
            cur_dir = project_dir + project + '/' + align + '/' + sample + '/BAMS/'
            os.chdir(cur_dir)
            out_bam = sample + '.merged.transcriptome.bam'
            if n > 1:
                # settings are passed to the batch script through sbatch --export
                batch = 'sbatch -c ' + threads + ' --mem ' + ram + 'G -o ' + job_loc + ' --export=novosort="' \
                        + novosort + '",threads="' + threads + '",ram="' + ram + 'G",out_bam="' + out_bam \
                        + '",bam_string="' + bam_string + '",loc="' + loc + '"' + ' ' + novo_merge_rmdup_slurm
                log(loc, date_time() + 'Submitting merge bam job for sample ' + batch + "\n")
                subprocess.call(batch, shell=True)
            else:
                link_bam = 'ln -s ' + bam_list[0] + ' ' + sample + '.merged.transcriptome.bam;'
                log(loc, date_time() + 'Creating symlink for merged final bam since only one exists\n'
                    + link_bam + '\n')
                subprocess.call(link_bam, shell=True)
    sys.stderr.write(date_time() + 'Merged file request submitted and processed, check logs.\n')
    return 0
def cutadapter(sample, end1, end2, config_file):
    """Run cutadapt on a read pair, writing outputs under the inputs' base names in the current directory.

    The log goes to LOGS/ when that directory exists, otherwise to the
    current directory.  The cutadapt exit status is not inspected.

    :param sample: sample name, used for the log file name
    :param end1: path to the read 1 fastq
    :param end2: path to the read 2 fastq
    :param config_file: config handed to parse_config for the tool path and trimming settings
    :return: always 0
    """
    # designed to be run in a subdirectory: outputs keep the original file names
    src1, src2 = end1, end2
    out1 = os.path.basename(src1)
    out2 = os.path.basename(src2)
    loc = ('LOGS/' if os.path.isdir('LOGS') else './') + sample + '.cutadapt.log'
    (cutadapt_tool, threads, minlen, r1adapt, r2adapt, r1trim, r2trim, qual, mqual) = parse_config(config_file)
    # with four or more threads configured, cutadapt gets half of them
    cut_th = str(int(int(threads) / 2)) if int(threads) >= 4 else threads
    # adapter options are dropped entirely when neither adapter is configured
    if r1adapt == '' and r2adapt == '':
        adapter_opts = ''
    else:
        adapter_opts = ' -a ' + r1adapt + ' -A ' + r2adapt
    cutadapt_cmd = cutadapt_tool + ' -j ' + cut_th + ' -m ' + minlen + ' --quality-base=' + qual + ' -q ' + mqual \
        + adapter_opts + ' -u ' + r1trim + ' -U ' + r2trim + ' -o ' + out1 + ' -p ' + out2 + ' ' + src1 + ' ' \
        + src2 + ' >> ' + loc + ' 2>> ' + loc
    log(loc, date_time() + cutadapt_cmd + "\n")
    call(cutadapt_cmd, shell=True)
    return 0
def fastqc(fastqc_tool, sample, end1, end2, t):
    """Launch FastQC in the background and bail out if it has already failed after 20 seconds.

    The log goes to LOGS/ when that directory exists, otherwise to the
    current directory.  FastQC output is written to QC/.

    :param fastqc_tool: path to the fastqc executable
    :param sample: sample name, used for the log file name
    :param end1: read 1 fastq
    :param end2: read 2 fastq
    :param t: thread count as a string
    :return: 0 when FastQC is still running, or finished cleanly, at check time;
        the interpreter exits with status 1 otherwise
    """
    from time import sleep
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = 'LOGS/' if os.path.isdir('LOGS') else './'
    loc = log_dir + sample + '.fastqc.log'
    fastqc_cmd = fastqc_tool + ' --extract -t ' + t + ' -o QC/ ' + end1 + ' ' + end2
    log(loc, date_time() + fastqc_cmd + "\n")
    f = Popen(fastqc_cmd, shell=True, stdin=None, stdout=None, stderr=None, close_fds=True)
    # give the process 20 seconds - early failures (e.g. a phred score that does not fit)
    # should have surfaced by then
    sleep(20)
    status = f.poll()
    # None means still running; any non-zero exit code is a failure, not only 1
    if status is not None and status != 0:
        log(loc, date_time() + 'fastqc returned an error.  Check your inputs and try again!\n')
        exit(1)
    return 0
def organize_dirs(self):
    """Move this run's BAM, QC and LOGS output one directory level up and remove the emptied working dirs.

    Reads self.bam_dir, self.qc_dir, self.log_dir and self.loc; self.loc is
    re-pointed at the relocated log file.

    :return: 0 on completion, 1 if any step raised an ordinary exception
    """
    try:
        # create the destination directories one level up where they are missing
        for label, work_dir in (('BAM', self.bam_dir), ('QC', self.qc_dir), ('LOGS', self.log_dir)):
            if not os.path.isdir('../' + work_dir):
                mk_dir = 'mkdir ../' + work_dir
                log(self.loc, date_time() + 'Making ' + label + ' directory ' + mk_dir + '\n')
                call(mk_dir, shell=True)
        reloc_files = 'mv ' + self.bam_dir + '* ../' + self.bam_dir + '; mv ' + self.log_dir + '* ../' \
                      + self.log_dir + '; mv ' + self.qc_dir + '* ../' + self.qc_dir
        log(self.loc, date_time() + 'Relocating files ' + reloc_files + '\n')
        call(reloc_files, shell=True)
        # need to reassign log file location since it's being moved!
        self.loc = '../' + self.loc
        rm_old = 'rmdir ' + ' '.join((self.bam_dir, self.log_dir, self.qc_dir))
        log(self.loc, date_time() + 'Clearing out working dirs ' + rm_old + '\n')
        call(rm_old, shell=True)
        return 0
    except Exception:
        # ordinary failures are reported through the return code; KeyboardInterrupt and
        # SystemExit are deliberately no longer swallowed
        return 1
def align_stats(sample):
    """Convert an alignment summary plus insert-size metrics into a one-row tab-delimited table.

    Reads <sample>/align_summary.txt and <sample>_subset.insert_metrics.hist
    and writes <sample>.align.txt.  Values are picked by fixed line position
    and regular expression, so an unexpected layout raises (AttributeError
    from a failed match, StopIteration from a short file).

    :param sample: sample name used to build the input and output file names
    :return: 0 on success
    """
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = 'LOGS/' if os.path.isdir('LOGS') else './'
    loc = log_dir + sample + '.aln.log'
    log(loc, date_time() + "Converting to table summary format\n")

    def cells(pattern, line, groups=(1,)):
        # tab-prefixed text of the requested capture groups
        m = re.search(pattern, line)
        return ''.join('\t' + m.group(g) for g in groups)

    # context managers replace the original `fo.close` (missing call parentheses) and the
    # never-closed summary handle
    with open(sample + '/' + 'align_summary.txt', 'r') as fh, open(sample + '.align.txt', 'w') as fo:
        fo.write(
            'Sample\tMean insert size estimate(10k reads)\tStd dev read insert size estimate(10 k reads)'
            '\tStarting left reads\t% mapped\tmultimapped(mm)\tgt 20 mm\tStarting right reads\t% mapped\t% mm'
            '\tgt 20 mm\tOverall map rate\tAligned pairs\t% mm\t% discordant\t% condordant\n' + sample + '\t')
        with open(sample + '_subset.insert_metrics.hist') as fi:
            # values sit on the 8th line of the metrics file
            for _ in range(7):
                next(fi)
            stat = next(fi).split('\t')
        fo.write('\t'.join([str(int(float(stat[4]))), str(int(float(stat[5])))]))
        # two identically laid out sections (left reads, then right reads):
        # a heading line, the starting count, the mapped percentage, the multi-mapped line
        for _ in range(2):
            next(fh)
            fo.write(cells(r'(\d+)\n$', next(fh)))
            fo.write(cells(r'\(\s*(\S+) of input\)\n', next(fh)))
            fo.write(cells(r'\(\s*(\S+)\).*\((\d+) have >20\)\n', next(fh), groups=(1, 2)))
        # overall map rate
        fo.write(cells(r'\s*(^\S+)', next(fh)))
        next(fh)
        # aligned pairs, then multi-mapped, discordant and concordant rates
        fo.write(cells(r'(\d+)\n$', next(fh)))
        fo.write(cells(r'\(\s*(\S+)\) have', next(fh)))
        fo.write(cells(r'\(\s*(\S+)\) are', next(fh)))
        fo.write(cells(r'^\s*(\S+)', next(fh)) + '\n')
    return 0
def gen_report(vcf, ref_flag):
    """Write a prioritized-impact indel report for a VEP-annotated VCF.

    The sample name is the VCF basename up to its first '.'.  The report goes
    to '<sample>.indels.vep.prioritized_impact.report.xls' in the current
    directory; progress is logged to LOGS/<sample>.indels.vep_priority.report.log.

    :param vcf: path to a VCF whose header declares an ANN INFO field and whose
        records carry MINCOV, ALTCOV and COVRATIO INFO values
    :param ref_flag: 'n' to pass through unchanged; any other value is replaced by
        create_index(ref_flag) before being handed to output_highest_impact
    :return: 0 after every record has been passed to output_highest_impact
    """
    fn = os.path.basename(vcf)
    sample = fn.split('.')[0]
    loc = 'LOGS/' + sample + '.indels.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    vcf_in = VariantFile(vcf)
    out_fn = sample + '.indels.vep.prioritized_impact.report.xls'
    # annotation sub-fields to report; each value is replaced below by the field's
    # position within a '|'-delimited ANN entry
    desired = {
        'Consequence': 0,
        'IMPACT': 0,
        'SYMBOL': 0,
        'Feature': 0,
        'Protein_position': 0,
        'Amino_acids': 0,
        'Codons': 0,
        'Existing_variation': 0,
        'ExAC_MAF': 0,
        'BIOTYPE': 0,
        'VARIANT_CLASS': 0
    }
    # the ANN header description lists the sub-field names after a fixed VEP preamble
    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.lstrip('"')
    desc_string = desc_string.rstrip('"')
    desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
    for i, field in enumerate(desc_string.split('|')):
        if field in desired:
            desired[field] = i
    # context manager guarantees the report is flushed and closed even if a record fails
    with open(out_fn, 'w') as out:
        out.write(
            'chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact'
            '\tbiotype\tcodon_change\tamino_acid_change\talt_cov\tnon_alt_cov\tvaf\n'
        )
        if ref_flag != 'n':
            ref_flag = create_index(ref_flag)
        for record in vcf_in.fetch():
            # NOTE(review): alt_ct is read from MINCOV and non_alt_ct from ALTCOV - confirm the naming is intended
            (chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf) = (record.contig, str(record.pos), record.ref,
                                                               record.alts[0], str(record.info['MINCOV']),
                                                               str(record.info['ALTCOV']),
                                                               str(record.info['COVRATIO']))
            # each element of record.info['ANN'] is one '|'-delimited annotation
            ann_list = [_.split('|') for _ in record.info['ANN']]
            output_highest_impact(chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf, ann_list, desired, out, ref_flag)
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def picard_rmdup(java_tool, picard_tool, picard_tmp, sample, log_dir, ram):
    """Run Picard MarkDuplicates with REMOVE_DUPLICATES=true on <sample>.srt.bam.

    Output goes to <sample>.rmdup.srt.bam with metrics in
    <sample>.rmdup.srt.metrics; tool output is redirected to
    <log_dir><sample>.picard.rmdup.pe.log.  The exit status is not inspected
    and nothing is returned.

    :param java_tool: java executable
    :param picard_tool: path to the picard jar
    :param picard_tmp: directory passed as TMP_DIR
    :param sample: sample name / file prefix
    :param log_dir: log directory prefix (concatenated as-is, so include the trailing slash)
    :param ram: java heap size in GB, as a string
    """
    rmdup_log = log_dir + sample + ".picard.rmdup.pe.log"
    pieces = [
        java_tool, " -Xmx", ram, "g -jar ", picard_tool,
        " MarkDuplicates CREATE_INDEX=true TMP_DIR=", picard_tmp,
        " REMOVE_DUPLICATES=true ASSUME_SORTED=true MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500 INPUT=", sample,
        ".srt.bam OUTPUT=", sample,
        ".rmdup.srt.bam METRICS_FILE=", sample,
        ".rmdup.srt.metrics VALIDATION_STRINGENCY=LENIENT > ", rmdup_log, " 2>&1",
    ]
    picard_rmdup_cmd = "".join(pieces)
    log(rmdup_log, date_time() + picard_rmdup_cmd + "\n")
    call(picard_rmdup_cmd, shell=True)
def novosort_merge_pe(config_file, sample_list):
    """Merge each listed sample's BAMs through a SLURM job, or symlink when only one BAM exists.

    For every sample the working directory is changed to
    <project_dir><project>/<align>/<sample>/BAM/.  With more than one BAM a
    job is submitted: the novosort batch script when rmdup is 'Y', otherwise
    the novosort + picard batch script.  With a single BAM the BAM and its
    index are symlinked to the merged names.

    :param config_file: config handed to parse_config for tool paths and job settings
    :param sample_list: path to a text file with one sample name per line
    :return: 0 after all samples have been handled
    """
    # context manager closes the sample list even if a sample raises part-way through
    with open(sample_list, 'r') as fh:
        (novosort, java_tool, picard_tool, project, project_dir, align, threads, ram, rmdup,
         novo_merge_rmdup_slurm, novo_picard_merge_rmdup_slurm) = parse_config(config_file)
        for sample in fh:
            sample = sample.rstrip('\n')
            if not sample:
                # a blank line would otherwise be treated as an empty sample name
                continue
            loc = sample + '.novosort_merge.log'
            (bam_list, bai_list, n) = list_bam(project, align, sample)
            bam_string = " ".join(bam_list)
            cur_dir = project_dir + project + '/' + align + '/' + sample + '/BAM/'
            os.chdir(cur_dir)
            out_bam = sample + '.merged.final.bam'
            if n <= 1:
                link_bam = 'ln -s ' + bam_list[0] + ' ' + sample + '.merged.final.bam; ln -s ' + bai_list[0] \
                           + ' ' + sample + '.merged.final.bam.bai'
                log(loc, date_time() + 'Creating symlink for merged final bam since only one exists\n'
                    + link_bam + '\n')
                subprocess.call(link_bam, shell=True)
            elif rmdup == 'Y':
                job_loc = sample + '.novosort_merge.log'
                job_name = sample + '_novosort_merge'
                # settings are passed to the batch script through sbatch --export
                batch = 'sbatch -c ' + threads + ' -J ' + job_name + ' --mem ' + ram + 'G -o ' + job_loc \
                        + ' --export=novosort="' + novosort + '",threads="' + threads + '",ram="' + ram \
                        + 'G",out_bam="' + out_bam + '",bam_string="' + bam_string + '",loc="' + loc + '"' + ' ' \
                        + novo_merge_rmdup_slurm
                log(loc, date_time() + 'Submitting merge bam job for sample ' + batch + "\n")
                subprocess.call(batch, shell=True)
            else:
                # run legacy pipe for removing dups using picard
                picard_tmp = 'picard_tmp'
                job_loc = sample + '.novosort_merge.picard_rmdup.log'
                job_name = sample + '_novosort_merge.picard_rmdup'
                # setting max records in ram to half of ram
                recs = str(int((int(ram) / 2) * (1000000000 / 200)))
                in_bam = sample + '.merged.bam'
                in_bai = sample + '.merged.bam.bai'
                mets = sample + '.rmdup.srt.metrics'
                batch = 'sbatch -c ' + threads + ' --mem ' + ram + 'G -o ' + job_loc + ' -J ' + job_name \
                        + ' --export=novosort="' + novosort + '",threads="' + threads + '",ram="' + ram \
                        + 'G",in_bam="' + in_bam + '",bam_string="' + bam_string + '",loc="' + job_loc \
                        + '",java_tool="' + java_tool + '",picard_tool="' + picard_tool + '",tmp="' + picard_tmp \
                        + '",recs="' + recs + '",out_bam="' + out_bam + '",mets="' + mets + '",in_bai="' + in_bai \
                        + '" ' + novo_picard_merge_rmdup_slurm
                sys.stderr.write(date_time() + 'Merging with novosort and rmdup with picard for legacy reasons!\n'
                                 + batch + '\n')
                subprocess.call(batch, shell=True)
    sys.stderr.write(date_time() + 'Merged file request submitted and processed, check logs.\n')
    return 0
def bwa_mem_pe(bwa_tool, RGRP, bwa_ref, end1, end2, samtools_tool, samtools_ref, sample, log_dir, threads):
    """Align a read pair with bwa mem and pipe the result through samtools view into <sample>.bam.

    Tool output is redirected to <log_dir><sample>.bwa.pe.log.

    :param bwa_tool: bwa executable
    :param RGRP: read group string passed to bwa -R
    :param bwa_ref: bwa reference index prefix
    :param end1: read 1 fastq
    :param end2: read 2 fastq
    :param samtools_tool: samtools executable
    :param samtools_ref: reference passed to samtools view -T
    :param sample: sample name / output prefix
    :param log_dir: log directory prefix (concatenated as-is)
    :param threads: thread count as a string
    :return: 0 on success; the interpreter exits with status 1 when the command fails
    """
    loc = log_dir + sample + ".bwa.pe.log"
    bwa_cmd = "(" + bwa_tool + " mem -t " + threads + " -R \"" + RGRP + "\" -v 2 " + bwa_ref + " " + end1 + " " \
              + end2 + " | " + samtools_tool + " view -bT " + samtools_ref + " - > " + sample + ".bam) > " + loc \
              + " 2>&1"
    log(loc, date_time() + bwa_cmd + "\n")
    try:
        subprocess.check_output(bwa_cmd, shell=True)
    except Exception:
        # record why we are exiting; Ctrl-C and SystemExit are no longer intercepted here
        log(loc, date_time() + 'bwa mem alignment failed for sample ' + sample + '\n')
        exit(1)
    return 0
def bwt2_pe(bwt_tool, bwt_ref, end1, end2, samtools_tool, samtools_ref, sample, t, log_dir):
    """Align a read pair with bowtie2 and pipe the result through samtools view into <sample>.bam.

    Tool output is redirected to <log_dir><sample>.bwt.pe.log.

    :param bwt_tool: bowtie2 executable
    :param bwt_ref: bowtie2 index prefix
    :param end1: read 1 fastq
    :param end2: read 2 fastq
    :param samtools_tool: samtools executable
    :param samtools_ref: reference passed to samtools view -T
    :param sample: sample name / output prefix
    :param t: thread count as a string
    :param log_dir: log directory prefix (concatenated as-is)
    :return: 0 when the shell command exits with status 0, otherwise 1
    """
    loc = log_dir + sample + ".bwt.pe.log"
    bwt_cmd = "(" + bwt_tool + " --fr -p " + t + " -I 0 -X 500 -x " + bwt_ref + " -1 " + end1 + " -2 " + end2 \
              + " | " + samtools_tool + " view -bT " + samtools_ref + " - > " + sample + ".bam) > " + loc + " 2>&1"
    log(loc, date_time() + bwt_cmd + "\n")
    try:
        # call() reports failure through its return value, not by raising
        check = call(bwt_cmd, shell=True)
    except Exception:
        check = 1
    if check != 0:
        log(loc, date_time() + 'bowtie2 alignment failed for sample ' + sample + '\n')
        return 1
    return 0
def watch_mem(proc_obj, sample, loc):
    """Poll a running process every 30 seconds, logging system memory usage until it finishes.

    :param proc_obj: object exposing poll(), which returns None while the process is running
    :param sample: sample name included in the log message
    :param loc: log file the usage lines are written to
    :return: 1 as soon as system memory usage reaches 99%, otherwise the value of
        proc_obj.poll() once the process has finished
    """
    from time import sleep
    while True:
        if proc_obj.poll() is not None:
            break
        usage = psutil.virtual_memory().percent
        log(loc, date_time() + 'Current memory usage at ' + str(usage) + '% processing sample ' + sample + '\n')
        if usage >= 99:
            # the process itself is left running; only the caller is notified
            log(loc, date_time() + 'Memory exceeded while running VEP.')
            return 1
        sleep(30)
    return proc_obj.poll()
def parseFASTQC(FASTQC, loc):
    """Return the upper bound of the 'a-b' range found on the 9th line of a FastQC data file.

    :param FASTQC: path to the file to parse
    :param loc: log file that receives the failure message
    :return: the text after the first '-' on that line, with trailing tabs removed;
        the interpreter exits with status 1 if the file cannot be opened or parsed
    """
    try:
        # context manager closes the file on the failure path too
        with open(FASTQC, 'r') as fh:
            skip_lines(fh, 8)
            len_range = next(fh)
        info = len_range.rstrip('\n').split('-')
        return info[1].rstrip('\t')
    except Exception:
        # newline added so the next log entry starts on its own line
        log(loc, date_time() + 'Unable to open/process file ' + FASTQC + '\n')
        exit(1)
def wg_mode(scalpel, tumor_bam, normal_bam, fasta, cpus, pair, config_file):
    """Run scalpel somatic indel calling restricted to the exome bed named in the config.

    The bed file is read from config['refs']['exome'].  scalpel's stderr is
    redirected to LOGS/<pair>_<pair>.genome_as_exome.scalpel.log.

    :param scalpel: scalpel executable
    :param tumor_bam: tumor BAM path
    :param normal_bam: normal BAM path
    :param fasta: reference fasta
    :param cpus: process count as a string
    :param pair: pair identifier, echoed back in the result
    :param config_file: path to the JSON config file
    :return: tuple (status, pair) where status is 0 on success and 1 on a non-zero exit
    """
    # context manager closes the config handle, which the original left open
    with open(config_file, 'r') as cfg:
        config_data = json.load(cfg)
    exome = config_data['refs']['exome']
    # NOTE(review): the log name repeats `pair` twice - confirm this is intended
    loc = 'LOGS/' + pair + '_' + pair + '.genome_as_exome.scalpel.log'
    cmd = scalpel + ' --somatic --logs --numprocs ' + cpus + ' --tumor ' + tumor_bam + ' --normal ' \
        + normal_bam + ' --window 600 --two-pass --bed ' + exome + ' --ref ' + fasta + ' 2> ' + loc
    log(loc, date_time() + cmd + '\n')
    check = call(cmd, shell=True)
    return (0 if check == 0 else 1), pair
def picard_sort_pe(java_tool, picard_tool, picard_tmp, sample, log_dir):
    """Coordinate-sort <sample>.bam into <sample>.srt.bam with Picard SortSam.

    Tool output is redirected to <log_dir><sample>.picard.sort.pe.log.

    :param java_tool: java executable
    :param picard_tool: path to the picard jar
    :param picard_tmp: directory passed as TMP_DIR
    :param sample: sample name / file prefix
    :param log_dir: log directory prefix (concatenated as-is)
    :return: 0 on success; the interpreter exits with status 1 when the command fails
    """
    loc = log_dir + sample + ".picard.sort.pe.log"
    picard_sort_pe_cmd = java_tool + " -Xmx8g -jar " + picard_tool + " SortSam CREATE_INDEX=true TMP_DIR=" \
                         + picard_tmp + " INPUT=" + sample + ".bam OUTPUT=" + sample + ".srt.bam SORT_ORDER=" \
                         "coordinate VALIDATION_STRINGENCY=LENIENT > " + loc + " 2>&1"
    log(loc, date_time() + picard_sort_pe_cmd + "\n")
    try:
        subprocess.check_output(picard_sort_pe_cmd, shell=True)
    except Exception:
        # ordinary failures only - Ctrl-C and SystemExit are no longer intercepted here
        log(loc, 'Picard sort failed for sample ' + sample + '. Check for borg!\n')
        exit(1)
    return 0
def watch_mem(proc_obj, source, sample, loc):
    """Poll a running process every 30 seconds, logging system memory usage until it finishes.

    :param proc_obj: object exposing poll(), which returns None while the process is running
    :param source: source label included in the log message
    :param sample: sample name included in the log message
    :param loc: log file the usage lines are written to
    :return: 1 as soon as system memory usage reaches 99%, otherwise the value of
        proc_obj.poll() once the process has finished
    """
    from time import sleep
    while True:
        if proc_obj.poll() is not None:
            break
        usage = psutil.virtual_memory().percent
        log(loc, date_time() + 'Current memory usage at ' + str(usage) + '% processing sample ' + sample
            + ' from source ' + source + '\n')
        if usage >= 99:
            # the process itself is left running; only the caller is notified
            log(loc, date_time() + 'Memory exceeded while running VEP.')
            return 1
        sleep(30)
    return proc_obj.poll()
def parseINS(INS, loc):
    """Return four tab-separated columns (0, 1, 4 and 5) from the 8th line of a metrics file.

    :param INS: path to the file to parse
    :param loc: log file that receives the failure message
    :return: tuple of the four column values as strings; the interpreter exits
        with status 1 if the file cannot be opened or parsed
    """
    try:
        # context manager closes the file on the failure path too
        with open(INS, 'r') as fh:
            skip_lines(fh, 7)
            line = next(fh)
        stats = line.rstrip('\n').split('\t')
        return stats[0], stats[1], stats[4], stats[5]
    except Exception:
        log(loc, date_time() + 'Unable to open/process file ' + INS + '\n')
        exit(1)
def bwa_mem_pe(bwa_tool, RGRP, bwa_ref, end1, end2, samtools_tool, samtools_ref, sample, log_dir, threads):
    """Align a read pair with bwa mem and pipe the result through samtools view into <sample>.bam.

    Tool output is redirected to <log_dir><sample>.bwa.pe.log.

    :param bwa_tool: bwa executable
    :param RGRP: read group string passed to bwa -R
    :param bwa_ref: bwa reference index prefix
    :param end1: read 1 fastq
    :param end2: read 2 fastq
    :param samtools_tool: samtools executable
    :param samtools_ref: reference passed to samtools view -T
    :param sample: sample name / output prefix
    :param log_dir: log directory prefix (concatenated as-is)
    :param threads: thread count as a string
    :return: 0 on success; the interpreter exits with status 1 when the command fails
    """
    loc = log_dir + sample + ".bwa.pe.log"
    bwa_cmd = "(" + bwa_tool + " mem -t " + threads + " -R \"" + RGRP + "\" -v 2 " + bwa_ref + " " + end1 + " " \
              + end2 + " | " + samtools_tool + " view -bT " + samtools_ref + " - > " + sample + ".bam) > " + loc \
              + " 2>&1"
    log(loc, date_time() + bwa_cmd + "\n")
    try:
        # everything the command prints is redirected to files, so the captured output
        # is not decoded or used
        subprocess.check_output(bwa_cmd, shell=True)
    except Exception:
        # record why we are exiting; Ctrl-C and SystemExit are no longer intercepted here
        log(loc, date_time() + 'bwa mem alignment failed for sample ' + sample + '\n')
        exit(1)
    return 0
def fastqc(fastqc_tool, sample, end1, end2, t):
    """Run FastQC on a read pair, writing results to QC/, and wait for it to finish.

    The log goes to LOGS/ when that directory exists, otherwise to the
    current directory.  The FastQC exit status is not inspected.

    :param fastqc_tool: path to the fastqc executable
    :param sample: sample name, used for the log file name
    :param end1: read 1 fastq
    :param end2: read 2 fastq
    :param t: thread count as a string
    :return: always 0
    """
    # casual logging - prefer a LOGS directory, fall back to the current dir
    loc = ('LOGS/' if os.path.isdir('LOGS') else './') + sample + '.fastqc.log'
    fastqc_cmd = ''.join([fastqc_tool, ' -t ', t, ' -o QC/ ', end1, ' ', end2])
    log(loc, date_time() + fastqc_cmd + "\n")
    call(fastqc_cmd, shell=True)
    return 0
def picard_insert_size(java_tool, picard_tool, sample, log_dir, ram):
    """Run Picard CollectInsertSizeMetrics on <sample>.rmdup.srt.bam.

    Writes <sample>.insert_metrics.pdf and <sample>.insert_metrics.hist; tool
    output is appended to <log_dir><sample>.picard.insert_size.log.

    :param java_tool: java executable
    :param picard_tool: path to the picard jar
    :param sample: sample name / file prefix
    :param log_dir: log directory prefix (concatenated as-is)
    :param ram: java heap size in GB, as a string
    :return: 0 when the command exits with status 0, otherwise 1
    """
    loc = log_dir + sample + ".picard.insert_size.log"
    picard_insert_size_cmd = java_tool + " -Xmx" + ram + "g -jar " + picard_tool \
        + " CollectInsertSizeMetrics I=" + sample + ".rmdup.srt.bam H=" + sample + ".insert_metrics.pdf O=" \
        + sample + ".insert_metrics.hist >> " + loc + " 2>&1"
    log(loc, date_time() + picard_insert_size_cmd + "\n")
    try:
        # call() reports failure through its return value, not by raising
        check = call(picard_insert_size_cmd, shell=True)
    except Exception:
        check = 1
    if check != 0:
        log(loc, date_time() + 'Picard failed using java ' + java_tool + '\n')
        return 1
    return 0
def picard_insert_size(java_tool, picard_tool, sample, log_dir, ram):
    """Collect insert size metrics for <sample>.rmdup.srt.bam with Picard.

    Produces <sample>.insert_metrics.pdf and <sample>.insert_metrics.hist;
    tool output is appended to <log_dir><sample>.picard.insert_size.log.

    :param java_tool: java executable
    :param picard_tool: path to the picard jar
    :param sample: sample name / file prefix
    :param log_dir: log directory prefix (concatenated as-is)
    :param ram: java heap size in GB, as a string
    :return: 0 when the command exits with status 0, otherwise 1
    """
    loc = log_dir + sample + ".picard.insert_size.log"
    picard_insert_size_cmd = java_tool + " -Xmx" + ram + "g -jar " + picard_tool \
        + " CollectInsertSizeMetrics I=" + sample + ".rmdup.srt.bam H=" + sample + ".insert_metrics.pdf O=" \
        + sample + ".insert_metrics.hist >> " + loc + " 2>&1"
    log(loc, date_time() + picard_insert_size_cmd + "\n")
    try:
        # a non-zero exit does not raise, so the status has to be checked explicitly
        status = call(picard_insert_size_cmd, shell=True)
    except Exception:
        status = 1
    if status == 0:
        return 0
    log(loc, date_time() + 'Picard failed using java ' + java_tool + '\n')
    return 1
def parseSTAR(STAR, loc):
    """Pull selected values out of a STAR summary file by fixed line position.

    Lines are consumed strictly in order and each kept line is reduced to its
    value by processSTAR.  The final entry is the sum of three consecutive
    percentage lines, rounded to 2 places and re-suffixed with '%'.

    :param STAR: path to the file to parse
    :param loc: log file that receives the failure message
    :return: list of ten values in file order; the interpreter exits with
        status 1 if the file cannot be opened or parsed
    """
    try:
        # context manager closes the file on the failure path too
        with open(STAR, 'r') as fh:
            def value():
                # next line of the file, reduced to its value
                return processSTAR(next(fh))

            stats = []
            skip_lines(fh, 5)
            stats.append(value())  # num_rds
            skip_lines(fh, 3)
            stats.append(value())  # uniq
            next(fh)
            stats.append(value())  # sjt
            skip_lines(fh, 4)
            stats.append(value())  # nsj
            stats.append(value())  # mm
            stats.append(value())  # delrate
            next(fh)
            stats.append(value())  # ins
            skip_lines(fh, 3)
            stats.append(value())  # mml
            next(fh)
            stats.append(value())  # mmml
            next(fh)
            unmap = value()
            unmap2 = value()
            unmap3 = value()
        # the three unmapped percentages are reported as one combined figure
        unmap_tot = float(unmap.rstrip('%')) + float(unmap2.rstrip('%')) + float(unmap3.rstrip('%'))
        stats.append(str(round(unmap_tot, 2)) + '%')
        return stats
    except Exception:
        log(loc, date_time() + 'Unable to open/process file ' + STAR + '\n')
        exit(1)
def scalpel_indel(tumor_id, normal_id, log_dir, config_file):
    """Call somatic indels for one tumor/normal pair with scalpel and optionally dustmask-filter the result.

    BAMs are expected at <project_dir><project>/<align>/<id>/BAM/<id>.merged.final.bam.
    When the config's wg setting is 'n' scalpel runs against the configured
    bed file; otherwise the call is delegated to wg_mode.  Output under
    outdir/main is moved into the current directory afterwards.  Any failure
    terminates the interpreter with status 1.

    :param tumor_id: tumor sample id
    :param normal_id: normal sample id
    :param log_dir: log directory prefix (concatenated as-is)
    :param config_file: config handed to parse_config (and to wg_mode)
    :return: 0 on success
    """
    (scalpel, bedtools, bed, fasta, cpus, dustmask_flag, dustmask_bed, wg, project_dir, project, align) \
        = parse_config(config_file)
    sample_pair = tumor_id + '_' + normal_id
    loc = log_dir + sample_pair + '.scalpel.log'
    bam_dir = project_dir + project + '/' + align

    def merged_bam(sample_id):
        # location of a sample's merged final BAM
        return bam_dir + '/' + sample_id + '/BAM/' + sample_id + '.merged.final.bam'

    tumor_bam = merged_bam(tumor_id)
    normal_bam = merged_bam(normal_id)
    if wg != 'n':
        # whole-genome data: delegate to wg_mode, which returns (status, pair)
        wg_result = wg_mode(scalpel, tumor_bam, normal_bam, fasta, cpus, sample_pair, config_file)
        if wg_result[0] != 0:
            sys.stderr.write('Scalpel failed for ' + normal_id + ' at ' + tumor_id + '\n')
            exit(1)
    else:
        scalpel_cmd = scalpel + ' --somatic --logs --numprocs ' + cpus + ' --tumor ' + tumor_bam + ' --normal ' \
            + normal_bam + ' --bed ' + bed + ' --ref ' + fasta + ' 2>> ' + loc
        sys.stderr.write(date_time() + 'Starting indel calls for ' + sample_pair + '\n')
        log(loc, date_time() + 'Starting indel calls for ' + sample_pair + ' in capture mode with command:\n'
            + scalpel_cmd + '\n')
        if call(scalpel_cmd, shell=True) != 0:
            sys.stderr.write(date_time() + 'Indel calling failed for pair ' + sample_pair + ' with command:\n'
                             + scalpel_cmd + '\n')
            exit(1)
    log(loc, date_time() + 'Indel calling complete for pair ' + sample_pair + ' moving output files\n')
    mv_cmd = 'mv outdir/main/* .; rmdir outdir/main;'
    log(loc, date_time() + mv_cmd + '\n')
    call(mv_cmd, shell=True)
    sys.stderr.write(date_time() + 'Completed indel calls for ' + sample_pair + '\n')
    if dustmask_flag == 'Y':
        log(loc, date_time() + 'Filter dustmask flag given\n')
        if filter_indel(bedtools, dustmask_bed, sample_pair, loc) != 0:
            sys.stderr.write(date_time() + 'Dustmask failed for ' + sample_pair + '\n')
            exit(1)
        log(loc, date_time() + 'Dustmask complete for ' + sample_pair + '\n')
    sys.stderr.write(date_time() + 'Indel call completed\n')
    return 0
def parseCUTADAPT(CUTADAPT, loc):
    """Collect summary values from a cutadapt log by position relative to its 'Total read' line.

    The first five entries come from process_parens applied to the
    'Total read' line and the four lines after it.  They are followed by the
    total base-pair count, the two quality-trimmed figures expressed as a
    percentage of that total, and process_parens of the line after those.

    :param CUTADAPT: path to the file to parse
    :param loc: log file that receives the failure message
    :return: list of nine values; the interpreter exits with status 1 if the
        file cannot be opened or parsed
    """
    try:
        # context manager closes the file on the failure path too
        with open(CUTADAPT, 'r') as fh:
            stats = []
            # advance to the summary line; running off the end raises and is handled below
            cur = next(fh)
            while not re.search('Total read', cur):
                cur = next(fh)
            # total read pairs, r1 adapter pct, r2 adapter pct, too short, pairs passing
            stats.append(process_parens(cur))
            for _ in range(4):
                stats.append(process_parens(next(fh)))
            next(fh)
            # total bp
            tot_bp = int(next(fh).split()[-2].replace(',', ''))
            stats.append(str(tot_bp))
            for _ in range(3):
                next(fh)
            # quality-trimmed bases for r1 then r2, as a pct of total bp
            for _ in range(2):
                trimmed = float(next(fh).split()[-2].replace(',', ''))
                stats.append(str(round(trimmed / tot_bp * 100, 2)) + '%')
            # total written
            stats.append(process_parens(next(fh)))
        return stats
    except Exception:
        log(loc, date_time() + 'Unable to open/process file ' + CUTADAPT + '\n')
        exit(1)
def picard_sort_pe(java_tool, picard_tool, picard_tmp, sample, log_dir):
    """Coordinate-sort <sample>.bam into <sample>.srt.bam with Picard SortSam.

    Tool output is redirected to <log_dir><sample>.picard.sort.pe.log.

    :param java_tool: java executable
    :param picard_tool: path to the picard jar
    :param picard_tmp: directory passed as TMP_DIR
    :param sample: sample name / file prefix
    :param log_dir: log directory prefix (concatenated as-is)
    :return: 0 on success; the interpreter exits with status 1 when the command fails
    """
    loc = log_dir + sample + ".picard.sort.pe.log"
    picard_sort_pe_cmd = java_tool + " -Xmx8g -jar " + picard_tool + " SortSam CREATE_INDEX=true TMP_DIR=" \
                         + picard_tmp + " INPUT=" + sample + ".bam OUTPUT=" + sample + ".srt.bam SORT_ORDER=" \
                         "coordinate VALIDATION_STRINGENCY=LENIENT > " + loc + " 2>&1"
    log(loc, date_time() + picard_sort_pe_cmd + "\n")
    try:
        # everything the command prints is redirected to the log, so the captured output
        # is not decoded or used
        subprocess.check_output(picard_sort_pe_cmd, shell=True)
    except Exception:
        # ordinary failures only - Ctrl-C and SystemExit are no longer intercepted here
        log(loc, 'Picard sort failed for sample ' + sample + '. Check for borg!\n')
        exit(1)
    return 0
def platypus_germline(config_file, sample, log_dir, cflag):
    """Call germline variants on a sample's merged final BAM with Platypus.

    The BAM is expected at
    <project_dir><project>/<align>/<sample>/BAM/<sample>.merged.final.bam and
    calls are written to <sample>.germline_calls.vcf.  When cflag is not 'y'
    the BAM is indexed with samtools if no index is found, and the call is
    restricted to the configured regions with duplicate filtering off and a
    minimum variant frequency.

    :param config_file: config handed to parse_config together with cflag
    :param sample: sample name
    :param log_dir: log directory prefix (concatenated as-is)
    :param cflag: 'y' selects the plain invocation, anything else the region-restricted one
    :return: the Platypus exit status (0 on success), or 1 if the command could not be launched
    """
    loc = log_dir + sample + ".platypus.log"
    if cflag == 'y':
        (platypus, fasta, threads, project_dir, project, align) = parse_config(config_file, cflag)
        bam = project_dir + project + '/' + align + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
        platypus_cmd = "python2.7 " + platypus + " callVariants --nCPU=" + threads + " --refFile=" + fasta \
                       + " --bamFiles=" + bam + " -o " + sample + ".germline_calls.vcf --logFileName=" \
                       + loc + " >> " + loc + " 2>&1"
    else:
        (platypus, fasta, threads, region_file, minVAF, samtools, project_dir, project, align) \
            = parse_config(config_file, cflag)
        bam = project_dir + project + '/' + align + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
        # accept either <name>.bam.bai or <name>.bai as an existing index
        if not (os.path.isfile(bam + '.bai') or os.path.isfile(bam[:-1] + 'i')):
            log(loc, date_time() + bam + ' not indexed.  Indexing\n')
            cmd = samtools + ' index ' + bam
            log(loc, date_time() + cmd + '\n')
            subprocess.call(cmd, shell=True)
        platypus_cmd = "python2.7 " + platypus + " callVariants --nCPU=" + threads + " --refFile=" + fasta \
                       + " --bamFiles=" + bam + " --filterDuplicates=0 -o " + sample \
                       + ".germline_calls.vcf --minVarFreq=" + minVAF + " --regions=" + region_file \
                       + " --logFileName=" + loc + " >> " + loc + " 2>&1"
    log(loc, date_time() + platypus_cmd + "\n")
    try:
        status = subprocess.call(platypus_cmd, shell=True)
    except Exception:
        # the original returned its initial 0 here, reporting a launch failure as success
        status = 1
    if status != 0:
        log(loc, 'platypus germline variant calling failed for sample ' + sample + '\n')
    return status
def picard_insert_size(java_tool, picard_tool, sample, log_dir):
    """Run Picard CollectInsertSizeMetrics on <sample>.srt.bam and return two values from its metrics file.

    Tool output is redirected to <log_dir><sample>.picard.insert_size.log.
    The returned values are tab-separated columns 4 and 5 (zero-based) of the
    8th line of <sample>.insert_metrics.hist.

    :param java_tool: java executable
    :param picard_tool: path to the picard jar
    :param sample: sample name / file prefix
    :param log_dir: log directory prefix (concatenated as-is)
    :return: tuple of the two column values as strings
    """
    loc = log_dir + sample + ".picard.insert_size.log"
    picard_insert_size_cmd = java_tool + " -Xmx2g -jar " + picard_tool + " CollectInsertSizeMetrics I=" + sample \
        + ".srt.bam H=" + sample + ".insert_metrics.pdf O=" + sample + ".insert_metrics.hist > " + loc + " 2>&1"
    log(loc, date_time() + picard_insert_size_cmd + "\n")
    call(picard_insert_size_cmd, shell=True)
    # context manager closes the metrics file even when it is shorter than expected
    with open(sample + ".insert_metrics.hist", 'r') as fh:
        for _ in range(7):
            next(fh)
        stat = next(fh).split('\t')
    return stat[4], stat[5]
def picard_mark_dups(config_file, sample, log_dir, suffix):
    """Mark duplicates in <sample><suffix> with Picard, writing <sample>.dup_marked.bam.

    A scratch directory 'picard_tmp' is created for java's temp files and
    removed afterwards.  Tool output is redirected to
    <log_dir><basename of sample>.picard.mark_dup.log.

    :param config_file: config handed to parse_config for the java path, picard jar and heap size
    :param sample: input path prefix (may include directories)
    :param log_dir: log directory prefix (concatenated as-is)
    :param suffix: input file suffix appended to sample
    :return: 0 when Picard exits with status 0, otherwise 1
    """
    root = os.path.basename(sample)
    loc = log_dir + root + ".picard.mark_dup.log"
    (java_tool, picard_tool, mem) = parse_config(config_file)
    picard_tmp = 'picard_tmp'
    # Save Picard's exit status before cleaning up and hand it back with `exit`; without
    # this the shell reports the status of the trailing `rm -rf`, which hides Picard failures.
    picard_mark_dups_cmd = 'mkdir ' + picard_tmp + ';' + java_tool + " -Djava.io.tmpdir=" + picard_tmp + " -Xmx" \
                           + mem + "g -jar " + picard_tool + " MarkDuplicates I=" + sample + suffix + " O=" \
                           + sample + ".dup_marked.bam CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=" \
                           + sample + ".output.metrics > " + loc + " 2>&1; rc=$?; rm -rf " + picard_tmp \
                           + "; exit $rc"
    log(loc, date_time() + picard_mark_dups_cmd + "\n")
    check = call(picard_mark_dups_cmd, shell=True)
    return 0 if check == 0 else 1
def fastqc(fastqc_tool, sample, end1, end2, t):
    """Run FastQC with --extract on a read pair, writing results to QC/.

    The log goes to LOGS/ when that directory exists, otherwise to the
    current directory; FastQC's stderr is appended to the same file.

    :param fastqc_tool: path to the fastqc executable
    :param sample: sample name, used for the log file name
    :param end1: read 1 fastq
    :param end2: read 2 fastq
    :param t: thread count as a string
    :return: 0 on success; the interpreter exits with status 1 on a non-zero exit
    """
    # casual logging - prefer a LOGS directory, fall back to the current dir
    loc = ('LOGS/' if os.path.isdir('LOGS') else './') + sample + '.fastqc.log'
    fastqc_cmd = ''.join([fastqc_tool, ' --extract -t ', t, ' -o QC/ ', end1, ' ', end2, ' 2>> ', loc])
    log(loc, date_time() + fastqc_cmd + "\n")
    failed = call(fastqc_cmd, shell=True) != 0
    if failed:
        log(loc, date_time() + 'FastQC Failed for sample ' + sample + '\n')
        exit(1)
    return 0
def parsePICARD(PICARD, loc):
    """Map the tab-separated header on line 7 of a metrics file to the values on line 8.

    :param PICARD: path to the file to parse
    :param loc: log file that receives the failure message
    :return: dict of header name -> value (both strings); the interpreter exits
        with status 1 if the file cannot be opened or parsed, including when the
        value line has fewer columns than the header
    """
    try:
        # context manager closes the file on the failure path too
        with open(PICARD, 'r') as fh:
            skip_lines(fh, 6)
            keys = next(fh).rstrip('\n').split('\t')
            vals = next(fh).rstrip('\n').split('\t')
        qc_dict = {}
        # index into vals (rather than zip) so a short value line is still reported as an error
        for i, key in enumerate(keys):
            qc_dict[key] = vals[i]
        return qc_dict
    except Exception:
        log(loc, date_time() + 'Unable to open/process file ' + PICARD + '\n')
        exit(1)
def filter_wrap(mmu_filter, star_tool, genome_ref, end1, end2, sample, log_dir, threads, novosort, mem):
    """Align a read pair with STAR, name-sort with novosort, filter, and replace the input fastqs.

    The pipeline streams STAR's unsorted BAM through novosort (-n), keeps a
    copy in <sample>.mmu.nsrt.bam, and pipes it to the mmu_filter script.
    The script's stdout becomes the new read 1 fastq and its stderr the new
    read 2 fastq; both are then moved over end1 and end2.

    :param mmu_filter: path to the filter script run with python
    :param star_tool: STAR executable
    :param genome_ref: STAR genome directory
    :param end1: read 1 fastq (gzipped; replaced on success)
    :param end2: read 2 fastq (gzipped; replaced on success)
    :param sample: '_'-delimited sample name; fields 0 and 4 are used for the read group
    :param log_dir: log directory prefix (concatenated as-is)
    :param threads: total threads to divide between STAR and novosort
    :param novosort: novosort executable
    :param mem: total memory in GB; novosort gets mem - 40 when mem > 42, else 2
    :return: 0 on success; the interpreter exits with status 1 on failure
    """
    meta = sample.split('_')
    RGRP = "ID:" + sample + "\tLB:" + meta[0] + "\tPU:" + meta[4] + "\tSM:" + meta[0] + "\tPL:illumina"
    loc = log_dir + sample + ".mmu.star.pe.log"
    mk_srt_tmp = 'mkdir TMP'
    subprocess.call(mk_srt_tmp, shell=True)
    # split threads for star and novosort as well as memory
    threads = int(threads)
    if threads > 10:
        # even split, with STAR taking the extra thread when the count is odd
        sthreads = (threads + 1) // 2
        ncpu = threads // 2
    elif threads == 10:
        sthreads = 6
        ncpu = 4
    else:
        # novosort keeps 2 threads; never hand STAR a thread count below 1
        sthreads = max(1, threads - 2)
        ncpu = 2
    mem = int(mem)
    nmem = mem - 40 if mem > 42 else 2
    star_cmd = "(" + star_tool + " --runMode alignReads --outSAMattrRGline " + RGRP + " --outFileNamePrefix " \
               + sample + ".mmu_filt. --runThreadN " + str(sthreads) + " --genomeDir " + genome_ref \
               + " --readFilesIn " + end1 + " " + end2 \
               + " --readFilesCommand zcat --outSAMtype BAM Unsorted --outStd " \
               "BAM_Unsorted --outFilterType BySJout --outFilterMultimapNmax 20 --alignSJoverhangMin 8 " \
               "--alignSJDBoverhangMin 1 --outFilterMismatchNmax 0" + " --alignIntronMin 20 --alignIntronMax 1000000 " \
               "--alignMatesGapMax 1000000 --outSAMunmapped Within 2>> " + loc + " | " + novosort + " - -n -c " \
               + str(ncpu) + " -m " + str(nmem) + "G -t TMP 2>> " + loc + " | tee " + sample \
               + ".mmu.nsrt.bam | python " + mmu_filter + " -s " + sample + " -n 0 -t RNA | gzip -4 -c - > " \
               + sample + "_1.filtered.fq.gz;) 2>&1 | gzip -4 -c - > " + sample + "_2.filtered.fq.gz"
    log(loc, date_time() + star_cmd + '\n')
    try:
        subprocess.call(star_cmd, shell=True)
    except Exception:
        # ordinary failures only - Ctrl-C and SystemExit are no longer intercepted here
        log(loc, date_time() + 'Star alignment and filter against against mouse genome failed\n')
        exit(1)
    log(loc, date_time() + 'Filtering completed, replacing fastq file\n')
    rn_fq = 'mv ' + sample + '_1.filtered.fq.gz ' + end1 + '; mv ' + sample + '_2.filtered.fq.gz ' + end2 \
            + ';rm -rf TMP'
    check = subprocess.call(rn_fq, shell=True)
    if check != 0:
        log(loc, date_time() + 'File rename failed\n' + rn_fq + '\n')
        exit(1)
    return 0
def filter_wrap(mmu_filter, bwa_tool, RGRP, bwa_ref, end1, end2, samtools_tool, samtools_ref, sample, log_dir,
                threads):
    """Align a read pair with bwa mem, filter the alignments, and replace the input fastqs.

    The pipeline converts bwa's output to BAM with samtools view, keeps a
    copy in <sample>.mmu.bam, and pipes it to the mmu_filter script.  The
    script's stdout becomes the new read 1 fastq and its stderr the new
    read 2 fastq; both are then moved over end1 and end2.

    :param mmu_filter: path to the filter script run with python
    :param bwa_tool: bwa executable
    :param RGRP: read group string passed to bwa -R
    :param bwa_ref: bwa reference index prefix
    :param end1: read 1 fastq (replaced on success)
    :param end2: read 2 fastq (replaced on success)
    :param samtools_tool: samtools executable
    :param samtools_ref: reference passed to samtools view -T
    :param sample: sample name / output prefix
    :param log_dir: log directory prefix (concatenated as-is)
    :param threads: thread count as a string
    :return: 0 on success; the interpreter exits with status 1 on failure
    """
    loc = log_dir + sample + ".mmu.bwa.pe.log"
    bwa_cmd = "(" + bwa_tool + " mem -O 60 -L 0 -E 10 -t " + threads + " -R \"" + RGRP + "\" -v 2 " + bwa_ref \
              + " " + end1 + " " + end2 + " 2>> " + loc + " | " + samtools_tool + " view -bT " \
              + samtools_ref + " - 2>> " + loc + " | tee " + sample + ".mmu.bam | python " \
              + mmu_filter + " -s " + sample + " -n 0 -t DNA | gzip -4 -c - > " \
              + sample + "_1.filtered.fq.gz;) 2>&1 | gzip -4 -c - > " + sample + "_2.filtered.fq.gz"
    log(loc, date_time() + bwa_cmd + "\n")
    try:
        # all output is redirected into files, so the captured stdout is not used
        subprocess.check_output(bwa_cmd, shell=True)
    except Exception:
        # ordinary failures only - Ctrl-C and SystemExit are no longer intercepted here
        sys.stderr.write('Filtering failed\n.')
        exit(1)
    log(loc, date_time() + 'Filtering completed, replacing fastq file\n')
    rn_fq = 'mv ' + sample + '_1.filtered.fq.gz ' + end1 + '; mv ' + sample + '_2.filtered.fq.gz ' + end2
    # a failed rename would leave the unfiltered reads in place, so treat it as fatal
    if subprocess.call(rn_fq, shell=True) != 0:
        log(loc, date_time() + 'File rename failed\n' + rn_fq + '\n')
        exit(1)
    return 0
def annot_vcf_vep_pipe(config_file, sample_pairs, ref_mnt, in_suffix, out_suffix, source):
    """Annotate each listed sample's VCF with VEP and generate the matching report.

    For every line of sample_pairs the first tab-separated field is the
    sample name; <sample><in_suffix> is annotated into <sample><out_suffix>.
    For source 'scalpel' the input is first run through pass_filter and
    <sample>.somatic_indel.PASS.vcf is annotated instead.  Source 'mutect'
    gets an SNV report, every other source an indel report.  Any failure
    terminates the interpreter with status 1.

    :param config_file: config handed to parse_config for tool paths and settings
    :param sample_pairs: path to a tab-delimited file, sample name in the first column
    :param ref_mnt: directory prefix prepended to the fasta, VEP cache and interval paths
    :param in_suffix: suffix of the input VCF
    :param out_suffix: suffix of the annotated VCF
    :param source: variant caller the VCF came from ('scalpel', 'mutect', ...)
    :return: 0 on success
    """
    (vep_tool, vep_cache, fasta, report, dbsnp, vcache, threads, intvl, dustmask_flag) = parse_config(config_file)
    fasta = ref_mnt + '/' + fasta
    vep_cache = ref_mnt + '/' + vep_cache
    intvl = ref_mnt + '/' + intvl
    # scale back on the forking a bit; floor division keeps --fork an integer on Python 3 as well
    if int(threads) > 2:
        threads = str(int(threads) // 2 - 1)
    # context manager closes the sample file, including when a failure exits early
    with open(sample_pairs, 'r') as samp_fh:
        for line in samp_fh:
            info = line.rstrip('\n').split('\t')
            sample = info[0]
            mk_log_dir = 'mkdir LOGS'
            subprocess.call(mk_log_dir, shell=True)
            loc = 'LOGS/' + sample + '.vep_anno.log'
            in_vcf = sample + in_suffix
            out_vcf = sample + out_suffix
            if source == 'scalpel':
                pass_filter(sample, in_suffix, dustmask_flag)
                in_vcf = sample + '.somatic_indel.PASS.vcf'
            run_vep = 'perl ' + vep_tool + ' --cache -i ' + in_vcf + ' --vcf -o ' + out_vcf \
                      + ' --symbol --vcf_info_field' \
                      ' ANN --canonical --variant_class --no_whole_genome --offline --maf_exac --no_whole_genome ' \
                      '--fork ' + threads + ' --fasta ' + fasta + ' --dir_cache ' + vep_cache \
                      + ' --cache_version ' + vcache + ' 2>> ' + loc + ' >> ' + loc
            log(loc, date_time() + 'Annotating sample ' + sample + in_suffix + '\n')
            check = subprocess.call(run_vep, shell=True)
            if check != 0:
                log(loc, date_time() + 'VEP annotation for ' + sample + in_suffix + ' failed\n')
                exit(1)
            log(loc, date_time() + 'VEP annotation ' + sample + in_suffix + ' successful!\n')
            if source == 'mutect':
                check = gen_snv_report(out_vcf, sample + '.out.keep', intvl)
            else:
                check = gen_indel_report(out_vcf)
            if check != 0:
                log(loc, date_time() + 'Report generation for ' + out_vcf + ' failed\n')
                exit(1)
    return 0
def scalpel_indel(pairs, log_dir, config_file, ref_mnt):
    """Run scalpel somatic indel calling for every tumor/normal pair listed in the pairs file.

    Each line of pairs is tab-separated: pair name, tumor sample, normal sample.  The bams are expected
    as '<sample>.merged.final.bam'.  Scalpel output is moved from outdir/main into a directory named
    after the pair, and when the configured dustmask flag is 'Y' the calls are passed to filter_indel.

    :param pairs: path of the tab-separated pair file
    :param log_dir: directory prefix (string-concatenated) for the per-pair log
    :param config_file: configuration handed to parse_config and get_merged_bams
    :param ref_mnt: directory prefixed to the bed, fasta and dustmask paths from the config
    :return: 0 on success; the process exits with status 1 when scalpel or the dustmask filter fails
    """
    (scalpel, bedtools, bed, fasta, cpus, dustmask_flag, dustmask_bed) = parse_config(config_file)
    bed = ref_mnt + '/' + bed
    fasta = ref_mnt + '/' + fasta
    dustmask_bed = ref_mnt + '/' + dustmask_bed
    # use get_merged_bams api
    sample_list = 'sample_list.txt'
    if not os.path.isfile(sample_list):
        create_sample_list(pairs)
        sys.stderr.write(date_time() + 'Sample pairs list not created - creating one since this is being run likely '
                                       'outside of pipeline\n')
    # NOTE(review): the flattened original does not show whether this call sat inside the branch above;
    # it is run unconditionally here - confirm.
    get_merged_bams(config_file, sample_list)
    with open(pairs, 'r') as fh:
        for line in fh:
            cur = line.rstrip('\n').split('\t')
            pair = cur[0]
            loc = log_dir + pair + '.scalpel.log'
            tumor_bam = cur[1] + '.merged.final.bam'
            normal_bam = cur[2] + '.merged.final.bam'
            scalpel_cmd = scalpel + ' --somatic --logs --numprocs ' + cpus + ' --tumor ' + tumor_bam + ' --normal ' \
                          + normal_bam + ' --bed ' + bed + ' --ref ' + fasta + ' 2>> ' + loc
            sys.stderr.write(date_time() + 'Starting indel calls for ' + pair + '\n')
            log(loc, date_time() + 'Starting indel calls for ' + pair + ' with command:\n' + scalpel_cmd + '\n')
            check = call(scalpel_cmd, shell=True)
            if check != 0:
                failure = 'Indel calling failed for pair ' + pair + ' with command:\n' + scalpel_cmd + '\n'
                sys.stderr.write(date_time() + failure)
                log(loc, date_time() + failure)
                # stop here instead of reporting completion and moving partial output
                sys.exit(1)
            log(loc, date_time() + 'Indel calling complete for pair ' + pair + ' moving output files\n')
            mv_cmd = 'mkdir ' + pair + '; mv outdir/main/* ' + pair + '; rm -rf outdir/main;'
            log(loc, date_time() + mv_cmd + '\n')
            call(mv_cmd, shell=True)
            sys.stderr.write(date_time() + 'Completed indel calls for ' + pair + '\n')
            if dustmask_flag == 'Y':
                log(loc, date_time() + 'Filter dustmask flag given\n')
                check = filter_indel(bedtools, dustmask_bed, pair)
                if check != 0:
                    sys.stderr.write(date_time() + 'Dustmask failed for ' + pair + '\n')
                    sys.exit(1)
                log(loc, date_time() + 'Dustmask complete for ' + pair + '\n')
    sys.stderr.write(date_time() + 'Indel call completed\n')
    return 0
def express_quant(sample, config_file, x, s):
    """Run eXpress on 'BAMS/<sample>.merged.transcriptome.bam' and rename its output files.

    :param sample: sample name, used for the bam, log and output file names
    :param config_file: configuration handed to parse_config
    :param x: value passed to the express ``-m`` option (string)
    :param s: value passed to the express ``-s`` option (string)
    :return: sum of the exit statuses of the express command and of the rename command
    """
    log_name = sample + '.express.log'
    # log into LOGS/ when that directory exists, else next to the output
    loc = 'LOGS/' + log_name if os.path.isdir('LOGS') else log_name
    (stranded, strand, express, transcriptome) = parse_config(config_file)
    bam = 'BAMS/' + sample + '.merged.transcriptome.bam'
    # the strand option is only added for stranded libraries
    strand_opt = '' if stranded == 'N' else ' --' + strand
    express_cmd = ''.join([express, ' ', transcriptome, ' ', bam, ' --no-update-check', strand_opt,
                           ' -m ', x, ' -s ', s, ' --logtostderr 2>> ', loc])
    log(loc, date_time() + express_cmd + '\n')
    status = subprocess.call(express_cmd, shell=True)
    rename_cmd = 'mv results.xprs ' + sample + '.express_quantification.txt; mv params.xprs ' \
                 + sample + '.params.xprs'
    status += subprocess.call(rename_cmd, shell=True)
    log(loc, date_time() + 'Completed qc. Renaming files\n')
    return status
def qc_bam(sample, config_file):
    """Run Picard CollectRnaSeqMetrics on '<sample>.Aligned.sortedByCoord.out.bam'.

    The metrics go to '<sample>.picard_RNAseq_qc.txt' and the chart to '<sample>.pos_v_cov.pdf'.
    The exit status of Picard is not inspected.

    :param sample: sample name, used for the bam, log and output file names
    :param config_file: configuration handed to parse_config
    :return: always 0
    """
    log_name = sample + '.bam_qc.log'
    loc = 'LOGS/' + log_name if os.path.isdir('LOGS') else log_name
    (java, ram, picard, refFlat, intervals, strand, threads) = parse_config(config_file)
    # recalc ram to be a bit lower
    ram = str(int(round(int(ram) * 0.75)))
    # configured strand value -> Picard STRAND value
    st_dict = {'N': 'NONE',
               'fr-stranded': 'FIRST_READ_TRANSCRIPTION_STRAND',
               'rf-stranded': 'SECOND_READ_TRANSCRIPTION_STRAND'}
    picard_cmd = ' '.join([
        java,
        '-Xmx' + ram + 'g',
        '-XX:+UseConcMarkSweepGC',
        '-XX:ParallelGCThreads=' + threads,
        '-XX:MaxGCPauseMillis=10000',
        '-jar', picard,
        'CollectRnaSeqMetrics',
        'REF_FLAT=' + refFlat,
        'STRAND=' + st_dict[strand],
        'CHART=' + sample + '.pos_v_cov.pdf',
        'I=' + sample + '.Aligned.sortedByCoord.out.bam',
        'O=' + sample + '.picard_RNAseq_qc.txt',
        'RIBOSOMAL_INTERVALS=' + intervals,
        'VALIDATION_STRINGENCY=SILENT',
        '2>>', loc,
        '>>', loc,
    ])
    log(loc, date_time() + picard_cmd + '\n')
    subprocess.call(picard_cmd, shell=True)
    return 0
def novosort_sort_pe(novosort, sample, log_dir, t, mem, stype):
    """Sort '<sample>.bam' with novosort, by name when stype == 'name', otherwise with an index.

    Name sorting writes '<sample>.nsrt.bam' (novosort ``-n``); any other stype writes
    '<sample>.srt.bam' with ``--index``.  A 'novosort_tmp' directory is created for the run and
    removed afterwards.

    :param novosort: novosort executable
    :param sample: bam path without the '.bam' extension; its basename names the log file
    :param log_dir: directory prefix (string-concatenated) for the log file
    :param t: thread count, as a string
    :param mem: RAM in GB, as a string
    :param stype: 'name' for a name sort, anything else for the indexed sort
    :return: exit status of the novosort shell command; the process exits with status 1 when the
             command cannot be launched
    """
    samp_root = os.path.basename(sample)
    temp = 'novosort_tmp'
    # single log path for the command output, the command echo and the failure message
    log_path = log_dir + samp_root + ".novosort.sort.pe.log"
    if stype == 'name':
        out_opts = " --output " + sample + ".nsrt.bam -n "
    else:
        out_opts = " --output " + sample + ".srt.bam --index "
    novosort_sort_pe_cmd = 'mkdir ' + temp + ';' + novosort + " --threads " + t + " --ram " + mem \
                           + "G --tmpdir " + temp + out_opts + sample + ".bam > " + log_path + " 2>&1"
    log(log_path, date_time() + novosort_sort_pe_cmd + "\n")
    try:
        f = subprocess.call(novosort_sort_pe_cmd, shell=True)
        subprocess.call('rm -rf ' + temp, shell=True)
    except (OSError, ValueError):
        # previously logged to log_dir + sample, which is a different path when sample has a directory part
        log(log_path, 'novosort sort failed for sample ' + sample + '\n')
        exit(1)
    return f
def filter_wrap(mmu_filter, star_tool, genome_ref, end1, end2, sample, log_dir, threads, novosort, mem):
    """Align a read pair with STAR, name-sort with novosort, run mmu_filter and replace the fastqs.

    One shell pipeline is built and run: STAR (unsorted BAM on stdout) -> novosort -n ->
    tee <sample>.mmu.nsrt.bam -> python <mmu_filter>.  The filter's stdout is gzipped into
    <sample>_1.filtered.fq.gz and the stderr of the group (via 2>&1) into <sample>_2.filtered.fq.gz,
    which are then moved over end1 and end2.

    :param mmu_filter: path of the filter script handed to ``python``
    :param star_tool: STAR executable
    :param genome_ref: STAR genome directory
    :param end1: first fastq; overwritten with the filtered reads
    :param end2: second fastq; overwritten with the filtered reads
    :param sample: sample name; split on '_' and fields 0 and 4 are used in the read group, so at
                   least five fields are required
    :param log_dir: directory prefix (string-concatenated) for the log file
    :param threads: total threads to divide between STAR and novosort (int or numeric string)
    :param novosort: novosort executable
    :param mem: total memory in GB (int or numeric string)
    :return: 0 on success; the process exits with status 1 on failure
    """
    meta = sample.split('_')
    RGRP = "ID:" + sample + "\tLB:" + meta[0] + "\tPU:" + meta[4] + "\tSM:" + meta[0] + "\tPL:illumina"
    loc = log_dir + sample + ".mmu.star.pe.log"
    subprocess.call('mkdir TMP', shell=True)
    # split threads for star and novosort as well as memory
    threads = int(threads)
    ncpu = 2
    if threads > 10:
        # even split, STAR gets the extra thread when the total is odd
        ncpu = threads // 2
        sthreads = threads - ncpu
    elif threads == 10:
        sthreads = 6
        ncpu = 4
    else:
        # never hand STAR fewer than one thread (threads <= 2 used to yield 0 or a negative count)
        sthreads = max(1, threads - 2)
    mem = int(mem)
    nmem = mem - 40 if mem > 42 else 2
    star_cmd = ("(" + star_tool + " --runMode alignReads --outSAMattrRGline " + RGRP + " --outFileNamePrefix "
                + sample + ".mmu_filt. --runThreadN " + str(sthreads) + " --genomeDir " + genome_ref
                + " --readFilesIn " + end1 + " " + end2
                + " --readFilesCommand zcat --outSAMtype BAM Unsorted --outStd "
                  "BAM_Unsorted --outFilterType BySJout --outFilterMultimapNmax 20 --alignSJoverhangMin 8 "
                  "--alignSJDBoverhangMin 1 --outFilterMismatchNmax 0" + " --alignIntronMin 20 --alignIntronMax 1000000 "
                  "--alignMatesGapMax 1000000 --outSAMunmapped Within 2>> " + loc + " | " + novosort + " - -n -c "
                + str(ncpu) + " -m " + str(nmem) + "G -t TMP 2>> " + loc + " | tee " + sample
                + ".mmu.nsrt.bam | python " + mmu_filter + " -s " + sample + " -n 0 -t RNA | gzip -4 -c - > "
                + sample + "_1.filtered.fq.gz;) 2>&1 | gzip -4 -c - > " + sample + "_2.filtered.fq.gz")
    log(loc, date_time() + star_cmd + '\n')
    try:
        subprocess.call(star_cmd, shell=True)
    except (OSError, ValueError):
        log(loc, date_time() + 'Star alignment and filter against against mouse genome failed\n')
        exit(1)
    log(loc, date_time() + 'Filtering completed, replacing fastq file\n')
    rn_fq = 'mv ' + sample + '_1.filtered.fq.gz ' + end1 + '; mv ' + sample + '_2.filtered.fq.gz ' + end2 \
            + ';rm -rf TMP'
    check = subprocess.call(rn_fq, shell=True)
    if check != 0:
        log(loc, date_time() + 'File rename failed\n' + rn_fq + '\n')
        exit(1)
    return 0
def star(STAR, genome, end1, end2, sample, log_dir, th, sf):
    """Run a two-pass STAR alignment for one read pair and move STAR's Log files into log_dir.

    :param STAR: STAR executable
    :param genome: STAR genome directory
    :param end1: first fastq (read through zcat)
    :param end2: second fastq (read through zcat)
    :param sample: sample name; used as output prefix and split on '_' for the read group, so at
                   least five fields are required
    :param log_dir: directory prefix (string-concatenated) for the log and destination of '*Log*' files
    :param th: thread count, as a string
    :param sf: strandedness flag; 'N' adds the XS attribute to the output
    :return: 0 when STAR exits with status 0, otherwise 1
    """
    loc = log_dir + sample + ".star.log"
    meta = sample.split('_')
    RGRP = "ID:" + sample + "\tLB:" + meta[0] + "\tPU:" + meta[4] + "\tSM:" + meta[0] + "\tPL:illumina"
    # adjacent literals instead of backslash continuation inside a string, so no source-file
    # whitespace ends up in the command
    star_cmd = (STAR + " --runMode alignReads --twopassMode Basic --outFileNamePrefix " + sample
                + ". --runThreadN " + th + " --genomeDir " + genome + " --readFilesIn " + end1 + " " + end2
                + " --readFilesCommand zcat --quantMode TranscriptomeSAM GeneCounts"
                  " --outSAMtype BAM SortedByCoordinate --outFilterType BySJout"
                  " --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1"
                  " --outFilterMismatchNmax 8 --alignIntronMin 20 --alignIntronMax 1000000"
                  " --alignMatesGapMax 1000000 --quantTranscriptomeBan Singleend --outSAMattrRGline " + RGRP)
    if sf == 'N':
        # add XS tag is input is not stranded
        star_cmd += ' --outSAMattributes NH HI AS nM XS'
    star_cmd += ' 2>> ' + loc + ' >> ' + loc
    mv_cmd = 'mv *Log* ' + log_dir
    log(loc, date_time() + star_cmd + '; ' + mv_cmd + "\n")
    # run the two commands separately: chained with ';' the shell returned the status of mv,
    # which hid a failed alignment
    check = call(star_cmd, shell=True)
    call(mv_cmd, shell=True)
    return 0 if check == 0 else 1
def star(STAR, genome, end1, end2, sample, log_dir, th, sf):
    """Run a two-pass STAR alignment for one read pair and move STAR's Log files into log_dir.

    :param STAR: STAR executable
    :param genome: STAR genome directory
    :param end1: first fastq (read through zcat)
    :param end2: second fastq (read through zcat)
    :param sample: sample name; used as output prefix and split on '_' for the read group, so at
                   least five fields are required
    :param log_dir: directory prefix (string-concatenated) for the log and destination of '*Log*' files
    :param th: thread count, as a string
    :param sf: strandedness flag; 'N' adds the XS attribute to the output
    :return: 0 when STAR exits with status 0, otherwise 1
    """
    loc = log_dir + sample + ".star.log"
    meta = sample.split('_')
    RGRP = "ID:" + sample + "\tLB:" + meta[0] + "\tPU:" + meta[4] + "\tSM:" + meta[0] + "\tPL:illumina"
    # adjacent literals instead of backslash continuation inside a string, so no source-file
    # whitespace ends up in the command
    star_cmd = (STAR + " --runMode alignReads --twopassMode Basic --outFileNamePrefix " + sample
                + ". --runThreadN " + th + " --genomeDir " + genome + " --readFilesIn " + end1 + " " + end2
                + " --readFilesCommand zcat --quantMode TranscriptomeSAM GeneCounts"
                  " --outSAMtype BAM SortedByCoordinate --outFilterType BySJout"
                  " --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --alignSJDBoverhangMin 1"
                  " --outFilterMismatchNmax 8 --alignIntronMin 20 --alignIntronMax 1000000"
                  " --alignMatesGapMax 1000000 --quantTranscriptomeBan Singleend --outSAMattrRGline " + RGRP)
    if sf == 'N':
        # add XS tag is input is not stranded
        star_cmd += ' --outSAMattributes NH HI AS nM XS'
    star_cmd += ' 2>> ' + loc + ' >> ' + loc
    mv_cmd = 'mv *Log* ' + log_dir
    log(loc, date_time() + star_cmd + '; ' + mv_cmd + "\n")
    # run the two commands separately: chained with ';' the shell returned the status of mv,
    # which hid a failed alignment
    check = call(star_cmd, shell=True)
    call(mv_cmd, shell=True)
    return 0 if check == 0 else 1
def gen_report(vcf, out, c, ref_flag):
    """Write the prioritized-impact substitution report for a VEP-annotated VCF.

    The report is written to '<first dot-separated part of the vcf basename>'
    + '.subsitutions.vep.prioritized_impact.report.xls'.  The positions of the wanted fields inside
    each ANN entry are read from the ANN header description; every record is then handed to
    output_highest_impact.  When a target file is given, records marked 'OFF' are skipped.

    :param vcf: path of the annotated VCF
    :param out: mutect output file handed to create_mutect_ind
    :param c: target file handed to create_target, or 'n' for none
    :param ref_flag: file handed to create_index, or 'n' for none
    :return: 0
    """
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.subsitutions.vep.priority_report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    mut_dict = create_mutect_ind(out)
    log(loc, date_time() + 'Created index for added mutect info\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')
    vcf_in = VariantFile(vcf)
    # context manager so the report is flushed and closed even when a record fails to process
    with open(parts[0] + '.subsitutions.vep.prioritized_impact.report.xls', 'w') as report_fh:
        # field name -> index within a '|'-separated ANN entry; stays 0 when the header lacks the field
        desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0,
                   'Amino_acids': 0, 'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0}
        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.lstrip('"')
        desc_string = desc_string.rstrip('"')
        desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                desired[field] = i
        report_fh.write('chr\tpos\tcontext\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\t'
                        'tumor_ref_count\t'
                        'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tgnomAD_AF\tgene\ttx_id\teffect\t'
                        'impact\tbiotype\t'
                        'codon_change\tamino_acid_change\ton/off-target\n')
        if ref_flag != 'n':
            ref_flag = create_index(ref_flag)
        for record in vcf_in.fetch():
            (chrom, pos, ref, alt) = record.contig, str(record.pos), record.ref, record.alts[0]
            ann_list = [_.split('|') for _ in record.info['ANN']]
            tflag = 'NA'
            if c != 'n':
                tflag = mark_target(chrom, pos, on_dict)
                # only outputting ON TARGET hits
                if tflag == 'OFF':
                    continue
            output_highest_impact(chrom, pos, ref, alt, ann_list, mut_dict, desired, tflag, report_fh, ref_flag)
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def platypus_germline(config_file, sample, log_dir, cflag):
    """Call germline variants with Platypus on '<sample>.merged.final.bam'.

    With cflag == 'y' Platypus runs with its defaults; otherwise duplicate filtering is disabled, a
    minimum variant frequency and a region file from the config are applied, and the bam is indexed
    with samtools first when no '.bam.bai' or '.bai' index file exists.  Calls are written to
    '<sample>.germline_calls.vcf'.

    :param config_file: configuration handed to parse_config
    :param sample: sample name
    :param log_dir: directory prefix (string-concatenated) for the log file
    :param cflag: 'y' selects the short config layout and the default Platypus settings
    :return: exit status of the Platypus command, or 1 when the command could not be launched
    """
    loc = log_dir + sample + ".platypus.log"
    if cflag == 'y':
        (platypus, fasta, threads, project_dir, project, align) = parse_config(config_file, cflag)
        bam = project_dir + project + '/' + align + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
        platypus_cmd = "python2.7 " + platypus + " callVariants --nCPU=" + threads + " --refFile=" + fasta \
                       + " --bamFiles=" + bam + " -o " + sample + ".germline_calls.vcf --logFileName=" \
                       + loc + " >> " + loc + " 2>&1"
    else:
        (platypus, fasta, threads, region_file, minVAF, samtools, project_dir, project, align) \
            = parse_config(config_file, cflag)
        bam = project_dir + project + '/' + align + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
        # accept either '<name>.bam.bai' or '<name>.bai' as an existing index
        if not (os.path.isfile(bam + '.bai') or os.path.isfile(bam[:-1] + 'i')):
            log(loc, date_time() + bam + ' not indexed.  Indexing\n')
            cmd = samtools + ' index ' + bam
            log(loc, date_time() + cmd + '\n')
            subprocess.call(cmd, shell=True)
        platypus_cmd = "python2.7 " + platypus + " callVariants --nCPU=" + threads + " --refFile=" + fasta \
                       + " --bamFiles=" + bam + " --filterDuplicates=0 -o " + sample \
                       + ".germline_calls.vcf --minVarFreq=" + minVAF + " --regions=" + region_file \
                       + " --logFileName=" + loc + " >> " + loc + " 2>&1"
    log(loc, date_time() + platypus_cmd + "\n")
    try:
        # hand the real exit status back; it used to be dropped in favour of a constant 0
        return subprocess.call(platypus_cmd, shell=True)
    except (OSError, ValueError):
        log(loc, 'platypus germline variant calling failed for sample ' + sample + '\n')
        return 1
def novosort_sort_pe(novosort, sample, log_dir, threads, ram, rmdup):
    """Coordinate-sort and index '<sample>.bam' with novosort, optionally removing duplicates.

    With rmdup == 'Y' novosort is run with ``--rd --kt`` and writes '<sample>.rmdup.srt.bam';
    otherwise it writes '<sample>.srt.bam'.  A 'novosort_tmp' directory is created for the run and
    removed afterwards.

    :param novosort: novosort executable
    :param sample: bam path without the '.bam' extension; also used in the log file name
    :param log_dir: directory prefix (string-concatenated) for the log file
    :param threads: thread count, as a string
    :param ram: RAM in GB, as a string
    :param rmdup: 'Y' to remove duplicates while sorting
    :return: exit status of the novosort shell command; the process exits with status 1 when the
             command cannot be launched
    """
    if rmdup == 'Y':
        logfile = sample + ".novosort.rmdup.sort.pe.log"
        novosort_sort_cmd = 'mkdir novosort_tmp;' + novosort + " -c " + threads + " -m " + ram \
                            + "G --tmpdir novosort_tmp --rd --kt -o " + sample + ".rmdup.srt.bam --index " \
                            + sample + ".bam > " + log_dir + logfile + " 2>&1"
    else:
        logfile = sample + ".novosort.sort.pe.log"
        novosort_sort_cmd = 'mkdir novosort_tmp;' + novosort + " --threads " + threads + " --ram " \
                            + ram + "G --tmpdir novosort_tmp -o " + sample + ".srt.bam --index " \
                            + sample + ".bam > " + log_dir + logfile + " 2>&1"
    log(log_dir + logfile, date_time() + novosort_sort_cmd + "\n")
    try:
        f = subprocess.call(novosort_sort_cmd, shell=True)
        subprocess.call('rm -rf novosort_tmp', shell=True)
    except (OSError, ValueError):
        # only launch failures are handled; KeyboardInterrupt/SystemExit are no longer swallowed
        log(log_dir + logfile, 'novosort sort failed for sample ' + sample + '\n')
        exit(1)
    return f
def annot_vcf_vep_pipe(config_file, sample_pair, in_suffix, out_suffix, in_mutect, source):
    """Annotate one sample pair's VCF with VEP and generate the matching prioritized report.

    VEP is started with a buffer size of 5000 and supervised through watch_mem.  When watch_mem
    reports a non-zero status the partial output is removed and VEP is run once more with half the
    buffer size.  For source == 'scalpel' the calls are first passed through pass_filter and
    annotation is skipped when it returns 0.  gen_snv_report is used for source == 'mutect',
    gen_indel_report otherwise.

    :param config_file: configuration handed to parse_config
    :param sample_pair: pair name; used for the input, output and log file names
    :param in_suffix: suffix appended to the pair name to get the input VCF
    :param out_suffix: suffix appended to the pair name to get the annotated VCF
    :param in_mutect: suffix of the mutect output file handed to the substitution report
    :param source: caller name; 'scalpel' and 'mutect' are handled specially
    :return: 0 on success or when nothing passed the scalpel filter; the process exits with status 1
             when the VEP retry or report generation fails
    """
    (vep_tool, vep_cache, fasta, report, dbsnp, vcache, plugin_dir, threads, intvl, dustmask_flag, wg_flag,
     tx_index, project_dir, project, analysis) = parse_config(config_file)
    # scale back on the forking a bit
    if int(threads) > 2:
        threads = str(int(threads) - 1)
    loc = 'LOGS/' + sample_pair + '.vep91_anno.log'
    ana_dir = project_dir + project + '/' + analysis + '/' + sample_pair + '/OUTPUT'
    in_vcf = ana_dir + '/' + sample_pair + in_suffix
    out_vcf = sample_pair + out_suffix
    if source == 'scalpel':
        z_check = pass_filter(ana_dir, sample_pair, in_suffix, dustmask_flag)
        if z_check == 0:
            log(loc, date_time() + '0 variant calls PASS scalpel\'s filters, skipping annotation!\n')
            return 0
        in_vcf = sample_pair + '.somatic_indel.PASS.vcf'
    buffer_size = '5000'
    run_cmd = run_vep(vep_tool, in_vcf, out_vcf, buffer_size, threads, fasta, vep_cache, vcache, loc, plugin_dir)
    log(loc, date_time() + 'Annotating sample ' + sample_pair + in_suffix + ' ' + run_cmd + '\n')
    # from stack overflow to allow killing of spawned processes in main process fails for cleaner restart
    check = subprocess.Popen(run_cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
    check_run = watch_mem(check, source, sample_pair, loc)
    if check_run != 0:
        # floor division so the retry buffer size stays a whole number ('2500', not '2500.0')
        buffer_size = str(int(buffer_size) // 2)
        clean_up = 'rm ' + out_vcf + '*'
        log(loc, date_time() + 'VEP failed.  Status of run was ' + str(check_run)
            + ' Trying smaller buffer size of ' + buffer_size + '\n' + clean_up + '\n')
        try:
            os.killpg(os.getpgid(check.pid), signal.SIGINT)
        except OSError:
            # the process group is already gone; nothing left to kill before retrying
            pass
        subprocess.call(clean_up, shell=True)
        run_cmd = run_vep(vep_tool, in_vcf, out_vcf, buffer_size, threads, fasta, vep_cache, vcache, loc,
                          plugin_dir)
        log(loc, date_time() + 'Annotating sample ' + sample_pair + in_suffix + '\n')
        check = subprocess.call(run_cmd, shell=True)
        if check != 0:
            log(loc, date_time() + 'VEP failed for sample ' + sample_pair + '\n')
            exit(1)
    # NOTE(review): the flattened original does not show which 'if' the success branch belonged to;
    # success is logged here whenever VEP finished, on the first attempt or on the retry.
    log(loc, date_time() + 'VEP annotation of ' + sample_pair + in_suffix + ' successful!\n')
    # NOTE(review): this compares vep_cache (handed to run_vep next to vcache) with '84'; presumably
    # the cache version vcache was meant - confirm before changing, the deprecated reports may take
    # different arguments.
    if vep_cache == '84':
        from annotation.deprecated.vep_substitution_report import gen_report as gen_snv_report
        from annotation.deprecated.vep_indel_report import gen_report as gen_indel_report
    else:
        from annotation.VEP91_substitution_report import gen_report as gen_snv_report
        from annotation.VEP91_indel_report import gen_report as gen_indel_report
    if source == 'mutect':
        if wg_flag == 'y':
            intvl = 'n'
        check = gen_snv_report(out_vcf, ana_dir + '/' + sample_pair + in_mutect, intvl, tx_index, vcache)
    else:
        check = gen_indel_report(out_vcf, tx_index, vcache)
    if check != 0:
        log(loc, date_time() + 'Report generation for ' + out_vcf + ' failed\n')
        exit(1)
    return 0