def VarDict(variant_id, vardict_variants):
    """Score one variant against the VarDict records found at its coordinate.

    Args:
        variant_id: Key used to look the variant up in ``vardict_variants``.
        vardict_variants: Mapping of variant keys to VarDict VCF records.

    Returns:
        A 5-tuple ``(classification, msi, msilen, shift3, score)``.
        ``classification`` is 1 for a PASS record whose INFO mentions
        'Somatic', 0.5 for a 'Somatic' record that carries no disqualifying
        filter and at most two filter entries, and 0 otherwise. ``score`` is
        the SSF INFO value run through ``genome.p2phred(..., max_phred=100)``,
        or NaN when SSF is absent/empty. When the variant is not present at
        all the result is ``(0, nan, nan, nan, nan)``.
    """
    # Nothing from VarDict at this site: not called, no annotations.
    if variant_id not in vardict_variants:
        return 0, nan, nan, nan, nan

    record = vardict_variants[variant_id]

    if record.filters == 'PASS' and 'Somatic' in record.info:
        classification = 1
    elif 'Somatic' not in record.info:
        classification = 0
    else:
        failed = record.filters.split(';')

        # Filters that rule the call out no matter what else is present.
        hard_filters = {
            'd7', 'd5', 'DIFF0.2', 'LongAT', 'MAF0.05', 'MSI6',
            'NM4', 'NM4.25', 'pSTD', 'SN1.5',
        }

        # The two conditional filters are only consulted (and their values
        # only parsed) when no hard filter has already disqualified the call.
        if not hard_filters.isdisjoint(failed):
            disqualified = True
        elif 'P0.05' in failed and float(record.get_info_value('SSF')) >= 0.15:
            disqualified = True
        elif ('v3' in failed or 'v4' in failed) and int(record.get_sample_value('VD', 0)) < 3:
            disqualified = True
        else:
            disqualified = False

        if not disqualified and len(failed) <= 2:
            classification = 0.5
        else:
            classification = 0

    # Somatic score:
    ssf = record.get_info_value('SSF')
    if ssf:
        score = genome.p2phred(float(ssf), max_phred=100)
    else:
        score = nan

    # MSI, MSILEN, and SHIFT3:
    msi = find_MSI(record)
    msilen = find_MSILEN(record)
    shift3 = find_SHIFT3(record)

    return classification, msi, msilen, shift3, score
def VarDict(variant_id, vardict_variants):
    """Return VarDict's classification and annotations for ``variant_id``.

    Args:
        variant_id: Key identifying the variant in ``vardict_variants``.
        vardict_variants: Mapping of variant keys to VarDict VCF records.

    Returns:
        ``(vardict_classification, msi, msilen, shift3, score_vardict)``.
        The classification is 1 (PASS and 'Somatic' in INFO), 0.5 ('Somatic',
        no disqualifying filter, and no more than two filter entries) or 0.
        ``score_vardict`` is ``genome.p2phred(float(SSF), max_phred=100)`` when
        the SSF INFO value is non-empty and NaN otherwise. A variant that is
        not in the mapping yields ``(0, nan, nan, nan, nan)``.
    """
    if variant_id not in vardict_variants:
        missing = nan
        return 0, missing, missing, missing, missing

    call = vardict_variants[variant_id]

    if (call.filters == 'PASS') and ('Somatic' in call.info):
        verdict = 1
    elif 'Somatic' in call.info:
        tags = call.filters.split(';')

        # Any one of these filter tags disqualifies the call outright.
        unconditional = (
            'd7', 'd5', 'DIFF0.2', 'LongAT', 'MAF0.05', 'MSI6',
            'NM4', 'NM4.25', 'pSTD', 'SN1.5',
        )

        # `or` short-circuits left to right, so SSF / VD are only read when
        # every earlier test has come back False.
        rejected = (
            any(tag in tags for tag in unconditional)
            or ('P0.05' in tags
                and float(call.get_info_value('SSF')) >= 0.15)
            or (('v3' in tags or 'v4' in tags)
                and int(call.get_sample_value('VD', 0)) < 3)
        )

        verdict = 0.5 if (not rejected and len(tags) <= 2) else 0
    else:
        verdict = 0

    # Somatic Score:
    raw_ssf = call.get_info_value('SSF')
    phred_ssf = genome.p2phred(float(raw_ssf), max_phred=100) if raw_ssf else nan

    # MSI, MSILEN, and SHIFT3:
    msi_value = find_MSI(call)
    msilen_value = find_MSILEN(call)
    shift3_value = find_SHIFT3(call)

    return verdict, msi_value, msilen_value, shift3_value, phred_ssf
def JSM(variant_id, jsm_variants):
    """Report whether JointSNVMix2 has a record for a variant, and its score.

    Args:
        variant_id: Key used to look the variant up in ``jsm_variants``.
        jsm_variants: Mapping of variant keys to JointSNVMix2 VCF records.

    Returns:
        ``(classification, score)``. When the variant is present the
        classification is 1 and the score is
        ``genome.p2phred(1 - AAAB - AABB, max_phred=50)`` computed from the
        record's AAAB and AABB INFO values; otherwise ``(0, nan)``.
    """
    if variant_id not in jsm_variants:
        return 0, nan

    record = jsm_variants[variant_id]

    p_aaab = float(record.get_info_value('AAAB'))
    p_aabb = float(record.get_info_value('AABB'))

    # Whatever probability is left after removing the AAAB and AABB terms.
    remainder = 1 - p_aaab - p_aabb

    return 1, genome.p2phred(remainder, max_phred=50)
def vcf2tsv(is_vcf=None, is_bed=None, is_pos=None, nbam_fn=None, tbam_fn=None,
            truth=None, cosmic=None, dbsnp=None, mutect=None, varscan=None,
            jsm=None, sniper=None, vardict=None, muse=None, lofreq=None,
            scalpel=None, strelka=None, tnscope=None, platypus=None,
            dedup=True, min_mq=1, min_bq=5, min_caller=0, ref_fa=None,
            p_scale=None, outfile=None):
    """Write one TSV row of caller and BAM features per candidate variant (paired tumor/normal).

    Candidate sites come from the first of ``is_vcf``, ``is_bed``, ``is_pos``
    that is supplied; with none of them, every contig listed in
    ``ref_fa + '.fai'`` is walked from position 1 to its length. For each
    site, the records at the same coordinate are pulled from every supplied
    caller / truth / dbSNP / COSMIC VCF, features are extracted from both BAM
    files and the reference, and a row formatted with ``out_header`` is
    written to ``outfile``.

    Args:
        is_vcf, is_bed, is_pos: Path to the site list in VCF, BED, or
            tab-separated "chrom<TAB>pos" form (first non-empty one wins).
        nbam_fn, tbam_fn: Normal and tumor alignment files, opened with pysam.
        truth, cosmic, dbsnp: Optional annotation VCF paths.
        mutect, varscan, jsm, sniper, vardict, muse, lofreq, scalpel,
            strelka, tnscope, platypus: Optional caller VCF paths.
        dedup: Accepted for interface compatibility; not read by this function.
        min_mq, min_bq: Forwarded to ``sequencing_features.from_bam``.
        min_caller: A variant is written only if the sum of its caller
            classifications is at least this value.
        ref_fa: Reference FASTA path; ``ref_fa + '.fai'`` must exist.
        p_scale: 'phred' or 'fraction' (case-insensitive) to re-scale
            p-value-like outputs via ``rescale``; anything else disables it.
        outfile: Path of the TSV to write.

    Raises:
        Exception: If the input VCF is detected as not coordinate-sorted.
    """
    # Convert contig_sequence to chrom_seq dict:
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not:
    if p_scale is None:
        logger.info('NO RE-SCALING')
    elif p_scale.lower() == 'phred':
        p_scale = 'phred'
    elif p_scale.lower() == 'fraction':
        p_scale = 'fraction'
    else:
        p_scale = None
        logger.info('NO RE-SCALING')

    nan = float('nan')

    # Every handle opened below is recorded here so that all of them are
    # closed even when processing aborts part-way (e.g. unsorted input).
    opened_files = []

    def _open_vcf(path):
        """Open a VCF, remember the handle for cleanup, and return it with its first non-header line."""
        handle = genome.open_textfile(path)
        opened_files.append(handle)
        return handle, genome.skip_vcf_header(handle)

    ## Running
    with genome.open_textfile(mysites) as my_sites, open(outfile, 'w') as outhandle:
        try:
            my_line = my_sites.readline().rstrip()

            nbam = pysam.AlignmentFile(nbam_fn, reference_filename=ref_fa)
            opened_files.append(nbam)
            tbam = pysam.AlignmentFile(tbam_fn, reference_filename=ref_fa)
            opened_files.append(tbam)
            # From here on ref_fa is the opened FASTA, not the path.
            ref_fa = pysam.FastaFile(ref_fa)
            opened_files.append(ref_fa)

            if truth:
                truth, truth_line = _open_vcf(truth)
            if cosmic:
                cosmic, cosmic_line = _open_vcf(cosmic)
            if dbsnp:
                dbsnp, dbsnp_line = _open_vcf(dbsnp)

            # Incorporate callers: get thru the #'s
            if mutect:
                mutect, mutect_line = _open_vcf(mutect)
            if varscan:
                varscan, varscan_line = _open_vcf(varscan)
            if jsm:
                jsm, jsm_line = _open_vcf(jsm)
            if sniper:
                sniper, sniper_line = _open_vcf(sniper)
            if vardict:
                vardict, vardict_line = _open_vcf(vardict)
            if muse:
                muse, muse_line = _open_vcf(muse)
            if lofreq:
                lofreq, lofreq_line = _open_vcf(lofreq)
            if scalpel:
                scalpel, scalpel_line = _open_vcf(scalpel)
            if strelka:
                strelka, strelka_line = _open_vcf(strelka)
            if tnscope:
                tnscope, tnscope_line = _open_vcf(tnscope)
            if platypus:
                platypus, platypus_line = _open_vcf(platypus)

            # Get through all the headers:
            while my_line.startswith('#') or my_line.startswith('track='):
                my_line = my_sites.readline().rstrip()

            # First coordinate, for later purpose of making sure the input is sorted properly
            coordinate_i = re.match(genome.pattern_chr_position, my_line)
            coordinate_i = coordinate_i.group() if coordinate_i else ''

            # First line: the header is the format template with its braces stripped.
            outhandle.write(out_header.replace('{', '').replace('}', '') + '\n')

            while my_line:

                # If VCF, get all the variants with the same coordinate into a list:
                if is_vcf:
                    my_vcf = genome.Vcf_line(my_line)
                    my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                    # One entry per ALT allele, so multi-allelic records are split.
                    variants_at_my_coordinate = []
                    alt_bases = my_vcf.altbase.split(',')
                    for alt_i in alt_bases:
                        vcf_i = copy(my_vcf)
                        vcf_i.altbase = alt_i
                        variants_at_my_coordinate.append(vcf_i)

                    # As long as the "coordinate" stays the same, it will keep reading until it's different.
                    while my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):
                        my_line = my_sites.readline().rstrip()
                        my_vcf = genome.Vcf_line(my_line)

                        ########## This block of code ensures the input VCF file is properly sorted ##
                        coordinate_j = re.match(genome.pattern_chr_position, my_line)
                        coordinate_j = coordinate_j.group() if coordinate_j else ''

                        if genome.whoisbehind(coordinate_i, coordinate_j, chrom_seq) == 1:
                            raise Exception('{} does not seem to be properly sorted.'.format(mysites))

                        coordinate_i = coordinate_j
                        ###################################################################################

                        if my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):
                            alt_bases = my_vcf.altbase.split(',')
                            for alt_i in alt_bases:
                                vcf_i = copy(my_vcf)
                                vcf_i.altbase = alt_i
                                variants_at_my_coordinate.append(vcf_i)

                elif is_bed:
                    bed_item = my_line.split('\t')
                    my_coordinates = genomic_coordinates(bed_item[0], int(bed_item[1]) + 1, int(bed_item[2]))

                elif is_pos:
                    pos_item = my_line.split('\t')
                    my_coordinates = genomic_coordinates(pos_item[0], int(pos_item[1]), int(pos_item[1]))

                elif fai_file:
                    fai_item = my_line.split('\t')
                    my_coordinates = genomic_coordinates(fai_item[0], 1, int(fai_item[1]))

                ##### ##### ##### ##### ##### #####
                for my_coordinate in my_coordinates:

                    ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                    if is_vcf:
                        ref_bases = []
                        alt_bases = []
                        indel_lengths = []
                        all_my_identifiers = []

                        for variant_i in variants_at_my_coordinate:
                            ref_base = variant_i.refbase
                            first_alt = variant_i.altbase.split(',')[0]
                            indel_length = len(first_alt) - len(ref_base)

                            ref_bases.append(ref_base)
                            alt_bases.append(first_alt)
                            indel_lengths.append(indel_length)

                            # Extract these information if they exist in the VCF file, but they could be
                            # re-written if dbSNP/COSMIC are supplied.
                            # NOTE(review): these four are plain scalars, so every variant at this
                            # coordinate ends up with the LAST record's values unless dbSNP/COSMIC
                            # files overwrite them -- confirm whether per-variant values are wanted.
                            if_dbsnp = 1 if re.search(r'rs[0-9]+', variant_i.identifier) else 0
                            if_cosmic = 1 if re.search(r'COS[MN][0-9]+', variant_i.identifier) else 0
                            if_common = 1 if variant_i.get_info_value('COMMON') == '1' else 0
                            num_cases = variant_i.get_info_value('CNT') if variant_i.get_info_value('CNT') else nan

                            if variant_i.identifier == '.':
                                my_identifier_i = set()
                            else:
                                my_identifier_i = variant_i.identifier.split(';')
                                my_identifier_i = set(my_identifier_i)

                            all_my_identifiers.append(my_identifier_i)

                    ## If not, 1) get ref_base, first_alt from other VCF files.
                    #          2) Create placeholders for dbSNP and COSMIC that can be overwritten
                    #             with dbSNP/COSMIC VCF files (if provided)
                    else:
                        variants_at_my_coordinate = [None]  # Just to have something to iterate
                        ref_base = first_alt = indel_length = None

                        # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                        if_dbsnp = if_cosmic = if_common = num_cases = nan

                    #################################### Find the same coordinate in those VCF files ####################################
                    if mutect:
                        got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(
                            my_coordinate, mutect_line, mutect, chrom_seq)
                    if varscan:
                        got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(
                            my_coordinate, varscan_line, varscan, chrom_seq)
                    if jsm:
                        got_jsm, jsm_variants, jsm_line = genome.find_vcf_at_coordinate(
                            my_coordinate, jsm_line, jsm, chrom_seq)
                    if sniper:
                        got_sniper, sniper_variants, sniper_line = genome.find_vcf_at_coordinate(
                            my_coordinate, sniper_line, sniper, chrom_seq)
                    if vardict:
                        got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(
                            my_coordinate, vardict_line, vardict, chrom_seq)
                    if muse:
                        got_muse, muse_variants, muse_line = genome.find_vcf_at_coordinate(
                            my_coordinate, muse_line, muse, chrom_seq)
                    if lofreq:
                        got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(
                            my_coordinate, lofreq_line, lofreq, chrom_seq)
                    if scalpel:
                        got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(
                            my_coordinate, scalpel_line, scalpel, chrom_seq)
                    if strelka:
                        got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(
                            my_coordinate, strelka_line, strelka, chrom_seq)
                    if tnscope:
                        got_tnscope, tnscope_variants, tnscope_line = genome.find_vcf_at_coordinate(
                            my_coordinate, tnscope_line, tnscope, chrom_seq)
                    if platypus:
                        got_platypus, platypus_variants, platypus_line = genome.find_vcf_at_coordinate(
                            my_coordinate, platypus_line, platypus, chrom_seq)
                    if truth:
                        got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate(
                            my_coordinate, truth_line, truth, chrom_seq)
                    if dbsnp:
                        got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(
                            my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                    if cosmic:
                        got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(
                            my_coordinate, cosmic_line, cosmic, chrom_seq)

                    # Now, use pysam to look into the BAM file(s), variant by variant from the input:
                    for ith_call, my_call in enumerate(variants_at_my_coordinate):

                        if is_vcf:
                            # The particular line in the input VCF file:
                            variant_id = ((my_call.chromosome, my_call.position), my_call.refbase, my_call.altbase)

                            ref_base = ref_bases[ith_call]
                            first_alt = alt_bases[ith_call]
                            indel_length = indel_lengths[ith_call]
                            my_identifiers = all_my_identifiers[ith_call]

                        else:
                            variant_id = ((my_coordinate[0], my_coordinate[1]), ref_base, first_alt)
                            # No input record to take IDs from; start empty so the
                            # truth/dbSNP/COSMIC steps and the ID join below still work.
                            my_identifiers = set()

                        # Reset for every variant: counting per coordinate would let one
                        # variant's caller support leak into the next variant at the same site.
                        num_callers = 0

                        #################### Collect Caller Vcf ####################:
                        if mutect:
                            mutect_classification, nlod, tlod, tandem, ecnt = annotate_caller.MuTect(
                                variant_id, mutect_variants)
                            num_callers += mutect_classification
                        else:
                            mutect_classification = nlod = tlod = tandem = ecnt = nan

                        if varscan:
                            varscan_classification = annotate_caller.VarScan(variant_id, varscan_variants)
                            num_callers += varscan_classification
                        else:
                            varscan_classification = nan

                        if jsm:
                            jointsnvmix2_classification, score_jointsnvmix2 = annotate_caller.JSM(
                                variant_id, jsm_variants)
                            num_callers += jointsnvmix2_classification
                        else:
                            jointsnvmix2_classification = score_jointsnvmix2 = nan

                        if sniper:
                            sniper_classification, score_somaticsniper = annotate_caller.SomaticSniper(
                                variant_id, sniper_variants)
                            num_callers += sniper_classification
                        else:
                            sniper_classification = score_somaticsniper = nan

                        if vardict:
                            vardict_classification, msi, msilen, shift3, score_vardict = annotate_caller.VarDict(
                                variant_id, vardict_variants)
                            num_callers += vardict_classification
                        else:
                            vardict_classification = msi = msilen = shift3 = score_vardict = nan

                        if muse:
                            muse_classification = annotate_caller.MuSE(variant_id, muse_variants)
                            num_callers += muse_classification
                        else:
                            muse_classification = nan

                        if lofreq:
                            lofreq_classification = annotate_caller.LoFreq(variant_id, lofreq_variants)
                            num_callers += lofreq_classification
                        else:
                            lofreq_classification = nan

                        if scalpel:
                            scalpel_classification = annotate_caller.Scalpel(variant_id, scalpel_variants)
                            num_callers += scalpel_classification
                        else:
                            scalpel_classification = nan

                        if strelka:
                            strelka_classification, somatic_evs, qss, tqss = annotate_caller.Strelka(
                                variant_id, strelka_variants)
                            num_callers += strelka_classification
                        else:
                            strelka_classification = somatic_evs = qss = tqss = nan

                        if tnscope:
                            tnscope_classification = annotate_caller.TNscope(variant_id, tnscope_variants)
                            num_callers += tnscope_classification
                        else:
                            tnscope_classification = nan

                        if platypus:
                            platypus_classification = annotate_caller.countPASS(variant_id, platypus_variants)
                            num_callers += platypus_classification
                        else:
                            platypus_classification = nan

                        # Potentially write the output only if it meets this threshold:
                        if num_callers >= min_caller:

                            ########## Ground truth file ##########
                            if truth:
                                if variant_id in truth_variants:
                                    judgement = 1
                                    my_identifiers.add('TruePositive')
                                else:
                                    judgement = 0
                                    my_identifiers.add('FalsePositive')
                            else:
                                judgement = nan

                            ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                            if dbsnp:
                                if_dbsnp, if_common, rsID = annotate_caller.dbSNP(variant_id, dbsnp_variants)
                                for ID_i in rsID:
                                    my_identifiers.add(ID_i)

                            ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                            if cosmic:
                                if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(variant_id, cosmic_variants)
                                for ID_i in cosmicID:
                                    my_identifiers.add(ID_i)

                            ########## ######### ######### INFO EXTRACTION FROM BAM FILES ########## ######### #########
                            nBamFeatures = sequencing_features.from_bam(
                                nbam, my_coordinate, ref_base, first_alt, min_mq, min_bq)
                            tBamFeatures = sequencing_features.from_bam(
                                tbam, my_coordinate, ref_base, first_alt, min_mq, min_bq)

                            n_ref = nBamFeatures['ref_for'] + nBamFeatures['ref_rev']
                            n_alt = nBamFeatures['alt_for'] + nBamFeatures['alt_rev']
                            t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev']
                            t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev']

                            sor = sequencing_features.somaticOddRatio(n_ref, n_alt, t_ref, t_alt)

                            # Calculate VarScan2's SCC directly without using VarScan2 output:
                            try:
                                score_varscan2 = genome.p2phred(
                                    stats.fisher_exact(((t_alt, n_alt), (t_ref, n_ref)), alternative='greater')[1])
                            except ValueError:
                                score_varscan2 = nan

                            # Homopolymer eval:
                            homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(
                                ref_fa, my_coordinate, ref_base, first_alt)

                            # Fill the ID field of the TSV/VCF
                            my_identifiers = ';'.join(my_identifiers) if my_identifiers else '.'

                            ###
                            out_line = out_header.format(
                                CHROM=my_coordinate[0],
                                POS=my_coordinate[1],
                                ID=my_identifiers,
                                REF=ref_base,
                                ALT=first_alt,
                                if_MuTect=mutect_classification,
                                if_VarScan2=varscan_classification,
                                if_JointSNVMix2=jointsnvmix2_classification,
                                if_SomaticSniper=sniper_classification,
                                if_VarDict=vardict_classification,
                                MuSE_Tier=muse_classification,
                                if_LoFreq=lofreq_classification,
                                if_Scalpel=scalpel_classification,
                                if_Strelka=strelka_classification,
                                if_TNscope=tnscope_classification,
                                if_Platypus=platypus_classification,
                                Strelka_Score=somatic_evs,
                                Strelka_QSS=qss,
                                Strelka_TQSS=tqss,
                                VarScan2_Score=rescale(score_varscan2, 'phred', p_scale, 1001),
                                SNVMix2_Score=rescale(score_jointsnvmix2, 'phred', p_scale, 1001),
                                Sniper_Score=rescale(score_somaticsniper, 'phred', p_scale, 1001),
                                VarDict_Score=rescale(score_vardict, 'phred', p_scale, 1001),
                                if_dbsnp=if_dbsnp,
                                COMMON=if_common,
                                if_COSMIC=if_cosmic,
                                COSMIC_CNT=num_cases,
                                Consistent_Mates=tBamFeatures['consistent_mates'],
                                Inconsistent_Mates=tBamFeatures['inconsistent_mates'],
                                N_DP=nBamFeatures['dp'],
                                nBAM_REF_MQ='%g' % nBamFeatures['ref_mq'],
                                nBAM_ALT_MQ='%g' % nBamFeatures['alt_mq'],
                                nBAM_Z_Ranksums_MQ='%g' % nBamFeatures['z_ranksums_mq'],
                                nBAM_REF_BQ='%g' % nBamFeatures['ref_bq'],
                                nBAM_ALT_BQ='%g' % nBamFeatures['alt_bq'],
                                nBAM_Z_Ranksums_BQ='%g' % nBamFeatures['z_ranksums_bq'],
                                nBAM_REF_NM='%g' % nBamFeatures['ref_NM'],
                                nBAM_ALT_NM='%g' % nBamFeatures['alt_NM'],
                                nBAM_NM_Diff='%g' % nBamFeatures['NM_Diff'],
                                nBAM_REF_Concordant=nBamFeatures['ref_concordant_reads'],
                                nBAM_REF_Discordant=nBamFeatures['ref_discordant_reads'],
                                nBAM_ALT_Concordant=nBamFeatures['alt_concordant_reads'],
                                nBAM_ALT_Discordant=nBamFeatures['alt_discordant_reads'],
                                nBAM_Concordance_FET=rescale(nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001),
                                N_REF_FOR=nBamFeatures['ref_for'],
                                N_REF_REV=nBamFeatures['ref_rev'],
                                N_ALT_FOR=nBamFeatures['alt_for'],
                                N_ALT_REV=nBamFeatures['alt_rev'],
                                nBAM_StrandBias_FET=rescale(nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),
                                nBAM_Z_Ranksums_EndPos='%g' % nBamFeatures['z_ranksums_endpos'],
                                nBAM_REF_Clipped_Reads=nBamFeatures['ref_SC_reads'],
                                nBAM_ALT_Clipped_Reads=nBamFeatures['alt_SC_reads'],
                                nBAM_Clipping_FET=rescale(nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),
                                nBAM_MQ0=nBamFeatures['MQ0'],
                                nBAM_Other_Reads=nBamFeatures['noise_read_count'],
                                nBAM_Poor_Reads=nBamFeatures['poor_read_count'],
                                nBAM_REF_InDel_3bp=nBamFeatures['ref_indel_3bp'],
                                nBAM_REF_InDel_2bp=nBamFeatures['ref_indel_2bp'],
                                nBAM_REF_InDel_1bp=nBamFeatures['ref_indel_1bp'],
                                nBAM_ALT_InDel_3bp=nBamFeatures['alt_indel_3bp'],
                                nBAM_ALT_InDel_2bp=nBamFeatures['alt_indel_2bp'],
                                nBAM_ALT_InDel_1bp=nBamFeatures['alt_indel_1bp'],
                                M2_NLOD=nlod,
                                M2_TLOD=tlod,
                                M2_STR=tandem,
                                M2_ECNT=ecnt,
                                SOR=sor,
                                MSI=msi,
                                MSILEN=msilen,
                                SHIFT3=shift3,
                                MaxHomopolymer_Length=homopolymer_length,
                                SiteHomopolymer_Length=site_homopolymer_length,
                                T_DP=tBamFeatures['dp'],
                                tBAM_REF_MQ='%g' % tBamFeatures['ref_mq'],
                                tBAM_ALT_MQ='%g' % tBamFeatures['alt_mq'],
                                tBAM_Z_Ranksums_MQ='%g' % tBamFeatures['z_ranksums_mq'],
                                tBAM_REF_BQ='%g' % tBamFeatures['ref_bq'],
                                tBAM_ALT_BQ='%g' % tBamFeatures['alt_bq'],
                                tBAM_Z_Ranksums_BQ='%g' % tBamFeatures['z_ranksums_bq'],
                                tBAM_REF_NM='%g' % tBamFeatures['ref_NM'],
                                tBAM_ALT_NM='%g' % tBamFeatures['alt_NM'],
                                tBAM_NM_Diff='%g' % tBamFeatures['NM_Diff'],
                                tBAM_REF_Concordant=tBamFeatures['ref_concordant_reads'],
                                tBAM_REF_Discordant=tBamFeatures['ref_discordant_reads'],
                                tBAM_ALT_Concordant=tBamFeatures['alt_concordant_reads'],
                                tBAM_ALT_Discordant=tBamFeatures['alt_discordant_reads'],
                                tBAM_Concordance_FET=rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001),
                                T_REF_FOR=tBamFeatures['ref_for'],
                                T_REF_REV=tBamFeatures['ref_rev'],
                                T_ALT_FOR=tBamFeatures['alt_for'],
                                T_ALT_REV=tBamFeatures['alt_rev'],
                                tBAM_StrandBias_FET=rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),
                                tBAM_Z_Ranksums_EndPos='%g' % tBamFeatures['z_ranksums_endpos'],
                                tBAM_REF_Clipped_Reads=tBamFeatures['ref_SC_reads'],
                                tBAM_ALT_Clipped_Reads=tBamFeatures['alt_SC_reads'],
                                tBAM_Clipping_FET=rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),
                                tBAM_MQ0=tBamFeatures['MQ0'],
                                tBAM_Other_Reads=tBamFeatures['noise_read_count'],
                                tBAM_Poor_Reads=tBamFeatures['poor_read_count'],
                                tBAM_REF_InDel_3bp=tBamFeatures['ref_indel_3bp'],
                                tBAM_REF_InDel_2bp=tBamFeatures['ref_indel_2bp'],
                                tBAM_REF_InDel_1bp=tBamFeatures['ref_indel_1bp'],
                                tBAM_ALT_InDel_3bp=tBamFeatures['alt_indel_3bp'],
                                tBAM_ALT_InDel_2bp=tBamFeatures['alt_indel_2bp'],
                                tBAM_ALT_InDel_1bp=tBamFeatures['alt_indel_1bp'],
                                InDel_Length=indel_length,
                                TrueVariant_or_False=judgement,
                            )

                            # Write the row to the output file:
                            outhandle.write(out_line + '\n')

                # Read into the next line (VCF mode has already advanced while grouping by coordinate):
                if not is_vcf:
                    my_line = my_sites.readline().rstrip()

        finally:
            ########## Close all open files if they were opened ##########
            for opened_file in opened_files:
                opened_file.close()
def vcf2tsv(is_vcf=None, is_bed=None, is_pos=None, bam_fn=None, truth=None, cosmic=None, dbsnp=None, mutect=None, varscan=None, vardict=None, lofreq=None, scalpel=None, strelka=None, dedup=True, min_mq=1, min_bq=5, min_caller=0, ref_fa=None, p_scale=None, outfile=None): # Convert contig_sequence to chrom_seq dict: fai_file = ref_fa + '.fai' chrom_seq = genome.faiordict2contigorder(fai_file, 'fai') # Determine input format: if is_vcf: mysites = is_vcf elif is_bed: mysites = is_bed elif is_pos: mysites = is_pos else: mysites = fai_file logger.info('No position supplied. Will evaluate the whole genome.') # Re-scale output or not: if p_scale == None: logger.info('NO RE-SCALING') elif p_scale.lower() == 'phred': p_scale = 'phred' elif p_scale.lower() == 'fraction': p_scale = 'fraction' else: p_scale = None logger.info('NO RE-SCALING') # Define NaN and Inf: nan = float('nan') inf = float('inf') pattern_chr_position = genome.pattern_chr_position ## Running with genome.open_textfile(mysites) as my_sites, open(outfile, 'w') as outhandle: my_line = my_sites.readline().rstrip() bam = pysam.AlignmentFile(bam_fn, reference_filename=ref_fa) ref_fa = pysam.FastaFile(ref_fa) if truth: truth = genome.open_textfile(truth) truth_line = genome.skip_vcf_header(truth) if cosmic: cosmic = genome.open_textfile(cosmic) cosmic_line = genome.skip_vcf_header(cosmic) if dbsnp: dbsnp = genome.open_textfile(dbsnp) dbsnp_line = genome.skip_vcf_header(dbsnp) # 6 Incorporate callers: get thru the #'s if mutect: mutect = genome.open_textfile(mutect) mutect_line = genome.skip_vcf_header(mutect) if varscan: varscan = genome.open_textfile(varscan) varscan_line = genome.skip_vcf_header(varscan) if vardict: vardict = genome.open_textfile(vardict) vardict_line = genome.skip_vcf_header(vardict) if lofreq: lofreq = genome.open_textfile(lofreq) lofreq_line = genome.skip_vcf_header(lofreq) if scalpel: scalpel = genome.open_textfile(scalpel) scalpel_line = genome.skip_vcf_header(scalpel) if strelka: strelka = 
genome.open_textfile(strelka) strelka_line = genome.skip_vcf_header(strelka) # Get through all the headers: while my_line.startswith('#') or my_line.startswith('track='): my_line = my_sites.readline().rstrip() # First coordinate, for later purpose of making sure the input is sorted properly coordinate_i = re.match(genome.pattern_chr_position, my_line) coordinate_i = coordinate_i.group() if coordinate_i else '' # First line: outhandle.write(out_header.replace('{', '').replace('}', '') + '\n') while my_line: # If VCF, get all the variants with the same coordinate into a list: if is_vcf: my_vcf = genome.Vcf_line(my_line) my_coordinates = [(my_vcf.chromosome, my_vcf.position)] variants_at_my_coordinate = [] alt_bases = my_vcf.altbase.split(',') for alt_i in alt_bases: vcf_i = copy(my_vcf) vcf_i.altbase = alt_i variants_at_my_coordinate.append(vcf_i) # As long as the "coordinate" stays the same, it will keep reading until it's different. while my_coordinates[0] == (my_vcf.chromosome, my_vcf.position): my_line = my_sites.readline().rstrip() my_vcf = genome.Vcf_line(my_line) ########## This block is code is to ensure the input VCF file is properly sorted ## coordinate_j = re.match(genome.pattern_chr_position, my_line) coordinate_j = coordinate_j.group() if coordinate_j else '' if genome.whoisbehind(coordinate_i, coordinate_j, chrom_seq) == 1: raise Exception( '{} does not seem to be properly sorted.'.format( mysites)) coordinate_i = coordinate_j ################################################################################### if my_coordinates[0] == (my_vcf.chromosome, my_vcf.position): alt_bases = my_vcf.altbase.split(',') for alt_i in alt_bases: vcf_i = copy(my_vcf) vcf_i.altbase = alt_i variants_at_my_coordinate.append(vcf_i) elif is_bed: bed_item = my_line.split('\t') my_coordinates = genomic_coordinates(bed_item[0], int(bed_item[1]) + 1, int(bed_item[2])) elif is_pos: pos_item = my_line.split('\t') my_coordinates = genomic_coordinates(pos_item[0], int(pos_item[1]), 
int(pos_item[1])) elif fai_file: fai_item = my_line.split('\t') my_coordinates = genomic_coordinates(fai_item[0], 1, int(fai_item[1])) ##### ##### ##### ##### ##### ##### for my_coordinate in my_coordinates: ######## If VCF, can get ref base, variant base, as well as other identifying information ######## if is_vcf: ref_bases = [] alt_bases = [] indel_lengths = [] all_my_identifiers = [] for variant_i in variants_at_my_coordinate: ref_base = variant_i.refbase first_alt = variant_i.altbase.split(',')[0] indel_length = len(first_alt) - len(ref_base) ref_bases.append(ref_base) alt_bases.append(first_alt) indel_lengths.append(indel_length) # Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied. if_dbsnp = 1 if re.search(r'rs[0-9]+', variant_i.identifier) else 0 if_cosmic = 1 if re.search(r'COS[MN][0-9]+', variant_i.identifier) else 0 if_common = 1 if variant_i.get_info_value( 'COMMON') == '1' else 0 num_cases = variant_i.get_info_value( 'CNT') if variant_i.get_info_value('CNT') else nan if variant_i.identifier == '.': my_identifier_i = set() else: my_identifier_i = variant_i.identifier.split(';') my_identifier_i = set(my_identifier_i) all_my_identifiers.append(my_identifier_i) ## If not, 1) get ref_base, first_alt from other VCF files. # 2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided) else: variants_at_my_coordinate = [ None ] # Just to have something to iterate ref_base = first_alt = indel_length = None # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN. 
if_dbsnp = if_cosmic = if_common = num_cases = nan #################################### Find the same coordinate in those VCF files #################################### if mutect: got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate( my_coordinate, mutect_line, mutect, chrom_seq) if varscan: got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate( my_coordinate, varscan_line, varscan, chrom_seq) if vardict: got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate( my_coordinate, vardict_line, vardict, chrom_seq) if lofreq: got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate( my_coordinate, lofreq_line, lofreq, chrom_seq) if scalpel: got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate( my_coordinate, scalpel_line, scalpel, chrom_seq) if strelka: got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate( my_coordinate, strelka_line, strelka, chrom_seq) if truth: got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate( my_coordinate, truth_line, truth, chrom_seq) if dbsnp: got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate( my_coordinate, dbsnp_line, dbsnp, chrom_seq) if cosmic: got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate( my_coordinate, cosmic_line, cosmic, chrom_seq) # Now, use pysam to look into the tBAM file(s), variant by variant from the input: for ith_call, my_call in enumerate(variants_at_my_coordinate): if is_vcf: # The particular line in the input VCF file: variant_id = ((my_call.chromosome, my_call.position), my_call.refbase, my_call.altbase) ref_base = ref_bases[ith_call] first_alt = alt_bases[ith_call] indel_length = indel_lengths[ith_call] my_identifiers = all_my_identifiers[ith_call] else: variant_id = ((my_coordinate[0], my_coordinate[1]), ref_base, first_alt) # Reset num_caller to 0 for each variant in the same coordinate num_callers = 0 #################### Collect 
Caller Vcf ####################: if mutect: mutect_classification, tlod, ecnt = annotate_caller.ssMuTect( variant_id, mutect_variants) num_callers += mutect_classification else: mutect_classification = tlod = ecnt = nan if varscan: varscan_classification, score_varscan2 = annotate_caller.ssVarScan( variant_id, varscan_variants) num_callers += varscan_classification else: varscan_classification = score_varscan2 = nan if vardict: vardict_classification, msi, msilen, shift3, t_pmean, t_pstd, t_qstd = annotate_caller.ssVarDict( variant_id, vardict_variants) num_callers += vardict_classification else: vardict_classification = msi = msilen = shift3 = t_pmean = t_pstd = t_qstd = nan if lofreq: lofreq_classification = annotate_caller.ssLoFreq( variant_id, lofreq_variants) num_callers += lofreq_classification else: lofreq_classification = nan if scalpel: scalpel_classification = annotate_caller.ssScalpel( variant_id, scalpel_variants) num_callers += scalpel_classification else: scalpel_classification = nan if strelka: strelka_classification = annotate_caller.ssStrelka( variant_id, strelka_variants) num_callers += strelka_classification else: strelka_classification = nan # Potentially write the output only if it meets this threshold: if num_callers >= min_caller: ########## Ground truth file ########## if truth: if variant_id in truth_variants.keys(): judgement = 1 my_identifiers.add('TruePositive') else: judgement = 0 my_identifiers.add('FalsePositive') else: judgement = nan ########## dbSNP ########## Will overwrite dbSNP info from input VCF file if dbsnp: if_dbsnp, if_common, rsID = annotate_caller.dbSNP( variant_id, dbsnp_variants) for ID_i in rsID: my_identifiers.add(ID_i) ########## COSMIC ########## Will overwrite COSMIC info from input VCF file if cosmic: if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC( variant_id, cosmic_variants) for ID_i in cosmicID: my_identifiers.add(ID_i) ########## ######### INFO EXTRACTION FROM BAM FILES ########## ######### # Tumor 
tBAM file: tBamFeatures = sequencing_features.from_bam( bam, my_coordinate, ref_base, first_alt, min_mq, min_bq) # Homopolymer eval: homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( ref_fa, my_coordinate, ref_base, first_alt) # Linguistic sequence complexity in a +/-80bp window, but substring calculation stops at 20-bp substring. seq_span_80bp = ref_fa.fetch( my_coordinate[0], max(0, my_coordinate[1] - 41), my_coordinate[1] + 40) seq_left_80bp = ref_fa.fetch( my_coordinate[0], max(0, my_coordinate[1] - 81), my_coordinate[1]) seq_right_80bp = ref_fa.fetch(my_coordinate[0], my_coordinate[1], my_coordinate[1] + 81) if len(seq_span_80bp) > 20: LC_spanning = sequencing_features.subLC( seq_span_80bp, 20) else: LC_spanning = math.nan if len(seq_left_80bp) > 20: left_LC = sequencing_features.subLC( seq_left_80bp, 20) else: left_LC = math.nan if len(seq_right_80bp) > 20: right_LC = sequencing_features.subLC( seq_right_80bp, 20) else: right_LC = math.nan LC_adjacent = min(left_LC, right_LC) LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40) LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40) # Fill the ID field of the TSV/VCF my_identifiers = ';'.join( my_identifiers) if my_identifiers else '.' 
### out_line = out_header.format( \ CHROM = my_coordinate[0], \ POS = my_coordinate[1], \ ID = my_identifiers, \ REF = ref_base, \ ALT = first_alt, \ if_MuTect = mutect_classification, \ if_Strelka = strelka_classification, \ if_VarScan2 = varscan_classification, \ if_VarDict = vardict_classification, \ if_LoFreq = lofreq_classification, \ if_Scalpel = scalpel_classification, \ VarScan2_Score = rescale(score_varscan2, 'phred', p_scale, 1001), \ if_dbsnp = if_dbsnp, \ COMMON = if_common, \ if_COSMIC = if_cosmic, \ COSMIC_CNT = num_cases, \ Consistent_Mates = tBamFeatures['consistent_mates'], \ Inconsistent_Mates = tBamFeatures['inconsistent_mates'], \ Seq_Complexity_Span = LC_spanning_phred, \ Seq_Complexity_Adj = LC_adjacent_phred, \ M2_TLOD = tlod, \ M2_ECNT = ecnt, \ MSI = msi, \ MSILEN = msilen, \ SHIFT3 = shift3, \ MaxHomopolymer_Length = homopolymer_length, \ SiteHomopolymer_Length = site_homopolymer_length, \ T_DP = tBamFeatures['dp'], \ tBAM_REF_MQ = '%g' % tBamFeatures['ref_mq'], \ tBAM_ALT_MQ = '%g' % tBamFeatures['alt_mq'], \ tBAM_p_MannWhitneyU_MQ = '%g' % tBamFeatures['p_mannwhitneyu_mq'], \ tBAM_REF_BQ = '%g' % tBamFeatures['ref_bq'], \ tBAM_ALT_BQ = '%g' % tBamFeatures['alt_bq'], \ tBAM_p_MannWhitneyU_BQ = '%g' % tBamFeatures['p_mannwhitneyu_bq'], \ tBAM_REF_NM = '%g' % tBamFeatures['ref_NM'], \ tBAM_ALT_NM = '%g' % tBamFeatures['alt_NM'], \ tBAM_NM_Diff = '%g' % tBamFeatures['NM_Diff'], \ tBAM_REF_Concordant = tBamFeatures['ref_concordant_reads'], \ tBAM_REF_Discordant = tBamFeatures['ref_discordant_reads'], \ tBAM_ALT_Concordant = tBamFeatures['alt_concordant_reads'], \ tBAM_ALT_Discordant = tBamFeatures['alt_discordant_reads'], \ tBAM_Concordance_FET = rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \ T_REF_FOR = tBamFeatures['ref_for'], \ T_REF_REV = tBamFeatures['ref_rev'], \ T_ALT_FOR = tBamFeatures['alt_for'], \ T_ALT_REV = tBamFeatures['alt_rev'], \ tBAM_StrandBias_FET = rescale(tBamFeatures['strandbias_fet'], 'fraction', 
p_scale, 1001), \ tBAM_p_MannWhitneyU_EndPos = '%g' % tBamFeatures['p_mannwhitneyu_endpos'], \ tBAM_REF_Clipped_Reads = tBamFeatures['ref_SC_reads'], \ tBAM_ALT_Clipped_Reads = tBamFeatures['alt_SC_reads'], \ tBAM_Clipping_FET = rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001), \ tBAM_MQ0 = tBamFeatures['MQ0'], \ tBAM_Other_Reads = tBamFeatures['noise_read_count'], \ tBAM_Poor_Reads = tBamFeatures['poor_read_count'], \ tBAM_REF_InDel_3bp = tBamFeatures['ref_indel_3bp'], \ tBAM_REF_InDel_2bp = tBamFeatures['ref_indel_2bp'], \ tBAM_REF_InDel_1bp = tBamFeatures['ref_indel_1bp'], \ tBAM_ALT_InDel_3bp = tBamFeatures['alt_indel_3bp'], \ tBAM_ALT_InDel_2bp = tBamFeatures['alt_indel_2bp'], \ tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'], \ InDel_Length = indel_length, \ TrueVariant_or_False = judgement ) # Print it out to stdout: outhandle.write(out_line + '\n') # Read into the next line: if not is_vcf: my_line = my_sites.readline().rstrip() ########## Close all open files if they were opened ########## opened_files = (ref_fa, bam, truth, cosmic, dbsnp, mutect, varscan, vardict, lofreq, scalpel, strelka) [opened_file.close() for opened_file in opened_files if opened_file]
def _tsv2vcf_sample_columns(tsv_header, depth_prefix, bam_prefix):
    """Map short field keys to TSV column indices for one sample.

    depth_prefix is 'T' or 'N' (e.g. 'T_REF_FOR'); bam_prefix is 'tBAM' or
    'nBAM' (e.g. 'tBAM_REF_MQ'). Raises ValueError if a column is absent.
    """
    columns = {}
    for key in ('REF_FOR', 'REF_REV', 'ALT_FOR', 'ALT_REV'):
        columns[key] = tsv_header.index('{}_{}'.format(depth_prefix, key))
    for key in ('REF_MQ', 'ALT_MQ', 'REF_BQ', 'ALT_BQ', 'REF_NM', 'ALT_NM',
                'MQ0', 'StrandBias_FET', 'Concordance_FET',
                'Z_Ranksums_BQ', 'Z_Ranksums_MQ',
                'REF_Concordant', 'REF_Discordant',
                'ALT_Concordant', 'ALT_Discordant'):
        columns[key] = tsv_header.index('{}_{}'.format(bam_prefix, key))
    return columns


def _tsv2vcf_sample_string(tsv_item, columns, hom_threshold, het_threshold):
    """Return (sample_string, vaf_string) for one sample of one TSV row.

    sample_string follows GT:DP4:CD4:refMQ:altMQ:refBQ:altBQ:refNM:altNM:
    fetSB:fetCD:zMQ:zBQ:MQ0:VAF. A 'nan' cell becomes '.' for the per-read
    statistics and '0' for the read counts.
    """
    def cell(key, missing):
        value = tsv_item[columns[key]]
        return missing if value == 'nan' else value

    ref_mq = cell('REF_MQ', '.')
    alt_mq = cell('ALT_MQ', '.')
    ref_bq = cell('REF_BQ', '.')
    alt_bq = cell('ALT_BQ', '.')
    ref_nm = cell('REF_NM', '.')
    alt_nm = cell('ALT_NM', '.')
    mq0 = cell('MQ0', '.')
    strand_bias = cell('StrandBias_FET', '.')
    concordance = cell('Concordance_FET', '.')
    z_bq = cell('Z_Ranksums_BQ', '.')
    z_mq = cell('Z_Ranksums_MQ', '.')

    ref_for = cell('REF_FOR', '0')
    ref_rev = cell('REF_REV', '0')
    alt_for = cell('ALT_FOR', '0')
    alt_rev = cell('ALT_REV', '0')

    ref_con = cell('REF_Concordant', '0')
    ref_dis = cell('REF_Discordant', '0')
    alt_con = cell('ALT_Concordant', '0')
    alt_dis = cell('ALT_Discordant', '0')

    # dp4_to_gt receives the counts as strings, exactly as read from the TSV.
    gt = dp4_to_gt(ref_for, ref_rev, alt_for, alt_rev, hom_threshold, het_threshold)

    dp4_string = ','.join((ref_for, ref_rev, alt_for, alt_rev))
    cd4_string = ','.join((ref_con, ref_dis, alt_con, alt_dis))

    try:
        alt_depth = int(alt_for) + int(alt_rev)
        vaf = alt_depth / (alt_depth + int(ref_for) + int(ref_rev))
    except ZeroDivisionError:
        # No reads at all: report a VAF of 0 instead of failing.
        vaf = 0
    vaf = '%.3g' % vaf

    sample_string = '{GT}:{DP4}:{CD4}:{refMQ}:{altMQ}:{refBQ}:{altBQ}:{refNM}:{altNM}:{fetSB}:{fetCD}:{zMQ}:{zBQ}:{MQ0}:{VAF}'.format(
        GT=gt, DP4=dp4_string, CD4=cd4_string,
        refMQ=ref_mq, altMQ=alt_mq, refBQ=ref_bq, altBQ=alt_bq,
        refNM=ref_nm, altNM=alt_nm, fetSB=strand_bias, fetCD=concordance,
        zMQ=z_mq, zBQ=z_bq, MQ0=mq0, VAF=vaf)

    return sample_string, vaf


def tsv2vcf(tsv_fn, vcf_fn, tools, pass_score=0.5, lowqual_score=0.1, hom_threshold=0.85, het_threshold=0.01, single_mode=False, paired_mode=True, normal_sample_name='NORMAL', tumor_sample_name='TUMOR', print_reject=True, phred_scaled=True):
    """Convert a feature TSV file into a VCF file.

    Args:
        tsv_fn: path of the tab-delimited input; its first line is the header.
        vcf_fn: path of the VCF file to write (overwritten).
        tools: names of the callers to report; each must be a key of the
            tools_code table below.
        pass_score: minimum SCORE for a row to be labelled PASS.
        lowqual_score: minimum SCORE for a row to be labelled LowQual.
        hom_threshold, het_threshold: forwarded to dp4_to_gt.
        single_mode: write only the tumor sample column, and add AF to INFO.
        paired_mode: write normal and tumor sample columns (used only when
            single_mode is false).
        normal_sample_name, tumor_sample_name: sample names for the #CHROM line.
        print_reject: also write rows that are neither PASS nor LowQual,
            labelled REJECT.
        phred_scaled: write QUAL as p2phred(1 - SCORE) capped at 255 instead
            of the raw SCORE.

    When the TSV has no SCORE column, or a row's SCORE is NaN, QUAL is 0 and
    the FILTER is decided by the fraction of tools that called the variant.

    Raises:
        AssertionError: if a name in tools is not a known tool.
        ValueError: if a required column is missing from the TSV header.
    """
    import math

    tools_code = {'CGA': 'M', 'MuTect': 'M', 'MuTect2': 'M', 'VarScan2': 'V', 'JointSNVMix2': 'J', 'SomaticSniper': 'S', 'VarDict': 'D', 'MuSE': 'U', 'LoFreq': 'L', 'Scalpel': 'P', 'Strelka': 'K', 'TNscope': 'T', 'Platypus': 'Y'}

    # TSV header name of the column holding each tool's calling decision.
    header2toolcode = {'if_MuTect': 'M', 'if_VarScan2': 'V', 'if_JointSNVMix2': 'J', 'if_SomaticSniper': 'S', 'if_VarDict': 'D', 'MuSE_Tier': 'U', 'if_LoFreq': 'L', 'if_Scalpel': 'P', 'if_Strelka': 'K', 'if_TNscope': 'T', 'if_Platypus': 'Y'}

    mvjsdu = ''
    for tool_i in tools:
        assert tool_i in tools_code, 'Unknown tool: {}'.format(tool_i)
        mvjsdu = mvjsdu + tools_code[tool_i]

    total_num_tools = len(mvjsdu)
    tool_string = ', '.join(tools)

    field_string = 'GT:DP4:CD4:refMQ:altMQ:refBQ:altBQ:refNM:altNM:fetSB:fetCD:zMQ:zBQ:MQ0:VAF'

    with open(tsv_fn) as tsv, open(vcf_fn, 'w') as vcf:

        # First line is a header:
        tsv_header = tsv.readline().rstrip().split('\t')

        # Make the header items into indices (single/paired have different tool names)
        toolcode2index = {}
        for n, item in enumerate(tsv_header):
            if item in header2toolcode:
                toolcode2index[header2toolcode[item]] = n

        ALT = tsv_header.index('ALT')
        CHROM = tsv_header.index('CHROM')
        ID = tsv_header.index('ID')
        POS = tsv_header.index('POS')
        REF = tsv_header.index('REF')

        tumor_columns = _tsv2vcf_sample_columns(tsv_header, 'T', 'tBAM')
        normal_columns = None if single_mode else _tsv2vcf_sample_columns(tsv_header, 'N', 'nBAM')

        # SCORE is optional; None means every row falls back to tool counting.
        score_column = tsv_header.index('SCORE') if 'SCORE' in tsv_header else None

        # Create vcf headers:
        vcf.write('##fileformat=VCFv4.1\n')
        vcf.write(version_line + '\n')
        header_lines = [
            '##FILTER=<ID=LowQual,Description="Less confident somatic mutation calls with probability value at least {}">\n'.format(lowqual_score),
            '##FILTER=<ID=PASS,Description="Accept as a confident somatic mutation calls with probability value at least {}">\n'.format(pass_score),
            '##FILTER=<ID=REJECT,Description="Rejected as a confident somatic mutation with ONCOSCORE below 2">\n',
            '##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic mutation in primary">\n',
            '##INFO=<ID={COMBO},Number={NUM},Type=Integer,Description="Calling decision of the {NUM} algorithms: {TOOL_STRING}">\n'.format(COMBO=mvjsdu, NUM=total_num_tools, TOOL_STRING=tool_string),
            '##INFO=<ID=NUM_TOOLS,Number=1,Type=Float,Description="Number of tools called it Somatic">\n',
        ]
        if single_mode:
            header_lines.append('##INFO=<ID=AF,Number=1,Type=Float,Description="Variant Allele Fraction">\n')
        header_lines.extend([
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
            '##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="ref forward, ref reverse, alt forward, alt reverse">\n',
            '##FORMAT=<ID=CD4,Number=4,Type=Integer,Description="ref concordant, ref discordant, alt concordant, alt discordant">\n',
            '##FORMAT=<ID=refMQ,Number=1,Type=Float,Description="average mapping score for reference reads">\n',
            '##FORMAT=<ID=altMQ,Number=1,Type=Float,Description="average mapping score for alternate reads">\n',
            '##FORMAT=<ID=refBQ,Number=1,Type=Float,Description="average base quality score for reference reads">\n',
            '##FORMAT=<ID=altBQ,Number=1,Type=Float,Description="average base quality score for alternate reads">\n',
            '##FORMAT=<ID=refNM,Number=1,Type=Float,Description="average edit distance for reference reads">\n',
            '##FORMAT=<ID=altNM,Number=1,Type=Float,Description="average edit distance for alternate reads">\n',
            '##FORMAT=<ID=fetSB,Number=1,Type=Float,Description="Strand bias FET">\n',
            '##FORMAT=<ID=fetCD,Number=1,Type=Float,Description="Concordance FET">\n',
            '##FORMAT=<ID=zMQ,Number=1,Type=Float,Description="z-score rank sum of mapping quality">\n',
            '##FORMAT=<ID=zBQ,Number=1,Type=Float,Description="z-score rank sum of base quality">\n',
            '##FORMAT=<ID=MQ0,Number=1,Type=Integer,Description="Number of reads with mapping quality of 0">\n',
            '##FORMAT=<ID=VAF,Number=1,Type=Float,Description="Variant Allele Frequency">\n',
        ])
        if single_mode:
            header_lines.append('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\n'.format(tumor_sample_name))
        elif paired_mode:
            header_lines.append('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\t{}\n'.format(normal_sample_name, tumor_sample_name))
        vcf.writelines(header_lines)

        # Start writing content:
        for raw_line in tsv:

            tsv_i = raw_line.rstrip()
            if not tsv_i:
                # An empty line ends the input.
                break

            tsv_item = tsv_i.split('\t')

            if score_column is None:
                score = float('nan')
            else:
                score = float(tsv_item[score_column])

            # NaN must be detected by value: a NaN parsed from the file is not
            # the same object as any other NaN, so an identity test misses it.
            score_missing = math.isnan(score)

            if score_missing:
                scaled_score = 0
            elif phred_scaled:
                scaled_score = p2phred(1 - score, max_phred=255)
            else:
                scaled_score = score

            # Per-tool decision: '1' is a call, 'nan' is reported as missing
            # ('.'), and anything else (including fractions) counts as '0'.
            tool_calls = []
            num_tools = 0
            for tool_i in mvjsdu:
                if_Tool = tsv_item[toolcode2index[tool_i]]
                if if_Tool == '1':
                    num_tools = num_tools + 1
                elif if_Tool == 'nan':
                    if_Tool = '.'
                else:
                    if_Tool = '0'
                tool_calls.append(if_Tool)

            info_string = '{COMBO}={MVJSD};NUM_TOOLS={NUM_TOOLS}'.format(COMBO=mvjsdu, MVJSD=','.join(tool_calls), NUM_TOOLS=num_tools)

            # NORMAL
            if not single_mode:
                normal_sample_string, _ = _tsv2vcf_sample_string(tsv_item, normal_columns, hom_threshold, het_threshold)

            ### TUMOR ###
            tumor_sample_string, vaf = _tsv2vcf_sample_string(tsv_item, tumor_columns, hom_threshold, het_threshold)

            # Add VAF to info string if and only if there is one single sample in the VCF sample
            if single_mode:
                info_string = info_string + ';AF={}'.format(vaf)

            if score >= pass_score or (score_missing and num_tools > 0.5 * total_num_tools):
                filter_label = 'PASS'
                info_string = 'SOMATIC;' + info_string
            elif score >= lowqual_score or (score_missing and num_tools >= 1 and num_tools >= 0.33 * total_num_tools):
                filter_label = 'LowQual'
            elif print_reject:
                filter_label = 'REJECT'
            else:
                continue

            vcf_fields = [tsv_item[CHROM], tsv_item[POS], tsv_item[ID], tsv_item[REF], tsv_item[ALT], '%.4f' % scaled_score, filter_label, info_string, field_string]

            if single_mode:
                vcf_fields.append(tumor_sample_string)
            elif paired_mode:
                vcf_fields.append(normal_sample_string)
                vcf_fields.append(tumor_sample_string)

            vcf.write('\t'.join(vcf_fields) + '\n')
def _tsv2vcf_sample_columns(tsv_header, depth_prefix, bam_prefix):
    """Map short field keys to TSV column indices for one sample.

    depth_prefix is 'T' or 'N' (e.g. 'T_REF_FOR'); bam_prefix is 'tBAM' or
    'nBAM' (e.g. 'tBAM_REF_MQ'). Raises ValueError if a column is absent.
    """
    columns = {}
    for key in ('REF_FOR', 'REF_REV', 'ALT_FOR', 'ALT_REV'):
        columns[key] = tsv_header.index('{}_{}'.format(depth_prefix, key))
    for key in ('REF_MQ', 'ALT_MQ', 'REF_BQ', 'ALT_BQ', 'REF_NM', 'ALT_NM',
                'MQ0', 'StrandBias_FET', 'Concordance_FET',
                'Z_Ranksums_BQ', 'Z_Ranksums_MQ',
                'REF_Concordant', 'REF_Discordant',
                'ALT_Concordant', 'ALT_Discordant'):
        columns[key] = tsv_header.index('{}_{}'.format(bam_prefix, key))
    return columns


def _tsv2vcf_sample_string(tsv_item, columns, hom_threshold, het_threshold):
    """Return (sample_string, vaf_string) for one sample of one TSV row.

    sample_string follows GT:DP4:CD4:refMQ:altMQ:refBQ:altBQ:refNM:altNM:
    fetSB:fetCD:zMQ:zBQ:MQ0:VAF. A 'nan' cell becomes '.' for the per-read
    statistics and '0' for the read counts.
    """
    def cell(key, missing):
        value = tsv_item[columns[key]]
        return missing if value == 'nan' else value

    ref_mq = cell('REF_MQ', '.')
    alt_mq = cell('ALT_MQ', '.')
    ref_bq = cell('REF_BQ', '.')
    alt_bq = cell('ALT_BQ', '.')
    ref_nm = cell('REF_NM', '.')
    alt_nm = cell('ALT_NM', '.')
    mq0 = cell('MQ0', '.')
    strand_bias = cell('StrandBias_FET', '.')
    concordance = cell('Concordance_FET', '.')
    z_bq = cell('Z_Ranksums_BQ', '.')
    z_mq = cell('Z_Ranksums_MQ', '.')

    ref_for = cell('REF_FOR', '0')
    ref_rev = cell('REF_REV', '0')
    alt_for = cell('ALT_FOR', '0')
    alt_rev = cell('ALT_REV', '0')

    ref_con = cell('REF_Concordant', '0')
    ref_dis = cell('REF_Discordant', '0')
    alt_con = cell('ALT_Concordant', '0')
    alt_dis = cell('ALT_Discordant', '0')

    # dp4_to_gt receives the counts as strings, exactly as read from the TSV.
    gt = dp4_to_gt(ref_for, ref_rev, alt_for, alt_rev, hom_threshold, het_threshold)

    dp4_string = ','.join((ref_for, ref_rev, alt_for, alt_rev))
    cd4_string = ','.join((ref_con, ref_dis, alt_con, alt_dis))

    try:
        alt_depth = int(alt_for) + int(alt_rev)
        vaf = alt_depth / (alt_depth + int(ref_for) + int(ref_rev))
    except ZeroDivisionError:
        # No reads at all: report a VAF of 0 instead of failing.
        vaf = 0
    vaf = '%.3g' % vaf

    sample_string = '{GT}:{DP4}:{CD4}:{refMQ}:{altMQ}:{refBQ}:{altBQ}:{refNM}:{altNM}:{fetSB}:{fetCD}:{zMQ}:{zBQ}:{MQ0}:{VAF}'.format(
        GT=gt, DP4=dp4_string, CD4=cd4_string,
        refMQ=ref_mq, altMQ=alt_mq, refBQ=ref_bq, altBQ=alt_bq,
        refNM=ref_nm, altNM=alt_nm, fetSB=strand_bias, fetCD=concordance,
        zMQ=z_mq, zBQ=z_bq, MQ0=mq0, VAF=vaf)

    return sample_string, vaf


def tsv2vcf(tsv_fn, vcf_fn, tools, pass_score=0.5, lowqual_score=0.1, hom_threshold=0.85, het_threshold=0.01, single_mode=False, paired_mode=True, normal_sample_name='NORMAL', tumor_sample_name='TUMOR', print_reject=True, phred_scaled=True):
    """Convert a feature TSV file into a VCF file.

    Args:
        tsv_fn: path of the tab-delimited input; its first line is the header.
        vcf_fn: path of the VCF file to write (overwritten).
        tools: names of the callers to report; each must be a key of the
            tools_code table below.
        pass_score: minimum SCORE for a row to be labelled PASS.
        lowqual_score: minimum SCORE for a row to be labelled LowQual.
        hom_threshold, het_threshold: forwarded to dp4_to_gt.
        single_mode: write only the tumor sample column, and add AF to INFO.
        paired_mode: write normal and tumor sample columns (used only when
            single_mode is false).
        normal_sample_name, tumor_sample_name: sample names for the #CHROM line.
        print_reject: also write rows that are neither PASS nor LowQual,
            labelled REJECT.
        phred_scaled: write QUAL as p2phred(1 - SCORE) capped at 255 instead
            of the raw SCORE.

    When the TSV has no SCORE column, or a row's SCORE is NaN, QUAL is 0 and
    the FILTER is decided by the fraction of tools that called the variant.

    Raises:
        AssertionError: if a name in tools is not a known tool.
        ValueError: if a required column is missing from the TSV header.
    """
    import math

    tools_code = {'CGA': 'M', 'MuTect': 'M', 'MuTect2': 'M', 'VarScan2': 'V', 'JointSNVMix2': 'J', 'SomaticSniper': 'S', 'VarDict': 'D', 'MuSE': 'U', 'LoFreq': 'L', 'Scalpel': 'P', 'Strelka': 'K', 'TNscope': 'T', 'Platypus': 'Y'}

    # TSV header name of the column holding each tool's calling decision.
    header2toolcode = {'if_MuTect': 'M', 'if_VarScan2': 'V', 'if_JointSNVMix2': 'J', 'if_SomaticSniper': 'S', 'if_VarDict': 'D', 'MuSE_Tier': 'U', 'if_LoFreq': 'L', 'if_Scalpel': 'P', 'if_Strelka': 'K', 'if_TNscope': 'T', 'if_Platypus': 'Y'}

    mvjsdu = ''
    for tool_i in tools:
        assert tool_i in tools_code, 'Unknown tool: {}'.format(tool_i)
        mvjsdu = mvjsdu + tools_code[tool_i]

    total_num_tools = len(mvjsdu)
    tool_string = ', '.join(tools)

    field_string = 'GT:DP4:CD4:refMQ:altMQ:refBQ:altBQ:refNM:altNM:fetSB:fetCD:zMQ:zBQ:MQ0:VAF'

    with open(tsv_fn) as tsv, open(vcf_fn, 'w') as vcf:

        # First line is a header:
        tsv_header = tsv.readline().rstrip().split('\t')

        # Make the header items into indices (single/paired have different tool names)
        toolcode2index = {}
        for n, item in enumerate(tsv_header):
            if item in header2toolcode:
                toolcode2index[header2toolcode[item]] = n

        ALT = tsv_header.index('ALT')
        CHROM = tsv_header.index('CHROM')
        ID = tsv_header.index('ID')
        POS = tsv_header.index('POS')
        REF = tsv_header.index('REF')

        tumor_columns = _tsv2vcf_sample_columns(tsv_header, 'T', 'tBAM')
        normal_columns = None if single_mode else _tsv2vcf_sample_columns(tsv_header, 'N', 'nBAM')

        # SCORE is optional; None means every row falls back to tool counting.
        score_column = tsv_header.index('SCORE') if 'SCORE' in tsv_header else None

        # Create vcf headers:
        vcf.write('##fileformat=VCFv4.1\n')
        vcf.write(version_line + '\n')
        header_lines = [
            '##FILTER=<ID=LowQual,Description="Less confident somatic mutation calls with probability value at least {}">\n'.format(lowqual_score),
            '##FILTER=<ID=PASS,Description="Accept as a confident somatic mutation calls with probability value at least {}">\n'.format(pass_score),
            '##FILTER=<ID=REJECT,Description="Rejected as a confident somatic mutation with ONCOSCORE below 2">\n',
            '##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic mutation in primary">\n',
            '##INFO=<ID={COMBO},Number={NUM},Type=Integer,Description="Calling decision of the {NUM} algorithms: {TOOL_STRING}">\n'.format(COMBO=mvjsdu, NUM=total_num_tools, TOOL_STRING=tool_string),
            '##INFO=<ID=NUM_TOOLS,Number=1,Type=Float,Description="Number of tools called it Somatic">\n',
        ]
        if single_mode:
            header_lines.append('##INFO=<ID=AF,Number=1,Type=Float,Description="Variant Allele Fraction">\n')
        header_lines.extend([
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
            '##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="ref forward, ref reverse, alt forward, alt reverse">\n',
            '##FORMAT=<ID=CD4,Number=4,Type=Integer,Description="ref concordant, ref discordant, alt concordant, alt discordant">\n',
            '##FORMAT=<ID=refMQ,Number=1,Type=Float,Description="average mapping score for reference reads">\n',
            '##FORMAT=<ID=altMQ,Number=1,Type=Float,Description="average mapping score for alternate reads">\n',
            '##FORMAT=<ID=refBQ,Number=1,Type=Float,Description="average base quality score for reference reads">\n',
            '##FORMAT=<ID=altBQ,Number=1,Type=Float,Description="average base quality score for alternate reads">\n',
            '##FORMAT=<ID=refNM,Number=1,Type=Float,Description="average edit distance for reference reads">\n',
            '##FORMAT=<ID=altNM,Number=1,Type=Float,Description="average edit distance for alternate reads">\n',
            '##FORMAT=<ID=fetSB,Number=1,Type=Float,Description="Strand bias FET">\n',
            '##FORMAT=<ID=fetCD,Number=1,Type=Float,Description="Concordance FET">\n',
            '##FORMAT=<ID=zMQ,Number=1,Type=Float,Description="z-score rank sum of mapping quality">\n',
            '##FORMAT=<ID=zBQ,Number=1,Type=Float,Description="z-score rank sum of base quality">\n',
            '##FORMAT=<ID=MQ0,Number=1,Type=Integer,Description="Number of reads with mapping quality of 0">\n',
            '##FORMAT=<ID=VAF,Number=1,Type=Float,Description="Variant Allele Frequency">\n',
        ])
        if single_mode:
            header_lines.append('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\n'.format(tumor_sample_name))
        elif paired_mode:
            header_lines.append('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\t{}\n'.format(normal_sample_name, tumor_sample_name))
        vcf.writelines(header_lines)

        # Start writing content:
        for raw_line in tsv:

            tsv_i = raw_line.rstrip()
            if not tsv_i:
                # An empty line ends the input.
                break

            tsv_item = tsv_i.split('\t')

            if score_column is None:
                score = float('nan')
            else:
                score = float(tsv_item[score_column])

            # NaN must be detected by value: a NaN parsed from the file is not
            # the same object as any other NaN, so an identity test misses it.
            score_missing = math.isnan(score)

            if score_missing:
                scaled_score = 0
            elif phred_scaled:
                scaled_score = p2phred(1 - score, max_phred=255)
            else:
                scaled_score = score

            # Per-tool decision: '1' is a call, 'nan' is reported as missing
            # ('.'), and anything else (including fractions) counts as '0'.
            tool_calls = []
            num_tools = 0
            for tool_i in mvjsdu:
                if_Tool = tsv_item[toolcode2index[tool_i]]
                if if_Tool == '1':
                    num_tools = num_tools + 1
                elif if_Tool == 'nan':
                    if_Tool = '.'
                else:
                    if_Tool = '0'
                tool_calls.append(if_Tool)

            info_string = '{COMBO}={MVJSD};NUM_TOOLS={NUM_TOOLS}'.format(COMBO=mvjsdu, MVJSD=','.join(tool_calls), NUM_TOOLS=num_tools)

            # NORMAL
            if not single_mode:
                normal_sample_string, _ = _tsv2vcf_sample_string(tsv_item, normal_columns, hom_threshold, het_threshold)

            ### TUMOR ###
            tumor_sample_string, vaf = _tsv2vcf_sample_string(tsv_item, tumor_columns, hom_threshold, het_threshold)

            # Add VAF to info string if and only if there is one single sample in the VCF sample
            if single_mode:
                info_string = info_string + ';AF={}'.format(vaf)

            if score >= pass_score or (score_missing and num_tools > 0.5 * total_num_tools):
                filter_label = 'PASS'
                info_string = 'SOMATIC;' + info_string
            elif score >= lowqual_score or (score_missing and num_tools >= 1 and num_tools >= 0.33 * total_num_tools):
                filter_label = 'LowQual'
            elif print_reject:
                filter_label = 'REJECT'
            else:
                continue

            vcf_fields = [tsv_item[CHROM], tsv_item[POS], tsv_item[ID], tsv_item[REF], tsv_item[ALT], '%.4f' % scaled_score, filter_label, info_string, field_string]

            if single_mode:
                vcf_fields.append(tumor_sample_string)
            elif paired_mode:
                vcf_fields.append(normal_sample_string)
                vcf_fields.append(tumor_sample_string)

            vcf.write('\t'.join(vcf_fields) + '\n')
def vcf2tsv(is_vcf=None, is_bed=None, is_pos=None, nbam_fn=None, tbam_fn=None, truth=None, cosmic=None, dbsnp=None,
            mutect=None, varscan=None, jsm=None, sniper=None, vardict=None, muse=None, lofreq=None, scalpel=None,
            strelka=None, tnscope=None, platypus=None, dedup=True, min_mq=1, min_bq=5, min_caller=0, ref_fa=None,
            p_scale=None, outfile=None):
    """Write one TSV row of caller and BAM-derived features per candidate variant.

    The candidate sites come from the first truthy one of ``is_vcf``, ``is_bed``
    and ``is_pos``; when none is given, the ``ref_fa + '.fai'`` index is used
    and every position of every contig listed there is evaluated.

    Args:
        is_vcf: Path of a VCF file listing candidate variants. Its records must
            be sorted; see Raises.
        is_bed: Path of a tab-separated file; columns 2 and 3 are read as a
            start (incremented by one) and an end position.
        is_pos: Path of a tab-separated file; column 2 is read as a position.
        nbam_fn: Normal BAM path, handed to ``pysam.AlignmentFile``.
        tbam_fn: Tumor BAM path, handed to ``pysam.AlignmentFile``.
        truth: Optional ground-truth VCF path. When given, each row is labelled
            1/0 and tagged ``TruePositive``/``FalsePositive`` in its ID field.
        cosmic: Optional COSMIC VCF path; overrides COSMIC info from ``is_vcf``.
        dbsnp: Optional dbSNP VCF path; overrides dbSNP info from ``is_vcf``.
        mutect, varscan, jsm, sniper, vardict, muse, lofreq, scalpel, strelka,
        tnscope, platypus: Optional per-caller VCF paths. A caller that is not
            supplied contributes NaN to its columns.
        dedup: Not referenced by this function.
        min_mq: Forwarded to ``sequencing_features.from_bam``.
        min_bq: Forwarded to ``sequencing_features.from_bam``.
        min_caller: A variant is written only if the sum of its caller
            classifications is at least this value.
        ref_fa: Reference FASTA path. ``ref_fa + '.fai'`` is read as well.
        p_scale: ``'phred'`` or ``'fraction'`` (case-insensitive), forwarded to
            ``rescale``. ``None`` or any other value disables re-scaling.
        outfile: Path of the TSV file to create.

    Returns:
        None. The result is the file written to ``outfile``.

    Raises:
        Exception: If ``genome.whoisbehind`` reports that two consecutive
            records of ``is_vcf`` are out of order.
    """
    # Function-scope import so the block does not depend on the file's import header.
    from contextlib import ExitStack

    # Convert contig_sequence to chrom_seq dict:
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not: only 'phred' and 'fraction' are recognised.
    if p_scale is not None:
        p_scale = p_scale.lower()
    if p_scale not in ('phred', 'fraction'):
        p_scale = None
        logger.info('NO RE-SCALING')

    nan = float('nan')

    def split_alts(vcf_record):
        """Return one shallow copy of vcf_record per comma-separated ALT allele."""
        per_allele = []
        for alt_i in vcf_record.altbase.split(','):
            vcf_i = copy(vcf_record)
            vcf_i.altbase = alt_i
            per_allele.append(vcf_i)
        return per_allele

    ## Running
    # Every handle opened below is registered on `cleanup`, so it is closed
    # on normal completion and also when an exception escapes.
    with genome.open_textfile(mysites) as my_sites, open(outfile, 'w') as outhandle, ExitStack() as cleanup:

        def open_vcf(path):
            """Open an auxiliary VCF, register it for closing, return (handle, first line after the header)."""
            handle = genome.open_textfile(path)
            cleanup.callback(handle.close)
            return handle, genome.skip_vcf_header(handle)

        my_line = my_sites.readline().rstrip()

        nbam = pysam.AlignmentFile(nbam_fn, reference_filename=ref_fa)
        cleanup.callback(nbam.close)
        tbam = pysam.AlignmentFile(tbam_fn, reference_filename=ref_fa)
        cleanup.callback(tbam.close)
        ref_fasta = pysam.FastaFile(ref_fa)
        cleanup.callback(ref_fasta.close)

        # Each path argument is rebound to its open handle; the later
        # `if mutect:` style checks rely on that handle being truthy.
        if truth:
            truth, truth_line = open_vcf(truth)
        if cosmic:
            cosmic, cosmic_line = open_vcf(cosmic)
        if dbsnp:
            dbsnp, dbsnp_line = open_vcf(dbsnp)

        # Incorporate callers: get thru the #'s
        if mutect:
            mutect, mutect_line = open_vcf(mutect)
        if varscan:
            varscan, varscan_line = open_vcf(varscan)
        if jsm:
            jsm, jsm_line = open_vcf(jsm)
        if sniper:
            sniper, sniper_line = open_vcf(sniper)
        if vardict:
            vardict, vardict_line = open_vcf(vardict)
        if muse:
            muse, muse_line = open_vcf(muse)
        if lofreq:
            lofreq, lofreq_line = open_vcf(lofreq)
        if scalpel:
            scalpel, scalpel_line = open_vcf(scalpel)
        if strelka:
            strelka, strelka_line = open_vcf(strelka)
        if tnscope:
            tnscope, tnscope_line = open_vcf(tnscope)
        if platypus:
            platypus, platypus_line = open_vcf(platypus)

        # Get through all the headers:
        while my_line.startswith('#') or my_line.startswith('track='):
            my_line = my_sites.readline().rstrip()

        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match(genome.pattern_chr_position, my_line)
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # First line: the header is the row template with its braces stripped.
        outhandle.write(out_header.replace('{', '').replace('}', '') + '\n')

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:
                my_vcf = genome.Vcf_line(my_line)
                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]
                variants_at_my_coordinate = split_alts(my_vcf)

                # As long as the "coordinate" stays the same, keep reading until it's different.
                # On exit my_line already holds the first record of the next coordinate.
                while my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):
                    my_line = my_sites.readline().rstrip()
                    my_vcf = genome.Vcf_line(my_line)

                    # Ensure the input VCF file is properly sorted.
                    coordinate_j = re.match(genome.pattern_chr_position, my_line)
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j, chrom_seq) == 1:
                        raise Exception('{} does not seem to be properly sorted.'.format(mysites))

                    coordinate_i = coordinate_j

                    if my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):
                        variants_at_my_coordinate.extend(split_alts(my_vcf))

            elif is_bed:
                bed_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(bed_item[0], int(bed_item[1]) + 1, int(bed_item[2]))

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(pos_item[0], int(pos_item[1]), int(pos_item[1]))

            elif fai_file:
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(fai_item[0], 1, int(fai_item[1]))

            for my_coordinate in my_coordinates:

                # If VCF, can get ref base, variant base, as well as other identifying information
                if is_vcf:
                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []
                    # (if_dbsnp, if_cosmic, if_common, num_cases) per variant, kept
                    # separately so one record's annotation is not applied to another.
                    input_annotations = []

                    for variant_i in variants_at_my_coordinate:
                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]

                        ref_bases.append(ref_base)
                        alt_bases.append(first_alt)
                        indel_lengths.append(len(first_alt) - len(ref_base))

                        # Taken from the input VCF if present; may be overwritten
                        # below when dbSNP/COSMIC files are supplied.
                        input_annotations.append((
                            1 if re.search(r'rs[0-9]+', variant_i.identifier) else 0,
                            1 if re.search(r'COS[MN][0-9]+', variant_i.identifier) else 0,
                            1 if variant_i.get_info_value('COMMON') == '1' else 0,
                            variant_i.get_info_value('CNT') or nan,
                        ))

                        if variant_i.identifier == '.':
                            all_my_identifiers.append(set())
                        else:
                            all_my_identifiers.append(set(variant_i.identifier.split(';')))

                # If not, create placeholders that dbSNP/COSMIC files (if provided) can overwrite.
                # NOTE(review): ref_base/first_alt stay None in this mode; whether the
                # downstream helpers accept None is not visible from here -- confirm.
                else:
                    variants_at_my_coordinate = [None]  # Just to have something to iterate
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                # Find the same coordinate in those VCF files. The first element of each
                # returned triple is not used here; the last one is carried into the next
                # lookup (presumably the reader's next unconsumed line).
                if mutect:
                    _, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(my_coordinate, mutect_line, mutect, chrom_seq)
                if varscan:
                    _, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(my_coordinate, varscan_line, varscan, chrom_seq)
                if jsm:
                    _, jsm_variants, jsm_line = genome.find_vcf_at_coordinate(my_coordinate, jsm_line, jsm, chrom_seq)
                if sniper:
                    _, sniper_variants, sniper_line = genome.find_vcf_at_coordinate(my_coordinate, sniper_line, sniper, chrom_seq)
                if vardict:
                    _, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(my_coordinate, vardict_line, vardict, chrom_seq)
                if muse:
                    _, muse_variants, muse_line = genome.find_vcf_at_coordinate(my_coordinate, muse_line, muse, chrom_seq)
                if lofreq:
                    _, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(my_coordinate, lofreq_line, lofreq, chrom_seq)
                if scalpel:
                    _, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:
                    _, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(my_coordinate, strelka_line, strelka, chrom_seq)
                if tnscope:
                    _, tnscope_variants, tnscope_line = genome.find_vcf_at_coordinate(my_coordinate, tnscope_line, tnscope, chrom_seq)
                if platypus:
                    _, platypus_variants, platypus_line = genome.find_vcf_at_coordinate(my_coordinate, platypus_line, platypus, chrom_seq)
                if truth:
                    _, truth_variants, truth_line = genome.find_vcf_at_coordinate(my_coordinate, truth_line, truth, chrom_seq)
                if dbsnp:
                    _, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                if cosmic:
                    _, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(my_coordinate, cosmic_line, cosmic, chrom_seq)

                # Now, use pysam to look into the BAM file(s), variant by variant from the input:
                for ith_call, my_call in enumerate(variants_at_my_coordinate):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ((my_call.chromosome, my_call.position), my_call.refbase, my_call.altbase)

                        ref_base = ref_bases[ith_call]
                        first_alt = alt_bases[ith_call]
                        indel_length = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]
                        if_dbsnp, if_cosmic, if_common, num_cases = input_annotations[ith_call]
                    else:
                        variant_id = ((my_coordinate[0], my_coordinate[1]), ref_base, first_alt)
                        # Nothing to inherit from an input record; start with an empty ID set
                        # so the truth/dbSNP/COSMIC tagging below has something to add to.
                        my_identifiers = set()

                    # Reset for each variant so that caller support for one ALT at this
                    # coordinate is not counted towards another.
                    num_callers = 0

                    # Collect Caller Vcf:
                    if mutect:
                        mutect_classification, nlod, tlod, tandem, ecnt = annotate_caller.MuTect(variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = nlod = tlod = tandem = ecnt = nan

                    if varscan:
                        varscan_classification = annotate_caller.VarScan(variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = nan

                    if jsm:
                        jointsnvmix2_classification, score_jointsnvmix2 = annotate_caller.JSM(variant_id, jsm_variants)
                        num_callers += jointsnvmix2_classification
                    else:
                        jointsnvmix2_classification = score_jointsnvmix2 = nan

                    if sniper:
                        sniper_classification, score_somaticsniper = annotate_caller.SomaticSniper(variant_id, sniper_variants)
                        num_callers += sniper_classification
                    else:
                        sniper_classification = score_somaticsniper = nan

                    if vardict:
                        vardict_classification, msi, msilen, shift3, score_vardict = annotate_caller.VarDict(variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = score_vardict = nan

                    if muse:
                        muse_classification = annotate_caller.MuSE(variant_id, muse_variants)
                        num_callers += muse_classification
                    else:
                        muse_classification = nan

                    if lofreq:
                        lofreq_classification = annotate_caller.LoFreq(variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan

                    if scalpel:
                        scalpel_classification = annotate_caller.Scalpel(variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan

                    if strelka:
                        strelka_classification, somatic_evs, qss, tqss = annotate_caller.Strelka(variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = somatic_evs = qss = tqss = nan

                    if tnscope:
                        tnscope_classification = annotate_caller.TNscope(variant_id, tnscope_variants)
                        num_callers += tnscope_classification
                    else:
                        tnscope_classification = nan

                    if platypus:
                        platypus_classification = annotate_caller.countPASS(variant_id, platypus_variants)
                        num_callers += platypus_classification
                    else:
                        platypus_classification = nan

                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        # Ground truth file
                        if truth:
                            if variant_id in truth_variants:
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan

                        # dbSNP: will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(variant_id, dbsnp_variants)
                            my_identifiers.update(rsID)

                        # COSMIC: will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(variant_id, cosmic_variants)
                            my_identifiers.update(cosmicID)

                        # INFO EXTRACTION FROM BAM FILES
                        nBamFeatures = sequencing_features.from_bam(nbam, my_coordinate, ref_base, first_alt, min_mq, min_bq)
                        tBamFeatures = sequencing_features.from_bam(tbam, my_coordinate, ref_base, first_alt, min_mq, min_bq)

                        n_ref = nBamFeatures['ref_for'] + nBamFeatures['ref_rev']
                        n_alt = nBamFeatures['alt_for'] + nBamFeatures['alt_rev']
                        t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev']
                        t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev']
                        sor = sequencing_features.somaticOddRatio(n_ref, n_alt, t_ref, t_alt)

                        # Calculate VarScan2's SCC directly without using VarScan2 output:
                        try:
                            score_varscan2 = genome.p2phred(
                                stats.fisher_exact(((t_alt, n_alt), (t_ref, n_ref)), alternative='greater')[1]
                            )
                        except ValueError:
                            score_varscan2 = nan

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(
                            ref_fasta, my_coordinate, ref_base, first_alt
                        )

                        # Fill the ID field of the TSV/VCF
                        my_identifiers = ';'.join(my_identifiers) if my_identifiers else '.'

                        out_line = out_header.format(
                            CHROM=my_coordinate[0],
                            POS=my_coordinate[1],
                            ID=my_identifiers,
                            REF=ref_base,
                            ALT=first_alt,
                            if_MuTect=mutect_classification,
                            if_VarScan2=varscan_classification,
                            if_JointSNVMix2=jointsnvmix2_classification,
                            if_SomaticSniper=sniper_classification,
                            if_VarDict=vardict_classification,
                            MuSE_Tier=muse_classification,
                            if_LoFreq=lofreq_classification,
                            if_Scalpel=scalpel_classification,
                            if_Strelka=strelka_classification,
                            if_TNscope=tnscope_classification,
                            if_Platypus=platypus_classification,
                            Strelka_Score=somatic_evs,
                            Strelka_QSS=qss,
                            Strelka_TQSS=tqss,
                            VarScan2_Score=rescale(score_varscan2, 'phred', p_scale, 1001),
                            SNVMix2_Score=rescale(score_jointsnvmix2, 'phred', p_scale, 1001),
                            Sniper_Score=rescale(score_somaticsniper, 'phred', p_scale, 1001),
                            VarDict_Score=rescale(score_vardict, 'phred', p_scale, 1001),
                            if_dbsnp=if_dbsnp,
                            COMMON=if_common,
                            if_COSMIC=if_cosmic,
                            COSMIC_CNT=num_cases,
                            Consistent_Mates=tBamFeatures['consistent_mates'],
                            Inconsistent_Mates=tBamFeatures['inconsistent_mates'],
                            N_DP=nBamFeatures['dp'],
                            nBAM_REF_MQ='%g' % nBamFeatures['ref_mq'],
                            nBAM_ALT_MQ='%g' % nBamFeatures['alt_mq'],
                            nBAM_Z_Ranksums_MQ='%g' % nBamFeatures['z_ranksums_mq'],
                            nBAM_REF_BQ='%g' % nBamFeatures['ref_bq'],
                            nBAM_ALT_BQ='%g' % nBamFeatures['alt_bq'],
                            nBAM_Z_Ranksums_BQ='%g' % nBamFeatures['z_ranksums_bq'],
                            nBAM_REF_NM='%g' % nBamFeatures['ref_NM'],
                            nBAM_ALT_NM='%g' % nBamFeatures['alt_NM'],
                            nBAM_NM_Diff='%g' % nBamFeatures['NM_Diff'],
                            nBAM_REF_Concordant=nBamFeatures['ref_concordant_reads'],
                            nBAM_REF_Discordant=nBamFeatures['ref_discordant_reads'],
                            nBAM_ALT_Concordant=nBamFeatures['alt_concordant_reads'],
                            nBAM_ALT_Discordant=nBamFeatures['alt_discordant_reads'],
                            nBAM_Concordance_FET=rescale(nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001),
                            N_REF_FOR=nBamFeatures['ref_for'],
                            N_REF_REV=nBamFeatures['ref_rev'],
                            N_ALT_FOR=nBamFeatures['alt_for'],
                            N_ALT_REV=nBamFeatures['alt_rev'],
                            nBAM_StrandBias_FET=rescale(nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),
                            nBAM_Z_Ranksums_EndPos='%g' % nBamFeatures['z_ranksums_endpos'],
                            nBAM_REF_Clipped_Reads=nBamFeatures['ref_SC_reads'],
                            nBAM_ALT_Clipped_Reads=nBamFeatures['alt_SC_reads'],
                            nBAM_Clipping_FET=rescale(nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),
                            nBAM_MQ0=nBamFeatures['MQ0'],
                            nBAM_Other_Reads=nBamFeatures['noise_read_count'],
                            nBAM_Poor_Reads=nBamFeatures['poor_read_count'],
                            nBAM_REF_InDel_3bp=nBamFeatures['ref_indel_3bp'],
                            nBAM_REF_InDel_2bp=nBamFeatures['ref_indel_2bp'],
                            nBAM_REF_InDel_1bp=nBamFeatures['ref_indel_1bp'],
                            nBAM_ALT_InDel_3bp=nBamFeatures['alt_indel_3bp'],
                            nBAM_ALT_InDel_2bp=nBamFeatures['alt_indel_2bp'],
                            nBAM_ALT_InDel_1bp=nBamFeatures['alt_indel_1bp'],
                            M2_NLOD=nlod,
                            M2_TLOD=tlod,
                            M2_STR=tandem,
                            M2_ECNT=ecnt,
                            SOR=sor,
                            MSI=msi,
                            MSILEN=msilen,
                            SHIFT3=shift3,
                            MaxHomopolymer_Length=homopolymer_length,
                            SiteHomopolymer_Length=site_homopolymer_length,
                            T_DP=tBamFeatures['dp'],
                            tBAM_REF_MQ='%g' % tBamFeatures['ref_mq'],
                            tBAM_ALT_MQ='%g' % tBamFeatures['alt_mq'],
                            tBAM_Z_Ranksums_MQ='%g' % tBamFeatures['z_ranksums_mq'],
                            tBAM_REF_BQ='%g' % tBamFeatures['ref_bq'],
                            tBAM_ALT_BQ='%g' % tBamFeatures['alt_bq'],
                            tBAM_Z_Ranksums_BQ='%g' % tBamFeatures['z_ranksums_bq'],
                            tBAM_REF_NM='%g' % tBamFeatures['ref_NM'],
                            tBAM_ALT_NM='%g' % tBamFeatures['alt_NM'],
                            tBAM_NM_Diff='%g' % tBamFeatures['NM_Diff'],
                            tBAM_REF_Concordant=tBamFeatures['ref_concordant_reads'],
                            tBAM_REF_Discordant=tBamFeatures['ref_discordant_reads'],
                            tBAM_ALT_Concordant=tBamFeatures['alt_concordant_reads'],
                            tBAM_ALT_Discordant=tBamFeatures['alt_discordant_reads'],
                            tBAM_Concordance_FET=rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001),
                            T_REF_FOR=tBamFeatures['ref_for'],
                            T_REF_REV=tBamFeatures['ref_rev'],
                            T_ALT_FOR=tBamFeatures['alt_for'],
                            T_ALT_REV=tBamFeatures['alt_rev'],
                            tBAM_StrandBias_FET=rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),
                            tBAM_Z_Ranksums_EndPos='%g' % tBamFeatures['z_ranksums_endpos'],
                            tBAM_REF_Clipped_Reads=tBamFeatures['ref_SC_reads'],
                            tBAM_ALT_Clipped_Reads=tBamFeatures['alt_SC_reads'],
                            tBAM_Clipping_FET=rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),
                            tBAM_MQ0=tBamFeatures['MQ0'],
                            tBAM_Other_Reads=tBamFeatures['noise_read_count'],
                            tBAM_Poor_Reads=tBamFeatures['poor_read_count'],
                            tBAM_REF_InDel_3bp=tBamFeatures['ref_indel_3bp'],
                            tBAM_REF_InDel_2bp=tBamFeatures['ref_indel_2bp'],
                            tBAM_REF_InDel_1bp=tBamFeatures['ref_indel_1bp'],
                            tBAM_ALT_InDel_3bp=tBamFeatures['alt_indel_3bp'],
                            tBAM_ALT_InDel_2bp=tBamFeatures['alt_indel_2bp'],
                            tBAM_ALT_InDel_1bp=tBamFeatures['alt_indel_1bp'],
                            InDel_Length=indel_length,
                            TrueVariant_or_False=judgement,
                        )

                        # Write the row to the output file:
                        outhandle.write(out_line + '\n')

            # Read into the next line. In VCF mode the look-ahead loop above
            # has already advanced my_line.
            if not is_vcf:
                my_line = my_sites.readline().rstrip()