def vcf(infileList, outfile):
    '''
    Concatenate several VCF files into one plain-text file.

    The leading '#' lines are copied from the first input only; for every
    later input the leading '#' lines are skipped. Everything after the
    header of each input is copied verbatim, in the order given.

    infileList: iterable of input paths, each opened with genome.open_textfile.
    outfile:    path of the merged file (overwritten).
    Returns 0.
    '''
    with open(outfile, 'w') as merged:
        copy_header = True

        for path in infileList:
            with genome.open_textfile(path) as handle:
                inside_header = True

                for row in handle:
                    if inside_header and row.startswith('#'):
                        if copy_header:
                            merged.write(row)
                        continue

                    # Once the first non-header row is seen, every row is data.
                    inside_header = False
                    merged.write(row)

            # Only the first file contributes its header.
            copy_header = False

    return 0
def tsv(infileList, outfile):
    '''
    Concatenate several TSV files that each start with one header line.

    The header (first line) is written from the first input only; the first
    line of every later input is discarded. All remaining lines are copied
    verbatim.

    infileList: iterable of input paths, each opened with genome.open_textfile.
    outfile:    path of the merged file (overwritten).
    Returns 0.
    '''
    with open(outfile, 'w') as merged:
        for file_number, path in enumerate(infileList):
            with genome.open_textfile(path) as handle:
                rows = iter(handle)

                # First line is the header ('' when the file is empty).
                header = next(rows, '')
                if file_number == 0:
                    merged.write(header)

                for row in rows:
                    merged.write(row)

    return 0
def convert(infile, outfile):
    '''
    Copy a VCF file while prepending a genotype to every record.

    '##' lines and the line that follows them (the #CHROM line) are copied
    unchanged. For each later line, 'GT:' is prepended to column 9 (FORMAT)
    and '0/1:' to every column from 10 up to the number of columns found in
    the #CHROM line. Reading stops at the first line that is empty after
    right-stripping.
    '''
    with genome.open_textfile(infile) as vcf_in, open(outfile, 'w') as vcf_out:

        def next_line():
            return vcf_in.readline().rstrip()

        current = next_line()
        while current.startswith('##'):
            vcf_out.write(current + '\n')
            current = next_line()

        # This is the #CHROM line; its width decides how many sample columns exist.
        column_count = len(current.split('\t'))
        vcf_out.write(current + '\n')

        for record in iter(next_line, ''):
            fields = record.split('\t')
            fields[8] = 'GT:' + fields[8]

            for column in range(9, column_count):
                fields[column] = '0/1:' + fields[column]

            vcf_out.write('\t'.join(fields) + '\n')
def copy(infile, outfile):
    '''
    Copy every line of infile (opened with genome.open_textfile) into the
    plain-text file outfile, which is overwritten.
    '''
    with genome.open_textfile(infile) as source, open(outfile, 'w') as target:
        for row in source:
            target.write(row)
def convert(infile, outfile):
    '''
    Copy a VCF file, replacing every non-G/C/T/A character (case-insensitive)
    in the REF column with 'N'.

    Leading '#' lines are copied unchanged. Reading stops at the first line
    that is empty after right-stripping.
    '''
    ref_column = 3

    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        def read_stripped():
            return vcf.readline().rstrip()

        record = read_stripped()

        # VCF header
        while record.startswith('#'):
            vcfout.write(record + '\n')
            record = read_stripped()

        while record:
            fields = record.split('\t')

            # In the REF field, non-GCTA characters become N to fit the VCF standard:
            fields[ref_column] = re.sub(r'[^GCTA]', 'N', fields[ref_column], flags=re.I)

            vcfout.write('\t'.join(fields) + '\n')
            record = read_stripped()
def convert(infile, outfile):
    '''
    Write a copy of a VCF file in which each record gains a 'GT' FORMAT key
    and a '0/1' genotype in every sample column.

    The '##' lines and the following #CHROM line pass through untouched.
    The number of tab-separated columns in the #CHROM line determines which
    columns (index 9 onwards) are treated as samples. Processing ends at the
    first line that is empty once right-stripped.
    '''
    def stripped_lines(handle):
        # Endless feed of right-stripped lines; yields '' after end of file.
        while True:
            yield handle.readline().rstrip()

    with genome.open_textfile(infile) as vcf_in, open(outfile, 'w') as vcf_out:
        feed = stripped_lines(vcf_in)

        text = next(feed)
        while text.startswith('##'):
            vcf_out.write(text + '\n')
            text = next(feed)

        # text now holds the #CHROM line.
        sample_stop = len(text.split('\t'))
        vcf_out.write(text + '\n')

        text = next(feed)
        while text:
            cells = text.split('\t')
            cells[8] = 'GT:' + cells[8]

            for position in range(9, sample_stop):
                cells[position] = '0/1:' + cells[position]

            vcf_out.write('\t'.join(cells) + '\n')
            text = next(feed)
def tsv(infileList, outfile, bgzip=False):
    '''
    Concatenate header-bearing TSV files, optionally compressing the result.

    The first line of the first input is kept as the header; the first line
    of each later input is dropped. When bgzip is true, bgzip_compress is
    called on outfile and the returned path is outfile + '.gz'; otherwise
    outfile itself is returned.
    '''
    with open(outfile, 'w') as merged:
        header_pending = True

        for path in infileList:
            with genome.open_textfile(path) as handle:
                # First line is a header
                first_row = handle.readline()

                if header_pending:
                    merged.write(first_row)
                    header_pending = False

                merged.writelines(iter(handle.readline, ''))

    if not bgzip:
        return outfile

    bgzip_compress(outfile, True)
    return outfile + '.gz'
def vcf(infileList, outfile, bgzip=False):
    '''
    Concatenate VCF files, optionally compressing the result.

    Leading '#' lines are taken from the first input only. When bgzip is
    true, bgzip_compress is called on outfile and outfile + '.gz' is
    returned; otherwise outfile is returned.
    '''
    with open(outfile, 'w') as merged:
        for position, path in enumerate(infileList):
            is_first_file = position == 0

            with genome.open_textfile(path) as handle:
                row = handle.readline()

                while row.startswith('#'):
                    if is_first_file:
                        merged.write(row)
                    row = handle.readline()

                # row is the first non-header line ('' if there is none);
                # copy it and everything that follows.
                merged.write(row)
                merged.writelines(iter(handle.readline, ''))

    if bgzip:
        bgzip_compress(outfile, True)

    return outfile + '.gz' if bgzip else outfile
def spreader(infileList, outfiles, chunk=4, bgzip=False, threads=1):
    '''
    Spread the lines of the input files over the output files, "chunk" lines
    at a time, in round-robin order.

    E.g., if the input is a fastq file and there are 3 output files, the
    first 4 lines go to the 1st output, the next 4 lines to the 2nd output,
    the next 4 lines to the 3rd output, and then the next 4 lines go back to
    the 1st output, and so on. Each input file starts again at the 1st output.

    infileList: iterable of input paths, each opened with genome.open_textfile.
    outfiles:   list of output paths (overwritten).
    chunk:      number of consecutive lines written to one output per turn.
    bgzip:      when true, bgzip_compress is mapped over outfiles in a pool
                of "threads" processes and its results are returned.
    threads:    number of worker processes used for compression.

    Returns the list produced by bgzip_compress when bgzip is true, otherwise
    outfiles unchanged.

    Raises ValueError when there is input to distribute but no output file
    or a chunk size below 1 (the distribution loop could never finish).
    '''
    outs = []
    try:
        # Opened one by one so that a failure half-way still closes the rest.
        for out_path in outfiles:
            outs.append(open(out_path, 'w'))

        for infile in infileList:
            with genome.open_textfile(infile) as text_in:
                line_i = text_in.readline()

                if line_i and (not outs or chunk < 1):
                    raise ValueError('spreader needs at least one output file and chunk >= 1')

                while line_i:
                    for out_i in outs:
                        for _ in range(chunk):
                            # Writing '' after the input is exhausted is a no-op.
                            out_i.write(line_i)
                            line_i = text_in.readline()
    finally:
        for out_i in outs:
            out_i.close()

    if not bgzip:
        return outfiles

    pool = Pool(processes=threads)
    try:
        actual_outfiles = pool.map_async(bgzip_compress, outfiles).get()
    finally:
        # Always release the worker processes, even if compression failed.
        pool.close()
        pool.join()

    return actual_outfiles
def convert(infile, outfile):
    '''
    Rewrite a VCF file so that the REF column only contains G, C, T, A
    (any case); every other character in REF is replaced by 'N'.

    Leading '#' lines are passed through unchanged. The copy ends at the
    first line that is empty after right-stripping.
    '''
    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:
        still_in_header = True

        for text in iter(lambda: vcf.readline().rstrip(), ''):

            # VCF header
            if still_in_header and text.startswith('#'):
                vcfout.write(text + '\n')
                continue

            still_in_header = False
            columns = text.split('\t')

            # Non-GCTA characters in REF (column index 3) are changed to N
            # to fit the VCF standard:
            columns[3] = re.sub(r'[^GCTA]', 'N', columns[3], flags=re.I)

            vcfout.write('\t'.join(columns) + '\n')
def copy(infile, outfile):
    '''
    Write the full text content of infile, read through
    genome.open_textfile, into outfile (overwritten).
    '''
    with genome.open_textfile(infile) as source, open(outfile, 'w') as sink:
        sink.writelines(iter(source.readline, ''))
def remove_vcf_illegal_lines(invcf, outvcf):
    '''
    In VarDict v1.7, there are lines with <XXX> in ALT without END in INFO,
    which will cause bedtools to fail. This function checks whether such
    lines exist and, if so, writes a copy of the VCF without them.

    Returns outvcf when at least one illegal line was found (outvcf then
    holds the header plus all legal records). Returns False, without
    creating outvcf, when the input has no illegal line.
    '''
    def is_illegal(text):
        # Symbolic ALT at the start of the field, with no END value in INFO.
        record = genome.Vcf_line(text)
        return re.match(r'<\w+>', record.altbase) and (not record.get_info_value('END'))

    def consume_header(handle, echo_to=None):
        # Skip the leading '#' lines (copying them to echo_to when given)
        # and hand back the first non-header line, right-stripped.
        text = handle.readline().rstrip()
        while text.startswith('#'):
            if echo_to is not None:
                echo_to.write(text + '\n')
            text = handle.readline().rstrip()
        return text

    # First pass: look for the first offending record.
    with genome.open_textfile(invcf) as vcf:
        text = consume_header(vcf)

        while text:
            if is_illegal(text):
                break
            text = vcf.readline().rstrip()
        else:
            return False

    # Second pass: copy header and every legal record.
    with genome.open_textfile(invcf) as vcf, open(outvcf, 'w') as out:
        text = consume_header(vcf, out)

        while text:
            if not is_illegal(text):
                out.write(text + '\n')
            text = vcf.readline().rstrip()

    return outvcf
def bed(infileList, outfile):
    '''
    Concatenate BED files line by line, in the order given, into outfile
    (overwritten). Inputs are opened with genome.open_textfile.
    Returns 0.
    '''
    with open(outfile, 'w') as merged:
        for path in infileList:
            with genome.open_textfile(path) as handle:
                merged.writelines(handle)

    return 0
def split_into_snv_and_indel(infile, snv_out, indel_out):
    '''
    Split a VCF file into an SNV file and an indel file.

    Header ('#') lines are copied to both outputs. A record whose REF and
    ALT are both one base long goes to the SNV file; a record where exactly
    one of them is one base long goes to the indel file; anything else is
    dropped. Multi-allelic records (ALT separated by ',' or, failing that,
    by '/') are split into one output line per ALT allele, each classified
    on its own allele.

    infile:    input VCF path, opened with genome.open_textfile.
    snv_out:   output path for SNV records (overwritten).
    indel_out: output path for indel records (overwritten).
    '''
    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_handle, \
            open(indel_out, 'w') as indel_handle:

        def route(refbase, altbase, text):
            # Send one single-allele record to the SNV or the indel file.
            if len(refbase) == 1 and len(altbase) == 1:
                snv_handle.write(text + '\n')
            elif len(refbase) == 1 or len(altbase) == 1:
                indel_handle.write(text + '\n')

        line_i = vcf_in.readline().rstrip()

        while line_i.startswith('#'):
            snv_handle.write(line_i + '\n')
            indel_handle.write(line_i + '\n')
            line_i = vcf_in.readline().rstrip()

        while line_i:
            vcf_i = genome.Vcf_line(line_i)

            if (',' not in vcf_i.altbase) and ('/' not in vcf_i.altbase):
                route(vcf_i.refbase, vcf_i.altbase, line_i)

            else:
                item = line_i.split('\t')
                separator = ',' if ',' in vcf_i.altbase else '/'

                for altbase_i in vcf_i.altbase.split(separator):
                    item[4] = altbase_i
                    # Classify on this allele, not on the whole ALT string:
                    # the joined string is never one base long, so testing it
                    # silently dropped every multi-allelic deletion.
                    route(vcf_i.refbase, altbase_i, '\t'.join(item))

            line_i = vcf_in.readline().rstrip()
def bed(infileList, outfile, bgzip=False):
    '''
    Concatenate BED files into outfile, optionally compressing the result.

    When bgzip is true, bgzip_compress is called on outfile and
    outfile + '.gz' is returned; otherwise outfile is returned.
    '''
    with open(outfile, 'w') as merged:
        for path in infileList:
            with genome.open_textfile(path) as handle:
                for row in iter(handle.readline, ''):
                    merged.write(row)

    if not bgzip:
        return outfile

    bgzip_compress(outfile, True)
    return outfile + '.gz'
def tsv(infileList, outfile):
    '''
    Concatenate TSV files that each begin with a header line.

    Only the first input contributes its first line; the first line of
    every other input is skipped. Nothing is returned.
    '''
    with open(outfile, 'w') as merged:
        emit_header = True

        for path in infileList:
            with genome.open_textfile(path) as handle:
                for row_number, row in enumerate(handle):
                    # Row 0 is the header: keep it for the first file only.
                    if row_number or emit_header:
                        merged.write(row)

            # Turn off header writing from now on:
            emit_header = False
def vcf(infileList, outfile):
    '''
    Concatenate VCF files into outfile (overwritten).

    The leading '#' lines of the first input are kept; those of the other
    inputs are skipped. Everything from the first non-'#' line of each
    input onwards is copied verbatim. Nothing is returned.
    '''
    with open(outfile, 'w') as merged:
        first_file = True

        for path in infileList:
            with genome.open_textfile(path) as handle:
                rows = iter(handle.readline, '')

                for row in rows:
                    if not row.startswith('#'):
                        # First data row: the header is over.
                        merged.write(row)
                        break
                    if first_file:
                        merged.write(row)

                # Whatever is left after the header is data.
                merged.writelines(rows)

            # Turn off header writing from now on:
            first_file = False
def _tidy_alt_alleles(refbase, altbase):
    '''
    Clean up a VarScan2 ALT string against the (already cleaned) REF allele.

    Steps, in order: '/' separators become ','; characters other than word
    characters, ',' and '.' are removed; duplicate alleles are dropped
    (first occurrence wins); when more than one allele remains, the allele
    equal to REF is removed, and every allele after the first that does not
    start with REF is removed.
    '''
    # Replace the wrong "G/A" with the correct "G,A" in ALT column:
    altbase = altbase.replace('/', ',')

    # Get rid of non-compliant characters in the ALT column:
    altbase = re.sub(r'[^\w,.]', '', altbase)

    # Eliminate duplicate entries in ALT. Whole alleles are compared: the
    # former regex r'(\w+),\1' also fired on a shared prefix, turning
    # "A,AT" into "AT".
    alt_item = []
    for alt_i in altbase.split(','):
        if alt_i not in alt_item:
            alt_item.append(alt_i)

    if len(alt_item) > 1:
        # Eliminate the ALT entry that matches the REF column, to address
        # vcf-validator complaints:
        if refbase in alt_item:
            alt_item.remove(refbase)

        # To fix this vcf-validator complaint:
        #   Could not parse the allele(s) [GTC], first base does not match the reference
        # Built as a new list: popping by index while enumerating shifted
        # the indices and removed the wrong allele.
        alt_item = alt_item[:1] + [alt_i for alt_i in alt_item[1:] if alt_i.startswith(refbase)]

    return ','.join(alt_item)


def convert(infile, outfile):
    '''
    Rewrite a VarScan2 VCF file into a form accepted by vcf-validator and
    GATK CombineVariants.

    Header: the FORMAT definitions of DP4 and AD are replaced by fixed
    definitions; every other '#' line is copied.
    Records: REF is truncated at its first non-word character; ALT is
    cleaned by _tidy_alt_alleles; if FORMAT holds both AD and RD, RD is
    removed and AD becomes "RD,AD" for the first sample (and for the second
    sample when at least two samples are present). Records whose cleaned REF
    contains a character other than G, C, T, A, U (any case) are dropped.

    Raises Exception when a record carries no sample column.
    '''
    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        line_i = vcf.readline().rstrip()

        # Header lines:
        while line_i.startswith('#'):

            if line_i.startswith('##FORMAT=<ID=DP4,'):
                line_i = '##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">'

            elif line_i.startswith('##FORMAT=<ID=AD,'):
                line_i = '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'

            vcfout.write(line_i + '\n')
            line_i = vcf.readline().rstrip()

        # Doing the work here:
        while line_i:

            vcf_i = genome.Vcf_line(line_i)

            num_samples = len(vcf_i.samples)
            if num_samples == 0:
                raise Exception('No sample information here.')
            if num_samples > 2:
                sys.stderr.write(
                    'We found more than 2 sammples in this VCF file. It may be messed up, but I\'ll just assume the first 2 samples mean anything at all'
                )
            paired = num_samples >= 2

            # vcf-validator is not going to accept multiple sequences in the REF,
            # as is the case in VarScan2's indel output:
            vcf_i.refbase = re.sub(r'[^\w].*$', '', vcf_i.refbase)

            vcf_i.altbase = _tidy_alt_alleles(vcf_i.refbase, vcf_i.altbase)

            # Combine AD:RD into AD:
            format_items = vcf_i.get_sample_variable()
            if 'AD' in format_items and 'RD' in format_items:

                rd_sm1 = vcf_i.get_sample_value('RD', 0)
                ad_sm1 = vcf_i.get_sample_value('AD', 0)

                try:
                    rd_sm2 = vcf_i.get_sample_value('RD', 1)
                    ad_sm2 = vcf_i.get_sample_value('AD', 1)
                except IndexError:
                    # Single-sample record: there is no second sample to read.
                    rd_sm2 = ad_sm2 = 0

                # Both positions are taken before RD is removed from FORMAT.
                idx_ad = format_items.index('AD')
                idx_rd = format_items.index('RD')
                format_items.pop(idx_rd)
                vcf_i.field = ':'.join(format_items)

                item_normal = vcf_i.samples[0].split(':')
                item_normal[idx_ad] = '{},{}'.format(rd_sm1, ad_sm1)
                item_normal.pop(idx_rd)
                vcf_i.samples[0] = ':'.join(item_normal)

                if paired:
                    item_tumor = vcf_i.samples[1].split(':')
                    item_tumor[idx_ad] = '{},{}'.format(rd_sm2, ad_sm2)
                    item_tumor.pop(idx_rd)
                    vcf_i.samples[1] = ':'.join(item_tumor)

            # Reform the line:
            line_i = '\t'.join((
                vcf_i.chromosome, str(vcf_i.position), vcf_i.identifier,
                vcf_i.refbase, vcf_i.altbase, vcf_i.qual, vcf_i.filters,
                vcf_i.info, vcf_i.field, '\t'.join(vcf_i.samples)
            ))

            # VarScan2 outputs lines with REF allele as "M". GATK CombineVariants
            # complains about that, so such records are not written.
            if not re.search(r'[^GCTAU]', vcf_i.refbase, re.I):
                vcfout.write(line_i + '\n')

            # Next line:
            line_i = vcf.readline().rstrip()
def vcf2tsv(is_vcf=None, is_bed=None, is_pos=None, bam_fn=None, truth=None, cosmic=None, dbsnp=None, mutect=None, varscan=None, vardict=None, lofreq=None, scalpel=None, strelka=None, dedup=True, min_mq=1, min_bq=5, min_caller=0, ref_fa=None, p_scale=None, outfile=None):
    '''
    Write one TSV row of caller flags, annotations and BAM-derived features
    for every candidate site of a single-sample analysis.

    The sites come from the first truthy argument among is_vcf, is_bed and
    is_pos (each a path opened with genome.open_textfile); when none is
    given, ref_fa + '.fai' is used and a message is logged.

    is_vcf, is_bed, is_pos: path of the site list, in VCF, BED (0-based
        start, converted with +1) or position (chrom, pos) format.
    bam_fn:   path handed to pysam.AlignmentFile.
    truth, cosmic, dbsnp: optional VCF paths used for the judgement column
        and for dbSNP/COSMIC annotation.
    mutect, varscan, vardict, lofreq, scalpel, strelka: optional caller VCF
        paths; each one found adds its classification to the caller count.
    dedup:    accepted but not referenced inside this function.
    min_mq, min_bq: passed through to sequencing_features.from_bam.
    min_caller: a row is written only when the caller count reaches it.
    ref_fa:   reference FASTA path; also opened with pysam.FastaFile.
    p_scale:  'phred' or 'fraction' (case-insensitive); any other value is
        treated as None and passed to rescale as such.
    outfile:  path of the TSV written (overwritten).

    Returns None.
    Raises Exception when consecutive VCF input lines are out of order
    according to genome.whoisbehind.

    Relies on module-level names not defined here: logger, out_header,
    rescale, genomic_coordinates, annotate_caller, sequencing_features, and
    a one-argument copy() (presumably copy.copy -- TODO confirm).
    '''

    # Convert contig_sequence to chrom_seq dict:
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not:
    if p_scale == None:
        logger.info('NO RE-SCALING')
    elif p_scale.lower() == 'phred':
        p_scale = 'phred'
    elif p_scale.lower() == 'fraction':
        p_scale = 'fraction'
    else:
        p_scale = None
        logger.info('NO RE-SCALING')

    # Define NaN and Inf:
    nan = float('nan')
    # NOTE(review): inf and pattern_chr_position are assigned but not used
    # again in this function.
    inf = float('inf')
    pattern_chr_position = genome.pattern_chr_position

    ## Running
    with genome.open_textfile(mysites) as my_sites, open(outfile, 'w') as outhandle:

        my_line = my_sites.readline().rstrip()

        # From here on ref_fa is the opened FASTA object, not the path.
        bam = pysam.AlignmentFile(bam_fn, reference_filename=ref_fa)
        ref_fa = pysam.FastaFile(ref_fa)

        # Each optional path is replaced by its opened handle, and the
        # matching *_line holds the value returned by genome.skip_vcf_header.
        if truth:
            truth = genome.open_textfile(truth)
            truth_line = genome.skip_vcf_header( truth )

        if cosmic:
            cosmic = genome.open_textfile(cosmic)
            cosmic_line = genome.skip_vcf_header( cosmic )

        if dbsnp:
            dbsnp = genome.open_textfile(dbsnp)
            dbsnp_line = genome.skip_vcf_header( dbsnp )

        # 6 Incorporate callers: get thru the #'s
        if mutect:
            mutect = genome.open_textfile(mutect)
            mutect_line = genome.skip_vcf_header( mutect )

        if varscan:
            varscan = genome.open_textfile(varscan)
            varscan_line = genome.skip_vcf_header( varscan )

        if vardict:
            vardict = genome.open_textfile(vardict)
            vardict_line = genome.skip_vcf_header( vardict )

        if lofreq:
            lofreq = genome.open_textfile(lofreq)
            lofreq_line = genome.skip_vcf_header( lofreq )

        if scalpel:
            scalpel = genome.open_textfile(scalpel)
            scalpel_line = genome.skip_vcf_header( scalpel )

        if strelka:
            strelka = genome.open_textfile(strelka)
            strelka_line = genome.skip_vcf_header( strelka )

        # Get through all the headers:
        while my_line.startswith('#') or my_line.startswith('track='):
            my_line = my_sites.readline().rstrip()

        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match( genome.pattern_chr_position, my_line )
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # First line: the column names are the format template with its braces removed.
        outhandle.write( out_header.replace('{','').replace('}','') + '\n' )

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:

                my_vcf = genome.Vcf_line( my_line )
                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                variants_at_my_coordinate = []

                # One copy of the record per ALT allele.
                alt_bases = my_vcf.altbase.split(',')
                for alt_i in alt_bases:
                    vcf_i = copy(my_vcf)
                    vcf_i.altbase = alt_i
                    variants_at_my_coordinate.append( vcf_i )

                # As long as the "coordinate" stays the same, it will keep reading until it's different.
                # This loop also advances my_line, so VCF input is not advanced again further down.
                while my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):

                    my_line = my_sites.readline().rstrip()
                    # NOTE(review): at end of file my_line is '' here; this assumes
                    # genome.Vcf_line accepts an empty string -- confirm.
                    my_vcf = genome.Vcf_line( my_line )

                    ########## This block of code is to ensure the input VCF file is properly sorted ##
                    coordinate_j = re.match( genome.pattern_chr_position, my_line )
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j, chrom_seq) == 1:
                        raise Exception( '{} does not seem to be properly sorted.'.format(mysites) )

                    coordinate_i = coordinate_j
                    ###################################################################################

                    if my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):

                        alt_bases = my_vcf.altbase.split(',')
                        for alt_i in alt_bases:

                            vcf_i = copy(my_vcf)
                            vcf_i.altbase = alt_i
                            variants_at_my_coordinate.append( vcf_i )

            elif is_bed:
                # BED start is shifted by +1 before being handed to genomic_coordinates.
                bed_item = my_line.split('\t')
                my_coordinates = genomic_coordinates( bed_item[0], int(bed_item[1])+1, int(bed_item[2]) )

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates( pos_item[0], int(pos_item[1]), int(pos_item[1]) )

            elif fai_file:
                # Whole contig: from position 1 to the length given in the .fai line.
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates( fai_item[0], 1, int(fai_item[1]) )

            ##### ##### ##### ##### ##### #####
            for my_coordinate in my_coordinates:

                ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                if is_vcf:

                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []

                    for variant_i in variants_at_my_coordinate:

                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]
                        indel_length = len(first_alt) - len(ref_base)

                        ref_bases.append( ref_base )
                        alt_bases.append( first_alt )
                        indel_lengths.append( indel_length )

                        # Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied.
                        # These four names are overwritten on every pass, so after the
                        # loop they hold the values of the last variant at this coordinate.
                        if_dbsnp = 1 if re.search(r'rs[0-9]+', variant_i.identifier) else 0
                        if_cosmic = 1 if re.search(r'COS[MN][0-9]+', variant_i.identifier) else 0
                        if_common = 1 if variant_i.get_info_value('COMMON') == '1' else 0
                        num_cases = variant_i.get_info_value('CNT') if variant_i.get_info_value('CNT') else nan

                        if variant_i.identifier == '.':
                            my_identifier_i = set()
                        else:
                            my_identifier_i = variant_i.identifier.split(';')
                            my_identifier_i = set( my_identifier_i )

                        all_my_identifiers.append( my_identifier_i )

                ## If not, 1) get ref_base, first_alt from other VCF files.
                #          2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided)
                else:
                    variants_at_my_coordinate = [None] # Just to have something to iterate
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                # Keep track of NumCallers:
                # NOTE(review): reset once per coordinate, not once per variant, so the
                # count accumulates over all variants sharing this coordinate -- confirm
                # that this is intended.
                num_callers = 0

                #################################### Find the same coordinate in those VCF files ####################################
                # Each call returns a (found flag, variants, next line) triple; the
                # got_* flags are not read afterwards in this function.
                if mutect:
                    got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(my_coordinate, mutect_line, mutect, chrom_seq)
                if varscan:
                    got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(my_coordinate, varscan_line, varscan, chrom_seq)
                if vardict:
                    got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(my_coordinate, vardict_line, vardict, chrom_seq)
                if lofreq:
                    got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(my_coordinate, lofreq_line, lofreq, chrom_seq)
                if scalpel:
                    got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:
                    got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(my_coordinate, strelka_line, strelka, chrom_seq)
                if truth:
                    got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate(my_coordinate, truth_line, truth, chrom_seq)
                if dbsnp:
                    got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                if cosmic:
                    got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(my_coordinate, cosmic_line, cosmic, chrom_seq)

                # Now, use pysam to look into the tBAM file(s), variant by variant from the input:
                for ith_call, my_call in enumerate( variants_at_my_coordinate ):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ( (my_call.chromosome, my_call.position), my_call.refbase, my_call.altbase )

                        ref_base = ref_bases[ith_call]
                        first_alt = alt_bases[ith_call]
                        indel_length = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]

                    else:
                        # NOTE(review): my_identifiers is only assigned in the is_vcf
                        # branch above; for BED/position/.fai input the later uses of
                        # my_identifiers look like they hit an unbound (or stale str)
                        # name -- confirm.
                        variant_id = ( (my_coordinate[0], my_coordinate[1]), ref_base, first_alt )

                    #################### Collect Caller Vcf ####################:
                    # A caller that was not supplied contributes NaN placeholders
                    # and nothing to num_callers.
                    if mutect:
                        mutect_classification, tlod, ecnt = annotate_caller.ssMuTect(variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = tlod = ecnt = nan

                    if varscan:
                        varscan_classification, score_varscan2 = annotate_caller.ssVarScan(variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = score_varscan2 = nan

                    if vardict:
                        vardict_classification, msi, msilen, shift3, t_pmean, t_pstd, t_qstd = annotate_caller.ssVarDict(variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = t_pmean = t_pstd = t_qstd = nan

                    if lofreq:
                        lofreq_classification = annotate_caller.ssLoFreq(variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan

                    if scalpel:
                        scalpel_classification = annotate_caller.ssScalpel(variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan

                    if strelka:
                        strelka_classification = annotate_caller.ssStrelka(variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = nan

                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        ########## Ground truth file ##########
                        if truth:
                            if variant_id in truth_variants.keys():
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan

                        ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add( ID_i )

                        ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add( ID_i )

                        ########## ######### INFO EXTRACTION FROM BAM FILES ########## #########
                        # Tumor tBAM file:
                        tBamFeatures = sequencing_features.from_bam(bam, my_coordinate, ref_base, first_alt, min_mq, min_bq)

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt)

                        # Fill the ID field of the TSV/VCF ('.' when the set is empty).
                        # my_identifiers is a string from here on.
                        my_identifiers = ';'.join(my_identifiers) if my_identifiers else '.'

                        ###
                        out_line = out_header.format( \
                        CHROM = my_coordinate[0], \
                        POS = my_coordinate[1], \
                        ID = my_identifiers, \
                        REF = ref_base, \
                        ALT = first_alt, \
                        if_MuTect = mutect_classification, \
                        if_Strelka = strelka_classification, \
                        if_VarScan2 = varscan_classification, \
                        if_VarDict = vardict_classification, \
                        if_LoFreq = lofreq_classification, \
                        if_Scalpel = scalpel_classification, \
                        VarScan2_Score = rescale(score_varscan2, 'phred', p_scale, 1001), \
                        if_dbsnp = if_dbsnp, \
                        COMMON = if_common, \
                        if_COSMIC = if_cosmic, \
                        COSMIC_CNT = num_cases, \
                        Consistent_Mates = tBamFeatures['consistent_mates'], \
                        Inconsistent_Mates = tBamFeatures['inconsistent_mates'], \
                        M2_TLOD = tlod, \
                        M2_ECNT = ecnt, \
                        MSI = msi, \
                        MSILEN = msilen, \
                        SHIFT3 = shift3, \
                        MaxHomopolymer_Length = homopolymer_length, \
                        SiteHomopolymer_Length = site_homopolymer_length, \
                        T_DP = tBamFeatures['dp'], \
                        tBAM_REF_MQ = '%g' % tBamFeatures['ref_mq'], \
                        tBAM_ALT_MQ = '%g' % tBamFeatures['alt_mq'], \
                        tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures['z_ranksums_mq'], \
                        tBAM_REF_BQ = '%g' % tBamFeatures['ref_bq'], \
                        tBAM_ALT_BQ = '%g' % tBamFeatures['alt_bq'], \
                        tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures['z_ranksums_bq'], \
                        tBAM_REF_NM = '%g' % tBamFeatures['ref_NM'], \
                        tBAM_ALT_NM = '%g' % tBamFeatures['alt_NM'], \
                        tBAM_NM_Diff = '%g' % tBamFeatures['NM_Diff'], \
                        tBAM_REF_Concordant = tBamFeatures['ref_concordant_reads'], \
                        tBAM_REF_Discordant = tBamFeatures['ref_discordant_reads'], \
                        tBAM_ALT_Concordant = tBamFeatures['alt_concordant_reads'], \
                        tBAM_ALT_Discordant = tBamFeatures['alt_discordant_reads'], \
                        tBAM_Concordance_FET = rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001), \
                        T_REF_FOR = tBamFeatures['ref_for'], \
                        T_REF_REV = tBamFeatures['ref_rev'], \
                        T_ALT_FOR = tBamFeatures['alt_for'], \
                        T_ALT_REV = tBamFeatures['alt_rev'], \
                        tBAM_StrandBias_FET = rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001), \
                        tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures['z_ranksums_endpos'], \
                        tBAM_REF_Clipped_Reads = tBamFeatures['ref_SC_reads'], \
                        tBAM_ALT_Clipped_Reads = tBamFeatures['alt_SC_reads'], \
                        tBAM_Clipping_FET = rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001), \
                        tBAM_MQ0 = tBamFeatures['MQ0'], \
                        tBAM_Other_Reads = tBamFeatures['noise_read_count'], \
                        tBAM_Poor_Reads = tBamFeatures['poor_read_count'], \
                        tBAM_REF_InDel_3bp = tBamFeatures['ref_indel_3bp'], \
                        tBAM_REF_InDel_2bp = tBamFeatures['ref_indel_2bp'], \
                        tBAM_REF_InDel_1bp = tBamFeatures['ref_indel_1bp'], \
                        tBAM_ALT_InDel_3bp = tBamFeatures['alt_indel_3bp'], \
                        tBAM_ALT_InDel_2bp = tBamFeatures['alt_indel_2bp'], \
                        tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'], \
                        InDel_Length = indel_length, \
                        TrueVariant_or_False = judgement )

                        # Write the row to the output file:
                        outhandle.write(out_line + '\n')

            # Read into the next line (VCF input was already advanced by the
            # same-coordinate loop above):
            if not is_vcf:
                my_line = my_sites.readline().rstrip()

        ########## Close all open files if they were opened ##########
        # Arguments that were never supplied are still None and are skipped.
        # These handles are not closed if an exception is raised earlier.
        opened_files = (ref_fa, bam, truth, cosmic, dbsnp, mutect, varscan, vardict, lofreq, scalpel, strelka)
        [opened_file.close() for opened_file in opened_files if opened_file]
def convert(infile, snv_out, indel_out, is_tnscope):
    '''
    Split a two-sample MuTect2/TNscope VCF file into an SNV file and an
    indel file, breaking multi-allelic records into one line per ALT allele.

    Header: every '##' line and the #CHROM line are copied to both outputs;
    in the SOR INFO definition 'Float' is replaced by 'String'.
    Records: REF and ALT both one base long -> SNV file; exactly one of them
    one base long -> indel file; otherwise dropped. For a multi-allelic
    record, INFO is rebuilt from NLOD/TLOD (the value belonging to the
    allele) plus STR/ECNT (kept as they are), and any sample genotype other
    than 0/0 or 0/1 is rewritten to 0/1.

    infile:     input VCF path, opened with genome.open_textfile.
    snv_out:    output path for SNV records (overwritten).
    indel_out:  output path for indel records (overwritten).
    is_tnscope: when true, the ##normal_sample / ##tumor_sample header
                lines are not required.
    '''
    info_to_split = 'NLOD', 'TLOD'
    info_to_keep = 'STR', 'ECNT'

    def het_unless_standard(sample_text, genotype):
        # Anything that is not 0/0 or 0/1 (e.g. 0/1/2) is forced to 0/1.
        if genotype != '0/0' and genotype != '0/1':
            return re.sub(r'^[^:]+', '0/1', sample_text)
        return sample_text

    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_handle, \
            open(indel_out, 'w') as indel_handle:

        line_i = vcf_in.readline().rstrip()

        while line_i.startswith('##'):

            if line_i.startswith('##normal_sample='):
                normal_name = line_i.split('=')[1]

            if line_i.startswith('##tumor_sample='):
                tumor_name = line_i.split('=')[1]

            if line_i.startswith('##INFO=<ID=SOR,'):
                line_i = re.sub(r'Float', 'String', line_i)

            snv_handle.write(line_i + '\n')
            indel_handle.write(line_i + '\n')

            line_i = vcf_in.readline().rstrip()

        # This line will be #CHROM:
        snv_handle.write(line_i + '\n')
        indel_handle.write(line_i + '\n')
        header = line_i.split('\t')

        if is_tnscope:
            # Doesn't matter which one is normal/tumor. These are not used.
            normal_index, tumor_index = 1, 0
        else:
            # The indices are not used below; the lookups still fail loudly
            # when the sample names are missing from the header.
            normal_index = header.index(normal_name) - 9
            tumor_index = header.index(tumor_name) - 9

        # This will be the first variant line:
        line_i = vcf_in.readline().rstrip()

        while line_i:

            vcf_i = genome.Vcf_line(line_i)

            if ',' not in vcf_i.altbase:

                if len(vcf_i.refbase) == 1 and len(vcf_i.altbase) == 1:
                    snv_handle.write(line_i + '\n')
                elif len(vcf_i.refbase) == 1 or len(vcf_i.altbase) == 1:
                    indel_handle.write(line_i + '\n')

            else:
                alt_bases = vcf_i.altbase.split(',')

                # Per-allele INFO values: None when the key cannot be split.
                measures = []
                for measure_i in info_to_split:
                    try:
                        measures.append(vcf_i.get_info_value(measure_i).split(','))
                    except AttributeError:
                        measures.append(None)

                # INFO values repeated unchanged on every split line.
                still_measures = []
                for measure_i in info_to_keep:
                    try:
                        still_measures.append(vcf_i.get_info_value(measure_i))
                    except AttributeError:
                        still_measures.append(None)

                still_infos = [
                    '{}={}'.format(info_variable, info_value)
                    for info_variable, info_value in zip(info_to_keep, still_measures)
                    if info_value is not False and info_value is not None
                ]

                # The sample columns do not depend on the allele, so they are
                # prepared once. Each sample is judged on its OWN genotype
                # (the second sample used to be tested against the first
                # sample's genotype for the '0/1' case).
                sample_0 = het_unless_standard(vcf_i.samples[0], vcf_i.get_sample_value('GT', idx=0))
                sample_1 = het_unless_standard(vcf_i.samples[1], vcf_i.get_sample_value('GT', idx=1))

                for ith_base, altbase_i in enumerate(alt_bases):

                    split_infos = [
                        '{}={}'.format(info_variable, info_value[ith_base])
                        for info_variable, info_value in zip(info_to_split, measures)
                        if info_value is not None
                    ]
                    info_string = ';'.join(split_infos + still_infos)

                    new_line = '\t'.join((
                        vcf_i.chromosome, str(vcf_i.position), vcf_i.identifier,
                        vcf_i.refbase, altbase_i, vcf_i.qual, vcf_i.filters,
                        info_string, vcf_i.field, sample_0, sample_1
                    ))

                    if len(vcf_i.refbase) == 1 and len(altbase_i) == 1:
                        snv_handle.write(new_line + '\n')
                    elif len(vcf_i.refbase) == 1 or len(altbase_i) == 1:
                        indel_handle.write(new_line + '\n')

            line_i = vcf_in.readline().rstrip()
def convert(infile, outfile):
    '''
    Rewrite a two-sample VCF file whose FORMAT carries separate AD and RD
    keys into one with a GT key and a combined "RD,AD" value under AD.

    Header: the FORMAT definition of AD is replaced by a fixed definition;
    all other '#' lines are copied.
    Records with both AD and RD in FORMAT: RD is removed, AD becomes
    "RD,AD", and a genotype derived from AD / (AD + RD) is prepended:
    above 0.8 -> 1/1, above 0.25 -> 0/1, otherwise 0/0 for the first sample
    (column 10) and 0/1 for the second sample (column 11).
    Other records are copied unchanged.
    '''
    format_col, normal_col, tumor_col = 8, 9, 10
    ad_definition = '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'

    def with_genotype(sample_text, ad_at, rd_at, low_vaf_call):
        # Return the sample column with GT prepended and RD folded into AD.
        values = sample_text.split(':')
        alt_depth = int(values[ad_at])
        ref_depth = int(values[rd_at])

        try:
            fraction = alt_depth / (alt_depth + ref_depth)
        except ZeroDivisionError:
            fraction = 0

        if fraction > 0.8:
            call = '1/1'
        elif fraction > 0.25:
            call = '0/1'
        else:
            call = low_vaf_call

        values[ad_at] = '{},{}'.format(values[rd_at], values[ad_at])
        del values[rd_at]
        return ':'.join([call] + values)

    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        text = vcf.readline().rstrip()

        # VCF header
        while text.startswith('#'):
            if text.startswith('##FORMAT=<ID=AD,'):
                text = ad_definition
            vcfout.write(text + '\n')
            text = vcf.readline().rstrip()

        while text:
            columns = text.split('\t')
            keys = columns[format_col].split(':')

            if 'AD' in keys and 'RD' in keys:
                # Both positions are taken before RD is removed.
                ad_at = keys.index('AD')
                rd_at = keys.index('RD')
                del keys[rd_at]

                normal_text = with_genotype(columns[normal_col], ad_at, rd_at, '0/0')
                tumor_text = with_genotype(columns[tumor_col], ad_at, rd_at, '0/1')

                # Rewrite
                columns[format_col] = 'GT:' + ':'.join(keys)
                columns[normal_col] = normal_text
                columns[tumor_col] = tumor_text
                text = '\t'.join(columns)

            vcfout.write(text + '\n')
            text = vcf.readline().rstrip()
def convert(infile, outfile):
    '''
    Convert a two-sample VCF file with separate AD and RD FORMAT keys into
    one carrying GT plus a combined AD ("RD,AD").

    The AD FORMAT header line is replaced by a fixed definition. For each
    record holding both AD and RD, RD is dropped from FORMAT and from both
    sample columns, AD is rewritten as "RD,AD", and a genotype is prepended
    according to AD / (AD + RD): > 0.8 gives 1/1, > 0.25 gives 0/1; at or
    below 0.25 the first sample gets 0/0 and the second sample 0/1.
    Records without both keys are copied as they are.
    '''
    idx_format = 8

    # (sample column, genotype used when the allele fraction is <= 0.25)
    sample_rules = ((9, '0/0'), (10, '0/1'))

    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        def read_stripped():
            return vcf.readline().rstrip()

        record = read_stripped()

        # VCF header
        while record.startswith('#'):
            if record.startswith('##FORMAT=<ID=AD,'):
                record = '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'
            vcfout.write(record + '\n')
            record = read_stripped()

        while record:
            item = record.split('\t')
            format_items = item[idx_format].split(':')

            if 'AD' in format_items and 'RD' in format_items:
                idx_ad = format_items.index('AD')
                idx_rd = format_items.index('RD')
                format_items.pop(idx_rd)

                # NORMAL first, then TUMOR
                for column, low_vaf_gt in sample_rules:
                    values = item[column].split(':')
                    alt_depth = int(values[idx_ad])
                    ref_depth = int(values[idx_rd])

                    try:
                        vaf = alt_depth / (alt_depth + ref_depth)
                    except ZeroDivisionError:
                        vaf = 0

                    genotype = '1/1' if vaf > 0.8 else ('0/1' if vaf > 0.25 else low_vaf_gt)

                    values[idx_ad] = '{},{}'.format(values[idx_rd], values[idx_ad])
                    values.pop(idx_rd)
                    item[column] = ':'.join([genotype] + values)

                # Rewrite
                item[idx_format] = 'GT:' + ':'.join(format_items)
                record = '\t'.join(item)

            vcfout.write(record + '\n')
            record = read_stripped()
def convert(infile, outfile, tbam, nbam):
    """Reorder the sample columns of a VCF to NORMAL, TUMOR and drop N-REF records.

    Sample names are read from the BAM headers via ``genome.pysam_header``.
    Without ``nbam`` the normal sample name is taken to be 'none'. The
    ``#CHROM`` line is relabelled, each record's sample columns are
    rearranged to match, and records whose REF contains 'N' are not written.

    Args:
        infile: input VCF path, opened with ``genome.open_textfile``.
        outfile: output VCF path, overwritten.
        tbam: tumor BAM path.
        nbam: normal BAM path, or a falsy value when there is none.

    Raises:
        ValueError: if a sample name is not present in the ``#CHROM`` line.
    """
    # Get tumor and normal sample names from the bam files:
    nbam_header = genome.pysam_header(nbam) if nbam else None
    tbam_header = genome.pysam_header(tbam)

    # When MuTect is run in a "single sample mode," the "normal" will be named "none."
    n_samplename = nbam_header.SM() if nbam else ['none']
    t_samplename = tbam_header.SM()

    if not (len(n_samplename) == 1 and len(t_samplename) == 1):
        # Newline added so the warning does not run into later stderr output.
        sys.stderr.write('There are multiple Sample Names present in the BAM file!\n')

    n_samplename = n_samplename[0]
    t_samplename = t_samplename[0]

    assert t_samplename or n_samplename

    # The earlier nbam-based value was always overwritten by this test.
    paired_mode = bool(t_samplename and n_samplename)

    idx_ref = 3
    idx_SM1, idx_SM2 = 9, 10

    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        line_i = vcf.readline().rstrip()

        while line_i.startswith('#'):

            if line_i.startswith('##'):
                vcfout.write(line_i + '\n')

            elif line_i.startswith('#CHROM'):
                header_items = line_i.rstrip().split('\t')

                idxN = header_items.index(n_samplename)
                idxT = header_items.index(t_samplename)

                if paired_mode:
                    header_items[idx_SM1] = 'NORMAL'
                    header_items[idx_SM2] = 'TUMOR'
                else:
                    # Keep up to the first sample column, then make sure it's labeled the TUMOR sample name
                    # NOTE(review): `args` is not a parameter of this function;
                    # this branch relies on a module-level `args` - confirm.
                    header_items = header_items[:idx_SM1 + 1]
                    header_items[idx_SM1] = args.tumor_sample_name

                replaced_header = '\t'.join(header_items)
                vcfout.write(replaced_header + '\n')

            line_i = vcf.readline().rstrip()

        while line_i:

            items_i = line_i.split('\t')

            if paired_mode:
                items_i[idx_SM1], items_i[idx_SM2] = items_i[idxN], items_i[idxT]
            else:
                items_i = items_i[:idx_SM1] + [items_i[idxT]]

            new_line = '\t'.join(items_i)

            # Have to get rid of "N" in REF, because after snpSift annotation,
            # it changes the ALT and vcf-validator will complain.
            if 'N' not in items_i[idx_ref]:
                vcfout.write(new_line + '\n')

            line_i = vcf.readline().rstrip()
def convert(infile, snv_out, indel_out):
    """Split a single-sample VCF into an SNV file and an indel file.

    Header lines go to both outputs. A lone 'germline_risk' filter is
    replaced by 'PASS'. Multi-allelic records are broken into one record per
    ALT allele, keeping only the first sample column and rebuilding INFO from
    NLOD/TLOD (per-allele) and STR/ECNT (as-is). A record goes to the SNV
    file when REF and ALT are both one base long, and to the indel file
    otherwise.

    Args:
        infile: input VCF path, opened with ``genome.open_textfile``.
        snv_out: output path for SNV records.
        indel_out: output path for all other records.
    """
    info_to_split = 'NLOD', 'TLOD'
    info_to_keep = 'STR', 'ECNT'

    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_handle, \
            open(indel_out, 'w') as indel_handle:

        def write_both(text):
            snv_handle.write(text + '\n')
            indel_handle.write(text + '\n')

        def emit(ref, alt, text):
            # One-base REF and one-base ALT is an SNV; anything else is filed as an indel.
            if len(ref) == 1 and len(alt) == 1:
                snv_handle.write(text + '\n')
            else:
                indel_handle.write(text + '\n')

        record = vcf_in.readline().rstrip()

        while record.startswith('##'):
            write_both(record)
            record = vcf_in.readline().rstrip()

        # This line will be #CHROM:
        write_both(record)

        # This will be the first variant line:
        record = vcf_in.readline().rstrip()

        while record:

            vcf_i = genome.Vcf_line(record)

            # If "germlinerisk" is the only flag, then make it PASS since there is no matched normal
            if vcf_i.filters == 'germline_risk':
                vcf_i.filters = 'PASS'

            if ',' not in vcf_i.altbase:
                fields = record.split('\t')
                if fields[6] == 'germline_risk':
                    fields[6] = 'PASS'
                emit(vcf_i.refbase, vcf_i.altbase, '\t'.join(fields))

            else:
                alleles = vcf_i.altbase.split(',')

                per_allele = []
                for key in info_to_split:
                    try:
                        per_allele.append(vcf_i.get_info_value(key).split(','))
                    except AttributeError:
                        per_allele.append(None)

                shared = []
                for key in info_to_keep:
                    try:
                        shared.append(vcf_i.get_info_value(key))
                    except AttributeError:
                        shared.append(None)

                for nth, allele in enumerate(alleles):

                    info_parts = []
                    for key, values in zip(info_to_split, per_allele):
                        if values is not None:
                            info_parts.append('{}={}'.format(key, values[nth]))
                    for key, value in zip(info_to_keep, shared):
                        if value != False:
                            info_parts.append('{}={}'.format(key, value))
                    info_string = ';'.join(info_parts)

                    # Any genotype other than 0/0 or 0/1 is rewritten to 0/1.
                    first_gt = vcf_i.get_sample_value('GT', idx=0)
                    if first_gt not in ('0/0', '0/1'):
                        first_sample = re.sub(r'^[^:]+', '0/1', vcf_i.samples[0])
                    else:
                        first_sample = vcf_i.samples[0]

                    rebuilt = '\t'.join((
                        vcf_i.chromosome,
                        str(vcf_i.position),
                        vcf_i.identifier,
                        vcf_i.refbase,
                        allele,
                        vcf_i.qual,
                        vcf_i.filters,
                        info_string,
                        vcf_i.field,
                        first_sample,
                    ))
                    emit(vcf_i.refbase, allele, rebuilt)

            record = vcf_in.readline().rstrip()
def convert(infile, outfile, tbam, nbam):
    """Put the normal and tumor sample columns in a fixed order.

    Sample names come from the BAM headers (``genome.pysam_header``); with no
    ``nbam`` the normal name defaults to 'none'. ``##`` lines are copied, the
    ``#CHROM`` line is relabelled, sample columns of every record are
    rearranged, and records with 'N' in REF are skipped.

    Args:
        infile: input VCF path, opened with ``genome.open_textfile``.
        outfile: output VCF path, overwritten.
        tbam: tumor BAM path.
        nbam: normal BAM path, or a falsy value when there is none.
    """
    # Sample names as recorded in the BAM headers.
    normal_header = genome.pysam_header(nbam) if nbam else None
    tumor_header = genome.pysam_header(tbam)

    # When MuTect is run in a "single sample mode," the "normal" will be named "none."
    normal_names = normal_header.SM() if nbam else ['none']
    tumor_names = tumor_header.SM()

    if len(normal_names) != 1 or len(tumor_names) != 1:
        sys.stderr.write(
            'There are multiple Sample Names present in the BAM file!')

    normal_name = normal_names[0]
    tumor_name = tumor_names[0]

    assert tumor_name or normal_name

    both_named = bool(tumor_name and normal_name)

    REF_COL = 3
    FIRST_SAMPLE, SECOND_SAMPLE = 9, 10

    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        row = vcf.readline().rstrip()

        while row.startswith('#'):

            if row.startswith('##'):
                vcfout.write(row + '\n')

            elif row.startswith('#CHROM'):
                titles = row.rstrip().split('\t')

                normal_at = titles.index(normal_name)
                tumor_at = titles.index(tumor_name)

                if both_named:
                    titles[FIRST_SAMPLE] = 'NORMAL'
                    titles[SECOND_SAMPLE] = 'TUMOR'
                else:
                    # Keep up to the first sample column, then make sure it's labeled the TUMOR sample name
                    titles = titles[:FIRST_SAMPLE + 1]
                    titles[FIRST_SAMPLE] = args.tumor_sample_name

                vcfout.write('\t'.join(titles) + '\n')

            row = vcf.readline().rstrip()

        while row:

            cells = row.split('\t')

            if both_named:
                normal_cell, tumor_cell = cells[normal_at], cells[tumor_at]
                cells[FIRST_SAMPLE] = normal_cell
                cells[SECOND_SAMPLE] = tumor_cell
            else:
                cells = cells[:FIRST_SAMPLE] + [cells[tumor_at]]

            # Have to get rid of "N" in REF, because after snpSift annotation,
            # it changes the ALT and vcf-validator will complain.
            if 'N' not in cells[REF_COL]:
                vcfout.write('\t'.join(cells) + '\n')

            row = vcf.readline().rstrip()
def split_into_snv_and_indel(infile, snv_out, indel_out):
    """Split a VCF into SNV records and indel records, one ALT per record.

    Header lines are written to both outputs. For single-ALT records, SNVs
    (one-base REF and ALT) and simple indels (exactly one of REF/ALT is one
    base) are copied; records where both are longer are not written.
    Multi-ALT records (separated by ',' or, failing that, '/') are expanded
    per allele; alleles where both REF and ALT are longer than one base are
    passed to ``complex2indel.translate`` and written to the indel file only
    if the result is a simple indel sharing its first base.

    Args:
        infile: input VCF path, opened with ``genome.open_textfile``.
        snv_out: output path for SNV records.
        indel_out: output path for indel records.
    """
    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_out, \
            open(indel_out, 'w') as indel_out:

        def with_alt(item, altbase):
            # list(item) replaces the original copy(item): in this file the
            # name `copy` is bound to a two-argument file-copy function.
            item_j = list(item)
            item_j[4] = altbase
            return '\t'.join(item_j)

        line_i = vcf_in.readline().rstrip()

        while line_i.startswith('#'):
            snv_out.write(line_i + '\n')
            indel_out.write(line_i + '\n')
            line_i = vcf_in.readline().rstrip()

        while line_i:

            vcf_i = genome.Vcf_line(line_i)
            ref_is_single = len(vcf_i.refbase) == 1

            if (',' not in vcf_i.altbase) and ('/' not in vcf_i.altbase):

                if ref_is_single and len(vcf_i.altbase) == 1:
                    snv_out.write(line_i + '\n')
                elif ref_is_single or len(vcf_i.altbase) == 1:
                    indel_out.write(line_i + '\n')

            else:
                item = line_i.split('\t')

                # At least one of the two separators is present in this branch;
                # ',' takes precedence, as before.
                separator = ',' if ',' in vcf_i.altbase else '/'

                for altbase_i in vcf_i.altbase.split(separator):

                    if ref_is_single and len(altbase_i) == 1:
                        snv_out.write(with_alt(item, altbase_i) + '\n')

                    elif ref_is_single or len(altbase_i) == 1:
                        indel_out.write(with_alt(item, altbase_i) + '\n')

                    else:
                        complex_variant = complex2indel.translate(
                            vcf_i.refbase, altbase_i)

                        if complex_variant:
                            (new_ref, new_alt), offset = complex_variant

                            if new_ref[0] == new_alt[0] and (
                                    len(new_ref) == 1 or len(new_alt) == 1):

                                item_j = list(item)
                                item_j[3] = new_ref
                                item_j[4] = new_alt

                                # This *may* cause the output VCF file to go out of order
                                if offset != 0:
                                    item_j[1] = str(int(item[1]) + offset)

                                indel_out.write('\t'.join(item_j) + '\n')

            line_i = vcf_in.readline().rstrip()
# Command-line interface: input VCF, BAM, reference FASTA, output VCF and a
# phasing distance threshold (integer, default 1).
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-infile', '--input-vcf-file', type=str, help='Input VCF file', required=True)
parser.add_argument('-bam', '--bam-file', type=str, help='BAM file', required=True)
parser.add_argument('-ref', '--genome-reference', type=str, help='.fasta file to get the ref base', required=True, default=None)
parser.add_argument('-outfile', '--output-vcf-file', type=str, help='Output VCF file', required=True)
parser.add_argument('-threshold', '--phasing-threshold', type=int, help='How far apart do we try to phase', required=False, default=1)

args = parser.parse_args()

infile = args.input_vcf_file
bam = args.bam_file
ref_fa = args.genome_reference
outfile = args.output_vcf_file
threshold = args.phasing_threshold

# Note: inside this `with`, the names infile, bam, outfile and ref_fa are
# rebound from path strings to the opened handles.
with genome.open_textfile(infile) as infile, \
     pysam.AlignmentFile(bam) as bam, \
     open(outfile, 'w') as outfile, \
     pysam.FastaFile(ref_fa) as ref_fa:

    my_line = infile.readline().rstrip()

    # Copy the '##' meta-information lines unchanged.
    while my_line.startswith('##'):
        outfile.write( my_line + '\n' )
        my_line = infile.readline().rstrip()

    # This is to read through and copy the #CHROM line
    assert my_line.startswith('#CHROM')

    # Two extra INFO definitions are inserted just before the #CHROM line.
    outfile.write('##INFO=<ID=COORDINATES,Number=.,Type=Integer,Description="Coordinates of the bases">\n')
    outfile.write('##INFO=<ID=PDP,Number=.,Type=Integer,Description="Phased DP, one for reference, and each of the variant calls.">\n')
    outfile.write( my_line + '\n' )
def vcfs2variants(vcf_files, bam_files, sample_names):
    """Collect per-sample variant information from paired VCF and BAM files.

    Args:
        vcf_files: VCF paths, opened with ``genome.open_textfile``.
        bam_files: BAM paths, opened with ``pysam.AlignmentFile``; one per VCF.
        sample_names: label for each VCF/BAM pair.

    Returns:
        A dict keyed by ``(contig, position, ref, alt)``. Each value holds the
        'GENES', 'AAChange', 'TRANSCRIPT' and 'DATABASE' annotations recorded
        the first time the variant is seen, plus one entry per sample name
        with 'FILTER', 'VAF', 'VDP' and 'DP'. VAF is ``math.nan`` when the
        total depth is zero.
    """
    assert len(vcf_files) == len(sample_names) == len(bam_files)

    variantDict = {}

    for vcf_path, bam_path, sample in zip(vcf_files, bam_files, sample_names):

        with genome.open_textfile(vcf_path) as vcf, pysam.AlignmentFile(bam_path) as bam:

            record = vcf.readline().rstrip()

            # Skip the header
            while record.startswith('#'):
                record = vcf.readline().rstrip()

            while record:

                # The parsed object is not used below; the call is retained
                # as in the original.
                genome.Vcf_line(record)

                columns = record.split('\t')
                contig = columns[0]
                position = int(columns[1])
                ref = columns[3]
                alt = columns[4]
                filters = columns[6].split(';')

                genes, amino_acid_changes, txn_ids = extract_snpEff(record)
                dbsnp_cosmic_ids = extract_dbsnp_cosmic(record)

                key = (contig, position, ref, alt)

                vdp, _rdp, _odp, totaldp = vaf_from_bam(
                    bam, (contig, position), ref, alt, 1)

                try:
                    vaf = vdp / totaldp
                except ZeroDivisionError:
                    vaf = math.nan

                if key not in variantDict:
                    variantDict[key] = {
                        'GENES': genes,
                        'AAChange': amino_acid_changes,
                        'TRANSCRIPT': txn_ids,
                        'DATABASE': dbsnp_cosmic_ids,
                    }

                variantDict[key][sample] = {
                    'FILTER': filters,
                    'VAF': vaf,
                    'VDP': vdp,
                    'DP': totaldp,
                }

                record = vcf.readline().rstrip()

    return variantDict
def _clean_alt_alleles(refbase, altbase):
    """Return a comma-separated ALT string with problem alleles removed.

    Duplicate alleles are dropped (first occurrence kept). If more than one
    allele remains, an allele equal to REF is removed, and every allele after
    the first that does not start with REF is removed.
    """
    alleles = []
    for allele in altbase.split(','):
        # Compare whole alleles: the earlier regex r'(\w+),\1' also matched
        # inside alleles, e.g. it turned 'A,AT' into 'AT'.
        if allele not in alleles:
            alleles.append(allele)

    if len(alleles) > 1:
        if refbase in alleles:
            alleles.remove(refbase)

        # Filter in one pass. The earlier code popped by index while
        # enumerating a snapshot, so every pop after the first hit the wrong
        # element or ran off the end of the list.
        alleles = alleles[:1] + [
            allele for allele in alleles[1:] if allele.startswith(refbase)
        ]

    return ','.join(alleles)


def convert(infile, outfile):
    """Copy a VCF while repairing REF/ALT and merging RD into AD.

    Header lines are copied, with the DP4 and AD FORMAT definitions replaced
    by fixed text. For each record: '/' in ALT becomes ','; REF is cut at the
    first non-word character; non-compliant characters are stripped from
    ALT; duplicate, REF-matching and REF-mismatching ALT alleles are removed;
    and, when FORMAT has both AD and RD, AD is rewritten as "RD,AD" and RD is
    dropped for the first sample (and the second, when there are two or
    more). Records whose REF contains anything other than G, C, T, A or U
    (case-insensitive) are not written.

    Args:
        infile: input VCF path, opened with ``genome.open_textfile``.
        outfile: output VCF path, overwritten.

    Raises:
        Exception: if a record has no sample columns.
    """
    with genome.open_textfile(infile) as vcf, open(outfile, 'w') as vcfout:

        line_i = vcf.readline().rstrip()

        # Header lines: copy, replacing two FORMAT definitions.
        while line_i.startswith('#'):

            if line_i.startswith('##FORMAT=<ID=DP4,'):
                line_i = '##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">'
            elif line_i.startswith('##FORMAT=<ID=AD,'):
                line_i = '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'

            vcfout.write(line_i + '\n')
            line_i = vcf.readline().rstrip()

        # Doing the work here:
        while line_i:

            vcf_i = genome.Vcf_line(line_i)

            num_samples = len(vcf_i.samples)
            if num_samples == 1:
                paired = False
            elif num_samples == 2:
                paired = True
            elif num_samples > 2:
                # Newline added so repeated warnings do not run together.
                sys.stderr.write('We found more than 2 sammples in this VCF file. It may be messed up, but I\'ll just assume the first 2 samples mean anything at all\n')
                paired = True
            elif num_samples == 0:
                raise Exception('No sample information here.')

            # Replace the wrong "G/A" with the correct "G,A" in ALT column:
            vcf_i.altbase = vcf_i.altbase.replace('/', ',')

            # vcf-validator is not going to accept multiple sequences in the REF,
            # as is the case in VarScan2's indel output:
            vcf_i.refbase = re.sub(r'[^\w].*$', '', vcf_i.refbase)

            # Get rid of non-compliant characters in the ALT column:
            vcf_i.altbase = re.sub(r'[^\w,.]', '', vcf_i.altbase)

            # Eliminate duplicate entries, entries matching REF, and entries whose
            # first base does not match the reference (vcf-validator complaints):
            vcf_i.altbase = _clean_alt_alleles(vcf_i.refbase, vcf_i.altbase)

            # Combine AD:RD into AD:
            format_items = vcf_i.get_sample_variable()
            if 'AD' in format_items and 'RD' in format_items:

                rd_sm1 = vcf_i.get_sample_value('RD', 0)
                ad_sm1 = vcf_i.get_sample_value('AD', 0)

                try:
                    rd_sm2 = vcf_i.get_sample_value('RD', 1)
                    ad_sm2 = vcf_i.get_sample_value('AD', 1)
                except IndexError:
                    rd_sm2 = ad_sm2 = 0

                idx_ad = format_items.index('AD')
                idx_rd = format_items.index('RD')
                format_items.pop(idx_rd)
                vcf_i.field = ':'.join(format_items)

                item_normal = vcf_i.samples[0].split(':')
                item_normal[idx_ad] = '{},{}'.format(rd_sm1, ad_sm1)
                item_normal.pop(idx_rd)
                vcf_i.samples[0] = ':'.join(item_normal)

                if paired:
                    item_tumor = vcf_i.samples[1].split(':')
                    item_tumor[idx_ad] = '{},{}'.format(rd_sm2, ad_sm2)
                    item_tumor.pop(idx_rd)
                    vcf_i.samples[1] = ':'.join(item_tumor)

            # Reform the line:
            line_i = '\t'.join((
                vcf_i.chromosome,
                str(vcf_i.position),
                vcf_i.identifier,
                vcf_i.refbase,
                vcf_i.altbase,
                vcf_i.qual,
                vcf_i.filters,
                vcf_i.info,
                vcf_i.field,
                '\t'.join(vcf_i.samples),
            ))

            # VarScan2 output a line with REF allele as "M". GATK CombineVariants complain about that.
            if not re.search(r'[^GCTAU]', vcf_i.refbase, re.I):
                vcfout.write(line_i + '\n')

            # Next line:
            line_i = vcf.readline().rstrip()
# Command-line options: input/output VCF, the caller-combination string, the
# tumor sample name, and whether QUAL carries a trained score.
parser.add_argument('-infile', '--vcf-in', type=str, help='VCF in', required=True)
parser.add_argument('-outfile', '--vcf-out', type=str, help='VCF out', required=True)
parser.add_argument('-callers', '--callers-classification-string', type=str, help='MVJSD or whatever', required=True)
parser.add_argument('-tumor', '--tumor-sample-name', type=str, help='tumor sample name', required=False, default='TUMOR')
parser.add_argument('-trained', '--somaticseq-trained', action='store_true', help='If true, will use the QUAL as SomaticSeq score. Otherwise, SCORE will be .', required=False, default=False)

args = parser.parse_args()

vcf_in_fn = args.vcf_in
vcf_out_fn = args.vcf_out
caller_string = args.callers_classification_string
tumor = args.tumor_sample_name
somaticseq_trained = args.somaticseq_trained

with genome.open_textfile(vcf_in_fn) as vcfin, open(vcf_out_fn, 'w') as vcfout:

    line_in = vcfin.readline().rstrip('\n')

    while line_in.startswith('##'):

        if line_in.startswith('##SomaticSeq='):
            # Tag the SomaticSeq header line.
            line_out = line_in + '-SEQC2'
        elif line_in.startswith('##INFO=<ID=NUM_TOOLS') or line_in.startswith('##INFO=<ID={COMBO}'.format(COMBO=caller_string)):
            # These two INFO definitions are re-declared as FORMAT definitions.
            line_out = re.sub('##INFO=', '##FORMAT=', line_in)
        else:
            line_out = line_in

        vcfout.write( line_out + '\n' )

        # Advance to the next header line. Without this read the loop never
        # terminates and keeps writing the same line.
        line_in = vcfin.readline().rstrip('\n')
def convert(infile, snv_out, indel_out):
    """Separate a single-sample VCF into SNV and simple-indel files.

    Header lines are copied to both outputs and a lone 'germline_risk'
    filter becomes 'PASS'. Multi-allelic records are expanded to one record
    per ALT allele with only the first sample column kept; INFO is rebuilt
    from NLOD/TLOD (value for that allele) and STR/ECNT (unchanged). SNVs
    have one-base REF and ALT; indels have exactly one of them one base
    long. Records where both are longer than one base are not written.

    Args:
        infile: input VCF path, opened with ``genome.open_textfile``.
        snv_out: output path for SNV records.
        indel_out: output path for indel records.
    """
    info_to_split = 'NLOD', 'TLOD'
    info_to_keep = 'STR', 'ECNT'

    with genome.open_textfile(infile) as source, \
            open(snv_out, 'w') as snv_file, \
            open(indel_out, 'w') as indel_file:

        def destination(ref, alt):
            # Returns the handle a REF/ALT pair belongs to, or None for
            # variants that are neither an SNV nor a simple indel.
            if len(ref) == 1 and len(alt) == 1:
                return snv_file
            if len(ref) == 1 or len(alt) == 1:
                return indel_file
            return None

        def lookup(vcf_line, key, split):
            # Missing values surface as AttributeError and are recorded as None.
            try:
                value = vcf_line.get_info_value(key)
                return value.split(',') if split else value
            except AttributeError:
                return None

        current = source.readline().rstrip()

        while current.startswith('##'):
            for handle in (snv_file, indel_file):
                handle.write(current + '\n')
            current = source.readline().rstrip()

        # This line will be #CHROM:
        for handle in (snv_file, indel_file):
            handle.write(current + '\n')

        # This will be the first variant line:
        current = source.readline().rstrip()

        while current:

            vcf_i = genome.Vcf_line(current)

            # If "germlinerisk" is the only flag, then make it PASS since there is no matched normal
            if vcf_i.filters == 'germline_risk':
                vcf_i.filters = 'PASS'

            if ',' in vcf_i.altbase:

                alt_list = vcf_i.altbase.split(',')
                split_measures = [lookup(vcf_i, key, True) for key in info_to_split]
                kept_measures = [lookup(vcf_i, key, False) for key in info_to_keep]

                for position, one_alt in enumerate(alt_list):

                    pieces = [
                        '{}={}'.format(name, values[position])
                        for name, values in zip(info_to_split, split_measures)
                        if values is not None
                    ]
                    pieces += [
                        '{}={}'.format(name, value)
                        for name, value in zip(info_to_keep, kept_measures)
                        if value != False
                    ]
                    info_string = ';'.join(pieces)

                    # Any genotype other than 0/0 or 0/1 is rewritten to 0/1.
                    genotype = vcf_i.get_sample_value('GT', idx=0)
                    if genotype == '0/0' or genotype == '0/1':
                        sample_text = vcf_i.samples[0]
                    else:
                        sample_text = re.sub(r'^[^:]+', '0/1', vcf_i.samples[0])

                    out_text = '\t'.join((
                        vcf_i.chromosome,
                        str(vcf_i.position),
                        vcf_i.identifier,
                        vcf_i.refbase,
                        one_alt,
                        vcf_i.qual,
                        vcf_i.filters,
                        info_string,
                        vcf_i.field,
                        sample_text,
                    ))

                    target = destination(vcf_i.refbase, one_alt)
                    if target is not None:
                        target.write(out_text + '\n')

            else:
                parts = current.split('\t')
                if parts[6] == 'germline_risk':
                    parts[6] = 'PASS'
                out_text = '\t'.join(parts)

                target = destination(vcf_i.refbase, vcf_i.altbase)
                if target is not None:
                    target.write(out_text + '\n')

            current = source.readline().rstrip()
# Unpack the filtering thresholds from the parsed command-line arguments.
min_refMQ = args.min_refMQ
min_altMQ = args.min_altMQ
min_refBQ = args.min_refBQ
min_altBQ = args.min_altBQ
max_refNM = args.max_refNM
max_altNM = args.max_altNM
max_fetSB = args.max_fetSB
max_fetCD = args.max_fetCD
max_zMQ = args.max_zMQ
max_zBQ = args.max_zBQ
max_MQ0 = args.max_MQ0
min_VAF = args.min_VAF
min_DP = args.min_DP
min_varDP = args.min_varDP

with genome.open_textfile(infile) as vcf_in, open(outfile, 'w') as vcf_out:

    line_i = vcf_in.readline().rstrip()

    # Copy the '##' meta-information lines unchanged.
    while line_i.startswith('##'):
        vcf_out.write(line_i + '\n')
        line_i = vcf_in.readline().rstrip()

    vcf_out.write(line_i + '\n')

    # This line will be #CHROM:
    header = line_i.split('\t')

    # Index of the chosen sample among the sample columns only: the nine
    # fixed VCF columns are subtracted from its position in the #CHROM line.
    sample_index = header.index(sample) - 9

    # This will be the first variant line:
def vcf2tsv(is_vcf=None,
            is_bed=None,
            is_pos=None,
            bam_fn=None,
            truth=None,
            cosmic=None,
            dbsnp=None,
            mutect=None,
            varscan=None,
            vardict=None,
            lofreq=None,
            scalpel=None,
            strelka=None,
            dedup=True,
            min_mq=1,
            min_bq=5,
            min_caller=0,
            ref_fa=None,
            p_scale=None,
            outfile=None):
    """Write one TSV row of caller and BAM features per candidate variant.

    Sites come from the first truthy one of ``is_vcf``, ``is_bed`` and
    ``is_pos``; if none is given, the reference ``.fai`` index is used. For
    each site the caller VCFs, truth set, dbSNP and COSMIC files are advanced
    to the same coordinate. Features are extracted from the BAM and the
    reference, and a row is written using the module-level ``out_header``
    template whenever at least ``min_caller`` callers report the variant.

    Args:
        is_vcf, is_bed, is_pos: path of the site list in the given format.
        bam_fn: BAM path, opened with ``pysam.AlignmentFile``.
        truth, cosmic, dbsnp: optional VCF paths used for annotation.
        mutect, varscan, vardict, lofreq, scalpel, strelka: optional caller
            VCF paths.
        dedup: accepted but not used in this function.
        min_mq, min_bq: passed to ``sequencing_features.from_bam``.
        min_caller: minimum summed caller classifications for output.
        ref_fa: reference FASTA path; ``ref_fa + '.fai'`` must exist.
        p_scale: 'phred' or 'fraction' (case-insensitive); anything else
            disables re-scaling.
        outfile: output TSV path, overwritten.

    Raises:
        Exception: if the input VCF is not sorted according to the ``.fai``.
    """
    # Local imports: in this file the module-level name `copy` is bound to a
    # two-argument file-copy helper, so take the stdlib shallow copy explicitly.
    from contextlib import ExitStack
    from copy import copy

    # Convert contig_sequence to chrom_seq dict:
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not:
    if p_scale is None:
        logger.info('NO RE-SCALING')
    elif p_scale.lower() == 'phred':
        p_scale = 'phred'
    elif p_scale.lower() == 'fraction':
        p_scale = 'fraction'
    else:
        p_scale = None
        logger.info('NO RE-SCALING')

    nan = float('nan')

    ## Running
    with genome.open_textfile(mysites) as my_sites, \
            open(outfile, 'w') as outhandle, \
            ExitStack() as opened_files:

        def keep_open(handle):
            # Register the handle so it is closed on exit, including on error.
            # Previously these handles were closed only on the success path.
            opened_files.callback(handle.close)
            return handle

        my_line = my_sites.readline().rstrip()

        bam = keep_open(pysam.AlignmentFile(bam_fn, reference_filename=ref_fa))
        ref_fa = keep_open(pysam.FastaFile(ref_fa))

        if truth:
            truth = keep_open(genome.open_textfile(truth))
            truth_line = genome.skip_vcf_header(truth)

        if cosmic:
            cosmic = keep_open(genome.open_textfile(cosmic))
            cosmic_line = genome.skip_vcf_header(cosmic)

        if dbsnp:
            dbsnp = keep_open(genome.open_textfile(dbsnp))
            dbsnp_line = genome.skip_vcf_header(dbsnp)

        # 6 Incorporate callers: get thru the #'s
        if mutect:
            mutect = keep_open(genome.open_textfile(mutect))
            mutect_line = genome.skip_vcf_header(mutect)

        if varscan:
            varscan = keep_open(genome.open_textfile(varscan))
            varscan_line = genome.skip_vcf_header(varscan)

        if vardict:
            vardict = keep_open(genome.open_textfile(vardict))
            vardict_line = genome.skip_vcf_header(vardict)

        if lofreq:
            lofreq = keep_open(genome.open_textfile(lofreq))
            lofreq_line = genome.skip_vcf_header(lofreq)

        if scalpel:
            scalpel = keep_open(genome.open_textfile(scalpel))
            scalpel_line = genome.skip_vcf_header(scalpel)

        if strelka:
            strelka = keep_open(genome.open_textfile(strelka))
            strelka_line = genome.skip_vcf_header(strelka)

        # Get through all the headers:
        while my_line.startswith('#') or my_line.startswith('track='):
            my_line = my_sites.readline().rstrip()

        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match(genome.pattern_chr_position, my_line)
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # First line:
        outhandle.write(out_header.replace('{', '').replace('}', '') + '\n')

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:

                my_vcf = genome.Vcf_line(my_line)
                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                variants_at_my_coordinate = []

                alt_bases = my_vcf.altbase.split(',')
                for alt_i in alt_bases:
                    vcf_i = copy(my_vcf)
                    vcf_i.altbase = alt_i
                    variants_at_my_coordinate.append(vcf_i)

                # As long as the "coordinate" stays the same, it will keep reading until it's different.
                while my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):

                    my_line = my_sites.readline().rstrip()
                    my_vcf = genome.Vcf_line(my_line)

                    ########## This block is code is to ensure the input VCF file is properly sorted ##
                    coordinate_j = re.match(genome.pattern_chr_position, my_line)
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j, chrom_seq) == 1:
                        raise Exception(
                            '{} does not seem to be properly sorted.'.format(mysites))

                    coordinate_i = coordinate_j
                    ###################################################################################

                    if my_coordinates[0] == (my_vcf.chromosome, my_vcf.position):

                        alt_bases = my_vcf.altbase.split(',')
                        for alt_i in alt_bases:
                            vcf_i = copy(my_vcf)
                            vcf_i.altbase = alt_i
                            variants_at_my_coordinate.append(vcf_i)

            elif is_bed:
                bed_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(
                    bed_item[0], int(bed_item[1]) + 1, int(bed_item[2]))

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(
                    pos_item[0], int(pos_item[1]), int(pos_item[1]))

            elif fai_file:
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(
                    fai_item[0], 1, int(fai_item[1]))

            ##### ##### ##### ##### ##### #####
            for my_coordinate in my_coordinates:

                ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                if is_vcf:

                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []

                    for variant_i in variants_at_my_coordinate:

                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]
                        indel_length = len(first_alt) - len(ref_base)

                        ref_bases.append(ref_base)
                        alt_bases.append(first_alt)
                        indel_lengths.append(indel_length)

                        # Extract these information if they exist in the VCF file, but they could be
                        # re-written if dbSNP/COSMIC are supplied.
                        # NOTE(review): these four values are overwritten on every
                        # iteration, so all calls at this coordinate see the values of
                        # the last variant - confirm that is intended.
                        if_dbsnp = 1 if re.search(r'rs[0-9]+', variant_i.identifier) else 0
                        if_cosmic = 1 if re.search(r'COS[MN][0-9]+', variant_i.identifier) else 0
                        if_common = 1 if variant_i.get_info_value('COMMON') == '1' else 0
                        num_cases = variant_i.get_info_value('CNT') if variant_i.get_info_value('CNT') else nan

                        if variant_i.identifier == '.':
                            my_identifier_i = set()
                        else:
                            my_identifier_i = variant_i.identifier.split(';')
                            my_identifier_i = set(my_identifier_i)

                        all_my_identifiers.append(my_identifier_i)

                ## If not, 1) get ref_base, first_alt from other VCF files.
                #          2) Create placeholders for dbSNP and COSMIC that can be overwritten with
                #             dbSNP/COSMIC VCF files (if provided)
                else:
                    variants_at_my_coordinate = [None]  # Just to have something to iterate
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                #################################### Find the same coordinate in those VCF files ####################################
                if mutect:
                    got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(
                        my_coordinate, mutect_line, mutect, chrom_seq)
                if varscan:
                    got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(
                        my_coordinate, varscan_line, varscan, chrom_seq)
                if vardict:
                    got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(
                        my_coordinate, vardict_line, vardict, chrom_seq)
                if lofreq:
                    got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(
                        my_coordinate, lofreq_line, lofreq, chrom_seq)
                if scalpel:
                    got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(
                        my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:
                    got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(
                        my_coordinate, strelka_line, strelka, chrom_seq)
                if truth:
                    got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate(
                        my_coordinate, truth_line, truth, chrom_seq)
                if dbsnp:
                    got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(
                        my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                if cosmic:
                    got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(
                        my_coordinate, cosmic_line, cosmic, chrom_seq)

                # Now, use pysam to look into the tBAM file(s), variant by variant from the input:
                for ith_call, my_call in enumerate(variants_at_my_coordinate):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ((my_call.chromosome, my_call.position),
                                      my_call.refbase, my_call.altbase)

                        ref_base = ref_bases[ith_call]
                        first_alt = alt_bases[ith_call]
                        indel_length = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]

                    else:
                        variant_id = ((my_coordinate[0], my_coordinate[1]),
                                      ref_base, first_alt)
                        # No ID column for BED/position/.fai input: start from an empty
                        # set so the annotation steps below do not hit an unbound name.
                        my_identifiers = set()

                    # Reset num_caller to 0 for each variant in the same coordinate
                    num_callers = 0

                    #################### Collect Caller Vcf ####################:
                    if mutect:
                        mutect_classification, tlod, ecnt = annotate_caller.ssMuTect(
                            variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = tlod = ecnt = nan

                    if varscan:
                        varscan_classification, score_varscan2 = annotate_caller.ssVarScan(
                            variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = score_varscan2 = nan

                    if vardict:
                        vardict_classification, msi, msilen, shift3, t_pmean, t_pstd, t_qstd = annotate_caller.ssVarDict(
                            variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = t_pmean = t_pstd = t_qstd = nan

                    if lofreq:
                        lofreq_classification = annotate_caller.ssLoFreq(
                            variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan

                    if scalpel:
                        scalpel_classification = annotate_caller.ssScalpel(
                            variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan

                    if strelka:
                        strelka_classification = annotate_caller.ssStrelka(
                            variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = nan

                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        ########## Ground truth file ##########
                        if truth:
                            if variant_id in truth_variants.keys():
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan

                        ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(
                                variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add(ID_i)

                        ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(
                                variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add(ID_i)

                        ########## ######### INFO EXTRACTION FROM BAM FILES ########## #########
                        # Tumor tBAM file:
                        tBamFeatures = sequencing_features.from_bam(
                            bam, my_coordinate, ref_base, first_alt, min_mq, min_bq)

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(
                            ref_fa, my_coordinate, ref_base, first_alt)

                        # Linguistic sequence complexity in a +/-80bp window, but substring
                        # calculation stops at 20-bp substring.
                        seq_span_80bp = ref_fa.fetch(
                            my_coordinate[0],
                            max(0, my_coordinate[1] - 41),
                            my_coordinate[1] + 40)
                        seq_left_80bp = ref_fa.fetch(
                            my_coordinate[0],
                            max(0, my_coordinate[1] - 81),
                            my_coordinate[1])
                        seq_right_80bp = ref_fa.fetch(
                            my_coordinate[0],
                            my_coordinate[1],
                            my_coordinate[1] + 81)

                        if len(seq_span_80bp) > 20:
                            LC_spanning = sequencing_features.subLC(seq_span_80bp, 20)
                        else:
                            LC_spanning = math.nan

                        if len(seq_left_80bp) > 20:
                            left_LC = sequencing_features.subLC(seq_left_80bp, 20)
                        else:
                            left_LC = math.nan

                        if len(seq_right_80bp) > 20:
                            right_LC = sequencing_features.subLC(seq_right_80bp, 20)
                        else:
                            right_LC = math.nan

                        LC_adjacent = min(left_LC, right_LC)

                        LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40)
                        LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40)

                        # Fill the ID field of the TSV/VCF
                        my_identifiers = ';'.join(my_identifiers) if my_identifiers else '.'

                        ###
                        out_line = out_header.format(
                            CHROM=my_coordinate[0],
                            POS=my_coordinate[1],
                            ID=my_identifiers,
                            REF=ref_base,
                            ALT=first_alt,
                            if_MuTect=mutect_classification,
                            if_Strelka=strelka_classification,
                            if_VarScan2=varscan_classification,
                            if_VarDict=vardict_classification,
                            if_LoFreq=lofreq_classification,
                            if_Scalpel=scalpel_classification,
                            VarScan2_Score=rescale(score_varscan2, 'phred', p_scale, 1001),
                            if_dbsnp=if_dbsnp,
                            COMMON=if_common,
                            if_COSMIC=if_cosmic,
                            COSMIC_CNT=num_cases,
                            Consistent_Mates=tBamFeatures['consistent_mates'],
                            Inconsistent_Mates=tBamFeatures['inconsistent_mates'],
                            Seq_Complexity_Span=LC_spanning_phred,
                            Seq_Complexity_Adj=LC_adjacent_phred,
                            M2_TLOD=tlod,
                            M2_ECNT=ecnt,
                            MSI=msi,
                            MSILEN=msilen,
                            SHIFT3=shift3,
                            MaxHomopolymer_Length=homopolymer_length,
                            SiteHomopolymer_Length=site_homopolymer_length,
                            T_DP=tBamFeatures['dp'],
                            tBAM_REF_MQ='%g' % tBamFeatures['ref_mq'],
                            tBAM_ALT_MQ='%g' % tBamFeatures['alt_mq'],
                            tBAM_p_MannWhitneyU_MQ='%g' % tBamFeatures['p_mannwhitneyu_mq'],
                            tBAM_REF_BQ='%g' % tBamFeatures['ref_bq'],
                            tBAM_ALT_BQ='%g' % tBamFeatures['alt_bq'],
                            tBAM_p_MannWhitneyU_BQ='%g' % tBamFeatures['p_mannwhitneyu_bq'],
                            tBAM_REF_NM='%g' % tBamFeatures['ref_NM'],
                            tBAM_ALT_NM='%g' % tBamFeatures['alt_NM'],
                            tBAM_NM_Diff='%g' % tBamFeatures['NM_Diff'],
                            tBAM_REF_Concordant=tBamFeatures['ref_concordant_reads'],
                            tBAM_REF_Discordant=tBamFeatures['ref_discordant_reads'],
                            tBAM_ALT_Concordant=tBamFeatures['alt_concordant_reads'],
                            tBAM_ALT_Discordant=tBamFeatures['alt_discordant_reads'],
                            tBAM_Concordance_FET=rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001),
                            T_REF_FOR=tBamFeatures['ref_for'],
                            T_REF_REV=tBamFeatures['ref_rev'],
                            T_ALT_FOR=tBamFeatures['alt_for'],
                            T_ALT_REV=tBamFeatures['alt_rev'],
                            tBAM_StrandBias_FET=rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),
                            tBAM_p_MannWhitneyU_EndPos='%g' % tBamFeatures['p_mannwhitneyu_endpos'],
                            tBAM_REF_Clipped_Reads=tBamFeatures['ref_SC_reads'],
                            tBAM_ALT_Clipped_Reads=tBamFeatures['alt_SC_reads'],
                            tBAM_Clipping_FET=rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),
                            tBAM_MQ0=tBamFeatures['MQ0'],
                            tBAM_Other_Reads=tBamFeatures['noise_read_count'],
                            tBAM_Poor_Reads=tBamFeatures['poor_read_count'],
                            tBAM_REF_InDel_3bp=tBamFeatures['ref_indel_3bp'],
                            tBAM_REF_InDel_2bp=tBamFeatures['ref_indel_2bp'],
                            tBAM_REF_InDel_1bp=tBamFeatures['ref_indel_1bp'],
                            tBAM_ALT_InDel_3bp=tBamFeatures['alt_indel_3bp'],
                            tBAM_ALT_InDel_2bp=tBamFeatures['alt_indel_2bp'],
                            tBAM_ALT_InDel_1bp=tBamFeatures['alt_indel_1bp'],
                            InDel_Length=indel_length,
                            TrueVariant_or_False=judgement,
                        )

                        # Write the row to the output file:
                        outhandle.write(out_line + '\n')

            # Read into the next line (VCF input was already advanced above):
            if not is_vcf:
                my_line = my_sites.readline().rstrip()
def convert(infile, snv_out, indel_out, is_tnscope):
    """Split a two-sample VCF into separate SNV and indel VCF files.

    Header lines are copied to both outputs (the ``SOR`` INFO definition has
    ``Float`` replaced by ``String``).  Records with a single ALT allele are
    routed by REF/ALT length; records with several comma-separated ALT alleles
    are expanded into one record per allele.

    Args:
        infile: Path of the input VCF, opened with ``genome.open_textfile``.
        snv_out: Path of the SNV output VCF to write.
        indel_out: Path of the indel output VCF to write.
        is_tnscope: When true, the ``##normal_sample=`` / ``##tumor_sample=``
            header lines are not required.

    Raises:
        ValueError: If ``is_tnscope`` is false and a declared sample name is
            not present in the ``#CHROM`` line.
    """
    info_to_split = 'NLOD', 'TLOD'
    info_to_keep = 'STR', 'ECNT'

    # Distinct handle names: the original rebound the path parameters.
    with genome.open_textfile(infile) as vcf_in, \
            open(snv_out, 'w') as snv_fh, \
            open(indel_out, 'w') as indel_fh:

        line_i = vcf_in.readline().rstrip()
        while line_i.startswith('##'):
            if line_i.startswith('##normal_sample='):
                normal_name = line_i.split('=')[1]
            if line_i.startswith('##tumor_sample='):
                tumor_name = line_i.split('=')[1]
            if line_i.startswith('##INFO=<ID=SOR,'):
                line_i = re.sub(r'Float', 'String', line_i)

            snv_fh.write(line_i + '\n')
            indel_fh.write(line_i + '\n')
            line_i = vcf_in.readline().rstrip()

        # This line will be #CHROM:
        snv_fh.write(line_i + '\n')
        indel_fh.write(line_i + '\n')
        header = line_i.split('\t')

        if is_tnscope:
            # Doesn't matter which one is normal/tumor. These information are not used.
            normal_index, tumor_index = 1, 0
        else:
            # The indices are not used below; the lookups are kept because
            # they fail loudly when the declared samples are absent.
            normal_index = header.index(normal_name) - 9
            tumor_index = header.index(tumor_name) - 9

        # This will be the first variant line:
        line_i = vcf_in.readline().rstrip()
        while line_i:
            vcf_i = genome.Vcf_line(line_i)

            if ',' not in vcf_i.altbase:
                if len(vcf_i.refbase) == 1 and len(vcf_i.altbase) == 1:
                    snv_fh.write(line_i + '\n')
                elif len(vcf_i.refbase) == 1 or len(vcf_i.altbase) == 1:
                    indel_fh.write(line_i + '\n')
            else:
                alt_bases = vcf_i.altbase.split(',')

                # Per-allele INFO values; None marks a key that could not be
                # split (get_info_value did not return a string).
                measures = []
                for measure_i in info_to_split:
                    try:
                        measures.append(
                            vcf_i.get_info_value(measure_i).split(','))
                    except AttributeError:
                        measures.append(None)

                # INFO values repeated unchanged on every expanded record.
                still_measures = []
                for measure_i in info_to_keep:
                    try:
                        still_measures.append(vcf_i.get_info_value(measure_i))
                    except AttributeError:
                        still_measures.append(None)

                # Sample columns do not depend on the ALT allele, so they are
                # normalised once per record.  Any genotype other than 0/0 or
                # 0/1 is rewritten to 0/1.
                GT0 = vcf_i.get_sample_value('GT', idx=0)
                if GT0 != '0/0' and GT0 != '0/1':
                    sample_0 = re.sub(r'^[^:]+', '0/1', vcf_i.samples[0])
                else:
                    sample_0 = vcf_i.samples[0]

                GT1 = vcf_i.get_sample_value('GT', idx=1)
                # Fixed: the second comparison previously tested GT0.
                if GT1 != '0/0' and GT1 != '0/1':
                    sample_1 = re.sub(r'^[^:]+', '0/1', vcf_i.samples[1])
                else:
                    sample_1 = vcf_i.samples[1]

                for ith_base, altbase_i in enumerate(alt_bases):
                    split_infos = [
                        '{}={}'.format(info_variable, info_value[ith_base])
                        for info_variable, info_value in zip(
                            info_to_split, measures)
                        if info_value is not None
                    ]
                    # presumably get_info_value returns False for an absent
                    # key -- verify against genome.Vcf_line.  None (the
                    # placeholder appended above) is skipped as well so that
                    # "KEY=None" is never written.
                    still_infos = [
                        '{}={}'.format(info_variable, info_value)
                        for info_variable, info_value in zip(
                            info_to_keep, still_measures)
                        if info_value is not None and info_value != False
                    ]
                    info_string = ';'.join(split_infos + still_infos)

                    new_line = '\t'.join(
                        (vcf_i.chromosome, str(vcf_i.position),
                         vcf_i.identifier, vcf_i.refbase, altbase_i,
                         vcf_i.qual, vcf_i.filters, info_string, vcf_i.field,
                         sample_0, sample_1))

                    if len(vcf_i.refbase) == 1 and len(altbase_i) == 1:
                        snv_fh.write(new_line + '\n')
                    elif len(vcf_i.refbase) == 1 or len(altbase_i) == 1:
                        indel_fh.write(new_line + '\n')

            line_i = vcf_in.readline().rstrip()
def vcf2tsv(is_vcf=None,
            is_bed=None,
            is_pos=None,
            nbam_fn=None,
            tbam_fn=None,
            truth=None,
            cosmic=None,
            dbsnp=None,
            mutect=None,
            varscan=None,
            jsm=None,
            sniper=None,
            vardict=None,
            muse=None,
            lofreq=None,
            scalpel=None,
            strelka=None,
            tnscope=None,
            platypus=None,
            dedup=True,
            min_mq=1,
            min_bq=5,
            min_caller=0,
            ref_fa=None,
            p_scale=None,
            outfile=None):
    """Write one TSV row of caller and BAM-derived features per candidate site.

    Sites come from the first of ``is_vcf``, ``is_bed``, ``is_pos`` that is
    set; if none is set, the ``.fai`` index next to ``ref_fa`` is used and
    every position of every contig is evaluated.  For each site the supplied
    caller VCFs, the optional truth/dbSNP/COSMIC VCFs and the normal and tumor
    BAM files are queried, and a row formatted with the module-level
    ``out_header`` template is written to ``outfile``.

    Args:
        is_vcf, is_bed, is_pos: Path of the site list in the respective
            format (at most one is used, in that priority order).
        nbam_fn, tbam_fn: Paths of the normal and tumor alignment files,
            opened with ``pysam.AlignmentFile``.
        truth, cosmic, dbsnp: Optional annotation VCF paths.
        mutect, varscan, jsm, sniper, vardict, muse, lofreq, scalpel,
        strelka, tnscope, platypus: Optional caller VCF paths.
        dedup: Not read by this function.
        min_mq, min_bq: Forwarded to ``sequencing_features.from_bam``.
        min_caller: A row is written only if the number of callers reporting
            the variant is at least this value.
        ref_fa: Path of the reference FASTA; ``ref_fa + '.fai'`` must exist.
        p_scale: ``'phred'``, ``'fraction'`` (case-insensitive) or ``None``;
            any other value is treated as ``None`` (no re-scaling).
        outfile: Path of the TSV file to write.

    Raises:
        Exception: If the input VCF is not sorted according to the contig
            order of the ``.fai`` file.
    """

    # Convert contig_sequence to chrom_seq dict:
    fai_file = ref_fa + '.fai'
    chrom_seq = genome.faiordict2contigorder(fai_file, 'fai')

    # Determine input format:
    if is_vcf:
        mysites = is_vcf
    elif is_bed:
        mysites = is_bed
    elif is_pos:
        mysites = is_pos
    else:
        mysites = fai_file
        logger.info('No position supplied. Will evaluate the whole genome.')

    # Re-scale output or not:
    if p_scale is None:
        logger.info('NO RE-SCALING')
    elif p_scale.lower() == 'phred':
        p_scale = 'phred'
    elif p_scale.lower() == 'fraction':
        p_scale = 'fraction'
    else:
        p_scale = None
        logger.info('NO RE-SCALING')

    # Placeholder for every feature that cannot be computed:
    nan = float('nan')

    ## Running
    with genome.open_textfile(mysites) as my_sites, open(outfile,
                                                         'w') as outhandle:

        my_line = my_sites.readline().rstrip()

        nbam = pysam.AlignmentFile(nbam_fn, reference_filename=ref_fa)
        tbam = pysam.AlignmentFile(tbam_fn, reference_filename=ref_fa)
        ref_fa = pysam.FastaFile(ref_fa)

        # Each optional path is replaced by its open handle, and the first
        # post-header line is kept so the file can be walked in step with the
        # sites.
        if truth:
            truth = genome.open_textfile(truth)
            truth_line = genome.skip_vcf_header(truth)

        if cosmic:
            cosmic = genome.open_textfile(cosmic)
            cosmic_line = genome.skip_vcf_header(cosmic)

        if dbsnp:
            dbsnp = genome.open_textfile(dbsnp)
            dbsnp_line = genome.skip_vcf_header(dbsnp)

        # 10 Incorporate callers: get thru the #'s
        if mutect:
            mutect = genome.open_textfile(mutect)
            mutect_line = genome.skip_vcf_header(mutect)

        if varscan:
            varscan = genome.open_textfile(varscan)
            varscan_line = genome.skip_vcf_header(varscan)

        if jsm:
            jsm = genome.open_textfile(jsm)
            jsm_line = genome.skip_vcf_header(jsm)

        if sniper:
            sniper = genome.open_textfile(sniper)
            sniper_line = genome.skip_vcf_header(sniper)

        if vardict:
            vardict = genome.open_textfile(vardict)
            vardict_line = genome.skip_vcf_header(vardict)

        if muse:
            muse = genome.open_textfile(muse)
            muse_line = genome.skip_vcf_header(muse)

        if lofreq:
            lofreq = genome.open_textfile(lofreq)
            lofreq_line = genome.skip_vcf_header(lofreq)

        if scalpel:
            scalpel = genome.open_textfile(scalpel)
            scalpel_line = genome.skip_vcf_header(scalpel)

        if strelka:
            strelka = genome.open_textfile(strelka)
            strelka_line = genome.skip_vcf_header(strelka)

        if tnscope:
            tnscope = genome.open_textfile(tnscope)
            tnscope_line = genome.skip_vcf_header(tnscope)

        if platypus:
            platypus = genome.open_textfile(platypus)
            platypus_line = genome.skip_vcf_header(platypus)

        # Get through all the headers:
        while my_line.startswith('#') or my_line.startswith('track='):
            my_line = my_sites.readline().rstrip()

        # First coordinate, for later purpose of making sure the input is sorted properly
        coordinate_i = re.match(genome.pattern_chr_position, my_line)
        coordinate_i = coordinate_i.group() if coordinate_i else ''

        # First line:
        outhandle.write(out_header.replace('{', '').replace('}', '') + '\n')

        while my_line:

            # If VCF, get all the variants with the same coordinate into a list:
            if is_vcf:

                my_vcf = genome.Vcf_line(my_line)
                my_coordinates = [(my_vcf.chromosome, my_vcf.position)]

                variants_at_my_coordinate = []

                alt_bases = my_vcf.altbase.split(',')
                for alt_i in alt_bases:
                    vcf_i = copy(my_vcf)
                    vcf_i.altbase = alt_i
                    variants_at_my_coordinate.append(vcf_i)

                # As long as the "coordinate" stays the same, it will keep reading until it's different.
                while my_coordinates[0] == (my_vcf.chromosome,
                                            my_vcf.position):

                    my_line = my_sites.readline().rstrip()
                    my_vcf = genome.Vcf_line(my_line)

                    ########## This block of code is to ensure the input VCF file is properly sorted ##
                    coordinate_j = re.match(genome.pattern_chr_position,
                                            my_line)
                    coordinate_j = coordinate_j.group() if coordinate_j else ''

                    if genome.whoisbehind(coordinate_i, coordinate_j,
                                          chrom_seq) == 1:
                        raise Exception(
                            '{} does not seem to be properly sorted.'.format(
                                mysites))

                    coordinate_i = coordinate_j
                    ###################################################################################

                    if my_coordinates[0] == (my_vcf.chromosome,
                                             my_vcf.position):

                        alt_bases = my_vcf.altbase.split(',')
                        for alt_i in alt_bases:
                            vcf_i = copy(my_vcf)
                            vcf_i.altbase = alt_i
                            variants_at_my_coordinate.append(vcf_i)

            elif is_bed:
                bed_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(bed_item[0],
                                                     int(bed_item[1]) + 1,
                                                     int(bed_item[2]))

            elif is_pos:
                pos_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(pos_item[0],
                                                     int(pos_item[1]),
                                                     int(pos_item[1]))

            elif fai_file:
                fai_item = my_line.split('\t')
                my_coordinates = genomic_coordinates(fai_item[0], 1,
                                                     int(fai_item[1]))

            ##### ##### ##### ##### ##### #####
            for my_coordinate in my_coordinates:

                ######## If VCF, can get ref base, variant base, as well as other identifying information ########
                if is_vcf:

                    ref_bases = []
                    alt_bases = []
                    indel_lengths = []
                    all_my_identifiers = []

                    for variant_i in variants_at_my_coordinate:

                        ref_base = variant_i.refbase
                        first_alt = variant_i.altbase.split(',')[0]
                        indel_length = len(first_alt) - len(ref_base)

                        ref_bases.append(ref_base)
                        alt_bases.append(first_alt)
                        indel_lengths.append(indel_length)

                        # Extract these information if they exist in the VCF file, but they could be re-written if dbSNP/COSMIC are supplied.
                        if_dbsnp = 1 if re.search(
                            r'rs[0-9]+', variant_i.identifier) else 0
                        if_cosmic = 1 if re.search(
                            r'COS[MN][0-9]+', variant_i.identifier) else 0
                        if_common = 1 if variant_i.get_info_value(
                            'COMMON') == '1' else 0
                        num_cases = variant_i.get_info_value(
                            'CNT') if variant_i.get_info_value('CNT') else nan

                        if variant_i.identifier == '.':
                            my_identifier_i = set()
                        else:
                            my_identifier_i = variant_i.identifier.split(';')
                            my_identifier_i = set(my_identifier_i)

                        all_my_identifiers.append(my_identifier_i)

                ## If not, 1) get ref_base, first_alt from other VCF files.
                #          2) Create placeholders for dbSNP and COSMIC that can be overwritten with dbSNP/COSMIC VCF files (if provided)
                else:
                    variants_at_my_coordinate = [
                        None
                    ]  # Just to have something to iterate
                    ref_base = first_alt = indel_length = None

                    # Could be re-written if dbSNP/COSMIC are supplied. If not, they will remain NaN.
                    if_dbsnp = if_cosmic = if_common = num_cases = nan

                #################################### Find the same coordinate in those VCF files ####################################
                if mutect:
                    got_mutect, mutect_variants, mutect_line = genome.find_vcf_at_coordinate(
                        my_coordinate, mutect_line, mutect, chrom_seq)
                if varscan:
                    got_varscan, varscan_variants, varscan_line = genome.find_vcf_at_coordinate(
                        my_coordinate, varscan_line, varscan, chrom_seq)
                if jsm:
                    got_jsm, jsm_variants, jsm_line = genome.find_vcf_at_coordinate(
                        my_coordinate, jsm_line, jsm, chrom_seq)
                if sniper:
                    got_sniper, sniper_variants, sniper_line = genome.find_vcf_at_coordinate(
                        my_coordinate, sniper_line, sniper, chrom_seq)
                if vardict:
                    got_vardict, vardict_variants, vardict_line = genome.find_vcf_at_coordinate(
                        my_coordinate, vardict_line, vardict, chrom_seq)
                if muse:
                    got_muse, muse_variants, muse_line = genome.find_vcf_at_coordinate(
                        my_coordinate, muse_line, muse, chrom_seq)
                if lofreq:
                    got_lofreq, lofreq_variants, lofreq_line = genome.find_vcf_at_coordinate(
                        my_coordinate, lofreq_line, lofreq, chrom_seq)
                if scalpel:
                    got_scalpel, scalpel_variants, scalpel_line = genome.find_vcf_at_coordinate(
                        my_coordinate, scalpel_line, scalpel, chrom_seq)
                if strelka:
                    got_strelka, strelka_variants, strelka_line = genome.find_vcf_at_coordinate(
                        my_coordinate, strelka_line, strelka, chrom_seq)
                if tnscope:
                    got_tnscope, tnscope_variants, tnscope_line = genome.find_vcf_at_coordinate(
                        my_coordinate, tnscope_line, tnscope, chrom_seq)
                if platypus:
                    got_platypus, platypus_variants, platypus_line = genome.find_vcf_at_coordinate(
                        my_coordinate, platypus_line, platypus, chrom_seq)
                if truth:
                    got_truth, truth_variants, truth_line = genome.find_vcf_at_coordinate(
                        my_coordinate, truth_line, truth, chrom_seq)
                if dbsnp:
                    got_dbsnp, dbsnp_variants, dbsnp_line = genome.find_vcf_at_coordinate(
                        my_coordinate, dbsnp_line, dbsnp, chrom_seq)
                if cosmic:
                    got_cosmic, cosmic_variants, cosmic_line = genome.find_vcf_at_coordinate(
                        my_coordinate, cosmic_line, cosmic, chrom_seq)

                # Now, use pysam to look into the BAM file(s), variant by variant from the input:
                for ith_call, my_call in enumerate(variants_at_my_coordinate):

                    if is_vcf:
                        # The particular line in the input VCF file:
                        variant_id = ((my_call.chromosome, my_call.position),
                                      my_call.refbase, my_call.altbase)

                        ref_base = ref_bases[ith_call]
                        first_alt = alt_bases[ith_call]
                        indel_length = indel_lengths[ith_call]
                        my_identifiers = all_my_identifiers[ith_call]

                    else:
                        variant_id = ((my_coordinate[0], my_coordinate[1]),
                                      ref_base, first_alt)
                        # No ID column to start from; without this the name
                        # was unbound for BED / position / whole-genome input.
                        my_identifiers = set()

                    # Keep track of NumCallers.  Reset for every variant so
                    # that callers counted for one ALT allele do not inflate
                    # the count of the next allele at the same coordinate.
                    num_callers = 0

                    #################### Collect Caller Vcf ####################:
                    if mutect:
                        mutect_classification, nlod, tlod, tandem, ecnt = annotate_caller.MuTect(
                            variant_id, mutect_variants)
                        num_callers += mutect_classification
                    else:
                        mutect_classification = nlod = tlod = tandem = ecnt = nan

                    if varscan:
                        varscan_classification = annotate_caller.VarScan(
                            variant_id, varscan_variants)
                        num_callers += varscan_classification
                    else:
                        varscan_classification = nan

                    if jsm:
                        jointsnvmix2_classification, score_jointsnvmix2 = annotate_caller.JSM(
                            variant_id, jsm_variants)
                        num_callers += jointsnvmix2_classification
                    else:
                        jointsnvmix2_classification = score_jointsnvmix2 = nan

                    if sniper:
                        sniper_classification, score_somaticsniper = annotate_caller.SomaticSniper(
                            variant_id, sniper_variants)
                        num_callers += sniper_classification
                    else:
                        sniper_classification = score_somaticsniper = nan

                    if vardict:
                        vardict_classification, msi, msilen, shift3, score_vardict = annotate_caller.VarDict(
                            variant_id, vardict_variants)
                        num_callers += vardict_classification
                    else:
                        vardict_classification = msi = msilen = shift3 = score_vardict = nan

                    if muse:
                        muse_classification = annotate_caller.MuSE(
                            variant_id, muse_variants)
                        num_callers += muse_classification
                    else:
                        muse_classification = nan

                    if lofreq:
                        lofreq_classification = annotate_caller.LoFreq(
                            variant_id, lofreq_variants)
                        num_callers += lofreq_classification
                    else:
                        lofreq_classification = nan

                    if scalpel:
                        scalpel_classification = annotate_caller.Scalpel(
                            variant_id, scalpel_variants)
                        num_callers += scalpel_classification
                    else:
                        scalpel_classification = nan

                    if strelka:
                        strelka_classification, somatic_evs, qss, tqss = annotate_caller.Strelka(
                            variant_id, strelka_variants)
                        num_callers += strelka_classification
                    else:
                        strelka_classification = somatic_evs = qss = tqss = nan

                    if tnscope:
                        tnscope_classification = annotate_caller.TNscope(
                            variant_id, tnscope_variants)
                        num_callers += tnscope_classification
                    else:
                        tnscope_classification = nan

                    if platypus:
                        platypus_classification = annotate_caller.countPASS(
                            variant_id, platypus_variants)
                        num_callers += platypus_classification
                    else:
                        platypus_classification = nan

                    # Potentially write the output only if it meets this threshold:
                    if num_callers >= min_caller:

                        ########## Ground truth file ##########
                        if truth:
                            if variant_id in truth_variants:
                                judgement = 1
                                my_identifiers.add('TruePositive')
                            else:
                                judgement = 0
                                my_identifiers.add('FalsePositive')
                        else:
                            judgement = nan

                        ########## dbSNP ########## Will overwrite dbSNP info from input VCF file
                        if dbsnp:
                            if_dbsnp, if_common, rsID = annotate_caller.dbSNP(
                                variant_id, dbsnp_variants)
                            for ID_i in rsID:
                                my_identifiers.add(ID_i)

                        ########## COSMIC ########## Will overwrite COSMIC info from input VCF file
                        if cosmic:
                            if_cosmic, num_cases, cosmicID = annotate_caller.COSMIC(
                                variant_id, cosmic_variants)
                            for ID_i in cosmicID:
                                my_identifiers.add(ID_i)

                        ########## ######### ######### INFO EXTRACTION FROM BAM FILES ########## ######### #########
                        nBamFeatures = sequencing_features.from_bam(
                            nbam, my_coordinate, ref_base, first_alt, min_mq,
                            min_bq)
                        tBamFeatures = sequencing_features.from_bam(
                            tbam, my_coordinate, ref_base, first_alt, min_mq,
                            min_bq)

                        n_ref = nBamFeatures['ref_for'] + nBamFeatures[
                            'ref_rev']
                        n_alt = nBamFeatures['alt_for'] + nBamFeatures[
                            'alt_rev']
                        t_ref = tBamFeatures['ref_for'] + tBamFeatures[
                            'ref_rev']
                        t_alt = tBamFeatures['alt_for'] + tBamFeatures[
                            'alt_rev']
                        sor = sequencing_features.somaticOddRatio(
                            n_ref, n_alt, t_ref, t_alt)

                        # Calculate VarScan'2 SCC directly without using VarScan2 output:
                        try:
                            score_varscan2 = genome.p2phred(
                                stats.fisher_exact(
                                    ((t_alt, n_alt), (t_ref, n_ref)),
                                    alternative='greater')[1])
                        except ValueError:
                            score_varscan2 = nan

                        # Homopolymer eval:
                        homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference(
                            ref_fa, my_coordinate, ref_base, first_alt)

                        # Fill the ID field of the TSV/VCF
                        my_identifiers = ';'.join(
                            my_identifiers) if my_identifiers else '.'

                        ###
                        out_line = out_header.format(
                            CHROM=my_coordinate[0],
                            POS=my_coordinate[1],
                            ID=my_identifiers,
                            REF=ref_base,
                            ALT=first_alt,
                            if_MuTect=mutect_classification,
                            if_VarScan2=varscan_classification,
                            if_JointSNVMix2=jointsnvmix2_classification,
                            if_SomaticSniper=sniper_classification,
                            if_VarDict=vardict_classification,
                            MuSE_Tier=muse_classification,
                            if_LoFreq=lofreq_classification,
                            if_Scalpel=scalpel_classification,
                            if_Strelka=strelka_classification,
                            if_TNscope=tnscope_classification,
                            if_Platypus=platypus_classification,
                            Strelka_Score=somatic_evs,
                            Strelka_QSS=qss,
                            Strelka_TQSS=tqss,
                            VarScan2_Score=rescale(score_varscan2, 'phred', p_scale, 1001),
                            SNVMix2_Score=rescale(score_jointsnvmix2, 'phred', p_scale, 1001),
                            Sniper_Score=rescale(score_somaticsniper, 'phred', p_scale, 1001),
                            VarDict_Score=rescale(score_vardict, 'phred', p_scale, 1001),
                            if_dbsnp=if_dbsnp,
                            COMMON=if_common,
                            if_COSMIC=if_cosmic,
                            COSMIC_CNT=num_cases,
                            Consistent_Mates=tBamFeatures['consistent_mates'],
                            Inconsistent_Mates=tBamFeatures['inconsistent_mates'],
                            N_DP=nBamFeatures['dp'],
                            nBAM_REF_MQ='%g' % nBamFeatures['ref_mq'],
                            nBAM_ALT_MQ='%g' % nBamFeatures['alt_mq'],
                            nBAM_Z_Ranksums_MQ='%g' % nBamFeatures['z_ranksums_mq'],
                            nBAM_REF_BQ='%g' % nBamFeatures['ref_bq'],
                            nBAM_ALT_BQ='%g' % nBamFeatures['alt_bq'],
                            nBAM_Z_Ranksums_BQ='%g' % nBamFeatures['z_ranksums_bq'],
                            nBAM_REF_NM='%g' % nBamFeatures['ref_NM'],
                            nBAM_ALT_NM='%g' % nBamFeatures['alt_NM'],
                            nBAM_NM_Diff='%g' % nBamFeatures['NM_Diff'],
                            nBAM_REF_Concordant=nBamFeatures['ref_concordant_reads'],
                            nBAM_REF_Discordant=nBamFeatures['ref_discordant_reads'],
                            nBAM_ALT_Concordant=nBamFeatures['alt_concordant_reads'],
                            nBAM_ALT_Discordant=nBamFeatures['alt_discordant_reads'],
                            nBAM_Concordance_FET=rescale(nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001),
                            N_REF_FOR=nBamFeatures['ref_for'],
                            N_REF_REV=nBamFeatures['ref_rev'],
                            N_ALT_FOR=nBamFeatures['alt_for'],
                            N_ALT_REV=nBamFeatures['alt_rev'],
                            nBAM_StrandBias_FET=rescale(nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),
                            nBAM_Z_Ranksums_EndPos='%g' % nBamFeatures['z_ranksums_endpos'],
                            nBAM_REF_Clipped_Reads=nBamFeatures['ref_SC_reads'],
                            nBAM_ALT_Clipped_Reads=nBamFeatures['alt_SC_reads'],
                            nBAM_Clipping_FET=rescale(nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),
                            nBAM_MQ0=nBamFeatures['MQ0'],
                            nBAM_Other_Reads=nBamFeatures['noise_read_count'],
                            nBAM_Poor_Reads=nBamFeatures['poor_read_count'],
                            nBAM_REF_InDel_3bp=nBamFeatures['ref_indel_3bp'],
                            nBAM_REF_InDel_2bp=nBamFeatures['ref_indel_2bp'],
                            nBAM_REF_InDel_1bp=nBamFeatures['ref_indel_1bp'],
                            nBAM_ALT_InDel_3bp=nBamFeatures['alt_indel_3bp'],
                            nBAM_ALT_InDel_2bp=nBamFeatures['alt_indel_2bp'],
                            nBAM_ALT_InDel_1bp=nBamFeatures['alt_indel_1bp'],
                            M2_NLOD=nlod,
                            M2_TLOD=tlod,
                            M2_STR=tandem,
                            M2_ECNT=ecnt,
                            SOR=sor,
                            MSI=msi,
                            MSILEN=msilen,
                            SHIFT3=shift3,
                            MaxHomopolymer_Length=homopolymer_length,
                            SiteHomopolymer_Length=site_homopolymer_length,
                            T_DP=tBamFeatures['dp'],
                            tBAM_REF_MQ='%g' % tBamFeatures['ref_mq'],
                            tBAM_ALT_MQ='%g' % tBamFeatures['alt_mq'],
                            tBAM_Z_Ranksums_MQ='%g' % tBamFeatures['z_ranksums_mq'],
                            tBAM_REF_BQ='%g' % tBamFeatures['ref_bq'],
                            tBAM_ALT_BQ='%g' % tBamFeatures['alt_bq'],
                            tBAM_Z_Ranksums_BQ='%g' % tBamFeatures['z_ranksums_bq'],
                            tBAM_REF_NM='%g' % tBamFeatures['ref_NM'],
                            tBAM_ALT_NM='%g' % tBamFeatures['alt_NM'],
                            tBAM_NM_Diff='%g' % tBamFeatures['NM_Diff'],
                            tBAM_REF_Concordant=tBamFeatures['ref_concordant_reads'],
                            tBAM_REF_Discordant=tBamFeatures['ref_discordant_reads'],
                            tBAM_ALT_Concordant=tBamFeatures['alt_concordant_reads'],
                            tBAM_ALT_Discordant=tBamFeatures['alt_discordant_reads'],
                            tBAM_Concordance_FET=rescale(tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001),
                            T_REF_FOR=tBamFeatures['ref_for'],
                            T_REF_REV=tBamFeatures['ref_rev'],
                            T_ALT_FOR=tBamFeatures['alt_for'],
                            T_ALT_REV=tBamFeatures['alt_rev'],
                            tBAM_StrandBias_FET=rescale(tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001),
                            tBAM_Z_Ranksums_EndPos='%g' % tBamFeatures['z_ranksums_endpos'],
                            tBAM_REF_Clipped_Reads=tBamFeatures['ref_SC_reads'],
                            tBAM_ALT_Clipped_Reads=tBamFeatures['alt_SC_reads'],
                            tBAM_Clipping_FET=rescale(tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001),
                            tBAM_MQ0=tBamFeatures['MQ0'],
                            tBAM_Other_Reads=tBamFeatures['noise_read_count'],
                            tBAM_Poor_Reads=tBamFeatures['poor_read_count'],
                            tBAM_REF_InDel_3bp=tBamFeatures['ref_indel_3bp'],
                            tBAM_REF_InDel_2bp=tBamFeatures['ref_indel_2bp'],
                            tBAM_REF_InDel_1bp=tBamFeatures['ref_indel_1bp'],
                            tBAM_ALT_InDel_3bp=tBamFeatures['alt_indel_3bp'],
                            tBAM_ALT_InDel_2bp=tBamFeatures['alt_indel_2bp'],
                            tBAM_ALT_InDel_1bp=tBamFeatures['alt_indel_1bp'],
                            InDel_Length=indel_length,
                            TrueVariant_or_False=judgement)

                        # Print it out to stdout:
                        outhandle.write(out_line + '\n')

            # Read into the next line (VCF input has already advanced above):
            if not is_vcf:
                my_line = my_sites.readline().rstrip()

        ########## Close all open files if they were opened ##########
        opened_files = (ref_fa, nbam, tbam, truth, cosmic, dbsnp, mutect,
                        varscan, jsm, sniper, vardict, muse, lofreq, scalpel,
                        strelka, tnscope, platypus)
        for opened_file in opened_files:
            if opened_file:
                opened_file.close()
def convert(infile, snv_out, indel_out):
    """Rewrite a VarDict VCF into separate SNV and indel VCF files.

    Header definitions that disagree with the record contents are patched,
    extra INFO/FORMAT definitions are appended, the ``RD`` and ``ALD`` (or
    ``AD``) sample fields are merged into a single ``DP4`` field, and the
    ``END=`` INFO entry is removed.  Records are then routed by their
    ``TYPE=`` INFO value; equal-length ``Complex`` records are broken into one
    SNV record per differing base.  Records whose REF equals ALT are dropped.

    Args:
        infile: Path of the VarDict VCF, opened with ``genome.open_textfile``.
        snv_out: Path of the SNV output VCF to write.
        indel_out: Path of the indel output VCF to write.

    Raises:
        ValueError: If the ``#CHROM`` line has neither 10 columns (single
            sample) nor 11 columns (paired), or if a record lacks the
            ``RD`` / ``ALD`` / ``AD`` FORMAT keys.
    """
    addition_header = (
        '##INFO=<ID=Germline,Number=0,Type=Flag,Description="VarDict Germline">',
        '##INFO=<ID=StrongSomatic,Number=0,Type=Flag,Description="VarDict Strong Somatic">',
        '##INFO=<ID=LikelySomatic,Number=0,Type=Flag,Description="VarDict Likely Somatic">',
        '##INFO=<ID=LikelyLOH,Number=0,Type=Flag,Description="VarDict Likely LOH">',
        '##INFO=<ID=StrongLOH,Number=0,Type=Flag,Description="VarDict Strong LOH">',
        '##INFO=<ID=AFDiff,Number=0,Type=Flag,Description="VarDict AF Diff">',
        '##INFO=<ID=Deletion,Number=0,Type=Flag,Description="VarDict Deletion">',
        '##INFO=<ID=SampleSpecific,Number=0,Type=Flag,Description="VarDict SampleSpecific">',
        '##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">',
    )

    with genome.open_textfile(infile) as vcf, open(
            snv_out, 'w') as snpout, open(indel_out, 'w') as indelout:

        line_i = vcf.readline().rstrip()

        while line_i.startswith('##'):

            if re.match(r'^##INFO=<ID=(LSEQ|RSEQ),', line_i):
                line_i = line_i.replace('Number=G', 'Number=1')

            elif line_i.startswith('##FORMAT=<ID=BIAS,'):
                line_i = line_i.replace('Number=1', 'Number=.')

            elif line_i.startswith(('##FORMAT=<ID=PSTD,',
                                    '##FORMAT=<ID=QSTD,',
                                    '##INFO=<ID=SOR,')):
                line_i = line_i.replace('Type=Float', 'Type=String')

            snpout.write(line_i + '\n')
            indelout.write(line_i + '\n')

            line_i = vcf.readline().rstrip()

        for item_i in addition_header:
            snpout.write(item_i + '\n')
            indelout.write(item_i + '\n')

        # This is the #CHROM line
        num_header = len(line_i.split('\t'))

        if num_header == 10:
            paired = False
        elif num_header == 11:
            paired = True
        else:
            # Previously `paired` stayed unbound and the first record failed
            # with an UnboundLocalError after the outputs were half written.
            raise ValueError(
                'Expected 10 or 11 columns in the #CHROM line of {}, found {}.'
                .format(infile, num_header))

        snpout.write(line_i + '\n')
        indelout.write(line_i + '\n')

        line_i = vcf.readline().rstrip()
        while line_i:

            vcfcall = genome.Vcf_line(line_i)

            # Fix the occasional error where ALT and REF are the same:
            if vcfcall.refbase != vcfcall.altbase:

                # In the REF/ALT field, non-GCTA characters should be changed to N to fit the VCF standard:
                vcfcall.refbase = re.sub(r'[^GCTA]', 'N', vcfcall.refbase,
                                         flags=re.I)
                vcfcall.altbase = re.sub(r'[^GCTA]', 'N', vcfcall.altbase,
                                         flags=re.I)

                ## To be consistent with other tools, Combine AD:RD or ALD:RD into DP4.
                # VarDict puts Tumor first and Normal next
                # Also, the old version has no ALD (somatic.pl). The new version has ALD (paired.pl).
                format_field = vcfcall.field.split(':')
                idx_rd = format_field.index('RD')

                tumor_sample = vcfcall.samples[0].split(':')
                tumor_dp4 = tumor_sample.pop(idx_rd)

                if paired:
                    normal_sample = vcfcall.samples[1].split(':')
                    normal_dp4 = normal_sample.pop(idx_rd)

                format_field.pop(idx_rd)

                # As right now, the old version has no ALD. The new version has ALD.
                # If the VCF has no ALD, then the AD means the same thing ALD is supposed to mean.
                # The index is looked up after RD was removed, so it is valid
                # for the (equally shortened) sample lists as well.
                try:
                    idx_ad = format_field.index('ALD')
                except ValueError:
                    idx_ad = format_field.index('AD')

                if paired:
                    normal_dp4 = normal_dp4 + ',' + normal_sample.pop(idx_ad)

                tumor_dp4 = tumor_dp4 + ',' + tumor_sample.pop(idx_ad)
                format_field.pop(idx_ad)

                # Re-format the strings:
                format_field.append('DP4')
                new_format_string = ':'.join(format_field)

                tumor_sample.append(tumor_dp4)
                tumor_sample = ':'.join(tumor_sample)

                # NOTE(review): paired output writes the normal column before
                # the tumor column although the #CHROM line is copied
                # unchanged -- confirm this ordering is what downstream
                # consumers expect.
                if paired:
                    normal_sample.append(normal_dp4)
                    normal_sample = ':'.join(normal_sample)
                    sample_columns = [normal_sample, tumor_sample]
                else:
                    sample_columns = [tumor_sample]

                # VarDict's END tag has caused problem with GATK CombineVariants. Simply get rid of it.
                vcfcall.info = re.sub(r'END=[0-9]+;', '', vcfcall.info)

                # Columns shared by every line emitted for this record.
                trailing_columns = [
                    vcfcall.qual, vcfcall.filters, vcfcall.info,
                    new_format_string
                ] + sample_columns

                line_i = '\t'.join([
                    vcfcall.chromosome, str(vcfcall.position),
                    vcfcall.identifier, vcfcall.refbase, vcfcall.altbase
                ] + trailing_columns)

                # Write to snp and indel into different files:
                if 'TYPE=SNV' in vcfcall.info:
                    snpout.write(line_i + '\n')

                elif 'TYPE=Deletion' in vcfcall.info or 'TYPE=Insertion' in vcfcall.info:
                    indelout.write(line_i + '\n')

                elif 'TYPE=Complex' in vcfcall.info and (len(
                        vcfcall.refbase) == len(vcfcall.altbase)):
                    # Same-length substitution: emit one SNV per mismatching
                    # base, offset from the record's start position.
                    for offset, (ref_i, alt_i) in enumerate(
                            zip(vcfcall.refbase, vcfcall.altbase)):
                        if ref_i != alt_i:
                            line_i = '\t'.join([
                                vcfcall.chromosome,
                                str(vcfcall.position + offset),
                                vcfcall.identifier, ref_i, alt_i
                            ] + trailing_columns)
                            snpout.write(line_i + '\n')

            # Continue:
            line_i = vcf.readline().rstrip()
# Command-line options: input/output VCF paths, the caller classification
# string, the tumor sample name and whether QUAL holds a trained score.
parser.add_argument('-infile', '--vcf-in', type=str, help='VCF in', required=True)
parser.add_argument('-outfile', '--vcf-out', type=str, help='VCF out', required=True)
parser.add_argument('-callers', '--callers-classification-string', type=str, help='MVJSD or whatever', required=True)
parser.add_argument('-tumor', '--tumor-sample-name', type=str, help='tumor sample name', required=False, default='TUMOR')
parser.add_argument('-trained', '--somaticseq-trained', action='store_true', help='If true, will use the QUAL as SomaticSeq score. Otherwise, SCORE will be .', required=False, default=False)

args = parser.parse_args()

vcf_in_fn = args.vcf_in
vcf_out_fn = args.vcf_out
caller_string = args.callers_classification_string
tumor = args.tumor_sample_name
somaticseq_trained = args.somaticseq_trained

with genome.open_textfile(vcf_in_fn) as vcfin, open(vcf_out_fn, 'w') as vcfout:

    line_in = vcfin.readline().rstrip('\n')

    # Copy the '##' meta-information lines, rewriting a few of them.
    while line_in.startswith('##'):

        if line_in.startswith('##SomaticSeq='):
            # Tag the SomaticSeq version line with a '-SEQC2' suffix.
            line_out = line_in + '-SEQC2'

        elif line_in.startswith('##INFO=<ID=NUM_TOOLS') or line_in.startswith('##INFO=<ID={COMBO}'.format(COMBO=caller_string)):
            # NUM_TOOLS and the caller-combination key are re-declared as
            # FORMAT fields instead of INFO fields.
            line_out = re.sub('##INFO=', '##FORMAT=', line_in)

        else:
            line_out = line_in

        vcfout.write( line_out + '\n' )
        # NOTE(review): no statement visible here advances line_in; confirm
        # the next line is read before the loop condition is re-evaluated.
# For every requested pileup-derived metric, register its FORMAT header line
# and the FORMAT key that will be appended to each record.
if args.pileup_DP4:
    header_append.append(
        '##FORMAT=<ID=plDP4,Number=4,Type=Integer,Description="DP4 from pileup: ref forward, ref reverse, alt forward, alt reverse">'
    )
    format_append.append('plDP4')

if args.pileup_variant_allele_frequency:
    header_append.append(
        '##FORMAT=<ID=plVAF,Number=1,Type=Float,Description="Variant allele frequency calculated from pileup">'
    )
    format_append.append('plVAF')

# Start Working by opening files:
# Each path variable is rebound to its open handle.  Npileup is opened last,
# so an AttributeError raised for it leaves the three earlier handles open.
# presumably genome.open_textfile raises AttributeError for a None path
# (i.e. an optional pileup that was not supplied) -- verify against genome.
try:
    my_vcf = genome.open_textfile(my_vcf)
    Tpileup = genome.open_textfile(Tpileup)
    outhandle = open(outfile, 'w')
    Npileup = genome.open_textfile(Npileup)
except AttributeError:
    pass

# Prime each available pileup with its first line.
if Npileup:
    npileup_line = Npileup.readline().rstrip('\n')

if Tpileup:
    tpileup_line = Tpileup.readline().rstrip('\n')

# Add the extra headers:
out_vcf_headers = genome.vcf_header_modifier(my_vcf, addons=header_append)
# Extra '##FORMAT' header lines to add, and the matching FORMAT keys.
header_append = []
format_append = []

# For every requested pileup-derived metric, register its FORMAT header line
# and the FORMAT key that will be appended to each record.
if args.pileup_DP4:
    header_append.append('##FORMAT=<ID=plDP4,Number=4,Type=Integer,Description="DP4 from pileup: ref forward, ref reverse, alt forward, alt reverse">')
    format_append.append('plDP4')

if args.pileup_variant_allele_frequency:
    header_append.append('##FORMAT=<ID=plVAF,Number=1,Type=Float,Description="Variant allele frequency calculated from pileup">')
    format_append.append('plVAF')

# Start Working by opening files:
# Each path variable is rebound to its open handle.  Npileup is opened last,
# so an AttributeError raised for it leaves the three earlier handles open.
# presumably genome.open_textfile raises AttributeError for a None path
# (i.e. an optional pileup that was not supplied) -- verify against genome.
try:
    my_vcf = genome.open_textfile(my_vcf)
    Tpileup = genome.open_textfile(Tpileup)
    outhandle = open(outfile, 'w')
    Npileup = genome.open_textfile(Npileup)
except AttributeError:
    pass

# Prime each available pileup with its first line.
if Npileup:
    npileup_line = Npileup.readline().rstrip('\n')

if Tpileup:
    tpileup_line = Tpileup.readline().rstrip('\n')

# Add the extra headers:
out_vcf_headers = genome.vcf_header_modifier( my_vcf, addons=header_append )
min_altMQ = args.min_altMQ min_refBQ = args.min_refBQ min_altBQ = args.min_altBQ max_refNM = args.max_refNM max_altNM = args.max_altNM max_fetSB = args.max_fetSB max_fetCD = args.max_fetCD max_zMQ = args.max_zMQ max_zBQ = args.max_zBQ max_MQ0 = args.max_MQ0 min_VAF = args.min_VAF min_DP = args.min_DP min_varDP = args.min_varDP with genome.open_textfile(infile) as vcf_in, open(outfile, 'w') as vcf_out: line_i = vcf_in.readline().rstrip() while line_i.startswith('##'): vcf_out.write( line_i + '\n' ) line_i = vcf_in.readline().rstrip() vcf_out.write( line_i + '\n' ) # This line will be #CHROM: header = line_i.split('\t') sample_index = header.index(sample) - 9 # This will be the first variant line: