class VCFWriter:
    """Write called variants to a VCF whose contig lines come from a reference FASTA."""

    def __init__(self, reference_file_path, contigs, sample_name, output_dir, filename):
        """
        Open the reference, build the VCF header and open the output file.

        :param reference_file_path: Path handed to ``PEPPER_HP.FASTA_handler``.
        :param contigs: Container of contig names that should appear in the header.
        :param sample_name: Name of the single sample column.
        :param output_dir: Prefix of the output path; it is concatenated as-is
            with ``filename``, so it must already end with a path separator.
        :param filename: Output file name without the ``.vcf`` extension.
        """
        self.fasta_handler = PEPPER_HP.FASTA_handler(reference_file_path)
        self.contigs = contigs
        header = self.get_vcf_header(sample_name, contigs)
        output_path = output_dir + filename + '.vcf'
        self.vcf_file = VariantFile(output_path, 'w', header=header)

    def write_vcf_records(self, called_variant):
        """
        Write one variant to the VCF.

        :param called_variant: 6-tuple ``(contig, ref_start, ref_end, ref_seq,
            alleles, genotype)`` where ``alleles`` holds the alternate alleles.
            QUAL and GQ are always written as 60 and FILTER as ``PASS``.
        """
        contig, ref_start, ref_end, ref_seq, alt_alleles, genotype = called_variant
        record = self.vcf_file.new_record(
            contig=str(contig),
            start=ref_start,
            stop=ref_end,
            id='.',
            qual=60,
            filter='PASS',
            alleles=(ref_seq,) + tuple(alt_alleles),
            GT=genotype,
            GQ=60)
        self.vcf_file.write(record)

    def get_vcf_header(self, sample_name, contigs):
        """
        Build the header: five FILTER lines, GT/GQ FORMAT lines, one contig line
        per reference sequence that is listed in ``contigs``, and the sample.

        :param sample_name: Name of the sample column to add.
        :param contigs: Container of contig names to keep.
        :return: The populated ``VariantHeader``.
        """
        header = VariantHeader()

        filter_lines = (
            ("PASS", "All filters passed"),
            ("refCall", "Call is homozygous"),
            ("lowGQ", "Low genotype quality"),
            ("lowQUAL", "Low variant call quality"),
            ("conflictPos", "Overlapping record"),
        )
        for filter_id, description in filter_lines:
            header.add_meta(key='FILTER',
                            items=[('ID', filter_id), ('Description', description)])

        format_lines = (
            ("GT", 'String', "Genotype"),
            ("GQ", 'Float', "Genotype Quality"),
        )
        for format_id, value_type, description in format_lines:
            header.add_meta(key='FORMAT',
                            items=[('ID', format_id), ('Number', 1),
                                   ('Type', value_type), ('Description', description)])

        # Contigs are declared in reference order, restricted to the requested ones.
        for name in self.fasta_handler.get_chromosome_names():
            if name in contigs:
                length = self.fasta_handler.get_chromosome_sequence_length(name)
                header.contigs.add(name, length=length)

        header.add_sample(sample_name)
        return header
class VCFWriter:
    """Write called variants to a timestamped VCF whose contigs come from a BAM header."""

    def __init__(self, bam_file_path, sample_name, output_dir):
        """
        Open the BAM, build the VCF header and open the output file.

        The output is named ``<output_dir><bam basename up to the first dot>_<timestamp>.vcf``;
        ``output_dir`` is concatenated as-is.

        :param bam_file_path: Path handed to ``BamHandler``.
        :param sample_name: Name of the single sample column.
        :param output_dir: Prefix of the output path.
        """
        self.bam_handler = BamHandler(bam_file_path)
        base_name = bam_file_path.rstrip().split('/')[-1]
        stem = base_name.split('.')[0]
        header = self.get_vcf_header(sample_name)
        timestamp = time.strftime("%m%d%Y_%H%M%S")
        output_path = output_dir + stem + '_' + timestamp + '.vcf'
        self.vcf_file = VariantFile(output_path, 'w', header=header)

    def write_vcf_records(self, called_variants):
        """
        Write every variant in ``called_variants`` to the VCF.

        Each variant must expose ``ref``, ``alternate_alleles``, ``chromosome_name``,
        ``pos_start``, ``pos_end`` and ``genotype``. QUAL and GQ are always written
        as 60 and FILTER as ``PASS``.
        """
        for variant in called_variants:
            record = self.vcf_file.new_record(
                contig=str(variant.chromosome_name),
                start=variant.pos_start,
                stop=variant.pos_end,
                id='.',
                qual=60,
                filter='PASS',
                alleles=(variant.ref,) + tuple(variant.alternate_alleles),
                GT=variant.genotype,
                GQ=60)
            self.vcf_file.write(record)

    def get_vcf_header(self, sample_name):
        """
        Build the header: five FILTER lines, GT/GQ FORMAT lines, one contig line
        per ``SN``/``LN`` entry reported by the BAM handler, and the sample.

        :param sample_name: Name of the sample column to add.
        :return: The populated ``VariantHeader``.
        """
        header = VariantHeader()

        filter_lines = (
            ("PASS", "All filters passed"),
            ("refCall", "Call is homozygous"),
            ("lowGQ", "Low genotype quality"),
            ("lowQUAL", "Low variant call quality"),
            ("conflictPos", "Overlapping record"),
        )
        for filter_id, description in filter_lines:
            header.add_meta(key='FILTER',
                            items=[('ID', filter_id), ('Description', description)])

        format_lines = (
            ("GT", 'String', "Genotype"),
            ("GQ", 'Float', "Genotype Quality"),
        )
        for format_id, value_type, description in format_lines:
            header.add_meta(key='FORMAT',
                            items=[('ID', format_id), ('Number', 1),
                                   ('Type', value_type), ('Description', description)])

        for sequence in self.bam_handler.get_header_sq():
            contig_items = [('ID', sequence['SN']), ('length', sequence['LN'])]
            header.add_meta(key='contig', items=contig_items)

        header.add_sample(sample_name)
        return header
def create_sample_format_from_info_lofreq(sample, input_name, output_name, skip_gt=False):
    """
    Copy a VCF while adding one sample column populated from INFO fields.

    FORMAT definitions for AF, AD, DP, DP4 and GT are added to the input header,
    the sample is registered, and every record is re-emitted with a sample whose
    AF/AD/DP/DP4 values are taken from the record's INFO and whose GT is set to
    ``(alleles[1], alleles[0])``.

    Both files are now closed when the function returns (or raises), so the
    output is flushed deterministically instead of whenever the objects happen
    to be garbage collected.

    :param sample: Name of the sample column to add.
    :param input_name: Path of the VCF to read.
    :param output_name: Path of the VCF to write.
    :param skip_gt: Currently unused; GT is always written.
        NOTE(review): the name suggests GT should be omitted when true - confirm
        the intended behaviour with the callers before wiring it up.
    """
    with VariantFile(input_name, 'r') as input_vcf:
        header = input_vcf.header
        header.formats.add("AF", number=1, type='Float',
                           description="Allele Frequency")
        header.formats.add(
            "AD", number=".", type='String',
            description="Allelic sample depths for the ref and alt alleles in the order listed")
        header.formats.add(
            "DP", number=1, type='Integer',
            description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)")
        header.formats.add(
            "DP4", number=4, type='Integer',
            description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases")
        header.formats.add("GT", number=".", type="String",
                           description="Genotype")
        header.add_sample(sample)

        with VariantFile(output_name, 'w', header=header) as output_vcf:
            for record in input_vcf:
                info = record.info
                # Key order is kept as in the original implementation.
                fields = {
                    "AF": info["AF"],
                    "DP4": info["DP4"],
                    "DP": info["DP"],
                    "AD": info["AD"],
                    "GT": (record.alleles[1], record.alleles[0]),
                }
                new_record = output_vcf.new_record(
                    record.chrom, record.start, record.stop, record.alleles,
                    record.id, record.qual, record.filter, record.info, [fields])
                output_vcf.write(new_record)
def run_process(opts, mutect2_vcf, pindel_vcf):
    """
    Merge pindel calls into a Mutect2 VCF and write the combined stream.

    INFO definitions that exist only in the pindel header are added to the
    Mutect2 header. Every pindel record is copied into a record of the output
    file, with FORMAT/AF and INFO/DP recomputed from the first two AD values of
    the first sample. The Mutect2 records are then streamed in order; before each
    one, pindel records located earlier (same contig and smaller position, or a
    numerically smaller contig) are written, and pindel records with the same
    contig, position and alts as the Mutect2 record are dropped as duplicates.

    Fixes over the previous implementation:
      * the pending pindel list was modified while being enumerated, which made
        the scan skip the element following every removal;
      * pindel records still pending after the last Mutect2 record were silently
        discarded - they are now written at the end;
      * AF no longer raises ``ZeroDivisionError`` when both AD values are 0
        (AF is set to 0.0);
      * all three VCF handles are closed, so output is flushed.

    NOTE(review): contigs are ordered by ``int(name.replace("chr", ""))``, so a
    non-numeric contig such as chrX still raises ``ValueError`` when contigs
    have to be compared - confirm the expected input.

    :param opts: Object with an ``output`` attribute; a falsy value means stdout.
    :param mutect2_vcf: Path of the Mutect2 VCF.
    :param pindel_vcf: Path of the pindel VCF.
    """
    outputvcf = opts.output

    def chrom_number(name):
        # Numeric contig rank used to decide which contig comes first.
        return int(name.replace("chr", ""))

    # Open VCF
    mutect2 = VariantFile(mutect2_vcf)
    pindel = None
    vcf_out = None
    try:
        pindel = VariantFile(pindel_vcf)

        # Add pindel header to new header
        new_header = mutect2.header
        new_header_keys = new_header.info.keys()
        for item in pindel.header.info.iteritems():
            definition = item[1]
            if definition.name in new_header_keys:
                continue
            new_header.info.add(definition.name, definition.number,
                                definition.type, definition.description)

        # Write VCF
        vcf_out = VariantFile(outputvcf if outputvcf else '-', 'w', header=new_header)

        pending = []
        for p in pindel.fetch():
            tmp = vcf_out.new_record()
            tmp.chrom = p.chrom
            tmp.pos = p.pos
            tmp.ref = p.ref
            tmp.alts = p.alts
            for key in p.info.keys():
                tmp.info[key] = p.info[key]
            for key in p.format.keys():
                tmp.samples[0][key] = p.samples[0][key]
            allele_depths = tmp.samples[0]["AD"]
            depth = allele_depths[0] + allele_depths[1]
            # Guard against sites without any supporting reads.
            tmp.samples[0]["AF"] = float(allele_depths[1]) / float(depth) if depth else 0.0
            tmp.info["DP"] = depth
            pending.append(tmp)

        for record in mutect2.fetch():
            chrom = record.chrom
            pos = record.pos
            alts = record.alts

            # Build the next pending list instead of deleting in place.
            still_pending = []
            for index, candidate in enumerate(pending):
                same_chrom = candidate.chrom == chrom
                if same_chrom and candidate.pos == pos and candidate.alts == alts:
                    # Same call as the Mutect2 record: keep only the Mutect2 one.
                    continue
                if same_chrom and candidate.pos > pos:
                    # Everything from here on belongs after this record.
                    still_pending.extend(pending[index:])
                    break
                if same_chrom and candidate.pos < pos:
                    vcf_out.write(candidate)
                    continue
                if chrom_number(candidate.chrom) < chrom_number(chrom):
                    vcf_out.write(candidate)
                    continue
                still_pending.append(candidate)
            pending = still_pending

            vcf_out.write(record)

        # Pindel calls located after the last Mutect2 record.
        for candidate in pending:
            vcf_out.write(candidate)
    finally:
        if vcf_out is not None:
            vcf_out.close()
        if pindel is not None:
            pindel.close()
        mutect2.close()
def build_new_record(maf: Dict[str, str], vcf: VariantFile,
                     tag: str) -> VariantRecord:
    """
    Create a minimal VCF record from one MAF row.

    The record spans the reference allele: it starts at the MAF
    ``Start_position`` converted to 0-based and ends ``len(Reference_Allele)``
    bases later. Its alleles are ``Reference_Allele`` followed by
    ``Tumor_Seq_Allele1``.

    :param maf: The MAF record as a dictionary.
    :param vcf: The VariantFile object used to create the record.
    :param tag: The FILTER tag to use.
    :return: The new record (not yet written to ``vcf``).
    """
    ref_allele = maf["Reference_Allele"]
    alleles = (ref_allele, maf["Tumor_Seq_Allele1"])
    contig = str(maf["Chromosome"])
    zero_based_start = int(maf["Start_position"]) - 1
    return vcf.new_record(
        contig=contig,
        start=zero_based_start,
        stop=len(ref_allele) + zero_based_start,
        filter=(tag, ),
        alleles=alleles,
    )
def telomere_pruning(telomere_depth_bed, telomere_annotation, fasta,
                     small_variant_vcf, output_vcf, min_depth, min_gq, min_vaf):
    """
    Prune under-covered telomere ends and filter small variants inside telomeres.

    Steps:
      1. Load per-position depth from ``telomere_depth_bed``.
      2. For every telomere interval in ``telomere_annotation`` that touches a
         contig end and whose outermost base has less than ``min_depth``
         coverage, walk inwards until ``min_depth`` is reached (or the interval
         ends) and emit a homozygous deletion of that stretch.
      3. Keep small variants that lie inside a telomere interval but outside the
         pruned stretches, retaining only alt alleles for which
         ``is_restoring_canonical_kmer`` returns 1, or returns 0 while the allele
         passes the ``min_vaf`` / ``min_gq`` thresholds.
      4. Write all records sorted by (contig, start, stop) to ``output_vcf``.

    Every edited region is also appended to ``CHM13_v1_telomere_edit_regions.bed``
    in the current working directory.

    Fixes over the previous implementation:
      * the region maps were ``defaultdict(lambda: list)``, which hands out the
        ``list`` *class*; a small variant on a contig missing from the
        annotation BED therefore crashed with ``TypeError``. Such variants are
        now treated as being outside any telomere and are skipped;
      * all file handles are closed, so both outputs are flushed;
      * the inward scans stop at the interval boundary even for degenerate
        intervals instead of looping without bound;
      * records without alt alleles are skipped instead of raising.

    :param telomere_depth_bed: Tab-separated ``contig, position, depth`` file;
        positions are converted to 0-based by subtracting 1.
    :param telomere_annotation: Tab-separated ``contig, start, end,
        contig_length`` file describing telomere intervals.
    :param fasta: Path of the assembly FASTA.
    :param small_variant_vcf: Path of the small-variant VCF; its header is reused
        for the output and its records must carry a ``VAF`` FORMAT field.
    :param output_vcf: Path of the VCF to write.
    :param min_depth: Minimum depth for a telomere position to be kept.
    :param min_gq: Minimum record QUAL (compared against ``rec.qual``) for
        alleles that do not affect the canonical k-mer.
    :param min_vaf: Minimum VAF for alleles that do not affect the canonical k-mer.
    :return: None
    """
    from contextlib import ExitStack

    def log_info(message):
        # Same "[timestamp] INFO: message" layout for every progress line.
        sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') +
                         "] INFO: " + message + "\n")
        sys.stderr.flush()

    log_info("READING DEPTH BED FILE ")

    with ExitStack() as stack:
        assembly_fasta_file = FastaFile(fasta)
        stack.callback(assembly_fasta_file.close)

        # outputs regions that are going to be edited
        telomere_edit_regions = stack.enter_context(
            open("CHM13_v1_telomere_edit_regions.bed", "w"))
        depth_bed_file = stack.enter_context(open(telomere_depth_bed, "r"))

        small_variant_vcf = VariantFile(small_variant_vcf)
        stack.callback(small_variant_vcf.close)
        output_vcf_file = VariantFile(output_vcf, 'w',
                                      header=small_variant_vcf.header)
        stack.callback(output_vcf_file.close)

        # dictionary to keep track of depth at each position of the telomere
        position_wise_depth = {}
        for bed_record in depth_bed_file:
            contig, position, depth = bed_record.rstrip().split("\t")
            # the bedfile has an offset of 1
            position_wise_depth[(contig, int(position) - 1)] = int(depth)

        log_info("DEPTH BED LOADED. ")
        log_info("READING ANNOTATION BED. ")

        regions_of_deletion = defaultdict(list)
        telomere_regions = defaultdict(list)
        contig_length_dict = {}
        all_vcf_records = []

        def has_min_depth(contig, position):
            # A position without a depth entry never counts as covered.
            key = (contig, position)
            return key in position_wise_depth and position_wise_depth[key] >= min_depth

        def depth_at(contig, position):
            return position_wise_depth.get((contig, position), 0)

        def add_pruning_record(contig, region_start, region_end,
                               allele_start, allele_stop, anchor_index):
            # The reference allele is the pruned stretch padded by one base; the
            # alternate allele is that padding base (the anchor).
            length_of_record = region_end - region_start + 1
            reference_allele = assembly_fasta_file.fetch(reference=contig,
                                                         start=allele_start,
                                                         end=allele_stop)
            alternate_allele = reference_allele[anchor_index]

            log_info("PRUNING: " + contig + " " + str(region_start) + " " +
                     str(region_end) + " " + str(length_of_record))

            regions_of_deletion[contig].append((region_start, region_end))
            telomere_edit_regions.write(contig + "\t" + str(region_start) +
                                        "\t" + str(region_end) + "\n")

            # write this deletion to the VCF file
            vcf_record = output_vcf_file.new_record(
                contig=contig,
                start=allele_start,
                stop=allele_stop,
                id='.',
                qual=60,
                filter='PASS',
                alleles=[reference_allele, alternate_allele],
                GT=[1, 1],
                GQ=60,
                VAF=[1.0])
            all_vcf_records.append((vcf_record.contig, vcf_record.start,
                                    vcf_record.stop, vcf_record))

        with open(telomere_annotation, "r") as telomere_annotation_bed_file:
            for bed_record in telomere_annotation_bed_file:
                contig, start_pos, end_pos, contig_length = \
                    bed_record.rstrip().split("\t")
                start_pos = int(start_pos)
                end_pos = int(end_pos)
                contig_length = int(contig_length)

                contig_length_dict[contig] = contig_length
                telomere_regions[contig].append((start_pos, end_pos))

                if start_pos == 0:
                    # this is the left side of the telomere, so scan left to right
                    if has_min_depth(contig, 0):
                        # it has full coverage, so simply do nothing.
                        continue

                    # otherwise scan to the point we hit min_depth
                    current_position = 1
                    while (depth_at(contig, current_position) < min_depth
                           and current_position < end_pos):
                        current_position += 1

                    add_pruning_record(contig, 0, current_position,
                                       allele_start=0,
                                       allele_stop=current_position + 1,
                                       anchor_index=-1)
                elif end_pos == contig_length:
                    # this is the right side of the telomere, so scan right to left
                    if has_min_depth(contig, end_pos):
                        # it has full coverage, so simply do nothing.
                        continue

                    current_position = end_pos - 1
                    while (depth_at(contig, current_position) < min_depth
                           and current_position > start_pos):
                        current_position -= 1

                    add_pruning_record(contig, current_position, end_pos,
                                       allele_start=current_position - 1,
                                       allele_stop=end_pos,
                                       anchor_index=0)

        log_info("READING SMALL VARIANT VCF. ")

        # filter the file
        for rec in small_variant_vcf.fetch():
            # this small variant overlaps with a region we are going to delete.
            if any(region_start <= rec.pos <= region_end
                   for region_start, region_end in regions_of_deletion[rec.chrom]):
                continue

            # small variant is outside telomere region
            if not any(region_start <= rec.pos <= region_end
                       for region_start, region_end in telomere_regions[rec.chrom]):
                continue

            # With several samples the VAF of the last one is used.
            sample_vafs = []
            for sample in rec.samples:
                sample_vafs = rec.samples[sample]['VAF']

            # Positions below 10000 are treated as the left telomere.
            canonical_kmer = "CCCTAA" if rec.pos < 10000 else "GGGTTA"

            selected_alleles = [rec.alleles[0]]
            selected_allele_vaf = []
            for i, alt in enumerate(rec.alts or ()):
                restoring_canonical = is_restoring_canonical_kmer(
                    rec.contig, rec.start, rec.stop, rec.alleles[0], alt,
                    canonical_kmer, assembly_fasta_file,
                    contig_length_dict[rec.contig])

                # restoring canonical: 0 is no change, 1 is positive change,
                # -1 means it's moving away from canonical
                if restoring_canonical == 1:
                    selected_alleles.append(alt)
                    selected_allele_vaf.append(sample_vafs[i])
                elif restoring_canonical == 0:
                    # this allele has no effect on canonical k-mer restoration,
                    # so we simply fall back to the set thresholds.
                    if sample_vafs[i] >= min_vaf and rec.qual >= min_gq:
                        selected_alleles.append(alt)
                        selected_allele_vaf.append(sample_vafs[i])

            # no allele passed the thresholds or is restoring canonical k-mer
            if len(selected_alleles) == 1:
                continue

            vcf_record = output_vcf_file.new_record(
                contig=rec.contig,
                start=rec.start,
                stop=rec.stop,
                id='.',
                qual=rec.qual,
                filter='PASS',
                alleles=selected_alleles,
                GT=[1, 1],
                GQ=rec.qual,
                VAF=selected_allele_vaf)
            all_vcf_records.append((vcf_record.contig, vcf_record.start,
                                    vcf_record.stop, vcf_record))
            telomere_edit_regions.write(rec.contig + "\t" + str(rec.start) +
                                        "\t" + str(rec.stop) + "\n")

        all_vcf_records.sort(key=lambda entry: (entry[0], entry[1], entry[2]))
        for _contig, _start, _stop, record in all_vcf_records:
            output_vcf_file.write(record)
def merg_vcf(h1_vcf, h2_vcf, output_dir, merge_genotype):
    """
    Merge the PASS records of two haplotype VCFs into ``<output_dir>merged_file.vcf``.

    Records are grouped by (contig, position). A position with a single record
    is copied through unchanged. A position with several records is collapsed
    into one record:
      * the longest REF among them is used, and every ALT is extended with the
        REF suffix it is missing so that all alleles share that REF;
      * QUAL and GQ are the minima over the merged records;
      * with exactly one distinct ALT the genotype is ``[1, 1]`` when
        ``merge_genotype`` is set and exactly two non-reference genotypes were
        seen, ``[0, 1]`` when ``merge_genotype`` is set otherwise, and the first
        non-reference genotype when ``merge_genotype`` is not set;
      * with more than one distinct ALT the genotype is ``[1, 2]``.

    Fixes over the previous implementation:
      * genotypes were compared with the list ``[0, 0]``; pysam returns GT as a
        tuple, so the comparison was always true and homozygous-reference calls
        were counted as variant genotypes. GT is now compared as a tuple;
      * all VCF handles are closed, so the merged file is flushed.

    :param h1_vcf: Path of the first VCF; its header is used for the output.
    :param h2_vcf: Path of the second VCF.
    :param output_dir: Prefix of the output path (concatenated as-is).
    :param merge_genotype: Whether to derive zygosity from the number of
        non-reference genotypes seen at a merged position.
    """
    vcf_positional_dict = defaultdict(lambda: defaultdict(list))

    def collect_pass_records(vcf_in):
        # Group the PASS records of one file by contig and position.
        for rec in vcf_in.fetch():
            if 'PASS' in rec.filter.keys():
                vcf_positional_dict[rec.chrom][rec.pos].append(rec)

    vcf_in1 = VariantFile(h1_vcf)
    vcf_in2 = None
    vcf_out = None
    try:
        vcf_out = VariantFile(output_dir + 'merged_file.vcf', 'w',
                              header=vcf_in1.header)
        collect_pass_records(vcf_in1)

        vcf_in2 = VariantFile(h2_vcf)
        collect_pass_records(vcf_in2)

        for chrom in vcf_positional_dict:
            for pos in sorted(vcf_positional_dict[chrom]):
                variants = vcf_positional_dict[chrom][pos]

                if len(variants) == 1:
                    vcf_out.write(variants[0])
                    continue

                # more than one record here, so merging is needed at this position
                longest_var = variants[0]
                for var in variants:
                    if len(var.ref) > len(longest_var.ref):
                        longest_var = var
                longest_ref = longest_var.ref

                alleles = [longest_ref]
                gq = -1.0
                qual = -1.0
                gts = []
                for var in variants:
                    for sample in var.samples:
                        sample_gq = var.samples[sample]['GQ']
                        if gq < 0:
                            gq = sample_gq
                        gq = min(gq, sample_gq)

                        sample_gt = var.samples[sample]['GT']
                        if tuple(sample_gt) != (0, 0):
                            gts.append(sample_gt)

                    if qual < 0:
                        qual = var.qual
                    qual = min(qual, var.qual)

                    # Pad each ALT so it is expressed against the longest REF.
                    ref_suffix = longest_ref[len(var.ref):]
                    for alt in list(var.alts):
                        padded_alt = alt + ref_suffix
                        if padded_alt not in alleles and len(padded_alt) > 0:
                            alleles.append(padded_alt)

                if len(alleles) == 2:
                    if merge_genotype:
                        genotype = [1, 1] if len(gts) == 2 else [0, 1]
                    else:
                        # No non-reference genotype at all: report hom-ref.
                        genotype = gts[0] if gts else [0, 0]
                else:
                    genotype = [1, 2]

                vcf_record = vcf_out.new_record(contig=longest_var.contig,
                                                start=longest_var.start,
                                                stop=longest_var.stop,
                                                id=longest_var.id,
                                                qual=qual,
                                                filter=longest_var.filter,
                                                alleles=alleles,
                                                GT=genotype,
                                                GQ=gq)
                vcf_out.write(vcf_record)
    finally:
        if vcf_out is not None:
            vcf_out.close()
        if vcf_in2 is not None:
            vcf_in2.close()
        vcf_in1.close()
class VCFWriter:
    """Write per-position SNP candidates to a timestamped ``CANDIDATES_PEPPER`` VCF."""

    def __init__(self, reference_path, sample_name, output_dir, contigs):
        """
        Open the reference, build the VCF header and open the output file.

        The output is named ``<output_dir>CANDIDATES_PEPPER_<timestamp>.vcf``;
        ``output_dir`` is concatenated as-is.

        :param reference_path: Path handed to ``PEPPER_SNP.FASTA_handler``.
        :param sample_name: Name of the single sample column.
        :param output_dir: Prefix of the output path.
        :param contigs: Container of contig names to declare in the header.
        """
        self.fasta_handler = PEPPER_SNP.FASTA_handler(reference_path)
        vcf_header = self.get_vcf_header(sample_name, contigs)
        time_str = time.strftime("%m%d%Y_%H%M%S")
        self.vcf_file = VariantFile(
            output_dir + "CANDIDATES_PEPPER" + '_' + time_str + '.vcf',
            'w', header=vcf_header)

    def get_genotype(self, ref, alt1, alt2):
        """
        Derive the alt list and genotype for the two haplotype alleles.

        An allele equal to ``ref`` or to ``'*'`` counts as reference.

        :param ref: Reference base.
        :param alt1: Allele of the first haplotype.
        :param alt2: Allele of the second haplotype.
        :return: ``(ref, alt_alleles, genotype)`` where ``genotype`` indexes into
            ``[ref] + alt_alleles``.
        """
        alt1_gt = 0 if (ref == alt1 or alt1 == '*') else 1
        alt2_gt = 0 if (ref == alt2 or alt2 == '*') else 2
        if alt1 == alt2:
            alt2_gt = alt1_gt

        gt = sorted([alt1_gt, alt2_gt])
        if gt == [0, 0]:
            return ref, [], [0, 0]
        if gt == [0, 1]:
            return ref, [alt1], [0, 1]
        if gt == [1, 1]:
            return ref, [alt1], [1, 1]
        if gt == [0, 2]:
            # Only the second haplotype is non-reference; it becomes ALT 1.
            return ref, [alt2], [0, 1]
        # The only remaining combination is [1, 2]: alt1_gt is 0 or 1 and
        # alt2_gt is 0, 2 or a copy of alt1_gt, so [2, 2] cannot occur.
        return ref, [alt1, alt2], [1, 2]

    def get_alleles(self, ref_base, alt_predictions):
        """
        Collect the distinct non-reference alleles predicted for each haplotype.

        ``'*'`` and the reference base are ignored. The lists are returned sorted
        so that the choice of the primary allele is reproducible between runs
        (the previous implementation returned them in set order).

        :param ref_base: Reference base at the position.
        :param alt_predictions: Iterable of ``(alt1, alt2)`` pairs.
        :return: ``(alts1, alts2)`` - sorted lists of alleles per haplotype.
        """
        ignored = ('*', ref_base)
        alts1 = set()
        alts2 = set()
        for alt1, alt2 in alt_predictions:
            if alt1 not in ignored:
                alts1.add(alt1)
            if alt2 not in ignored:
                alts2.add(alt2)
        return sorted(alts1), sorted(alts2)

    def write_vcf_records(self, chromosome_name, called_variants, reference_dict, positions):
        """
        Write one record per non-reference position.

        Positions whose reference base is ``N``/``n`` or whose genotype resolves
        to ``[0, 0]`` are skipped. The first allele of each haplotype defines the
        genotype; any further alleles are appended as extra ALTs.

        ALT alleles are de-duplicated while preserving their order. The previous
        ``tuple(set(...))`` shuffled them, so the GT indices could end up
        pointing at a different allele than the one they were computed for.

        :param chromosome_name: Contig of all positions.
        :param called_variants: Mapping position -> iterable of ``(alt1, alt2)``.
        :param reference_dict: Mapping position -> reference base.
        :param positions: Iterable of positions to emit (written in sorted order).
        """
        for pos in sorted(positions):
            ref_base = reference_dict[pos]
            if ref_base == 'n' or ref_base == 'N':
                continue

            alts1, alts2 = self.get_alleles(ref_base, called_variants[pos])
            alt1 = alts1[0] if alts1 else ref_base
            alt2 = alts2[0] if alts2 else ref_base

            ref, alt_alleles, gt = self.get_genotype(ref_base, alt1, alt2)
            if gt == [0, 0]:
                continue

            # add extra alleles not used for the genotype
            alt_alleles.extend(alts1[1:])
            alt_alleles.extend(alts2[1:])

            # dict.fromkeys keeps first-seen order, so GT indices stay valid.
            alleles = (ref,) + tuple(dict.fromkeys(alt_alleles))
            vcf_record = self.vcf_file.new_record(contig=str(chromosome_name),
                                                  start=pos,
                                                  stop=pos + 1,
                                                  id='.',
                                                  qual=60,
                                                  filter='PASS',
                                                  alleles=alleles,
                                                  GT=gt,
                                                  GQ=60)
            self.vcf_file.write(vcf_record)

    def get_vcf_header(self, sample_name, contigs):
        """
        Build the header: five FILTER lines, GT/GQ FORMAT lines, one contig line
        per reference sequence that is listed in ``contigs``, and the sample.

        :param sample_name: Name of the sample column to add.
        :param contigs: Container of contig names to keep.
        :return: The populated ``VariantHeader``.
        """
        header = VariantHeader()

        filter_lines = (
            ("PASS", "All filters passed"),
            ("refCall", "Call is homozygous"),
            ("lowGQ", "Low genotype quality"),
            ("lowQUAL", "Low variant call quality"),
            ("conflictPos", "Overlapping record"),
        )
        for filter_id, description in filter_lines:
            header.add_meta(key='FILTER',
                            items=[('ID', filter_id), ('Description', description)])

        format_lines = (
            ("GT", 'String', "Genotype"),
            ("GQ", 'Float', "Genotype Quality"),
        )
        for format_id, value_type, description in format_lines:
            header.add_meta(key='FORMAT',
                            items=[('ID', format_id), ('Number', 1),
                                   ('Type', value_type), ('Description', description)])

        for sq in self.fasta_handler.get_chromosome_names():
            if sq not in contigs:
                continue
            length = self.fasta_handler.get_chromosome_sequence_length(sq)
            header.contigs.add(sq, length=length)

        header.add_sample(sample_name)
        return header
class VCFWriter:
    """
    Turn per-allele genotype predictions into VCF records.

    The instance owns the output file; the static helpers convert prediction
    probabilities into a genotype, QUAL and GQ and normalise the alleles.
    """

    def __init__(self, bam_file_path, sample_name, output_dir):
        """
        Open the BAM, build the VCF header and open the output file.

        The output is named ``<output_dir><bam basename up to the first dot>_<timestamp>.vcf``;
        ``output_dir`` is concatenated as-is.

        :param bam_file_path: Path handed to ``BamHandler``.
        :param sample_name: Name of the single sample column.
        :param output_dir: Prefix of the output path.
        """
        self.bam_handler = BamHandler(bam_file_path)
        bam_file_name = bam_file_path.rstrip().split('/')[-1].split('.')[0]
        vcf_header = self.get_vcf_header(sample_name)
        time_str = time.strftime("%m%d%Y_%H%M%S")
        self.vcf_file = VariantFile(
            output_dir + bam_file_name + '_' + time_str + '.vcf',
            'w', header=vcf_header)

    def write_vcf_record(self, chrm, st_pos, end_pos, ref, alts, genotype, qual, gq, rec_filter):
        """
        Write a single record.

        :param chrm: Contig name (converted with ``str``).
        :param st_pos: Start position (converted with ``int``).
        :param end_pos: Inclusive end position; ``int(end_pos) + 1`` is used as stop.
        :param ref: Reference allele.
        :param alts: Iterable of alternate alleles.
        :param genotype: Genotype string such as ``'0/1'``.
        :param qual: Record QUAL.
        :param gq: Genotype quality.
        :param rec_filter: FILTER value.
        """
        alleles = (ref,) + tuple(alts)
        vcf_record = self.vcf_file.new_record(
            contig=str(chrm),
            start=int(st_pos),
            stop=int(end_pos) + 1,
            id='.',
            qual=qual,
            filter=rec_filter,
            alleles=alleles,
            GT=self.get_genotype_tuple(genotype),
            GQ=gq)
        self.vcf_file.write(vcf_record)

    @staticmethod
    def prediction_label_to_allele(label):
        """Map a class label (0-5) to its pair of allele indices as strings."""
        label_to_allele = {
            0: ['0', '0'],
            1: ['0', '1'],
            2: ['1', '1'],
            3: ['0', '2'],
            4: ['2', '2'],
            5: ['1', '2']
        }
        return label_to_allele[label]

    @staticmethod
    def _phred(probability):
        """Phred-scale ``1 - probability``, capped at 60 and rounded up to 2 decimals."""
        error = 1 - probability
        phred = min(60, -10 * np.log10(error) if error >= 0.0000001 else 60)
        return math.ceil(phred * 100.0) / 100.0

    @staticmethod
    def get_qual_and_gq(probabilities, predicted_class):
        """
        Compute phred-scaled QUAL and GQ.

        :param probabilities: Class probabilities; index 0 is the reference class.
        :param predicted_class: Index of the predicted class.
        :return: ``(phred_qual, phred_gq)``.
        """
        qual = 1.0 - probabilities[0]
        gq = probabilities[predicted_class]
        return VCFWriter._phred(qual), VCFWriter._phred(gq)

    @staticmethod
    def solve_multiple_alts(alts, ref):
        """
        Normalise REF and two ALTs so that all alleles share one REF.

        :param alts: Two ``(allele, type)`` pairs; a deletion is given as the
            deleted reference stretch.
        :param ref: Reference allele of the site.
        :return: ``(ref, alt1, alt2)``.
        """
        type1, type2 = alts[0][1], alts[1][1]
        alt1, alt2 = alts[0][0], alts[1][0]
        if type1 == DEL_TYPE and type2 == DEL_TYPE:
            # The longer deletion becomes REF; the shorter one keeps the tail.
            if len(alt2) > len(alt1):
                return alt2, ref, alt2[0] + alt2[len(alt1):]
            return alt1, ref, alt1[0] + alt1[len(alt2):]
        if type1 == IN_TYPE and type2 == IN_TYPE:
            return ref, alt1, alt2
        if type1 == DEL_TYPE:
            if type2 == IN_TYPE or type2 == SNP_TYPE:
                # Extend the other allele with the bases the deletion removes.
                return alt1, ref, alt2 + alt1[1:]
            return alt1, ref, alt2
        if type2 == DEL_TYPE:
            if type1 == IN_TYPE or type1 == SNP_TYPE:
                return alt2, alt1 + alt2[1:], ref
            return alt2, alt1, ref
        return ref, alt1, alt2

    @staticmethod
    def solve_single_alt(alts, ref):
        """
        Normalise REF/ALT for a single alternate allele.

        :param alts: List whose first item is an ``(allele, type)`` pair.
        :param ref: Reference allele of the site.
        :return: ``(ref, alt1, '.')``; for a deletion the deleted stretch becomes REF.
        """
        alt1, alt_type = alts[0]
        if alt_type == DEL_TYPE:
            return alt1, ref, '.'
        return ref, alt1, '.'

    @staticmethod
    def get_genotype_tuple(genotype):
        """Convert ``'a/b'`` into ``(int(a), int(b))``."""
        return tuple(int(value) for value in genotype.split('/'))

    @staticmethod
    def _mean_probabilities(predictions):
        """
        Average the three class probabilities over ``(label, probabilities)`` pairs.

        Returns ``[0.0, 0.0, 0.0]`` when ``predictions`` is empty or ``None``.
        """
        totals = [0.0, 0.0, 0.0]
        if not predictions:
            return totals
        count = 0
        for _label, probability in predictions:
            count += 1
            for j, prob_value in enumerate(probability):
                totals[j] += prob_value
        return [total / count for total in totals]

    @staticmethod
    def process_prediction(pos, prediction_alt1, prediction_alt2):
        """
        Combine the per-allele predictions of a site into genotype, QUAL and GQ.

        Each prediction list holds ``(label, [p_hom_ref, p_het, p_hom_alt])``
        pairs for one alternate allele. The probabilities are averaged per
        allele, combined into the six site genotypes (0/0, 0/1, 1/1, 0/2, 2/2,
        1/2) and normalised; the most probable genotype is reported.

        Fix: the alt2 branch used to re-average ``alt1_probability``, leaving the
        alt2 values as raw sums and halving the alt1 values again. Both alleles
        are now averaged independently.

        :param pos: Position of the site (unused by the computation).
        :param prediction_alt1: Predictions for the first alt allele (may be empty).
        :param prediction_alt2: Predictions for the second alt allele (may be empty).
        :return: ``(genotype, phred_qual, phred_gq)`` with ``genotype`` as ``'a/b'``.
        """
        alt1_probability = VCFWriter._mean_probabilities(prediction_alt1)
        alt2_probability = VCFWriter._mean_probabilities(prediction_alt2)

        # probability that the site genotype is 0/0
        p00 = min(alt1_probability[0], alt2_probability[0])
        p01 = alt1_probability[1]
        p11 = alt1_probability[2]
        p02 = alt2_probability[1]
        p22 = alt2_probability[2]
        p12 = min(max(alt1_probability[1], alt1_probability[2]),
                  max(alt2_probability[1], alt2_probability[2]))

        prob_list = [p00, p01, p11, p02, p22, p12]
        sum_probs = sum(prob_list)
        prob_list = [(float(value) / sum_probs) if sum_probs else 0
                     for value in prob_list]

        # Later labels win ties; label 0 is kept when every probability is 0.
        gq, index = 0, 0
        for i, prob in enumerate(prob_list):
            if gq <= prob and prob > 0:
                index = i
                gq = prob

        # get alts from label
        alleles = VCFWriter.prediction_label_to_allele(index)
        genotype = alleles[0] + '/' + alleles[1]

        qual = sum(prob_list) - prob_list[0]
        return genotype, VCFWriter._phred(qual), VCFWriter._phred(gq)

    @staticmethod
    def get_proper_alleles(positional_record, genotype):
        """
        Select the alleles used by ``genotype`` and normalise them.

        NOTE(review): for genotype ``'0/0'`` the placeholder ``'.'`` is passed to
        ``solve_single_alt``, which expects an ``(allele, type)`` pair and will
        fail to unpack it - confirm that callers never pass ``'0/0'`` here.

        :param positional_record: Object with ``ref``, ``alt1``, ``alt1_type``,
            ``alt2`` and ``alt2_type`` attributes.
        :param genotype: Genotype string such as ``'0/1'``.
        :return: ``(ref, [alt1, alt2], genotype)`` where a genotype that only
            uses the second allele is renumbered (``0/2`` -> ``0/1``,
            ``2/2`` -> ``1/1``).
        """
        alts = [(positional_record.alt1, positional_record.alt1_type),
                (positional_record.alt2, positional_record.alt2_type)]

        gts = genotype.split('/')
        refined_alt = []
        if gts[0] == '0' and gts[1] == '0':
            refined_alt.append('.')
        if gts[0] == '1' or gts[1] == '1':
            refined_alt.append(alts[0])
        if gts[0] == '2' or gts[1] == '2':
            # Both alleles are always present here, so the second one is used directly.
            refined_alt.append(alts[1])

        if len(refined_alt) == 1:
            ref, alt1, alt2 = VCFWriter.solve_single_alt(
                refined_alt, positional_record.ref)
        else:
            ref, alt1, alt2 = VCFWriter.solve_multiple_alts(
                refined_alt, positional_record.ref)

        refined_gt = genotype
        if genotype == '0/2':
            refined_gt = '0/1'
        if genotype == '2/2':
            refined_gt = '1/1'
        return ref, [alt1, alt2], refined_gt

    @staticmethod
    def get_filter(record, last_end):
        """
        Choose the FILTER value for a record.

        :param record: 8-tuple ``(chrm, st_pos, end_pos, ref, alt_field,
            genotype, phred_qual, phred_gq)``.
        :param last_end: End of the previously written record; a record starting
            before it is flagged ``conflictPos``.
        :return: One of ``conflictPos``, ``refCall``, ``lowQUAL``, ``lowGQ``, ``PASS``.
        """
        chrm, st_pos, end_pos, ref, alt_field, genotype, phred_qual, phred_gq = record
        if st_pos < last_end:
            return 'conflictPos'
        if genotype == '0/0':
            return 'refCall'
        if phred_qual < 0:
            return 'lowQUAL'
        if phred_gq < 0:
            return 'lowGQ'
        return 'PASS'

    def get_vcf_header(self, sample_name):
        """
        Build the header: five FILTER lines, GT/GQ FORMAT lines, one contig line
        per ``SN``/``LN`` entry reported by the BAM handler, and the sample.

        :param sample_name: Name of the sample column to add.
        :return: The populated ``VariantHeader``.
        """
        header = VariantHeader()

        filter_lines = (
            ("PASS", "All filters passed"),
            ("refCall", "Call is homozygous"),
            ("lowGQ", "Low genotype quality"),
            ("lowQUAL", "Low variant call quality"),
            ("conflictPos", "Overlapping record"),
        )
        for filter_id, description in filter_lines:
            header.add_meta(key='FILTER',
                            items=[('ID', filter_id), ('Description', description)])

        format_lines = (
            ("GT", 'String', "Genotype"),
            ("GQ", 'Float', "Genotype Quality"),
        )
        for format_id, value_type, description in format_lines:
            header.add_meta(key='FORMAT',
                            items=[('ID', format_id), ('Number', 1),
                                   ('Type', value_type), ('Description', description)])

        for sq in self.bam_handler.get_header_sq():
            header.add_meta(key='contig',
                            items=[('ID', sq['SN']), ('length', sq['LN'])])

        header.add_sample(sample_name)
        return header
class VCFWriter:
    """
    Turn genotype predictions into VCF records.

    The instance owns the output file; the static helpers convert a list of
    class predictions into a genotype, QUAL and GQ and normalise the alleles.
    """

    def __init__(self, bam_file_path, sample_name, output_dir):
        """
        Open the BAM, build the VCF header and open the output file.

        The output is named ``<output_dir><bam basename up to the first dot>_<timestamp>.vcf``;
        ``output_dir`` is concatenated as-is.

        :param bam_file_path: Path handed to ``BamHandler``.
        :param sample_name: Name of the single sample column.
        :param output_dir: Prefix of the output path.
        """
        self.bam_handler = BamHandler(bam_file_path)
        base_name = bam_file_path.rstrip().split('/')[-1]
        stem = base_name.split('.')[0]
        header = self.get_vcf_header(sample_name)
        timestamp = time.strftime("%m%d%Y_%H%M%S")
        output_path = output_dir + stem + '_' + timestamp + '.vcf'
        self.vcf_file = VariantFile(output_path, 'w', header=header)

    def write_vcf_record(self, chrm, st_pos, end_pos, ref, alts, genotype, qual, gq, rec_filter):
        """
        Write a single record.

        :param chrm: Contig name (converted with ``str``).
        :param st_pos: Start position (converted with ``int``).
        :param end_pos: Inclusive end position; ``int(end_pos) + 1`` is used as stop.
        :param ref: Reference allele.
        :param alts: Iterable of alternate alleles.
        :param genotype: Genotype string such as ``'0/1'``.
        :param qual: Record QUAL.
        :param gq: Genotype quality.
        :param rec_filter: FILTER value.
        """
        all_alleles = (ref,) + tuple(alts)
        genotype_indices = self.get_genotype_tuple(genotype)
        stop = int(end_pos) + 1
        start = int(st_pos)
        record = self.vcf_file.new_record(
            contig=str(chrm),
            start=start,
            stop=stop,
            id='.',
            qual=qual,
            filter=rec_filter,
            alleles=all_alleles,
            GT=genotype_indices,
            GQ=gq)
        self.vcf_file.write(record)

    @staticmethod
    def prediction_label_to_allele(label):
        """Map a class label (0-5) to its pair of allele indices as strings."""
        return {
            0: ['0', '0'],
            1: ['0', '1'],
            2: ['1', '1'],
            3: ['0', '2'],
            4: ['2', '2'],
            5: ['1', '2'],
        }[label]

    @staticmethod
    def get_qual_and_gq(probabilities, predicted_class):
        """
        Compute phred-scaled QUAL and GQ, capped at 60 and rounded up to 2 decimals.

        :param probabilities: Class probabilities; index 0 is the reference class.
        :param predicted_class: Index of the predicted class.
        :return: ``(phred_qual, phred_gq)``.
        """
        def to_phred(probability):
            # Error probabilities below 1e-7 are reported as the cap.
            if 1 - probability >= 0.0000001:
                scaled = min(60, -10 * np.log10(1 - probability))
            else:
                scaled = min(60, 60)
            return math.ceil(scaled * 100.0) / 100.0

        qual = 1.0 - probabilities[0]
        gq = probabilities[predicted_class]
        return to_phred(qual), to_phred(gq)

    @staticmethod
    def solve_multiple_alts(alts, ref):
        """
        Normalise REF and two ALTs so that all alleles share one REF.

        :param alts: Two ``(allele, type)`` pairs.
        :param ref: Reference allele of the site.
        :return: ``(ref, alt1, alt2)``.
        """
        (alt1, type1), (alt2, type2) = alts[0][:2], alts[1][:2]

        if type1 == DEL_TYPE and type2 == DEL_TYPE:
            # The longer deletion becomes REF; the shorter one keeps the tail.
            if len(alt2) > len(alt1):
                return alt2, ref, alt2[0] + alt2[len(alt1):]
            return alt1, ref, alt1[0] + alt1[len(alt2):]

        if type1 == IN_TYPE and type2 == IN_TYPE:
            return ref, alt1, alt2

        if type1 == DEL_TYPE:
            if type2 == IN_TYPE or type2 == SNP_TYPE:
                return alt1, ref, alt2 + alt1[1:]
            return alt1, ref, alt2

        if type2 == DEL_TYPE:
            if type1 == IN_TYPE or type1 == SNP_TYPE:
                return alt2, alt1 + alt2[1:], ref
            return alt2, alt1, ref

        return ref, alt1, alt2

    @staticmethod
    def solve_single_alt(alts, ref):
        """
        Normalise REF/ALT for a single alternate allele.

        :param alts: List whose first item is an ``(allele, type)`` pair.
        :param ref: Reference allele of the site.
        :return: ``(ref, alt1, '.')``; for a deletion the given allele becomes REF.
        """
        allele, allele_type = alts[0]
        return (allele, ref, '.') if allele_type == DEL_TYPE else (ref, allele, '.')

    @staticmethod
    def get_genotype_tuple(genotype):
        """Convert ``'a/b'`` into ``(int(a), int(b))``."""
        return tuple(int(part) for part in genotype.split('/'))

    @staticmethod
    def process_prediction(pos, predictions):
        """
        Reduce the predictions of a site to genotype, QUAL and GQ.

        The genotype is the most frequent label. The class probabilities are the
        per-class minima over all predictions, normalised to sum to 1. If that
        sum is not positive a message is printed and ``exit()`` is called.

        :param pos: Position of the site (only used in the error message).
        :param predictions: List of ``(label, probabilities)`` pairs.
        :return: ``(genotype, qual, gq)`` with ``genotype`` as ``'a/b'``.
        """
        # get the list of prediction labels
        labels = [label for label, _probs in predictions]
        predicted_class = max(set(labels), key=labels.count)

        # get alts from label
        allele_pair = VCFWriter.prediction_label_to_allele(predicted_class)
        genotype = allele_pair[0] + '/' + allele_pair[1]

        # get the probabilities
        probability_rows = [probs for _label, probs in predictions]
        class_minima = []
        for class_index in range(len(probability_rows[0])):
            class_minima.append(min(row[class_index] for row in probability_rows))

        # normalize the probabilities
        total = sum(class_minima)
        divisor = total if total > 0 else 1
        if total <= 0:
            print("SUM ZERO ENCOUNTERED IN: ", pos, predictions)
            exit()
        probabilities = [float(value) / divisor for value in class_minima]

        qual, gq = VCFWriter.get_qual_and_gq(probabilities, predicted_class)
        return genotype, qual, gq

    @staticmethod
    def get_proper_alleles(record):
        """
        Select the alleles used by the record's genotype and normalise them.

        When the genotype references the second allele but only one alt is
        available, the genotype is renumbered to use the first allele.

        :param record: 5-tuple ``(ref, alt_field, genotype, phred_qual, phred_gq)``
            where ``alt_field`` is a list of ``(allele, type)`` pairs.
        :return: ``(ref, [alt1, alt2], phred_qual, phred_gq, genotype)``.
        """
        ref, alt_field, genotype, phred_qual, phred_gq = record
        first, second = genotype.split('/')[:2]

        refined_alt = []
        if first == '0' and second == '0':
            refined_alt.append('.')
        if '1' in (first, second):
            refined_alt.append(alt_field[0])
        if '2' in (first, second):
            if len(alt_field) > 1:
                refined_alt.append(alt_field[1])
            elif genotype == '0/2':
                refined_alt.append(alt_field[0])
                genotype = '0/1'
            elif genotype == '2/2':
                refined_alt.append(alt_field[0])
                genotype = '1/1'
            elif genotype == '1/2':
                genotype = '0/1'

        solver = (VCFWriter.solve_single_alt if len(refined_alt) == 1
                  else VCFWriter.solve_multiple_alts)
        ref, alt1, alt2 = solver(refined_alt, ref)

        # A genotype that only uses the second allele is renumbered.
        refined_gt = {'0/2': '0/1', '2/2': '1/1'}.get(genotype, genotype)
        return ref, [alt1, alt2], phred_qual, phred_gq, refined_gt

    @staticmethod
    def get_filter(record, last_end):
        """
        Choose the FILTER value for a record.

        :param record: 8-tuple ``(chrm, st_pos, end_pos, ref, alt_field,
            genotype, phred_qual, phred_gq)``.
        :param last_end: End of the previously written record; a record starting
            before it is flagged ``conflictPos``.
        :return: One of ``conflictPos``, ``refCall``, ``lowQUAL``, ``lowGQ``, ``PASS``.
        """
        _chrm, st_pos, _end_pos, _ref, _alt_field, genotype, phred_qual, phred_gq = record
        if st_pos < last_end:
            verdict = 'conflictPos'
        elif genotype == '0/0':
            verdict = 'refCall'
        elif phred_qual < 0:
            verdict = 'lowQUAL'
        elif phred_gq < 0:
            verdict = 'lowGQ'
        else:
            verdict = 'PASS'
        return verdict

    def get_vcf_header(self, sample_name):
        """
        Build the header: five FILTER lines, GT/GQ FORMAT lines, one contig line
        per ``SN``/``LN`` entry reported by the BAM handler, and the sample.

        :param sample_name: Name of the sample column to add.
        :return: The populated ``VariantHeader``.
        """
        header = VariantHeader()

        filter_lines = (
            ("PASS", "All filters passed"),
            ("refCall", "Call is homozygous"),
            ("lowGQ", "Low genotype quality"),
            ("lowQUAL", "Low variant call quality"),
            ("conflictPos", "Overlapping record"),
        )
        for filter_id, description in filter_lines:
            header.add_meta(key='FILTER',
                            items=[('ID', filter_id), ('Description', description)])

        format_lines = (
            ("GT", 'String', "Genotype"),
            ("GQ", 'Float', "Genotype Quality"),
        )
        for format_id, value_type, description in format_lines:
            header.add_meta(key='FORMAT',
                            items=[('ID', format_id), ('Number', 1),
                                   ('Type', value_type), ('Description', description)])

        for sequence in self.bam_handler.get_header_sq():
            contig_items = [('ID', sequence['SN']), ('length', sequence['LN'])]
            header.add_meta(key='contig', items=contig_items)

        header.add_sample(sample_name)
        return header
def _collect_adjacent_snv_pairs(m2vcf):
    """Return merge candidates built from consecutive single-base records.

    Two records qualify when they sit on the same contig at positions ``p``
    and ``p + 1``, both REF alleles are one base long and the second record
    has exactly one ALT.  Each candidate is a dict with the keys ``chrom``,
    ``start_pos``, ``end_pos`` (1-based, as reported by ``record.pos``),
    ``ref`` and ``alt`` (the two REF / first-ALT strings concatenated).
    """
    candidates = []
    prev_chrom, prev_pos, prev_ref, prev_alts = '', -1, '', ()
    for record in m2vcf.fetch():
        chrom, pos, ref, alts = record.chrom, record.pos, record.ref, record.alts
        is_adjacent_pair = (
            chrom == prev_chrom
            and pos == prev_pos + 1
            and len(prev_ref) == 1
            and len(ref) == 1
            # ``alts`` is None for a record without ALT; such a record (or
            # such a predecessor) can never form a pair.
            and alts is not None
            and len(alts) == 1
            and bool(prev_alts)
        )
        if is_adjacent_pair:
            candidates.append({
                "chrom": chrom,
                "start_pos": prev_pos,
                "end_pos": pos,
                "ref": prev_ref + ref,
                # Only the first ALT of the preceding record is considered.
                "alt": prev_alts[0] + alts[0],
            })
        prev_chrom, prev_pos, prev_ref, prev_alts = chrom, pos, ref, alts
    return candidates


def _count_pair_support(m2bam, v):
    """Count read support for one candidate and store the tallies on ``v``.

    Adds the keys ``ref_cnt``, ``alt_cnt``, ``alt_first_cnt``,
    ``alt_second_cnt``, ``f1r2``, ``f2r1`` and ``dp`` to ``v``.  Only primary,
    mapped, non-duplicate reads that have an aligned base at BOTH positions
    contribute (``dp`` counts exactly those reads).
    """
    first_ref_pos = v["start_pos"] - 1  # 0-based reference coordinates
    second_ref_pos = v["end_pos"] - 1

    ref_read_cnt = alt_read_cnt = 0
    alt_first_cnt = alt_second_cnt = 0
    f1r2_ref_cnt = f2r1_ref_cnt = 0
    f1r2_alt_cnt = f2r1_alt_cnt = 0
    dp = 0

    for read in m2bam.fetch(v["chrom"], first_ref_pos, v["end_pos"]):
        if (read.is_secondary or read.is_supplementary
                or read.is_unmapped or read.is_duplicate):
            continue
        bases = read.query_sequence
        if bases is None:
            # No sequence stored for this read; nothing to compare.
            continue

        # full_length=True yields one entry per base of ``query_sequence``
        # (None for soft-clipped / inserted bases), so the list index IS the
        # query index.  The default form drops those bases and the index
        # would point at the wrong base on clipped or insertion reads.
        ref_positions = read.get_reference_positions(full_length=True)
        try:
            first_idx = ref_positions.index(first_ref_pos)
            second_idx = ref_positions.index(second_ref_pos)
        except ValueError:
            # Read does not cover both positions (deletion, splice, edge).
            continue

        query_seq = bases[first_idx] + bases[second_idx]
        if query_seq == v["ref"]:
            ref_read_cnt += 1
            if read.is_read1:
                f1r2_ref_cnt += 1
            elif read.is_read2:
                f2r1_ref_cnt += 1
        elif query_seq == v["alt"]:
            alt_read_cnt += 1
            if read.is_read1:
                f1r2_alt_cnt += 1
            elif read.is_read2:
                f2r1_alt_cnt += 1
        elif query_seq[0] != v["ref"][0] and query_seq[1] == v["ref"][1]:
            # Only the first base differs from the reference.
            alt_first_cnt += 1
        elif query_seq[0] == v["ref"][0] and query_seq[1] != v["ref"][1]:
            # Only the second base differs from the reference.
            alt_second_cnt += 1
        dp += 1

    v["ref_cnt"] = ref_read_cnt
    v["alt_cnt"] = alt_read_cnt
    v["alt_first_cnt"] = alt_first_cnt
    v["alt_second_cnt"] = alt_second_cnt
    v["f1r2"] = (f1r2_ref_cnt, f1r2_alt_cnt)
    v["f2r1"] = (f2r1_ref_cnt, f2r1_alt_cnt)
    v["dp"] = dp


def run_process(opts, mutect2_vcf, mutect2_bam):
    """Merge adjacent SNV calls that are supported in phase by the reads.

    The input VCF is scanned for pairs of single-base records at consecutive
    positions.  For every pair the BAM is queried to count reads carrying
    both alternate bases together.  The output VCF contains every input
    record, except that a member of a pair is dropped when no read shows
    that base changed on its own, plus one merged two-base record (FORMAT
    ``MDV`` set) for each pair with at least one read carrying both changes.

    Args:
        opts: object whose ``output`` attribute is the output path; a falsy
            value writes to standard output (``'-'``).
        mutect2_vcf: path of the input VCF (presumably Mutect2 output --
            TODO confirm with caller).
        mutect2_bam: path of the BAM file passed to ``AlignmentFile``.

    Returns:
        None.  The result is written to the output VCF.
    """
    outputvcf = opts.output

    with VariantFile(mutect2_vcf) as m2vcf:
        variants_list = _collect_adjacent_snv_pairs(m2vcf)

        with AlignmentFile(mutect2_bam, 'rb') as m2bam:
            for v in variants_list:
                _count_pair_support(m2bam, v)

        # Index the read-supported candidates by the position of their first
        # base so each VCF record is matched by lookup instead of a rescan.
        supported = {}
        for v in variants_list:
            if v["alt_cnt"] != 0:
                supported.setdefault((v["chrom"], v["start_pos"]), []).append(v)

        new_header = m2vcf.header
        new_header.formats.add("MDV", "1", "Integer", "Merged Di-Allelic Variant : Backed Phased variant that was splited snp before")

        with VariantFile(outputvcf if outputvcf else '-', 'w', header=new_header) as vcf_out:
            # Set by a pair's first record when its second record (the next
            # one in file order) must be dropped.
            keep_current = True
            for record in m2vcf.fetch():
                merged_here = supported.get((record.chrom, record.pos), ())

                keep_record = keep_current
                keep_current = True
                for v in merged_here:
                    if v["alt_first_cnt"] == 0:
                        keep_record = False
                    if v["alt_second_cnt"] == 0:
                        keep_current = False

                if keep_record:
                    vcf_out.write(record)

                for v in merged_here:
                    record2 = vcf_out.new_record()
                    record2.chrom = v["chrom"]
                    record2.pos = v["start_pos"]
                    record2.ref = v["ref"]
                    record2.alts = (v["alt"],)
                    record2.info["DP"] = v["dp"]
                    if "F1R2" in record2.samples[0]:
                        record2.samples[0]["F1R2"] = v["f1r2"]
                        record2.samples[0]["F2R1"] = v["f2r1"]
                    record2.samples[0]["AD"] = (v["ref_cnt"], v["alt_cnt"])
                    record2.samples[0]["DP"] = v["dp"]
                    # alt_cnt != 0 here, and dp >= alt_cnt, so dp is non-zero.
                    record2.samples[0]["AF"] = float(v["alt_cnt"]) / float(v["dp"])
                    record2.samples[0]["GT"] = ("0", "0")
                    record2.samples[0]["MDV"] = True
                    vcf_out.write(record2)
class VCFWriter:
    """Write genotype calls to a timestamped VCF file named after a BAM file.

    The header (filters, GT/GQ formats, contigs, sample) is built from the
    BAM's sequence dictionary.  Records are written one at a time with
    ``write_vcf_record``.  The remaining static methods turn per-candidate
    genotype probabilities into the fields of a record.

    The instance can be used as a context manager; ``close()`` flushes and
    closes the output file.
    """

    def __init__(self, bam_file_path, sample_name, output_dir):
        """Open ``<output_dir><bam stem>_<MMDDYYYY_HHMMSS>.vcf`` for writing.

        Args:
            bam_file_path: path of the BAM file; its ``@SQ`` entries become
                the VCF contigs and its base name (up to the first ``.``)
                becomes the VCF file-name prefix.
            sample_name: sample column name added to the header.
            output_dir: directory prefix, concatenated as-is (so it is
                expected to end with a path separator).
        """
        self.bam_handler = BamHandler(bam_file_path)
        bam_file_name = bam_file_path.rstrip().split('/')[-1].split('.')[0]

        vcf_header = self.get_vcf_header(sample_name)
        time_str = time.strftime("%m%d%Y_%H%M%S")

        self.vcf_file = VariantFile(output_dir + bam_file_name + '_' + time_str + '.vcf', 'w', header=vcf_header)

    def close(self):
        """Close the underlying VCF file so buffered records reach disk."""
        self.vcf_file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def write_vcf_record(self, chrm, st_pos, end_pos, ref, alts, genotype, qual, gq, rec_filter):
        """Create one record from the given fields and write it.

        ``genotype`` is a ``'a/b'`` string; ``end_pos`` is incremented by one
        before being passed as ``stop``.
        """
        alleles = (ref,) + tuple(alts)
        genotype = self.get_genotype_tuple(genotype)
        end_pos = int(end_pos) + 1
        st_pos = int(st_pos)
        vcf_record = self.vcf_file.new_record(contig=chrm, start=st_pos, stop=end_pos, id='.', qual=qual,
                                              filter=rec_filter, alleles=alleles, GT=genotype, GQ=gq)
        self.vcf_file.write(vcf_record)

    @staticmethod
    def _phred_scale(probability):
        """Convert a probability of being correct into a capped Phred score.

        Computes ``-10 * log10(1 - probability)``, capped at 60 (also used
        when the error term drops below 1e-7), rounded UP to two decimals.
        """
        error = 1 - probability
        phred = min(60, -10 * np.log10(error) if error >= 0.0000001 else 60)
        return math.ceil(phred * 100.0) / 100.0

    @staticmethod
    def solve_multiple_alts(alts, ref):
        """Combine two alternate alleles into one ``(ref, alt1, alt2)`` triple.

        Args:
            alts: two ``(allele, allele_type)`` pairs, where ``allele_type``
                is one of the module constants DEL_TYPE / IN_TYPE / SNP_TYPE.
            ref: reference allele shared by both candidates.

        Returns:
            A 3-tuple that callers unpack as ``(ref, alt1, alt2)``.  When a
            deletion is involved its allele string is returned in the ``ref``
            slot and the other alleles are extended with its tail
            (presumably because a DEL allele carries the deleted reference
            span -- TODO confirm against the candidate finder).
        """
        type1, type2 = alts[0][1], alts[1][1]
        alt1, alt2 = alts[0][0], alts[1][0]

        if type1 == DEL_TYPE and type2 == DEL_TYPE:
            # The longer deletion defines the reference span.
            if len(alt2) > len(alt1):
                return alt2, ref, alt2[0] + alt2[len(alt1):]
            return alt1, ref, alt1[0] + alt1[len(alt2):]

        if type1 == IN_TYPE and type2 == IN_TYPE:
            return ref, alt1, alt2

        if type1 == DEL_TYPE or type2 == DEL_TYPE:
            if type1 == DEL_TYPE and type2 == IN_TYPE:
                return alt1, ref, alt2 + alt1[1:]
            if type1 == IN_TYPE and type2 == DEL_TYPE:
                return alt2, alt1 + alt2[1:], ref
            if type1 == DEL_TYPE and type2 == SNP_TYPE:
                return alt1, ref, alt2 + alt1[1:]
            if type1 == SNP_TYPE and type2 == DEL_TYPE:
                return alt2, alt1 + alt2[1:], ref
            # Deletion paired with an allele type not covered above.
            if type1 == DEL_TYPE:
                return alt1, ref, alt2
            return alt2, alt1, ref

        return ref, alt1, alt2

    @staticmethod
    def solve_single_alt(alts, ref):
        """Return ``(ref, alt, '.')`` for one ``(allele, allele_type)`` pair.

        For DEL_TYPE the allele and ``ref`` swap places.
        """
        alt1, alt_type = alts
        if alt_type == DEL_TYPE:
            return alt1, ref, '.'
        return ref, alt1, '.'

    @staticmethod
    def get_genotype_tuple(genotype):
        """Convert a ``'0/1'``-style string into a tuple of ints."""
        return tuple(int(x) for x in genotype.split('/'))

    @staticmethod
    def get_genotype_for_multiple_allele(records):
        """Pick the most likely genotype for a site with two alternate alleles.

        Args:
            records: candidate rows for one site.  Fields read from each row:
                ``[0]`` chromosome, ``[1]`` start, ``[2]`` end, ``[3]`` ref,
                ``[4]`` alt1, ``[5]`` alt2 (``'.'`` when absent), ``[6]``
                type of alt1, ``[8:]`` genotype probabilities of which
                indexes 0, 1 and 2 are used.  One row is expected per single
                alt plus one row holding both alts.

        Returns:
            ``(chrm, st_pos, end_pos, ref, [alt1, alt2], genotype,
            phred_qual, phred_gq)`` with ``genotype`` one of
            ``'0/0', '0/1', '1/1', '0/2', '2/2', '1/2'``.
        """
        ref = '.'
        st_pos = 0
        end_pos = 0
        chrm = ''
        rec_alt1 = '.'
        rec_alt2 = '.'
        alt_probs = defaultdict(list)
        alt_with_types = []

        for record in records:
            chrm = record[0]
            st_pos = record[1]
            end_pos = record[2]
            ref = record[3]
            alt1 = record[4]
            alt2 = record[5]
            if alt1 != '.' and alt2 != '.':
                rec_alt1 = alt1
                rec_alt2 = alt2
                alt_probs['both'] = (record[8:])
            else:
                alt_probs[alt1] = (record[8:])
                # NOTE(review): indentation was lost in this file; the append
                # is taken to belong to the single-alt branch, which is what
                # the [0]/[1] indexing further down requires.
                alt_with_types.append((alt1, record[6]))

        both = alt_probs['both']
        first = alt_probs[rec_alt1]
        second = alt_probs[rec_alt2]

        # Each genotype is scored by the weakest supporting prediction.
        p00 = min(first[0], second[0], both[0])
        p01 = min(first[1], both[1])
        p11 = min(first[2], both[2])
        p02 = min(second[1], both[1])
        p22 = min(second[2], both[2])
        p12 = min(max(first[1], first[2]),
                  max(second[1], second[2]),
                  max(both[1], both[2]))

        prob_list = [p00, p01, p11, p02, p22, p12]
        sum_probs = sum(prob_list)
        prob_list = [(float(i) / sum_probs) if sum_probs else 0 for i in prob_list]

        genotype_list = ['0/0', '0/1', '1/1', '0/2', '2/2', '1/2']
        # Highest non-zero probability wins; ties go to the later genotype.
        gq, index = 0, 0
        for i, prob in enumerate(prob_list):
            if gq <= prob and prob > 0:
                index = i
                gq = prob
        qual = sum(prob_list) - prob_list[0]

        if index == 5:
            ref, rec_alt1, rec_alt2 = VCFWriter.solve_multiple_alts(alt_with_types, ref)
        elif index <= 2:
            ref, rec_alt1, rec_alt2 = VCFWriter.solve_single_alt(alt_with_types[0], ref)
        else:
            ref, rec_alt2, rec_alt1 = VCFWriter.solve_single_alt(alt_with_types[1], ref)

        phred_qual = VCFWriter._phred_scale(qual)
        phred_gq = VCFWriter._phred_scale(gq)

        return chrm, st_pos, end_pos, ref, [rec_alt1, rec_alt2], genotype_list[index], phred_qual, phred_gq

    @staticmethod
    def get_genotype_for_single_allele(records):
        """Pick the most likely genotype for a site with one alternate allele.

        Only the first row of ``records`` is used (same field layout as
        ``get_genotype_for_multiple_allele``, probabilities at ``[8:11]``).

        Returns:
            ``(chrm, st_pos, end_pos, ref, [alt1, alt2], genotype,
            phred_qual, phred_gq)``, or ``None`` when ``records`` is empty.
        """
        # NOTE(review): indentation was lost in this file; the original loop
        # is read as returning from its first iteration.
        record = next(iter(records), None)
        if record is None:
            return None

        probs = [record[8], record[9], record[10]]
        genotype_list = ['0/0', '0/1', '1/1']
        # max over (probability, index): ties resolve to the higher index.
        gq, index = max((v, i) for i, v in enumerate(probs))
        qual = sum(probs) - probs[0]

        ref, alt1, alt2 = VCFWriter.solve_single_alt((record[4], record[6]), record[3])

        phred_qual = VCFWriter._phred_scale(qual)
        phred_gq = VCFWriter._phred_scale(gq)

        return record[0], record[1], record[2], ref, [alt1, alt2], genotype_list[index], phred_qual, phred_gq

    @staticmethod
    def get_proper_alleles(record):
        """Keep only the alleles the genotype refers to and renumber it.

        ``'0/2'`` becomes ``'0/1'`` and ``'2/2'`` becomes ``'1/1'`` because
        the unused first alt is dropped; ``'0/0'`` gets the placeholder alt
        ``'.'``.  ``end_pos`` is recomputed as ``st_pos + len(ref) - 1``.
        """
        chrm, st_pos, end_pos, ref, alt_field, genotype, phred_qual, phred_gq = record
        gts = genotype.split('/')
        refined_alt = []
        refined_gt = genotype

        if gts[0] == '1' or gts[1] == '1':
            refined_alt.append(alt_field[0])
        if gts[0] == '2' or gts[1] == '2':
            refined_alt.append(alt_field[1])
        if gts[0] == '0' and gts[1] == '0':
            refined_alt.append('.')

        if genotype == '0/2':
            refined_gt = '0/1'
        if genotype == '2/2':
            refined_gt = '1/1'

        end_pos = st_pos + len(ref) - 1
        return chrm, st_pos, end_pos, ref, refined_alt, refined_gt, phred_qual, phred_gq

    @staticmethod
    def get_filter(record, last_end):
        """Return the FILTER value for ``record``.

        Checks, in order: start not beyond ``last_end`` -> ``'conflictPos'``;
        genotype ``'0/0'`` -> ``'refCall'``; QUAL <= 1 -> ``'lowQUAL'``;
        GQ <= 1 -> ``'lowGQ'``; otherwise ``'PASS'``.
        """
        chrm, st_pos, end_pos, ref, alt_field, genotype, phred_qual, phred_gq = record
        if st_pos <= last_end:
            return 'conflictPos'
        if genotype == '0/0':
            return 'refCall'
        if phred_qual <= 1:
            return 'lowQUAL'
        if phred_gq <= 1:
            return 'lowGQ'
        return 'PASS'

    def get_vcf_header(self, sample_name):
        """Build the VCF header: filters, GT/GQ formats, BAM contigs, sample."""
        header = VariantHeader()

        filters = (
            ("PASS", "All filters passed"),
            ("refCall", "Call is homozygous"),
            ("lowGQ", "Low genotype quality"),
            ("lowQUAL", "Low variant call quality"),
            ("conflictPos", "Overlapping record"),
        )
        for filter_id, description in filters:
            header.add_meta(key='FILTER', items=[('ID', filter_id), ('Description', description)])

        formats = (
            ("GT", 'String', "Genotype"),
            ("GQ", 'Float', "Genotype Quality"),
        )
        for format_id, value_type, description in formats:
            items = [('ID', format_id),
                     ('Number', 1),
                     ('Type', value_type),
                     ('Description', description)]
            header.add_meta(key='FORMAT', items=items)

        # One contig line per @SQ entry of the BAM header.
        for sq in self.bam_handler.get_header_sq():
            sq_id = sq['SN']
            ln = sq['LN']
            header.add_meta(key='contig', items=[('ID', sq_id), ('length', ln)])

        header.add_sample(sample_name)
        return header