def format_pindel_vcf(input_vcf: str, output_vcf: str) -> None:
    """
    Formats Pindel VCFs to work better with GDC downstream workflows.

    Each record is rebuilt on the output writer's header (from
    ``get_header``); homozygous-reference TUMOR genotypes (0/0) are
    rewritten as heterozygous (0/1), and the INFO column is regenerated
    by ``get_info``.

    :param input_vcf: The input VCF file to filter.
    :param output_vcf: The output filtered VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("format_pindel_vcf")
    logger.info("Formats Pindel VCFs.")

    # setup
    total = 0
    reader = pysam.VariantFile(input_vcf)
    # Output header is derived from the input header by the project helper.
    header = get_header(reader.header)
    mode = get_pysam_outmode(output_vcf)
    writer = pysam.VariantFile(output_vcf, mode=mode, header=header)

    # Process
    try:
        for record in reader.fetch():
            total += 1
            tgt = record.samples["TUMOR"]["GT"]
            # Pindel can emit 0/0 TUMOR genotypes; flag and force to 0/1.
            flag = tgt == (0, 0)
            if flag:
                record.samples["TUMOR"]["GT"] = (0, 1)

            # Info — regenerated by the helper; `flag` marks the GT rewrite.
            new_info = get_info(record, flag)

            # New record built on the writer's header. NOTE(review): the
            # assignment order matters in pysam — alleles before start/stop
            # so the record length is consistent; do not reorder.
            new_record = writer.new_record()
            new_record.contig = record.contig
            new_record.alleles = record.alleles
            new_record.start = record.start
            new_record.stop = record.stop
            new_record.id = record.id
            new_record.qual = record.qual
            # Copy FILTER entries one by one onto the new record.
            for f in record.filter:
                new_record.filter.add(f)
            # `new_info` is an iterable of (key, value) pairs.
            for i in new_info:
                new_record.info[i[0]] = i[1]
            # Copy all FORMAT fields; samples are matched by position,
            # which assumes the helper header preserves sample order —
            # presumably true for get_header; verify against its source.
            for i, sample in enumerate(record.samples):
                for k, v in record.samples[sample].items():
                    new_record.samples[i][k] = v
            writer.write(new_record)
    finally:
        reader.close()
        writer.close()

    if mode == "wz":
        logger.info("Creating tabix index...")
        tbx = pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records.".format(total))
def filter_nonstandard_variants(input_vcf: str, output_vcf: str) -> None:
    """
    Remove non-ACTG loci from a VCF.

    Any record whose REF/ALT alleles contain a character outside
    ``ALLOWED_BASES`` (after upper-casing) is dropped with a warning.

    :param input_vcf: The input VCF file to filter.
    :param output_vcf: The output filtered VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("filter_nonstandard_variants")
    logger.info("Drops non-ACTG loci from a VCF.")

    # Counters
    total = 0
    removed = 0
    written = 0

    # Reader over the full VCF; writer reuses the input header unchanged.
    vcf_in = pysam.VariantFile(input_vcf)
    out_mode = get_pysam_outmode(output_vcf)
    vcf_out = pysam.VariantFile(output_vcf, mode=out_mode, header=vcf_in.header)

    # Process
    try:
        for variant in vcf_in.fetch():
            total += 1
            # All allele characters, upper-cased, as a flat list.
            bases = list(''.join(list(variant.alleles)).upper())
            # Anything left after removing the allowed alphabet is bad.
            if set(bases) - ALLOWED_BASES:
                logger.warning("Removing {0}:{1}:{2}".format(
                    variant.chrom, variant.pos, ",".join(bases)))
                removed += 1
            else:
                written += 1
                vcf_out.write(variant)
    finally:
        vcf_in.close()
        vcf_out.close()

    if out_mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Removed {}; Wrote {} ".format(
        total, removed, written))
def dtoxog_maf_to_vcf(input_maf: str, reference_fa: str, output_vcf: str) -> None:
    """
    Transforms dToxoG MAF to minimal VCF of only dtoxo failures.

    Only MAF rows with ``oxoGCut == "1"`` (failures) are emitted.

    :param input_maf: The annotated dtoxog MAF output file.
    :param reference_fa: Reference fasta used to make seqdict header.
    :param output_vcf: The output minimal VCF with only failed dtoxog records
        BGzip and tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("dtoxog_maf_to_vcf")
    logger.info("Transforms dToxoG MAF to minimal VCF of dtoxo failures")

    # Counters and the FILTER tag used for failed records.
    total = 0
    written = 0
    tag = "oxog"

    # Header carries a sequence dictionary built from the reference fasta.
    header = generate_header(reference_fa, tag)

    # Writer
    mode = get_pysam_outmode(output_vcf)
    writer = VariantFile(output_vcf, mode=mode, header=header)

    # Process
    try:
        with open(input_maf, "rt") as fh:
            for row in maf_generator(fh):
                total += 1
                # Keep only dToxoG failures; skip everything else.
                if row["oxoGCut"] != "1":
                    continue
                writer.write(build_new_record(row, writer, tag))
                written += 1
    finally:
        writer.close()

    if mode == "wz":
        logger.info("Creating tabix index...")
        tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Wrote {}".format(total, written))
def position_filter_dkfz(input_vcf: str, output_vcf: str) -> None:
    """
    Removes VCF records where the POS-2 is less than 0 which will cause
    an Exception to be thrown in DKFZBiasFilter. We assume that the input
    VCF only contains SNPs, but no assertions are made to validate this.

    :param input_vcf: The input VCF file to filter.
    :param output_vcf: The output filtered VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("position_filter_dkfz")
    logger.info("Position Filter for DKFZ.")

    # Counters
    total = 0
    removed = 0
    written = 0

    # Reader/writer share the same header; records pass through untouched.
    vcf_in = pysam.VariantFile(input_vcf)
    out_mode = get_pysam_outmode(output_vcf)
    vcf_out = pysam.VariantFile(output_vcf, mode=out_mode, header=vcf_in.header)

    # Process
    try:
        for variant in vcf_in.fetch():
            total += 1
            # DKFZBiasFilter reads POS-2; drop records too close to the
            # start of the contig.
            if variant.pos - 2 < 0:
                removed += 1
            else:
                written += 1
                vcf_out.write(variant)
    finally:
        vcf_in.close()
        vcf_out.close()

    if out_mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Removed {}; Wrote {} ".format(
        total, removed, written))
def add_oxog_filters(input_vcf: str, input_dtoxog: str, output_vcf: str) -> None:
    """
    Adds 'oxog' filter tag to VCFs.

    Each input record is looked up by position in the dtoxog VCF; on a
    position + REF match the 'oxog' FILTER tag is added. Any '.' INFO
    keys are removed before writing.

    :param input_vcf: The full input VCF file to filter.
    :param input_dtoxog: The dtoxog VCF from dtoxog-maf-to-vcf used to
        annotate the full input VCF.
    :param output_vcf: The output filtered VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("add_oxog_filters")
    logger.info("Adds dtoxog filters to VCF.")

    # setup
    total = 0
    tagged = 0
    written = 0

    # Full vcf reader; declare the new FILTER tag in the header so tagged
    # records are valid against the output header.
    reader = pysam.VariantFile(input_vcf)
    filter_tag = "oxog"
    reader.header.filters.add(filter_tag, None, None, "Failed dToxoG")

    # Writer
    mode = get_pysam_outmode(output_vcf)
    writer = pysam.VariantFile(output_vcf, mode=mode, header=reader.header)

    # dtoxog reader — fetched by region per record, so it must be indexed.
    dtoxog_reader = pysam.VariantFile(input_dtoxog)

    # Process
    try:
        for record in reader.fetch():
            total += 1
            # Single-position region in 1-based inclusive form.
            region = "{0}:{1}-{2}".format(record.contig, record.pos,
                                          record.pos)
            try:
                for row in dtoxog_reader.fetch(region=region):
                    # Same position and (case-insensitive) REF allele means
                    # this record failed dToxoG.
                    if (record.pos == row.pos
                            and record.ref.upper() == row.ref.upper()):
                        record.filter.add("oxog")
                        tagged += 1
                        break
            except ValueError:
                # Contig absent from the dtoxog VCF/index; nothing to tag.
                pass

            # handle case where the INFO column is '.'
            # BUGFIX: collect keys before deleting — the original deleted
            # from record.info while iterating it, mutating the mapping
            # mid-iteration.
            bad_keys = [key for key in record.info if key == "."]
            for key in bad_keys:
                del record.info[key]

            written += 1
            writer.write(record)
    finally:
        reader.close()
        writer.close()
        dtoxog_reader.close()

    if mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Tagged {}; Wrote {} ".format(
        total, tagged, written))
def format_gdc_vcf(
    input_vcf: str,
    output_vcf: str,
    patient_barcode: str,
    case_id: str,
    tumor_barcode: str,
    tumor_aliquot_uuid: str,
    tumor_bam_uuid: str,
    normal_barcode: str,
    normal_aliquot_uuid: str,
    normal_bam_uuid: str,
    *,
    reference_name: str = "GRCh38.d1.vd1.fa",
) -> None:
    """
    Adds VCF header metadata specific to the GDC.

    Records pass through unchanged; only the header is rebuilt with the
    GDC sample/case annotations via ``build_header``.

    :param input_vcf: The input VCF file to format.
    :param output_vcf: The output formatted VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    :param patient_barcode: The case submitter id.
    :param case_id: The case uuid.
    :param tumor_barcode: The tumor aliquot submitter id.
    :param tumor_aliquot_uuid: The tumor aliquot uuid.
    :param tumor_bam_uuid: The tumor bam uuid.
    :param normal_barcode: The normal aliquot submitter id.
    :param normal_aliquot_uuid: The normal aliquot uuid.
    :param normal_bam_uuid: The normal bam uuid.
    :param reference_name: Reference name to use in header.
    """
    logger = Logger.get_logger("format_gdc_vcf")
    logger.info("Format GDC tumor/normal paired VCFs.")

    # Reader and output mode
    vcf_in = pysam.VariantFile(input_vcf)
    out_mode = get_pysam_outmode(output_vcf)

    # Build the GDC-annotated header from the input header + metadata.
    gdc_header = build_header(
        vcf_in,
        patient_barcode,
        case_id,
        tumor_barcode,
        tumor_aliquot_uuid,
        tumor_bam_uuid,
        normal_barcode,
        normal_aliquot_uuid,
        normal_bam_uuid,
        reference_name,
    )
    vcf_out = pysam.VariantFile(output_vcf, mode=out_mode, header=gdc_header)

    # Copy every record through unchanged.
    try:
        for variant in vcf_in.fetch():
            vcf_out.write(variant)
    finally:
        vcf_in.close()
        vcf_out.close()

    if out_mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)
def test_get_pysam_outmode(self):
    """get_pysam_outmode returns 'w' for plain VCF, 'wz' for bgzipped."""
    cases = [
        ("fake.vcf", "w"),
        ("fake.vcf.gz", "wz"),
    ]
    for fname, expected in cases:
        self.assertEqual(get_pysam_outmode(fname), expected)
def filter_somatic_score(
    input_vcf: str,
    output_vcf: str,
    *,
    tumor_sample_name: str = "TUMOR",
    drop_somatic_score: int = 25,
    min_somatic_score: int = 40,
) -> None:
    """
    Filters SomaticSniper VCF files based on the Somatic Score.

    Records with SSC below ``drop_somatic_score`` are removed; records
    with SSC between the two thresholds are kept but tagged with an
    ``ssc<min_somatic_score>`` FILTER.

    :param input_vcf: The input VCF file to filter.
    :param output_vcf: The output filtered VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    :param tumor_sample_name: The name of the tumor sample in the VCF.
    :param drop_somatic_score: If the somatic score is < this, remove it.
    :param min_somatic_score: If the somatic score is > drop_somatic_score
        and < this value, add ssc filter tag.
    """
    logger = Logger.get_logger("filter_somatic_score")
    logger.info("Filters SomaticSniper VCF files based on Somatic Score.")

    # Counters
    n_total = 0
    n_removed = 0
    n_tagged = 0
    n_written = 0

    # Reader; register the ssc FILTER on its header before creating writer.
    vcf_in = pysam.VariantFile(input_vcf)
    ssc_tag = "ssc{0}".format(min_somatic_score)
    logger.info("Filter tag: {}".format(ssc_tag))
    vcf_in.header.filters.add(ssc_tag, None, None,
                              "Somatic Score < {0}".format(min_somatic_score))

    out_mode = get_pysam_outmode(output_vcf)
    vcf_out = pysam.VariantFile(output_vcf, mode=out_mode, header=vcf_in.header)

    # Process
    try:
        for variant in vcf_in.fetch():
            n_total += 1
            score = variant.samples[tumor_sample_name]["SSC"]
            # Below the hard floor: drop entirely.
            if score < drop_somatic_score:
                n_removed += 1
                continue
            # Between the thresholds: keep, but tag.
            if score < min_somatic_score:
                n_tagged += 1
                variant.filter.add(ssc_tag)
            n_written += 1
            vcf_out.write(variant)
    finally:
        vcf_in.close()
        vcf_out.close()

    if out_mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info(
        "Processed {} records - Removed {}; Tagged {}; Wrote {} ".format(
            n_total, n_removed, n_tagged, n_written))