def getCleanningRules(variant_caller):
    """
    Return, for each INFO tag to fix, its header declaration and its value parser.

    :param variant_caller: The variant caller used to produce the VCF to fix.
    :type variant_caller: str
    :return: By INFO tag: the corrected header declaration (key "declaration") and the function used to clean the values of this tag in records (key "process").
    :rtype: dict
    :raises KeyError: If no rule is registered for the variant caller.
    """
    def split_by_strand(raw_value):
        # The raw value is one colon-separated string; each field becomes an int.
        return [int(field) for field in raw_value.split(":")]

    strand_depth_tags = (
        ("REFBIAS", "Reference depth by strand"),
        ("VARBIAS", "Variant depth by strand"),
    )
    rules_by_caller = {
        "vardict": {
            tag: {
                "declaration": HeaderInfoAttr(tag, description, type="Integer", number="2"),
                "process": split_by_strand,
            }
            for tag, description in strand_depth_tags
        }
    }
    return rules_by_caller[variant_caller]
def normAndMove(genome_path, in_variant_file, out_variant_file, trace_unstandard):
    """
    Write in a new file the normalized and left-shifted version of each variant.

    The normalization consists of three steps:
      1- Variants with multiple alternative alleles are split into one record per alternative allele.
      2- In each allele the empty allele marker is replaced by a dot, and the alternative and reference alleles are reduced to the minimal string (example: ATG/A becomes TG/. ; AAGC/ATAC becomes AG/TA).
      3- The allele is replaced by the most upstream allele that can have the same alternative sequence (example: a deletion in a homopolymer is moved to the first nucleotide of this homopolymer).

    :param genome_path: Path to the genome file (format: fasta).
    :type genome_path: str
    :param in_variant_file: Path to the variants file (format: VCF).
    :type in_variant_file: str
    :param out_variant_file: Path to the normalized variants file (format: VCF).
    :type out_variant_file: str
    :param trace_unstandard: True to keep the trace of the variant before standardization in INFO (tag UNSTD).
    :type trace_unstandard: bool
    """
    sequences = getSeqByChr(genome_path)
    with VCFIO(out_variant_file, "w") as FH_out, VCFIO(in_variant_file) as FH_in:
        # Header
        FH_out.copyHeader(FH_in)
        if trace_unstandard:
            unstd_description = "The variant id (chromosome:position=reference/alternative) before standardization."
            FH_out.info["UNSTD"] = HeaderInfoAttr(
                "UNSTD",
                type="String",
                number="1",
                description=unstd_description
            )
        FH_out.writeHeader()
        # Records
        for record in FH_in:
            chrom_seq = sequences[record.chrom]
            for alt_idx, _ in enumerate(record.alt):
                allele = getAlleleRecord(FH_in, record, alt_idx)
                if trace_unstandard:
                    # The trace is taken before the allele is moved upstream.
                    joined_alt = "/".join(allele.alt)
                    allele.info["UNSTD"] = "{}:{}={}/{}".format(
                        allele.chrom, allele.pos, allele.ref, joined_alt
                    )
                moved = allele.getMostUpstream(chrom_seq)
                FH_out.write(moved)
def normOnly(in_variant_file, out_variant_file, trace_unstandard):
    """
    Write in a new file the normalized version of each variant (without moving it).

    The normalization consists of two steps:
      1- Variants with multiple alternative alleles are split into one record per alternative allele.
      2- In each allele the empty allele marker is replaced by a dot, and the alternative and reference alleles are reduced to the minimal string (example: ATG/A becomes TG/. ; AAGC/ATAC becomes AG/TA).

    :param in_variant_file: Path to the variants file (format: VCF).
    :type in_variant_file: str
    :param out_variant_file: Path to the normalized variants file (format: VCF).
    :type out_variant_file: str
    :param trace_unstandard: True to keep the trace of the variant before standardization in INFO (tag UNSTD).
    :type trace_unstandard: bool
    """
    with VCFIO(out_variant_file, "w") as FH_out, VCFIO(in_variant_file) as FH_in:
        # Header
        FH_out.copyHeader(FH_in)
        if trace_unstandard:
            unstd_description = "The variant id (chromosome:position=reference/alternative) before standardization."
            FH_out.info["UNSTD"] = HeaderInfoAttr(
                "UNSTD",
                type="String",
                number="1",
                description=unstd_description
            )
        FH_out.writeHeader()
        # Records
        for record in FH_in:
            for alt_idx, _ in enumerate(record.alt):
                allele = getAlleleRecord(FH_in, record, alt_idx)
                if trace_unstandard:
                    # The trace is taken before the in-place normalization.
                    joined_alt = "/".join(allele.alt)
                    allele.info["UNSTD"] = "{}:{}={}/{}".format(
                        allele.chrom, allele.pos, allele.ref, joined_alt
                    )
                allele.normalizeSingleAllele()
                FH_out.write(allele)
def setUp(self):
    """
    Create the temporary reference (fasta and faidx) and the VCF used to test filterVCFHomopolym.py.

    The expected result for each variant is stored in its INFO field
    "is_filtered" (1 if the variant is adjacent to an homopolymer). The command
    line is prepared in self.cmd and the written records are kept in
    self.variants.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "filterVCFHomopolym.py",
        "--mode", "remove",
        "--homopolym-length", "4",
        "--input-variants", self.tmp_variants,
        "--input-reference", self.tmp_sequences,
        "--output-variants", self.tmp_output
    ]

    # Create fasta
    sequences = [
        ("artificial_chr1", "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"),
        ("artificial_chr2", "CGAATATGATCCAGCAATAAAAAGCTCCTACAGGCAAAAGTAGGCAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAA"),
        ("artificial_chr3", "CGAATATGATCCAGCAATGAAAATTCCTACAGGTAAAACGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"),
        ("artificial_chr4", "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCAAAAGGATATTCTCGACAAAACAGCAGAAAGTCAAG"),
        ("artificial_chr5", "CGAATATGATCCAGTAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"),
        ("artificial_chr6", "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGCACAACCTGTCTCTTGGAAAATCTCGACACAGCAGGTAAAACAATGCAGTAAAT"),
    ]
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        for seq_name, seq_string in sequences:
            FH_seq.write(Sequence(seq_name, seq_string))

    # Flanking regions of each variant (carried over from the original fixture
    # notes; values were not re-derived here):
    # Variant  before_start  before_end  before_seq  after_start  after_end  after_seq
    # alt_00   10            13          TCCA        15           18         CAAT
    # alt_01   20            23          AAAA        25           28         TTCC
    # alt_02   30            33          ACAG        35           38         AAAA
    # alt_03   40            43          AGTA        45           48         AAAG
    # alt_04   10            13          TCCA        16           19         AATA
    # alt_05   20            23          AAAA        26           29         TCCT
    # alt_06   30            33          ACAG        36           39         AAAA
    # alt_07   40            43          GTAG        46           49         AAAG
    # alt_08   11            14          CCAG        15           18         CAAT
    # alt_09   20            23          AAAA        24           27         TTCC
    # alt_10   31            34          AGGT        35           38         AAAA
    # alt_11   40            43          GTAG        44           47         AAAG
    # alt_12   11            14          CCAG        15           18         CAAT
    # alt_13   20            23          AAAA        24           27         GTTC
    # alt_14   31            34          CAGG        35           38         AAAA
    # alt_15   41            44          GTAG        45           48         AAAG
    # alt_16   50            53          GAAA        57           60         GTCA
    # alt_17   60            63          AAAA        67           70         TATT
    # alt_18   70            73          TCTC        77           80         AAAA
    # alt_19   80            83          ACAG        87           90         AAAG
    # alt_20   11            14          CCAG        16           19         AATA
    # alt_21   20            23          AAAA        25           28         TTCC
    # alt_22   31            34          CAGG        36           39         AAAA
    # alt_23   40            43          AGTA        45           48         AAAG
    # alt_24   11            14          CCAG        17           20         ATAA
    # alt_25   19            22          AAAA        26           29         TCCT
    # alt_26   29            32          TACA        35           38         AAAA
    # alt_27   38            41          AAAG        45           48         AAAG
    # alt_28   50            53          ACAA        61           64         CTTG
    # alt_29   66            69          AAAA        76           79         CACA
    # alt_30   76            79          CACA        86           89         AAAA
    # alt_31   88            91          AACA        99           102        AAAT

    # Create faidx
    # One record per line, fields separated by tabs. Explicit escapes are used
    # so the separators cannot be lost by whitespace normalization of this file.
    faidx_rows = [
        ("artificial_chr1", 89, 17, 89, 90),
        ("artificial_chr2", 89, 124, 89, 90),
        ("artificial_chr3", 88, 231, 88, 89),
        ("artificial_chr4", 95, 337, 95, 96),
        ("artificial_chr5", 89, 450, 89, 90),
        ("artificial_chr6", 102, 557, 102, 103),
    ]
    with open(self.tmp_faidx, "w") as FH_fai:
        FH_fai.write(
            "\n".join(
                "\t".join(str(field) for field in row) for row in faidx_rows
            )
        )

    # Create VCF
    # Each spec is (chrom, pos, id, ref, alt, expected is_filtered).
    variant_specs = [
        # Substit single nt
        ("artificial_chr1", 14, "alt_00", "G", "T", 0),  # Without adjacent homopolymers
        ("artificial_chr1", 24, "alt_01", "G", "T", 1),  # Adjacent homopolymers upstream
        ("artificial_chr1", 34, "alt_02", "G", "T", 1),  # Adjacent homopolymers downstream
        ("artificial_chr1", 44, "alt_03", "G", "T", 0),  # Adjacent too short homopolymers
        # Substit multi nt
        ("artificial_chr2", 14, "alt_04", "GC", "TA", 0),  # Without adjacent homopolymers
        ("artificial_chr2", 24, "alt_05", "GC", "TA", 1),  # Adjacent homopolymers upstream
        ("artificial_chr2", 34, "alt_06", "GC", "TA", 1),  # Adjacent homopolymers downstream
        ("artificial_chr2", 44, "alt_07", "GC", "TA", 0),  # Adjacent too short homopolymers
        # Ins single nt
        ("artificial_chr3", 14, "alt_08", "G", "GT", 0),  # Without adjacent homopolymers
        ("artificial_chr3", 23, "alt_09", "A", "AT", 1),  # Adjacent homopolymers upstream
        ("artificial_chr3", 34, "alt_10", "T", "TA", 1),  # Adjacent homopolymers downstream
        ("artificial_chr3", 43, "alt_11", "G", "GT", 0),  # Adjacent too short homopolymers
        # Ins multi nt
        ("artificial_chr4", 14, "alt_12", "G", "GTA", 0),  # Without adjacent homopolymers
        ("artificial_chr4", 23, "alt_13", "A", "ATA", 1),  # Adjacent homopolymers upstream
        ("artificial_chr4", 34, "alt_14", "G", "GTA", 1),  # Adjacent homopolymers downstream
        ("artificial_chr4", 44, "alt_15", "G", "GTC", 0),  # Adjacent too short homopolymer
        ("artificial_chr4", 54, "alt_16", "CCT", "ATCCAGA", 0),  # Without adjacent homopolymers
        ("artificial_chr4", 64, "alt_17", "GGA", "CTCCAGT", 1),  # Adjacent homopolymers upstream
        ("artificial_chr4", 74, "alt_18", "GAC", "ATCCAGT", 1),  # Adjacent homopolymers downstream
        ("artificial_chr4", 84, "alt_19", "CAG", "ATCCAGT", 0),  # Adjacent too short homopolymer
        # Del single nt
        ("artificial_chr5", 14, "alt_20", "GT", "G", 0),  # Without adjacent homopolymers
        ("artificial_chr5", 23, "alt_21", "AG", "A", 1),  # Adjacent homopolymers upstream
        ("artificial_chr5", 34, "alt_22", "GA", "G", 1),  # Adjacent homopolymers downstream
        ("artificial_chr5", 43, "alt_23", "AG", "A", 0),  # Adjacent too short homopolymers
        # Del multi nt
        ("artificial_chr6", 14, "alt_24", "GCA", "G", 0),  # Without adjacent homopolymers
        ("artificial_chr6", 23, "alt_25", "AGT", "C", 1),  # Adjacent homopolymers upstream
        ("artificial_chr6", 32, "alt_26", "AGG", "A", 1),  # Adjacent homopolymers downstream
        ("artificial_chr6", 42, "alt_27", "TAG", "C", 0),  # Adjacent too short homopolymer
        ("artificial_chr6", 54, "alt_28", "CCTGTCT", "GAA", 0),  # Without adjacent homopolymers
        ("artificial_chr6", 70, "alt_29", "TCTCGA", "CCC", 1),  # Adjacent homopolymers upstream
        ("artificial_chr6", 80, "alt_30", "GCAGGT", "CCC", 1),  # Adjacent homopolymers downstream
        ("artificial_chr6", 92, "alt_31", "ATGCAGT", "CCC", 0),  # Adjacent too short homopolymer
    ]
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "is_filtered": HeaderInfoAttr(
                "is_filtered",
                "1 if the variant is adjacent to an homopolymer.",
                type="Integer",
                number="1"
            )
        }
        FH_var.writeHeader()
        self.variants = [
            VCFRecord(chrom, pos, var_id, ref, [alt], None, None, {"is_filtered": expected})
            for chrom, pos, var_id, ref, alt, expected in variant_specs
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
def setUp(self):
    """
    Create the temporary selected-RNA list and the annotated VCF used by the tests.

    Each annotation carries its expected annotation-level filter in its own
    "expected_filter" field, and each variant carries its expected
    variant-level filters in the INFO field "expected_filter". The written
    records are kept in self.variants.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_selected_rna = os.path.join(tmp_folder, unique_id + "_rna.tsv")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Create RNA ref
    with open(self.tmp_selected_rna, "w") as FH_rna:
        FH_rna.write("#Gene\tTranscript\n")
        FH_rna.write("Gene_1\tENST_selected1\n")
        FH_rna.write("Gene_1\tENST_selected2\n")

    # Shortcuts for the values repeated in annotations
    selected = "ENST_selected1"
    missense = "missense_variant"
    synonymous = "synonymous_variant"
    eur_rare = "0.001&0.001"
    eur_mid = "0.01&0.01"
    eur_common = "0.05&0.05"
    colloc_popaf = "ANN.COLLOC&ANN.popAF"

    def annot(allele, consequence, feature, eur_af, gnomad_af, expected):
        # Build one annotation with the fields declared in ANN_titles.
        return {
            "Allele": allele,
            "Consequence": consequence,
            "Feature": feature,
            "EUR_AF": eur_af,
            "gnomAD_AF": gnomad_af,
            "expected_filter": expected
        }

    def variant(pos, var_id, ref, alt, annotations, expected):
        # Build one record; the ANN field is omitted when annotations is None.
        info = {}
        if annotations is not None:
            info["ANN"] = annotations
        info["expected_filter"] = expected
        return VCFRecord("artificial_chr1", pos, var_id, ref, [alt], None, None, info)

    # Create VCF
    with AnnotVCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.ANN_titles = [
            "Allele", "Consequence", "Feature", "EUR_AF", "gnomAD_AF", "expected_filter"
        ]
        # The format advertised in the description is derived from ANN_titles
        # so the two cannot disagree (the previous literal omitted EUR_AF).
        ann_description = "Consequence annotations from Ensembl VEP. Format: {}.".format(
            "|".join(FH_var.ANN_titles)
        )
        FH_var.info = {
            "ANN": HeaderInfoAttr("ANN", ann_description, type="String", number="."),
            "expected_filter": HeaderInfoAttr(
                "expected_filter", "The expected filters.", type="String", number="."
            )
        }
        FH_var.writeHeader()
        self.variants = [
            variant(14, "alt_00", "G", "T", [
                annot("T", missense, selected, eur_rare, 0.001, "PASS")
            ], ["PASS"]),
            variant(14, "alt_01", "G", "T", None, ["CSQ"]),
            variant(14, "alt_02", "G", "T", [
                annot("T", synonymous, selected, eur_rare, 0.001, "ANN.CSQ")
            ], ["CSQ"]),
            variant(14, "alt_03", "G", "T", [
                annot("T", missense, selected, eur_rare, 0.01, "ANN.popAF")
            ], ["popAF"]),
            variant(14, "alt_04", "G", "T", [
                annot("T", missense, "other", eur_rare, 0.001, "ANN.RNA")
            ], ["CSQ"]),
            variant(14, "alt_05", "G", "T", [
                annot("G", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ], ["CSQ"]),
            variant(14, "alt_06", "G", "T", [
                annot("T", missense, selected, eur_rare, 0.001, "PASS"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ], ["PASS"]),
            variant(14, "alt_07", "G", "T", [
                annot("T", missense, selected, eur_rare, 0.01, "ANN.popAF"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ], ["popAF"]),
            variant(14, "alt_08", "G", "T", [
                annot("T", synonymous, selected, eur_rare, 0.001, "ANN.CSQ"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ], ["CSQ"]),
            variant(14, "alt_09", "G", "T", [
                annot("T", missense, "other", eur_rare, 0.001, "ANN.RNA"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ], ["CSQ"]),
            variant(14, "alt_10", "G", "T", [
                annot("T", synonymous, "other", eur_rare, 0.01, "ANN.CSQ&ANN.RNA&ANN.popAF"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ], ["CSQ", "popAF"]),
            variant(14, "alt_11", "G", "T", [
                annot("T", synonymous, selected, eur_rare, 0.01, "ANN.CSQ&ANN.popAF"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ], ["CSQ", "popAF"]),
            variant(14, "alt_12", "G", "T", [
                annot("T", synonymous, selected, eur_rare, 0.01, "ANN.CSQ&ANN.popAF"),
                annot("T", missense, "other", eur_rare, 0.01, "ANN.RNA&ANN.popAF"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ], ["CSQ", "popAF"]),
            variant(14, "alt_13", "G", "T", [
                annot("T", synonymous, selected, eur_mid, 0.001, "ANN.CSQ&ANN.popAF"),
                annot("C", missense, selected, eur_common, 0.001, colloc_popaf)
            ], ["CSQ", "popAF"]),
            variant(14, "alt_14", "G", "GT", [
                annot("GT", missense, selected, eur_mid, 0.001, "ANN.popAF"),
                annot("C", missense, selected, eur_common, 0.001, colloc_popaf),
                annot("T", missense, selected, eur_common, 0.001, colloc_popaf)
            ], ["popAF"]),
            # NOTE(review): the id "alt_15" is used by the two next records; it
            # is kept as is because tests outside this block may rely on it.
            variant(15, "alt_15", "-", "T", [
                annot("GT", missense, selected, eur_mid, 0.001, colloc_popaf),
                annot("T", missense, selected, eur_mid, 0.001, "ANN.popAF"),
                annot("C", missense, selected, eur_common, 0.001, colloc_popaf)
            ], ["popAF"]),
            variant(14, "alt_15", "G", "-", [
                annot("-", missense, selected, eur_mid, 0.001, "ANN.popAF"),
                annot("G", missense, selected, eur_mid, 0.001, colloc_popaf),
                annot("C", missense, selected, eur_common, 0.001, colloc_popaf)
            ], ["popAF"]),
            variant(14, "alt_16", "GG", "G", [
                annot("-", missense, selected, eur_mid, 0.001, colloc_popaf),
                annot("G", missense, selected, eur_mid, 0.001, "ANN.popAF"),
                annot("C", missense, selected, eur_common, 0.001, colloc_popaf)
            ], ["popAF"])
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
def setUp(self):
    """
    Create the temporary filter definitions (JSON) and the annotated VCF used to test filterAnnotVCF.py.

    The expected result is stored in the field "is_filtered" of each variant
    (INFO) and of each annotation (ANN). The command line is prepared in
    self.cmd and the written records are kept in self.variants.
    """
    base_path = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))

    # Temporary files
    self.tmp_var_filters = base_path + "_varFilters.json"
    self.tmp_annot_filters = base_path + "_annFilters.json"
    self.tmp_variants = base_path + ".vcf"
    self.tmp_output = base_path + "_out.vcf"

    # Command
    self.cmd = [
        "filterAnnotVCF.py",
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]

    # Create filters
    with open(self.tmp_var_filters, "w") as FH_filter:
        FH_filter.write("""{
    "class": "FiltersCombiner",
    "operator": "or",
    "filters": [
        {
            "class": "Filter",
            "getter": "filter",
            "action": "select",
            "aggregator": "ratio:1",
            "operator": "!=",
            "values": "CSQ"
        }, {
            "class": "Filter",
            "getter": "chrom",
            "action": "select",
            "aggregator": "nb:1",
            "operator": "==",
            "values": "artificial_chr2"
        }
    ]
}""")
    with open(self.tmp_annot_filters, "w") as FH_filter:
        FH_filter.write("""{
    "class": "Filter",
    "getter": "FILTER",
    "action": "select",
    "aggregator": "ratio:1",
    "operator": "==",
    "values": "PASS"
}""")

    def annot(allele, annot_id, annot_filter, is_filtered):
        # Build one annotation with the fields declared in ANN_titles.
        return {
            "Allele": allele,
            "id": annot_id,
            "FILTER": annot_filter,
            "is_filtered": is_filtered
        }

    def variant(chrom, var_id, var_filter, is_filtered, annotations=None):
        # Build one G>T record at position 10; ANN is omitted when no annotation is given.
        info = {}
        if annotations is not None:
            info["ANN"] = annotations
        info["is_filtered"] = is_filtered
        return VCFRecord(chrom, 10, var_id, "G", ["T"], None, [var_filter], info)

    # Create VCF
    with AnnotVCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.ANN_titles = ["Allele", "id", "is_filtered", "FILTER"]
        FH_var.info = {
            "ANN": HeaderInfoAttr(
                "ANN",
                "Consequence annotations from Ensembl VEP. Format: Allele|id|is_filtered|FILTER.",
                type="String",
                number="."
            ),
            "is_filtered": HeaderInfoAttr(
                "is_filtered", "The expected result.", type="Integer", number="1"
            )
        }
        FH_var.writeHeader()
        self.variants = [
            variant("artificial_chr1", "alt_00", "PASS", 0),
            variant("artificial_chr1", "alt_01", "CSQ", 1),
            variant("artificial_chr2", "alt_02", "CSQ", 0),  # Protected
            variant("artificial_chr1", "alt_03", "PASS", 0, [
                annot("T", "ann_00", "PASS", 0)
            ]),
            variant("artificial_chr1", "alt_04", "PASS", 0, [
                annot("C", "ann_01", "ANN.COLLOC", 1)
            ]),
            variant("artificial_chr1", "alt_05", "CSQ", 1, [
                annot("C", "ann_02", "ANN.COLLOC", 1)
            ]),
            variant("artificial_chr1", "alt_06", "CSQ", 1, [
                annot("T", "ann_03", "PASS", 0)
            ]),
            variant("artificial_chr1", "alt_07", "PASS", 0, [
                annot("T", "ann_04", "PASS", 0),
                annot("C", "ann_05", "ANN.COLLOC", 1)
            ]),
            variant("artificial_chr1", "alt_08", "PASS", 0, [
                annot("T", "ann_06", "ANN.popAF", 1),
                annot("C", "ann_07", "ANN.COLLOC&ANN.popAF", 1)
            ]),
            variant("artificial_chr2", "alt_09", "CSQ", 0, [  # Protected
                annot("T", "ann_08", "ANN.popAF", 1),
                annot("C", "ann_09", "ANN.COLLOC&ANN.popAF", 1)
            ]),
            variant("artificial_chr2", "alt_10", "CSQ", 0, [  # Protected
                annot("T", "ann_10", "PASS", 0),
                annot("C", "ann_11", "ANN.COLLOC&ANN.popAF", 1)
            ])
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
def setUp(self):
    """
    Create the fake VCF handle, the temporary reference (fasta and faidx) and the records used by the merge tests.

    self.variant_1 and self.variant_2 are the records to merge and
    self.expected_merge is the expected result; their pos, ref and alt are
    left to None here.
    """
    # VCF
    info_declarations = {
        "AF": HeaderInfoAttr("AF", "Alternative alleles frequencies", "Float", "A")
    }
    format_declarations = {
        "AD": HeaderFormatAttr("AD", "Alternative alleles depths", "Integer", "A"),
        "DP": HeaderFormatAttr("DP", "total depth", "Integer", "1")
    }
    self.vcfio = FakeVCFIO(info_declarations, format_declarations)

    # Ref seq
    base_path = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))
    self.tmp_fasta_path = base_path + ".fa"
    self.tmp_faidx_path = base_path + ".fa.fai"
    self.ref_seq = "ACGCAAATCTCGGCATGCCGATT"  # 23 nt
    with open(self.tmp_fasta_path, "w") as FH_seq:
        FH_seq.write(">chr1\n{}".format(self.ref_seq))
    with open(self.tmp_faidx_path, "w") as FH_faidx:
        # The sequence starts at offset 6: just after the header line ">chr1\n".
        FH_faidx.write("chr1\t{}\t6\t60\t61".format(len(self.ref_seq)))

    # Variants
    # Positional arguments: chrom, pos, id, ref, alt, qual, filter, info, format, samples.
    self.variant_1 = VCFRecord(
        "chr1", None, "artificial_1", None, None,
        10,
        ["lowQual", "lowDP"],
        {"AF": [0.05]},
        ["DP", "AD"],
        {
            "splA": {"AD": [10], "DP": 100},
            "splB": {"AD": [40], "DP": 4900},
        }
    )
    self.variant_2 = VCFRecord(
        "chr1", None, None, None, None,
        30,
        ["PASS"],
        {"AF": [0.06]},
        ["DP", "AD"],
        {
            "splA": {"AD": [5], "DP": 50},
            "splB": {"AD": [31], "DP": 550},
        }
    )
    self.expected_merge = VCFRecord(
        "chr1", None, None, None, None,
        20,
        ["lowQual", "lowDP"],
        {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
        },
        ["DP", "AD"],
        {
            "splA": {"AD": [5], "DP": 50},
            "splB": {"AD": [31], "DP": 550},
        }
    )
def getNewHeaderAttr(args):
    """
    Return renamed and new VCFHeader elements for the merged VCF.

    FILTER tags not listed in args.shared_filters, INFO tags not shared between
    callers and all FORMAT tags are prefixed with "s<index of the input>_" and
    tagged with the corresponding calling source.

    :param args: The script's parameters (uses calling_sources, inputs_variants and shared_filters).
    :type args: NameSpace
    :return: VCFHeader elements (filter, info, format, samples).
    :rtype: dict
    :raises Exception: If the inputs do not contain the same samples.
    """
    unchanged_info = {"MATEID", "RNA_FIRST", "SVTYPE", "IMPRECISE"}
    source_codes = {
        name: "s" + str(idx) for idx, name in enumerate(args.calling_sources)
    }
    # Elements created by the merge
    final_filter = {}
    final_info = {
        "CIPOS": HeaderInfoAttr(
            "CIPOS", type="Integer", number="2",
            description="Confidence interval around POS"
        ),
        "IDSRC": HeaderInfoAttr(
            "IDSRC", type="String", number=".",
            description="ID of breakend by source"
        ),
        "REFSRC": HeaderInfoAttr(
            "REFSRC", type="String", number="1",
            description="Selected support data (SR, PR) come from this source"
        ),
        "SRC": HeaderInfoAttr(
            "SRC", type="String", number=".",
            description="Fusions callers where the breakend is identified. Possible values: {}".format(source_codes)
        )
    }
    final_format = {
        "SR": HeaderFormatAttr(
            "SR", type="Integer", number="1",
            description="Count of reads mapping on the fusion junction"
        ),
        "PR": HeaderFormatAttr(
            "PR", type="Integer", number="1",
            description="Count of pairs of reads supporting the fusion"
        ),
        "SRSRC": HeaderFormatAttr(
            "SRSRC", type="Integer", number=".",
            description="Count of reads mapping on the fusion junction by source"
        ),
        "PRSRC": HeaderFormatAttr(
            "PRSRC", type="Integer", number=".",
            description="Count of pairs of reads supporting the fusion by source"
        )
    }
    final_samples = None
    # Elements coming from each input
    for idx_in, curr_in in enumerate(args.inputs_variants):
        with VCFIO(curr_in) as FH_vcf:
            # Samples
            if final_samples is None:
                final_samples = FH_vcf.samples
            elif FH_vcf.samples != final_samples:
                raise Exception(
                    "The samples in VCF are not the same: {} in {} and {} in {}.".format(
                        final_samples, args.inputs_variants[0], FH_vcf.samples, curr_in
                    )
                )
            curr_source = args.calling_sources[idx_in]
            # FILTER
            for tag, data in FH_vcf.filter.items():
                if tag in args.shared_filters:
                    final_filter[tag] = data
                    continue
                # Rename filters not based on caller
                renamed = "s{}_{}".format(idx_in, tag)
                data.id = renamed
                data.source = curr_source
                final_filter[renamed] = data
            # INFO
            for tag, data in FH_vcf.info.items():
                if tag not in unchanged_info:
                    renamed = "s{}_{}".format(idx_in, tag)
                    data.id = renamed
                    data.source = curr_source
                    final_info[renamed] = data
                    continue
                # Manage merge between callers with 0 variants (and 0 annotations)
                # and callers with variants: the longest description is kept.
                known = final_info.get(tag)
                if known is None or len(known.description) < len(data.description):
                    final_info[tag] = data
            qual_tag = "s{}_VCQUAL".format(idx_in)
            final_info[qual_tag] = HeaderInfoAttr(
                qual_tag, type="Float", number="1",
                description="The variant quality",
                source=curr_source
            )
            # FORMAT
            for tag, data in FH_vcf.format.items():
                renamed = "s{}_{}".format(idx_in, tag)
                data.id = renamed
                data.source = curr_source
                final_format[renamed] = data
    return {
        "filter": final_filter,
        "info": final_info,
        "format": final_format,
        "samples": final_samples
    }
def setUp(self):
    """Create the temporary reference, targets and variants files and the filterVCFTargets.py command used by the tests."""
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
    self.tmp_regions = os.path.join(tmp_folder, unique_id + ".bed")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "filterVCFTargets.py",
        "--mode", "remove",
        "--input-variants", self.tmp_variants,
        "--input-targets", self.tmp_regions,
        "--input-reference", self.tmp_sequences,
        "--output-variants", self.tmp_output
    ]

    # Reference sequences
    # Repeats:             ****....              ...***
    # Region:        |----|        |------------|         |------|
    chr1_sequence = "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"
    #                123456789| | | | | | | | | | | | | | | | | |
    #                         10| 14| 18| 22| 26| 30| 34| 38| 42|
    #                           12  16  20  24  28  32  36  40  44
    chr2_sequence = "CGATNNNCGAT"
    #                123456789|
    #                         10

    # Create fasta
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        FH_seq.write(Sequence("artificial_chr1", chr1_sequence))
        FH_seq.write(Sequence("artificial_chr2", chr2_sequence))

    # Create faidx
    # NOTE(review): the whitespace of this literal is reproduced as seen; a faidx index is expected to be tab-delimited with one record per line - confirm against the original file
    with open(self.tmp_faidx, "w") as FH_fai:
        FH_fai.write("""artificial_chr1 45 17 45 46 artificial_chr2 11 80 11 12""")

    # Create targets
    targets = [(1, 6, "target_1"), (15, 28, "target_2"), (38, 45, "target_3")]
    with BEDIO(self.tmp_regions, "w", write_nb_col=4) as FH_bed:
        for start, end, name in targets:
            FH_bed.write(BEDRecord("artificial_chr1", start, end, name))

    # Variants: (pos, id, ref, alt, ID of the target stored in INFO)
    variants_desc = [
        # Substit single nt
        (14, "alt_00", "G", "T", None),  # Before target ; first nt before target
        (15, "alt_01", "G", "T", "target_2"),  # On target ; first nt of target
        (21, "alt_02", "C", "G", "target_2"),  # On target
        (28, "alt_03", "A", "G", "target_2"),  # On target ; last nt
        (29, "alt_04", "G", "C", None),  # After target ; first nt after target
        # Substit multi nt
        (7, "alt_05", "CATGTATG", "GTACCCGC", None),  # Before target ; first nt before target
        (11, "alt_06", "TATGTATG", "GTACCCGC", "target_2"),  # Overlap target start
        (13, "alt_07", "TGTATGTGCTCACAAAGTA", "CCCGCCCCTACATTGCAGT", "target_2"),  # Include target
        (15, "alt_08", "TATGTGCTCACAAA", "CGCCCCTACATTGC", "target_2"),  # Exact target
        (21, "alt_09", "CTCACAA", "GTACCCG", "target_2"),  # Included by target
        (24, "alt_10", "ACAAAGTA", "GTACCCG", "target_2"),  # Overlap target end
        (29, "alt_11", "GTAGTAGAT", "GTACCCGA", None),  # After target ; first nt after target
        # Ins single nt
        (14, "alt_12", "G", "GA", None),  # Before target ; first nt before target
        (15, "alt_12.2", "-", "A", None),  # Before target ; first nt before target
        (15, "alt_13", "A", "TG", "target_2"),  # On target ; first nt of target
        (21, "alt_14", "C", "CG", "target_2"),  # On target
        (27, "alt_15", "A", "AT", "target_2"),  # On target ; last nt
        (28, "alt_15.2", "-", "T", "target_2"),  # On target ; last nt
        (28, "alt_16", "A", "AT", None),  # After target ; first nt after target
        # Movable ins single nt
        (14, "alt_17", "G", "GT", "target_2"),  # Movable to first nt of target
        (28, "alt_18", "A", "AA", "target_2"),  # Movable to last nt of target
        # Del single nt
        (14, "alt_19", "G", "", None),  # Before target ; first nt before target
        (15, "alt_20", "T", "", "target_2"),  # On target ; first nt of target
        (21, "alt_21", "C", "", "target_2"),  # On target
        (28, "alt_22", "A", "", "target_2"),  # On target ; last nt
        (29, "alt_23", "G", "", None),  # After target ; first nt after target
        # Del multi nt
        (11, "alt_24", "TATG", "T", None),  # Before target ; first nt before target
        (13, "alt_25", "TGTA", "T", "target_2"),  # On target ; first nt of target
        (20, "alt_26", "GCTC", "G", "target_2"),  # On target
        (27, "alt_27", "AAGT", "A", "target_2"),  # On target ; last nt
        (28, "alt_28", "AGT", "A", None),  # After target ; first nt after target
        # Movable del multi nt
        (7, "alt_29", "CATGT", "C", "target_2"),  # On repeat and movable to first nt of target
        (12, "alt_30", "ATG", "A", "target_2"),  # Movable to first nt of target
        (28, "alt_31", "AGTA", "A", "target_2"),  # Movable to last nt of target
        (30, "alt_32", "TAGT", "T", "target_2"),  # On repeat and movable to last nt of target
    ]

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "target": HeaderInfoAttr("target", "The ID of the overlapped target.", type="String", number="1")
        }
        FH_var.writeHeader()
        self.variants = [
            VCFRecord("artificial_chr1", pos, name, ref, [alt], None, None, {"target": target})
            for pos, name, ref, alt, target in variants_desc
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
def setUp(self):
    """Create the temporary variants file and the filterVCFBySOR.py command used by the tests."""
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "filterVCFBySOR.py",
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]

    # Counts declared in header: (tag, description)
    strand_tags = [
        ("SAR", "Number of reads supporting the alternative allele in reverse strand."),
        ("SAF", "Number of reads supporting the alternative allele in forward strand."),
        ("SRR", "Number of reads supporting the reference allele in reverse strand."),
        ("SRF", "Number of reads supporting the reference allele in forward strand."),
    ]

    # Variants: (pos, id, SAR, SAF, SRR, SRF, expected filter)
    cases = [
        (10, "sub_01", 5, 5, 5, 5, "PASS"),  # 0.5 alt, 0.5 ref, low DP, alt no bias, ref no bias
        (20, "sub_02", 5, 5, 95, 95, "PASS"),  # 0.05 alt, 0.95 ref, good DP, alt no bias, ref no bias
        (30, "sub_03", 5, 5, 150, 30, "PASS"),  # 0.05 alt, 0.95 ref, good DP, alt no bias, ref strand bias
        (40, "sub_04", 9, 1, 95, 95, "strandRatioBias"),  # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref no bias
        (50, "sub_05", 9, 1, 150, 30, "PASS"),  # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref strand bias => no bias
        (60, "sub_06", 9, 1, 5, 5, "strandRatioBias"),  # 0.5 alt, 0.5 ref, low DP, alt strand bias, ref no bias
        (70, "sub_07", 400, 600, 1400, 1000, "PASS"),  # 0.29 alt, 0.71 ref, good DP, alt no bias, ref no bias
        (80, "sub_08", 1400, 1000, 400, 600, "PASS"),  # 0.71 alt, 0.29 ref, good DP, alt no bias, ref no bias
        (90, "sub_09", 1400, 1000, 0, 0, "PASS"),  # 1.0 alt, 0.0 ref, good DP, alt no bias, ref 0 DP
        (100, "sub_10", 1400, 1000, 0, 2, "PASS"),  # 1.0 alt, 0.0 ref, good DP, alt no bias, ref 2 DP
        (110, "sub_11", 90, 30, 0, 0, "PASS"),  # 1.0 alt, 0.0 ref, limit DP, alt no bias, ref 0 DP
        (120, "sub_12", 90, 30, 0, 2, "PASS"),  # 1.0 alt, 0.0 ref, limit DP, alt no bias, ref 2 DP
        (130, "sub_13", 90, 10, 0, 0, "strandRatioBias"),  # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 0 DP
        (140, "sub_14", 90, 10, 0, 2, "strandRatioBias"),  # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 2 DP
        (150, "sub_15", 90, 10, 1, 0, "PASS"),  # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 1 DP ; it can be discuss: 2.89
        (160, "sub_16", 15, 2, 200, 200, "strandRatioBias"),  # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref no bias
        (170, "sub_17", 13, 2, 200, 200, "strandRatioBias"),  # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref no bias ; SAR 12 => PASS
        (180, "sub_18", 13, 2, 350, 50, "PASS"),  # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref strand bias => no bias
        (190, "sub_19", 13, 2, 50, 350, "strandRatioBias"),  # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref strand bias rev => bias
        (200, "sub_20", 14, 2, 8, 8, "strandRatioBias"),  # 0.5 alt, 0.5 ref, low DP, alt strand bias, ref no bias
    ]

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        info_header = {
            "expected": HeaderInfoAttr("expected", "Expected filter tag.", type="String", number="1")
        }
        for tag, description in strand_tags:
            info_header[tag] = HeaderInfoAttr(tag, description, type="Integer", number="1")
        FH_var.info = info_header
        FH_var.writeHeader()
        self.variants = [
            VCFRecord(
                "artificial_chr1", pos, name, "G", ["T"], None, None,
                {"SAR": alt_rev, "SAF": alt_fwd, "SRR": ref_rev, "SRF": ref_fwd, "expected": expected}
            )
            for pos, name, alt_rev, alt_fwd, ref_rev, ref_fwd, expected in cases
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
def setUp(self):
    """Create the temporary reference and variants files and the standardizeVCF.py command used by the tests."""
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "standardizeVCF.py",
        "--trace-unstandard",
        "--input-reference", self.tmp_sequences,
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]

    # Reference sequences
    # Repeats:             ****....              ...***
    # Region:        |----|        |------------|         |------|
    chr1_sequence = "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"
    #                123456789| | | | | | | | | | | | | | | | | |
    #                         10| 14| 18| 22| 26| 30| 34| 38| 42|
    #                           12  16  20  24  28  32  36  40  44
    chr2_sequence = "CGATNNNCGAT"
    #                123456789|
    #                         10

    # Create fasta
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        FH_seq.write(Sequence("artificial_chr1", chr1_sequence))
        FH_seq.write(Sequence("artificial_chr2", chr2_sequence))

    # Create faidx
    # NOTE(review): the whitespace of this literal is reproduced as seen; a faidx index is expected to be tab-delimited with one record per line - confirm against the original file
    with open(self.tmp_faidx, "w") as FH_fai:
        FH_fai.write("""artificial_chr1 45 17 45 46 artificial_chr2 11 80 11 12""")

    # Variants: (pos, id, ref, alts, expected variants names, annotations, expected annotations)
    variants_desc = [
        # Substit single nt
        (14, "sub_01", "G", ["T"],
         ["artificial_chr1:14=G/T"],
         ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
         ["T|ann_1|0", "T|ann_2|0"]),
        (19, "sub_02", "T", ["A", "C"],
         ["artificial_chr1:19=T/A", "artificial_chr1:19=T/C"],
         ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
         ["A|ann_1|0", "A|ann_2|0"]),
        # Substit multi nt
        (7, "sub_03", "CATGTATG", ["GTACCCGC"],
         ["artificial_chr1:7=CATGTATG/GTACCCGC"],
         ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTGT|ann_3|"],
         ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0"]),
        (11, "sub_04", "TATGTATG", ["GTACCCGC", "GTACCCAA"],
         ["artificial_chr1:11=TATGTATG/GTACCCGC", "artificial_chr1:11=TATGTATG/GTACCCAA"],
         ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"],
         ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"]),
        # Insertion single nt
        (14, "ins_01", "G", ["GA"],
         ["artificial_chr1:14=G/GA"],
         ["GA|ann_1|0", "GA|ann_2|0", "GT|ann_3|"],
         ["GA|ann_1|0", "GA|ann_2|0"]),
        (20, "ins_02", "-", ["A"],
         ["artificial_chr1:19=T/TA"],
         ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
         ["TA|ann_1|0", "TA|ann_2|0"]),
        (14, "ins_03", "G", ["GA", "GC"],
         ["artificial_chr1:14=G/GA", "artificial_chr1:14=G/GC"],
         ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1", "GT|ann_4|"],
         ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1"]),
        (20, "ins_04", "-", ["A", "C"],
         ["artificial_chr1:19=T/TA", "artificial_chr1:19=T/TC"],
         ["A|ann_1|0", "A|ann_2|0", "C|ann_3|1", "T|ann_4|"],
         ["TA|ann_1|0", "TA|ann_2|0", "TC|ann_3|1"]),
        # Insertion multi nt
        (14, "ins_05", "G", ["GATGC"],
         ["artificial_chr1:14=G/GATGC"],
         ["GATGC|ann_1|0", "GATGC|ann_2|0", "GAAAC|ann_3|"],
         ["GATGC|ann_1|0", "GATGC|ann_2|0"]),
        (20, "ins_06", "-", ["AAATC"],
         ["artificial_chr1:19=T/TAAATC"],
         ["AAATC|ann_1|0", "AAATC|ann_2|0", "GAAAC|ann_3|"],
         ["TAAATC|ann_1|0", "TAAATC|ann_2|0"]),
        # Movable insertion multi nt
        (14, "ins_07", "G", ["GTG"],
         ["artificial_chr1:12=A/ATG"],
         ["GTG|ann_1|0", "GTG|ann_2|0", "GAAAC|ann_3|"],
         ["ATG|ann_1|0", "ATG|ann_2|0"]),
        (27, "ins_08", "A", ["AAAA"],
         ["artificial_chr1:25=C/CAAA"],
         ["AAAA|ann_1|0", "AAAA|ann_2|0", "CAAA|ann_3|"],
         ["CAAA|ann_1|0", "CAAA|ann_2|0"]),
        # Deletion single nt
        (14, "del_01", "G", [""],
         ["artificial_chr1:13=TG/T"],
         ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
         ["T|ann_1|0", "T|ann_2|0"]),
        (14, "del_02", "G", ["-"],
         ["artificial_chr1:13=TG/T"],
         ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
         ["T|ann_1|0", "T|ann_2|0"]),
        (13, "del_03", "TG", ["T"],
         ["artificial_chr1:13=TG/T"],
         ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
         ["T|ann_1|0", "T|ann_2|0"]),
        (13, "del_04", "TG", ["T", "-"],
         ["artificial_chr1:13=TG/T", "artificial_chr1:12=ATG/A"],
         ["T|ann_1|0", "T|ann_2|0", "-|ann_3|1"],
         ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1"]),
        # Movable deletion multi nt
        (11, "del_05", "TATG", ["T", "TA", "-"],
         ["artificial_chr1:11=TATG/T", "artificial_chr1:12=ATG/A", "artificial_chr1:7=CATGT/C"],
         ["T|ann_1|0", "T|ann_2|0", "TA|ann_3|1", "-|ann_4|2"],
         ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1", "C|ann_4|2"]),
    ]

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "expected": HeaderInfoAttr("expected", "Standardized version of {chrom}:{pos}={ref}/{alt}.", type="String", number="."),
            "ANN": HeaderInfoAttr("ANN", "Annotation of variants Format: Allele|Annotation_id|Alt_allele_idx", type="String", number="."),
            "expectedANN": HeaderInfoAttr("expectedANN", "Standardized version of annotations Format: Allele|Annotation_id|Alt_allele_idx", type="String", number=".")
        }
        FH_var.writeHeader()
        self.variants = [
            VCFRecord(
                "artificial_chr1", pos, name, ref, alts, None, None,
                {"expected": expected, "ANN": annotations, "expectedANN": expected_annotations}
            )
            for pos, name, ref, alts, expected, annotations, expected_annotations in variants_desc
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
# Logger
log = logging.getLogger(os.path.basename(__file__))
log.setLevel(logging.INFO)
command_line = " ".join(sys.argv)
log.info("Command: " + command_line)

# Load knowns
log.info("Load known partners from {}.".format(args.input_known_partners))
sources_by_symbols = sourcesBySymbols(args.input_known_partners)

# Annot variants
log.info("Annotate known fusions partners.")
known_partners_description = "Database containing the fusion of these gene. Format: 5primSymbol_@_3primSymbol=db1name:entryId,entryId|db2name:entryId (example: BCR_@_ABL1=cosmic_91:1743,1745|chimerdb_pub-V4:3427,3428)"
# The writer is opened before the reader, as in the nested form
with BreakendVCFIO(args.output_variants, "w", args.annotation_field) as writer, \
        BreakendVCFIO(args.input_variants, "r", args.annotation_field) as reader:
    # Header: copy of the input header plus the declaration of the new INFO tag
    writer.copyHeader(reader)
    writer.info["known_partners"] = HeaderInfoAttr(
        id="known_partners",
        type="String",
        number=".",
        description=known_partners_description
    )
    writer.writeHeader()
    # Records: the two breakends of each fusion are annotated then written together
    for first, second in reader:
        annotate(first, second, sources_by_symbols, args.annotation_field)
        writer.write(first, second)
log.info("End of job")
logging.basicConfig( format= '%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s' ) log = logging.getLogger(os.path.basename(__file__)) log.setLevel(logging.INFO) log.info("Command: " + " ".join(sys.argv)) # Process with IdxFastaIO(args.input_genome) as genome_reader: with BreakendVCFIO(args.input_variants) as reader: with BreakendVCFIO(args.output_variants, "w") as writer: writer.copyHeader(reader) writer.info["CIPOS"] = HeaderInfoAttr( "CIPOS", type="Integer", number="2", description="Confidence interval around POS") if args.trace_unstandard: writer.info["UNSTD"] = HeaderInfoAttr( "UNSTD", type="String", number="1", description= "Breakend id (chromosome:position=reference/alternative) before standardization" ) writer.writeHeader() for first, second in reader: if args.trace_unstandard: first.info["UNSTD"] = "{}:{}={}/{}".format( first.chrom, first.pos, first.ref,
def getNewHeaderAttr(args):
    """
    Return renamed and new VCFHeader elements for the merged VCF.

    The INFO tag named args.annotations_field and the filters listed in args.shared_filters keep their name. Every other FILTER, INFO and FORMAT tag is renamed "s<source index>_<tag>" and receives the name of its calling source.

    :param args: The script's parameters (annotations_field, calling_sources, inputs_variants and shared_filters are used).
    :type args: NameSpace
    :return: VCFHeader elements (filter, info, format, samples).
    :rtype: dict
    :raises Exception: When the inputs do not declare the same samples.
    """
    merged_filter = {}
    # Code of each source (s0, s1, ...) displayed in the description of SRC
    source_codes = {name: "s" + str(idx) for idx, name in enumerate(args.calling_sources)}
    merged_info = {
        "SRC": HeaderInfoAttr(
            "SRC",
            type="String",
            number=".",
            description="Variant callers where the variant is identified. Possible values: {}".format(source_codes)
        )
    }
    merged_format = {
        "AD": HeaderFormatAttr("AD", type="Integer", number="A", description="Allele Depth"),
        "DP": HeaderFormatAttr("DP", type="Integer", number="1", description="Total Depth"),
        "ADSRC": HeaderFormatAttr("ADSRC", type="Integer", number=".", description="Allele Depth by source"),
        "DPSRC": HeaderFormatAttr("DPSRC", type="Integer", number=".", description="Total Depth by source")
    }
    merged_samples = None
    for idx_in, curr_in in enumerate(args.inputs_variants):
        with VCFIO(curr_in) as FH_vcf:
            # Samples: all the inputs must declare the same list
            if merged_samples is not None and FH_vcf.samples != merged_samples:
                raise Exception(
                    "The samples in VCF are not the same: {} in {} and {} in {}."
                    .format(merged_samples, args.inputs_variants[0], FH_vcf.samples, curr_in))
            if merged_samples is None:
                merged_samples = FH_vcf.samples
            source_name = args.calling_sources[idx_in]
            # FILTER
            for tag, attr in FH_vcf.filter.items():
                if tag in args.shared_filters:  # Filters not based on caller keep their name
                    merged_filter[tag] = attr
                    continue
                renamed = "s{}_{}".format(idx_in, tag)
                attr.id = renamed
                attr.source = source_name
                merged_filter[renamed] = attr
            # INFO
            for tag, attr in FH_vcf.info.items():
                if tag != args.annotations_field:
                    renamed = "s{}_{}".format(idx_in, tag)
                    attr.id = renamed
                    attr.source = source_name
                    merged_info[renamed] = attr
                    continue
                # Manage merge between callers with 0 variants (and 0 annotations) and callers with variants: the longest description is kept
                if tag not in merged_info or len(merged_info[tag].description) < len(attr.description):
                    merged_info[tag] = attr
            qual_tag = "s{}_VCQUAL".format(idx_in)
            merged_info[qual_tag] = HeaderInfoAttr(
                qual_tag,
                type="Float",
                number="1",
                description="The variant quality",
                source=source_name)
            # FORMAT
            for tag, attr in FH_vcf.format.items():
                renamed = "s{}_{}".format(idx_in, tag)
                attr.id = renamed
                attr.source = source_name
                merged_format[renamed] = attr
    return {
        "filter": merged_filter,
        "info": merged_info,
        "format": merged_format,
        "samples": merged_samples
    }
vcaller_curr_AF = vcaller_AF[alt_idx + 1] record_allele.samples[curr_spl]["AF"] = [round(vcaller_curr_AF, args.AF_precision)] record_allele.samples[curr_spl]["AD"] = [int(vcaller_curr_AF * vcaller_DP)] record_allele.samples[curr_spl]["DP"] = vcaller_DP # Store allele allele_id = record_allele.getName() if allele_id not in variants: variants[allele_id] = record_allele else: variants[allele_id].samples[curr_spl] = record_allele.samples[curr_spl] # Completes and writes variants with VCFIO(args.output_variants, "w") as FH_out: # Header FH_out.copyHeader(FH_vcf) FH_out.info["AF"] = HeaderInfoAttr("AF", type="Float", number="A", description="The alleles frequencies for the group of samples.") FH_out.info["AD"] = HeaderInfoAttr("AD", type="Integer", number="A", description="The alleles depths for the group of samples.") FH_out.info["DP"] = HeaderInfoAttr("DP", type="Integer", number="1", description="Combined depth across samples.") FH_out.format["AF"] = HeaderFormatAttr("AF", type="Float", number="A", description="The alleles frequencies.") FH_out.format["AD"] = HeaderFormatAttr("AD", type="Integer", number="A", description="The alleles depths.") FH_out.format["DP"] = HeaderFormatAttr("DP", type="Integer", number="1", description="Depth.") FH_out.samples = [spl for spl in sorted(aln_by_samples)] FH_out.writeHeader() # Records for allele_id in variants: curr_var = variants[allele_id] # Add tag AF, AD and DP by sample if "AF" not in curr_var.format: curr_var.format.append("AF") if "AD" not in curr_var.format: curr_var.format.append("AD") if "DP" not in curr_var.format: curr_var.format.append("DP")
def setUp(self):
    """Create the fake VCF handle, the reference sequence and the records shared by the tests."""
    def samplesData(ad_spl_a, dp_spl_a, ad_spl_b, dp_spl_b):
        # New dictionaries are returned at each call: the records never share their samples data
        return {
            "splA": {"AD": [ad_spl_a], "DP": dp_spl_a},
            "splB": {"AD": [ad_spl_b], "DP": dp_spl_b},
        }

    info_header = {
        "AF": HeaderInfoAttr("AF", "Alternative alleles frequencies", "Float", "A")
    }
    format_header = {
        "AD": HeaderFormatAttr("AD", "Alternative alleles depths", "Integer", "A"),
        "DP": HeaderFormatAttr("DP", "total depth", "Integer", "1")
    }
    self.vcfio = FakeVCFIO(info_header, format_header)
    self.ref_seq = "ACGCAAATCTCGGCATGCCGATT"
    #               | | | | | |  |  |  |  |
    #               1 3 5 7 9 11 14 17 20 23
    # VCFRecord arguments: chrom, pos, id, ref, alt, qual, filter, info, format, samples
    self.variant_1 = VCFRecord(
        "chr1", None, "artificial_1", None, None,
        10,
        ["lowQual", "lowDP"],
        {"AF": [0.05]},
        ["DP", "AD"],
        samplesData(10, 100, 40, 4900)
    )
    self.variant_2 = VCFRecord(
        "chr1", None, None, None, None,
        30,
        ["PASS"],
        {"AF": [0.06]},
        ["DP", "AD"],
        samplesData(5, 50, 31, 550)
    )
    self.expected_merge = VCFRecord(
        "chr1", None, None, None, None,
        20,
        ["lowQual", "lowDP"],
        {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
        },
        ["DP", "AD"],
        samplesData(5, 50, 31, 550)
    )
# Annot variants
log.info("Annot variants in {}.".format(args.input_variants))
annot_pos_description = "Breakend position used in annotation. It take into account CIPOS to give priority to a breakend on exon boundaries."
# The writer is opened before the reader, as in the nested form
with BreakendVCFIO(args.output_variants, "w", args.annotation_field) as writer, \
        BreakendVCFIO(args.input_variants) as reader:
    # Header: copy of the input header plus the declaration of the annotations
    writer.copyHeader(reader)
    writer.ANN_titles = [
        "SYMBOL", "Gene", "Feature", "Feature_type", "Protein", "STRAND",
        "RNA_ELT_TYPE", "RNA_ELT_POS", "CDS_position", "Protein_position",
        "GENE_SHARD", "IN_FRAME"
    ]
    annot_description = "Consequence annotations. Format: " + "|".join(writer.ANN_titles)
    writer.info[args.annotation_field] = HeaderInfoAttr(
        id=args.annotation_field,
        type="String",
        number=".",
        description=annot_description
    )
    writer.info["ANNOT_POS"] = HeaderInfoAttr(
        id="ANNOT_POS",
        type="Integer",
        number="1",
        description=annot_pos_description
    )
    writer.writeHeader()
    # Records: the two breakends of each fusion are annotated then written together
    for first, second in reader:
        annot(first, second, genes_by_chr, args.annotation_field)
        writer.write(first, second)
log.info("End of job")
def testTagMultipleValues(self):
    """Check the filter when the strand counts are declared with one value by alternative allele (number="A")."""
    # Counts declared in header: (tag, description)
    strand_tags = [
        ("SAR", "Number of reads supporting the alternative allele in reverse strand."),
        ("SAF", "Number of reads supporting the alternative allele in forward strand."),
        ("SRR", "Number of reads supporting the reference allele in reverse strand."),
        ("SRF", "Number of reads supporting the reference allele in forward strand."),
    ]
    # Variants: (pos, id, SAR, SAF, SRR, SRF, expected filter)
    cases = [
        (10, "sub_01", 5, 5, 5, 5, "PASS"),  # 0.5 alt, 0.5 ref, low DP, alt no bias, ref no bias
        (40, "sub_04", 9, 1, 95, 95, "strandRatioBias"),  # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref no bias
    ]

    # Write test data
    with VCFIO(self.tmp_variants, "w") as FH_var:
        info_header = {
            "expected": HeaderInfoAttr("expected", "Expected filter tag.", type="String", number="1")
        }
        for tag, description in strand_tags:
            info_header[tag] = HeaderInfoAttr(tag, description, type="Integer", number="A")
        FH_var.info = info_header
        FH_var.writeHeader()
        self.variants = [
            VCFRecord(
                "artificial_chr1", pos, name, "G", ["T"], None, None,
                {"SAR": [alt_rev], "SAF": [alt_fwd], "SRR": [ref_rev], "SRF": [ref_fwd], "expected": expected}
            )
            for pos, name, alt_rev, alt_fwd, ref_rev, ref_fwd, expected in cases
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)

    # Execute command
    subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)

    # Validate results: one "id:filter" by alternative allele of each written record
    expected = [
        record.id + ":" + record.info["expected"]
        for record in self.variants
        for _ in record.alt
    ]
    with VCFIO(self.tmp_output) as FH_results:
        observed = [record.id + ":" + record.filter[0] for record in FH_results]
    self.assertEqual(expected, observed)
) log = logging.getLogger(os.path.basename(__file__)) log.setLevel(args.logging_level) log.info("Command: " + " ".join(sys.argv)) # Merge variants getIncludingReads = getIncludingReadsRNA if args.spliced_aln else getIncludingReadsDNA with IdxFastaIO(args.input_sequences) as FH_seq: with VCFIO(args.output_variants, "w") as FH_out: with pysam.AlignmentFile(args.input_aln, "rb") as FH_aln: with VCFIO(args.input_variants) as FH_vcf: # Header FH_out.copyHeader(FH_vcf) FH_out.info["MCO_VAR"] = HeaderInfoAttr( "MCO_VAR", "Name of the variants merged because their occur on same reads.", type="String", number=".") FH_out.info["MCO_QUAL"] = HeaderInfoAttr( "MCO_QUAL", "Qualities of the variants merged because their occur on same reads.", type="String", number=".") FH_out.info["MCO_IR"] = HeaderInfoAttr( "MCO_IR", "Co-occurancy rate between pairs of variants.", type="String", number=".") FH_out.info["MCO_IC"] = HeaderInfoAttr( "MCO_IC", "Co-occurancy count between pairs of variants.",
args = parser.parse_args() # Logger logging.basicConfig(format='%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s') log = logging.getLogger(os.path.basename(__file__)) log.setLevel(logging.INFO) log.info("Command: " + " ".join(sys.argv)) # Process nb_variants = 0 nb_filtered = 0 with VCFIO(args.input_variants) as handle_in: with VCFIO(args.output_variants, "w") as handle_out: # Header handle_out.copyHeader(handle_in) handle_out.info[args.SOR_tag] = HeaderInfoAttr(args.SOR_tag, "Strand bias estimated by the symmetric odds ratio test.", type="Float") handle_out.filter[args.bias_tag] = HeaderFilterAttr(args.bias_tag, "Strand ratio bias (estimated by the symmetric odds ratio test): substit SOR > {}, InDel SOR > {}.".format(args.substit_max_SOR, args.indel_max_SOR)) handle_out.writeHeader() # Records for record in handle_in: if len(record.alt) > 1: raise Exception("The multi-allelic variants cannot be processed: {}.".format(record.getName())) nb_variants += 1 is_filtered = False # Compute SOR record.info[args.SOR_tag] = strandOddRatio( record.info[args.ref_fwd_tag] if handle_in.info[args.ref_fwd_tag].number == "1" else record.info[args.ref_fwd_tag][0], record.info[args.ref_rev_tag] if handle_in.info[args.ref_rev_tag].number == "1" else record.info[args.ref_rev_tag][0], record.info[args.alt_fwd_tag] if handle_in.info[args.alt_fwd_tag].number == "1" else record.info[args.alt_fwd_tag][0], record.info[args.alt_rev_tag] if handle_in.info[args.alt_rev_tag].number == "1" else record.info[args.alt_rev_tag][0] )
def stdizeVCF(FH_ref, FH_in, FH_out, trace_unstandard=False, log=None):
    """
    Write the standardized version of each variant: one record by alternative allele, standardized with fastStandardize() and with the allele of its annotations updated.

    The annotations whose allele is not the processed alternative allele are removed from the record. The records coming from the same multi-allelic record are written sorted by start then end on the reference.

    :param FH_ref: File handle to the reference file (format: fasta with faidx).
    :type FH_ref: anacore.sequenceIO.IdxFastaIO
    :param FH_in: File handle to the variants file (format: VCF).
    :type FH_in: anacore.vcf.VCFIO
    :param FH_out: File handle to the standardized variants file (format: VCF).
    :type FH_out: anacore.vcf.VCFIO
    :param trace_unstandard: True if you want to keep the trace of the variant before standardization in INFO.
    :type trace_unstandard: bool
    :param log: Logger used.
    :type log: logging.Logger
    """
    nb_exact = 0  # Annotations kept: they concern the processed allele
    nb_collocated = 0  # Annotations removed: they concern another allele
    is_annotated = isinstance(FH_out, AnnotVCFIO)
    # Header
    FH_out.copyHeader(FH_in)
    if trace_unstandard:
        FH_out.info["UNSTD"] = HeaderInfoAttr(
            "UNSTD",
            type="String",
            number="1",
            description="The variant id (chromosome:position=reference/alternative) before standardization."
        )
    FH_out.writeHeader()
    # Records
    for record in FH_in:
        alleles = []
        for alt_idx, _ in enumerate(record.alt):
            allele = getAlleleRecord(FH_in, record, alt_idx)
            if trace_unstandard:
                allele.info["UNSTD"] = allele.getName()
            # The alternative allele before standardization is the one referenced by the annotations
            previous_alt = allele.alt[0]
            # Standardize pos, ref and alt
            allele.fastStandardize(FH_ref, 1000)
            # Update annotations
            if is_annotated and FH_in.annot_field in allele.info:
                kept_annot = []
                for annot in allele.info[FH_in.annot_field]:
                    if annot["Allele"] != previous_alt:
                        nb_collocated += 1
                        continue
                    nb_exact += 1
                    annot["Allele"] = allele.alt[0]
                    kept_annot.append(annot)
                allele.info[FH_in.annot_field] = kept_annot
            alleles.append(allele)
        # Sort splitted alleles; a lone record is written without evaluating its coordinates
        if len(alleles) != 1:
            alleles = sorted(alleles, key=lambda elt: (elt.refStart(), elt.refEnd()))
        for allele in alleles:
            FH_out.write(allele)
    if log is not None and nb_collocated != 0:
        log.warning(
            "{}/{} annotations have been deleted because they concern collocated variant."
            .format(nb_collocated, nb_exact + nb_collocated))
def setUp(self):
    """
    Create the test cases and write by caller the initial, haplotyped and expected VCF.

    The path templates (with ``{}`` as placeholder for the caller name), the
    test cases and the sorted list of callers are stored on the instance.
    """
    work_dir = tempfile.gettempdir()
    run_id = str(uuid.uuid1())
    self.tmp_initial_pathes = os.path.join(work_dir, run_id + "_{}_initial.vcf")
    self.tmp_haplotyped_pathes = os.path.join(work_dir, run_id + "_{}_haplotyped.vcf")
    self.tmp_expected_pathes = os.path.join(work_dir, run_id + "_{}_expected.vcf")
    self.tmp_out_pathes = os.path.join(work_dir, run_id + "_{}_out.vcf")

    # Test cases
    # NOTE(review): the comment opening each case presumably summarises the variants seen by each caller - confirm
    self.test_cases = [
        {  # *a-b, a-b, a b, /
            "initial": {
                "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller3": [
                    VCFRecord("chr1", 14, None, "G", ["C"], info={"AD": 100}),
                    VCFRecord("chr1", 18, None, "A", ["G"], info={"AD": 104})
                ]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller3": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr1:14=G/C", "chr1:18=A/G"], "AD": 100})]
            },
            "expected": {
                "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller3": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104})]
            }
        },
        {  # *a b, a b, a-b, /
            "initial": {
                "caller1": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"])
                ],
                "caller3": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"])]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr2:14=G/C", "chr2:18=A/G"]})],
                "caller2": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"], info={"MCO_VAR": ["chr2:14=G/C", "chr2:18=A/G"]})],
                "caller3": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"])
                ],
                "caller3": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"])
                ]
            }
        },
        {  # *a-b c, a-b c, a b c, /
            "initial": {
                "caller1": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr3", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr3", 20, None, "A", ["G"])
                ],
                "caller3": [
                    VCFRecord("chr3", 14, None, "G", ["C"], info={"AD": 104}),
                    VCFRecord("chr3", 18, None, "A", ["G"], info={"AD": 100}),
                    VCFRecord("chr3", 20, None, "A", ["G"], info={"AD": 98})
                ]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=GCGTA/CCGTG", "chr3:20=A/G"]})],
                "caller2": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=GCGTA/CCGTG", "chr3:20=A/G"]})],
                "caller3": [VCFRecord("chr3", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr3:14=G/C", "chr3:18=A/G", "chr3:20=A/G"], "AD": 98})]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr3", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr3", 20, None, "A", ["G"])
                ],
                "caller3": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104}),
                    VCFRecord("chr3", 20, None, "A", ["G"], info={"AD": 98})
                ]
            }
        },
        {  # *a-b c, a-b c, a b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"])
                ],
                "caller3": [
                    VCFRecord("chr4", 14, None, "G", ["C"], info={"AD": 98}),
                    VCFRecord("chr4", 18, None, "A", ["G"], info={"AD": 104}),
                    VCFRecord("chr4", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=GCGTA/CCGTG", "chr4:20=A/G"]})],
                "caller2": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=GCGTA/CCGTG", "chr4:20=A/G"]})],
                "caller3": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr4:14=G/C", "chr4:18=A/G", "chr4:20=A/G"], "AD": 98})],
                "caller4": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"])
                ],
                "caller3": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104}),
                    VCFRecord("chr4", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"])
                ]
            }
        },
        {  # *a-b c, a' a-b c, a b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr5", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 110}),
                    VCFRecord("chr5", 18, None, "A", ["G"], info={"AD": 105}),
                    VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=GCGTA/CCGTG", "chr5:20=A/G"]})],
                "caller2": [
                    VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=GCGTA/CCGTG", "chr5:20=A/G"], "AD": 100})
                ],
                "caller3": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr5:14=G/C", "chr5:18=A/G", "chr5:20=A/G"], "AD": 100})],
                "caller4": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr5", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                    VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr5", 20, None, "A", ["G"])
                ]
            }
        },
        {  # *a b c, a' a-b c, a-b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr6", 14, None, "G", ["C"]),
                    VCFRecord("chr6", 18, None, "A", ["G"]),
                    VCFRecord("chr6", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr6", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr6", 14, None, "GCGTA", ["CCGTG"], info={"AD": 105}),
                    VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 101})
                ],
                "caller4": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=G/C", "chr6:18=A/G", "chr6:20=A/G"]})],
                "caller2": [
                    VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=GCGTA/CCGTG", "chr6:20=A/G"], "AD": 100})
                ],
                "caller3": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr6:14=GCGTA/CCGTG", "chr6:20=A/G"], "AD": 101})],
                "caller4": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr6", 14, None, "G", ["C"]),
                    VCFRecord("chr6", 18, None, "A", ["G"]),
                    VCFRecord("chr6", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 100}),
                    VCFRecord("chr6", 18, None, "A", ["G"], info={"AD": 100}),
                    VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 105}),
                    VCFRecord("chr6", 18, None, "A", ["G"], info={"AD": 105}),
                    VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 101})
                ],
                "caller4": [
                    VCFRecord("chr6", 14, None, "G", ["C"]),
                    VCFRecord("chr6", 18, None, "A", ["G"]),
                    VCFRecord("chr6", 20, None, "A", ["G"])
                ]
            }
        },
        {  # *a b c, a-b b' c, a-b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr7", 14, None, "G", ["C"]),
                    VCFRecord("chr7", 18, None, "A", ["G"]),
                    VCFRecord("chr7", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr7", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr7", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr7", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr7", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr7", 20, None, "A", ["G"])
                ],
                "caller4": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=G/C", "chr7:18=A/G", "chr7:20=A/G"]})],
                "caller2": [
                    VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=GCGTA/CCGTG", "chr7:20=A/G"], "AD": 100}),
                    VCFRecord("chr7", 18, None, "G", ["C"], info={"AD": 3})
                ],
                "caller3": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr7:14=GCGTA/CCGTG", "chr7:20=A/G"]})],
                "caller4": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr7", 14, None, "G", ["C"]),
                    VCFRecord("chr7", 18, None, "A", ["G"]),
                    VCFRecord("chr7", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr7", 14, None, "G", ["C"], info={"AD": 100}),
                    VCFRecord("chr7", 18, None, "A", ["G"], info={"AD": 100}),
                    VCFRecord("chr7", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr7", 14, None, "G", ["C"]),
                    VCFRecord("chr7", 18, None, "A", ["G"]),
                    VCFRecord("chr7", 20, None, "A", ["G"])
                ],
                "caller4": [
                    VCFRecord("chr7", 14, None, "G", ["C"]),
                    VCFRecord("chr7", 18, None, "A", ["G"]),
                    VCFRecord("chr7", 20, None, "A", ["G"])
                ]
            }
        },
        {  # *a-b c, a-b b' c, a b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr8", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr8", 14, None, "G", ["C"], info={"AD": 110}),
                    VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 105}),
                    VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=GCGTA/CCGTG", "chr8:20=A/G"]})],
                "caller2": [
                    VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=GCGTA/CCGTG", "chr8:20=A/G"], "AD": 100}),
                    VCFRecord("chr8", 18, None, "G", ["C"], info={"AD": 3})
                ],
                "caller3": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr8:14=G/C", "chr8:18=A/G", "chr8:20=A/G"], "AD": 100})],
                "caller4": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr8", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                    VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr8", 20, None, "A", ["G"])
                ]
            }
        },
        {  # *a' a-b c, a-b b' c, a b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr9", 14, None, "G", ["C"]),
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr9", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr9", 14, None, "G", ["C"], info={"AD": 110}),
                    VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 105}),
                    VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "haplotyped": {
                "caller1": [
                    VCFRecord("chr9", 14, None, "G", ["C"]),
                    VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=GCGTA/CCGTG", "chr9:20=A/G"]})
                ],
                "caller2": [
                    VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=GCGTA/CCGTG", "chr9:20=A/G"], "AD": 100}),
                    VCFRecord("chr9", 18, None, "G", ["C"], info={"AD": 3})
                ],
                "caller3": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"], info={"MCO_VAR": ["chr9:14=G/C", "chr9:18=A/G", "chr9:20=A/G"], "AD": 100})],
                "caller4": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"])]
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr9", 14, None, "G", ["C"]),
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr9", 20, None, "A", ["G"])
                ],
                "caller2": [
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 104})
                ],
                "caller3": [
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                    VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 100})
                ],
                "caller4": [
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr9", 20, None, "A", ["G"])
                ]
            }
        }
    ]

    # Get callers: every caller used in the initial data of at least one test case
    self.callers = sorted({
        caller
        for case in self.test_cases
        for caller in case["initial"]
    })

    # Write files: for each caller, one VCF by step gathering the variants of all the test cases
    path_by_step = (
        ("initial", self.tmp_initial_pathes),
        ("haplotyped", self.tmp_haplotyped_pathes),
        ("expected", self.tmp_expected_pathes)
    )
    for caller in self.callers:
        for step, path_template in path_by_step:
            with VCFIO(path_template.format(caller), "w") as writer:
                header_info = {
                    "AD": HeaderInfoAttr("AD", "Alternative allele depth.", type="Integer", number="1")
                }
                if step == "haplotyped":
                    # Only the haplotyped files declare the tag listing the merged variants
                    header_info["MCO_VAR"] = HeaderInfoAttr("MCO_VAR", "Name of the variants merged because their occur on same reads.", type="String", number=".")
                writer.info = header_info
                writer.extra_header = ["##source={}".format(caller)]
                writer.writeHeader()
                for case in self.test_cases:
                    # A caller missing from a test case writes nothing for this case
                    for variant in case[step].get(caller, ()):
                        writer.write(variant)
def setUp(self):
    """
    Prepare the filterVCFPrimers.py command and write the sequences and variants used as inputs.

    The paths of the temporary files, the command and the list of variants
    written in the VCF are stored on the instance.
    """
    # Temporary files: they share the same unique prefix in the temporary folder
    path_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))
    self.tmp_sequences = path_prefix + ".fasta"
    self.tmp_regions = path_prefix + ".bed"
    self.tmp_variants = path_prefix + ".vcf"
    self.tmp_output = path_prefix + "_out.vcf"

    # Exec command
    self.cmd = [
        "filterVCFPrimers.py",
        "--input-variants", self.tmp_variants,
        "--input-regions", self.tmp_regions,
        "--input-sequences", self.tmp_sequences,
        "--output-variants", self.tmp_output
    ]

    # Create fasta
    with FastaIO(self.tmp_sequences, "w") as seq_writer:
        # 1-based positions: N=1-3, A=4-7, T=8-10, G=11-20, T=21-23, A=24-26, N=27-29
        seq_writer.write(Sequence("artificial_chr1", "NNNAAAATTTGGGGGGGGGGTTTAAANNN"))
        # 1-based positions: C=1, G=2, A=3, T=4, N=5-7, C=8, G=9, A=10, T=11
        seq_writer.write(Sequence("artificial_chr2", "CGATNNNCGAT"))

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as var_writer:
        var_writer.info = {"ZOI": HeaderInfoAttr("ZOI", "If the variant can be in interest area.", type="String", number="1")}
        var_writer.writeHeader()
        # The ZOI tag of each variant is "yes" or "no" (see the header description above)
        self.variants = [
            VCFRecord("artificial_chr1", 6, "alt_0", "A", ["AA"], None, None, {"ZOI": "no"}),
            VCFRecord("artificial_chr1", 8, "alt_1", "TT", ["T"], None, None, {"ZOI": "no"}),
            VCFRecord("artificial_chr1", 8, "alt_2", "T", ["TT"], None, None, {"ZOI": "yes"}),
            VCFRecord("artificial_chr1", 9, "alt_3", "TTGG", ["TT"], None, None, {"ZOI": "yes"}),
            VCFRecord("artificial_chr1", 14, "alt_4", "G", ["GG"], None, None, {"ZOI": "yes"}),
            VCFRecord("artificial_chr1", 18, "alt_5", "GGG", ["G"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
            VCFRecord("artificial_chr1", 22, "alt_6", "T", ["TT"], None, None, {"ZOI": "yes"}),
            VCFRecord("artificial_chr1", 9, "alt_7", "TT", ["TC"], None, None, {"ZOI": "no"}),  # Substitution before end of upstream primer
            VCFRecord("artificial_chr1", 10, "alt_8", "TG", ["TC"], None, None, {"ZOI": "yes"}),  # Substitution in upstream limit of ZOI
            VCFRecord("artificial_chr1", 15, "alt_9", "GG", ["GC"], None, None, {"ZOI": "yes"}),  # Substitution in downstream limit of ZOI
            VCFRecord("artificial_chr1", 20, "alt_10", "GT", ["GC"], None, None, {"ZOI": "no"}),  # Substitution after start of downstream primer
            VCFRecord("artificial_chr1", 21, "alt_11", "TT", ["TC"], None, None, {"ZOI": "no"}),  # Substitution in downstream primer
            VCFRecord("artificial_chr2", 1, "alt_12", "C", ["CTT"], None, None, {"ZOI": "no"}),  # Insertion before end of upstream primer
            VCFRecord("artificial_chr2", 2, "alt_13", "G", ["GCC"], None, None, {"ZOI": "yes"}),  # Insertion in upstream limit of ZOI
            VCFRecord("artificial_chr2", 3, "alt_14", "AT", ["CCGC"], None, None, {"ZOI": "yes"}),  # Insertion in upstream limit of ZOI and without standardization
            VCFRecord("artificial_chr2", 9, "alt_15", "G", ["GCC"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI
            VCFRecord("artificial_chr2", 9, "alt_16", "G", ["NNN"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI and without standardization
            VCFRecord("artificial_chr2", 10, "alt_17", "-", ["CC"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI
            VCFRecord("artificial_chr2", 10, "alt_18", "A", ["ATT"], None, None, {"ZOI": "no"}),  # Insertion after start of downstream primer
            VCFRecord("artificial_chr2", 1, "alt_19", "CG", ["C"], None, None, {"ZOI": "no"}),  # Deletion before end of upstream primer
            VCFRecord("artificial_chr2", 2, "alt_20", "GA", ["G"], None, None, {"ZOI": "yes"}),  # Deletion in upstream limit of ZOI
            VCFRecord("artificial_chr2", 3, "alt_21", "AT", ["C"], None, None, {"ZOI": "yes"}),  # Deletion in upstream limit of ZOI and without standardization
            VCFRecord("artificial_chr2", 6, "alt_22", "NNCG", ["N"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
            VCFRecord("artificial_chr2", 8, "alt_23", "CG", ["C"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
            VCFRecord("artificial_chr2", 8, "alt_24", "CG", ["T"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI and without standardization
            VCFRecord("artificial_chr2", 9, "alt_25", "GA", ["G"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
            VCFRecord("artificial_chr2", 10, "alt_26", "A", ["-"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
            VCFRecord("artificial_chr2", 10, "alt_27", "AT", ["A"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
        ]
        for variant in self.variants:
            var_writer.write(variant)