def setRefPos(variant, seq_handler, padding=200):
    """
    Add start and end attributes in VCFRecord. For insertions the start is defined on the first position before the insertion and the end on the last position affected by the insertion.

    The attributes upstream_start, upstream_end, downstream_start and downstream_end are set on the variant. For a normalized indel the downstream pair is computed on the most downstream placement found in the reference; for any other variant the downstream pair is equal to the upstream pair.

    :param variant: The variant to update.
    :type variant: anacore.vcf.VCFRecord
    :param seq_handler: The reference sequences accessor. Only its getSub(chrom, start, end) method is used.
    :type seq_handler: object with a getSub() method (presumably an indexed fasta reader ; to confirm against callers)
    :param padding: Number of reference positions retrieved after the variant to search the most downstream placement.
    :type padding: int
    """
    empty_marker = VCFRecord.getEmptyAlleleMarker()
    # Most upstream (the variant is used as provided)
    variant.upstream_start, variant.upstream_end = getStartEnd(variant)
    if variant.ref == empty_marker or variant.alt[0] == empty_marker:  # Normalized indel
        # Most downstream
        sub_region = seq_handler.getSub(
            variant.chrom,
            variant.pos - 2,
            variant.pos + len(variant.ref) + padding
        )
        chrom_pos = variant.pos
        # The sub-region starts at chrom_pos - 2: the variant is at position 3 in the sub-region
        variant.pos = 3  # Switch position from chromosome to position from subregion
        downstream_var = variant.getMostDownstream(sub_region)
        # Switch position from subregion to position from chromosome. The
        # conversion is computed from the downstream record itself: copying
        # variant.pos (as previously done) loses the downstream shift when
        # getMostDownstream() returns a new record.
        # NOTE(review): this stays correct if getMostDownstream() returns the
        # variant itself moved in place ; confirm which behavior applies.
        downstream_pos = chrom_pos + downstream_var.pos - 3
        variant.pos = chrom_pos
        downstream_var.pos = downstream_pos
        variant.downstream_start, variant.downstream_end = getStartEnd(downstream_var)
    else:
        variant.downstream_start = variant.upstream_start
        variant.downstream_end = variant.upstream_end
def testHasLowSupport(self):
    """Check hasLowSupport() on breakends with one sample, two samples and without support data."""
    def breakend(record_id, support=None):
        # Same chr1:140 -> chr1:199 breakend for every case; the PR/SR
        # format and samples are attached only when counts are provided.
        if support is None:
            return VCFRecord("chr1", 140, record_id, "A", ["A[chr1:199["])
        return VCFRecord(
            "chr1", 140, record_id, "A", ["A[chr1:199["],
            pFormat=["PR", "SR"],
            samples=support
        )

    # One sample, PR + SR = 12 with threshold 10
    record = breakend("id_01", {"splA": {"PR": 9, "SR": 3}})
    self.assertFalse(hasLowSupport(record, 10))

    # One sample, PR + SR = 9 with threshold 10
    record = breakend("id_02", {"splA": {"PR": 9, "SR": 0}})
    self.assertTrue(hasLowSupport(record, 10))

    # Two samples, counts add up to 12 with threshold 10
    record = breakend("id_03", {
        "splA": {"PR": 4, "SR": 2},
        "splB": {"PR": 3, "SR": 3},
    })
    self.assertFalse(hasLowSupport(record, 10))

    # Two samples, counts add up to 9 with threshold 10
    record = breakend("id_04", {
        "splA": {"PR": 1, "SR": 2},
        "splB": {"PR": 3, "SR": 3},
    })
    self.assertTrue(hasLowSupport(record, 10))

    # Record without format nor samples and a threshold of 0
    record = breakend("id_05")
    self.assertFalse(hasLowSupport(record, 0))  # No test
def getSupportingReads(var, chrom_seq, FH_aln, log):
    """
    Return read ID of reads supporting the alternative variant.

    A read is evaluated only if it is not flagged as duplicate and its alignment covers the whole interval from var.upstream_start to var.downstream_end. The alleles read on the alignment are compared to the variant alleles on the most upstream placement then, if they differ and the placements are distinct, on the most downstream placement.

    :param var: The variant. It must carry the attributes upstream_start, upstream_end, downstream_start and downstream_end.
    :type var: anacore.vcf.VCFRecord
    :param chrom_seq: The sequence of the chromosome.
    :type chrom_seq: str
    :param FH_aln: The file handle to the alignments file. The variants must have been defined from this alignments file.
    :type FH_aln: pysam.AlignmentFile
    :param log: The logger object.
    :type log: logging.Logger
    :return: The list of supporting reads IDs.
    :rtype: set
    """
    supporting_reads = set()
    is_insertion = var.isInsertion()
    # Variant alleles without the empty allele marker: they do not depend on
    # the read, so they are processed once before the loop.
    empty_marker = VCFRecord.getEmptyAlleleMarker()
    var_alt = var.alt[0].upper().replace(empty_marker, "")
    var_ref = var.ref.upper().replace(empty_marker, "")
    # Placements to test, in order: most upstream then most downstream (only
    # if it differs from the upstream one).
    placements = [(var.upstream_start, var.upstream_end)]
    if var.upstream_start != var.downstream_start:
        placements.append((var.downstream_start, var.downstream_end))
    for read in FH_aln.fetch(var.chrom, var.upstream_start - 1, var.downstream_end):
        if read.is_duplicate:
            continue
        reads_pos = read.get_reference_positions()
        if not reads_pos:  # Skip alignment with problem
            continue
        ref_start = reads_pos[0] + 1  # 0-based to 1-based
        ref_end = reads_pos[-1] + 1  # 0-based to 1-based
        if ref_start > var.upstream_start or ref_end < var.downstream_end:
            continue  # The read does not cover all the possible placements
        ref_aln, read_aln = getAlnCmp(read, chrom_seq[ref_start - 1:ref_end])
        for start, end in placements:
            ref, alt = getReadRefAlt(ref_aln, read_aln, ref_start, is_insertion, start, end)
            read_ref = "".join(ref)
            read_alt = "".join(alt)
            if read_alt.upper() == var_alt and read_ref.upper() == var_ref:
                # The alternative is present on the tested placement
                log.debug("{}\t{}/{}\t'{}'\t'{}'\t{}".format(
                    read.query_name,
                    var.ref,
                    var.alt[0],
                    read_ref,
                    read_alt,
                    read.cigarstring
                ))
                # Fragment is supporting if at least one of its reads is supporting
                supporting_reads.add(read.query_name)
                break
    return supporting_reads
def setUp(self):
    """Write the temporary selected RNA file and the annotated VCF used as inputs by the tests."""
    tmp_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))

    # Temporary files
    self.tmp_selected_rna = tmp_prefix + "_rna.tsv"
    self.tmp_variants = tmp_prefix + ".vcf"
    self.tmp_output = tmp_prefix + "_out.vcf"

    # Create RNA ref
    with open(self.tmp_selected_rna, "w") as FH_rna:
        FH_rna.write("#Gene\tTranscript\n")
        FH_rna.write("Gene_1\tENST_selected1\n")
        FH_rna.write("Gene_1\tENST_selected2\n")

    # Shorthands for the values repeated in the annotations
    missense = "missense_variant"
    synonymous = "synonymous_variant"
    selected = "ENST_selected1"
    eur_rare = "0.001&0.001"
    eur_1pct = "0.01&0.01"
    eur_5pct = "0.05&0.05"

    def annot(allele, consequence, feature, eur_af, gnomad_af, expected):
        # One ANN entry; a new dict is created on each call
        return {
            "Allele": allele,
            "Consequence": consequence,
            "Feature": feature,
            "EUR_AF": eur_af,
            "gnomAD_AF": gnomad_af,
            "expected_filter": expected
        }

    def record(pos, var_id, ref, alt, expected, annotations=None):
        # Variant on artificial_chr1; the ANN field exists only when annotations are provided
        info = {}
        if annotations is not None:
            info["ANN"] = annotations
        info["expected_filter"] = expected
        return VCFRecord("artificial_chr1", pos, var_id, ref, [alt], None, None, info)

    # Create VCF
    with AnnotVCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.ANN_titles = ["Allele", "Consequence", "Feature", "EUR_AF", "gnomAD_AF", "expected_filter"]
        # NOTE(review): the ANN description below does not list EUR_AF while ANN_titles does ; kept as is.
        FH_var.info = {
            "ANN": HeaderInfoAttr(
                "ANN",
                "Consequence annotations from Ensembl VEP. Format: Allele|Consequence|Feature|gnomAD_AF|expected_filter.",
                type="String",
                number="."
            ),
            "expected_filter": HeaderInfoAttr(
                "expected_filter",
                "The expected filters.",
                type="String",
                number="."
            )
        }
        FH_var.writeHeader()
        self.variants = [
            record(14, "alt_00", "G", "T", ["PASS"], [
                annot("T", missense, selected, eur_rare, 0.001, "PASS")
            ]),
            record(14, "alt_01", "G", "T", ["CSQ"]),
            record(14, "alt_02", "G", "T", ["CSQ"], [
                annot("T", synonymous, selected, eur_rare, 0.001, "ANN.CSQ")
            ]),
            record(14, "alt_03", "G", "T", ["popAF"], [
                annot("T", missense, selected, eur_rare, 0.01, "ANN.popAF")
            ]),
            record(14, "alt_04", "G", "T", ["CSQ"], [
                annot("T", missense, "other", eur_rare, 0.001, "ANN.RNA")
            ]),
            record(14, "alt_05", "G", "T", ["CSQ"], [
                annot("G", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ]),
            record(14, "alt_06", "G", "T", ["PASS"], [
                annot("T", missense, selected, eur_rare, 0.001, "PASS"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ]),
            record(14, "alt_07", "G", "T", ["popAF"], [
                annot("T", missense, selected, eur_rare, 0.01, "ANN.popAF"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ]),
            record(14, "alt_08", "G", "T", ["CSQ"], [
                annot("T", synonymous, selected, eur_rare, 0.001, "ANN.CSQ"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ]),
            record(14, "alt_09", "G", "T", ["CSQ"], [
                annot("T", missense, "other", eur_rare, 0.001, "ANN.RNA"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ]),
            record(14, "alt_10", "G", "T", ["CSQ", "popAF"], [
                annot("T", synonymous, "other", eur_rare, 0.01, "ANN.CSQ&ANN.RNA&ANN.popAF"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ]),
            record(14, "alt_11", "G", "T", ["CSQ", "popAF"], [
                annot("T", synonymous, selected, eur_rare, 0.01, "ANN.CSQ&ANN.popAF"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ]),
            record(14, "alt_12", "G", "T", ["CSQ", "popAF"], [
                annot("T", synonymous, selected, eur_rare, 0.01, "ANN.CSQ&ANN.popAF"),
                annot("T", missense, "other", eur_rare, 0.01, "ANN.RNA&ANN.popAF"),
                annot("C", missense, selected, eur_rare, 0.001, "ANN.COLLOC")
            ]),
            record(14, "alt_13", "G", "T", ["CSQ", "popAF"], [
                annot("T", synonymous, selected, eur_1pct, 0.001, "ANN.CSQ&ANN.popAF"),
                annot("C", missense, selected, eur_5pct, 0.001, "ANN.COLLOC&ANN.popAF")
            ]),
            record(14, "alt_14", "G", "GT", ["popAF"], [
                annot("GT", missense, selected, eur_1pct, 0.001, "ANN.popAF"),
                annot("C", missense, selected, eur_5pct, 0.001, "ANN.COLLOC&ANN.popAF"),
                annot("T", missense, selected, eur_5pct, 0.001, "ANN.COLLOC&ANN.popAF")
            ]),
            # NOTE(review): the ID "alt_15" is used by the two following records ; kept as is.
            record(15, "alt_15", "-", "T", ["popAF"], [
                annot("GT", missense, selected, eur_1pct, 0.001, "ANN.COLLOC&ANN.popAF"),
                annot("T", missense, selected, eur_1pct, 0.001, "ANN.popAF"),
                annot("C", missense, selected, eur_5pct, 0.001, "ANN.COLLOC&ANN.popAF")
            ]),
            record(14, "alt_15", "G", "-", ["popAF"], [
                annot("-", missense, selected, eur_1pct, 0.001, "ANN.popAF"),
                annot("G", missense, selected, eur_1pct, 0.001, "ANN.COLLOC&ANN.popAF"),
                annot("C", missense, selected, eur_5pct, 0.001, "ANN.COLLOC&ANN.popAF")
            ]),
            record(14, "alt_16", "GG", "G", ["popAF"], [
                annot("-", missense, selected, eur_1pct, 0.001, "ANN.COLLOC&ANN.popAF"),
                annot("G", missense, selected, eur_1pct, 0.001, "ANN.popAF"),
                annot("C", missense, selected, eur_5pct, 0.001, "ANN.COLLOC&ANN.popAF")
            ])
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
def setUp(self):
    """Write the temporary filters files and the annotated VCF used by the tests and prepare the command."""
    tmp_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))

    # Temporary files
    self.tmp_var_filters = tmp_prefix + "_varFilters.json"
    self.tmp_annot_filters = tmp_prefix + "_annFilters.json"
    self.tmp_variants = tmp_prefix + ".vcf"
    self.tmp_output = tmp_prefix + "_out.vcf"

    # Command
    self.cmd = [
        "filterAnnotVCF.py",
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]

    # Create filters
    with open(self.tmp_var_filters, "w") as FH_filter:
        FH_filter.write("""{
    "class": "FiltersCombiner",
    "operator": "or",
    "filters": [
        {
            "class": "Filter",
            "getter": "filter",
            "action": "select",
            "aggregator": "ratio:1",
            "operator": "!=",
            "values": "CSQ"
        }, {
            "class": "Filter",
            "getter": "chrom",
            "action": "select",
            "aggregator": "nb:1",
            "operator": "==",
            "values": "artificial_chr2"
        }
    ]
}""")
    with open(self.tmp_annot_filters, "w") as FH_filter:
        FH_filter.write("""{
    "class": "Filter",
    "getter": "FILTER",
    "action": "select",
    "aggregator": "ratio:1",
    "operator": "==",
    "values": "PASS"
}""")

    def annot(allele, annot_id, filter_tag, is_filtered):
        # One ANN entry; is_filtered is the expected result for this annotation
        return {
            "Allele": allele,
            "id": annot_id,
            "FILTER": filter_tag,
            "is_filtered": is_filtered
        }

    def record(chrom, var_id, filter_tag, is_filtered, annotations=None):
        # G/T variant at position 10; the ANN field exists only when annotations are provided
        info = {}
        if annotations is not None:
            info["ANN"] = annotations
        info["is_filtered"] = is_filtered
        return VCFRecord(chrom, 10, var_id, "G", ["T"], None, [filter_tag], info)

    # Create VCF
    with AnnotVCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.ANN_titles = ["Allele", "id", "is_filtered", "FILTER"]
        FH_var.info = {
            "ANN": HeaderInfoAttr(
                "ANN",
                "Consequence annotations from Ensembl VEP. Format: Allele|id|is_filtered|FILTER.",
                type="String",
                number="."
            ),
            "is_filtered": HeaderInfoAttr(
                "is_filtered",
                "The expected result.",
                type="Integer",
                number="1"
            )
        }
        FH_var.writeHeader()
        self.variants = [
            record("artificial_chr1", "alt_00", "PASS", 0),
            record("artificial_chr1", "alt_01", "CSQ", 1),
            record("artificial_chr2", "alt_02", "CSQ", 0),  # Protected
            record("artificial_chr1", "alt_03", "PASS", 0, [
                annot("T", "ann_00", "PASS", 0)
            ]),
            record("artificial_chr1", "alt_04", "PASS", 0, [
                annot("C", "ann_01", "ANN.COLLOC", 1)
            ]),
            record("artificial_chr1", "alt_05", "CSQ", 1, [
                annot("C", "ann_02", "ANN.COLLOC", 1)
            ]),
            record("artificial_chr1", "alt_06", "CSQ", 1, [
                annot("T", "ann_03", "PASS", 0)
            ]),
            record("artificial_chr1", "alt_07", "PASS", 0, [
                annot("T", "ann_04", "PASS", 0),
                annot("C", "ann_05", "ANN.COLLOC", 1)
            ]),
            record("artificial_chr1", "alt_08", "PASS", 0, [
                annot("T", "ann_06", "ANN.popAF", 1),
                annot("C", "ann_07", "ANN.COLLOC&ANN.popAF", 1)
            ]),
            record("artificial_chr2", "alt_09", "CSQ", 0, [  # Protected
                annot("T", "ann_08", "ANN.popAF", 1),
                annot("C", "ann_09", "ANN.COLLOC&ANN.popAF", 1)
            ]),
            record("artificial_chr2", "alt_10", "CSQ", 0, [  # Protected
                annot("T", "ann_10", "PASS", 0),
                annot("C", "ann_11", "ANN.COLLOC&ANN.popAF", 1)
            ])
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
def setUp(self):
    """Write the temporary reference, its index, the targets and the VCF used by the tests and prepare the command."""
    tmp_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))

    # Temporary files
    self.tmp_sequences = tmp_prefix + ".fasta"
    self.tmp_faidx = tmp_prefix + ".fasta.fai"
    self.tmp_regions = tmp_prefix + ".bed"
    self.tmp_variants = tmp_prefix + ".vcf"
    self.tmp_output = tmp_prefix + "_out.vcf"

    # Exec command
    self.cmd = [
        "filterVCFTargets.py",
        "--mode", "remove",
        "--input-variants", self.tmp_variants,
        "--input-targets", self.tmp_regions,
        "--input-reference", self.tmp_sequences,
        "--output-variants", self.tmp_output
    ]

    # Create fasta
    # Layout of artificial_chr1 (1-based positions):
    #   sequence: CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC
    #   repeats :        ****....            ...***
    #   targets : |----|        |------------|         |------|
    #   position: 1   5    10   15   20   25   30   35   40   45
    # Layout of artificial_chr2:
    #   sequence: CGATNNNCGAT
    #   position: 1   5    10
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        FH_seq.write(Sequence("artificial_chr1", "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"))
        FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))

    # Create faidx
    # NOTE(review): fields are written tab-separated with one line per sequence (faidx format) ; confirm against the original literal.
    with open(self.tmp_faidx, "w") as FH_fai:
        FH_fai.write("artificial_chr1\t45\t17\t45\t46\nartificial_chr2\t11\t80\t11\t12")

    # Create targets
    with BEDIO(self.tmp_regions, "w", write_nb_col=4) as FH_bed:
        for start, end, name in [(1, 6, "target_1"), (15, 28, "target_2"), (38, 45, "target_3")]:
            FH_bed.write(BEDRecord("artificial_chr1", start, end, name))

    # Variants description: position, ID, ref, alt and ID of the expected overlapped target
    on_target = "target_2"
    variants_desc = [
        # Substit single nt
        (14, "alt_00", "G", "T", None),  # Before target ; first nt before target
        (15, "alt_01", "G", "T", on_target),  # On target ; first nt of target
        (21, "alt_02", "C", "G", on_target),  # On target
        (28, "alt_03", "A", "G", on_target),  # On target ; last nt
        (29, "alt_04", "G", "C", None),  # After target ; first nt after target
        # Substit multi nt
        (7, "alt_05", "CATGTATG", "GTACCCGC", None),  # Before target ; first nt before target
        (11, "alt_06", "TATGTATG", "GTACCCGC", on_target),  # Overlap target start
        (13, "alt_07", "TGTATGTGCTCACAAAGTA", "CCCGCCCCTACATTGCAGT", on_target),  # Include target
        (15, "alt_08", "TATGTGCTCACAAA", "CGCCCCTACATTGC", on_target),  # Exact target
        (21, "alt_09", "CTCACAA", "GTACCCG", on_target),  # Included by target
        (24, "alt_10", "ACAAAGTA", "GTACCCG", on_target),  # Overlap target end
        (29, "alt_11", "GTAGTAGAT", "GTACCCGA", None),  # After target ; first nt after target
        # Ins single nt
        (14, "alt_12", "G", "GA", None),  # Before target ; first nt before target
        (15, "alt_12.2", "-", "A", None),  # Before target ; first nt before target
        (15, "alt_13", "A", "TG", on_target),  # On target ; first nt of target
        (21, "alt_14", "C", "CG", on_target),  # On target
        (27, "alt_15", "A", "AT", on_target),  # On target ; last nt
        (28, "alt_15.2", "-", "T", on_target),  # On target ; last nt
        (28, "alt_16", "A", "AT", None),  # After target ; first nt after target
        # Movable ins single nt
        (14, "alt_17", "G", "GT", on_target),  # Movable to first nt of target
        (28, "alt_18", "A", "AA", on_target),  # Movable to last nt of target
        # Del single nt
        (14, "alt_19", "G", "", None),  # Before target ; first nt before target
        (15, "alt_20", "T", "", on_target),  # On target ; first nt of target
        (21, "alt_21", "C", "", on_target),  # On target
        (28, "alt_22", "A", "", on_target),  # On target ; last nt
        (29, "alt_23", "G", "", None),  # After target ; first nt after target
        # Del multi nt
        (11, "alt_24", "TATG", "T", None),  # Before target ; first nt before target
        (13, "alt_25", "TGTA", "T", on_target),  # On target ; first nt of target
        (20, "alt_26", "GCTC", "G", on_target),  # On target
        (27, "alt_27", "AAGT", "A", on_target),  # On target ; last nt
        (28, "alt_28", "AGT", "A", None),  # After target ; first nt after target
        # Movable del multi nt
        (7, "alt_29", "CATGT", "C", on_target),  # On repeat and movable to first nt of target
        (12, "alt_30", "ATG", "A", on_target),  # Movable to first nt of target
        (28, "alt_31", "AGTA", "A", on_target),  # Movable to last nt of target
        (30, "alt_32", "TAGT", "T", on_target),  # On repeat and movable to last nt of target
    ]

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "target": HeaderInfoAttr(
                "target",
                "The ID of the overlapped target.",
                type="String",
                number="1"
            )
        }
        FH_var.writeHeader()
        self.variants = [
            VCFRecord("artificial_chr1", pos, var_id, ref, [alt], None, None, {"target": target})
            for pos, var_id, ref, alt, target in variants_desc
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
def setUp(self):
    """Write the temporary VCF with strand-specific supporting reads counts used by the tests and prepare the command."""
    tmp_prefix = os.path.join(tempfile.gettempdir(), str(uuid.uuid1()))

    # Temporary files
    self.tmp_variants = tmp_prefix + ".vcf"
    self.tmp_output = tmp_prefix + "_out.vcf"

    # Exec command
    self.cmd = [
        "filterVCFBySOR.py",
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]

    # Header: the expected tag then one count by allele and strand
    header_info = {
        "expected": HeaderInfoAttr("expected", "Expected filter tag.", type="String", number="1")
    }
    counts_desc = [
        ("SAR", "alternative", "reverse"),
        ("SAF", "alternative", "forward"),
        ("SRR", "reference", "reverse"),
        ("SRF", "reference", "forward")
    ]
    for tag, allele, strand in counts_desc:
        header_info[tag] = HeaderInfoAttr(
            tag,
            "Number of reads supporting the {} allele in {} strand.".format(allele, strand),
            type="Integer",
            number="1"
        )

    # Variants description: position, ID, SAR, SAF, SRR, SRF and expected filter tag
    biased = "strandRatioBias"
    variants_desc = [
        (10, "sub_01", 5, 5, 5, 5, "PASS"),  # 0.5 alt, 0.5 ref, low DP, alt no bias, ref no bias
        (20, "sub_02", 5, 5, 95, 95, "PASS"),  # 0.05 alt, 0.95 ref, good DP, alt no bias, ref no bias
        (30, "sub_03", 5, 5, 150, 30, "PASS"),  # 0.05 alt, 0.95 ref, good DP, alt no bias, ref strand bias
        (40, "sub_04", 9, 1, 95, 95, biased),  # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref no bias
        (50, "sub_05", 9, 1, 150, 30, "PASS"),  # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref strand bias => no bias
        (60, "sub_06", 9, 1, 5, 5, biased),  # 0.5 alt, 0.5 ref, low DP, alt strand bias, ref no bias
        (70, "sub_07", 400, 600, 1400, 1000, "PASS"),  # 0.29 alt, 0.71 ref, good DP, alt no bias, ref no bias
        (80, "sub_08", 1400, 1000, 400, 600, "PASS"),  # 0.71 alt, 0.29 ref, good DP, alt no bias, ref no bias
        (90, "sub_09", 1400, 1000, 0, 0, "PASS"),  # 1.0 alt, 0.0 ref, good DP, alt no bias, ref 0 DP
        (100, "sub_10", 1400, 1000, 0, 2, "PASS"),  # 1.0 alt, 0.0 ref, good DP, alt no bias, ref 2 DP
        (110, "sub_11", 90, 30, 0, 0, "PASS"),  # 1.0 alt, 0.0 ref, limit DP, alt no bias, ref 0 DP
        (120, "sub_12", 90, 30, 0, 2, "PASS"),  # 1.0 alt, 0.0 ref, limit DP, alt no bias, ref 2 DP
        (130, "sub_13", 90, 10, 0, 0, biased),  # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 0 DP
        (140, "sub_14", 90, 10, 0, 2, biased),  # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 2 DP
        (150, "sub_15", 90, 10, 1, 0, "PASS"),  # 1.0 alt, 0.0 ref, limit DP, alt strand bias, ref 1 DP. It can be discuss: 2.89
        (160, "sub_16", 15, 2, 200, 200, biased),  # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref no bias
        (170, "sub_17", 13, 2, 200, 200, biased),  # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref no bias. SAR 12 => PASS
        (180, "sub_18", 13, 2, 350, 50, "PASS"),  # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref strand bias => no bias
        (190, "sub_19", 13, 2, 50, 350, biased),  # 0.04 alt, 0.96 ref, good DP, alt strand bias, ref strand bias rev => bias
        (200, "sub_20", 14, 2, 8, 8, biased),  # 0.5 alt, 0.5 ref, low DP, alt strand bias, ref no bias
    ]

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = header_info
        FH_var.writeHeader()
        self.variants = [
            VCFRecord(
                "artificial_chr1", pos, var_id, "G", ["T"], None, None,
                {"SAR": sar, "SAF": saf, "SRR": srr, "SRF": srf, "expected": expected}
            )
            for pos, var_id, sar, saf, srr, srf, expected in variants_desc
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
def setUp(self):
    """Create the fake VCF handle, the reference sequence, two variants and the record expected from their merge."""
    info_header = {
        "AF": HeaderInfoAttr("AF", "Alternative alleles frequencies", "Float", "A")
    }
    format_header = {
        "AD": HeaderFormatAttr("AD", "Alternative alleles depths", "Integer", "A"),
        "DP": HeaderFormatAttr("DP", "total depth", "Integer", "1")
    }
    self.vcfio = FakeVCFIO(info_header, format_header)

    self.ref_seq = "ACGCAAATCTCGGCATGCCGATT"
    #               | | | | | |  |  |  |  |
    #               1 3 5 7 9 11 14 17 20 23

    # Position, ref and alt are left undefined in the three records
    self.variant_1 = VCFRecord(
        "chr1", None, "artificial_1", None, None,
        10,  # qual
        ["lowQual", "lowDP"],  # filter
        info={"AF": [0.05]},
        pFormat=["DP", "AD"],
        samples={
            "splA": {"AD": [10], "DP": 100},
            "splB": {"AD": [40], "DP": 4900},
        }
    )
    self.variant_2 = VCFRecord(
        "chr1", None, None, None, None,
        30,  # qual
        ["PASS"],  # filter
        info={"AF": [0.06]},
        pFormat=["DP", "AD"],
        samples={
            "splA": {"AD": [5], "DP": 50},
            "splB": {"AD": [31], "DP": 550},
        }
    )
    self.expected_merge = VCFRecord(
        "chr1", None, None, None, None,
        20,  # qual
        ["lowQual", "lowDP"],  # filter
        info={
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
        },
        pFormat=["DP", "AD"],
        samples={
            "splA": {"AD": [5], "DP": 50},
            "splB": {"AD": [31], "DP": 550},
        }
    )
def testIsHLA(self):
    """Check isHLA() on breakend pairs annotated with and without HLA-* gene symbols."""
    def annotations(symbols):
        # Every annotation of this test is on the forward strand
        return [{"SYMBOL": symbol, "STRAND": "+"} for symbol in symbols]

    def breakends(up_symbols, down_symbols):
        # Same chr1:110 -> chr1:200 pair for every case; only TESTANN changes
        first = VCFRecord(
            "chr1", 110, "id_01", "A", ["A[chr1:200["],
            info={
                "RNA_FIRST": True,
                "MATEID": "id_02",
                "TESTANN": annotations(up_symbols)
            }
        )
        second = VCFRecord(
            "chr1", 200, "id_02", "A", ["]chr1:110]A"],
            info={
                "MATEID": "id_01",
                "TESTANN": annotations(down_symbols)
            }
        )
        return first, second

    # Symbols on the first breakend, symbols on the second breakend, expected result
    cases = [
        (["HLA-DRB1", "HLA-DMB"], ["GENE_N02", "GENE_N04"], True),
        (["HLA-DRB1", "HLA-DMB"], [], True),
        ([], [], False),
        (["GENE_N01", "GENE_N04"], ["HLAN02", "GENE_N04"], False),
    ]
    for up_symbols, down_symbols, expected in cases:
        up, down = breakends(up_symbols, down_symbols)
        if expected:
            self.assertTrue(isHLA(up, down, "TESTANN"))
        else:
            self.assertFalse(isHLA(up, down, "TESTANN"))
def testInner(self):
    """Check isInner() on five breakend pairs differing by position, orientation and genes."""

    def gene(symbol, strand):
        return {"SYMBOL": symbol, "STRAND": strand}

    def first(pos, alt, annotations):
        # Breakend flagged RNA_FIRST; identifiers are constant across the cases.
        return VCFRecord(
            "chr1", pos, "id_01", "A", [alt],
            info={"RNA_FIRST": True, "MATEID": "id_02", "TESTANN": annotations}
        )

    def second(pos, alt, annotations):
        # Mate breakend.
        return VCFRecord(
            "chr1", pos, "id_02", "A", [alt],
            info={"MATEID": "id_01", "TESTANN": annotations}
        )

    def inner(up, down):
        # The comparison functions are rebuilt at every evaluation.
        return isInner(up, down, "TESTANN", annCmpNameFct(False), regCmpNameFct(False))

    # +/+ not inner (starts on limit)
    up = first(140, "A[chr1:299[", [gene("GENE_N01", "+"), gene("GENE_N04", "+")])
    down = second(299, "]chr1:140]A", [gene("GENE_N03", "+"), gene("GENE_N06", "-")])
    self.assertTrue(not inner(up, down))

    # +/+ inner gene 4 (starts on limit)
    up = first(140, "A[chr1:199[", [gene("GENE_N01", "+"), gene("GENE_N04", "+")])
    down = second(199, "]chr1:140]A", [gene("GENE_N02", "+"), gene("GENE_N04", "+")])
    self.assertTrue(inner(up, down))

    # -/- inner gene 6
    up = first(298, "]chr1:320]A", [gene("GENE_N06", "-")])
    down = second(320, "A[chr1:298[", [gene("GENE_N06", "-"), gene("GENE_N03", "+")])
    self.assertTrue(inner(up, down))

    # +/+ inner gene 6 => not valid strand
    up = first(298, "A[chr1:320[", [gene("GENE_N06", "-")])
    down = second(320, "]chr1:298]A", [gene("GENE_N06", "-"), gene("GENE_N03", "+")])
    self.assertTrue(not inner(up, down))

    # +/- inner gene 6
    up = first(298, "A[chr1:320[", [gene("GENE_N06", "-")])
    down = second(320, "A[chr1:298[", [gene("GENE_N06", "-"), gene("GENE_N03", "+")])
    self.assertTrue(inner(up, down))
def testInNormal(self):
    """Check inNormal() against known fusions given by gene identifier and by gene symbol."""

    def gene(number, strand):
        # Symbol and identifier share the same numeric suffix in this data set.
        return {
            "SYMBOL": "GENE_N" + number,
            "Gene": "GENE_ID" + number,
            "STRAND": strand
        }

    def breakend(pos, name, alt, mate, annotations, is_first):
        # RNA_FIRST is only present on the first breakend and always comes first.
        info = {"RNA_FIRST": True} if is_first else {}
        info["MATEID"] = mate
        info["ANN"] = annotations
        return VCFRecord("chr1", pos, name, "A", [alt], info=info)

    normal_fusions_id = {"GENE_ID01 GENE_ID02", "GENE_ID02 GENE_ID03"}
    normal_fusions_symbol = {"GENE_N01 GENE_N02", "GENE_N02 GENE_N03"}

    # Gene 1 -> gene 2: listed in both sets
    up = breakend(
        140, "id_01", "A[chr1:199[", "id_02",
        [gene("01", "+"), gene("04", "+")], True
    )
    down = breakend(
        199, "id_02", "]chr1:140]A", "id_01",
        [gene("02", "+")], False
    )
    self.assertTrue(inNormal(up, down, "ANN", normal_fusions_id, "id"))
    self.assertTrue(inNormal(up, down, "ANN", normal_fusions_symbol, "symbol"))

    # Genes 1/4 -> genes 3/6: no listed pair
    up = breakend(
        140, "id_01", "A[chr1:299[", "id_02",
        [gene("01", "+"), gene("04", "+")], True
    )
    down = breakend(
        299, "id_02", "]chr1:140]A", "id_01",
        [gene("03", "+"), gene("06", "-")], False
    )
    self.assertTrue(not inNormal(up, down, "ANN", normal_fusions_id, "id"))
    self.assertTrue(not inNormal(up, down, "ANN", normal_fusions_symbol, "symbol"))

    # Same loci with the roles swapped: the first breakend is now at position 299
    down = breakend(
        140, "id_01", "]chr1:299]A", "id_02",
        [gene("01", "+"), gene("04", "+")], False
    )
    up = breakend(
        299, "id_02", "A[chr1:140[", "id_01",
        [gene("03", "+"), gene("06", "-")], True
    )
    self.assertTrue(not inNormal(up, down, "ANN", normal_fusions_id, "id"))
    self.assertTrue(not inNormal(up, down, "ANN", normal_fusions_symbol, "symbol"))
def setUp(self):
    """Create the fake VCF handle, a temporary indexed FASTA and three record templates.

    The reference sequence is written to a uniquely named ``.fa`` file in the system
    temporary folder together with its ``.fa.fai`` index. The three records are built
    with ``None`` as position, reference allele and alternative alleles.
    """

    def samples(ad_a, dp_a, ad_b, dp_b):
        # A new dict is built at every call so that the records never share state.
        return {
            "splA": {"AD": [ad_a], "DP": dp_a},
            "splB": {"AD": [ad_b], "DP": dp_b},
        }

    # VCF
    info_header = {
        "AF": HeaderInfoAttr("AF", "Alternative alleles frequencies", "Float", "A")
    }
    format_header = {
        "AD": HeaderFormatAttr("AD", "Alternative alleles depths", "Integer", "A"),
        "DP": HeaderFormatAttr("DP", "total depth", "Integer", "1")
    }
    self.vcfio = FakeVCFIO(info_header, format_header)

    # Ref seq
    fasta_name = str(uuid.uuid1()) + ".fa"
    self.tmp_fasta_path = os.path.join(tempfile.gettempdir(), fasta_name)
    self.tmp_faidx_path = self.tmp_fasta_path + ".fai"
    self.ref_seq = "ACGCAAATCTCGGCATGCCGATT"
    #               | | | | | |  |  |  |  |
    #               1 3 5 7 9 11 14 17 20 23
    with open(self.tmp_fasta_path, "w") as FH_seq:
        FH_seq.write(">chr1\n{}".format(self.ref_seq))
    with open(self.tmp_faidx_path, "w") as FH_faidx:
        # Offset 6 is the length of the ">chr1\n" header line.
        FH_faidx.write("chr1\t{}\t6\t60\t61".format(len(self.ref_seq)))

    # Variants
    self.variant_1 = VCFRecord(
        "chr1",  # chrom
        None,  # pos
        "artificial_1",  # id
        None,  # ref
        None,  # alt
        10,  # qual
        ["lowQual", "lowDP"],  # filter
        info={"AF": [0.05]},
        pFormat=["DP", "AD"],
        samples=samples(10, 100, 40, 4900)
    )
    self.variant_2 = VCFRecord(
        "chr1",  # chrom
        None,  # pos
        None,  # id
        None,  # ref
        None,  # alt
        30,  # qual
        ["PASS"],  # filter
        info={"AF": [0.06]},
        pFormat=["DP", "AD"],
        samples=samples(5, 50, 31, 550)
    )
    self.expected_merge = VCFRecord(
        "chr1",  # chrom
        None,  # pos
        None,  # id
        None,  # ref
        None,  # alt
        20,  # qual
        ["lowQual", "lowDP"],  # filter
        info={
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
        },
        pFormat=["DP", "AD"],
        samples=samples(5, 50, 31, 550)
    )
class MergeCoOccurVar(unittest.TestCase):
    """Tests of mergedRecord() with the reference read from a temporary indexed FASTA."""

    def setUp(self):
        """Create the fake VCF handle, the temporary FASTA/faidx and the record templates."""

        def samples(ad_a, dp_a, ad_b, dp_b):
            # A new dict is built at every call so that the records never share state.
            return {
                "splA": {"AD": [ad_a], "DP": dp_a},
                "splB": {"AD": [ad_b], "DP": dp_b},
            }

        # VCF
        info_header = {
            "AF": HeaderInfoAttr("AF", "Alternative alleles frequencies", "Float", "A")
        }
        format_header = {
            "AD": HeaderFormatAttr("AD", "Alternative alleles depths", "Integer", "A"),
            "DP": HeaderFormatAttr("DP", "total depth", "Integer", "1")
        }
        self.vcfio = FakeVCFIO(info_header, format_header)

        # Ref seq
        fasta_name = str(uuid.uuid1()) + ".fa"
        self.tmp_fasta_path = os.path.join(tempfile.gettempdir(), fasta_name)
        self.tmp_faidx_path = self.tmp_fasta_path + ".fai"
        self.ref_seq = "ACGCAAATCTCGGCATGCCGATT"
        #               | | | | | |  |  |  |  |
        #               1 3 5 7 9 11 14 17 20 23
        with open(self.tmp_fasta_path, "w") as FH_seq:
            FH_seq.write(">chr1\n{}".format(self.ref_seq))
        with open(self.tmp_faidx_path, "w") as FH_faidx:
            # Offset 6 is the length of the ">chr1\n" header line.
            FH_faidx.write("chr1\t{}\t6\t60\t61".format(len(self.ref_seq)))

        # Variants: position and alleles are filled in by each test
        self.variant_1 = VCFRecord(
            "chr1",  # chrom
            None,  # pos
            "artificial_1",  # id
            None,  # ref
            None,  # alt
            10,  # qual
            ["lowQual", "lowDP"],  # filter
            info={"AF": [0.05]},
            pFormat=["DP", "AD"],
            samples=samples(10, 100, 40, 4900)
        )
        self.variant_2 = VCFRecord(
            "chr1",  # chrom
            None,  # pos
            None,  # id
            None,  # ref
            None,  # alt
            30,  # qual
            ["PASS"],  # filter
            info={"AF": [0.06]},
            pFormat=["DP", "AD"],
            samples=samples(5, 50, 31, 550)
        )
        self.expected_merge = VCFRecord(
            "chr1",  # chrom
            None,  # pos
            None,  # id
            None,  # ref
            None,  # alt
            20,  # qual
            ["lowQual", "lowDP"],  # filter
            info={
                "AF": [0.06],
                "MCO_QUAL": [10, 30],
                "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
            },
            pFormat=["DP", "AD"],
            samples=samples(5, 50, 31, 550)
        )

    def tearDown(self):
        """Delete the temporary FASTA and its index when they exist."""
        for tmp_path in (self.tmp_fasta_path, self.tmp_faidx_path):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    def _checkMerge(self, first, second, merged, sources):
        """Fill the records, merge variant_1 with variant_2 and compare with the expectation.

        :param first: Position, reference allele and alternative allele of variant_1.
        :type first: tuple
        :param second: Position, reference allele and alternative allele of variant_2.
        :type second: tuple
        :param merged: Position, reference allele and alternative allele of the expected record.
        :type merged: tuple
        :param sources: Value expected in the MCO_VAR field of the merged record.
        :type sources: list
        """
        definitions = (
            (self.variant_1, first),
            (self.variant_2, second),
            (self.expected_merge, merged)
        )
        for record, (pos, ref, alt) in definitions:
            record.pos = pos
            record.ref = ref
            record.alt = [alt]
        self.expected_merge.info = {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": sources
        }
        # Eval
        with IdxFastaIO(self.tmp_fasta_path) as FH_ref:
            observed_merge = mergedRecord(
                self.vcfio,
                self.variant_1, self.variant_1.getName(),
                self.variant_2, self.variant_2.getName(),
                FH_ref
            )
        self.assertEqual(strVariant(observed_merge), strVariant(self.expected_merge))

    def testMergedRecord_1_substit(self):
        """Single nucleotide substitutions at positions 5 and 20."""
        self._checkMerge(
            (5, "A", "T"),
            (20, "G", "C"),
            (5, "AAATCTCGGCATGCCG", "TAATCTCGGCATGCCC"),
            ["chr1:5=A/T", "chr1:20=G/C"]
        )

    def testMergedRecord_2_largeSubstit(self):
        """Multi-nucleotide substitutions separated by one reference nucleotide."""
        self._checkMerge(
            (5, "AAAT", "TGCA"),
            (10, "TC", "GG"),
            (5, "AAATCTC", "TGCACGG"),
            ["chr1:5=AAAT/TGCA", "chr1:10=TC/GG"]
        )

    def testMergedRecord_3_largeCloseSubstit(self):
        """Multi-nucleotide substitutions on contiguous positions."""
        self._checkMerge(
            (5, "AAAT", "TGCA"),
            (9, "CT", "GG"),
            (5, "AAATCT", "TGCAGG"),
            ["chr1:5=AAAT/TGCA", "chr1:9=CT/GG"]
        )

    def testMergedRecord_4_delIns(self):
        """Deletion at position 5 followed by an insertion at position 10."""
        self._checkMerge(
            (5, "AAAT", "-"),
            (10, "-", "GGCATCT"),
            (5, "AAATC", "CGGCATCT"),
            ["chr1:5=AAAT/-", "chr1:10=-/GGCATCT"]
        )

    def testMergedRecord_5_coDelIns(self):
        """Deletion at position 5 followed by an insertion at position 9."""
        self._checkMerge(
            (5, "AAAT", "-"),
            (9, "-", "AGG"),
            (5, "AAAT", "AGG"),
            ["chr1:5=AAAT/-", "chr1:9=-/AGG"]
        )

    def testMergedRecord_6_insDel(self):
        """Insertion at position 5 followed by a deletion at position 7."""
        self._checkMerge(
            (5, "-", "GTGTG"),
            (7, "ATC", "-"),
            (5, "AAATC", "GTGTGAA"),
            ["chr1:5=-/GTGTG", "chr1:7=ATC/-"]
        )

    def testMergedRecord_7_closeInsDel(self):
        """Insertion at position 5 followed by a deletion at position 6."""
        self._checkMerge(
            (5, "-", "GTGTG"),
            (6, "AA", "-"),
            (5, "AAA", "GTGTGA"),
            ["chr1:5=-/GTGTG", "chr1:6=AA/-"]
        )

    def testMergedRecord_8_coInsDel(self):
        """Insertion and deletion both located at position 5."""
        self._checkMerge(
            (5, "-", "GTGTG"),
            (5, "AA", "-"),
            (5, "AA", "GTGTG"),
            ["chr1:5=-/GTGTG", "chr1:5=AA/-"]
        )
'-o', '--output-variants', required=True, help='The path to the outputted file (format: VCF).') args = parser.parse_args() # Process curr_chrom = {"name": "", "seq": None} with VCFIO(args.output_variants, "w") as FH_out_vcf: with VCFIO(args.input_variants) as FH_in_vcf: # Header FH_out_vcf.copyHeader(FH_in_vcf) FH_out_vcf.writeHeader() # Records for record in FH_in_vcf: if record.ref == VCFRecord.getEmptyAlleleMarker() or any([ alt == VCFRecord.getEmptyAlleleMarker() for alt in record.alt ]): # record is a standardized in/del # Get previous nt if record.chrom != curr_chrom["name"]: curr_chrom["name"] = record.chrom curr_chrom["seq"] = getChromSeq( record.chrom, args.input_reference) prev_nt = curr_chrom["seq"][record.pos - 2] # Update record record.pos -= 1 if record.ref == VCFRecord.getEmptyAlleleMarker( ): # Insertion record.ref = prev_nt else: # Deletion
def setUp(self):
    """Write the FASTA, faidx and VCF fixtures used to run filterVCFHomopolym.py.

    Sets the paths of the temporary files (sequences, index, input variants and
    output), the command line stored in ``self.cmd`` and the list of written
    records in ``self.variants``. The ``is_filtered`` INFO value of each record is
    1 for the variants annotated as adjacent to an homopolymer and 0 otherwise.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "filterVCFHomopolym.py",
        "--mode", "remove",
        "--homopolym-length", "4",
        "--input-variants", self.tmp_variants,
        "--input-reference", self.tmp_sequences,
        "--output-variants", self.tmp_output
    ]

    # Create fasta (positions quoted in the comments below are 1-based)
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        FH_seq.write(
            Sequence(
                "artificial_chr1",
                "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"
            ))
        FH_seq.write(
            Sequence(
                "artificial_chr2",
                "CGAATATGATCCAGCAATAAAAAGCTCCTACAGGCAAAAGTAGGCAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAA"
            ))
        FH_seq.write(
            Sequence(
                "artificial_chr3",
                "CGAATATGATCCAGCAATGAAAATTCCTACAGGTAAAACGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"
            ))
        FH_seq.write(
            Sequence(
                "artificial_chr4",
                "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCAAAAGGATATTCTCGACAAAACAGCAGAAAGTCAAG"
            ))
        FH_seq.write(
            Sequence(
                "artificial_chr5",
                "CGAATATGATCCAGTAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"
            ))
        FH_seq.write(
            Sequence(
                "artificial_chr6",
                "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGCACAACCTGTCTCTTGGAAAATCTCGACACAGCAGGTAAAACAATGCAGTAAAT"
            ))

    # Flanking regions documented for each variant:
    # Variant  before_start  before_end  before_seq  after_start  after_end  after_seq
    # alt_00   10            13          TCCA        15           18         CAAT
    # alt_01   20            23          AAAA        25           28         TTCC
    # alt_02   30            33          ACAG        35           38         AAAA
    # alt_03   40            43          AGTA        45           48         AAAG
    # alt_04   10            13          TCCA        16           19         AATA
    # alt_05   20            23          AAAA        26           29         TCCT
    # alt_06   30            33          ACAG        36           39         AAAA
    # alt_07   40            43          GTAG        46           49         AAAG
    # alt_08   11            14          CCAG        15           18         CAAT
    # alt_09   20            23          AAAA        24           27         TTCC
    # alt_10   31            34          AGGT        35           38         AAAA
    # alt_11   40            43          GTAG        44           47         AAAG
    # alt_12   11            14          CCAG        15           18         CAAT
    # alt_13   20            23          AAAA        24           27         GTTC
    # alt_14   31            34          CAGG        35           38         AAAA
    # alt_15   41            44          GTAG        45           48         AAAG
    # alt_16   50            53          GAAA        57           60         GTCA
    # alt_17   60            63          AAAA        67           70         TATT
    # alt_18   70            73          TCTC        77           80         AAAA
    # alt_19   80            83          ACAG        87           90         AAAG
    # alt_20   11            14          CCAG        16           19         AATA
    # alt_21   20            23          AAAA        25           28         TTCC
    # alt_22   31            34          CAGG        36           39         AAAA
    # alt_23   40            43          AGTA        45           48         AAAG
    # alt_24   11            14          CCAG        17           20         ATAA
    # alt_25   19            22          AAAA        26           29         TCCT
    # alt_26   29            32          TACA        35           38         AAAA
    # alt_27   38            41          AAAG        45           48         AAAG
    # alt_28   50            53          ACAA        61           64         CTTG
    # alt_29   66            69          AAAA        76           79         CACA
    # alt_30   76            79          CACA        86           89         AAAA
    # alt_31   88            91          AACA        99           102        AAAT

    # Create faidx: one tab-separated line per sequence with name, length,
    # offset of the first base, bases per line and bytes per line
    with open(self.tmp_faidx, "w") as FH_fai:
        FH_fai.write(
            "artificial_chr1\t89\t17\t89\t90\n"
            "artificial_chr2\t89\t124\t89\t90\n"
            "artificial_chr3\t88\t231\t88\t89\n"
            "artificial_chr4\t95\t337\t95\t96\n"
            "artificial_chr5\t89\t450\t89\t90\n"
            "artificial_chr6\t102\t557\t102\t103"
        )

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "is_filtered": HeaderInfoAttr(
                "is_filtered",
                "1 if the variant is adjacent to an homopolymer.",
                type="Integer",
                number="1"
            )
        }
        FH_var.writeHeader()
        self.variants = [
            # Substit single nt
            VCFRecord("artificial_chr1", 14, "alt_00", "G", ["T"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr1", 24, "alt_01", "G", ["T"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr1", 34, "alt_02", "G", ["T"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr1", 44, "alt_03", "G", ["T"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymers
            # Substit multi nt
            VCFRecord("artificial_chr2", 14, "alt_04", "GC", ["TA"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr2", 24, "alt_05", "GC", ["TA"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr2", 34, "alt_06", "GC", ["TA"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr2", 44, "alt_07", "GC", ["TA"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymers
            # Ins single nt
            VCFRecord("artificial_chr3", 14, "alt_08", "G", ["GT"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr3", 23, "alt_09", "A", ["AT"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr3", 34, "alt_10", "T", ["TA"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr3", 43, "alt_11", "G", ["GT"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymers
            # Ins multi nt
            VCFRecord("artificial_chr4", 14, "alt_12", "G", ["GTA"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr4", 23, "alt_13", "A", ["ATA"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr4", 34, "alt_14", "G", ["GTA"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr4", 44, "alt_15", "G", ["GTC"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymer
            VCFRecord("artificial_chr4", 54, "alt_16", "CCT", ["ATCCAGA"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr4", 64, "alt_17", "GGA", ["CTCCAGT"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr4", 74, "alt_18", "GAC", ["ATCCAGT"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr4", 84, "alt_19", "CAG", ["ATCCAGT"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymer
            # Del single nt
            VCFRecord("artificial_chr5", 14, "alt_20", "GT", ["G"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr5", 23, "alt_21", "AG", ["A"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr5", 34, "alt_22", "GA", ["G"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr5", 43, "alt_23", "AG", ["A"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymers
            # Del multi nt
            VCFRecord("artificial_chr6", 14, "alt_24", "GCA", ["G"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr6", 23, "alt_25", "AGT", ["C"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr6", 32, "alt_26", "AGG", ["A"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr6", 42, "alt_27", "TAG", ["C"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymer
            VCFRecord("artificial_chr6", 54, "alt_28", "CCTGTCT", ["GAA"], None, None, {"is_filtered": 0}),  # Without adjacent homopolymers
            VCFRecord("artificial_chr6", 70, "alt_29", "TCTCGA", ["CCC"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers upstream
            VCFRecord("artificial_chr6", 80, "alt_30", "GCAGGT", ["CCC"], None, None, {"is_filtered": 1}),  # Adjacent homopolymers downstream
            VCFRecord("artificial_chr6", 92, "alt_31", "ATGCAGT", ["CCC"], None, None, {"is_filtered": 0}),  # Adjacent too short homopolymer
        ]
        # The records are written in list order; no index is needed.
        for curr_var in self.variants:
            FH_var.write(curr_var)
class MergeCoOccurVar(unittest.TestCase):
    """Tests of mergedRecord() with the reference given as an in-memory sequence."""

    def setUp(self):
        """Create the fake VCF handle, the reference sequence and the record templates."""

        def samples(ad_a, dp_a, ad_b, dp_b):
            # A new dict is built at every call so that the records never share state.
            return {
                "splA": {"AD": [ad_a], "DP": dp_a},
                "splB": {"AD": [ad_b], "DP": dp_b},
            }

        # Header declarations exposed by the fake VCF handle
        info_header = {
            "AF": HeaderInfoAttr("AF", "Alternative alleles frequencies", "Float", "A")
        }
        format_header = {
            "AD": HeaderFormatAttr("AD", "Alternative alleles depths", "Integer", "A"),
            "DP": HeaderFormatAttr("DP", "total depth", "Integer", "1")
        }
        self.vcfio = FakeVCFIO(info_header, format_header)

        # Reference sequence with a 1-based ruler
        self.ref_seq = "ACGCAAATCTCGGCATGCCGATT"
        #               | | | | | |  |  |  |  |
        #               1 3 5 7 9 11 14 17 20 23

        # Records: position and alleles are filled in by each test
        self.variant_1 = VCFRecord(
            "chr1",  # chrom
            None,  # pos
            "artificial_1",  # id
            None,  # ref
            None,  # alt
            10,  # qual
            ["lowQual", "lowDP"],  # filter
            info={"AF": [0.05]},
            pFormat=["DP", "AD"],
            samples=samples(10, 100, 40, 4900)
        )
        self.variant_2 = VCFRecord(
            "chr1",  # chrom
            None,  # pos
            None,  # id
            None,  # ref
            None,  # alt
            30,  # qual
            ["PASS"],  # filter
            info={"AF": [0.06]},
            pFormat=["DP", "AD"],
            samples=samples(5, 50, 31, 550)
        )
        self.expected_merge = VCFRecord(
            "chr1",  # chrom
            None,  # pos
            None,  # id
            None,  # ref
            None,  # alt
            20,  # qual
            ["lowQual", "lowDP"],  # filter
            info={
                "AF": [0.06],
                "MCO_QUAL": [10, 30],
                "MCO_VAR": ["chr1:5=A/T", "chr1:20=G/C"]
            },
            pFormat=["DP", "AD"],
            samples=samples(5, 50, 31, 550)
        )

    def _checkMerge(self, first, second, merged, sources):
        """Fill the records, merge variant_1 with variant_2 and compare with the expectation.

        :param first: Position, reference allele and alternative allele of variant_1.
        :type first: tuple
        :param second: Position, reference allele and alternative allele of variant_2.
        :type second: tuple
        :param merged: Position, reference allele and alternative allele of the expected record.
        :type merged: tuple
        :param sources: Value expected in the MCO_VAR field of the merged record.
        :type sources: list
        """
        definitions = (
            (self.variant_1, first),
            (self.variant_2, second),
            (self.expected_merge, merged)
        )
        for record, (pos, ref, alt) in definitions:
            record.pos = pos
            record.ref = ref
            record.alt = [alt]
        self.expected_merge.info = {
            "AF": [0.06],
            "MCO_QUAL": [10, 30],
            "MCO_VAR": sources
        }
        # Eval
        observed_merge = mergedRecord(
            self.vcfio,
            self.variant_1, self.variant_1.getName(),
            self.variant_2, self.variant_2.getName(),
            self.ref_seq
        )
        self.assertEqual(strVariant(observed_merge), strVariant(self.expected_merge))

    def testMergedRecord_1_substit(self):
        """Single nucleotide substitutions at positions 5 and 20."""
        self._checkMerge(
            (5, "A", "T"),
            (20, "G", "C"),
            (5, "AAATCTCGGCATGCCG", "TAATCTCGGCATGCCC"),
            ["chr1:5=A/T", "chr1:20=G/C"]
        )

    def testMergedRecord_2_largeSubstit(self):
        """Multi-nucleotide substitutions separated by one reference nucleotide."""
        self._checkMerge(
            (5, "AAAT", "TGCA"),
            (10, "TC", "GG"),
            (5, "AAATCTC", "TGCACGG"),
            ["chr1:5=AAAT/TGCA", "chr1:10=TC/GG"]
        )

    def testMergedRecord_3_largeCloseSubstit(self):
        """Multi-nucleotide substitutions on contiguous positions."""
        self._checkMerge(
            (5, "AAAT", "TGCA"),
            (9, "CT", "GG"),
            (5, "AAATCT", "TGCAGG"),
            ["chr1:5=AAAT/TGCA", "chr1:9=CT/GG"]
        )

    def testMergedRecord_4_delIns(self):
        """Deletion at position 5 followed by an insertion at position 10."""
        self._checkMerge(
            (5, "AAAT", "-"),
            (10, "-", "GGCATCT"),
            (5, "AAATC", "CGGCATCT"),
            ["chr1:5=AAAT/-", "chr1:10=-/GGCATCT"]
        )

    def testMergedRecord_5_coDelIns(self):
        """Deletion at position 5 followed by an insertion at position 9."""
        self._checkMerge(
            (5, "AAAT", "-"),
            (9, "-", "AGG"),
            (5, "AAAT", "AGG"),
            ["chr1:5=AAAT/-", "chr1:9=-/AGG"]
        )

    def testMergedRecord_6_insDel(self):
        """Insertion at position 5 followed by a deletion at position 7."""
        self._checkMerge(
            (5, "-", "GTGTG"),
            (7, "ATC", "-"),
            (5, "AAATC", "GTGTGAA"),
            ["chr1:5=-/GTGTG", "chr1:7=ATC/-"]
        )

    def testMergedRecord_7_closeInsDel(self):
        """Insertion at position 5 followed by a deletion at position 6."""
        self._checkMerge(
            (5, "-", "GTGTG"),
            (6, "AA", "-"),
            (5, "AAA", "GTGTGA"),
            ["chr1:5=-/GTGTG", "chr1:6=AA/-"]
        )

    def testMergedRecord_8_coInsDel(self):
        """Insertion and deletion both located at position 5."""
        self._checkMerge(
            (5, "-", "GTGTG"),
            (5, "AA", "-"),
            (5, "AA", "GTGTG"),
            ["chr1:5=-/GTGTG", "chr1:5=AA/-"]
        )
def testIsReadthrough(self):
    """Check isReadthrough() on six breakend pairs, with a distance argument of 1000."""

    def gene(symbol, strand):
        return {"SYMBOL": symbol, "STRAND": strand}

    def first(pos, name, alt, mate, annotations, cipos=None):
        # Breakend flagged RNA_FIRST. When given, CIPOS is inserted between
        # RNA_FIRST and MATEID.
        info = {"RNA_FIRST": True}
        if cipos is not None:
            info["CIPOS"] = cipos
        info["MATEID"] = mate
        info["TESTANN"] = annotations
        return VCFRecord("chr1", pos, name, "A", [alt], info=info)

    def second(pos, name, alt, mate, annotations):
        # Mate breakend.
        return VCFRecord(
            "chr1", pos, name, "A", [alt],
            info={"MATEID": mate, "TESTANN": annotations}
        )

    def readthrough(up, down):
        # The comparison functions are rebuilt at every evaluation.
        return isReadthrough(
            up, down, "TESTANN", genes, 1000,
            annCmpNameFct(False), regCmpNameFct(False)
        )

    genes = AnnotGetter(self.tmp_annot)

    # id_01/id_02: 110 -> 200, forward strand genes on both sides
    up = first(
        110, "id_01", "A[chr1:200[", "id_02",
        [gene("GENE_N01", "+"), gene("GENE_N04", "+")]
    )
    down = second(
        200, "id_02", "]chr1:110]A", "id_01",
        [gene("GENE_N02", "+"), gene("GENE_N04", "+")]
    )
    self.assertTrue(readthrough(up, down))

    # id_03/id_04: 110 -> 300, mate on GENE_N03 (+) and GENE_N06 (-)
    up = first(
        110, "id_03", "A[chr1:300[", "id_04",
        [gene("GENE_N01", "+"), gene("GENE_N04", "+")]
    )
    down = second(
        300, "id_04", "]chr1:110]A", "id_03",
        [gene("GENE_N03", "+"), gene("GENE_N06", "-")]
    )
    self.assertTrue(not readthrough(up, down))

    # id_05/id_06: 140 -> 199, mate only on GENE_N02
    up = first(
        140, "id_05", "A[chr1:199[", "id_06",
        [gene("GENE_N01", "+"), gene("GENE_N04", "+")]
    )
    down = second(
        199, "id_06", "]chr1:140]A", "id_05",
        [gene("GENE_N02", "+")]
    )
    self.assertTrue(readthrough(up, down))

    # id_07/id_08: first breakend at 289 on GENE_N06 (-), mate at 148
    up = first(
        289, "id_07", "]chr1:148]A", "id_08",
        [gene("GENE_N06", "-")]
    )
    down = second(
        148, "id_08", "A[chr1:289[", "id_07",
        [gene("GENE_N04", "+"), gene("GENE_N05", "-")]
    )
    self.assertTrue(readthrough(up, down))

    # id_09/id_10: first breakend at 180 on forward genes, mate at 299
    up = first(
        180, "id_09", "]chr1:299]A", "id_10",
        [gene("GENE_N01", "+"), gene("GENE_N04", "+")]
    )
    down = second(
        299, "id_10", "A[chr1:180[", "id_09",
        [gene("GENE_N03", "+"), gene("GENE_N06", "-")]
    )
    self.assertTrue(not readthrough(up, down))

    # id_11/id_12: same genes as id_07/id_08 but at 285 with CIPOS [0, 4]
    up = first(
        285, "id_11", "]chr1:148]A", "id_12",
        [gene("GENE_N06", "-")],
        cipos=[0, 4]
    )
    down = second(
        148, "id_12", "A[chr1:285[", "id_11",
        [gene("GENE_N04", "+"), gene("GENE_N05", "-")]
    )
    self.assertTrue(readthrough(up, down))
def setUp(self):
    """
    Build the fixture shared by the co-occurrence tests.

    Sets the temporary SAM/BAM/FASTA/faidx paths, writes the one-contig
    reference (chr1) and its index, and stores:

    * self.ref_seq: the reference sequence;
    * self.reads_content: the simulated reads in FASTA format (for each
      scenario: two reads with both variants, one reference read, one read
      with only the upstream variant and one with only the downstream one);
    * self.test_cases: one [upstream variant, downstream variant, SAM text]
      triplet per scenario. The SAM text is the stored alignment of the
      scenario's five reads (header declares bwa mem 0.7.17).

    SAM fields are separated with explicit "\\t" escapes and records with
    real line breaks: the format is TAB-delimited and must not depend on the
    editor preserving literal tab characters. The field values themselves are
    kept verbatim (including the MD tag "15^AAA1CTC99" of
    deletion_AAATCTC/T_2_alt).
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())
    self.tmp_sam_path = os.path.join(tmp_folder, unique_id + ".sam")
    self.tmp_bam_path = os.path.join(tmp_folder, unique_id + ".bam")
    self.tmp_fasta_path = os.path.join(tmp_folder, unique_id + ".fa")
    self.tmp_faidx_path = os.path.join(tmp_folder, unique_id + ".fa.fai")
    self.ref_seq = "ggaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgcattggggtg"
    #               | | | | | |  |  |  |  |
    #               1 3 5 7 9 11 14 17 20 23
    with open(self.tmp_fasta_path, "w") as FH_seq:
        FH_seq.write(">chr1\n{}".format(self.ref_seq))
    with open(self.tmp_faidx_path, "w") as FH_faidx:
        FH_faidx.write("chr1\t{}\t6\t200\t201".format(len(self.ref_seq)))
    self.reads_content = """>subtit_AAA/CAC_1_alt
ggaagccctgatcACGCCACTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAA/CAC_2_alt
aagccctgatcACGCCACTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAA/CAC_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>subtit_AAA/CAC_4_mixUp
ggaagccctgatcACGCCAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAA/CAC_5_mixDown
ggaagccctgatcACGCAACTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtitClose_AA/CC_1_alt
ggaagccctgatcACGCCCATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtitClose_AA/CC_2_alt
aagccctgatcACGCCCATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtitClose_AA/CC_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>subtitClose_AA/CC_4_mixUp
ggaagccctgatcACGCCAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtitClose_AA/CC_5_mixDown
ggaagccctgatcACGCACATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAATCTC/CCTTCGG_1_alt
ggaagccctgatcACGCCCTTCGGGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAATCTC/CCTTCGG_2_alt
aagccctgatcACGCCCTTCGGGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAATCTC/CCTTCGG_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>subtit_AAATCTC/CCTTCGG_4_mixUp
ggaagccctgatcACGCCCTTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>subtit_AAATCTC/CCTTCGG_5_mixDown
ggaagccctgatcACGCAAATCGGGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insertion_A/TGGAGG_1_alt
ggaagccctgatcACGCTGGAGGAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insertion_A/TGGAGG_2_alt
aagccctgatcACGCTGGAGGAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insertion_A/TGGAGG_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>insertion_A/TGGAGG_4_mixUp
ggaagccctgatcACGCTGGAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insertion_A/TGGAGG_5_mixDown
ggaagccctgatcACGCAGGAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>deletion_AAATCTC/T_1_alt
ggaagccctgatcACGCTGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>deletion_AAATCTC/T_2_alt
aagccctgatcACGCTGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>deletion_AAATCTC/T_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>deletion_AAATCTC/T_4_mixUp
ggaagccctgatcACGCTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>deletion_AAATCTC/T_5_mixDown
ggaagccctgatcACGCAAATGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delIns_AAAT/TGA_1_alt
ggaagccctgatcACGCTGACTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delIns_AAAT/TGA_2_alt
aagccctgatcACGCTGACTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delIns_AAAT/TGA_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>delIns_AAAT/TGA_4_mixUp
ggaagccctgatcACGCTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delIns_AAAT/TGA_5_mixDown
ggaagccctgatcACGCAAATGACTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDel_AAA/GGGA_1_alt
ggaagccctgatcACGCGGGATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDel_AAA/GGGA_2_alt
aagccctgatcACGCGGGATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDel_AAA/GGGA_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>insDel_AAA/GGGA_4_mixUp
ggaagccctgatcACGCGGGAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDel_AAA/GGGA_5_mixDown
ggaagccctgatcACGCATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delInsNoStd_AAATCTC/CTGGG_1_alt
ggaagccctgatcACGCCTGGGCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delInsNoStd_AAATCTC/CTGGG_2_alt
aagccctgatcACGCCTGGGCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delInsNoStd_AAATCTC/CTGGG_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>delInsNoStd_AAATCTC/CTGGG_4_mixUp
ggaagccctgatcACGCCTCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>delInsNoStd_AAATCTC/CTGGG_5_mixDown
ggaagccctgatcACGCAAATGGGCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_AAAT/GTGA_1_alt
ggaagccctgatcACGCGTGACTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_AAAT/GTGA_2_alt
aagccctgatcACGCGTGACTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_AAAT/GTGA_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>insDelNoStd_AAAT/GTGA_4_mixUp
ggaagccctgatcACGCGTGAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_AAAT/GTGA_5_mixDown
ggaagccctgatcACGCAACTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_CAAA/CGTGA_1_alt
ggaagccctgatcACGCGTGATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_CAAA/CGTGA_2_alt
aagccctgatcACGCGTGATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_CAAA/CGTGA_3_ref
gaagccctgatcACGCAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgc
>insDelNoStd_CAAA/CGTGA_4_mixUp
ggaagccctgatcACGCGTGAAATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca
>insDelNoStd_CAAA/CGTGA_5_mixDown
ggaagccctgatcACGCATCTCGGCATGCCGATTaagtgtgctctgaacaggacgaactggatttcctcatggaagccctgatcatcagcaaattcaaccaccagaacattgttcgctgca"""
    self.test_cases = [
        [
            VCFRecord("chr1", 18, "subtit_AAA/CAC", "A", ["C"]),
            VCFRecord("chr1", 20, "subtit_AAA/CAC", "A", ["C"]),
            """@SQ\tSN:chr1\tLN:131
@PG\tID:bwa\tPN:bwa\tVN:0.7.17-r1188\tCL:bwa mem ref.fa reads.fa
subtit_AAA/CAC_1_alt\t0\tchr1\t1\t60\t123M\t*\t0\t0\tGGAAGCCCTGATCACGCCACTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:2\tMD:Z:17A1A103\tAS:i:113\tXS:i:0
subtit_AAA/CAC_4_mixUp\t0\tchr1\t1\t60\t123M\t*\t0\t0\tGGAAGCCCTGATCACGCCAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:1\tMD:Z:17A105\tAS:i:118\tXS:i:0
subtit_AAA/CAC_5_mixDown\t0\tchr1\t1\t60\t123M\t*\t0\t0\tGGAAGCCCTGATCACGCAACTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:1\tMD:Z:19A103\tAS:i:118\tXS:i:0
subtit_AAA/CAC_3_ref\t0\tchr1\t2\t60\t118M\t*\t0\t0\tGAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC\t*\tNM:i:0\tMD:Z:118\tAS:i:118\tXS:i:0
subtit_AAA/CAC_2_alt\t0\tchr1\t3\t60\t121M\t*\t0\t0\tAAGCCCTGATCACGCCACTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:2\tMD:Z:15A1A103\tAS:i:111\tXS:i:0"""
        ],
        [
            VCFRecord("chr1", 18, "subtitClose_AA/CC", "A", ["C"]),
            VCFRecord("chr1", 19, "subtitClose_AA/CC", "A", ["C"]),
            """@SQ\tSN:chr1\tLN:131
@PG\tID:bwa\tPN:bwa\tVN:0.7.17-r1188\tCL:bwa mem ref.fa reads.fa
subtitClose_AA/CC_1_alt\t0\tchr1\t1\t60\t123M\t*\t0\t0\tGGAAGCCCTGATCACGCCCATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:2\tMD:Z:17A0A104\tAS:i:113\tXS:i:0
subtitClose_AA/CC_4_mixUp\t0\tchr1\t1\t60\t123M\t*\t0\t0\tGGAAGCCCTGATCACGCCAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:1\tMD:Z:17A105\tAS:i:118\tXS:i:0
subtitClose_AA/CC_5_mixDown\t0\tchr1\t1\t60\t123M\t*\t0\t0\tGGAAGCCCTGATCACGCACATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:1\tMD:Z:18A104\tAS:i:118\tXS:i:0
subtitClose_AA/CC_3_ref\t0\tchr1\t2\t60\t118M\t*\t0\t0\tGAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC\t*\tNM:i:0\tMD:Z:118\tAS:i:118\tXS:i:0
subtitClose_AA/CC_2_alt\t0\tchr1\t3\t60\t121M\t*\t0\t0\tAAGCCCTGATCACGCCCATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:2\tMD:Z:15A0A104\tAS:i:111\tXS:i:0"""
        ],
        [
            VCFRecord("chr1", 18, "subtit_AAATCTC/CCTTCGG", "AAA", ["CCT"]),
            VCFRecord("chr1", 23, "subtit_AAATCTC/CCTTCGG", "TC", ["GG"]),
            """@SQ\tSN:chr1\tLN:131
@PG\tID:bwa\tPN:bwa\tVN:0.7.17-r1188\tCL:bwa mem ref.fa reads.fa
subtit_AAATCTC/CCTTCGG_1_alt\t0\tchr1\t1\t60\t123M\t*\t0\t0\tGGAAGCCCTGATCACGCCCTTCGGGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:5\tMD:Z:17A0A0A2T0C99\tAS:i:99\tXS:i:0
subtit_AAATCTC/CCTTCGG_4_mixUp\t0\tchr1\t1\t60\t123M\t*\t0\t0\tGGAAGCCCTGATCACGCCCTTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:3\tMD:Z:17A0A0A103\tAS:i:108\tXS:i:0
subtit_AAATCTC/CCTTCGG_5_mixDown\t0\tchr1\t1\t60\t123M\t*\t0\t0\tGGAAGCCCTGATCACGCAAATCGGGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:2\tMD:Z:22T0C99\tAS:i:113\tXS:i:0
subtit_AAATCTC/CCTTCGG_3_ref\t0\tchr1\t2\t60\t118M\t*\t0\t0\tGAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC\t*\tNM:i:0\tMD:Z:118\tAS:i:118\tXS:i:0
subtit_AAATCTC/CCTTCGG_2_alt\t0\tchr1\t3\t60\t121M\t*\t0\t0\tAAGCCCTGATCACGCCCTTCGGGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:5\tMD:Z:15A0A0A2T0C99\tAS:i:99\tXS:i:0"""
        ],
        [
            VCFRecord("chr1", 18, "insertion_A/TGGAGG", "-", ["TGG"]),
            VCFRecord("chr1", 19, "insertion_A/TGGAGG", "-", ["GG"]),
            """@SQ\tSN:chr1\tLN:131
@PG\tID:bwa\tPN:bwa\tVN:0.7.17-r1188\tCL:bwa mem ref.fa reads.fa
insertion_A/TGGAGG_1_alt\t0\tchr1\t1\t60\t17M3I1M2I105M\t*\t0\t0\tGGAAGCCCTGATCACGCTGGAGGAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:5\tMD:Z:123\tAS:i:107\tXS:i:0
insertion_A/TGGAGG_4_mixUp\t0\tchr1\t1\t60\t17M3I106M\t*\t0\t0\tGGAAGCCCTGATCACGCTGGAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:3\tMD:Z:123\tAS:i:114\tXS:i:0
insertion_A/TGGAGG_5_mixDown\t0\tchr1\t1\t60\t18M2I105M\t*\t0\t0\tGGAAGCCCTGATCACGCAGGAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:2\tMD:Z:123\tAS:i:115\tXS:i:0
insertion_A/TGGAGG_3_ref\t0\tchr1\t2\t60\t118M\t*\t0\t0\tGAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC\t*\tNM:i:0\tMD:Z:118\tAS:i:118\tXS:i:0
insertion_A/TGGAGG_2_alt\t0\tchr1\t3\t60\t15M3I1M2I105M\t*\t0\t0\tAAGCCCTGATCACGCTGGAGGAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:5\tMD:Z:121\tAS:i:105\tXS:i:0"""
        ],
        [
            VCFRecord("chr1", 18, "deletion_AAATCTC/T", "AAA", ["-"]),
            VCFRecord("chr1", 22, "deletion_AAATCTC/T", "CTC", ["-"]),
            """@SQ\tSN:chr1\tLN:131
@PG\tID:bwa\tPN:bwa\tVN:0.7.17-r1188\tCL:bwa mem ref.fa reads.fa
deletion_AAATCTC/T_1_alt\t0\tchr1\t1\t60\t17M3D1M3D99M\t*\t0\t0\tGGAAGCCCTGATCACGCTGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:5\tMD:Z:17^AAA1^CTC99\tAS:i:100\tXS:i:0
deletion_AAATCTC/T_4_mixUp\t0\tchr1\t1\t60\t17M3D103M\t*\t0\t0\tGGAAGCCCTGATCACGCTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:3\tMD:Z:17^AAA103\tAS:i:111\tXS:i:0
deletion_AAATCTC/T_5_mixDown\t0\tchr1\t1\t60\t21M3D99M\t*\t0\t0\tGGAAGCCCTGATCACGCAAATGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:3\tMD:Z:21^CTC99\tAS:i:111\tXS:i:0
deletion_AAATCTC/T_3_ref\t0\tchr1\t2\t60\t118M\t*\t0\t0\tGAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC\t*\tNM:i:0\tMD:Z:118\tAS:i:118\tXS:i:0
deletion_AAATCTC/T_2_alt\t0\tchr1\t3\t60\t15M3D1M3D99M\t*\t0\t0\tAAGCCCTGATCACGCTGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:5\tMD:Z:15^AAA1CTC99\tAS:i:99\tXS:i:0"""
        ],
        [
            VCFRecord("chr1", 18, "delIns_AAAT/TGA", "AAA", ["-"]),
            VCFRecord("chr1", 22, "delIns_AAAT/TGA", "-", ["GA"]),
            """@SQ\tSN:chr1\tLN:131
@PG\tID:bwa\tPN:bwa\tVN:0.7.17-r1188\tCL:bwa mem ref.fa reads.fa
delIns_AAAT/TGA_1_alt\t0\tchr1\t1\t60\t17M3D1M2I102M\t*\t0\t0\tGGAAGCCCTGATCACGCTGACTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:5\tMD:Z:17^AAA103\tAS:i:105\tXS:i:0
delIns_AAAT/TGA_4_mixUp\t0\tchr1\t1\t60\t17M3D103M\t*\t0\t0\tGGAAGCCCTGATCACGCTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:3\tMD:Z:17^AAA103\tAS:i:111\tXS:i:0
delIns_AAAT/TGA_5_mixDown\t0\tchr1\t1\t60\t21M2I102M\t*\t0\t0\tGGAAGCCCTGATCACGCAAATGACTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:2\tMD:Z:123\tAS:i:115\tXS:i:0
delIns_AAAT/TGA_3_ref\t0\tchr1\t2\t60\t118M\t*\t0\t0\tGAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC\t*\tNM:i:0\tMD:Z:118\tAS:i:118\tXS:i:0
delIns_AAAT/TGA_2_alt\t0\tchr1\t3\t60\t15M3D1M2I102M\t*\t0\t0\tAAGCCCTGATCACGCTGACTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:5\tMD:Z:15^AAA103\tAS:i:103\tXS:i:0"""
        ],
        [
            VCFRecord("chr1", 18, "insDel_AAA/GGGA", "-", ["GGG"]),
            VCFRecord("chr1", 19, "insDel_AAA/GGGA", "AA", ["-"]),
            """@SQ\tSN:chr1\tLN:131
@PG\tID:bwa\tPN:bwa\tVN:0.7.17-r1188\tCL:bwa mem ref.fa reads.fa
insDel_AAA/GGGA_1_alt\t0\tchr1\t1\t60\t17M3I1M2D103M\t*\t0\t0\tGGAAGCCCTGATCACGCGGGATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:5\tMD:Z:18^AA103\tAS:i:106\tXS:i:0
insDel_AAA/GGGA_4_mixUp\t0\tchr1\t1\t60\t17M3I106M\t*\t0\t0\tGGAAGCCCTGATCACGCGGGAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:3\tMD:Z:123\tAS:i:114\tXS:i:0
insDel_AAA/GGGA_5_mixDown\t0\tchr1\t1\t60\t18M2D103M\t*\t0\t0\tGGAAGCCCTGATCACGCATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:2\tMD:Z:18^AA103\tAS:i:113\tXS:i:0
insDel_AAA/GGGA_3_ref\t0\tchr1\t2\t60\t118M\t*\t0\t0\tGAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC\t*\tNM:i:0\tMD:Z:118\tAS:i:118\tXS:i:0
insDel_AAA/GGGA_2_alt\t0\tchr1\t3\t60\t15M3I1M2D103M\t*\t0\t0\tAAGCCCTGATCACGCGGGATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:5\tMD:Z:16^AA103\tAS:i:104\tXS:i:0"""
        ],
        [
            VCFRecord("chr1", 18, "delInsNoStd_AAATCTC/CTGGG", "AAA", ["C"]),
            VCFRecord("chr1", 22, "delInsNoStd_AAATCTC/CTGGG", "-", ["GGG"]),
            """@SQ\tSN:chr1\tLN:131
@PG\tID:bwa\tPN:bwa\tVN:0.7.17-r1188\tCL:bwa mem ref.fa reads.fa
delInsNoStd_AAATCTC/CTGGG_1_alt\t0\tchr1\t1\t60\t17M2D2M3I102M\t*\t0\t0\tGGAAGCCCTGATCACGCCTGGGCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:6\tMD:Z:17^AA0C103\tAS:i:102\tXS:i:0
delInsNoStd_AAATCTC/CTGGG_4_mixUp\t0\tchr1\t1\t60\t17M2D104M\t*\t0\t0\tGGAAGCCCTGATCACGCCTCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:3\tMD:Z:17^AA0C103\tAS:i:108\tXS:i:0
delInsNoStd_AAATCTC/CTGGG_5_mixDown\t0\tchr1\t1\t60\t21M3I102M\t*\t0\t0\tGGAAGCCCTGATCACGCAAATGGGCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:3\tMD:Z:123\tAS:i:114\tXS:i:0
delInsNoStd_AAATCTC/CTGGG_3_ref\t0\tchr1\t2\t60\t118M\t*\t0\t0\tGAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC\t*\tNM:i:0\tMD:Z:118\tAS:i:118\tXS:i:0
delInsNoStd_AAATCTC/CTGGG_2_alt\t0\tchr1\t3\t60\t15M2D2M3I102M\t*\t0\t0\tAAGCCCTGATCACGCCTGGGCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:6\tMD:Z:15^AA0C103\tAS:i:102\tXS:i:0"""
        ],
        [
            VCFRecord("chr1", 18, "insDelNoStd_AAAT/GTGA", "A", ["GTG"]),
            VCFRecord("chr1", 20, "insDelNoStd_AAAT/GTGA", "AT", ["-"]),
            """@SQ\tSN:chr1\tLN:131
@PG\tID:bwa\tPN:bwa\tVN:0.7.17-r1188\tCL:bwa mem ref.fa reads.fa
insDelNoStd_AAAT/GTGA_1_alt\t0\tchr1\t1\t60\t17M1D3I1M2D102M\t*\t0\t0\tGGAAGCCCTGATCACGCGTGACTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:6\tMD:Z:17^A103\tAS:i:103\tXS:i:0
insDelNoStd_AAAT/GTGA_4_mixUp\t0\tchr1\t1\t60\t17M1D3I105M\t*\t0\t0\tGGAAGCCCTGATCACGCGTGAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:3\tMD:Z:17^A105\tAS:i:110\tXS:i:0
insDelNoStd_AAAT/GTGA_5_mixDown\t0\tchr1\t1\t60\t19M2D102M\t*\t0\t0\tGGAAGCCCTGATCACGCAACTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:2\tMD:Z:19^AT102\tAS:i:113\tXS:i:0
insDelNoStd_AAAT/GTGA_3_ref\t0\tchr1\t2\t60\t118M\t*\t0\t0\tGAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC\t*\tNM:i:0\tMD:Z:118\tAS:i:118\tXS:i:0
insDelNoStd_AAAT/GTGA_2_alt\t0\tchr1\t3\t60\t15M1D3I1M2D102M\t*\t0\t0\tAAGCCCTGATCACGCGTGACTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:6\tMD:Z:15A0A0A0T102\tAS:i:102\tXS:i:0"""
        ],
        [
            VCFRecord("chr1", 17, "insDelNoStd_CAAA/CGTGA", "C", ["CGTG"]),
            VCFRecord("chr1", 18, "insDelNoStd_CAAA/CGTGA", "AAA", ["A"]),
            """@SQ\tSN:chr1\tLN:131
@PG\tID:bwa\tPN:bwa\tVN:0.7.17-r1188\tCL:bwa mem ref.fa reads.fa
insDelNoStd_CAAA/CGTGA_1_alt\t0\tchr1\t1\t60\t17M3I1M2D103M\t*\t0\t0\tGGAAGCCCTGATCACGCGTGATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:5\tMD:Z:18^AA103\tAS:i:106\tXS:i:0
insDelNoStd_CAAA/CGTGA_4_mixUp\t0\tchr1\t1\t60\t17M3I106M\t*\t0\t0\tGGAAGCCCTGATCACGCGTGAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:3\tMD:Z:123\tAS:i:114\tXS:i:0
insDelNoStd_CAAA/CGTGA_5_mixDown\t0\tchr1\t1\t60\t18M2D103M\t*\t0\t0\tGGAAGCCCTGATCACGCATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:2\tMD:Z:18^AA103\tAS:i:113\tXS:i:0
insDelNoStd_CAAA/CGTGA_3_ref\t0\tchr1\t2\t60\t118M\t*\t0\t0\tGAAGCCCTGATCACGCAAATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGC\t*\tNM:i:0\tMD:Z:118\tAS:i:118\tXS:i:0
insDelNoStd_CAAA/CGTGA_2_alt\t0\tchr1\t3\t60\t15M3I1M2D103M\t*\t0\t0\tAAGCCCTGATCACGCGTGATCTCGGCATGCCGATTAAGTGTGCTCTGAACAGGACGAACTGGATTTCCTCATGGAAGCCCTGATCATCAGCAAATTCAACCACCAGAACATTGTTCGCTGCA\t*\tNM:i:5\tMD:Z:16^AA103\tAS:i:104\tXS:i:0"""
        ]
    ]
def setUp(self):
    """
    Build the fixture for the standardizeVCF.py command-line tests.

    Creates a two-contig reference (FASTA + faidx) and an input VCF, and
    prepares self.cmd, the command that reads them and writes
    self.tmp_output. Each record of self.variants carries in its INFO:

    * "expected": the expected standardized "{chrom}:{pos}={ref}/{alt}"
      name of each alternative allele;
    * "ANN": input annotations (Allele|Annotation_id|Alt_allele_idx);
    * "expectedANN": the annotations expected after standardization.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "standardizeVCF.py",
        "--trace-unstandard",
        "--input-reference", self.tmp_sequences,
        "--input-variants", self.tmp_variants,
        "--output-variants", self.tmp_output
    ]

    # Create fasta
    # artificial_chr1 contains short repeats (ATGT x3 at 8-19, AAA at 26-28)
    # in which the indels of the fixture can be shifted.
    chr1_seq = "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"
    # 1-based:  123456789012345678901234567890123456789012345
    #                    1         2         3         4
    chr2_seq = "CGATNNNCGAT"
    # 1-based:  12345678901
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        FH_seq.write(Sequence("artificial_chr1", chr1_seq))
        FH_seq.write(Sequence("artificial_chr2", chr2_seq))

    # Create faidx (TAB-delimited: name, length, offset, line bases, line width)
    with open(self.tmp_faidx, "w") as FH_fai:
        FH_fai.write(
            "artificial_chr1\t45\t17\t45\t46\n"
            "artificial_chr2\t11\t80\t11\t12"
        )

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "expected": HeaderInfoAttr("expected", "Standardized version of {chrom}:{pos}={ref}/{alt}.", type="String", number="."),
            "ANN": HeaderInfoAttr("ANN", "Annotation of variants Format: Allele|Annotation_id|Alt_allele_idx", type="String", number="."),
            "expectedANN": HeaderInfoAttr("expectedANN", "Standardized version of annotations Format: Allele|Annotation_id|Alt_allele_idx", type="String", number=".")
        }
        FH_var.writeHeader()
        self.variants = [
            # Substit single nt
            VCFRecord("artificial_chr1", 14, "sub_01", "G", ["T"], None, None, {
                "expected": ["artificial_chr1:14=G/T"],
                "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 19, "sub_02", "T", ["A", "C"], None, None, {
                "expected": ["artificial_chr1:19=T/A", "artificial_chr1:19=T/C"],
                "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                "expectedANN": ["A|ann_1|0", "A|ann_2|0"]
            }),
            # Substit multi nt
            VCFRecord("artificial_chr1", 7, "sub_03", "CATGTATG", ["GTACCCGC"], None, None, {
                "expected": ["artificial_chr1:7=CATGTATG/GTACCCGC"],
                "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTGT|ann_3|"],
                "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 11, "sub_04", "TATGTATG", ["GTACCCGC", "GTACCCAA"], None, None, {
                "expected": ["artificial_chr1:11=TATGTATG/GTACCCGC", "artificial_chr1:11=TATGTATG/GTACCCAA"],
                "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"],
                "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"]
            }),
            # Insertion single nt
            VCFRecord("artificial_chr1", 14, "ins_01", "G", ["GA"], None, None, {
                "expected": ["artificial_chr1:14=G/GA"],
                "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GT|ann_3|"],
                "expectedANN": ["GA|ann_1|0", "GA|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 20, "ins_02", "-", ["A"], None, None, {
                "expected": ["artificial_chr1:19=T/TA"],
                "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                "expectedANN": ["TA|ann_1|0", "TA|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 14, "ins_03", "G", ["GA", "GC"], None, None, {
                "expected": ["artificial_chr1:14=G/GA", "artificial_chr1:14=G/GC"],
                "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1", "GT|ann_4|"],
                "expectedANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1"]
            }),
            VCFRecord("artificial_chr1", 20, "ins_04", "-", ["A", "C"], None, None, {
                "expected": ["artificial_chr1:19=T/TA", "artificial_chr1:19=T/TC"],
                "ANN": ["A|ann_1|0", "A|ann_2|0", "C|ann_3|1", "T|ann_4|"],
                "expectedANN": ["TA|ann_1|0", "TA|ann_2|0", "TC|ann_3|1"]
            }),
            # Insertion multi nt
            VCFRecord("artificial_chr1", 14, "ins_05", "G", ["GATGC"], None, None, {
                "expected": ["artificial_chr1:14=G/GATGC"],
                "ANN": ["GATGC|ann_1|0", "GATGC|ann_2|0", "GAAAC|ann_3|"],
                "expectedANN": ["GATGC|ann_1|0", "GATGC|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 20, "ins_06", "-", ["AAATC"], None, None, {
                "expected": ["artificial_chr1:19=T/TAAATC"],
                "ANN": ["AAATC|ann_1|0", "AAATC|ann_2|0", "GAAAC|ann_3|"],
                "expectedANN": ["TAAATC|ann_1|0", "TAAATC|ann_2|0"]
            }),
            # Movable insertion multi nt
            VCFRecord("artificial_chr1", 14, "ins_07", "G", ["GTG"], None, None, {
                "expected": ["artificial_chr1:12=A/ATG"],
                "ANN": ["GTG|ann_1|0", "GTG|ann_2|0", "GAAAC|ann_3|"],
                "expectedANN": ["ATG|ann_1|0", "ATG|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 27, "ins_08", "A", ["AAAA"], None, None, {
                "expected": ["artificial_chr1:25=C/CAAA"],
                "ANN": ["AAAA|ann_1|0", "AAAA|ann_2|0", "CAAA|ann_3|"],
                "expectedANN": ["CAAA|ann_1|0", "CAAA|ann_2|0"]
            }),
            # Deletion single nt
            VCFRecord("artificial_chr1", 14, "del_01", "G", [""], None, None, {
                "expected": ["artificial_chr1:13=TG/T"],
                "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 14, "del_02", "G", ["-"], None, None, {
                "expected": ["artificial_chr1:13=TG/T"],
                "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 13, "del_03", "TG", ["T"], None, None, {
                "expected": ["artificial_chr1:13=TG/T"],
                "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
            }),
            VCFRecord("artificial_chr1", 13, "del_04", "TG", ["T", "-"], None, None, {
                "expected": ["artificial_chr1:13=TG/T", "artificial_chr1:12=ATG/A"],
                "ANN": ["T|ann_1|0", "T|ann_2|0", "-|ann_3|1"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1"]
            }),
            # Movable deletion multi nt
            VCFRecord("artificial_chr1", 11, "del_05", "TATG", ["T", "TA", "-"], None, None, {
                "expected": ["artificial_chr1:11=TATG/T", "artificial_chr1:12=ATG/A", "artificial_chr1:7=CATGT/C"],
                "ANN": ["T|ann_1|0", "T|ann_2|0", "TA|ann_3|1", "-|ann_4|2"],
                "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1", "C|ann_4|2"]
            }),
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
def testTagMultipleValues(self):
    """
    Run self.cmd on two variants and compare the first filter of each output record.

    The input VCF is rewritten with two substitutions annotated by strand
    read counts (SAR/SAF for the alternative allele, SRR/SRF for the
    reference allele) and by the filter tag expected in the output
    ("expected"). The test passes when the "id:filter" pairs read from
    self.tmp_output match, in order, the pairs expected from the input
    (one per alternative allele).
    """
    # Write test data
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "expected": HeaderInfoAttr("expected", "Expected filter tag.", type="String", number="1"),
            "SAR": HeaderInfoAttr(
                "SAR", "Number of reads supporting the alternative allele in reverse strand.",
                type="Integer", number="A"),
            "SAF": HeaderInfoAttr(
                "SAF", "Number of reads supporting the alternative allele in forward strand.",
                type="Integer", number="A"),
            "SRR": HeaderInfoAttr(
                "SRR", "Number of reads supporting the reference allele in reverse strand.",
                type="Integer", number="A"),
            "SRF": HeaderInfoAttr(
                "SRF", "Number of reads supporting the reference allele in forward strand.",
                type="Integer", number="A"),
        }
        FH_var.writeHeader()
        self.variants = [
            # 0.5 alt, 0.5 ref, low DP, alt no bias, ref no bias
            VCFRecord(
                "artificial_chr1", 10, "sub_01", "G", ["T"], None, None,
                {
                    "SAR": [5],
                    "SAF": [5],
                    "SRR": [5],
                    "SRF": [5],
                    "expected": "PASS"
                }),
            # 0.05 alt, 0.95 ref, good DP, alt strand bias, ref no bias
            VCFRecord(
                "artificial_chr1", 40, "sub_04", "G", ["T"], None, None,
                {
                    "SAR": [9],
                    "SAF": [1],
                    "SRR": [95],
                    "SRF": [95],
                    "expected": "strandRatioBias"
                })
        ]
        for curr_var in self.variants:
            FH_var.write(curr_var)
    # Execute command
    subprocess.check_call(self.cmd, stderr=subprocess.DEVNULL)
    # Validate results
    # One expected entry per alternative allele of each input record.
    expected = [
        record.id + ":" + record.info["expected"]
        for record in self.variants
        for _ in record.alt
    ]
    with VCFIO(self.tmp_output) as FH_results:
        observed = [
            record.id + ":" + record.filter[0]
            for record in FH_results
        ]
    self.assertEqual(expected, observed)
def mergedRecord(vcf, first, first_std_name, second, second_std_name, FH_seq):
    """
    Return the VCFRecord corresponding to the merge of first and second.

    The merged record starts at the position of first. Its reference allele
    is first.ref + the reference sequence located between the two variants +
    second.ref; its single alternative allele is built the same way from the
    first alternative allele of each variant (other alleles are ignored).
    Filters are the sorted union of both variants' filters, without "PASS"
    when another filter is present. Sample DP/AD/AF take the minimum of the
    two variants, INFO AD/DP/AF are recomputed from the merged record, and
    the parents' names and qualities are traced in INFO MCO_VAR and MCO_QUAL.
    The merged quality is the mean of the parents' qualities when all are
    known.

    :param vcf: The file handle to VCF.
    :type vcf: anacore.vcf.VCFIO
    :param first: The upstream variant to merge.
    :type first: anacore.vcf.VCFRecord
    :param first_std_name: The initial name of the upstream variant to merge (before normalisation).
    :type first_std_name: str
    :param second: The downstream variant to merge.
    :type second: anacore.vcf.VCFRecord
    :param second_std_name: The initial name of the downstream variant to merge (before normalisation).
    :type second_std_name: str
    :param FH_seq: File handle to the reference sequence file.
    :type FH_seq: IdxFastaIO
    :return: The variant corresponding to the merge of first and second.
    :rtype: anacore.vcf.VCFRecord
    :todo: Keep INFO and format on strand from FreeBayes, VarDict, ...
    """
    merged = VCFRecord(
        first.chrom,  # chrom
        first.pos,  # pos
        pFormat=first.format
    )
    # Ref and Alt
    empty_marker = VCFRecord.getEmptyAlleleMarker()
    # refEnd()/refStart() can be half positions (x.5); the -/+0.49 shifts make
    # the rounding fall on the last/first whole reference position concerned.
    first_end = int(round(first.refEnd() - 0.49, 0))
    second_start = int(round(second.refStart() + 0.49, 0))
    ref_add = ""
    if second_start - first_end > 1:
        # At least one reference nucleotide separates the two variants. For
        # adjacent variants (gap of 1) nothing is fetched: the interval
        # [first_end + 1, second_start - 1] would be empty.
        ref_add = FH_seq.getSub(first.chrom, first_end + 1, second_start - 1)
    merged.ref = (first.ref + ref_add + second.ref).replace(empty_marker, "")
    merged.alt = [
        (first.alt[0] + ref_add + second.alt[0]).replace(empty_marker, "")
    ]
    # Filter
    first_filters = [] if first.filter is None else first.filter
    second_filters = [] if second.filter is None else second.filter
    # Sorted to get a reproducible order (set iteration order is arbitrary).
    merged.filter = sorted(set(first_filters + second_filters))
    if len(merged.filter) > 1 and "PASS" in merged.filter:
        merged.filter.remove("PASS")
    # Samples
    for spl in first.samples:
        merged.samples[spl] = {}
        if "DP" in first.format:
            merged.samples[spl]["DP"] = min(first.getDP(spl), second.getDP(spl))
        for tag in ("AD", "AF"):
            if tag in first.format:
                merged.samples[spl][tag] = _minMetric(
                    vcf.format[tag].number,
                    first.samples[spl][tag],
                    second.samples[spl][tag]
                )
    # INFO metrics
    if "AD" in first.info:
        merged.info["AD"] = _popMetric(
            vcf.info["AD"].number, merged.getPopRefAD, merged.getPopAltAD
        )
    if "DP" in first.info:
        merged.info["DP"] = merged.getPopDP()
    if "AF" in first.info:
        merged.info["AF"] = _popMetric(
            vcf.info["AF"].number, merged.getPopRefAF, merged.getPopAltAF
        )
    # INFO Parents
    merged.info["MCO_VAR"] = (
        _parentsTrace(first, "MCO_VAR", first_std_name) +
        _parentsTrace(second, "MCO_VAR", second_std_name)
    )
    # Quality
    merged.info["MCO_QUAL"] = (
        _parentsTrace(first, "MCO_QUAL", first.qual) +
        _parentsTrace(second, "MCO_QUAL", second.qual)
    )
    if None not in merged.info["MCO_QUAL"]:
        merged.qual = mean(merged.info["MCO_QUAL"])
    # Return
    return merged


def _minMetric(number, first_value, second_value):
    """Return the minimum of a metric in two variants: scalar if number is "1", element-wise otherwise."""
    if number == "1":  # Contains one alt allele
        return min(first_value, second_value)
    return [min(val_a, val_b) for val_a, val_b in zip(first_value, second_value)]


def _popMetric(number, getPopRef, getPopAlt):
    """Return a population metric shaped after the header number: "1" (first alt), "R" (ref + alts) or alts only."""
    if number == "1":  # Contains one alt allele
        return getPopAlt()[0]
    if number == "R":  # Contains ref and alt alleles
        return [getPopRef()] + getPopAlt()
    return getPopAlt()  # Contains only alt alleles


def _parentsTrace(variant, tag, default):
    """Return a copy of the values traced in variant.info[tag], or [default] if variant is not already a merge."""
    if tag in variant.info:
        return list(variant.info[tag])
    return [default]
def setUp(self):
    """
    Build the haplotype test cases and write the per-caller VCF fixtures.

    For every caller found in the "initial" section of the test cases, three
    VCF files are written in the temporary folder: the initial calls, the
    haplotyped calls and the expected result. A path template for the output
    of the tested process is also stored, but no output file is created here.
    """
    tmp_dir = tempfile.gettempdir()
    run_id = str(uuid.uuid1())
    # Path templates: the "{}" placeholder is filled with the caller name.
    self.tmp_initial_pathes = os.path.join(tmp_dir, run_id + "_{}_initial.vcf")
    self.tmp_haplotyped_pathes = os.path.join(tmp_dir, run_id + "_{}_haplotyped.vcf")
    self.tmp_expected_pathes = os.path.join(tmp_dir, run_id + "_{}_expected.vcf")
    self.tmp_out_pathes = os.path.join(tmp_dir, run_id + "_{}_out.vcf")

    # Test cases.
    # The comment heading each case lists, caller by caller (caller1 to
    # caller4), the shape of the initial calls: "a-b" is a call spanning a
    # and b, "a b" are separate calls, "a'" / "b'" is an additional call on
    # the position of a / b and "/" means that the caller has no call.
    # NOTE(review): "*" presumably flags the caller whose representation is
    # kept in the expected section - confirm against the tested script.
    self.test_cases = [
        {  # *a-b, a-b, a b, /
            "initial": {
                "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller3": [
                    VCFRecord("chr1", 14, None, "G", ["C"], info={"AD": 100}),
                    VCFRecord("chr1", 18, None, "A", ["G"], info={"AD": 104}),
                ],
            },
            "haplotyped": {
                "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller3": [
                    VCFRecord(
                        "chr1", 14, None, "GCGTA", ["CCGTG"],
                        info={"MCO_VAR": ["chr1:14=G/C", "chr1:18=A/G"], "AD": 100}
                    ),
                ],
            },
            "expected": {
                "caller1": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller2": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"])],
                "caller3": [VCFRecord("chr1", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104})],
            },
        },
        {  # *a b, a b, a-b, /
            "initial": {
                "caller1": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"]),
                ],
                "caller3": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"])],
            },
            "haplotyped": {
                "caller1": [
                    VCFRecord(
                        "chr2", 14, None, "GCGTA", ["CCGTG"],
                        info={"MCO_VAR": ["chr2:14=G/C", "chr2:18=A/G"]}
                    ),
                ],
                "caller2": [
                    VCFRecord(
                        "chr2", 14, None, "GCGTA", ["CCGTG"],
                        info={"MCO_VAR": ["chr2:14=G/C", "chr2:18=A/G"]}
                    ),
                ],
                "caller3": [VCFRecord("chr2", 14, None, "GCGTA", ["CCGTG"])],
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"]),
                ],
                "caller3": [
                    VCFRecord("chr2", 14, None, "G", ["C"]),
                    VCFRecord("chr2", 18, None, "A", ["G"]),
                ],
            },
        },
        {  # *a-b c, a-b c, a b c, /
            "initial": {
                "caller1": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr3", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr3", 20, None, "A", ["G"]),
                ],
                "caller3": [
                    VCFRecord("chr3", 14, None, "G", ["C"], info={"AD": 104}),
                    VCFRecord("chr3", 18, None, "A", ["G"], info={"AD": 100}),
                    VCFRecord("chr3", 20, None, "A", ["G"], info={"AD": 98}),
                ],
            },
            "haplotyped": {
                "caller1": [
                    VCFRecord(
                        "chr3", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr3:14=GCGTA/CCGTG", "chr3:20=A/G"]}
                    ),
                ],
                "caller2": [
                    VCFRecord(
                        "chr3", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr3:14=GCGTA/CCGTG", "chr3:20=A/G"]}
                    ),
                ],
                "caller3": [
                    VCFRecord(
                        "chr3", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr3:14=G/C", "chr3:18=A/G", "chr3:20=A/G"], "AD": 98}
                    ),
                ],
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr3", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr3", 20, None, "A", ["G"]),
                ],
                "caller3": [
                    VCFRecord("chr3", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104}),
                    VCFRecord("chr3", 20, None, "A", ["G"], info={"AD": 98}),
                ],
            },
        },
        {  # *a-b c, a-b c, a b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"]),
                ],
                "caller3": [
                    VCFRecord("chr4", 14, None, "G", ["C"], info={"AD": 98}),
                    VCFRecord("chr4", 18, None, "A", ["G"], info={"AD": 104}),
                    VCFRecord("chr4", 20, None, "A", ["G"], info={"AD": 100}),
                ],
                "caller4": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"])],
            },
            "haplotyped": {
                "caller1": [
                    VCFRecord(
                        "chr4", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr4:14=GCGTA/CCGTG", "chr4:20=A/G"]}
                    ),
                ],
                "caller2": [
                    VCFRecord(
                        "chr4", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr4:14=GCGTA/CCGTG", "chr4:20=A/G"]}
                    ),
                ],
                "caller3": [
                    VCFRecord(
                        "chr4", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr4:14=G/C", "chr4:18=A/G", "chr4:20=A/G"], "AD": 98}
                    ),
                ],
                "caller4": [VCFRecord("chr4", 14, None, "GCGTATCA", ["CCGTGTCG"])],
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"]),
                ],
                "caller3": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"], info={"AD": 104}),
                    VCFRecord("chr4", 20, None, "A", ["G"], info={"AD": 100}),
                ],
                "caller4": [
                    VCFRecord("chr4", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr4", 20, None, "A", ["G"]),
                ],
            },
        },
        {  # *a-b c, a' a-b c, a b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr5", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 104}),
                ],
                "caller3": [
                    VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 110}),
                    VCFRecord("chr5", 18, None, "A", ["G"], info={"AD": 105}),
                    VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 100}),
                ],
                "caller4": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"])],
            },
            "haplotyped": {
                "caller1": [
                    VCFRecord(
                        "chr5", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr5:14=GCGTA/CCGTG", "chr5:20=A/G"]}
                    ),
                ],
                "caller2": [
                    VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord(
                        "chr5", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr5:14=GCGTA/CCGTG", "chr5:20=A/G"], "AD": 100}
                    ),
                ],
                "caller3": [
                    VCFRecord(
                        "chr5", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr5:14=G/C", "chr5:18=A/G", "chr5:20=A/G"], "AD": 100}
                    ),
                ],
                "caller4": [VCFRecord("chr5", 14, None, "GCGTATCA", ["CCGTGTCG"])],
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr5", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr5", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 104}),
                ],
                "caller3": [
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                    VCFRecord("chr5", 20, None, "A", ["G"], info={"AD": 100}),
                ],
                "caller4": [
                    VCFRecord("chr5", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr5", 20, None, "A", ["G"]),
                ],
            },
        },
        {  # *a b c, a' a-b c, a-b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr6", 14, None, "G", ["C"]),
                    VCFRecord("chr6", 18, None, "A", ["G"]),
                    VCFRecord("chr6", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord("chr6", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 104}),
                ],
                "caller3": [
                    VCFRecord("chr6", 14, None, "GCGTA", ["CCGTG"], info={"AD": 105}),
                    VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 101}),
                ],
                "caller4": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"])],
            },
            "haplotyped": {
                "caller1": [
                    VCFRecord(
                        "chr6", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr6:14=G/C", "chr6:18=A/G", "chr6:20=A/G"]}
                    ),
                ],
                "caller2": [
                    VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 3}),
                    VCFRecord(
                        "chr6", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr6:14=GCGTA/CCGTG", "chr6:20=A/G"], "AD": 100}
                    ),
                ],
                "caller3": [
                    VCFRecord(
                        "chr6", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr6:14=GCGTA/CCGTG", "chr6:20=A/G"], "AD": 101}
                    ),
                ],
                "caller4": [VCFRecord("chr6", 14, None, "GCGTATCA", ["CCGTGTCG"])],
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr6", 14, None, "G", ["C"]),
                    VCFRecord("chr6", 18, None, "A", ["G"]),
                    VCFRecord("chr6", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 100}),
                    VCFRecord("chr6", 18, None, "A", ["G"], info={"AD": 100}),
                    VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 104}),
                ],
                "caller3": [
                    VCFRecord("chr6", 14, None, "G", ["C"], info={"AD": 105}),
                    VCFRecord("chr6", 18, None, "A", ["G"], info={"AD": 105}),
                    VCFRecord("chr6", 20, None, "A", ["G"], info={"AD": 101}),
                ],
                "caller4": [
                    VCFRecord("chr6", 14, None, "G", ["C"]),
                    VCFRecord("chr6", 18, None, "A", ["G"]),
                    VCFRecord("chr6", 20, None, "A", ["G"]),
                ],
            },
        },
        {  # *a b c, a-b b' c, a-b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr7", 14, None, "G", ["C"]),
                    VCFRecord("chr7", 18, None, "A", ["G"]),
                    VCFRecord("chr7", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr7", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr7", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr7", 20, None, "A", ["G"], info={"AD": 104}),
                ],
                "caller3": [
                    VCFRecord("chr7", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr7", 20, None, "A", ["G"]),
                ],
                "caller4": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"])],
            },
            "haplotyped": {
                "caller1": [
                    VCFRecord(
                        "chr7", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr7:14=G/C", "chr7:18=A/G", "chr7:20=A/G"]}
                    ),
                ],
                "caller2": [
                    VCFRecord(
                        "chr7", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr7:14=GCGTA/CCGTG", "chr7:20=A/G"], "AD": 100}
                    ),
                    VCFRecord("chr7", 18, None, "G", ["C"], info={"AD": 3}),
                ],
                "caller3": [
                    VCFRecord(
                        "chr7", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr7:14=GCGTA/CCGTG", "chr7:20=A/G"]}
                    ),
                ],
                "caller4": [VCFRecord("chr7", 14, None, "GCGTATCA", ["CCGTGTCG"])],
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr7", 14, None, "G", ["C"]),
                    VCFRecord("chr7", 18, None, "A", ["G"]),
                    VCFRecord("chr7", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr7", 14, None, "G", ["C"], info={"AD": 100}),
                    VCFRecord("chr7", 18, None, "A", ["G"], info={"AD": 100}),
                    VCFRecord("chr7", 20, None, "A", ["G"], info={"AD": 104}),
                ],
                "caller3": [
                    VCFRecord("chr7", 14, None, "G", ["C"]),
                    VCFRecord("chr7", 18, None, "A", ["G"]),
                    VCFRecord("chr7", 20, None, "A", ["G"]),
                ],
                "caller4": [
                    VCFRecord("chr7", 14, None, "G", ["C"]),
                    VCFRecord("chr7", 18, None, "A", ["G"]),
                    VCFRecord("chr7", 20, None, "A", ["G"]),
                ],
            },
        },
        {  # *a-b c, a-b b' c, a b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr8", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 104}),
                ],
                "caller3": [
                    VCFRecord("chr8", 14, None, "G", ["C"], info={"AD": 110}),
                    VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 105}),
                    VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 100}),
                ],
                "caller4": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"])],
            },
            "haplotyped": {
                "caller1": [
                    VCFRecord(
                        "chr8", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr8:14=GCGTA/CCGTG", "chr8:20=A/G"]}
                    ),
                ],
                "caller2": [
                    VCFRecord(
                        "chr8", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr8:14=GCGTA/CCGTG", "chr8:20=A/G"], "AD": 100}
                    ),
                    VCFRecord("chr8", 18, None, "G", ["C"], info={"AD": 3}),
                ],
                "caller3": [
                    VCFRecord(
                        "chr8", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr8:14=G/C", "chr8:18=A/G", "chr8:20=A/G"], "AD": 100}
                    ),
                ],
                "caller4": [VCFRecord("chr8", 14, None, "GCGTATCA", ["CCGTGTCG"])],
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr8", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr8", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 104}),
                ],
                "caller3": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                    VCFRecord("chr8", 20, None, "A", ["G"], info={"AD": 100}),
                ],
                "caller4": [
                    VCFRecord("chr8", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr8", 20, None, "A", ["G"]),
                ],
            },
        },
        {  # *a' a-b c, a-b b' c, a b c, a-b-c
            "initial": {
                "caller1": [
                    VCFRecord("chr9", 14, None, "G", ["C"]),
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr9", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 104}),
                ],
                "caller3": [
                    VCFRecord("chr9", 14, None, "G", ["C"], info={"AD": 110}),
                    VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 105}),
                    VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 100}),
                ],
                "caller4": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"])],
            },
            "haplotyped": {
                "caller1": [
                    VCFRecord("chr9", 14, None, "G", ["C"]),
                    VCFRecord(
                        "chr9", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr9:14=GCGTA/CCGTG", "chr9:20=A/G"]}
                    ),
                ],
                "caller2": [
                    VCFRecord(
                        "chr9", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr9:14=GCGTA/CCGTG", "chr9:20=A/G"], "AD": 100}
                    ),
                    VCFRecord("chr9", 18, None, "G", ["C"], info={"AD": 3}),
                ],
                "caller3": [
                    VCFRecord(
                        "chr9", 14, None, "GCGTATCA", ["CCGTGTCG"],
                        info={"MCO_VAR": ["chr9:14=G/C", "chr9:18=A/G", "chr9:20=A/G"], "AD": 100}
                    ),
                ],
                "caller4": [VCFRecord("chr9", 14, None, "GCGTATCA", ["CCGTGTCG"])],
            },
            "expected": {
                "caller1": [
                    VCFRecord("chr9", 14, None, "G", ["C"]),
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr9", 20, None, "A", ["G"]),
                ],
                "caller2": [
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 100}),
                    VCFRecord("chr9", 18, None, "A", ["G"], info={"AD": 3}),
                    VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 104}),
                ],
                "caller3": [
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"], info={"AD": 110}),
                    VCFRecord("chr9", 20, None, "A", ["G"], info={"AD": 100}),
                ],
                "caller4": [
                    VCFRecord("chr9", 14, None, "GCGTA", ["CCGTG"]),
                    VCFRecord("chr9", 20, None, "A", ["G"]),
                ],
            },
        },
    ]

    # Callers: every name used as key in an "initial" section, sorted.
    self.callers = sorted(
        {caller for case in self.test_cases for caller in case["initial"]}
    )

    # Fixture files: (path template, section of the test cases, whether the
    # header declares MCO_VAR). Written in this order for every caller.
    fixtures = (
        (self.tmp_initial_pathes, "initial", False),
        (self.tmp_haplotyped_pathes, "haplotyped", True),
        (self.tmp_expected_pathes, "expected", False),
    )
    for caller in self.callers:
        for path_template, section, with_parents in fixtures:
            with VCFIO(path_template.format(caller), "w") as writer:
                header_info = {
                    "AD": HeaderInfoAttr("AD", "Alternative allele depth.", type="Integer", number="1")
                }
                if with_parents:
                    header_info["MCO_VAR"] = HeaderInfoAttr(
                        "MCO_VAR",
                        "Name of the variants merged because their occur on same reads.",
                        type="String",
                        number="."
                    )
                writer.info = header_info
                writer.extra_header = ["##source={}".format(caller)]
                writer.writeHeader()
                # A caller missing from a case contributes no record.
                for case in self.test_cases:
                    for record in case[section].get(caller, []):
                        writer.write(record)
def setUp(self):
    """
    Prepare the inputs shared by the filterVCFPrimers tests.

    Paths of the temporary files and the command line are stored on the
    instance, then the sequences (FASTA) and the variants (VCF) are written.
    The regions file only gets a path here: it is not created by this method.
    The written records are kept in ``self.variants``; each one carries a
    "ZOI" info set to "yes" or "no".
    NOTE(review): "ZOI" presumably holds the status expected after filtering -
    confirm against the test methods.
    """
    tmp_folder = tempfile.gettempdir()
    unique_id = str(uuid.uuid1())

    # Temporary files
    self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
    self.tmp_regions = os.path.join(tmp_folder, unique_id + ".bed")
    self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
    self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

    # Exec command
    self.cmd = [
        "filterVCFPrimers.py",
        "--input-variants", self.tmp_variants,
        "--input-regions", self.tmp_regions,
        "--input-sequences", self.tmp_sequences,
        "--output-variants", self.tmp_output
    ]

    # Create fasta (the rulers give the 1-based position of each nucleotide)
    with FastaIO(self.tmp_sequences, "w") as FH_seq:
        FH_seq.write(Sequence("artificial_chr1", "NNNAAAATTTGGGGGGGGGGTTTAAANNN"))
        #                                         123456789| | | | | | | | | |
        #                                                  10| 14| 18| 22| 26|
        #                                                    12  16  20  24  28
        FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))
        #                                         123456789|
        #                                                  10

    # Create VCF
    with VCFIO(self.tmp_variants, "w") as FH_var:
        FH_var.info = {
            "ZOI": HeaderInfoAttr("ZOI", "If the variant can be in interest area.", type="String", number="1")
        }
        FH_var.writeHeader()
        self.variants = [
            VCFRecord("artificial_chr1", 6, "alt_0", "A", ["AA"], None, None, {"ZOI": "no"}),
            VCFRecord("artificial_chr1", 8, "alt_1", "TT", ["T"], None, None, {"ZOI": "no"}),
            VCFRecord("artificial_chr1", 8, "alt_2", "T", ["TT"], None, None, {"ZOI": "yes"}),
            VCFRecord("artificial_chr1", 9, "alt_3", "TTGG", ["TT"], None, None, {"ZOI": "yes"}),
            VCFRecord("artificial_chr1", 14, "alt_4", "G", ["GG"], None, None, {"ZOI": "yes"}),
            VCFRecord("artificial_chr1", 18, "alt_5", "GGG", ["G"], None, None, {"ZOI": "yes"}),  # ZOI downstream limit deletion
            VCFRecord("artificial_chr1", 22, "alt_6", "T", ["TT"], None, None, {"ZOI": "yes"}),

            VCFRecord("artificial_chr1", 9, "alt_7", "TT", ["TC"], None, None, {"ZOI": "no"}),  # Substitution before end of upstream primer
            VCFRecord("artificial_chr1", 10, "alt_8", "TG", ["TC"], None, None, {"ZOI": "yes"}),  # Substitution in upstream limit of ZOI
            VCFRecord("artificial_chr1", 15, "alt_9", "GG", ["GC"], None, None, {"ZOI": "yes"}),  # Substitution in downstream limit of ZOI
            VCFRecord("artificial_chr1", 20, "alt_10", "GT", ["GC"], None, None, {"ZOI": "no"}),  # Substitution after start of downstream primer
            VCFRecord("artificial_chr1", 21, "alt_11", "TT", ["TC"], None, None, {"ZOI": "no"}),  # Substitution in downstream primer

            VCFRecord("artificial_chr2", 1, "alt_12", "C", ["CTT"], None, None, {"ZOI": "no"}),  # Insertion before end of upstream primer
            VCFRecord("artificial_chr2", 2, "alt_13", "G", ["GCC"], None, None, {"ZOI": "yes"}),  # Insertion in upstream limit of ZOI
            VCFRecord("artificial_chr2", 3, "alt_14", "AT", ["CCGC"], None, None, {"ZOI": "yes"}),  # Insertion in upstream limit of ZOI and without standardization
            VCFRecord("artificial_chr2", 9, "alt_15", "G", ["GCC"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI
            VCFRecord("artificial_chr2", 9, "alt_16", "G", ["NNN"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI and without standardization
            VCFRecord("artificial_chr2", 10, "alt_17", "-", ["CC"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI
            VCFRecord("artificial_chr2", 10, "alt_18", "A", ["ATT"], None, None, {"ZOI": "no"}),  # Insertion after start of downstream primer

            VCFRecord("artificial_chr2", 1, "alt_19", "CG", ["C"], None, None, {"ZOI": "no"}),  # Deletion before end of upstream primer
            VCFRecord("artificial_chr2", 2, "alt_20", "GA", ["G"], None, None, {"ZOI": "yes"}),  # Deletion in upstream limit of ZOI
            VCFRecord("artificial_chr2", 3, "alt_21", "AT", ["C"], None, None, {"ZOI": "yes"}),  # Deletion in upstream limit of ZOI and without standardization
            VCFRecord("artificial_chr2", 6, "alt_22", "NNCG", ["N"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
            VCFRecord("artificial_chr2", 8, "alt_23", "CG", ["C"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
            VCFRecord("artificial_chr2", 8, "alt_24", "CG", ["T"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI and without standardization
            VCFRecord("artificial_chr2", 9, "alt_25", "GA", ["G"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
            VCFRecord("artificial_chr2", 10, "alt_26", "A", ["-"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
            VCFRecord("artificial_chr2", 10, "alt_27", "AT", ["A"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
        ]
        # Records are written in list order; no index is needed.
        for curr_var in self.variants:
            FH_var.write(curr_var)