def testRescueMission(self):
    """Exercise rescue_mission in three situations.

    1. The false-negative record at chr1:8000 is a very long insertion
       (flagged as an SV by the original comment): no counts may change.
    2. The false negatives and the false positive cannot be reconciled:
       counts, both variant sets and the rescued list stay untouched.
    3. One three-base substitution (false negative) matches two adjacent
       SNPs (false positives): counts change and both sets are emptied.

    The count assertions use ``.values()`` and comprehensions instead of
    ``itervalues()`` / ``map`` so that they behave the same on Python 2 and
    Python 3 (``map`` is lazy on Python 3 and never equals a list).
    """
    # false negative variant at location is SV; don't rescue
    fn_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 8000 . G GATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCATTGCT 20 PASS . GT 1/1\n """
    fp_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 8000 . G GC 20 PASS . GT 1/1\n """
    true_vars = vcf_to_ChromVariants(fn_str, 'chr1')
    pred_vars = vcf_to_ChromVariants(fp_str, 'chr1')
    num_new_tp, num_removed_fn, rescuedvars = rescue_mission(
        true_vars, pred_vars, get_empty_ChromVariants('chr1'), 8000, get_reference(), 100)
    self.assertFalse(any(count > 0 for count in num_new_tp.values()))
    self.assertFalse(any(count > 0 for count in num_removed_fn.values()))

    # variant couldn't be rescued; no change to counts or ChromVariants
    fn_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 2 . A C 20 PASS . GT 1/1\n chr1 7 . C T 20 PASS . GT 0/1\n """
    fp_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 4 . A C 20 PASS . 
GT 1/1\n """
    fn_vars = vcf_to_ChromVariants(fn_str, 'chr1')
    fp_vars = vcf_to_ChromVariants(fp_str, 'chr1')
    num_new_tp, num_removed_fn, rescuedvars = rescue_mission(
        fn_vars, fp_vars, get_empty_ChromVariants('chr1'), 2, get_reference(), 100)
    self.assertFalse(any(count > 0 for count in num_new_tp.values()))
    self.assertFalse(any(count > 0 for count in num_removed_fn.values()))
    self.assertEqual(len(fn_vars.all_locations), 2)
    self.assertEqual(len(fp_vars.all_locations), 1)
    self.assertEqual(rescuedvars, [])

    # variant is rescued; counts change; variants are removed from fn/fp
    fn_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 2 . TGC TAT 20 PASS . GT 1/1\n """
    fp_str = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 4 . C T 20 PASS . GT 1/1\n """
    fn_vars = vcf_to_ChromVariants(fn_str, 'chr2')
    fp_vars = vcf_to_ChromVariants(fp_str, 'chr2')
    num_new_tp, num_removed_fn, rescuedvars = rescue_mission(
        fn_vars, fp_vars, get_empty_ChromVariants('chr2'), 2, get_reference(), 100)
    self.assertEqual(num_new_tp[VARIANT_TYPE.INDEL_OTH], 1)
    self.assertEqual(num_removed_fn[VARIANT_TYPE.SNP], 2)
    self.assertEqual(len(fn_vars.all_locations), 0)
    self.assertEqual(len(fp_vars.all_locations), 0)
    # A real list on both sides keeps the comparison valid on Python 3 too.
    self.assertEqual([rescued.pos for rescued in rescuedvars], [3, 4])
def testFullRescue(self):
    """Check that SequenceRescuer reports a rescue for three FN/FP layouts.

    Case 1 (chr2): a TGC->TAT false negative at position 2 against two
    false-positive SNPs at positions 3 and 4; ``windowsRescued`` must be
    ``(0, 0)``.
    Case 2 (chr2): a GCCG->GCA false negative at position 3 against a
    deletion at 3 plus a SNP at 6.
    Case 3 (chr4): false negatives and false positives evaluated together
    with a non-empty true-positive set.

    NOTE(review): another ``testFullRescue`` is defined later in this file;
    if both live in the same TestCase this one is shadowed -- confirm.
    """
    # Case 1: no true positives in play.
    missed_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 2 . TGC TAT 20 PASS . GT 1/1\n """
    called_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 4 . C T 20 PASS . GT 1/1\n """
    missed = vcf_to_ChromVariants(missed_vcf, 'chr2')
    called = vcf_to_ChromVariants(called_vcf, 'chr2')
    outcome = SequenceRescuer(
        'chr2', 2, missed, called,
        get_empty_ChromVariants('chr2'), get_reference(), 50)
    self.assertTrue(outcome.rescued)
    self.assertEqual(outcome.windowsRescued, (0, 0))

    # Case 2: the false positives are parsed before the false negatives here.
    called_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . GC G 20 PASS . GT 1/1\n chr2 6 . G A 20 PASS . GT 1/1\n """
    missed_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . GCCG GCA 20 PASS . GT 1/1\n """
    called = vcf_to_ChromVariants(called_vcf, 'chr2')
    missed = vcf_to_ChromVariants(missed_vcf, 'chr2')
    outcome = SequenceRescuer(
        'chr2', 3, missed, called,
        get_empty_ChromVariants('chr2'), get_reference(), 50)
    self.assertTrue(outcome.rescued)

    # Case 3: a true-positive record sits between the FN and FP records.
    missed_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 3 . TC T 20 PASS . GT 1/1\n chr4 8 . C T 20 PASS . GT 1/1\n """
    called_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 4 . C T 20 PASS . GT 1/1\n chr4 7 . TC T 20 PASS . GT 1/1\n """
    matched_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 5 . TC T 20 PASS . 
GT 1/1\n """
    missed = vcf_to_ChromVariants(missed_vcf, 'chr4')
    called = vcf_to_ChromVariants(called_vcf, 'chr4')
    matched = vcf_to_ChromVariants(matched_vcf, 'chr4')
    outcome = SequenceRescuer(
        'chr4', 3, missed, called, matched, get_reference(), 50)
    self.assertTrue(outcome.rescued)
def testTooManyPaths(self):
    """Assert that no rescue is reported for two dense clusters of deletions.

    Five false-negative deletions (chr1:10049-10058) and four false-positive
    deletions (chr1:10025-10032) are handed to SequenceRescuer at position
    10000; ``rescued`` must be false. Judging by the test name this is
    presumably because too many candidate paths exist -- the code here only
    checks the outcome.

    NOTE(review): another ``testTooManyPaths`` is defined later in this
    file; if both live in the same TestCase this one is shadowed -- confirm.
    """
    missed_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10049 . CTTAAGCT C 20 PASS . GT 1/1\n chr1 10053 . TGCGT T 20 PASS . GT 0/1\n chr1 10055 . GCTAA G 20 PASS . GT 0/1\n chr1 10057 . TA T 20 PASS . GT 1/1\n chr1 10058 . GC G 20 PASS . GT 1/1\n """
    called_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10025 . CTTAAGCT C 20 PASS . GT 1/1\n chr1 10028 . TGCGT T 20 PASS . GT 0/1\n chr1 10029 . GCTAA G 20 PASS . GT 0/1\n chr1 10032 . TA T 20 PASS . GT 1/1\n """
    missed = vcf_to_ChromVariants(missed_vcf, 'chr1')
    called = vcf_to_ChromVariants(called_vcf, 'chr1')
    # NOTE(review): the empty true-positive set is built for 'chr2' while
    # every other argument refers to 'chr1' -- presumably harmless because
    # the set is empty; confirm.
    outcome = SequenceRescuer(
        'chr1', 10000, missed, called,
        get_empty_ChromVariants('chr2'), get_reference(), 50)
    self.assertFalse(outcome.rescued)
def testVariantWithMismatchedRef(self):
    """Assert that no rescue is reported when the FP alleles differ from the FN.

    The false negative is TGC->TAT at chr2:2; the false positives are G->C
    at position 3 and C->T at position 4 (testFullRescue uses G->A at
    position 3 with otherwise identical records and expects a rescue).

    NOTE(review): another ``testVariantWithMismatchedRef`` is defined later
    in this file; if both live in the same TestCase this one is shadowed --
    confirm.
    """
    missed_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 2 . TGC TAT 20 PASS . GT 1/1\n """
    called_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G C 20 PASS . GT 1/1\n chr2 4 . C T 20 PASS . GT 1/1\n """
    missed = vcf_to_ChromVariants(missed_vcf, 'chr2')
    called = vcf_to_ChromVariants(called_vcf, 'chr2')
    outcome = SequenceRescuer(
        'chr2', 2, missed, called,
        get_empty_ChromVariants('chr2'), get_reference(), 50)
    self.assertFalse(outcome.rescued)
def testNormalizedVariants(self):
    """Assert a rescue when the false positives are loaded through normalization.

    The false positives (two C->CTC insertions at chr4:4 and chr4:6) are
    parsed with ``normalize_vcf_to_ChromVariants``; the false negative (one
    A->ATCTC insertion at chr4:2) is parsed with ``vcf_to_ChromVariants``.
    The false positives are parsed first, as in the original ordering.

    NOTE(review): another ``testNormalizedVariants`` is defined later in
    this file; if both live in the same TestCase this one is shadowed --
    confirm.
    """
    called_vcf = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 4 . C CTC 20 PASS . GT 0/1\n chr4 6 . C CTC 20 PASS . GT 0/1\n """
    missed_vcf = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 2 . A ATCTC 20 PASS . GT 0/1\n """
    called = normalize_vcf_to_ChromVariants(called_vcf, 'chr4')
    missed = vcf_to_ChromVariants(missed_vcf, 'chr4')
    outcome = SequenceRescuer(
        'chr4', 2, missed, called,
        get_empty_ChromVariants('chr4'), get_reference(), 50)
    self.assertTrue(outcome.rescued)
def testEmptyWindow(self):
    """Assert that no rescue is reported when FN and FP records are far apart.

    The only false negative is at chr1:8000 and the only false positive at
    chr1:10049; SequenceRescuer is asked about position 10049 with a final
    argument of 50 (presumably the window size -- confirm).

    NOTE(review): another ``testEmptyWindow`` is defined later in this file;
    if both live in the same TestCase this one is shadowed -- confirm.
    """
    missed_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 8000 . G C 20 PASS . GT 1/1\n """
    called_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10049 . CTTAAGCT C 20 PASS . GT 1/1\n """
    missed = vcf_to_ChromVariants(missed_vcf, 'chr1')
    called = vcf_to_ChromVariants(called_vcf, 'chr1')
    # NOTE(review): the empty true-positive set is built for 'chr2' while
    # every other argument refers to 'chr1' -- presumably harmless because
    # the set is empty; confirm.
    outcome = SequenceRescuer(
        'chr1', 10049, missed, called,
        get_empty_ChromVariants('chr2'), get_reference(), 50)
    self.assertFalse(outcome.rescued)
def testNormalizedVariants(self):
    """Assert a rescue when the false positives are loaded through normalization.

    Two C->CTC insertions (chr4:4 and chr4:6) form the false positives and
    go through ``normalize_vcf_to_ChromVariants``; a single A->ATCTC
    insertion at chr4:2 is the false negative and goes through
    ``vcf_to_ChromVariants``. Parsing order (FP first) follows the original.

    NOTE(review): an earlier ``testNormalizedVariants`` exists in this file;
    if both live in the same TestCase, only this later one runs -- confirm.
    """
    normalized_fp_text = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 4 . C CTC 20 PASS . GT 0/1\n chr4 6 . C CTC 20 PASS . GT 0/1\n """
    fn_text = """##fileformat=VCFv4.0 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 2 . A ATCTC 20 PASS . GT 0/1\n """
    false_pos = normalize_vcf_to_ChromVariants(normalized_fp_text, 'chr4')
    false_neg = vcf_to_ChromVariants(fn_text, 'chr4')
    result = SequenceRescuer(
        'chr4', 2, false_neg, false_pos,
        get_empty_ChromVariants('chr4'), get_reference(), 50)
    self.assertTrue(result.rescued)
def testVariantWithMismatchedRef(self):
    """Assert that no rescue is reported when the FP alleles differ from the FN.

    False negative: TGC->TAT at chr2:2. False positives: G->C at chr2:3 and
    C->T at chr2:4. ``rescued`` must be false.

    NOTE(review): an earlier ``testVariantWithMismatchedRef`` exists in this
    file; if both live in the same TestCase, only this later one runs --
    confirm.
    """
    fn_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 2 . TGC TAT 20 PASS . GT 1/1\n """
    fp_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G C 20 PASS . GT 1/1\n chr2 4 . C T 20 PASS . GT 1/1\n """
    false_neg = vcf_to_ChromVariants(fn_text, 'chr2')
    false_pos = vcf_to_ChromVariants(fp_text, 'chr2')
    result = SequenceRescuer(
        'chr2', 2, false_neg, false_pos,
        get_empty_ChromVariants('chr2'), get_reference(), 50)
    self.assertFalse(result.rescued)
def testEmptyWindow(self):
    """Assert that no rescue is reported when FN and FP records are far apart.

    False negative at chr1:8000, false positive at chr1:10049; the rescuer
    is queried at 10049 with a final argument of 50 (presumably the window
    size -- confirm).

    NOTE(review): an earlier ``testEmptyWindow`` exists in this file; if
    both live in the same TestCase, only this later one runs -- confirm.
    """
    fn_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 8000 . G C 20 PASS . GT 1/1\n """
    fp_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10049 . CTTAAGCT C 20 PASS . GT 1/1\n """
    false_neg = vcf_to_ChromVariants(fn_text, 'chr1')
    false_pos = vcf_to_ChromVariants(fp_text, 'chr1')
    # NOTE(review): the empty true-positive set is built for 'chr2' while
    # every other argument refers to 'chr1' -- presumably harmless because
    # the set is empty; confirm.
    result = SequenceRescuer(
        'chr1', 10049, false_neg, false_pos,
        get_empty_ChromVariants('chr2'), get_reference(), 50)
    self.assertFalse(result.rescued)
def testWindowTooBig(self):
    """Assert that no rescue is reported next to two very long deletions.

    Each false-negative record has a 3300-base REF (an 11-base motif
    repeated 300 times) and a single-base ALT, at chr1:7001 and chr1:10100.
    The lone false positive is a deletion at chr1:10049. Judging by the test
    name the window presumably grows beyond what the rescuer accepts -- the
    code here only checks that ``rescued`` is false.

    NOTE(review): another ``testWindowTooBig`` is defined later in this
    file; if both live in the same TestCase this one is shadowed -- confirm.
    """
    first_ref = 'ATTGTTCATGA'*300
    second_ref = 'GCCTAGGGTCA'*300
    # Assemble the false-negative VCF around the two long REF alleles.
    missed_vcf = ''.join([
        """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 7001 . """,
        first_ref,
        """ A 20 PASS . GT 1/1\n chr1 10100 . """,
        second_ref,
        """ G 20 PASS . GT 0/1\n """,
    ])
    called_vcf = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10049 . CTTAAGCT C 20 PASS . GT 1/1\n """
    missed = vcf_to_ChromVariants(missed_vcf, 'chr1')
    called = vcf_to_ChromVariants(called_vcf, 'chr1')
    # NOTE(review): the empty true-positive set is built for 'chr2' while
    # every other argument refers to 'chr1' -- presumably harmless because
    # the set is empty; confirm.
    outcome = SequenceRescuer(
        'chr1', 10049, missed, called,
        get_empty_ChromVariants('chr2'), get_reference(), 50)
    self.assertFalse(outcome.rescued)
def testWindowTooBig(self):
    """Assert that no rescue is reported next to two very long deletions.

    The false negatives are two records whose REF is an 11-base motif
    repeated 300 times (3300 bases) with a one-base ALT, at chr1:7001 and
    chr1:10100; the false positive is a single deletion at chr1:10049.
    ``rescued`` must be false.

    NOTE(review): an earlier ``testWindowTooBig`` exists in this file; if
    both live in the same TestCase, only this later one runs -- confirm.
    """
    long_ref_a = 'ATTGTTCATGA' * 300
    long_ref_b = 'GCCTAGGGTCA' * 300
    # Pieces of the false-negative VCF, interleaved with the long REFs.
    fn_pieces = (
        """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 7001 . """,
        long_ref_a,
        """ A 20 PASS . GT 1/1\n chr1 10100 . """,
        long_ref_b,
        """ G 20 PASS . GT 0/1\n """,
    )
    fn_text = ''.join(fn_pieces)
    fp_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10049 . CTTAAGCT C 20 PASS . GT 1/1\n """
    false_neg = vcf_to_ChromVariants(fn_text, 'chr1')
    false_pos = vcf_to_ChromVariants(fp_text, 'chr1')
    # NOTE(review): the empty true-positive set is built for 'chr2' while
    # every other argument refers to 'chr1' -- presumably harmless because
    # the set is empty; confirm.
    result = SequenceRescuer(
        'chr1', 10049, false_neg, false_pos,
        get_empty_ChromVariants('chr2'), get_reference(), 50)
    self.assertFalse(result.rescued)
def testTooManyPaths(self):
    """Assert that no rescue is reported for two dense clusters of deletions.

    Five false-negative deletions between chr1:10049 and chr1:10058 and four
    false-positive deletions between chr1:10025 and chr1:10032 are passed to
    SequenceRescuer at position 10000; ``rescued`` must be false.

    NOTE(review): an earlier ``testTooManyPaths`` exists in this file; if
    both live in the same TestCase, only this later one runs -- confirm.
    """
    fn_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10049 . CTTAAGCT C 20 PASS . GT 1/1\n chr1 10053 . TGCGT T 20 PASS . GT 0/1\n chr1 10055 . GCTAA G 20 PASS . GT 0/1\n chr1 10057 . TA T 20 PASS . GT 1/1\n chr1 10058 . GC G 20 PASS . GT 1/1\n """
    fp_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n ##source=TVsim\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr1 10025 . CTTAAGCT C 20 PASS . GT 1/1\n chr1 10028 . TGCGT T 20 PASS . GT 0/1\n chr1 10029 . GCTAA G 20 PASS . GT 0/1\n chr1 10032 . TA T 20 PASS . GT 1/1\n """
    false_neg = vcf_to_ChromVariants(fn_text, 'chr1')
    false_pos = vcf_to_ChromVariants(fp_text, 'chr1')
    # NOTE(review): the empty true-positive set is built for 'chr2' while
    # every other argument refers to 'chr1' -- presumably harmless because
    # the set is empty; confirm.
    result = SequenceRescuer(
        'chr1', 10000, false_neg, false_pos,
        get_empty_ChromVariants('chr2'), get_reference(), 50)
    self.assertFalse(result.rescued)
def testFullRescue(self):
    """Check that SequenceRescuer reports a rescue for three FN/FP layouts.

    * chr2, position 2: TGC->TAT false negative vs. SNPs at 3 and 4; also
      checks ``windowsRescued == (0, 0)``.
    * chr2, position 3: GCCG->GCA false negative vs. a deletion at 3 and a
      SNP at 6.
    * chr4, position 3: FN and FP records combined with a non-empty
      true-positive set.

    NOTE(review): an earlier ``testFullRescue`` exists in this file; if both
    live in the same TestCase, only this later one runs -- confirm.
    """
    # --- scenario 1: empty true-positive set -------------------------------
    fn_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 2 . TGC TAT 20 PASS . GT 1/1\n """
    fp_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . G A 20 PASS . GT 1/1\n chr2 4 . C T 20 PASS . GT 1/1\n """
    false_neg = vcf_to_ChromVariants(fn_text, 'chr2')
    false_pos = vcf_to_ChromVariants(fp_text, 'chr2')
    result = SequenceRescuer(
        'chr2', 2, false_neg, false_pos,
        get_empty_ChromVariants('chr2'), get_reference(), 50)
    self.assertTrue(result.rescued)
    self.assertEqual(result.windowsRescued, (0, 0))

    # --- scenario 2: FP set is parsed before the FN set --------------------
    fp_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . GC G 20 PASS . GT 1/1\n chr2 6 . G A 20 PASS . GT 1/1\n """
    fn_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr2 3 . GCCG GCA 20 PASS . GT 1/1\n """
    false_pos = vcf_to_ChromVariants(fp_text, 'chr2')
    false_neg = vcf_to_ChromVariants(fn_text, 'chr2')
    result = SequenceRescuer(
        'chr2', 3, false_neg, false_pos,
        get_empty_ChromVariants('chr2'), get_reference(), 50)
    self.assertTrue(result.rescued)

    # --- scenario 3: a true positive is supplied as well -------------------
    fn_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 3 . TC T 20 PASS . GT 1/1\n chr4 8 . C T 20 PASS . GT 1/1\n """
    fp_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 4 . C T 20 PASS . GT 1/1\n chr4 7 . TC T 20 PASS . GT 1/1\n """
    tp_text = """##fileformat=VCFv4.0\n ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001\n chr4 5 . 
TC T 20 PASS . GT 1/1\n """
    false_neg = vcf_to_ChromVariants(fn_text, 'chr4')
    false_pos = vcf_to_ChromVariants(fp_text, 'chr4')
    true_pos = vcf_to_ChromVariants(tp_text, 'chr4')
    result = SequenceRescuer(
        'chr4', 3, false_neg, false_pos, true_pos, get_reference(), 50)
    self.assertTrue(result.rescued)