Пример #1
0
    def test_consensus_fasta(self):
        """Check the FASTA written by ``synth_fasta`` for the first VCF record.

        ``Variant.reference_sequence`` is patched to return a fixed sequence;
        the test verifies the returned contig names, the region requested from
        the reference, and the four lines written to the FASTA file.
        """
        reference = "GGCTGCGGGGAGGGGGGCGCGGGTCCGCAGTGGGGCTGTGGGAGGGGTCCGCGCGTCCGCAGTGGGGATGTG"

        with patch.object(Variant, "reference_sequence",
                          return_value=reference) as mock_ref:
            reader = vcf.Reader(self.vcf_file)
            record = next(reader)
            self.assertTrue(record.is_sv)
            variant = Variant.from_pyvcf(record, None)

            # An effectively unlimited line width keeps each sequence on one line
            synthesized = variant.synth_fasta(self.args, line_width=sys.maxsize)
            fasta_path, ref_contig, alt_contig = synthesized
            self.assertEqual(ref_contig, "1_899922_899993")
            self.assertEqual(alt_contig, "1_899922_899993_alt")
            mock_ref.assert_called_once_with(region="1:899922-899993")

            with open(fasta_path, "r") as fasta:
                lines = [raw.strip() for raw in fasta]
            self.assertEqual(len(lines), 4)

            ref_header, ref_sequence, alt_header, alt_sequence = lines
            self.assertEqual(ref_header, ">1_899922_899993")
            self.assertEqual(ref_sequence, reference)
            self.assertEqual(alt_header, ">1_899922_899993_alt")
            self.assertEqual(alt_sequence, "GG")
Пример #2
0
    def test_consensus_fasta(self):
        """Check the FASTA written by ``synth_fasta`` for the first VCF record.

        ``Variant.reference_sequence`` is patched to return a fixed sequence;
        the test verifies the region requested from the reference and that the
        FASTA headers agree with both the expected names and the contig names
        returned by ``synth_fasta``.
        """
        reference = "GAACCTGGGAGGCAGAGCTTGCAGTGAGCAGAGATCATGCCACTGCACTCCAGCCTGGGCGACAGAGCAAGACTCCGTCTCAAAAAAAAAAAAAAAATTAGCCAGGCGTGGTGGCGGGCGCCTGTAGTCCCAGCTACTCAGGAGGCTGAGGCAGGAGAATGGCATGA"

        with patch.object(Variant, "reference_sequence",
                          return_value=reference) as mock_ref:
            reader = vcf.Reader(self.vcf_file)
            record = next(reader)
            self.assertTrue(record.is_sv)
            variant = Variant.from_pyvcf(record, None)
            self.assertIsNotNone(variant)

            # An effectively unlimited line width keeps each sequence on one line
            synthesized = variant.synth_fasta(self.args, line_width=sys.maxsize)
            fasta_path, ref_contig, alt_contig = synthesized
            mock_ref.assert_called_once_with(region="4:32197282-32197448")

            with open(fasta_path, "r") as fasta:
                lines = [raw.strip() for raw in fasta]
            self.assertEqual(len(lines), 4)

            # Reference contig: header must match the literal and the returned name
            self.assertEqual(lines[0], ">4_32197282_32197448")
            self.assertEqual(lines[0], ">{}".format(ref_contig))
            self.assertEqual(lines[1], reference)

            # Alternate contig: same double check on the header
            self.assertEqual(lines[2], ">4_32197282_32197448_alt")
            self.assertEqual(lines[2], ">{}".format(alt_contig))
            self.assertEqual(lines[3], "GA")
Пример #3
0
    def test_consensus_fasta(self):
        """Check the FASTA written by ``synth_fasta`` for the first VCF record.

        ``Variant.reference_sequence`` is patched to return a fixed sequence;
        the test verifies the region requested from the reference, the FASTA
        headers (against literals and the returned contig names) and both the
        reference and alternate sequences.
        """
        reference = "GTATATATATATAGATCTATATATCTATATATAGATCTATATATAGATATATATCTATATATATAGATATATAGATATATAGATCTATATATAGATATATATATCTATATATAGATCTATATATAGATATAGATATCTATATAGATATCTATATCTATATATATGTAGATATATAGATATAGATATCTATATATCTATATATATAGATATCTATAGATATATATCTATATAGATATATCTATATCTATATATAGATATATATCTATATATAGATATATATCTATATATAGATAGATATATATCTATATATAGATATATCTATATCTATATATAGATATATATCTATATATAGATATATCTATATATAGATATATATCTATAGATATATCTATATATATCGATATATCTATATATATCGATATATAT"
        expected_alt = "GATATATATAGATATATCTATATATATCTATATAGATATATCTATATCTATATAGATATATCTATATATATATAGATATATCTATATCTATATAGATATATATCTATATATATATCTATATAGATATATCTATATAGATATAGATATATATCTATATATAGATATAGATATATCTATATAGATATATATCTATAGATATCTATATATATAGATATATAGATATCTATATCTATATT"

        with patch.object(Variant, "reference_sequence",
                          return_value=reference) as mock_ref:
            reader = vcf.Reader(self.vcf_file)
            record = next(reader)
            self.assertTrue(record.is_sv)
            variant = Variant.from_pyvcf(record, None)

            # An effectively unlimited line width keeps each sequence on one line
            synthesized = variant.synth_fasta(self.args, line_width=sys.maxsize)
            fasta_path, ref_contig, alt_contig = synthesized
            mock_ref.assert_called_once_with(region="4:20473845-20474270")

            with open(fasta_path, "r") as fasta:
                lines = [raw.strip() for raw in fasta]
            self.assertEqual(len(lines), 4)

            # Reference contig: header must match the literal and the returned name
            self.assertEqual(lines[0], ">4_20473845_20474270")
            self.assertEqual(lines[0], ">{}".format(ref_contig))
            self.assertEqual(lines[1], reference)

            # Alternate contig: same double check on the header
            self.assertEqual(lines[2], ">4_20473845_20474270_alt")
            self.assertEqual(lines[2], ">{}".format(alt_contig))
            self.assertEqual(lines[3], expected_alt)
Пример #4
0
    def test_pipeline_straddle_counting(self):
        """Check ``count_pipeline_straddlers`` results for each record in the VCF.

        For every record, fragments are gathered from the test BAM around the
        variant and the straddler statistics are compared with fixed expected
        values.
        """
        for record in vcf.Reader(self.vcf_file):
            self.assertTrue(record.is_sv)
            variant = Variant.from_pyvcf(record, None)

            bam_path = os.path.join(FILE_DIR, "1_2073761_2073846_DEL_2.bam")
            stats_path = os.path.join(FILE_DIR, "stats.json")
            sample = Sample.from_npsv(stats_path, bam_path)

            fragments = npsva.RealignedFragments(
                self.input_fasta,
                sample.mean_insert_size,
                sample.std_insert_size,
                sample.insert_size_density().as_dict(),
                bam_path,
            )
            gather_region = variant.region_string(flank=self.args.flank)
            fragments.gather_reads(gather_region)
            self.assertEqual(fragments.size(), 254)

            # Both breakpoint regions use one base of flank on each side
            flanks = {"left_flank": 1, "right_flank": 1}
            left_breakpoint = variant.left_flank_region_string(**flanks)
            right_breakpoint = variant.right_flank_region_string(**flanks)
            pair_results = fragments.count_pipeline_straddlers(
                left_breakpoint,
                right_breakpoint,
                self.args.flank,
                -variant.event_length,
                1.5,
                10,
            )

            self.assertAlmostEqual(
                pair_results["alt_weighted_count"], 13.496, places=1)
            self.assertAlmostEqual(
                pair_results["insert_lower"], 0.0, places=2)
            upper_fraction = (pair_results["insert_upper"] /
                              pair_results["insert_count"])
            self.assertAlmostEqual(upper_fraction, 0.166, places=2)
Пример #5
0
    def test_realigned_read_counting(self):
        """Check ``RealignedFragments.count_realigned_reads`` for each VCF record.

        For every record, fragments are gathered from the test BAM around the
        variant, reads are counted at the left/right reference breakpoints and
        the left alternate breakpoint, and the counts are compared with fixed
        expected values. The number of read names reported per breakpoint must
        equal the corresponding count.
        """
        for record in vcf.Reader(self.vcf_file):
            self.assertTrue(record.is_sv)
            variant = Variant.from_pyvcf(record, None)

            input_bam = os.path.join(FILE_DIR, "1_2073761_2073846_DEL_2.bam")
            sample = Sample.from_npsv(os.path.join(FILE_DIR, "stats.json"), input_bam)

            fragments = npsva.RealignedFragments(
                self.input_fasta,
                sample.mean_insert_size,
                sample.std_insert_size,
                sample.insert_size_density().as_dict(),
                input_bam,
            )
            fragments.gather_reads(variant.region_string(flank=self.args.flank))
            self.assertEqual(fragments.size(), 254)

            ref_contig = "1_2073761_2073846_DEL"
            alt_contig = "1_2073761_2073846_DEL_alt"

            # Breakpoint regions are expressed in contig-local coordinates,
            # offset by the configured flank
            flank = self.args.flank
            rl_breakpoint = f"{ref_contig}:{flank}-{flank+1}"
            al_breakpoint = f"{alt_contig}:{flank}-{flank+1}"
            ref_length = variant.ref_length
            rr_breakpoint = f"{ref_contig}:{flank + ref_length - 1}-{flank + ref_length}"

            # The fourth (right alternate) breakpoint is passed as an empty string
            counts, read_names = fragments.count_realigned_reads(
                [(rl_breakpoint, rr_breakpoint, al_breakpoint, "")])
            self.assertEqual(counts["al"], 18.0)
            self.assertEqual((counts["rl"] + counts["rr"]) / 2, 4.0)

            # Each breakpoint is checked once (the original listed "rl" twice)
            for bp in ("rl", "rr", "al"):
                self.assertEqual(len(read_names[bp]), counts[bp])
Пример #6
0
    def test_complex_breakpoint_arguments(self, mock_fragments):
        vcf_file = io.StringIO(
            """##fileformat=VCFv4.1
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##ALT=<ID=DEL,Description="Deletion">
##contig=<ID=1,length=249250621>
#CHROM POS ID REF ALT QUAL FILTER INFO
1	100	.	AACGGTT	ACGT	.	PASS	SVTYPE=DEL;END=106;SVLEN=-3
"""
        )

        for record in vcf.Reader(vcf_file):
            variant = Variant.from_pyvcf(record, None)
            self.assertIsNotNone(variant)

        read_counts = {"rl": 0, "rr": 0, "al": 0, "ar": 0}
        mock_fragments.count_realigned_reads.return_value = (read_counts, [])

        count_realigned_reads(self.args, mock_fragments, variant)
        mock_fragments.count_realigned_reads.assert_called_once_with(
            [("ref:1-2","ref:7-8","alt:1-2","alt:4-5")],
            count_straddle=True,
        )
Пример #7
0
    def test_count_realigned_alleles(self):
        """Check the reference/alternate read counts from ``count_realigned_reads``.

        For every record, fragments are gathered from the test BAM around the
        variant and the first two values returned by ``count_realigned_reads``
        are compared with the expected counts.
        """
        for record in vcf.Reader(self.vcf_file):
            variant = Variant.from_pyvcf(record, None)

            fragments = npsva.RealignedFragments(
                self.input_fasta,
                self.sample.mean_insert_size,
                self.sample.std_insert_size,
                self.sample.insert_size_density().as_dict(),
                self.input_bam,
            )
            fragments.gather_reads(
                variant.region_string(flank=self.args.flank))
            self.assertGreater(fragments.size(), 0)

            hom_alt_ref, hom_alt_alt, _ = count_realigned_reads(
                self.args,
                fragments,
                variant,
                input_fasta=self.input_fasta,
                ref_contig="1_2073761_2073846_DEL",
                alt_contig="1_2073761_2073846_DEL_alt",
            )
            # assertTrue(value, 4.0) would treat 4.0 as the failure message and
            # pass for any truthy value; compare against the expected counts.
            self.assertEqual(hom_alt_ref, 4.0)
            self.assertEqual(hom_alt_alt, 18.0)
Пример #8
0
    def test_consensus_fasta(self):
        """Check the FASTA written by ``synth_fasta`` for the first VCF record.

        ``Variant.reference_sequence`` is patched to return a fixed sequence;
        the test verifies the region requested from the reference, the returned
        contig names, and the four lines written to the FASTA file.
        """
        reference = "TCTCCATATGATGTCAGTGTCCTCCATATGATGTCAGTGTCCTCCATATGACATCAATATCCTCCATATGATATCAATATCCTCTGTATTGATATTGATATTGATATTTGGAGGATATCAATATCCTCCAAATGATGTCAGTGTCCTCCATATGATGTCAATGTCCTCCATATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCATATGATGTCAGTGTCCTCTGTATGAC"
        expected_alt = "TCTCCATATGATGTCAGTGTCCTCCATATGATGTCAGTGTCCTCCATATGACATCAATATCCTCCATATGATATCAATATCCTCTGTATTGATATTGATATTGATATTTGGAGGATATCAATATCCTCCAAATGATGTCAGTGTCCTCCATATGATGTCAATGTCCTCCATATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCATATGATGTCAGTGTCCTCTGTATGACTCCATATGATGTCAGTGTCCTCCATATGATGTCAGTGTCCTCCATATGACATCAATATCCTCCATATGATATCAATATCCTCTGTATTGATATTGATATTGATATTTGGAGGATATCAATATCCTCCAAATGATGTCAGTGTCCTCCATATGATGTCAATGTCCTCCATATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCGTATGATGTCAATATCCTCCATATGATGTCAGTGTCCTCTGTATGAC"

        with patch.object(Variant, "reference_sequence",
                          return_value=reference) as mock_ref:
            reader = vcf.Reader(self.vcf_file)
            variant = Variant.from_pyvcf(next(reader), None)
            self.assertIsNotNone(variant)

            # An effectively unlimited line width keeps each sequence on one line
            synthesized = variant.synth_fasta(self.args, line_width=sys.maxsize)
            fasta_path, ref_contig, alt_contig = synthesized
            mock_ref.assert_called_once_with(region="1:4999478-4999733")

            self.assertEqual(ref_contig, "1_4999478_4999733")
            self.assertEqual(alt_contig, "1_4999478_4999733_alt")

            with open(fasta_path, "r") as fasta:
                lines = [raw.strip() for raw in fasta]
            self.assertEqual(len(lines), 4)

            ref_header, ref_sequence, alt_header, alt_sequence = lines
            self.assertEqual(ref_header, ">1_4999478_4999733")
            self.assertEqual(ref_sequence, reference)
            self.assertEqual(alt_header, ">1_4999478_4999733_alt")
            self.assertEqual(alt_sequence, expected_alt)
Пример #9
0
 def test_region_strings(self):
     """Check the variant region string and both asymmetric flank region strings."""
     reader = vcf.Reader(self.vcf_file)
     variant = Variant.from_pyvcf(next(reader), None)
     self.assertEqual(variant.region_string(), "1:899923-899992")

     # Same asymmetric flanks are applied around the left and right ends
     flanks = {"left_flank": 2, "right_flank": 5}
     left_region = variant.left_flank_region_string(**flanks)
     self.assertEqual(left_region, "1:899921-899927")
     right_region = variant.right_flank_region_string(**flanks)
     self.assertEqual(right_region, "1:899991-899997")
Пример #10
0
def gnomad_coverage_profile(args, gnomad_coverage: str, input_vcf: str,
                            output_file):
    """Generate a gnomAD coverage profile for the first record of a VCF.

    Only the first record of ``input_vcf`` is read. The work is delegated to
    ``Variant.gnomad_coverage_profile`` using the contig names stored on
    ``args`` (``args.ref_contig`` and ``args.alt_contig``).

    Args:
        args: Parsed arguments; forwarded to the variant method.
        gnomad_coverage: Forwarded unchanged to the variant method.
        input_vcf: Path of the VCF file to read the record from.
        output_file: Forwarded unchanged to the variant method.
    """
    reader = vcf.Reader(filename=input_vcf)
    first_record = next(reader)
    variant = Variant.from_pyvcf(first_record)
    variant.gnomad_coverage_profile(
        args, gnomad_coverage, output_file,
        ref_contig=args.ref_contig, alt_contig=args.alt_contig,
    )
Пример #11
0
    def test_feature_extraction(self):
        """Run ``extract_features`` on every structural-variant record in the VCF."""
        contig_names = {
            "ref_contig": "1_2073761_2073846_DEL",
            "alt_contig": "1_2073761_2073846_DEL_alt",
        }
        for record in vcf.Reader(self.vcf_file):
            self.assertTrue(record.is_sv)

            variant = Variant.from_pyvcf(record, None)
            features = extract_features(
                self.args,
                variant,
                self.input_bam,
                self.sample,
                input_fasta=self.input_fasta,
                **contig_names,
            )
Пример #12
0
    def test_breakpoints(self):
        """Check genomic flank regions and contig-relative breakpoints of the first record."""
        reader = vcf.Reader(self.vcf_file)
        variant = Variant.from_pyvcf(next(reader), None)

        # One base of flank on each side of either end of the variant
        flanks = {"left_flank": 1, "right_flank": 1}
        left_region = variant.left_flank_region_string(**flanks)
        self.assertEqual(left_region, "4:32197282-32197283")
        right_region = variant.right_flank_region_string(**flanks)
        self.assertEqual(right_region, "4:32197447-32197448")

        ref_breakpoints = variant.ref_breakpoints(self.args.flank)
        self.assertEqual(ref_breakpoints, ("4:1-2", "4:166-167"))
        alt_breakpoints = variant.alt_breakpoints(self.args.flank)
        self.assertEqual(alt_breakpoints, ("4:1-2", None))
Пример #13
0
    def test_breakpoints(self):
        """Check genomic flank regions and contig-relative breakpoints of the first record."""
        reader = vcf.Reader(self.vcf_file)
        variant = Variant.from_pyvcf(next(reader), None)

        # One base of flank on each side of either end of the variant
        flanks = {"left_flank": 1, "right_flank": 1}
        left_region = variant.left_flank_region_string(**flanks)
        self.assertEqual(left_region, "4:20473845-20473846")
        right_region = variant.right_flank_region_string(**flanks)
        self.assertEqual(right_region, "4:20474269-20474270")

        ref_breakpoints = variant.ref_breakpoints(self.args.flank)
        self.assertEqual(ref_breakpoints, ("4:1-2", "4:425-426"))
        alt_breakpoints = variant.alt_breakpoints(self.args.flank)
        self.assertEqual(alt_breakpoints, ("4:1-2", "4:235-236"))
Пример #14
0
    def test_breakpoints(self):
        """Check genomic flank regions and contig-relative breakpoints of the first record."""
        reader = vcf.Reader(self.vcf_file)
        variant = Variant.from_pyvcf(next(reader), None)

        # One base of flank on each side of either end of the variant
        flanks = {"left_flank": 1, "right_flank": 1}
        left_region = variant.left_flank_region_string(**flanks)
        self.assertEqual(left_region, "8:79683397-79683398")
        right_region = variant.right_flank_region_string(**flanks)
        self.assertEqual(right_region, "8:79683487-79683488")

        ref_breakpoints = variant.ref_breakpoints(self.args.flank)
        self.assertEqual(ref_breakpoints, ("8:1-2", "8:91-92"))
        alt_breakpoints = variant.alt_breakpoints(self.args.flank)
        self.assertEqual(alt_breakpoints, ("8:1-2", "8:2-3"))
Пример #15
0
    def test_breakpoints(self):
        """Check genomic flank regions and contig-relative breakpoints of the first record."""
        reader = vcf.Reader(self.vcf_file)
        variant = Variant.from_pyvcf(next(reader), None)
        self.assertIsNotNone(variant)

        # One base of flank on each side of either end of the variant
        flanks = {"left_flank": 1, "right_flank": 1}
        left_region = variant.left_flank_region_string(**flanks)
        self.assertEqual(left_region, "1:899922-899923")
        right_region = variant.right_flank_region_string(**flanks)
        self.assertEqual(right_region, "1:899992-899993")

        ref_breakpoints = variant.ref_breakpoints(self.args.flank)
        self.assertEqual(ref_breakpoints, ("1:1-2", "1:71-72"))
        alt_breakpoints = variant.alt_breakpoints(self.args.flank)
        self.assertEqual(alt_breakpoints, ("1:1-2", None))
Пример #16
0
    def test_breakpoints(self):
        """Check genomic flank regions and contig-relative breakpoints of the first record."""
        reader = vcf.Reader(self.vcf_file)
        variant = Variant.from_pyvcf(next(reader), None)
        self.assertIsNotNone(variant)

        # One base of flank on each side of either end of the variant
        flanks = {"left_flank": 1, "right_flank": 1}
        left_region = variant.left_flank_region_string(**flanks)
        self.assertEqual(left_region, "1:4999478-4999479")
        right_region = variant.right_flank_region_string(**flanks)
        self.assertEqual(right_region, "1:4999732-4999733")

        ref_breakpoints = variant.ref_breakpoints(self.args.flank)
        self.assertEqual(ref_breakpoints, ("1:1-2", "1:255-256"))
        alt_breakpoints = variant.alt_breakpoints(self.args.flank)
        self.assertEqual(alt_breakpoints, ("1:255-256", None))
Пример #17
0
 def test_read_gather_call(self):
     """Check that feature extraction gathers reads exactly once per record.

     ``RealignedFragments.gather_reads`` is patched for each record so the
     number of calls made by ``extract_features`` can be counted.
     """
     contig_names = {
         "ref_contig": "1_2073761_2073846_DEL",
         "alt_contig": "1_2073761_2073846_DEL_alt",
     }
     for record in vcf.Reader(self.vcf_file):
         gather_patch = patch.object(npsva.RealignedFragments,
                                     "gather_reads",
                                     return_value=0)
         with gather_patch as mock_gather:
             variant = Variant.from_pyvcf(record, None)
             extract_features(
                 self.args,
                 variant,
                 self.input_bam,
                 self.sample,
                 input_fasta=self.input_fasta,
                 **contig_names,
             )
             # Since this is a small variant, there should be only one
             # gather-reads call
             mock_gather.assert_called_once()
Пример #18
0
    def test_variant_properties(self):
        """Check deletion status and reference/alternate lengths of the first record."""
        # Only the lengths of these sequences are compared
        expected_ref = "TATATATATATAGATCTATATATCTATATATAGATCTATATATAGATATATATCTATATATATAGATATATAGATATATAGATCTATATATAGATATATATATCTATATATAGATCTATATATAGATATAGATATCTATATAGATATCTATATCTATATATATGTAGATATATAGATATAGATATCTATATATCTATATATATAGATATCTATAGATATATATCTATATAGATATATCTATATCTATATATAGATATATATCTATATATAGATATATATCTATATATAGATAGATATATATCTATATATAGATATATCTATATCTATATATAGATATATATCTATATATAGATATATCTATATATAGATATATATCTATAGATATATCTATATATATCGATATATCTATATATATCGATATATA"
        expected_alt = "ATATATATAGATATATCTATATATATCTATATAGATATATCTATATCTATATAGATATATCTATATATATATAGATATATCTATATCTATATAGATATATATCTATATATATATCTATATAGATATATCTATATAGATATAGATATATATCTATATATAGATATAGATATATCTATATAGATATATATCTATAGATATCTATATATATAGATATATAGATATCTATATCTATAT"

        reader = vcf.Reader(self.vcf_file)
        record = next(reader)
        self.assertTrue(record.is_sv)
        variant = Variant.from_pyvcf(record, None)
        self.assertIsNotNone(variant)
        self.assertTrue(variant.is_deletion)

        self.assertEqual(variant.ref_length, len(expected_ref))
        self.assertEqual(variant.alt_length, len(expected_alt))
Пример #19
0
    def test_ci_for_precise_variants(self):
        vcf_file = io.StringIO("""##fileformat=VCFv4.1
##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##ALT=<ID=DEL,Description="Deletion">
##contig=<ID=1,length=249250621>
#CHROM POS ID REF ALT QUAL FILTER INFO
1 2827694 rs2376870 CGTGGATGCGGGGAC C . PASS SVTYPE=DEL;END=2827708;SVLEN=-14
""")
        for record in vcf.Reader(vcf_file):
            self.assertTrue(record.is_sv)
            variant = Variant.from_pyvcf(record, None)
            self.assertTrue(variant.is_precise)
            self.assertEqual(variant.get_ci("CIPOS", 10), [0, 0])
Пример #20
0
    def test_manta_vcf(self):
        vcf_file = io.StringIO("""##fileformat=VCFv4.1
##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##INFO=<ID=SVINSLEN,Number=.,Type=Integer,Description="Length of insertion">
##INFO=<ID=SVINSSEQ,Number=.,Type=String,Description="Sequence of insertion">
##ALT=<ID=INS,Description="Insertion">
##contig=<ID=3,length=198022430>
#CHROM POS ID REF ALT QUAL FILTER INFO
3       72386664        MantaINS:1:5470:5470:0:0:0      G       <INS>   999     MaxDepth        END=72386664;SVTYPE=INS;SVLEN=10;CIPOS=0,9;CIEND=0,9;SVINSLEN=10;SVINSSEQ=GTGTGTGTGC
""")
        record = next(vcf.Reader(vcf_file))
        variant = Variant.from_pyvcf(record, None)
        self.assertEqual(variant._alt_seq(flank=1, ref_seq="GT"),
                         "GGTGTGTGTGCT")
Пример #21
0
    def test_ci_for_imprecise_variants(self):
        vcf_file = io.StringIO("""##fileformat=VCFv4.1
##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##ALT=<ID=DEL,Description="Deletion">
##contig=<ID=2,length=243199373>
#CHROM POS ID REF ALT QUAL FILTER INFO
2 321682 . T <DEL> . PASS SVTYPE=DEL;END=321887;SVLEN=-205;CIPOS=-56,20
""")
        for record in vcf.Reader(vcf_file):
            self.assertTrue(record.is_sv)
            variant = Variant.from_pyvcf(record, None)
            self.assertFalse(variant.is_precise)
            self.assertEqual(variant.get_ci("CIPOS", 10), [-56, 20])
            self.assertEqual(variant.get_ci("CIEND", 10), [-10, 10])
Пример #22
0
    def test_gnomad_coverage_profile(self):
        """Check the coverage file written by ``Variant.gnomad_coverage_profile``.

        Verifies the returned contig names, that the output file has two lines,
        and the content of the second line.
        """
        reader = vcf.Reader(self.vcf_file)
        record = next(reader)
        self.assertTrue(record.is_sv)
        variant = Variant.from_pyvcf(record, None)

        coverage_summary = os.path.join(
            FILE_DIR, "1_896922_903086.gnomad.genomes.coverage.summary.tsv.gz")
        profile = variant.gnomad_coverage_profile(
            self.args,
            coverage_summary,
            line_width=sys.maxsize,
        )
        covg_path, ref_contig, alt_contig = profile
        self.assertEqual(ref_contig, "1_899922_899993")
        self.assertEqual(alt_contig, "1_899922_899993_alt")

        with open(covg_path, "r") as covg_file:
            lines = [raw.strip() for raw in covg_file]
        self.assertEqual(len(lines), 2)
        self.assertEqual(lines[1], "1_899922_899993_alt\t=1")
Пример #23
0
def random_variants(
    variant_or_vcf_path,
    ref_path,
    genome_path,
    gap_path,
    output_file,
    n=1,
    use_X=False,
    only_sex=False,
    flank=0,
):
    """Write a header and then call ``random_variant`` for one or more variants.

    Args:
        variant_or_vcf_path: Either a ``Variant`` instance, or a path to a VCF
            file whose records are converted with ``Variant.from_pyvcf``.
            Records for which the conversion returns ``None`` are skipped.
        ref_path: Path opened with ``pysam.FastaFile``.
        genome_path: Passed to ``load_contigs`` to obtain the contigs.
        gap_path: Path opened with ``pysam.TabixFile``.
        output_file: Destination passed to ``write_header`` and
            ``random_variant``.
        n: Forwarded to ``random_variant`` as ``n``.
        use_X: Forwarded to ``load_contigs``.
        only_sex: Forwarded to ``load_contigs``.
        flank: Forwarded to ``random_variant`` as ``flank``.
    """
    contigs = load_contigs(genome_path, use_X=use_X, only_sex=only_sex)

    # try/finally guarantees both pysam handles are released even if header
    # writing, VCF parsing or variant generation raises.
    gaps = pysam.TabixFile(gap_path)  # pylint: disable=no-member
    try:
        ref_reader = pysam.FastaFile(ref_path)  # pylint: disable=no-member
        try:
            write_header(contigs, output_file)

            if isinstance(variant_or_vcf_path, Variant):
                variants = (variant_or_vcf_path,)
            else:
                # Lazily convert records so parsing stays interleaved with
                # generation, as when iterating the reader directly
                variants = (
                    Variant.from_pyvcf(record)
                    for record in vcf.Reader(filename=variant_or_vcf_path)
                )

            for variant in variants:
                if variant is None:
                    continue
                random_variant(variant,
                               ref_reader,
                               contigs,
                               gaps,
                               output_file,
                               n=n,
                               flank=flank)
        finally:
            ref_reader.close()
    finally:
        gaps.close()
Пример #24
0
def main():
    """Command-line entry point: simulate, extract features and genotype variants.

    Steps, in order:

    1. Parse arguments, configure logging, and create the output and temporary
       directories.
    2. Initialise ``ray`` with ``args.threads`` CPUs.
    3. Build a ``Sample`` from either a stats file (``args.stats_path``) or
       explicit distribution parameters; exit through ``parser.error`` when
       neither is available.
    4. For each VCF record that converts to a ``Variant`` and has a previously
       unseen descriptor, write (or reuse) a single-variant indexed VCF and
       schedule ``simulate_and_extract`` remotely.
    5. Concatenate the per-variant result files into ``<prefix>.sim.tsv`` and
       ``<prefix>.real.tsv`` in ``args.output``.
    6. Run ``genotype_vcf`` and write ``<prefix>.npsv.vcf``.
    """
    parser = make_argument_parser()
    args = parser.parse_args()

    logging.basicConfig(level=args.loglevel)

    # Create any directories that are needed
    logging.info(
        "Creating %s output and %s temporary directories if they don't exist",
        args.output,
        args.tempdir,
    )
    os.makedirs(args.output, exist_ok=True)
    os.makedirs(args.tempdir, exist_ok=True)

    # Initialize parallel computing setup
    ray.init(num_cpus=args.threads,
             _temp_dir=args.tempdir,
             include_dashboard=False)

    # TODO: If library is not specified compute statistics, i.e. mean insert size, etc.
    if args.stats_path is not None:
        logging.info("Extracting BAM stats from NPSV stats file")
        sample = Sample.from_npsv(args.stats_path,
                                  bam_path=args.bam,
                                  ped_path=args.ped_path)
    elif None not in (
            args.fragment_mean,
            args.fragment_sd,
            args.read_length,
            args.depth,
    ):
        logging.info("Using Normal distribution for BAM stats")
        sample = Sample.from_distribution(
            args.bam,
            args.fragment_mean,
            args.fragment_sd,
            args.read_length,
            mean_coverage=args.depth,
        )
    else:
        # parser.error() reports the message and exits the program itself, so
        # no `raise` is needed (it would never be reached).
        parser.error(
            "Library information needed. Either provide distribution parameters or run `npsvg preprocess` to generate stats file."
        )

    # Select directory for variant files
    if args.keep_synth_bams:
        variant_dir = args.output
    else:
        variant_dir = args.tempdir

    # For each variant generate synthetic bam file(s) and extract relevant evidence
    observed_variants = {}
    record_results = []
    vcf_reader = vcf.Reader(filename=args.input)
    for i, record in enumerate(tqdm(vcf_reader, desc="Preparing variants")):
        variant = Variant.from_pyvcf(record, args.reference)
        # Skip records that could not be converted to a Variant
        # (original note: npsv currently only supports deletions)
        if variant is None:
            continue

        # NPSV currently does not support variants with duplicate start and end coordinates
        description = variant_descriptor(record)
        # setdefault returns the index of the first record with this
        # descriptor, so any other index marks a duplicate
        if observed_variants.setdefault(description, i) != i:
            logging.warning("Skipping variant with duplicate description %s",
                            description)
            continue

        # Construct single variant VCF outside of worker so we don't need to pass the reader into the thread
        variant_vcf_path = os.path.join(variant_dir, description + ".vcf")
        if not args.reuse or not os.path.exists(variant_vcf_path + ".gz"):
            variant_vcf_path = write_record_to_indexed_vcf(
                record, vcf_reader, variant_vcf_path)
        else:
            # Variant file already exists, no need to recreate
            variant_vcf_path += ".gz"

        record_results.append(
            simulate_and_extract.remote(args, sample, variant,
                                        variant_vcf_path, description))

    # Concatenate output files to create feature files
    sim_tsv_path = os.path.join(args.output, args.prefix + ".sim.tsv")
    real_tsv_path = os.path.join(args.output, args.prefix + ".real.tsv")
    logging.info("Extracting features (to %s and %s)", sim_tsv_path,
                 real_tsv_path)

    # Write the headers in text mode first; the per-variant results are then
    # appended below as raw bytes
    with open(sim_tsv_path, "w") as file:
        Features.header(out_file=file, ac=True)
    with open(real_tsv_path, "w") as file:
        Features.header(out_file=file, ac=False)

    with open(sim_tsv_path, "ab") as sim_sink, open(real_tsv_path,
                                                    "ab") as real_sink:
        for sim_result, real_result in tqdm(
                ray_iterator(record_results),
                total=len(record_results),
                desc="Extracting features",
        ):
            with open(sim_result, "rb") as source:
                shutil.copyfileobj(source, sim_sink)
            sim_sink.flush()
            with open(real_result, "rb") as source:
                shutil.copyfileobj(source, real_sink)
            real_sink.flush()

    # Perform genotyping
    with open(os.path.join(args.output, args.prefix + ".npsv.vcf"),
              "w") as gt_vcf_file:
        logging.info("Determining genotypes (output in %s)", gt_vcf_file.name)
        # Pass a copy of the parsed arguments to the genotyper
        genotyping_args = argparse.Namespace(**vars(args))
        genotype_vcf(
            genotyping_args,
            args.input,
            sim_tsv_path,
            real_tsv_path,
            gt_vcf_file,
            samples=[sample.name],
        )
Пример #25
0
 def test_region_string(self):
     """Check the region string of the first record with one base of flank."""
     reader = vcf.Reader(self.vcf_file)
     variant = Variant.from_pyvcf(next(reader), None)
     self.assertIsNotNone(variant)
     flanked_region = variant.region_string(flank=1)
     self.assertEqual(flanked_region, "1:4999478-4999733")
Пример #26
0
 def test_region_string(self):
     """Check that ``region_string`` raises without a flank and works with one."""
     reader = vcf.Reader(self.vcf_file)
     variant = Variant.from_pyvcf(next(reader), None)
     # Without a flank the call is expected to raise ValueError
     with self.assertRaises(ValueError):
         variant.region_string()
     flanked_region = variant.region_string(flank=1)
     self.assertEqual(flanked_region, "1:931634-931635")
Пример #27
0
 def test_event_length(self):
     """Check the event length reported for the first record of the VCF."""
     reader = vcf.Reader(self.vcf_file)
     variant = Variant.from_pyvcf(next(reader), None)
     expected_length = 70
     self.assertEqual(variant.event_length, expected_length)