Пример #1
0
    def test_reference_sequence_generic_load_sequence(self):
        """Load a gzipped copy of the generic FASTA and verify its attributes.

        ``tests/input/generic.fa`` is compressed into a temporary directory,
        wrapped in a ``ReferenceSequence``, and the object's ID, chromosome,
        path, sequence bytes, MD5, start, end, and length are compared with
        the expected values.
        """
        expected_bases = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACNNNNNNNN"

        with tempfile.TemporaryDirectory() as workdir:
            fasta_gz = os.path.join(workdir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", fasta_gz)

            ref = ReferenceSequence(ID="1", path=fasta_gz)

            # Attributes are looked up by name one at a time so they are read
            # in the same order as the assertions run.
            for attr, wanted in (("ID", "1"), ("chrom", "1"), ("path", fasta_gz)):
                self.assertEqual(getattr(ref, attr), wanted)

            # The sequence is compared as an array of uint8 character codes.
            expected_codes = np.array(
                bytearray(expected_bases, encoding="utf-8", errors="strict"),
                dtype=np.uint8,
            )
            np.testing.assert_array_equal(ref.sequence, expected_codes)

            # The only non-N bases sit at 0-based offsets 100..108.
            self.assertListEqual(
                list("AGGCCGGAC"),
                [chr(code) for code in ref.sequence[100:109]],
            )

            for attr, wanted in (
                ("md5", "6ac6176535ad0e38aba2d05d786c39b6"),
                ("start", 1),
                ("end", 117),
                ("length", 117),
            ):
                self.assertEqual(getattr(ref, attr), wanted)
Пример #2
0
    def test_save_snps_vcf_discrepant_pos(self):
        """Save to VCF after moving two SNPs outside the reference sequence.

        ``rs1`` and ``rs17`` are given positions 0 and 118. After saving, the
        test expects exactly those two SNPs in ``discrepant_vcf_position`` and
        expects the saved VCF to parse to the generic VCF frame without them.
        """
        snps = SNPs("tests/input/testvcf.vcf")

        resources = Resources()
        resources._reference_sequences["GRCh37"] = {}

        expected_relpath = f"output{os.sep}vcf_GRCh37.vcf"

        with tempfile.TemporaryDirectory() as workdir:
            fasta_gz = os.path.join(workdir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", fasta_gz)

            # Register the test FASTA as chromosome 1 of GRCh37.
            resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
                ID="1", path=fasta_gz
            )

            # Create discrepant SNPs by setting positions outside the
            # reference sequence.
            for rsid, bad_pos in (("rs1", 0), ("rs17", 118)):
                snps._snps.loc[rsid, "pos"] = bad_pos

            saved_path = snps.save(vcf=True)
            self.assertEqual(os.path.relpath(saved_path), expected_relpath)

        actual_discrepant = snps.discrepant_vcf_position
        expected_discrepant = self.create_snp_df(
            rsid=["rs1", "rs17"],
            chrom=["1", "1"],
            pos=[0, 118],
            genotype=["AA", np.nan],
        )
        pd.testing.assert_frame_equal(
            actual_discrepant, expected_discrepant, check_exact=True
        )

        remaining = self.generic_snps_vcf().drop(["rs1", "rs17"])
        self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf", snps_df=remaining)
Пример #3
0
    def test_save_snps_vcf_false_positive_build(self):
        """Re-parse a saved VCF whose ``##source`` header line was rewritten.

        The SNPs are saved to VCF in a temporary output directory. Every line
        containing ``"snps v"`` is then replaced with a fixed ``##source``
        line, and the rewritten file is run through the VCF parsing tests.

        NOTE(review): the replacement version string contains ``gb386302``;
        presumably the point is that this must not be mistaken for a genome
        build marker -- confirm against the build-detection code.
        """
        replacement = '##source="vcf; snps v1.2.3.post85.dev0+gb386302; https://pypi.org/project/snps/"\n'

        with tempfile.TemporaryDirectory() as tmpdir1:
            snps = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

            r = Resources()
            r._reference_sequences["GRCh37"] = {}

            output = os.path.join(tmpdir1, "vcf_GRCh37.vcf")
            with tempfile.TemporaryDirectory() as tmpdir2:
                dest = os.path.join(tmpdir2, "generic.fa.gz")
                gzip_file("tests/input/generic.fa", dest)

                # Register the test FASTA as chromosome 1 of GRCh37.
                seq = ReferenceSequence(ID="1", path=dest)
                r._reference_sequences["GRCh37"]["1"] = seq

                self.assertEqual(snps.save(vcf=True), output)

                # Iterate the file directly and collect lines in a list
                # instead of calling readlines() and growing a string with +=.
                with open(output, "r") as f:
                    lines = [
                        replacement if "snps v" in line else line for line in f
                    ]

                with open(output, "w") as f:
                    f.writelines(lines)

            self.run_parsing_tests_vcf(output)
Пример #4
0
    def test_read_ftdna_concat_gzip_extra_data(self):
        """Parse two concatenated FTDNA gzip members followed by extra bytes.

        Two CSV files of ten SNPs each are written, gzipped separately, and
        their compressed bytes concatenated with a trailing ``b"extra data"``.
        The combined file is expected to parse, from both a path and bytes,
        to all twenty SNPs with source ``"FTDNA"``.
        """
        # https://www.familytreedna.com

        total_snps1 = 10
        total_snps2 = 10
        total_snps = total_snps1 + total_snps2

        header = "RSID,CHROMOSOME,POSITION,RESULT\r\n"
        row = '"rs{}","1","{}","AA"\r\n'

        # generate content of first file
        s1 = header + "".join(
            row.format(1 + i, 101 + i) for i in range(total_snps1)
        )

        # generate content of second file (rsids and positions continue on)
        s2 = header + "".join(
            row.format(total_snps1 + 1 + i, total_snps1 + 101 + i)
            for i in range(total_snps2)
        )

        snps_df = self.create_snp_df(
            rsid=["rs{}".format(1 + i) for i in range(total_snps)],
            chrom="1",
            pos=list(range(101, 101 + total_snps)),
            genotype="AA",
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, "ftdna_concat_gzip.csv.gz")

            members = []
            for name, text in (
                ("ftdna_concat_gzip1.csv", s1),
                ("ftdna_concat_gzip2.csv", s2),
            ):
                plain = os.path.join(tmpdir, name)
                compressed = "{}.gz".format(plain)

                # newline="" disables newline translation, so the explicit
                # "\r\n" endings are written verbatim on every platform
                # (text mode would otherwise emit "\r\r\n" where the line
                # separator is CRLF).
                with open(plain, "w", newline="") as f:
                    f.write(text)

                # compress the file and keep its raw gzip bytes
                gzip_file(plain, compressed)
                with open(compressed, "rb") as f:
                    members.append(f.read())

            # concatenate gzips and add extra data
            data = b"".join(members) + b"extra data"

            # write file with concatenated gzips and extra data
            with open(path, "wb") as f:
                f.write(data)

            self.make_parsing_assertions(self.parse_file(path), "FTDNA", False,
                                         37, False, snps_df)
            self.make_parsing_assertions(self.parse_bytes(path), "FTDNA",
                                         False, 37, False, snps_df)
Пример #5
0
    def _setup_gsa_test(resources_dir):
        """Point ``Resources`` at ``resources_dir`` and stage gzipped GSA maps.

        The ``Resources`` object has its resources directory replaced and its
        GSA resources reset to an empty dict, then the GSA rsid and chrpos map
        fixtures are gzipped into ``resources_dir``.
        """
        # reset resource if already loaded
        resources = Resources()
        resources._resources_dir = resources_dir
        resources._gsa_resources = {}

        fixtures = (
            ("tests/resources/gsa_rsid_map.txt", "gsa_rsid_map.txt.gz"),
            ("tests/resources/gsa_chrpos_map.txt", "gsa_chrpos_map.txt.gz"),
        )
        for src, gz_name in fixtures:
            gzip_file(src, os.path.join(resources_dir, gz_name))
Пример #6
0
    def run_parsing_tests(self,
                          file,
                          source,
                          phased=False,
                          build=37,
                          build_detected=False,
                          snps_df=None):
        """Run the parsing assertions on ``file`` in several packagings.

        The file is checked as-is (from path and from bytes), gzipped (path,
        bytes, and path with the ``.gz`` suffix removed), and zipped (path,
        bytes, and path with the ``.zip`` suffix removed). All other arguments
        are forwarded unchanged to ``make_parsing_assertions``.
        """

        def check(parsed):
            # Same expectations for every variant of the input.
            self.make_parsing_assertions(parsed, source, phased, build,
                                         build_detected, snps_df)

        # original file
        check(self.parse_file(file))
        check(self.parse_bytes(file))

        with tempfile.TemporaryDirectory() as workdir:
            name = os.path.basename(file)

            # gzip-compressed copy
            gz_path = os.path.join(workdir, f"{name}.gz")
            gzip_file(file, gz_path)
            check(self.parse_file(gz_path))
            check(self.parse_bytes(gz_path))

            # remove .gz extension
            gz_bare = gz_path[:-3]
            shutil.move(gz_path, gz_bare)
            check(self.parse_file(gz_bare))

            # zip archive containing the file under its base name
            zip_path = os.path.join(workdir, f"{name}.zip")
            zip_file(file, zip_path, name)
            check(self.parse_file(zip_path))
            check(self.parse_bytes(zip_path))

            # remove .zip extension
            zip_bare = zip_path[:-4]
            shutil.move(zip_path, zip_bare)
            check(self.parse_file(zip_bare))
Пример #7
0
    def test_save_snps_vcf(self):
        """Save SNPs read from a VCF back to VCF and re-parse the output.

        A gzipped copy of ``tests/input/generic.fa`` is registered as GRCh37
        chromosome 1 before saving. The saved path, relative to the current
        directory, must be ``output/vcf_GRCh37.vcf``.
        """
        snps = SNPs("tests/input/testvcf.vcf")

        resources = Resources()
        resources._reference_sequences["GRCh37"] = {}

        expected_relpath = f"output{os.sep}vcf_GRCh37.vcf"

        with tempfile.TemporaryDirectory() as workdir:
            fasta_gz = os.path.join(workdir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", fasta_gz)

            resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
                ID="1", path=fasta_gz
            )

            saved_path = snps.save(vcf=True)
            self.assertEqual(os.path.relpath(saved_path), expected_relpath)

        self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf")
Пример #8
0
    def test_save_snps_vcf(self):
        """Save SNPs to VCF inside a temporary output directory and re-parse.

        The ``SNPs`` object is created with ``output_dir`` set to a temporary
        directory, and ``save(vcf=True)`` must return ``vcf_GRCh37.vcf``
        inside it. The test FASTA lives in a second temporary directory that
        is removed before the saved VCF is parsed.
        """
        with tempfile.TemporaryDirectory() as out_dir:
            snps = SNPs("tests/input/testvcf.vcf", output_dir=out_dir)

            resources = Resources()
            resources._reference_sequences["GRCh37"] = {}

            vcf_path = os.path.join(out_dir, "vcf_GRCh37.vcf")

            with tempfile.TemporaryDirectory() as fasta_dir:
                fasta_gz = os.path.join(fasta_dir, "generic.fa.gz")
                gzip_file("tests/input/generic.fa", fasta_gz)

                # Register the test FASTA as chromosome 1 of GRCh37.
                resources._reference_sequences["GRCh37"]["1"] = (
                    ReferenceSequence(ID="1", path=fasta_gz)
                )

                saved_path = snps.save(vcf=True)
                self.assertEqual(saved_path, vcf_path)

            self.run_parsing_tests_vcf(vcf_path)
Пример #9
0
    def test_save_snps_vcf_phased(self):
        """Save phased SNPs to VCF and check the output parses as phased.

        The phased fixture ``tests/input/testvcf_phased.vcf`` is loaded and
        saved with the test FASTA registered as GRCh37 chromosome 1. The saved
        file is then re-parsed with ``phased=True``.
        """
        # load the phased input
        phased_snps = SNPs("tests/input/testvcf_phased.vcf")

        # make the Resources object use the test FASTA reference sequence
        resources = Resources()
        resources._reference_sequences["GRCh37"] = {}

        expected_relpath = f"output{os.sep}vcf_GRCh37.vcf"

        with tempfile.TemporaryDirectory() as workdir:
            fasta_gz = os.path.join(workdir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", fasta_gz)

            resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
                ID="1", path=fasta_gz
            )

            # write the phased data out as VCF
            saved_path = phased_snps.save(vcf=True)
            self.assertEqual(os.path.relpath(saved_path), expected_relpath)

        # parse the VCF that was just written
        self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf", phased=True)
Пример #10
0
    def test_save_snps_vcf_discrepant_pos(self):
        """Save to VCF with two SNPs positioned outside the reference sequence.

        Output goes to a temporary directory. ``rs1`` and ``rs17`` are moved
        to positions 0 and 118 before saving. They must then appear in
        ``discrepant_vcf_position``, and the saved VCF must parse to the
        generic VCF frame without them.
        """
        with tempfile.TemporaryDirectory() as out_dir:
            snps = SNPs("tests/input/testvcf.vcf", output_dir=out_dir)

            resources = Resources()
            resources._reference_sequences["GRCh37"] = {}

            vcf_path = os.path.join(out_dir, "vcf_GRCh37.vcf")

            with tempfile.TemporaryDirectory() as fasta_dir:
                fasta_gz = os.path.join(fasta_dir, "generic.fa.gz")
                gzip_file("tests/input/generic.fa", fasta_gz)

                resources._reference_sequences["GRCh37"]["1"] = (
                    ReferenceSequence(ID="1", path=fasta_gz)
                )

                # Create discrepant SNPs by setting positions outside the
                # reference sequence.
                for rsid, bad_pos in (("rs1", 0), ("rs17", 118)):
                    snps._snps.loc[rsid, "pos"] = bad_pos

                # Ensure "pos" is the right type after the manual tweaking.
                snps._snps = snps._snps.astype({"pos": np.uint32})

                saved_path = snps.save(vcf=True)
                self.assertEqual(saved_path, vcf_path)

            actual_discrepant = snps.discrepant_vcf_position
            expected_discrepant = self.create_snp_df(
                rsid=["rs1", "rs17"],
                chrom=["1", "1"],
                pos=[0, 118],
                genotype=["AA", np.nan],
            )
            pd.testing.assert_frame_equal(
                actual_discrepant, expected_discrepant, check_exact=True
            )

            remaining = self.generic_snps_vcf().drop(["rs1", "rs17"])
            self.run_parsing_tests_vcf(vcf_path, snps_df=remaining)
Пример #11
0
    def _setup_gsa_test(resources_dir):
        """Point ``Resources`` at ``resources_dir`` and stage gzipped fixtures.

        The ``Resources`` object has its resources directory replaced and
        ``_init_resource_attributes()`` called on it. The GSA rsid map, GSA
        chrpos map, and dbSNP reverse fixtures are then gzipped into
        ``resources_dir``.
        """
        # reset resource if already loaded
        resources = Resources()
        resources._resources_dir = resources_dir
        resources._init_resource_attributes()

        fixtures = (
            ("tests/resources/gsa_rsid_map.txt", "gsa_rsid_map.txt.gz"),
            ("tests/resources/gsa_chrpos_map.txt", "gsa_chrpos_map.txt.gz"),
            (
                "tests/resources/dbsnp_151_37_reverse.txt",
                "dbsnp_151_37_reverse.txt.gz",
            ),
        )
        for src, gz_name in fixtures:
            gzip_file(src, os.path.join(resources_dir, gz_name))
Пример #12
0
    def run_parsing_tests_vcf(
            self,
            file,
            source="vcf",
            phased=False,
            unannotated=False,
            rsids=(),
            build=37,
            build_detected=False,
            snps_df=None,
    ):
        """Run the VCF parsing assertions on ``file`` in several packagings.

        The file is checked as-is (from path and from bytes), gzipped (path
        and bytes), and gzipped with the ``.gz`` suffix removed (path only).
        ``rsids`` is passed to the parse helpers, and all expectations are
        forwarded unchanged to ``make_parsing_assertions_vcf``.
        """
        # https://samtools.github.io/hts-specs/VCFv4.2.pdf
        # this tests for homozygous snps, heterozygous snps, multiallelic snps,
        # phased snps, and snps with missing rsID

        def check(parsed):
            # Same expectations for every variant of the input.
            self.make_parsing_assertions_vcf(
                parsed,
                source,
                phased,
                unannotated,
                rsids,
                build,
                build_detected,
                snps_df,
            )

        # original file
        check(self.parse_file(file, rsids))
        check(self.parse_bytes(file, rsids))

        with tempfile.TemporaryDirectory() as workdir:
            name = os.path.basename(file)

            # gzip-compressed copy
            gz_path = os.path.join(workdir, f"{name}.gz")
            gzip_file(file, gz_path)
            check(self.parse_file(gz_path, rsids))
            check(self.parse_bytes(gz_path, rsids))

            # remove .gz extension
            gz_bare = gz_path[:-3]
            shutil.move(gz_path, gz_bare)
            check(self.parse_file(gz_bare, rsids))