예제 #1
0
    def test_save_snps_vcf_false_positive_build(self):
        with tempfile.TemporaryDirectory() as tmpdir1:
            snps = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

            r = Resources()
            r._reference_sequences["GRCh37"] = {}

            output = os.path.join(tmpdir1, "vcf_GRCh37.vcf")
            with tempfile.TemporaryDirectory() as tmpdir2:
                dest = os.path.join(tmpdir2, "generic.fa.gz")
                gzip_file("tests/input/generic.fa", dest)

                seq = ReferenceSequence(ID="1", path=dest)

                r._reference_sequences["GRCh37"]["1"] = seq

                self.assertEqual(snps.save(vcf=True), output)

                s = ""
                with open(output, "r") as f:
                    for line in f.readlines():
                        if "snps v" in line:
                            s += '##source="vcf; snps v1.2.3.post85.dev0+gb386302; https://pypi.org/project/snps/"\n'
                        else:
                            s += line

                with open(output, "w") as f:
                    f.write(s)

            self.run_parsing_tests_vcf(output)
예제 #2
0
    def test_save_snps_vcf_phased(self):
        # read phased data
        s = SNPs("tests/input/testvcf_phased.vcf")

        # setup resource to use test FASTA reference sequence
        r = Resources()
        r._reference_sequences["GRCh37"] = {}
        with open("tests/input/generic.fa", "rb") as f_in:
            with atomic_write("tests/input/generic.fa.gz",
                              mode="wb",
                              overwrite=True) as f_out:
                with gzip.open(f_out, "wb") as f_gzip:
                    shutil.copyfileobj(f_in, f_gzip)

        seq = ReferenceSequence(ID="1", path="tests/input/generic.fa.gz")

        r._reference_sequences["GRCh37"]["1"] = seq

        # save phased data to VCF
        assert os.path.relpath(
            s.save_snps(vcf=True)) == "output/vcf_GRCh37.vcf"
        # read saved VCF
        s = SNPs("output/vcf_GRCh37.vcf")
        assert s.phased
        pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf())
예제 #3
0
    def test_save_snps_vcf_discrepant_pos(self):
        s = SNPs("tests/input/testvcf.vcf")

        r = Resources()
        r._reference_sequences["GRCh37"] = {}

        with tempfile.TemporaryDirectory() as tmpdir:
            dest = os.path.join(tmpdir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", dest)

            seq = ReferenceSequence(ID="1", path=dest)

            r._reference_sequences["GRCh37"]["1"] = seq

            # create discrepant SNPs by setting positions outside reference sequence
            s._snps.loc["rs1", "pos"] = 0
            s._snps.loc["rs17", "pos"] = 118

            self.assertEqual(os.path.relpath(s.save(vcf=True)),
                             f"output{os.sep}vcf_GRCh37.vcf")

        pd.testing.assert_frame_equal(
            s.discrepant_vcf_position,
            self.create_snp_df(
                rsid=["rs1", "rs17"],
                chrom=["1", "1"],
                pos=[0, 118],
                genotype=["AA", np.nan],
            ),
            check_exact=True,
        )

        expected = self.generic_snps_vcf().drop(["rs1", "rs17"])
        self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf", snps_df=expected)
예제 #4
0
    def _setup_gsa_test(resources_dir):
        # reset resource if already loaded
        r = Resources()
        r._resources_dir = resources_dir
        r._gsa_resources = {}

        gzip_file(
            "tests/resources/gsa_rsid_map.txt",
            os.path.join(resources_dir, "gsa_rsid_map.txt.gz"),
        )
        gzip_file(
            "tests/resources/gsa_chrpos_map.txt",
            os.path.join(resources_dir, "gsa_chrpos_map.txt.gz"),
        )
예제 #5
0
    def run(self, result=None):
        # set resources directory based on if downloads are being performed
        # https://stackoverflow.com/a/11180583

        self.resource = Resources()
        self._reset_resource()
        if self.downloads_enabled:
            self.resource._resources_dir = "resources"
            super().run(result)
        else:
            # use a temporary directory for test resource data
            with tempfile.TemporaryDirectory() as tmpdir:
                self.resource._resources_dir = tmpdir
                super().run(result)
                self.resource._resources_dir = "resources"
예제 #6
0
    def _setup_gsa_test(resources_dir):
        # reset resource if already loaded
        r = Resources()
        r._resources_dir = resources_dir
        r._init_resource_attributes()

        gzip_file(
            "tests/resources/gsa_rsid_map.txt",
            os.path.join(resources_dir, "gsa_rsid_map.txt.gz"),
        )
        gzip_file(
            "tests/resources/gsa_chrpos_map.txt",
            os.path.join(resources_dir, "gsa_chrpos_map.txt.gz"),
        )
        gzip_file(
            "tests/resources/dbsnp_151_37_reverse.txt",
            os.path.join(resources_dir, "dbsnp_151_37_reverse.txt.gz"),
        )
예제 #7
0
    def test_save_snps_vcf(self):
        s = SNPs("tests/input/testvcf.vcf")

        r = Resources()
        r._reference_sequences["GRCh37"] = {}

        with tempfile.TemporaryDirectory() as tmpdir:
            dest = os.path.join(tmpdir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", dest)

            seq = ReferenceSequence(ID="1", path=dest)

            r._reference_sequences["GRCh37"]["1"] = seq

            self.assertEqual(os.path.relpath(s.save(vcf=True)),
                             f"output{os.sep}vcf_GRCh37.vcf")

        self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf")
예제 #8
0
    def test_save_snps_vcf(self):
        with tempfile.TemporaryDirectory() as tmpdir1:
            s = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

            r = Resources()
            r._reference_sequences["GRCh37"] = {}

            output = os.path.join(tmpdir1, "vcf_GRCh37.vcf")
            with tempfile.TemporaryDirectory() as tmpdir2:
                dest = os.path.join(tmpdir2, "generic.fa.gz")
                gzip_file("tests/input/generic.fa", dest)

                seq = ReferenceSequence(ID="1", path=dest)

                r._reference_sequences["GRCh37"]["1"] = seq

                self.assertEqual(s.save(vcf=True), output)

            self.run_parsing_tests_vcf(output)
예제 #9
0
    def test_load_opensnp_datadump_file(self):
        # temporarily set resources dir to tests
        r = Resources()
        r._resources_dir = "tests/resources"

        # write test openSNP datadump zip
        with atomic_write("tests/resources/opensnp_datadump.current.zip",
                          mode="wb",
                          overwrite=True) as f:
            with zipfile.ZipFile(f, "w") as f_zip:
                f_zip.write("tests/input/generic.csv", arcname="generic1.csv")
                f_zip.write("tests/input/generic.csv", arcname="generic2.csv")

        snps1 = SNPs(r.load_opensnp_datadump_file("generic1.csv"))
        snps2 = SNPs(r.load_opensnp_datadump_file("generic2.csv"))

        pd.testing.assert_frame_equal(snps1.snps, self.generic_snps())
        pd.testing.assert_frame_equal(snps2.snps, self.generic_snps())

        r._resources_dir = "resources"
예제 #10
0
    def test_save_snps_vcf_phased(self):
        # read phased data
        s = SNPs("tests/input/testvcf_phased.vcf")

        # setup resource to use test FASTA reference sequence
        r = Resources()
        r._reference_sequences["GRCh37"] = {}

        with tempfile.TemporaryDirectory() as tmpdir:
            dest = os.path.join(tmpdir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", dest)

            seq = ReferenceSequence(ID="1", path=dest)

            r._reference_sequences["GRCh37"]["1"] = seq

            # save phased data to VCF
            self.assertEqual(os.path.relpath(s.save(vcf=True)),
                             f"output{os.sep}vcf_GRCh37.vcf")

        # read saved VCF
        self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf", phased=True)
예제 #11
0
    def test_save_snps_vcf_discrepant_pos(self):
        with tempfile.TemporaryDirectory() as tmpdir1:
            s = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

            r = Resources()
            r._reference_sequences["GRCh37"] = {}

            output = os.path.join(tmpdir1, "vcf_GRCh37.vcf")
            with tempfile.TemporaryDirectory() as tmpdir2:
                dest = os.path.join(tmpdir2, "generic.fa.gz")
                gzip_file("tests/input/generic.fa", dest)

                seq = ReferenceSequence(ID="1", path=dest)

                r._reference_sequences["GRCh37"]["1"] = seq

                # create discrepant SNPs by setting positions outside reference sequence
                s._snps.loc["rs1", "pos"] = 0
                s._snps.loc["rs17", "pos"] = 118

                # esnure this is the right type after manual tweaking
                s._snps = s._snps.astype({"pos": np.uint32})

                self.assertEqual(s.save(vcf=True), output)

            pd.testing.assert_frame_equal(
                s.discrepant_vcf_position,
                self.create_snp_df(
                    rsid=["rs1", "rs17"],
                    chrom=["1", "1"],
                    pos=[0, 118],
                    genotype=["AA", np.nan],
                ),
                check_exact=True,
            )

            expected = self.generic_snps_vcf().drop(["rs1", "rs17"])
            self.run_parsing_tests_vcf(output, snps_df=expected)
예제 #12
0
class TestResources(BaseSNPsTestCase):
    def _reset_resource(self):
        self.resource._reference_sequences = {}
        self.resource._gsa_resources = {}
        self.resource._opensnp_datadump_filenames = []

    def run(self, result=None):
        # set resources directory based on if downloads are being performed
        # https://stackoverflow.com/a/11180583

        self.resource = Resources()
        self._reset_resource()
        if self.downloads_enabled:
            self.resource._resources_dir = "resources"
            super().run(result)
        else:
            # use a temporary directory for test resource data
            with tempfile.TemporaryDirectory() as tmpdir:
                self.resource._resources_dir = tmpdir
                super().run(result)
                self.resource._resources_dir = "resources"

    def test_get_assembly_mapping_data(self):
        def f():
            effects = [{"mappings": []} for _ in range(1, 26)]
            for k, v in self.NCBI36_GRCh37().items():
                effects[int(k) - 1] = v
            mock = Mock(side_effect=effects)
            with patch("snps.ensembl.EnsemblRestClient.perform_rest_action",
                       mock):
                return self.resource.get_assembly_mapping_data(
                    "NCBI36", "GRCh37")

        assembly_mapping_data = (self.resource.get_assembly_mapping_data(
            "NCBI36", "GRCh37") if self.downloads_enabled else f())

        self.assertEqual(len(assembly_mapping_data), 25)

    def test_get_gsa_resources(self):
        def f():
            # mock download of test data for each resource
            self._generate_test_gsa_resources()
            # load test resources saved to `tmpdir`
            return self.resource.get_gsa_resources()

        gsa_resources = (self.resource.get_gsa_resources()
                         if self.downloads_enabled else f())

        self.assertEqual(len(gsa_resources["rsid_map"]), 618539)
        self.assertEqual(len(gsa_resources["chrpos_map"]), 665607)

        # cleanup these test resources so other tests can use the file resources
        if os.path.exists("resources"):
            shutil.rmtree("resources")
        Singleton._instances = {}

    def _generate_test_gsa_resources(self):
        # Name RsID"
        s = ""
        for i in range(1, 618541):
            s += f"rs{i}\trs{i}\n"
        mock = mock_open(read_data=gzip.compress(s.encode()))
        with patch("urllib.request.urlopen", mock):
            self.resource.get_gsa_rsid()

        # Name Chr MapInfo deCODE(cM)
        s = ""
        for i in range(1, 665609):
            s += f"rs{i}\t1\t{i}\t0.0000\n"

        mock = mock_open(read_data=gzip.compress(s.encode()))
        with patch("urllib.request.urlopen", mock):
            self.resource.get_gsa_chrpos()

    def test_get_all_resources(self):
        def f():
            # mock download of test data for each resource
            self._generate_test_gsa_resources()

            # generate test data for permutations of remapping data
            effects = [{"mappings": []} for _ in range(1, 26)]
            for k, v in self.NCBI36_GRCh37().items():
                effects[int(k) - 1] = v
            mock = Mock(side_effect=effects * 6)
            with patch("snps.ensembl.EnsemblRestClient.perform_rest_action",
                       mock):
                return self.resource.get_all_resources()

        resources = self.resource.get_all_resources(
        ) if self.downloads_enabled else f()

        for k, v in resources.items():
            self.assertGreater(len(v), 0)

        # cleanup these test resources so other tests can use the file resources
        if os.path.exists("resources"):
            shutil.rmtree("resources")
        Singleton._instances = {}

    def test_download_example_datasets(self):
        def f():
            with patch("urllib.request.urlopen", mock_open(read_data=b"")):
                return self.resource.download_example_datasets()

        paths = (self.resource.download_example_datasets()
                 if self.downloads_enabled else f())

        for path in paths:
            if not path or not os.path.exists(path):
                warnings.warn("Example dataset(s) not currently available")
                return

    def test_get_paths_reference_sequences_invalid_assembly(self):
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly="36")
        self.assertFalse(assembly)
        self.assertFalse(chroms)
        self.assertFalse(urls)
        self.assertFalse(paths)

    def run_reference_sequences_test(self, f, assembly="GRCh37"):
        if self.downloads_enabled:
            f()
        else:
            s = f">MT dna:chromosome chromosome:{assembly}:MT:1:16569:1 REF\n"
            for i in range(276):
                s += "A" * 60
                s += "\n"
            s += "A" * 9
            s += "\n"
            with patch("urllib.request.urlopen",
                       mock_open(read_data=gzip.compress(s.encode()))):
                f()

    def run_create_reference_sequences_test(self, assembly_expect, url_expect):
        def f():
            (
                assembly,
                chroms,
                urls,
                paths,
            ) = self.resource._get_paths_reference_sequences(
                assembly=assembly_expect, chroms=["MT"])
            seqs = self.resource._create_reference_sequences(
                assembly, chroms, urls, paths)
            self.assertEqual(len(seqs), 1)
            self.assertEqual(
                seqs["MT"].__repr__(),
                f"ReferenceSequence(assembly='{assembly_expect}', ID='MT')",
            )
            self.assertEqual(seqs["MT"].ID, "MT")
            self.assertEqual(seqs["MT"].chrom, "MT")
            self.assertEqual(seqs["MT"].url, f"{url_expect}")
            self.assertEqual(
                seqs["MT"].path,
                os.path.relpath(
                    f'{os.path.join(self.resource._resources_dir,"fasta", assembly_expect,os.path.basename(url_expect))}'
                ),
            )
            self.assertTrue(os.path.exists(seqs["MT"].path))
            self.assertEqual(seqs["MT"].assembly, assembly_expect)
            self.assertEqual(seqs["MT"].build, f"B{assembly_expect[-2:]}")
            self.assertEqual(seqs["MT"].species, "H**o sapiens")
            self.assertEqual(seqs["MT"].taxonomy, "x")

        self.run_reference_sequences_test(f, assembly_expect)

    def test_create_reference_sequences_NCBI36(self):
        self.run_create_reference_sequences_test(
            "NCBI36",
            "ftp://ftp.ensembl.org/pub/release-54/fasta/homo_sapiens/dna/Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz",
        )

    def test_create_reference_sequences_GRCh37(self):
        self.run_create_reference_sequences_test(
            "GRCh37",
            "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
        )

    def test_create_reference_sequences_GRCh38(self):
        self.run_create_reference_sequences_test(
            "GRCh38",
            "ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz",
        )

    def test_create_reference_sequences_invalid_path(self):
        def f():
            (
                assembly,
                chroms,
                urls,
                paths,
            ) = self.resource._get_paths_reference_sequences(assembly="GRCh37",
                                                             chroms=["MT"])
            paths[0] = ""
            seqs = self.resource._create_reference_sequences(
                assembly, chroms, urls, paths)
            self.assertEqual(len(seqs), 0)

        self.run_reference_sequences_test(f)

    def test_download_file_socket_timeout(self):
        mock = Mock(side_effect=socket.timeout)
        with patch("urllib.request.urlopen", mock):
            path = self.resource._download_file("http://url", "test.txt")
        self.assertEqual(path, "")

    def test_download_file_URL_error(self):
        mock = Mock(side_effect=urllib.error.URLError("test error"))
        with patch("urllib.request.urlopen", mock):
            path1 = self.resource._download_file("http://url", "test.txt")
            path2 = self.resource._download_file("ftp://url", "test.txt")
        self.assertEqual(path1, "")
        self.assertEqual(path2, "")

    def test_get_reference_sequences(self):
        def f():
            seqs = self.resource.get_reference_sequences(chroms=["MT"])
            self.assertEqual(len(seqs), 1)
            self.assertEqual(seqs["MT"].__repr__(),
                             "ReferenceSequence(assembly='GRCh37', ID='MT')")
            self.assertEqual(seqs["MT"].ID, "MT")
            self.assertEqual(seqs["MT"].chrom, "MT")
            self.assertEqual(
                seqs["MT"].url,
                "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
            )
            self.assertEqual(
                seqs["MT"].path,
                os.path.relpath(
                    f'{os.path.join(self.resource._resources_dir,"fasta", "GRCh37","Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz")}'
                ),
            )
            self.assertTrue(os.path.exists(seqs["MT"].path))
            self.assertEqual(seqs["MT"].assembly, "GRCh37")
            self.assertEqual(seqs["MT"].build, "B37")
            self.assertEqual(seqs["MT"].species, "H**o sapiens")
            self.assertEqual(seqs["MT"].taxonomy, "x")

        self.run_reference_sequences_test(f)

    def test_get_all_reference_sequences(self):
        def f():
            seqs = self.resource.get_all_reference_sequences(chroms=["MT"])
            self.assertEqual(len(seqs), 3)
            self.assertEqual(len(seqs["NCBI36"]), 1)
            self.assertEqual(
                seqs["NCBI36"]["MT"].path,
                os.path.relpath(
                    os.path.join(
                        self.resource._resources_dir,
                        "fasta",
                        "NCBI36",
                        "Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz",
                    )),
            )
            self.assertEqual(len(seqs["GRCh37"]), 1)
            self.assertEqual(
                seqs["GRCh37"]["MT"].path,
                os.path.relpath(
                    os.path.join(
                        self.resource._resources_dir,
                        "fasta",
                        "GRCh37",
                        "Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
                    )),
            )
            self.assertEqual(len(seqs["GRCh38"]), 1)
            self.assertEqual(
                seqs["GRCh38"]["MT"].path,
                os.path.relpath(
                    os.path.join(
                        self.resource._resources_dir,
                        "fasta",
                        "GRCh38",
                        "Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz",
                    )),
            )

        self.run_reference_sequences_test(f)

    def test_get_reference_sequences_invalid_assembly(self):
        seqs = self.resource.get_reference_sequences(assembly="36")
        self.assertEqual(len(seqs), 0)

    def test_get_reference_sequences_chrom_not_available(self):
        def f():
            self.resource.get_reference_sequences(chroms=["MT"])
            del self.resource._reference_sequences["GRCh37"]["MT"]
            seqs = self.resource.get_reference_sequences(chroms=["MT"])
            self.assertEqual(len(seqs), 1)
            self.assertEqual(seqs["MT"].__repr__(),
                             "ReferenceSequence(assembly='GRCh37', ID='MT')")
            self.assertEqual(seqs["MT"].ID, "MT")
            self.assertEqual(seqs["MT"].chrom, "MT")
            self.assertEqual(
                seqs["MT"].url,
                "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
            )
            self.assertEqual(
                seqs["MT"].path,
                os.path.relpath(
                    os.path.join(
                        self.resource._resources_dir,
                        "fasta",
                        "GRCh37",
                        "Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
                    )),
            )
            self.assertTrue(os.path.exists(seqs["MT"].path))
            self.assertEqual(seqs["MT"].assembly, "GRCh37")
            self.assertEqual(seqs["MT"].build, "B37")
            self.assertEqual(seqs["MT"].species, "H**o sapiens")
            self.assertEqual(seqs["MT"].taxonomy, "x")

        self.run_reference_sequences_test(f)

    def run_reference_sequence_load_sequence_test(self, hash):
        def f():
            seqs = self.resource.get_reference_sequences(chroms=["MT"])
            self.assertEqual(len(seqs["MT"].sequence), 16569)
            self.assertEqual(seqs["MT"].md5, hash)
            self.assertEqual(seqs["MT"].start, 1)
            self.assertEqual(seqs["MT"].end, 16569)
            self.assertEqual(seqs["MT"].length, 16569)

            seqs["MT"].clear()
            self.assertEqual(seqs["MT"]._sequence.size, 0)
            self.assertEqual(seqs["MT"]._md5, "")
            self.assertEqual(seqs["MT"]._start, 0)
            self.assertEqual(seqs["MT"]._end, 0)
            self.assertEqual(seqs["MT"]._length, 0)

            self.assertEqual(len(seqs["MT"].sequence), 16569)
            self.assertEqual(seqs["MT"].md5, hash)
            self.assertEqual(seqs["MT"].start, 1)
            self.assertEqual(seqs["MT"].end, 16569)
            self.assertEqual(seqs["MT"].length, 16569)

        self.run_reference_sequences_test(f)

    def test_reference_sequence_load_sequence(self):
        if self.downloads_enabled:
            self.run_reference_sequence_load_sequence_test(
                "c68f52674c9fb33aef52dcf399755519")
        else:
            self.run_reference_sequence_load_sequence_test(
                "d432324413a21aa9247321c56c300ad3")

    def test_reference_sequence_generic_load_sequence(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            dest = os.path.join(tmpdir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", dest)

            seq = ReferenceSequence(ID="1", path=dest)
            self.assertEqual(seq.ID, "1")
            self.assertEqual(seq.chrom, "1")
            self.assertEqual(seq.path, dest)
            np.testing.assert_array_equal(
                seq.sequence,
                np.array(
                    bytearray(
                        "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACNNNNNNNN",
                        encoding="utf-8",
                        errors="strict",
                    ),
                    dtype=np.uint8,
                ),
            )
            self.assertListEqual(list("AGGCCGGAC"),
                                 list(map(chr, seq.sequence[100:109])))
            self.assertEqual(seq.md5, "6ac6176535ad0e38aba2d05d786c39b6")
            self.assertEqual(seq.start, 1)
            self.assertEqual(seq.end, 117)
            self.assertEqual(seq.length, 117)

    def test_get_opensnp_datadump_filenames(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            # temporarily set resources dir to tests
            self.resource._resources_dir = tmpdir

            # write test openSNP datadump zip
            with atomic_write(
                    os.path.join(tmpdir, "opensnp_datadump.current.zip"),
                    mode="wb",
                    overwrite=True,
            ) as f:
                with zipfile.ZipFile(f, "w") as f_zip:
                    f_zip.write("tests/input/generic.csv",
                                arcname="generic1.csv")
                    f_zip.write("tests/input/generic.csv",
                                arcname="generic2.csv")

            filenames = self.resource.get_opensnp_datadump_filenames()

            self.assertListEqual(filenames, ["generic1.csv", "generic2.csv"])

            self.resource._resources_dir = "resources"

    def test_load_opensnp_datadump_file(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            # temporarily set resources dir to tests
            self.resource._resources_dir = tmpdir

            # write test openSNP datadump zip
            with atomic_write(
                    os.path.join(tmpdir, "opensnp_datadump.current.zip"),
                    mode="wb",
                    overwrite=True,
            ) as f:
                with zipfile.ZipFile(f, "w") as f_zip:
                    f_zip.write("tests/input/generic.csv",
                                arcname="generic1.csv")
                    f_zip.write("tests/input/generic.csv",
                                arcname="generic2.csv")

            snps1 = SNPs(
                self.resource.load_opensnp_datadump_file("generic1.csv"))
            snps2 = SNPs(
                self.resource.load_opensnp_datadump_file("generic2.csv"))

            pd.testing.assert_frame_equal(snps1.snps,
                                          self.generic_snps(),
                                          check_exact=True)
            pd.testing.assert_frame_equal(snps2.snps,
                                          self.generic_snps(),
                                          check_exact=True)

            self.resource._resources_dir = "resources"
예제 #13
0
 def _teardown_gsa_test():
     r = Resources()
     r._resources_dir = "resources"
     r._gsa_resources = {}
예제 #14
0
 def _teardown_gsa_test():
     r = Resources()
     r._resources_dir = "resources"
     r._init_resource_attributes()
예제 #15
0
class TestResources(BaseSNPsTestCase):
    def setUp(self):
        self.resource = Resources(resources_dir="resources")
        self.del_output_dir_helper()

    def test_get_assembly_mapping_data(self):
        assembly_mapping_data = self.resource.get_assembly_mapping_data(
            "NCBI36", "GRCh37")
        assert len(assembly_mapping_data) == 25

    def test_get_gsa_resources(self):
        gsa_resources = self.resource.get_gsa_resources()
        assert len(gsa_resources["rsid_map"]) == 618541
        assert len(gsa_resources["chrpos_map"]) == 665609

    def test_get_all_resources(self):
        resources = self.resource.get_all_resources()

        for k, v in resources.items():
            if not v:
                assert False
        assert True

    def test_download_example_datasets(self):
        paths = self.resource.download_example_datasets()

        for path in paths:
            if not path or not os.path.exists(path):
                warnings.warn("Example dataset(s) not currently available")
                return

        assert True

    def test_get_paths_reference_sequences_invalid_assembly(self):
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly="36")
        assert not assembly
        assert not chroms
        assert not urls
        assert not paths

    def test_create_reference_sequences_NCBI36(self):
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly="NCBI36", chroms=["MT"])
        seqs = self.resource._create_reference_sequences(
            assembly, chroms, urls, paths)
        assert len(seqs) == 1
        assert seqs["MT"].__repr__(
        ) == "ReferenceSequence(assembly='NCBI36', ID='MT')"
        assert seqs["MT"].ID == "MT"
        assert seqs["MT"].chrom == "MT"
        assert (
            seqs["MT"].url ==
            "ftp://ftp.ensembl.org/pub/release-54/fasta/homo_sapiens/dna/Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz"
        )
        assert (
            seqs["MT"].path ==
            "resources/fasta/NCBI36/Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz"
        )
        assert os.path.exists(seqs["MT"].path)
        assert seqs["MT"].assembly == "NCBI36"
        assert seqs["MT"].build == "B36"
        assert seqs["MT"].species == "H**o sapiens"
        assert seqs["MT"].taxonomy == "x"

    def test_create_reference_sequences_GRCh37(self):
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly="GRCh37", chroms=["MT"])
        seqs = self.resource._create_reference_sequences(
            assembly, chroms, urls, paths)
        assert len(seqs) == 1
        assert seqs["MT"].__repr__(
        ) == "ReferenceSequence(assembly='GRCh37', ID='MT')"
        assert seqs["MT"].ID == "MT"
        assert seqs["MT"].chrom == "MT"
        assert (
            seqs["MT"].url ==
            "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert (
            seqs["MT"].path ==
            "resources/fasta/GRCh37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert os.path.exists(seqs["MT"].path)
        assert seqs["MT"].assembly == "GRCh37"
        assert seqs["MT"].build == "B37"
        assert seqs["MT"].species == "H**o sapiens"
        assert seqs["MT"].taxonomy == "x"

    def test_create_reference_sequences_GRCh38(self):
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly="GRCh38", chroms=["MT"])
        seqs = self.resource._create_reference_sequences(
            assembly, chroms, urls, paths)
        assert len(seqs) == 1
        assert seqs["MT"].__repr__(
        ) == "ReferenceSequence(assembly='GRCh38', ID='MT')"
        assert seqs["MT"].ID == "MT"
        assert seqs["MT"].chrom == "MT"
        assert (
            seqs["MT"].url ==
            "ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz"
        )
        assert (
            seqs["MT"].path ==
            "resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz"
        )
        assert os.path.exists(seqs["MT"].path)
        assert seqs["MT"].assembly == "GRCh38"
        assert seqs["MT"].build == "B38"
        assert seqs["MT"].species == "H**o sapiens"
        assert seqs["MT"].taxonomy == "x"

    def test_create_reference_sequences_invalid_path(self):
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly="GRCh38", chroms=["MT"])
        paths[0] = ""
        seqs = self.resource._create_reference_sequences(
            assembly, chroms, urls, paths)
        assert len(seqs) == 0

    def test_get_reference_sequences(self):
        seqs = self.resource.get_reference_sequences(chroms=["MT"])
        assert len(seqs) == 1
        assert seqs["MT"].__repr__(
        ) == "ReferenceSequence(assembly='GRCh37', ID='MT')"
        assert seqs["MT"].ID == "MT"
        assert seqs["MT"].chrom == "MT"
        assert (
            seqs["MT"].url ==
            "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert (
            seqs["MT"].path ==
            "resources/fasta/GRCh37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert os.path.exists(seqs["MT"].path)
        assert seqs["MT"].assembly == "GRCh37"
        assert seqs["MT"].build == "B37"
        assert seqs["MT"].species == "H**o sapiens"
        assert seqs["MT"].taxonomy == "x"

    def test_get_all_reference_sequences(self):
        seqs = self.resource.get_all_reference_sequences(chroms=["MT"])
        assert len(seqs) == 3
        assert len(seqs["NCBI36"]) == 1
        assert (
            seqs["NCBI36"]["MT"].path ==
            "resources/fasta/NCBI36/Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz"
        )
        assert len(seqs["GRCh37"]) == 1
        assert (
            seqs["GRCh37"]["MT"].path ==
            "resources/fasta/GRCh37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert len(seqs["GRCh38"]) == 1
        assert (
            seqs["GRCh38"]["MT"].path ==
            "resources/fasta/GRCh38/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz"
        )

    def test_get_reference_sequences_invalid_assembly(self):
        seqs = self.resource.get_reference_sequences(assembly="36")
        assert len(seqs) == 0

    def test_get_reference_sequences_chrom_not_available(self):
        self.resource.get_reference_sequences(chroms=["MT"])
        del self.resource._reference_sequences["GRCh37"]["MT"]
        seqs = self.resource.get_reference_sequences(chroms=["MT"])
        assert len(seqs) == 1
        assert seqs["MT"].__repr__(
        ) == "ReferenceSequence(assembly='GRCh37', ID='MT')"
        assert seqs["MT"].ID == "MT"
        assert seqs["MT"].chrom == "MT"
        assert (
            seqs["MT"].url ==
            "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert (
            seqs["MT"].path ==
            "resources/fasta/GRCh37/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
        )
        assert os.path.exists(seqs["MT"].path)
        assert seqs["MT"].assembly == "GRCh37"
        assert seqs["MT"].build == "B37"
        assert seqs["MT"].species == "H**o sapiens"
        assert seqs["MT"].taxonomy == "x"

    def test_reference_sequence_load_sequence(self):
        seqs = self.resource.get_reference_sequences(chroms=["MT"])
        assert len(seqs["MT"].sequence) == 16569
        assert seqs["MT"].md5 == "c68f52674c9fb33aef52dcf399755519"
        assert seqs["MT"].start == 1
        assert seqs["MT"].end == 16569
        assert seqs["MT"].length == 16569

        seqs["MT"].clear()
        assert seqs["MT"]._sequence.size == 0
        assert seqs["MT"]._md5 == ""
        assert seqs["MT"]._start == 0
        assert seqs["MT"]._end == 0
        assert seqs["MT"]._length == 0

        assert len(seqs["MT"].sequence) == 16569
        assert seqs["MT"].md5 == "c68f52674c9fb33aef52dcf399755519"
        assert seqs["MT"].start == 1
        assert seqs["MT"].end == 16569
        assert seqs["MT"].length == 16569

    def test_reference_sequence_generic_load_sequence(self):
        with open("tests/input/generic.fa", "rb") as f_in:
            with atomic_write("tests/input/generic.fa.gz",
                              mode="wb",
                              overwrite=True) as f_out:
                with gzip.open(f_out, "wb") as f_gzip:
                    shutil.copyfileobj(f_in, f_gzip)

        seq = ReferenceSequence(ID="1", path="tests/input/generic.fa.gz")
        assert seq.ID == "1"
        assert seq.chrom == "1"
        assert seq.path == "tests/input/generic.fa.gz"
        np.testing.assert_array_equal(
            seq.sequence,
            np.array(
                bytearray(
                    "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACNNNNNNNN",
                    encoding="utf-8",
                    errors="strict",
                ),
                dtype=np.uint8,
            ),
        )
        assert list("AGGCCGGAC") == list(map(chr, seq.sequence[100:109]))
        assert seq.md5 == "6ac6176535ad0e38aba2d05d786c39b6"
        assert seq.start == 1
        assert seq.end == 117
        assert seq.length == 117

    def test_get_opensnp_datadump_filenames(self):
        # temporarily set resources dir to tests
        self.resource._resources_dir = "tests/resources"

        # write test openSNP datadump zip
        with atomic_write("tests/resources/opensnp_datadump.current.zip",
                          mode="wb",
                          overwrite=True) as f:
            with zipfile.ZipFile(f, "w") as f_zip:
                f_zip.write("tests/input/generic.csv", arcname="generic1.csv")
                f_zip.write("tests/input/generic.csv", arcname="generic2.csv")

        filenames = self.resource.get_opensnp_datadump_filenames()

        assert filenames == ["generic1.csv", "generic2.csv"]

        self.resource._resources_dir = "resources"
예제 #16
0
from atomicwrites import atomic_write
import pandas as pd

from snps import SNPs
from snps.resources import Resources
from snps.utils import Parallelizer, save_df_as_csv, create_dir, clean_str

OUTPUT_DIR = "output"
EXTRACT_FILES = True

# create output directory for this example
create_dir(OUTPUT_DIR)

# assume script is being run from examples dir
r = Resources(resources_dir="../../resources")

# setup logger to output to file in output directory
logging.basicConfig(
    filename=f'{os.path.join(OUTPUT_DIR, "parse-opensnp-files.txt")}',
    format="%(asctime)s: %(message)s",
    filemode="w",
    level=logging.INFO,
)

logger = logging.getLogger()


def load_file(task):
    file = task["file"]
예제 #17
0
import numpy as np
from matplotlib import patches
import matplotlib.pyplot as plt
import pandas as pd

from snps import SNPs
from snps.resources import Resources
from snps.utils import Parallelizer, save_df_as_csv, create_dir

OUTPUT_DIR = "output"

# create output directory for this example
create_dir(OUTPUT_DIR)

# assume script is being run from examples dir
r = Resources(resources_dir="../../resources")

# setup logger to output to file in output directory
logging.basicConfig(
    filename=f"{os.path.join(OUTPUT_DIR, 'xy-chrom-snp-ratios.txt')}",
    format="%(asctime)s#%(message)s",
    filemode="w",
    level=logging.INFO,
)

logger = logging.getLogger()


def get_xy_chrom_snp_ratios(task):
    file = task["file"]
예제 #18
0
 def setUp(self):
     self.resource = Resources(resources_dir="resources")
     self.del_output_dir_helper()
예제 #19
0
""" Get a file from the openSNP datadump for debugging. """

import os

from atomicwrites import atomic_write

from snps.resources import Resources
from snps.utils import create_dir

OUTPUT_DIR = "output"
FILE = "user662_file340_yearofbirth_unknown_sex_unknown.23andme.txt"

if __name__ == "__main__":
    # create output directory for this example
    create_dir(OUTPUT_DIR)

    # assume script is being run from examples dir
    r = Resources(resources_dir="../../resources")

    with atomic_write(os.path.join(OUTPUT_DIR, FILE), mode="wb") as f:
        f.write(r.load_opensnp_datadump_file(FILE))
예제 #20
0
    filter_user_genotypes,
    get_1kg_samples,
    impute_missing,
    vcf2df,
)

DATA_DIR = "data"
OUTPUT_DIR = "output"
aisnp_SET = "kidd et al. 55 aisnps"  # {"kidd et al. 55 aisnps", "Seldin et al. 128 aisnps"}
DIMENSIONALITY_REDUCTION_ALGORITHM = "pca"  # {"pca", "umap", "t-SNE"}

# create output directory for this example
create_dir(OUTPUT_DIR)

# assume `opensnp_datadump.current.zip` is found at this location
r = Resources(resources_dir=DATA_DIR)

# setup logger to output to file in output directory
logging.basicConfig(
    filename=f'{os.path.join(OUTPUT_DIR, "opensnp_ancestry.txt")}',
    format="%(asctime)s: %(message)s",
    filemode="w",
    level=logging.INFO,
)


def main():
    logging.info("start analysis")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()