class TestResources(BaseSNPsTestCase):
    """Tests for ``Resources`` and ``ReferenceSequence``.

    When ``self.downloads_enabled`` is falsy, network calls are replaced with
    mocks and resource files are written to a temporary directory (see
    ``run``); otherwise the tests use the ``"resources"`` directory and the
    real download code paths.

    NOTE(review): a second class named ``TestResources`` appears later in this
    source; if both live in the same module, the later definition shadows
    this one.
    """

    # Expected URLs of the mitochondrial (MT) reference sequence per assembly.
    _MT_URLS = {
        "NCBI36": "ftp://ftp.ensembl.org/pub/release-54/fasta/homo_sapiens/dna/Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz",
        "GRCh37": "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
        "GRCh38": "ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz",
    }

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------

    def _reset_resource(self):
        """Clear the cached data stored on ``self.resource``."""
        self.resource._reference_sequences = {}
        self.resource._gsa_resources = {}
        self.resource._opensnp_datadump_filenames = []

    def run(self, result=None):
        """Run the test with the resources directory configured for the mode.

        With downloads enabled, the ``"resources"`` directory is used.
        Otherwise a temporary directory holds the test resource data for the
        duration of the test, and ``_resources_dir`` is reset to
        ``"resources"`` afterwards, even if the run raises.

        Returns whatever the parent ``run`` returns.
        """
        # set resources directory based on if downloads are being performed
        # https://stackoverflow.com/a/11180583
        self.resource = Resources()
        self._reset_resource()

        if self.downloads_enabled:
            self.resource._resources_dir = "resources"
            return super().run(result)

        # use a temporary directory for test resource data
        with tempfile.TemporaryDirectory() as tmpdir:
            self.resource._resources_dir = tmpdir
            try:
                return super().run(result)
            finally:
                self.resource._resources_dir = "resources"

    def _mock_assembly_mapping_effects(self):
        """Return 25 mocked return values for ``perform_rest_action``.

        Every slot defaults to an empty ``"mappings"`` list; each entry of
        ``self.NCBI36_GRCh37()`` replaces the slot at index ``int(key) - 1``.
        """
        effects = [{"mappings": []} for _ in range(25)]
        for key, value in self.NCBI36_GRCh37().items():
            effects[int(key) - 1] = value
        return effects

    def _remove_file_resources(self):
        """Delete the ``"resources"`` directory and reset ``Singleton``."""
        # cleanup these test resources so other tests can use the file resources
        if os.path.exists("resources"):
            shutil.rmtree("resources")
        Singleton._instances = {}

    def _generate_test_gsa_resources(self):
        """Feed generated, gzip-compressed GSA tables to the GSA loaders.

        ``urllib.request.urlopen`` is patched so that ``get_gsa_rsid`` and
        ``get_gsa_chrpos`` read the generated data instead of downloading.
        """
        # columns: Name, RsID
        # NOTE(review): no header row is generated; test_get_gsa_resources
        # expects one entry fewer than the rows written here, so presumably
        # the loader consumes the first row as the header - confirm.
        rsid_rows = "".join(f"rs{i}\trs{i}\n" for i in range(1, 618541))
        mock = mock_open(read_data=gzip.compress(rsid_rows.encode()))
        with patch("urllib.request.urlopen", mock):
            self.resource.get_gsa_rsid()

        # columns: Name, Chr, MapInfo, deCODE(cM)
        chrpos_rows = "".join(
            f"rs{i}\t1\t{i}\t0.0000\n" for i in range(1, 665609)
        )
        mock = mock_open(read_data=gzip.compress(chrpos_rows.encode()))
        with patch("urllib.request.urlopen", mock):
            self.resource.get_gsa_chrpos()

    def _write_test_opensnp_datadump(self, directory):
        """Write a two-member test openSNP datadump zip into ``directory``."""
        with atomic_write(
            os.path.join(directory, "opensnp_datadump.current.zip"),
            mode="wb",
            overwrite=True,
        ) as f:
            with zipfile.ZipFile(f, "w") as f_zip:
                f_zip.write("tests/input/generic.csv", arcname="generic1.csv")
                f_zip.write("tests/input/generic.csv", arcname="generic2.csv")

    def _assert_mt_reference_sequence(self, seq, assembly, url):
        """Assert the metadata of the MT ``ReferenceSequence`` ``seq``.

        The expected path is built from the resources directory, ``assembly``
        and the file name of ``url``; the expected build is ``"B"`` followed
        by the last two characters of ``assembly``.
        """
        self.assertEqual(
            repr(seq), f"ReferenceSequence(assembly='{assembly}', ID='MT')"
        )
        self.assertEqual(seq.ID, "MT")
        self.assertEqual(seq.chrom, "MT")
        self.assertEqual(seq.url, url)
        self.assertEqual(
            seq.path,
            os.path.relpath(
                os.path.join(
                    self.resource._resources_dir,
                    "fasta",
                    assembly,
                    os.path.basename(url),
                )
            ),
        )
        self.assertTrue(os.path.exists(seq.path))
        self.assertEqual(seq.assembly, assembly)
        self.assertEqual(seq.build, f"B{assembly[-2:]}")
        self.assertEqual(seq.species, "Homo sapiens")
        self.assertEqual(seq.taxonomy, "x")

    def run_reference_sequences_test(self, f, assembly="GRCh37"):
        """Call ``f``, mocking the FASTA download unless downloads are enabled.

        The mocked payload is a gzip-compressed FASTA record for ``MT`` whose
        sequence is 16569 ``A`` characters (276 lines of 60 plus one of 9).
        """
        if self.downloads_enabled:
            f()
            return

        fasta = (
            f">MT dna:chromosome chromosome:{assembly}:MT:1:16569:1 REF\n"
            + ("A" * 60 + "\n") * 276
            + "A" * 9
            + "\n"
        )
        with patch(
            "urllib.request.urlopen",
            mock_open(read_data=gzip.compress(fasta.encode())),
        ):
            f()

    def run_create_reference_sequences_test(self, assembly_expect, url_expect):
        """Create the MT sequence for ``assembly_expect`` and check its metadata."""

        def f():
            (
                assembly,
                chroms,
                urls,
                paths,
            ) = self.resource._get_paths_reference_sequences(
                assembly=assembly_expect, chroms=["MT"]
            )
            seqs = self.resource._create_reference_sequences(
                assembly, chroms, urls, paths
            )
            self.assertEqual(len(seqs), 1)
            self._assert_mt_reference_sequence(
                seqs["MT"], assembly_expect, url_expect
            )

        self.run_reference_sequences_test(f, assembly_expect)

    def run_reference_sequence_load_sequence_test(self, hash):
        """Check that the MT sequence loads, clears, and lazily reloads.

        ``hash`` is the expected MD5 of the loaded sequence.
        """

        def assert_loaded(seq):
            self.assertEqual(len(seq.sequence), 16569)
            self.assertEqual(seq.md5, hash)
            self.assertEqual(seq.start, 1)
            self.assertEqual(seq.end, 16569)
            self.assertEqual(seq.length, 16569)

        def f():
            seqs = self.resource.get_reference_sequences(chroms=["MT"])
            assert_loaded(seqs["MT"])

            seqs["MT"].clear()
            self.assertEqual(seqs["MT"]._sequence.size, 0)
            self.assertEqual(seqs["MT"]._md5, "")
            self.assertEqual(seqs["MT"]._start, 0)
            self.assertEqual(seqs["MT"]._end, 0)
            self.assertEqual(seqs["MT"]._length, 0)

            # accessing the public attributes again must reload the sequence
            assert_loaded(seqs["MT"])

        self.run_reference_sequences_test(f)

    # ------------------------------------------------------------------
    # tests
    # ------------------------------------------------------------------

    def test_get_assembly_mapping_data(self):
        """Assembly mapping data for NCBI36 -> GRCh37 has 25 entries."""

        def f():
            mock = Mock(side_effect=self._mock_assembly_mapping_effects())
            with patch(
                "snps.ensembl.EnsemblRestClient.perform_rest_action", mock
            ):
                return self.resource.get_assembly_mapping_data(
                    "NCBI36", "GRCh37"
                )

        assembly_mapping_data = (
            self.resource.get_assembly_mapping_data("NCBI36", "GRCh37")
            if self.downloads_enabled
            else f()
        )

        self.assertEqual(len(assembly_mapping_data), 25)

    def test_get_gsa_resources(self):
        """GSA resources expose the expected number of map entries."""

        def f():
            # mock download of test data for each resource
            self._generate_test_gsa_resources()
            # load test resources saved to `tmpdir`
            return self.resource.get_gsa_resources()

        gsa_resources = (
            self.resource.get_gsa_resources()
            if self.downloads_enabled
            else f()
        )

        self.assertEqual(len(gsa_resources["rsid_map"]), 618539)
        self.assertEqual(len(gsa_resources["chrpos_map"]), 665607)

        self._remove_file_resources()

    def test_get_all_resources(self):
        """Every resource returned by ``get_all_resources`` is non-empty."""

        def f():
            # mock download of test data for each resource
            self._generate_test_gsa_resources()

            # generate test data for permutations of remapping data
            effects = self._mock_assembly_mapping_effects()
            mock = Mock(side_effect=effects * 6)
            with patch(
                "snps.ensembl.EnsemblRestClient.perform_rest_action", mock
            ):
                return self.resource.get_all_resources()

        resources = (
            self.resource.get_all_resources()
            if self.downloads_enabled
            else f()
        )

        for name, resource in resources.items():
            self.assertGreater(
                len(resource), 0, msg=f"resource {name!r} is empty"
            )

        self._remove_file_resources()

    def test_download_example_datasets(self):
        """Warn (rather than fail) if an example dataset is unavailable."""

        def f():
            with patch("urllib.request.urlopen", mock_open(read_data=b"")):
                return self.resource.download_example_datasets()

        paths = (
            self.resource.download_example_datasets()
            if self.downloads_enabled
            else f()
        )

        for path in paths:
            if not path or not os.path.exists(path):
                warnings.warn("Example dataset(s) not currently available")
                return

    def test_get_paths_reference_sequences_invalid_assembly(self):
        """An unknown assembly yields falsy values for all four results."""
        (
            assembly,
            chroms,
            urls,
            paths,
        ) = self.resource._get_paths_reference_sequences(assembly="36")
        self.assertFalse(assembly)
        self.assertFalse(chroms)
        self.assertFalse(urls)
        self.assertFalse(paths)

    def test_create_reference_sequences_NCBI36(self):
        self.run_create_reference_sequences_test(
            "NCBI36", self._MT_URLS["NCBI36"]
        )

    def test_create_reference_sequences_GRCh37(self):
        self.run_create_reference_sequences_test(
            "GRCh37", self._MT_URLS["GRCh37"]
        )

    def test_create_reference_sequences_GRCh38(self):
        self.run_create_reference_sequences_test(
            "GRCh38", self._MT_URLS["GRCh38"]
        )

    def test_create_reference_sequences_invalid_path(self):
        """An empty path results in no reference sequence being created."""

        def f():
            (
                assembly,
                chroms,
                urls,
                paths,
            ) = self.resource._get_paths_reference_sequences(
                assembly="GRCh37", chroms=["MT"]
            )
            paths[0] = ""
            seqs = self.resource._create_reference_sequences(
                assembly, chroms, urls, paths
            )
            self.assertEqual(len(seqs), 0)

        self.run_reference_sequences_test(f)

    def test_download_file_socket_timeout(self):
        """A socket timeout makes ``_download_file`` return an empty path."""
        mock = Mock(side_effect=socket.timeout)
        with patch("urllib.request.urlopen", mock):
            path = self.resource._download_file("http://url", "test.txt")
        self.assertEqual(path, "")

    def test_download_file_URL_error(self):
        """A ``URLError`` makes ``_download_file`` return an empty path."""
        mock = Mock(side_effect=urllib.error.URLError("test error"))
        with patch("urllib.request.urlopen", mock):
            path1 = self.resource._download_file("http://url", "test.txt")
            path2 = self.resource._download_file("ftp://url", "test.txt")
        self.assertEqual(path1, "")
        self.assertEqual(path2, "")

    def test_get_reference_sequences(self):
        """The default assembly's MT sequence has the expected metadata."""

        def f():
            seqs = self.resource.get_reference_sequences(chroms=["MT"])
            self.assertEqual(len(seqs), 1)
            self._assert_mt_reference_sequence(
                seqs["MT"], "GRCh37", self._MT_URLS["GRCh37"]
            )

        self.run_reference_sequences_test(f)

    def test_get_all_reference_sequences(self):
        """MT sequences for all three assemblies are stored at expected paths."""

        def f():
            seqs = self.resource.get_all_reference_sequences(chroms=["MT"])
            self.assertEqual(len(seqs), 3)
            for assembly, url in self._MT_URLS.items():
                self.assertEqual(len(seqs[assembly]), 1)
                self.assertEqual(
                    seqs[assembly]["MT"].path,
                    os.path.relpath(
                        os.path.join(
                            self.resource._resources_dir,
                            "fasta",
                            assembly,
                            os.path.basename(url),
                        )
                    ),
                )

        self.run_reference_sequences_test(f)

    def test_get_reference_sequences_invalid_assembly(self):
        """An unknown assembly yields no reference sequences."""
        seqs = self.resource.get_reference_sequences(assembly="36")
        self.assertEqual(len(seqs), 0)

    def test_get_reference_sequences_chrom_not_available(self):
        """A sequence removed from the cache is available again on request."""

        def f():
            self.resource.get_reference_sequences(chroms=["MT"])
            del self.resource._reference_sequences["GRCh37"]["MT"]
            seqs = self.resource.get_reference_sequences(chroms=["MT"])
            self.assertEqual(len(seqs), 1)
            self._assert_mt_reference_sequence(
                seqs["MT"], "GRCh37", self._MT_URLS["GRCh37"]
            )

        self.run_reference_sequences_test(f)

    def test_reference_sequence_load_sequence(self):
        """Load the MT sequence; the expected MD5 depends on the data source."""
        if self.downloads_enabled:
            self.run_reference_sequence_load_sequence_test(
                "c68f52674c9fb33aef52dcf399755519"
            )
        else:
            # MD5 for the mocked sequence
            self.run_reference_sequence_load_sequence_test(
                "d432324413a21aa9247321c56c300ad3"
            )

    def test_reference_sequence_generic_load_sequence(self):
        """A gzipped generic FASTA file loads into the expected byte array."""
        with tempfile.TemporaryDirectory() as tmpdir:
            dest = os.path.join(tmpdir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", dest)

            seq = ReferenceSequence(ID="1", path=dest)
            self.assertEqual(seq.ID, "1")
            self.assertEqual(seq.chrom, "1")
            self.assertEqual(seq.path, dest)
            np.testing.assert_array_equal(
                seq.sequence,
                np.array(
                    bytearray(
                        "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACNNNNNNNN",
                        encoding="utf-8",
                        errors="strict",
                    ),
                    dtype=np.uint8,
                ),
            )
            self.assertListEqual(
                list("AGGCCGGAC"), list(map(chr, seq.sequence[100:109]))
            )
            self.assertEqual(seq.md5, "6ac6176535ad0e38aba2d05d786c39b6")
            self.assertEqual(seq.start, 1)
            self.assertEqual(seq.end, 117)
            self.assertEqual(seq.length, 117)

    def test_get_opensnp_datadump_filenames(self):
        """Filenames are read from the datadump zip in the resources dir."""
        with tempfile.TemporaryDirectory() as tmpdir:
            # temporarily point the resources dir at the temporary directory
            self.resource._resources_dir = tmpdir
            try:
                # write test openSNP datadump zip
                self._write_test_opensnp_datadump(tmpdir)

                filenames = self.resource.get_opensnp_datadump_filenames()
                self.assertListEqual(
                    filenames, ["generic1.csv", "generic2.csv"]
                )
            finally:
                # restore even when an assertion above fails
                self.resource._resources_dir = "resources"

    def test_load_opensnp_datadump_file(self):
        """Files loaded from the datadump zip parse to the generic SNPs."""
        with tempfile.TemporaryDirectory() as tmpdir:
            # temporarily point the resources dir at the temporary directory
            self.resource._resources_dir = tmpdir
            try:
                # write test openSNP datadump zip
                self._write_test_opensnp_datadump(tmpdir)

                snps1 = SNPs(
                    self.resource.load_opensnp_datadump_file("generic1.csv")
                )
                snps2 = SNPs(
                    self.resource.load_opensnp_datadump_file("generic2.csv")
                )
                for loaded in (snps1, snps2):
                    pd.testing.assert_frame_equal(
                        loaded.snps, self.generic_snps(), check_exact=True
                    )
            finally:
                # restore even when an assertion above fails
                self.resource._resources_dir = "resources"
class TestResources(BaseSNPsTestCase):
    """Tests for ``Resources`` and ``ReferenceSequence`` using real resources.

    Every test works against the ``"resources"`` directory without mocking
    the calls that fetch data.

    NOTE(review): another class named ``TestResources`` appears earlier in
    this source; if both live in the same module, this definition shadows the
    earlier one.
    """

    # Expected URLs of the mitochondrial (MT) reference sequence per assembly.
    _MT_URLS = {
        "NCBI36": "ftp://ftp.ensembl.org/pub/release-54/fasta/homo_sapiens/dna/Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz",
        "GRCh37": "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
        "GRCh38": "ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz",
    }

    def setUp(self):
        """Create the ``Resources`` under test and clear the output dir."""
        self.resource = Resources(resources_dir="resources")
        self.del_output_dir_helper()

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------

    def _expected_mt_path(self, assembly):
        """Return the expected MT FASTA path for ``assembly``.

        The path is ``resources/fasta/<assembly>/<file name of the MT URL>``.
        """
        filename = os.path.basename(self._MT_URLS[assembly])
        return f"resources/fasta/{assembly}/{filename}"

    def _assert_mt_reference_sequence(self, seq, assembly):
        """Assert the metadata of the MT ``ReferenceSequence`` ``seq``.

        The expected build is ``"B"`` followed by the last two characters of
        ``assembly``.
        """
        assert repr(seq) == f"ReferenceSequence(assembly='{assembly}', ID='MT')"
        assert seq.ID == "MT"
        assert seq.chrom == "MT"
        assert seq.url == self._MT_URLS[assembly]
        assert seq.path == self._expected_mt_path(assembly)
        assert os.path.exists(seq.path)
        assert seq.assembly == assembly
        assert seq.build == f"B{assembly[-2:]}"
        assert seq.species == "Homo sapiens"
        assert seq.taxonomy == "x"

    def _assert_create_reference_sequences(self, assembly_expect):
        """Create the MT sequence for ``assembly_expect`` and check it."""
        (
            assembly,
            chroms,
            urls,
            paths,
        ) = self.resource._get_paths_reference_sequences(
            assembly=assembly_expect, chroms=["MT"]
        )
        seqs = self.resource._create_reference_sequences(
            assembly, chroms, urls, paths
        )
        assert len(seqs) == 1
        self._assert_mt_reference_sequence(seqs["MT"], assembly_expect)

    # ------------------------------------------------------------------
    # tests
    # ------------------------------------------------------------------

    def test_get_assembly_mapping_data(self):
        """Assembly mapping data for NCBI36 -> GRCh37 has 25 entries."""
        assembly_mapping_data = self.resource.get_assembly_mapping_data(
            "NCBI36", "GRCh37"
        )
        assert len(assembly_mapping_data) == 25

    def test_get_gsa_resources(self):
        """GSA resources expose the expected number of map entries."""
        gsa_resources = self.resource.get_gsa_resources()
        assert len(gsa_resources["rsid_map"]) == 618541
        assert len(gsa_resources["chrpos_map"]) == 665609

    def test_get_all_resources(self):
        """Every resource returned by ``get_all_resources`` is truthy."""
        resources = self.resource.get_all_resources()
        for name, resource in resources.items():
            assert resource, f"resource {name!r} is empty"

    def test_download_example_datasets(self):
        """Warn (rather than fail) if an example dataset is unavailable."""
        paths = self.resource.download_example_datasets()
        for path in paths:
            if not path or not os.path.exists(path):
                warnings.warn("Example dataset(s) not currently available")
                return

    def test_get_paths_reference_sequences_invalid_assembly(self):
        """An unknown assembly yields falsy values for all four results."""
        (
            assembly,
            chroms,
            urls,
            paths,
        ) = self.resource._get_paths_reference_sequences(assembly="36")
        assert not assembly
        assert not chroms
        assert not urls
        assert not paths

    def test_create_reference_sequences_NCBI36(self):
        self._assert_create_reference_sequences("NCBI36")

    def test_create_reference_sequences_GRCh37(self):
        self._assert_create_reference_sequences("GRCh37")

    def test_create_reference_sequences_GRCh38(self):
        self._assert_create_reference_sequences("GRCh38")

    def test_create_reference_sequences_invalid_path(self):
        """An empty path results in no reference sequence being created."""
        (
            assembly,
            chroms,
            urls,
            paths,
        ) = self.resource._get_paths_reference_sequences(
            assembly="GRCh38", chroms=["MT"]
        )
        paths[0] = ""
        seqs = self.resource._create_reference_sequences(
            assembly, chroms, urls, paths
        )
        assert len(seqs) == 0

    def test_get_reference_sequences(self):
        """The default assembly's MT sequence has the expected metadata."""
        seqs = self.resource.get_reference_sequences(chroms=["MT"])
        assert len(seqs) == 1
        self._assert_mt_reference_sequence(seqs["MT"], "GRCh37")

    def test_get_all_reference_sequences(self):
        """MT sequences for all three assemblies are stored at expected paths."""
        seqs = self.resource.get_all_reference_sequences(chroms=["MT"])
        assert len(seqs) == 3
        for assembly in ("NCBI36", "GRCh37", "GRCh38"):
            assert len(seqs[assembly]) == 1
            assert seqs[assembly]["MT"].path == self._expected_mt_path(assembly)

    def test_get_reference_sequences_invalid_assembly(self):
        """An unknown assembly yields no reference sequences."""
        seqs = self.resource.get_reference_sequences(assembly="36")
        assert len(seqs) == 0

    def test_get_reference_sequences_chrom_not_available(self):
        """A sequence removed from the cache is available again on request."""
        self.resource.get_reference_sequences(chroms=["MT"])
        del self.resource._reference_sequences["GRCh37"]["MT"]
        seqs = self.resource.get_reference_sequences(chroms=["MT"])
        assert len(seqs) == 1
        self._assert_mt_reference_sequence(seqs["MT"], "GRCh37")

    def test_reference_sequence_load_sequence(self):
        """The MT sequence loads, clears, and lazily reloads."""

        def assert_loaded(seq):
            assert len(seq.sequence) == 16569
            assert seq.md5 == "c68f52674c9fb33aef52dcf399755519"
            assert seq.start == 1
            assert seq.end == 16569
            assert seq.length == 16569

        seqs = self.resource.get_reference_sequences(chroms=["MT"])
        assert_loaded(seqs["MT"])

        seqs["MT"].clear()
        assert seqs["MT"]._sequence.size == 0
        assert seqs["MT"]._md5 == ""
        assert seqs["MT"]._start == 0
        assert seqs["MT"]._end == 0
        assert seqs["MT"]._length == 0

        # accessing the public attributes again must reload the sequence
        assert_loaded(seqs["MT"])

    def test_reference_sequence_generic_load_sequence(self):
        """A gzipped generic FASTA file loads into the expected byte array."""
        # gzip the plain FASTA fixture next to the original
        with open("tests/input/generic.fa", "rb") as f_in:
            with atomic_write(
                "tests/input/generic.fa.gz", mode="wb", overwrite=True
            ) as f_out:
                with gzip.open(f_out, "wb") as f_gzip:
                    shutil.copyfileobj(f_in, f_gzip)

        seq = ReferenceSequence(ID="1", path="tests/input/generic.fa.gz")
        assert seq.ID == "1"
        assert seq.chrom == "1"
        assert seq.path == "tests/input/generic.fa.gz"
        np.testing.assert_array_equal(
            seq.sequence,
            np.array(
                bytearray(
                    "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGCCGGACNNNNNNNN",
                    encoding="utf-8",
                    errors="strict",
                ),
                dtype=np.uint8,
            ),
        )
        assert list("AGGCCGGAC") == list(map(chr, seq.sequence[100:109]))
        assert seq.md5 == "6ac6176535ad0e38aba2d05d786c39b6"
        assert seq.start == 1
        assert seq.end == 117
        assert seq.length == 117

    def test_get_opensnp_datadump_filenames(self):
        """Filenames are read from the datadump zip in the resources dir."""
        # temporarily set resources dir to tests
        self.resource._resources_dir = "tests/resources"
        try:
            # write test openSNP datadump zip
            with atomic_write(
                "tests/resources/opensnp_datadump.current.zip",
                mode="wb",
                overwrite=True,
            ) as f:
                with zipfile.ZipFile(f, "w") as f_zip:
                    f_zip.write(
                        "tests/input/generic.csv", arcname="generic1.csv"
                    )
                    f_zip.write(
                        "tests/input/generic.csv", arcname="generic2.csv"
                    )

            filenames = self.resource.get_opensnp_datadump_filenames()
            assert filenames == ["generic1.csv", "generic2.csv"]
        finally:
            # restore even when an assertion above fails
            self.resource._resources_dir = "resources"