def test_save_snps_vcf_false_positive_build(self):
    """Save a VCF, replace its ``snps v`` header line, and re-parse the file.

    Every line of the saved VCF that contains ``snps v`` is replaced with a
    fixed ``##source`` header before the file is handed to
    ``run_parsing_tests_vcf``.
    """
    source_header = '##source="vcf; snps v1.2.3.post85.dev0+gb386302; https://pypi.org/project/snps/"\n'

    with tempfile.TemporaryDirectory() as tmpdir1:
        snps = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

        resources = Resources()
        resources._reference_sequences["GRCh37"] = {}

        output = os.path.join(tmpdir1, "vcf_GRCh37.vcf")
        with tempfile.TemporaryDirectory() as tmpdir2:
            # gzip the generic FASTA so it can act as the chromosome 1 reference
            fasta_gz = os.path.join(tmpdir2, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", fasta_gz)
            resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
                ID="1", path=fasta_gz
            )

            self.assertEqual(snps.save(vcf=True), output)

            # rewrite the saved file with the replacement header line
            with open(output, "r") as f:
                rewritten = [
                    source_header if "snps v" in line else line
                    for line in f.readlines()
                ]
            with open(output, "w") as f:
                f.write("".join(rewritten))

        self.run_parsing_tests_vcf(output)
def test_save_snps_vcf_phased(self):
    """Save phased VCF data, reload it, and compare with the generic frame."""
    fasta_gz = "tests/input/generic.fa.gz"

    # read phased data
    phased = SNPs("tests/input/testvcf_phased.vcf")

    # setup resource to use test FASTA reference sequence
    resources = Resources()
    resources._reference_sequences["GRCh37"] = {}
    with open("tests/input/generic.fa", "rb") as f_in, atomic_write(
        fasta_gz, mode="wb", overwrite=True
    ) as f_out, gzip.open(f_out, "wb") as f_gzip:
        shutil.copyfileobj(f_in, f_gzip)

    resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
        ID="1", path=fasta_gz
    )

    # save phased data to VCF
    saved_path = os.path.relpath(phased.save_snps(vcf=True))
    assert saved_path == "output/vcf_GRCh37.vcf"

    # read saved VCF
    reloaded = SNPs("output/vcf_GRCh37.vcf")
    assert reloaded.phased
    pd.testing.assert_frame_equal(reloaded.snps, self.generic_snps_vcf())
def test_save_snps_vcf_discrepant_pos(self):
    """Check that SNPs positioned outside the reference are reported and dropped.

    ``rs1`` and ``rs17`` are moved to positions 0 and 118 before saving; they
    must show up in ``discrepant_vcf_position`` and be absent from the
    re-parsed VCF.
    """
    snps = SNPs("tests/input/testvcf.vcf")

    resources = Resources()
    resources._reference_sequences["GRCh37"] = {}
    with tempfile.TemporaryDirectory() as fasta_dir:
        fasta_gz = os.path.join(fasta_dir, "generic.fa.gz")
        gzip_file("tests/input/generic.fa", fasta_gz)
        resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
            ID="1", path=fasta_gz
        )

        # create discrepant SNPs by setting positions outside reference sequence
        snps._snps.loc["rs1", "pos"] = 0
        snps._snps.loc["rs17", "pos"] = 118

        saved_path = os.path.relpath(snps.save(vcf=True))
        self.assertEqual(saved_path, f"output{os.sep}vcf_GRCh37.vcf")

    discrepant_expected = self.create_snp_df(
        rsid=["rs1", "rs17"],
        chrom=["1", "1"],
        pos=[0, 118],
        genotype=["AA", np.nan],
    )
    pd.testing.assert_frame_equal(
        snps.discrepant_vcf_position, discrepant_expected, check_exact=True
    )

    expected = self.generic_snps_vcf().drop(["rs1", "rs17"])
    self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf", snps_df=expected)
def _setup_gsa_test(resources_dir):
    """Point ``Resources`` at ``resources_dir`` and stage gzipped GSA test maps.

    Parameters
    ----------
    resources_dir : str
        directory that receives the gzipped copies of the GSA map files
    """
    # reset resource if already loaded
    resources = Resources()
    resources._resources_dir = resources_dir
    resources._gsa_resources = {}

    for filename in ("gsa_rsid_map.txt", "gsa_chrpos_map.txt"):
        gzip_file(
            f"tests/resources/{filename}",
            os.path.join(resources_dir, f"{filename}.gz"),
        )
def run(self, result=None):
    """Run the test with the resources directory chosen for this mode.

    With downloads enabled the shared ``resources`` directory is used;
    otherwise resource data is written to a temporary directory that is
    removed once the test has run.

    Parameters
    ----------
    result : unittest.TestResult, optional
        result collector forwarded to the parent ``run``

    Returns
    -------
    object
        whatever the parent ``run`` returns, so callers that rely on the
        returned result keep working
    """
    # set resources directory based on if downloads are being performed
    # https://stackoverflow.com/a/11180583
    self.resource = Resources()
    self._reset_resource()

    if self.downloads_enabled:
        self.resource._resources_dir = "resources"
        return super().run(result)

    # use a temporary directory for test resource data
    with tempfile.TemporaryDirectory() as tmpdir:
        self.resource._resources_dir = tmpdir
        try:
            return super().run(result)
        finally:
            # restore the default even if the run is interrupted
            self.resource._resources_dir = "resources"
def _setup_gsa_test(resources_dir):
    """Point ``Resources`` at ``resources_dir`` and stage gzipped test resources.

    Parameters
    ----------
    resources_dir : str
        directory that receives the gzipped copies of the test resource files
    """
    # reset resource if already loaded
    resources = Resources()
    resources._resources_dir = resources_dir
    resources._init_resource_attributes()

    staged_files = (
        "gsa_rsid_map.txt",
        "gsa_chrpos_map.txt",
        "dbsnp_151_37_reverse.txt",
    )
    for filename in staged_files:
        gzip_file(
            f"tests/resources/{filename}",
            os.path.join(resources_dir, f"{filename}.gz"),
        )
def test_save_snps_vcf(self):
    """Save SNPs as VCF against the generic reference and re-parse the output."""
    snps = SNPs("tests/input/testvcf.vcf")

    resources = Resources()
    resources._reference_sequences["GRCh37"] = {}
    with tempfile.TemporaryDirectory() as fasta_dir:
        # gzip the generic FASTA so it can act as the chromosome 1 reference
        fasta_gz = os.path.join(fasta_dir, "generic.fa.gz")
        gzip_file("tests/input/generic.fa", fasta_gz)
        resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
            ID="1", path=fasta_gz
        )

        saved_path = os.path.relpath(snps.save(vcf=True))
        self.assertEqual(saved_path, f"output{os.sep}vcf_GRCh37.vcf")

    self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf")
def test_save_snps_vcf(self):
    """Save SNPs as VCF into a temporary output dir and re-parse the output."""
    with tempfile.TemporaryDirectory() as tmpdir1:
        snps = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

        resources = Resources()
        resources._reference_sequences["GRCh37"] = {}

        output = os.path.join(tmpdir1, "vcf_GRCh37.vcf")
        with tempfile.TemporaryDirectory() as tmpdir2:
            # gzip the generic FASTA so it can act as the chromosome 1 reference
            fasta_gz = os.path.join(tmpdir2, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", fasta_gz)
            resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
                ID="1", path=fasta_gz
            )

            self.assertEqual(snps.save(vcf=True), output)

        self.run_parsing_tests_vcf(output)
def test_load_opensnp_datadump_file(self):
    """Load two files from a test openSNP datadump zip and check their SNPs.

    The resources directory is pointed at ``tests/resources`` for the
    duration of the test and always restored afterwards, so a failing
    assertion cannot leave the ``Resources`` object in the test configuration.
    """
    # temporarily set resources dir to tests
    r = Resources()
    r._resources_dir = "tests/resources"
    try:
        # write test openSNP datadump zip
        with atomic_write(
            "tests/resources/opensnp_datadump.current.zip",
            mode="wb",
            overwrite=True,
        ) as f:
            with zipfile.ZipFile(f, "w") as f_zip:
                f_zip.write("tests/input/generic.csv", arcname="generic1.csv")
                f_zip.write("tests/input/generic.csv", arcname="generic2.csv")

        snps1 = SNPs(r.load_opensnp_datadump_file("generic1.csv"))
        snps2 = SNPs(r.load_opensnp_datadump_file("generic2.csv"))
        pd.testing.assert_frame_equal(snps1.snps, self.generic_snps())
        pd.testing.assert_frame_equal(snps2.snps, self.generic_snps())
    finally:
        r._resources_dir = "resources"
def test_save_snps_vcf_phased(self):
    """Save phased VCF data and run the phased VCF parsing checks on it."""
    # read phased data
    phased = SNPs("tests/input/testvcf_phased.vcf")

    # setup resource to use test FASTA reference sequence
    resources = Resources()
    resources._reference_sequences["GRCh37"] = {}
    with tempfile.TemporaryDirectory() as fasta_dir:
        fasta_gz = os.path.join(fasta_dir, "generic.fa.gz")
        gzip_file("tests/input/generic.fa", fasta_gz)
        resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
            ID="1", path=fasta_gz
        )

        # save phased data to VCF
        saved_path = os.path.relpath(phased.save(vcf=True))
        self.assertEqual(saved_path, f"output{os.sep}vcf_GRCh37.vcf")

    # read saved VCF
    self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf", phased=True)
def test_save_snps_vcf_discrepant_pos(self):
    """Check that SNPs positioned outside the reference are reported and dropped.

    ``rs1`` and ``rs17`` are moved to positions 0 and 118 before saving; they
    must show up in ``discrepant_vcf_position`` and be absent from the
    re-parsed VCF.
    """
    with tempfile.TemporaryDirectory() as tmpdir1:
        snps = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

        resources = Resources()
        resources._reference_sequences["GRCh37"] = {}

        output = os.path.join(tmpdir1, "vcf_GRCh37.vcf")
        with tempfile.TemporaryDirectory() as tmpdir2:
            fasta_gz = os.path.join(tmpdir2, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", fasta_gz)
            resources._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
                ID="1", path=fasta_gz
            )

            # create discrepant SNPs by setting positions outside reference sequence
            snps._snps.loc["rs1", "pos"] = 0
            snps._snps.loc["rs17", "pos"] = 118

            # ensure this is the right type after manual tweaking
            snps._snps = snps._snps.astype({"pos": np.uint32})

            self.assertEqual(snps.save(vcf=True), output)

        discrepant_expected = self.create_snp_df(
            rsid=["rs1", "rs17"],
            chrom=["1", "1"],
            pos=[0, 118],
            genotype=["AA", np.nan],
        )
        pd.testing.assert_frame_equal(
            snps.discrepant_vcf_position, discrepant_expected, check_exact=True
        )

        expected = self.generic_snps_vcf().drop(["rs1", "rs17"])
        self.run_parsing_tests_vcf(output, snps_df=expected)
class TestResources(BaseSNPsTestCase):
    """Tests for ``Resources`` and ``ReferenceSequence``.

    When ``self.downloads_enabled`` is false, downloads are replaced with
    mocks and resource files are written to a temporary directory.
    """

    # Ensembl URL of the GRCh37 MT reference sequence expected by several tests
    _GRCH37_MT_URL = (
        "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/"
        "Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz"
    )

    def _reset_resource(self):
        """Clear the cached attributes on ``self.resource``."""
        self.resource._reference_sequences = {}
        self.resource._gsa_resources = {}
        self.resource._opensnp_datadump_filenames = []

    def run(self, result=None):
        """Run the test with the resources directory chosen for this mode.

        Returns whatever the parent ``run`` returns so callers that rely on
        the returned result keep working.
        """
        # set resources directory based on if downloads are being performed
        # https://stackoverflow.com/a/11180583
        self.resource = Resources()
        self._reset_resource()

        if self.downloads_enabled:
            self.resource._resources_dir = "resources"
            return super().run(result)

        # use a temporary directory for test resource data
        with tempfile.TemporaryDirectory() as tmpdir:
            self.resource._resources_dir = tmpdir
            try:
                return super().run(result)
            finally:
                # restore the default even if the run is interrupted
                self.resource._resources_dir = "resources"

    def _cleanup_file_resources(self):
        """Remove the ``resources`` directory and reset the singleton registry."""
        # cleanup these test resources so other tests can use the file resources
        if os.path.exists("resources"):
            shutil.rmtree("resources")
        Singleton._instances = {}

    def _mock_assembly_mapping_effects(self):
        """Return one mocked REST response per chromosome slot (25 in total)."""
        effects = [{"mappings": []} for _ in range(1, 26)]
        for k, v in self.NCBI36_GRCh37().items():
            effects[int(k) - 1] = v
        return effects

    def _assert_mt_reference_sequence(self, seqs, assembly, url):
        """Assert ``seqs`` holds exactly the expected MT reference sequence."""
        self.assertEqual(len(seqs), 1)
        mt = seqs["MT"]
        self.assertEqual(
            repr(mt), f"ReferenceSequence(assembly='{assembly}', ID='MT')"
        )
        self.assertEqual(mt.ID, "MT")
        self.assertEqual(mt.chrom, "MT")
        self.assertEqual(mt.url, url)
        self.assertEqual(
            mt.path,
            os.path.relpath(
                os.path.join(
                    self.resource._resources_dir,
                    "fasta",
                    assembly,
                    os.path.basename(url),
                )
            ),
        )
        self.assertTrue(os.path.exists(mt.path))
        self.assertEqual(mt.assembly, assembly)
        self.assertEqual(mt.build, f"B{assembly[-2:]}")
        self.assertEqual(mt.species, "Homo sapiens")
        self.assertEqual(mt.taxonomy, "x")

    def _assert_mt_sequence_loaded(self, seq, md5):
        """Assert the MT sequence is loaded with the expected hash and bounds."""
        self.assertEqual(len(seq.sequence), 16569)
        self.assertEqual(seq.md5, md5)
        self.assertEqual(seq.start, 1)
        self.assertEqual(seq.end, 16569)
        self.assertEqual(seq.length, 16569)

    def _write_test_opensnp_datadump(self, resources_dir):
        """Write a two-file test openSNP datadump zip into ``resources_dir``."""
        with atomic_write(
            os.path.join(resources_dir, "opensnp_datadump.current.zip"),
            mode="wb",
            overwrite=True,
        ) as f:
            with zipfile.ZipFile(f, "w") as f_zip:
                f_zip.write("tests/input/generic.csv", arcname="generic1.csv")
                f_zip.write("tests/input/generic.csv", arcname="generic2.csv")

    def test_get_assembly_mapping_data(self):
        def f():
            mock = Mock(side_effect=self._mock_assembly_mapping_effects())
            with patch("snps.ensembl.EnsemblRestClient.perform_rest_action", mock):
                return self.resource.get_assembly_mapping_data("NCBI36", "GRCh37")

        assembly_mapping_data = (
            self.resource.get_assembly_mapping_data("NCBI36", "GRCh37")
            if self.downloads_enabled
            else f()
        )

        self.assertEqual(len(assembly_mapping_data), 25)

    def test_get_gsa_resources(self):
        def f():
            # mock download of test data for each resource
            self._generate_test_gsa_resources()
            # load test resources saved to `tmpdir`
            return self.resource.get_gsa_resources()

        gsa_resources = (
            self.resource.get_gsa_resources() if self.downloads_enabled else f()
        )

        self.assertEqual(len(gsa_resources["rsid_map"]), 618539)
        self.assertEqual(len(gsa_resources["chrpos_map"]), 665607)

        self._cleanup_file_resources()

    def _generate_test_gsa_resources(self):
        """Feed generated GSA map data to the resource through a mocked download."""
        # Name	RsID
        s = "".join(f"rs{i}\trs{i}\n" for i in range(1, 618541))
        mock = mock_open(read_data=gzip.compress(s.encode()))
        with patch("urllib.request.urlopen", mock):
            self.resource.get_gsa_rsid()

        # Name	Chr	MapInfo	deCODE(cM)
        s = "".join(f"rs{i}\t1\t{i}\t0.0000\n" for i in range(1, 665609))
        mock = mock_open(read_data=gzip.compress(s.encode()))
        with patch("urllib.request.urlopen", mock):
            self.resource.get_gsa_chrpos()

    def test_get_all_resources(self):
        def f():
            # mock download of test data for each resource
            self._generate_test_gsa_resources()

            # generate test data for permutations of remapping data
            mock = Mock(side_effect=self._mock_assembly_mapping_effects() * 6)
            with patch("snps.ensembl.EnsemblRestClient.perform_rest_action", mock):
                return self.resource.get_all_resources()

        resources = (
            self.resource.get_all_resources() if self.downloads_enabled else f()
        )

        for v in resources.values():
            self.assertGreater(len(v), 0)

        self._cleanup_file_resources()

    def test_download_example_datasets(self):
        def f():
            with patch("urllib.request.urlopen", mock_open(read_data=b"")):
                return self.resource.download_example_datasets()

        paths = (
            self.resource.download_example_datasets()
            if self.downloads_enabled
            else f()
        )

        for path in paths:
            if not path or not os.path.exists(path):
                warnings.warn("Example dataset(s) not currently available")
                return

    def test_get_paths_reference_sequences_invalid_assembly(self):
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly="36"
        )
        self.assertFalse(assembly)
        self.assertFalse(chroms)
        self.assertFalse(urls)
        self.assertFalse(paths)

    def run_reference_sequences_test(self, f, assembly="GRCh37"):
        """Call ``f``, mocking the download with a generated MT FASTA if needed."""
        if self.downloads_enabled:
            f()
            return

        # 276 lines of 60 bases plus a final line of 9 bases: 16569 in total
        s = (
            f">MT dna:chromosome chromosome:{assembly}:MT:1:16569:1 REF\n"
            + ("A" * 60 + "\n") * 276
            + "A" * 9
            + "\n"
        )
        with patch(
            "urllib.request.urlopen",
            mock_open(read_data=gzip.compress(s.encode())),
        ):
            f()

    def run_create_reference_sequences_test(self, assembly_expect, url_expect):
        """Create the MT sequence for ``assembly_expect`` and verify its fields."""

        def f():
            (
                assembly,
                chroms,
                urls,
                paths,
            ) = self.resource._get_paths_reference_sequences(
                assembly=assembly_expect, chroms=["MT"]
            )
            seqs = self.resource._create_reference_sequences(
                assembly, chroms, urls, paths
            )
            self._assert_mt_reference_sequence(seqs, assembly_expect, url_expect)

        self.run_reference_sequences_test(f, assembly_expect)

    def test_create_reference_sequences_NCBI36(self):
        self.run_create_reference_sequences_test(
            "NCBI36",
            "ftp://ftp.ensembl.org/pub/release-54/fasta/homo_sapiens/dna/Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz",
        )

    def test_create_reference_sequences_GRCh37(self):
        self.run_create_reference_sequences_test("GRCh37", self._GRCH37_MT_URL)

    def test_create_reference_sequences_GRCh38(self):
        self.run_create_reference_sequences_test(
            "GRCh38",
            "ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz",
        )

    def test_create_reference_sequences_invalid_path(self):
        def f():
            (
                assembly,
                chroms,
                urls,
                paths,
            ) = self.resource._get_paths_reference_sequences(
                assembly="GRCh37", chroms=["MT"]
            )
            paths[0] = ""
            seqs = self.resource._create_reference_sequences(
                assembly, chroms, urls, paths
            )
            self.assertEqual(len(seqs), 0)

        self.run_reference_sequences_test(f)

    def test_download_file_socket_timeout(self):
        mock = Mock(side_effect=socket.timeout)
        with patch("urllib.request.urlopen", mock):
            path = self.resource._download_file("http://url", "test.txt")
        self.assertEqual(path, "")

    def test_download_file_URL_error(self):
        mock = Mock(side_effect=urllib.error.URLError("test error"))
        with patch("urllib.request.urlopen", mock):
            path1 = self.resource._download_file("http://url", "test.txt")
            path2 = self.resource._download_file("ftp://url", "test.txt")
        self.assertEqual(path1, "")
        self.assertEqual(path2, "")

    def test_get_reference_sequences(self):
        def f():
            seqs = self.resource.get_reference_sequences(chroms=["MT"])
            self._assert_mt_reference_sequence(seqs, "GRCh37", self._GRCH37_MT_URL)

        self.run_reference_sequences_test(f)

    def test_get_all_reference_sequences(self):
        expected_filenames = {
            "NCBI36": "Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz",
            "GRCh37": "Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
            "GRCh38": "Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz",
        }

        def f():
            seqs = self.resource.get_all_reference_sequences(chroms=["MT"])
            self.assertEqual(len(seqs), 3)
            for assembly, filename in expected_filenames.items():
                self.assertEqual(len(seqs[assembly]), 1)
                self.assertEqual(
                    seqs[assembly]["MT"].path,
                    os.path.relpath(
                        os.path.join(
                            self.resource._resources_dir,
                            "fasta",
                            assembly,
                            filename,
                        )
                    ),
                )

        self.run_reference_sequences_test(f)

    def test_get_reference_sequences_invalid_assembly(self):
        seqs = self.resource.get_reference_sequences(assembly="36")
        self.assertEqual(len(seqs), 0)

    def test_get_reference_sequences_chrom_not_available(self):
        def f():
            self.resource.get_reference_sequences(chroms=["MT"])
            # drop the cached sequence so the second call has to provide it again
            del self.resource._reference_sequences["GRCh37"]["MT"]
            seqs = self.resource.get_reference_sequences(chroms=["MT"])
            self._assert_mt_reference_sequence(seqs, "GRCh37", self._GRCH37_MT_URL)

        self.run_reference_sequences_test(f)

    def run_reference_sequence_load_sequence_test(self, hash):
        """Load, clear, and reload the MT sequence, checking it against ``hash``."""

        def f():
            seqs = self.resource.get_reference_sequences(chroms=["MT"])
            mt = seqs["MT"]
            self._assert_mt_sequence_loaded(mt, hash)

            mt.clear()
            self.assertEqual(mt._sequence.size, 0)
            self.assertEqual(mt._md5, "")
            self.assertEqual(mt._start, 0)
            self.assertEqual(mt._end, 0)
            self.assertEqual(mt._length, 0)

            # accessing the public attributes again must reload the sequence
            self._assert_mt_sequence_loaded(mt, hash)

        self.run_reference_sequences_test(f)

    def test_reference_sequence_load_sequence(self):
        if self.downloads_enabled:
            self.run_reference_sequence_load_sequence_test(
                "c68f52674c9fb33aef52dcf399755519"
            )
        else:
            self.run_reference_sequence_load_sequence_test(
                "d432324413a21aa9247321c56c300ad3"
            )

    def test_reference_sequence_generic_load_sequence(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            dest = os.path.join(tmpdir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", dest)

            seq = ReferenceSequence(ID="1", path=dest)
            self.assertEqual(seq.ID, "1")
            self.assertEqual(seq.chrom, "1")
            self.assertEqual(seq.path, dest)
            # 100 N + AGGCCGGAC + 8 N = 117 bases
            np.testing.assert_array_equal(
                seq.sequence,
                np.array(
                    bytearray(
                        "N" * 100 + "AGGCCGGAC" + "N" * 8,
                        encoding="utf-8",
                        errors="strict",
                    ),
                    dtype=np.uint8,
                ),
            )
            self.assertListEqual(
                list("AGGCCGGAC"), list(map(chr, seq.sequence[100:109]))
            )
            self.assertEqual(seq.md5, "6ac6176535ad0e38aba2d05d786c39b6")
            self.assertEqual(seq.start, 1)
            self.assertEqual(seq.end, 117)
            self.assertEqual(seq.length, 117)

    def test_get_opensnp_datadump_filenames(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            # temporarily set resources dir to tests
            self.resource._resources_dir = tmpdir

            # write test openSNP datadump zip
            self._write_test_opensnp_datadump(tmpdir)

            filenames = self.resource.get_opensnp_datadump_filenames()

            self.assertListEqual(filenames, ["generic1.csv", "generic2.csv"])

            self.resource._resources_dir = "resources"

    def test_load_opensnp_datadump_file(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            # temporarily set resources dir to tests
            self.resource._resources_dir = tmpdir

            # write test openSNP datadump zip
            self._write_test_opensnp_datadump(tmpdir)

            snps1 = SNPs(self.resource.load_opensnp_datadump_file("generic1.csv"))
            snps2 = SNPs(self.resource.load_opensnp_datadump_file("generic2.csv"))
            pd.testing.assert_frame_equal(
                snps1.snps, self.generic_snps(), check_exact=True
            )
            pd.testing.assert_frame_equal(
                snps2.snps, self.generic_snps(), check_exact=True
            )

            self.resource._resources_dir = "resources"
def _teardown_gsa_test():
    """Point ``Resources`` back at ``resources`` and drop loaded GSA data."""
    resources = Resources()
    resources._resources_dir = "resources"
    resources._gsa_resources = {}
def _teardown_gsa_test():
    """Point ``Resources`` back at ``resources`` and re-init its attributes."""
    resources = Resources()
    resources._resources_dir = "resources"
    resources._init_resource_attributes()
class TestResources(BaseSNPsTestCase):
    """Tests for ``Resources`` and ``ReferenceSequence`` using ``resources``."""

    # Ensembl URLs of the MT reference sequences, keyed by assembly
    _MT_URLS = {
        "NCBI36": "ftp://ftp.ensembl.org/pub/release-54/fasta/homo_sapiens/dna/Homo_sapiens.NCBI36.54.dna.chromosome.MT.fa.gz",
        "GRCh37": "ftp://ftp.ensembl.org/pub/grch37/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.chromosome.MT.fa.gz",
        "GRCh38": "ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.MT.fa.gz",
    }

    def setUp(self):
        self.resource = Resources(resources_dir="resources")
        self.del_output_dir_helper()

    def _mt_path(self, assembly):
        """Return the expected local path of the MT FASTA for ``assembly``."""
        filename = os.path.basename(self._MT_URLS[assembly])
        return f"resources/fasta/{assembly}/{filename}"

    def _assert_mt_reference_sequence(self, seqs, assembly):
        """Assert ``seqs`` holds exactly the expected MT reference sequence."""
        assert len(seqs) == 1
        mt = seqs["MT"]
        assert repr(mt) == f"ReferenceSequence(assembly='{assembly}', ID='MT')"
        assert mt.ID == "MT"
        assert mt.chrom == "MT"
        assert mt.url == self._MT_URLS[assembly]
        assert mt.path == self._mt_path(assembly)
        assert os.path.exists(mt.path)
        assert mt.assembly == assembly
        assert mt.build == f"B{assembly[-2:]}"
        assert mt.species == "Homo sapiens"
        assert mt.taxonomy == "x"

    def _create_mt_reference_sequences(self, assembly):
        """Create reference sequences for the MT chromosome of ``assembly``."""
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly=assembly, chroms=["MT"]
        )
        return self.resource._create_reference_sequences(
            assembly, chroms, urls, paths
        )

    def test_get_assembly_mapping_data(self):
        assembly_mapping_data = self.resource.get_assembly_mapping_data(
            "NCBI36", "GRCh37"
        )
        assert len(assembly_mapping_data) == 25

    def test_get_gsa_resources(self):
        gsa_resources = self.resource.get_gsa_resources()
        assert len(gsa_resources["rsid_map"]) == 618541
        assert len(gsa_resources["chrpos_map"]) == 665609

    def test_get_all_resources(self):
        resources = self.resource.get_all_resources()
        for name, resource in resources.items():
            assert resource, f"resource {name!r} is empty"

    def test_download_example_datasets(self):
        paths = self.resource.download_example_datasets()

        for path in paths:
            if not path or not os.path.exists(path):
                warnings.warn("Example dataset(s) not currently available")
                return

    def test_get_paths_reference_sequences_invalid_assembly(self):
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly="36"
        )
        assert not assembly
        assert not chroms
        assert not urls
        assert not paths

    def test_create_reference_sequences_NCBI36(self):
        seqs = self._create_mt_reference_sequences("NCBI36")
        self._assert_mt_reference_sequence(seqs, "NCBI36")

    def test_create_reference_sequences_GRCh37(self):
        seqs = self._create_mt_reference_sequences("GRCh37")
        self._assert_mt_reference_sequence(seqs, "GRCh37")

    def test_create_reference_sequences_GRCh38(self):
        seqs = self._create_mt_reference_sequences("GRCh38")
        self._assert_mt_reference_sequence(seqs, "GRCh38")

    def test_create_reference_sequences_invalid_path(self):
        assembly, chroms, urls, paths = self.resource._get_paths_reference_sequences(
            assembly="GRCh38", chroms=["MT"]
        )
        paths[0] = ""
        seqs = self.resource._create_reference_sequences(
            assembly, chroms, urls, paths
        )
        assert len(seqs) == 0

    def test_get_reference_sequences(self):
        seqs = self.resource.get_reference_sequences(chroms=["MT"])
        self._assert_mt_reference_sequence(seqs, "GRCh37")

    def test_get_all_reference_sequences(self):
        seqs = self.resource.get_all_reference_sequences(chroms=["MT"])
        assert len(seqs) == 3
        for assembly in ("NCBI36", "GRCh37", "GRCh38"):
            assert len(seqs[assembly]) == 1
            assert seqs[assembly]["MT"].path == self._mt_path(assembly)

    def test_get_reference_sequences_invalid_assembly(self):
        seqs = self.resource.get_reference_sequences(assembly="36")
        assert len(seqs) == 0

    def test_get_reference_sequences_chrom_not_available(self):
        self.resource.get_reference_sequences(chroms=["MT"])
        # drop the cached sequence so the second call has to provide it again
        del self.resource._reference_sequences["GRCh37"]["MT"]
        seqs = self.resource.get_reference_sequences(chroms=["MT"])
        self._assert_mt_reference_sequence(seqs, "GRCh37")

    def test_reference_sequence_load_sequence(self):
        def assert_loaded(seq):
            assert len(seq.sequence) == 16569
            assert seq.md5 == "c68f52674c9fb33aef52dcf399755519"
            assert seq.start == 1
            assert seq.end == 16569
            assert seq.length == 16569

        seqs = self.resource.get_reference_sequences(chroms=["MT"])
        mt = seqs["MT"]
        assert_loaded(mt)

        mt.clear()
        assert mt._sequence.size == 0
        assert mt._md5 == ""
        assert mt._start == 0
        assert mt._end == 0
        assert mt._length == 0

        # accessing the public attributes again must reload the sequence
        assert_loaded(mt)

    def test_reference_sequence_generic_load_sequence(self):
        with open("tests/input/generic.fa", "rb") as f_in:
            with atomic_write(
                "tests/input/generic.fa.gz", mode="wb", overwrite=True
            ) as f_out:
                with gzip.open(f_out, "wb") as f_gzip:
                    shutil.copyfileobj(f_in, f_gzip)

        seq = ReferenceSequence(ID="1", path="tests/input/generic.fa.gz")
        assert seq.ID == "1"
        assert seq.chrom == "1"
        assert seq.path == "tests/input/generic.fa.gz"
        # 100 N + AGGCCGGAC + 8 N = 117 bases
        np.testing.assert_array_equal(
            seq.sequence,
            np.array(
                bytearray(
                    "N" * 100 + "AGGCCGGAC" + "N" * 8,
                    encoding="utf-8",
                    errors="strict",
                ),
                dtype=np.uint8,
            ),
        )
        assert list("AGGCCGGAC") == list(map(chr, seq.sequence[100:109]))
        assert seq.md5 == "6ac6176535ad0e38aba2d05d786c39b6"
        assert seq.start == 1
        assert seq.end == 117
        assert seq.length == 117

    def test_get_opensnp_datadump_filenames(self):
        # temporarily set resources dir to tests
        self.resource._resources_dir = "tests/resources"
        try:
            # write test openSNP datadump zip
            with atomic_write(
                "tests/resources/opensnp_datadump.current.zip",
                mode="wb",
                overwrite=True,
            ) as f:
                with zipfile.ZipFile(f, "w") as f_zip:
                    f_zip.write("tests/input/generic.csv", arcname="generic1.csv")
                    f_zip.write("tests/input/generic.csv", arcname="generic2.csv")

            filenames = self.resource.get_opensnp_datadump_filenames()

            assert filenames == ["generic1.csv", "generic2.csv"]
        finally:
            # always restore the default, even when an assertion fails
            self.resource._resources_dir = "resources"
from atomicwrites import atomic_write import pandas as pd from snps import SNPs from snps.resources import Resources from snps.utils import Parallelizer, save_df_as_csv, create_dir, clean_str OUTPUT_DIR = "output" EXTRACT_FILES = True # create output directory for this example create_dir(OUTPUT_DIR) # assume script is being run from examples dir r = Resources(resources_dir="../../resources") # setup logger to output to file in output directory logging.basicConfig( filename=f'{os.path.join(OUTPUT_DIR, "parse-opensnp-files.txt")}', format="%(asctime)s: %(message)s", filemode="w", level=logging.INFO, ) logger = logging.getLogger() def load_file(task): file = task["file"]
import numpy as np from matplotlib import patches import matplotlib.pyplot as plt import pandas as pd from snps import SNPs from snps.resources import Resources from snps.utils import Parallelizer, save_df_as_csv, create_dir OUTPUT_DIR = "output" # create output directory for this example create_dir(OUTPUT_DIR) # assume script is being run from examples dir r = Resources(resources_dir="../../resources") # setup logger to output to file in output directory logging.basicConfig( filename=f"{os.path.join(OUTPUT_DIR, 'xy-chrom-snp-ratios.txt')}", format="%(asctime)s#%(message)s", filemode="w", level=logging.INFO, ) logger = logging.getLogger() def get_xy_chrom_snp_ratios(task): file = task["file"]
def setUp(self):
    """Create the ``Resources`` object for the test and clear the output dir."""
    resources = Resources(resources_dir="resources")
    self.resource = resources
    self.del_output_dir_helper()
""" Get a file from the openSNP datadump for debugging. """

import os

from atomicwrites import atomic_write

from snps.resources import Resources
from snps.utils import create_dir

OUTPUT_DIR = "output"
FILE = "user662_file340_yearofbirth_unknown_sex_unknown.23andme.txt"


def main():
    """Extract ``FILE`` from the openSNP datadump into ``OUTPUT_DIR``."""
    # create output directory for this example
    create_dir(OUTPUT_DIR)

    # assume script is being run from examples dir
    r = Resources(resources_dir="../../resources")

    # overwrite=True lets the script be re-run after the file has been extracted
    with atomic_write(
        os.path.join(OUTPUT_DIR, FILE), mode="wb", overwrite=True
    ) as f:
        f.write(r.load_opensnp_datadump_file(FILE))


if __name__ == "__main__":
    main()
filter_user_genotypes, get_1kg_samples, impute_missing, vcf2df, ) DATA_DIR = "data" OUTPUT_DIR = "output" aisnp_SET = "kidd et al. 55 aisnps" # {"kidd et al. 55 aisnps", "Seldin et al. 128 aisnps"} DIMENSIONALITY_REDUCTION_ALGORITHM = "pca" # {"pca", "umap", "t-SNE"} # create output directory for this example create_dir(OUTPUT_DIR) # assume `opensnp_datadump.current.zip` is found at this location r = Resources(resources_dir=DATA_DIR) # setup logger to output to file in output directory logging.basicConfig( filename=f'{os.path.join(OUTPUT_DIR, "opensnp_ancestry.txt")}', format="%(asctime)s: %(message)s", filemode="w", level=logging.INFO, ) def main(): logging.info("start analysis") # get filenames from openSNP data dump filenames = r.get_opensnp_datadump_filenames()