def test_merge_invalid_file(self):
    # Merge a loadable file followed by "empty.txt"; the merged data must
    # equal the GRCh37 fixture and only the first entry reports "merged".
    s = SNPs()
    sources = [
        SNPs("tests/input/GRCh37.csv"),
        SNPs("tests/input/empty.txt"),
    ]
    results = s.merge(sources)

    expected_snps = self.snps_GRCh37()
    pd.testing.assert_frame_equal(s.snps, expected_snps, check_exact=True)

    expected_results = [{"merged": True}, {}]
    self.assert_results(results, expected_results)
def test_merge_non_existent_file(self):
    # Merge a path that does not exist followed by a loadable file; the
    # merged data must equal the GRCh37 fixture and only the second entry
    # reports "merged".
    s = SNPs()
    sources = [
        SNPs("tests/input/non_existent_file.csv"),
        SNPs("tests/input/GRCh37.csv"),
    ]
    results = s.merge(sources)

    expected_snps = self.snps_GRCh37()
    pd.testing.assert_frame_equal(s.snps, expected_snps, check_exact=True)

    expected_results = [{}, {"merged": True}]
    self.assert_results(results, expected_results)
def test_save_snps_vcf_phased(self):
    # Round-trip phased VCF data through `save_snps(vcf=True)` and check
    # that the reloaded data is still phased and matches the VCF fixture.

    # read phased data
    s = SNPs("tests/input/testvcf_phased.vcf")

    # setup resource to use test FASTA reference sequence
    r = Resources()
    r._reference_sequences["GRCh37"] = {}
    with open("tests/input/generic.fa", "rb") as f_in:
        with atomic_write(
            "tests/input/generic.fa.gz", mode="wb", overwrite=True
        ) as f_out:
            with gzip.open(f_out, "wb") as f_gzip:
                shutil.copyfileobj(f_in, f_gzip)

    seq = ReferenceSequence(ID="1", path="tests/input/generic.fa.gz")
    r._reference_sequences["GRCh37"]["1"] = seq

    # save phased data to VCF; the expected relative path is built with
    # os.path.join because os.path.relpath returns OS-specific separators
    expected_path = os.path.join("output", "vcf_GRCh37.vcf")
    assert os.path.relpath(s.save_snps(vcf=True)) == expected_path

    # read saved VCF
    s = SNPs("output/vcf_GRCh37.vcf")
    assert s.phased
    pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf())
def test_merge_list(self):
    # Merge the same GRCh37 file twice: the data is unchanged, the source is
    # recorded once per merged file, and the second merge reports all four
    # rsids as common.
    s = SNPs()
    sources = [SNPs("tests/input/GRCh37.csv"), SNPs("tests/input/GRCh37.csv")]
    results = s.merge(sources)

    pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True)
    self.assertEqual(s.source, "generic, generic")
    self.assertListEqual(s._source, ["generic", "generic"])

    common_rsids = pd.Index(
        ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"],
        name="rsid",
    )
    expected_results = [
        {"merged": True},
        {"merged": True, "common_rsids": common_rsids},
    ]
    self.assert_results(results, expected_results)
def test_load_opensnp_datadump_file(self):
    # Build a small openSNP datadump zip in a scratch directory, load two
    # members through the resource object, and check that both parse to the
    # generic SNP fixture.
    with tempfile.TemporaryDirectory() as tmpdir:
        # temporarily set resources dir to the scratch directory
        self.resource._resources_dir = tmpdir
        try:
            # write test openSNP datadump zip
            with atomic_write(
                os.path.join(tmpdir, "opensnp_datadump.current.zip"),
                mode="wb",
                overwrite=True,
            ) as f:
                with zipfile.ZipFile(f, "w") as f_zip:
                    f_zip.write("tests/input/generic.csv", arcname="generic1.csv")
                    f_zip.write("tests/input/generic.csv", arcname="generic2.csv")

            snps1 = SNPs(self.resource.load_opensnp_datadump_file("generic1.csv"))
            snps2 = SNPs(self.resource.load_opensnp_datadump_file("generic2.csv"))

            pd.testing.assert_frame_equal(
                snps1.snps, self.generic_snps(), check_exact=True
            )
            pd.testing.assert_frame_equal(
                snps2.snps, self.generic_snps(), check_exact=True
            )
        finally:
            # always restore the resources dir, even when an assertion above
            # fails, so later tests do not point at the removed scratch dir
            self.resource._resources_dir = "resources"
def test_save_snps_csv_phased(self):
    # Round-trip phased VCF data through the default `save_snps()` output
    # and check that the reloaded data is still phased and matches the VCF
    # fixture.

    # read phased data
    s = SNPs("tests/input/testvcf_phased.vcf")

    # save phased data to CSV; the expected relative path is built with
    # os.path.join because os.path.relpath returns OS-specific separators
    expected_path = os.path.join("output", "vcf_GRCh37.csv")
    assert os.path.relpath(s.save_snps()) == expected_path

    # read saved CSV
    s = SNPs("output/vcf_GRCh37.csv")
    assert s.phased
    pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf())
def test_save_source(self):
    # Save GRCh38 data to the default output location, reload it, and check
    # that build, build detection, source and SNP data are as expected.
    original = SNPs("tests/input/GRCh38.csv")
    saved_relpath = os.path.relpath(original.save())
    self.assertEqual(saved_relpath, f"output{os.sep}generic_GRCh38.txt")

    snps = SNPs("output/generic_GRCh38.txt")
    self.assertEqual(snps.build, 38)
    self.assertTrue(snps.build_detected)
    self.assertEqual(snps.source, "generic")
    self.assertListEqual(snps._source, ["generic"])
    pd.testing.assert_frame_equal(
        snps.snps,
        self.snps_GRCh38(),
        check_exact=True,
    )
def f():
    # Merge GRCh37 data into NCBI36 data without remapping and check that
    # the four discrepant SNPs match the rsids reported by the merge.
    s = SNPs("tests/input/NCBI36.csv")
    other = SNPs("tests/input/GRCh37.csv")
    results = s.merge([other], remap=False)

    discrepant = s.discrepant_snps
    self.assertEqual(len(discrepant), 4)

    reported_rsids = results[0]["discrepant_position_rsids"]
    pd.testing.assert_index_equal(
        discrepant.index,
        reported_rsids,
        check_exact=True,
        check_names=True,
    )
def test_merge_exceed_discrepant_genotypes_threshold(self):
    # With a threshold of 0, a single differing genotype is expected to
    # leave the target data and its discrepancy tables untouched, and to
    # produce an empty result entry.
    s1 = SNPs("tests/input/generic.csv")
    s2 = SNPs("tests/input/generic.csv")
    s2._snps.loc["rs1", "genotype"] = "CC"

    results = s1.merge([s2], discrepant_genotypes_threshold=0)

    for discrepant_frame in (
        s1.discrepant_merge_positions,
        s1.discrepant_merge_genotypes,
        s1.discrepant_merge_positions_genotypes,
    ):
        self.assertEqual(len(discrepant_frame), 0)

    pd.testing.assert_frame_equal(
        s1.snps,
        self.generic_snps(),
        check_exact=True,
    )
    self.assert_results(results, [{}])
def test_save_source(self):
    # Save GRCh38 data into a scratch directory, reload it, and check that
    # build, build detection, source and SNP data are as expected.
    with tempfile.TemporaryDirectory() as tmpdir:
        original = SNPs("tests/input/GRCh38.csv", output_dir=tmpdir)
        dest = os.path.join(tmpdir, "generic_GRCh38.txt")
        saved_path = original.save()
        self.assertEqual(saved_path, dest)

        snps = SNPs(dest)
        self.assertEqual(snps.build, 38)
        self.assertTrue(snps.build_detected)
        self.assertEqual(snps.source, "generic")
        self.assertListEqual(snps._source, ["generic"])
        pd.testing.assert_frame_equal(
            snps.snps,
            self.snps_GRCh38(),
            check_exact=True,
        )
def test_merge_chrom(self):
    # Merge restricted to chromosome "Y": differences on other chromosomes
    # must be ignored, while Y differences are reported and applied.
    s1 = SNPs("tests/input/generic.csv")

    # pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0; the resulting frame is the same
    y_snps = self.create_snp_df(
        rsid=["rs100", "rs101", "rs102", "rs103"],
        chrom=["Y", "Y", "Y", "Y"],
        pos=[100, 101, 102, 103],
        genotype=["A", np.nan, "A", "A"],
    )
    df = pd.concat([s1.snps, y_snps])

    s1._snps = df.copy()

    s2 = SNPs()
    s2._build = 37
    s2._snps = df.copy()

    # set values for chrom that will be ignored (that would otherwise result in
    # identification of discrepant SNPs or updating genotype)
    s2._snps.loc["rs3", "pos"] = 1003  # discrepant position
    s2._snps.loc["rs4", "genotype"] = "AA"  # discrepant genotype
    s2._snps.loc["rs5", "genotype"] = "AA"

    # set values for chrom to be merged
    s2._snps.loc["rs100", "genotype"] = "T"  # discrepant genotype
    s2._snps.loc["rs101", "genotype"] = "A"
    s2._snps.loc["rs102", "pos"] = 1002  # discrepant position

    # set expected values for merge result
    df.loc["rs100", "genotype"] = np.nan  # discrepant genotype sets to np.nan
    df.loc["rs101", "genotype"] = "A"  # updates np.nan

    results = s1.merge([s2], chrom="Y")

    pd.testing.assert_frame_equal(s1.snps, df, check_exact=True)

    self.assert_results(
        results,
        [
            {
                "merged": True,
                "common_rsids": pd.Index(
                    ["rs100", "rs101", "rs102", "rs103"], name="rsid"
                ),
                "discrepant_position_rsids": pd.Index(["rs102"], name="rsid"),
                "discrepant_genotype_rsids": pd.Index(["rs100"], name="rsid"),
            }
        ],
    )

    self.assertEqual(len(s1.discrepant_merge_positions), 1)
    self.assertEqual(len(s1.discrepant_merge_genotypes), 1)
def test_source_snps(self):
    # Merge a 23andMe file into generic data, save the result, reload it,
    # and check that the combined source and the SNP data survive.
    s = SNPs("tests/input/GRCh37.csv")
    self.assertEqual(s.source, "generic")

    results = s.merge((SNPs("tests/input/23andme.txt"),))
    self.assertEqual(s.source, "generic, 23andMe")
    self.assertListEqual(s._source, ["generic", "23andMe"])

    # snapshot the merged data before saving so the reloaded file can be
    # compared against it
    merged_snps = s.snps.copy()

    self.assertEqual(
        os.path.relpath(s.save()), f"output{os.sep}generic__23andMe_GRCh37.txt"
    )

    reloaded = SNPs("output/generic__23andMe_GRCh37.txt")
    self.assertEqual(reloaded.source, "generic, 23andMe")
    self.assertListEqual(reloaded._source, ["generic", "23andMe"])

    # the previous check compared `s.snps` with itself and could never fail;
    # compare the reloaded data with the merged data instead
    pd.testing.assert_frame_equal(reloaded.snps, merged_snps, check_exact=True)
    self.assert_results(results, [{"merged": True}])
def test_source_snps(self):
    # Merge a 23andMe file into generic data, save the result into a scratch
    # directory, reload it, and check that the combined source and the SNP
    # data survive.
    with tempfile.TemporaryDirectory() as tmpdir:
        s = SNPs("tests/input/GRCh37.csv", output_dir=tmpdir)
        self.assertEqual(s.source, "generic")

        results = s.merge((SNPs("tests/input/23andme.txt"),))
        self.assertEqual(s.source, "generic, 23andMe")
        self.assertListEqual(s._source, ["generic", "23andMe"])

        # snapshot the merged data before saving so the reloaded file can be
        # compared against it
        merged_snps = s.snps.copy()

        dest = os.path.join(tmpdir, "generic__23andMe_GRCh37.txt")
        self.assertEqual(s.save(), dest)

        reloaded = SNPs(dest)
        self.assertEqual(reloaded.source, "generic, 23andMe")
        self.assertListEqual(reloaded._source, ["generic", "23andMe"])

        # the previous check compared `s.snps` with itself and could never
        # fail; compare the reloaded data with the merged data instead
        pd.testing.assert_frame_equal(
            reloaded.snps, merged_snps, check_exact=True
        )
        self.assert_results(results, [{"merged": True}])
def test_merge_remap_false(self):
    # Merge GRCh37 data into NCBI36 data with remapping disabled: all four
    # shared rsids are reported as position-discrepant and one as
    # genotype-discrepant.
    s = SNPs("tests/input/NCBI36.csv")
    results = s.merge([SNPs("tests/input/GRCh37.csv")], remap=False)
    first_result = results[0]

    self.assertEqual(len(s.discrepant_merge_positions), 4)
    pd.testing.assert_index_equal(
        s.discrepant_merge_positions.index,
        first_result["discrepant_position_rsids"],
        check_exact=True,
        check_names=True,
    )

    self.assertEqual(len(s.discrepant_merge_genotypes), 1)
    pd.testing.assert_index_equal(
        s.discrepant_merge_genotypes.index,
        first_result["discrepant_genotype_rsids"],
        check_exact=True,
        check_names=True,
    )

    self.assertEqual(len(s.discrepant_merge_positions_genotypes), 4)
    pd.testing.assert_index_equal(
        s.discrepant_merge_positions_genotypes.index,
        first_result["discrepant_position_rsids"],
        check_exact=True,
        check_names=True,
    )

    expected = self.snps_NCBI36()
    # discrepant genotype is set to null / NA
    expected.loc["rs11928389", "genotype"] = np.nan
    pd.testing.assert_frame_equal(s.snps, expected, check_exact=True)

    shared_rsids = ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"]
    expected_results = [
        {
            "merged": True,
            "common_rsids": pd.Index(shared_rsids, name="rsid"),
            "discrepant_position_rsids": pd.Index(shared_rsids, name="rsid"),
            "discrepant_genotype_rsids": pd.Index(["rs11928389"], name="rsid"),
        }
    ]
    self.assert_results(results, expected_results)
def test_remap_invalid_assembly(self):
    # Remapping to target -1 must leave the build and assembly unchanged and
    # report both chromosomes as not remapped.
    s = SNPs("tests/input/GRCh37.csv")
    remapped, not_remapped = s.remap(-1)

    self.assertEqual(s.build, 37)
    self.assertEqual(s.assembly, "GRCh37")
    self.assertEqual(len(remapped), 0)
    self.assertEqual(len(not_remapped), 2)
def f():
    # Save generic data as comma-separated output and run the standard
    # parsing checks on the written file.
    snps = SNPs("tests/input/generic.csv")
    saved_relpath = os.path.relpath(snps.save_snps(sep=","))
    self.assertEqual(saved_relpath, f"output{os.sep}generic_GRCh37.csv")
    self.run_parsing_tests("output/generic_GRCh37.csv", "generic")
def test_save_snps_phased(self):
    # Round-trip phased VCF data through the default `save()` output and run
    # the VCF parsing checks on the written file with phasing expected.

    # read phased data
    s = SNPs("tests/input/testvcf_phased.vcf")

    # save phased data to TSV; the expected relative path is built with
    # os.path.join because os.path.relpath returns OS-specific separators
    expected_path = os.path.join("output", "vcf_GRCh37.txt")
    self.assertEqual(os.path.relpath(s.save()), expected_path)

    # read saved TSV
    self.run_parsing_tests_vcf("output/vcf_GRCh37.txt", phased=True)
def test_remap_snps_invalid_assembly(self):
    # Calling `remap_snps` with target -1 must leave the build and assembly
    # unchanged and report both chromosomes as not remapped.
    s = SNPs("tests/input/GRCh37.csv")
    remapped, not_remapped = s.remap_snps(-1)

    assert s.build == 37
    assert s.assembly == "GRCh37"
    assert len(remapped) == 0
    assert len(not_remapped) == 2
def test_snps_unannotated_vcf(self):
    # https://samtools.github.io/hts-specs/VCFv4.2.pdf
    # this tests for homozygous snps, heterozygous snps, multiallelic snps,
    # phased snps, and snps with missing rsID
    loaded = SNPs("tests/input/unannotated_testvcf.vcf")

    assert loaded.source == "vcf"
    assert loaded.unannotated_vcf
def test_save_snps_csv_filename(self):
    # Save generic data under an explicit filename as comma-separated output
    # and run the standard parsing checks on the written file.
    snps = SNPs("tests/input/generic.csv")
    saved_relpath = os.path.relpath(snps.save("generic.csv", sep=","))
    self.assertEqual(saved_relpath, f"output{os.sep}generic.csv")
    self.run_parsing_tests("output/generic.csv", "generic")
def test_save_snps_vcf_false_positive_build(self):
    # Save a VCF, replace its "snps v..." source header line with a fixed
    # development-version string, and run the VCF parsing checks on the
    # modified file.
    with tempfile.TemporaryDirectory() as tmpdir1:
        snps = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

        r = Resources()
        r._reference_sequences["GRCh37"] = {}

        output = os.path.join(tmpdir1, "vcf_GRCh37.vcf")
        with tempfile.TemporaryDirectory() as tmpdir2:
            # gzip the test FASTA and register it as the chromosome 1 reference
            dest = os.path.join(tmpdir2, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", dest)
            r._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
                ID="1", path=dest
            )

            self.assertEqual(snps.save(vcf=True), output)

            # swap any line mentioning "snps v" for the fixed source header
            with open(output, "r") as f:
                rewritten = [
                    '##source="vcf; snps v1.2.3.post85.dev0+gb386302; https://pypi.org/project/snps/"\n'
                    if "snps v" in line
                    else line
                    for line in f.readlines()
                ]

            with open(output, "w") as f:
                f.write("".join(rewritten))

            self.run_parsing_tests_vcf(output)
def f():
    # `not_null_snps()` on the generic file must equal the generic fixture
    # with "rs5" removed.
    s = SNPs("tests/input/generic.csv")
    expected = self.generic_snps()
    expected.drop("rs5", inplace=True)
    pd.testing.assert_frame_equal(
        s.not_null_snps(),
        expected,
        check_exact=True,
    )
def test_save_snps_vcf_discrepant_pos(self):
    # Save a VCF after moving two SNPs to positions 0 and 118; both must be
    # reported in `discrepant_vcf_position` and be absent from the saved file.
    s = SNPs("tests/input/testvcf.vcf")

    r = Resources()
    r._reference_sequences["GRCh37"] = {}

    with tempfile.TemporaryDirectory() as tmpdir:
        # gzip the test FASTA and register it as the chromosome 1 reference
        dest = os.path.join(tmpdir, "generic.fa.gz")
        gzip_file("tests/input/generic.fa", dest)
        r._reference_sequences["GRCh37"]["1"] = ReferenceSequence(
            ID="1", path=dest
        )

        # create discrepant SNPs by setting positions outside reference sequence
        s._snps.loc["rs1", "pos"] = 0
        s._snps.loc["rs17", "pos"] = 118

        saved_relpath = os.path.relpath(s.save(vcf=True))
        self.assertEqual(saved_relpath, f"output{os.sep}vcf_GRCh37.vcf")

        expected_discrepant = self.create_snp_df(
            rsid=["rs1", "rs17"],
            chrom=["1", "1"],
            pos=[0, 118],
            genotype=["AA", np.nan],
        )
        pd.testing.assert_frame_equal(
            s.discrepant_vcf_position,
            expected_discrepant,
            check_exact=True,
        )

        expected = self.generic_snps_vcf().drop(["rs1", "rs17"])
        self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf", snps_df=expected)
def test_appending_dfs(self):
    # Merge an object into itself: the one-row `duplicate` and
    # `discrepant_XY` frames are each expected to contain that row twice,
    # while the frames that were never set stay empty.
    def single_snp():
        # build a fresh one-row frame for every assignment
        return self.create_snp_df(
            rsid=["rs1"], chrom=["1"], pos=[1], genotype=["AA"]
        )

    s = SNPs()
    s._snps = single_snp()
    s._duplicate = single_snp()
    s._discrepant_XY = single_snp()

    s.merge([s])

    doubled = self.create_snp_df(
        rsid=["rs1", "rs1"],
        chrom=["1", "1"],
        pos=[1, 1],
        genotype=["AA", "AA"],
    )
    pd.testing.assert_frame_equal(s.duplicate, doubled, check_exact=True)
    pd.testing.assert_frame_equal(s.discrepant_XY, doubled, check_exact=True)
    pd.testing.assert_frame_equal(
        s.heterozygous_MT, get_empty_snps_dataframe(), check_exact=True
    )
    pd.testing.assert_frame_equal(
        s.discrepant_vcf_position, get_empty_snps_dataframe(), check_exact=True
    )
def test__lookup_build_with_snp_pos_None(self):
    # `detect_build()` must be falsy for a single rs3094315 row at position 1.
    snps = SNPs()
    single_row = self.create_snp_df(
        rsid=["rs3094315"],
        chrom=["1"],
        pos=[1],
        genotype=["AA"],
    )
    snps._snps = single_row
    self.assertFalse(snps.detect_build())
def test_remap_37_to_37(self):
    # Remapping build 37 data to build 37 must remap nothing and leave the
    # build, assembly and SNP data unchanged.
    s = SNPs("tests/input/GRCh37.csv")
    remapped, not_remapped = s.remap(37)

    self.assertEqual(s.build, 37)
    self.assertEqual(s.assembly, "GRCh37")
    self.assertEqual(len(remapped), 0)
    self.assertEqual(len(not_remapped), 2)
    pd.testing.assert_frame_equal(
        s.snps,
        self.snps_GRCh37(),
        check_exact=True,
    )
def test_save_snps_tsv_filename(self):
    # Save generic data under an explicit filename as tab-separated output
    # in a scratch directory and run the standard parsing checks on it.
    with tempfile.TemporaryDirectory() as tmpdir:
        snps = SNPs("tests/input/generic.tsv", output_dir=tmpdir)
        dest = os.path.join(tmpdir, "generic.tsv")
        saved_path = snps.save("generic.tsv", sep="\t")
        self.assertEqual(saved_path, dest)
        self.run_parsing_tests(dest, "generic")
def f():
    # Remap NCBI36 data to build 37 with `parallelize=True`: both
    # chromosomes are remapped and the data matches the GRCh37 fixture.
    s = SNPs("tests/input/NCBI36.csv", parallelize=True)
    remapped, not_remapped = s.remap(37)

    self.assertEqual(s.build, 37)
    self.assertEqual(s.assembly, "GRCh37")
    self.assertEqual(len(remapped), 2)
    self.assertEqual(len(not_remapped), 0)
    pd.testing.assert_frame_equal(
        s.snps,
        self.snps_GRCh37(),
        check_exact=True,
    )
def test_sex_Male_Y_chrom(self):
    # Simulated Y-chromosome data, saved and reloaded, must yield sex "Male".
    simulated = self.simulate_snps(
        chrom="Y",
        pos_start=1,
        pos_max=59373566,
        pos_step=10000,
    )
    saved_path = simulated.save_snps()
    snps = SNPs(saved_path)
    assert snps.sex == "Male"
def f():
    # Remap GRCh37 data to build 36: both chromosomes are remapped and the
    # data matches the NCBI36 fixture.
    s = SNPs("tests/input/GRCh37.csv")
    remapped, not_remapped = s.remap(36)

    self.assertEqual(s.build, 36)
    self.assertEqual(s.assembly, "NCBI36")
    self.assertEqual(len(remapped), 2)
    self.assertEqual(len(not_remapped), 0)
    pd.testing.assert_frame_equal(
        s.snps,
        self.snps_NCBI36(),
        check_exact=True,
    )