def test_appending_dfs(self):
    """Check the auxiliary frames after merging a SNPs object with itself.

    ``duplicate`` and ``discrepant_XY`` are expected to hold the single
    fixture row twice, while ``heterozygous_MT`` and
    ``discrepant_vcf_position`` are expected to be empty.
    """

    def one_row():
        # each attribute gets its own freshly built single-row frame
        return self.create_snp_df(rsid=["rs1"], chrom=["1"], pos=[1], genotype=["AA"])

    s = SNPs()
    s._snps = one_row()
    s._duplicate = one_row()
    s._discrepant_XY = one_row()

    s.merge([s])

    doubled = self.create_snp_df(
        rsid=["rs1", "rs1"],
        chrom=["1", "1"],
        pos=[1, 1],
        genotype=["AA", "AA"],
    )
    for attr in ("duplicate", "discrepant_XY"):
        pd.testing.assert_frame_equal(getattr(s, attr), doubled, check_exact=True)
    for attr in ("heterozygous_MT", "discrepant_vcf_position"):
        pd.testing.assert_frame_equal(
            getattr(s, attr), get_empty_snps_dataframe(), check_exact=True
        )
def test_merge_invalid_file(self):
    """Check merging a valid file followed by ``tests/input/empty.txt``.

    The first merge is expected to succeed and the second to yield an empty
    result dict, leaving only the GRCh37 SNPs in the merged object.
    """
    s = SNPs()
    sources = [SNPs("tests/input/GRCh37.csv"), SNPs("tests/input/empty.txt")]
    results = s.merge(sources)

    pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True)

    expected_results = [{"merged": True}, {}]
    self.assert_results(results, expected_results)
def test_merge_list(self):
    """Check merging the same GRCh37 file twice into an empty SNPs object.

    The second merge is expected to report all four rsids as common, and the
    source is expected to list "generic" once per merged object.
    """
    s = SNPs()
    sources = [SNPs("tests/input/GRCh37.csv"), SNPs("tests/input/GRCh37.csv")]
    results = s.merge(sources)

    pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True)
    self.assertEqual(s.source, "generic, generic")
    self.assertListEqual(s._source, ["generic", "generic"])

    shared_rsids = pd.Index(
        ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"],
        name="rsid",
    )
    first_merge = {"merged": True}
    second_merge = {"merged": True, "common_rsids": shared_rsids}
    self.assert_results(results, [first_merge, second_merge])
def test_merge_non_existent_file(self):
    """Check merging a SNPs object built from a missing path, then a valid file.

    The first merge is expected to yield an empty result dict and the second
    to succeed, leaving the GRCh37 SNPs in the merged object.
    """
    s = SNPs()
    sources = [
        SNPs("tests/input/non_existent_file.csv"),
        SNPs("tests/input/GRCh37.csv"),
    ]
    results = s.merge(sources)

    pd.testing.assert_frame_equal(s.snps, self.snps_GRCh37(), check_exact=True)

    expected_results = [{}, {"merged": True}]
    self.assert_results(results, expected_results)
def f():
    """Merge GRCh37 data into NCBI36 data without remapping and check discrepancies.

    Four discrepant SNPs are expected, with an index equal to the
    ``discrepant_position_rsids`` reported by the merge.
    NOTE(review): relies on ``self`` from an enclosing scope not visible here.
    """
    s = SNPs("tests/input/NCBI36.csv")
    other = SNPs("tests/input/GRCh37.csv")
    results = s.merge([other], remap=False)

    discrepant = s.discrepant_snps
    self.assertEqual(len(discrepant), 4)

    reported_rsids = results[0]["discrepant_position_rsids"]
    pd.testing.assert_index_equal(
        discrepant.index,
        reported_rsids,
        check_exact=True,
        check_names=True,
    )
def test_merge_exceed_discrepant_genotypes_threshold(self):
    """Check a merge with ``discrepant_genotypes_threshold=0`` after editing rs1.

    The merge is expected to return an empty result dict, record no
    discrepancies and leave the original SNPs unchanged.
    """
    base = SNPs("tests/input/generic.csv")
    other = SNPs("tests/input/generic.csv")
    other._snps.loc["rs1", "genotype"] = "CC"

    results = base.merge([other], discrepant_genotypes_threshold=0)

    for attr in (
        "discrepant_merge_positions",
        "discrepant_merge_genotypes",
        "discrepant_merge_positions_genotypes",
    ):
        self.assertEqual(len(getattr(base, attr)), 0)

    pd.testing.assert_frame_equal(base.snps, self.generic_snps(), check_exact=True)
    self.assert_results(results, [{}])
def test_merge_chrom(self):
    """Check that ``merge(..., chrom="Y")`` only merges chromosome Y SNPs.

    Both objects start from the generic SNPs plus four chromosome Y rows.
    The second object is then edited on rows outside chromosome Y (expected
    to be ignored) and on chromosome Y rows (expected to be merged and
    reported as common / discrepant).
    """
    s1 = SNPs("tests/input/generic.csv")

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the extra
    # chromosome Y rows below the loaded SNPs in the same way
    df = pd.concat(
        [
            s1.snps,
            self.create_snp_df(
                rsid=["rs100", "rs101", "rs102", "rs103"],
                chrom=["Y", "Y", "Y", "Y"],
                pos=[100, 101, 102, 103],
                genotype=["A", np.nan, "A", "A"],
            ),
        ]
    )

    s1._snps = df.copy()

    s2 = SNPs()
    s2._build = 37
    s2._snps = df.copy()

    # set values for chrom that will be ignored (that would otherwise result in
    # identification of discrepant SNPs or updating genotype)
    s2._snps.loc["rs3", "pos"] = 1003  # discrepant position
    s2._snps.loc["rs4", "genotype"] = "AA"  # discrepant genotype
    s2._snps.loc["rs5", "genotype"] = "AA"

    # set values for chrom to be merged
    s2._snps.loc["rs100", "genotype"] = "T"  # discrepant genotype
    s2._snps.loc["rs101", "genotype"] = "A"
    s2._snps.loc["rs102", "pos"] = 1002  # discrepant position

    # set expected values for merge result
    df.loc["rs100", "genotype"] = np.nan  # discrepant genotype sets to np.nan
    df.loc["rs101", "genotype"] = "A"  # updates np.nan

    results = s1.merge([s2], chrom="Y")

    pd.testing.assert_frame_equal(s1.snps, df, check_exact=True)

    self.assert_results(
        results,
        [
            {
                "merged": True,
                "common_rsids": pd.Index(
                    ["rs100", "rs101", "rs102", "rs103"], name="rsid"
                ),
                "discrepant_position_rsids": pd.Index(["rs102"], name="rsid"),
                "discrepant_genotype_rsids": pd.Index(["rs100"], name="rsid"),
            }
        ],
    )

    self.assertEqual(len(s1.discrepant_merge_positions), 1)
    self.assertEqual(len(s1.discrepant_merge_genotypes), 1)
def test_source_snps(self):
    """Check that the merged source survives a save / reload round trip.

    A generic file is merged with a 23andMe file, saved into a temporary
    output directory and loaded again; the reloaded object is expected to
    report both sources and to contain the same SNPs that were saved.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        s = SNPs("tests/input/GRCh37.csv", output_dir=tmpdir)
        self.assertEqual(s.source, "generic")

        results = s.merge((SNPs("tests/input/23andme.txt"),))
        self.assertEqual(s.source, "generic, 23andMe")
        self.assertListEqual(s._source, ["generic", "23andMe"])

        dest = os.path.join(tmpdir, "generic__23andMe_GRCh37.txt")
        self.assertEqual(s.save(), dest)

        # keep the merged data so the reloaded frame is compared with what
        # was written; comparing s.snps with itself could never fail
        merged_snps = s.snps

        s = SNPs(dest)
        self.assertEqual(s.source, "generic, 23andMe")
        self.assertListEqual(s._source, ["generic", "23andMe"])
        pd.testing.assert_frame_equal(s.snps, merged_snps, check_exact=True)

        self.assert_results(results, [{"merged": True}])
def test_source_snps(self):
    """Check that the merged source survives a save / reload round trip.

    A generic file is merged with a 23andMe file, saved to the default
    output location and loaded again; the reloaded object is expected to
    report both sources and to contain the same SNPs that were saved.
    """
    s = SNPs("tests/input/GRCh37.csv")
    self.assertEqual(s.source, "generic")

    results = s.merge((SNPs("tests/input/23andme.txt"),))
    self.assertEqual(s.source, "generic, 23andMe")
    self.assertListEqual(s._source, ["generic", "23andMe"])

    self.assertEqual(
        os.path.relpath(s.save()), f"output{os.sep}generic__23andMe_GRCh37.txt"
    )

    # keep the merged data so the reloaded frame is compared with what was
    # written; comparing s.snps with itself could never fail
    merged_snps = s.snps

    s = SNPs("output/generic__23andMe_GRCh37.txt")
    self.assertEqual(s.source, "generic, 23andMe")
    self.assertListEqual(s._source, ["generic", "23andMe"])
    pd.testing.assert_frame_equal(s.snps, merged_snps, check_exact=True)

    self.assert_results(results, [{"merged": True}])
def test_merge_remap_false(self):
    """Check merging GRCh37 data into NCBI36 data with ``remap=False``.

    All four shared rsids are expected to be reported with discrepant
    positions, one of them (rs11928389) also with a discrepant genotype,
    and that genotype is expected to be nulled in the merged SNPs.
    """
    s = SNPs("tests/input/NCBI36.csv")
    other = SNPs("tests/input/GRCh37.csv")
    results = s.merge([other], remap=False)

    # (attribute on the merged object, expected row count, result key whose
    # index the attribute's index must equal)
    discrepancy_checks = (
        ("discrepant_merge_positions", 4, "discrepant_position_rsids"),
        ("discrepant_merge_genotypes", 1, "discrepant_genotype_rsids"),
        ("discrepant_merge_positions_genotypes", 4, "discrepant_position_rsids"),
    )
    for attr, expected_len, result_key in discrepancy_checks:
        self.assertEqual(len(getattr(s, attr)), expected_len)
        pd.testing.assert_index_equal(
            getattr(s, attr).index,
            results[0][result_key],
            check_exact=True,
            check_names=True,
        )

    expected = self.snps_NCBI36()
    # discrepant genotype is set to null / NA
    expected.loc["rs11928389", "genotype"] = np.nan
    pd.testing.assert_frame_equal(s.snps, expected, check_exact=True)

    shared_rsids = ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"]
    expected_result = {
        "merged": True,
        "common_rsids": pd.Index(shared_rsids, name="rsid"),
        "discrepant_position_rsids": pd.Index(shared_rsids, name="rsid"),
        "discrepant_genotype_rsids": pd.Index(["rs11928389"], name="rsid"),
    }
    self.assert_results(results, [expected_result])
def f():
    """Merge GRCh37 data into NCBI36 data with default options and check the result.

    No discrepant positions or genotypes are expected, the merged SNPs are
    expected to equal the NCBI36 fixture, and all four rsids are expected to
    be reported as common.
    NOTE(review): relies on ``self`` from an enclosing scope not visible here.
    """
    s = SNPs("tests/input/NCBI36.csv")
    other = SNPs("tests/input/GRCh37.csv")
    results = s.merge([other])

    for attr in ("discrepant_merge_positions", "discrepant_merge_genotypes"):
        self.assertEqual(len(getattr(s, attr)), 0)

    pd.testing.assert_frame_equal(s.snps, self.snps_NCBI36(), check_exact=True)

    shared_rsids = pd.Index(
        ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"],
        name="rsid",
    )
    expected_result = {"merged": True, "common_rsids": shared_rsids}
    self.assert_results(results, [expected_result])
def test_merge_unphased(self):
    """Check that merging into an object flagged as phased clears the flag.

    Two objects loaded from the same generic file are merged after setting
    ``_phased`` on the target; ``phased`` is expected to be false afterwards
    and all eight rsids are expected to be reported as common.
    """
    target = SNPs("tests/input/generic.csv")
    other = SNPs("tests/input/generic.csv")
    target._phased = True

    results = target.merge([other])

    self.assertFalse(target.phased)
    pd.testing.assert_frame_equal(target.snps, self.generic_snps(), check_exact=True)

    shared_rsids = pd.Index(
        ["rs1", "rs2", "rs3", "rs4", "rs5", "rs6", "rs7", "rs8"],
        name="rsid",
    )
    expected_result = {"merged": True, "common_rsids": shared_rsids}
    self.assert_results(results, [expected_result])
def test_merging_files_discrepant_snps(self):
    """Check discrepancy detection when merging two files built from a fixture.

    ``tests/input/discrepant_snps.csv`` holds, per rsid, the position and
    genotype for two files plus the expected discrepancy flags and merged
    values. The two files are written to a temporary directory, merged, and
    the merged object is compared against the expected columns.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        dest1 = os.path.join(tmpdir, "discrepant_snps1.csv")
        dest2 = os.path.join(tmpdir, "discrepant_snps2.csv")

        df = pd.read_csv(
            "tests/input/discrepant_snps.csv",
            skiprows=1,
            na_values="--",
            names=[
                "rsid",
                "chrom",
                "pos_file1",
                "pos_file2",
                "genotype_file1",
                "genotype_file2",
                "discrepant_position",
                "discrepant_genotype",
                "expected_position",
                "expected_genotype",
            ],
            index_col=0,
            dtype={
                "chrom": object,
                "pos_file1": np.uint32,
                "pos_file2": np.uint32,
                "discrepant_position": bool,
                "discrepant_genotype": bool,
                "expected_position": np.uint32,
            },
        )

        # split the fixture into the two input files to be merged
        df1 = df[["chrom", "pos_file1", "genotype_file1"]]
        df2 = df[["chrom", "pos_file2", "genotype_file2"]]

        df1.to_csv(dest1, na_rep="--", header=["chromosome", "position", "genotype"])
        df2.to_csv(dest2, na_rep="--", header=["chromosome", "position", "genotype"])

        s = SNPs()
        s.merge([SNPs(dest1), SNPs(dest2)])

        expected = df[
            [
                "chrom",
                "discrepant_position",
                "discrepant_genotype",
                "expected_position",
                "expected_genotype",
            ]
        ]
        expected = expected.rename(
            columns={"expected_position": "pos", "expected_genotype": "genotype"}
        )

        # run the expected frame through SNPs.sort() before comparing it
        # with the merged result
        expected_snps = SNPs()
        expected_snps._snps = expected
        expected_snps.sort()
        expected = expected_snps.snps

        # the discrepancy columns are read as bool, so they are used directly
        # as boolean masks
        pd.testing.assert_index_equal(
            s.discrepant_merge_positions.index,
            expected.loc[expected["discrepant_position"]].index,
        )

        pd.testing.assert_index_equal(
            s.discrepant_merge_genotypes.index,
            expected.loc[expected["discrepant_genotype"]].index,
        )

        pd.testing.assert_series_equal(s.snps["pos"], expected["pos"])
        pd.testing.assert_series_equal(s.snps["genotype"], expected["genotype"])