Пример #1
0
 def test_appending_dfs(self):
     """Check that merging an object with itself appends its auxiliary frames.

     ``_snps``, ``_duplicate`` and ``_discrepant_XY`` are each seeded with an
     identical single-row frame. After ``merge([s])`` the ``duplicate`` and
     ``discrepant_XY`` frames must contain that row twice, while
     ``heterozygous_MT`` and ``discrepant_vcf_position`` must equal the frame
     returned by ``get_empty_snps_dataframe()``.
     """

     def one_row_frame():
         # A fresh frame per call, so the seeded attributes never share
         # one DataFrame object.
         return self.create_snp_df(
             rsid=["rs1"], chrom=["1"], pos=[1], genotype=["AA"]
         )

     snps = SNPs()
     snps._snps = one_row_frame()
     snps._duplicate = one_row_frame()
     snps._discrepant_XY = one_row_frame()

     snps.merge([snps])

     doubled = self.create_snp_df(
         rsid=["rs1", "rs1"],
         chrom=["1", "1"],
         pos=[1, 1],
         genotype=["AA", "AA"],
     )

     # Frames that were seeded should now hold the seeded row twice.
     for attr in ("duplicate", "discrepant_XY"):
         pd.testing.assert_frame_equal(
             getattr(snps, attr), doubled, check_exact=True
         )

     # Frames that were never seeded should still be empty.
     for attr in ("heterozygous_MT", "discrepant_vcf_position"):
         pd.testing.assert_frame_equal(
             getattr(snps, attr), get_empty_snps_dataframe(), check_exact=True
         )
Пример #2
0
 def test_merge_invalid_file(self):
     """Merge a GRCh37 file followed by ``tests/input/empty.txt``.

     The merged SNPs must equal the GRCh37 fixture, the first result entry
     must report ``merged: True`` and the second (for the empty file) must be
     an empty dict.
     """
     target = SNPs()
     sources = [SNPs("tests/input/GRCh37.csv"), SNPs("tests/input/empty.txt")]

     results = target.merge(sources)

     pd.testing.assert_frame_equal(
         target.snps, self.snps_GRCh37(), check_exact=True
     )
     expected_results = [{"merged": True}, {}]
     self.assert_results(results, expected_results)
Пример #3
0
 def test_merge_list(self):
     """Merge the same GRCh37 file twice into an empty object.

     The merged SNPs must equal the GRCh37 fixture, the source must list
     ``generic`` twice, and the second result entry must report all four
     rsids as common with the first.
     """
     target = SNPs()
     first = SNPs("tests/input/GRCh37.csv")
     second = SNPs("tests/input/GRCh37.csv")

     results = target.merge([first, second])

     pd.testing.assert_frame_equal(
         target.snps, self.snps_GRCh37(), check_exact=True
     )
     self.assertEqual(target.source, "generic, generic")
     self.assertListEqual(target._source, ["generic", "generic"])

     # The second file fully overlaps the first, so every rsid is common.
     shared_rsids = pd.Index(
         ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"],
         name="rsid",
     )
     self.assert_results(
         results,
         [
             {"merged": True},
             {"merged": True, "common_rsids": shared_rsids},
         ],
     )
Пример #4
0
 def test_merge_non_existent_file(self):
     """Merge a path that does not exist followed by a GRCh37 file.

     The first result entry (for ``non_existent_file.csv``) must be an empty
     dict, the second must report ``merged: True``, and the merged SNPs must
     equal the GRCh37 fixture.
     """
     target = SNPs()
     sources = [
         SNPs("tests/input/non_existent_file.csv"),
         SNPs("tests/input/GRCh37.csv"),
     ]

     results = target.merge(sources)

     pd.testing.assert_frame_equal(
         target.snps, self.snps_GRCh37(), check_exact=True
     )
     expected_results = [{}, {"merged": True}]
     self.assert_results(results, expected_results)
Пример #5
0
 def f():
     """Merge GRCh37 into NCBI36 without remapping and check discrepant SNPs.

     Four discrepant SNPs are expected, and their index must match the
     ``discrepant_position_rsids`` reported for the merged file.

     NOTE(review): ``self`` is not a parameter; presumably this function is
     a closure defined inside a test method. Confirm against the caller.
     """
     base = SNPs("tests/input/NCBI36.csv")
     other = SNPs("tests/input/GRCh37.csv")

     results = base.merge([other], remap=False)

     discrepant = base.discrepant_snps
     self.assertEqual(len(discrepant), 4)
     pd.testing.assert_index_equal(
         discrepant.index,
         results[0]["discrepant_position_rsids"],
         check_exact=True,
         check_names=True,
     )
Пример #6
0
    def test_merge_exceed_discrepant_genotypes_threshold(self):
        """Merge with ``discrepant_genotypes_threshold=0`` and one discrepancy.

        The second object differs from the first only in the genotype of
        ``rs1``. The test expects no discrepant frames to be populated, the
        SNPs to remain equal to the generic fixture, and the result entry to
        be an empty dict.
        """
        base = SNPs("tests/input/generic.csv")
        other = SNPs("tests/input/generic.csv")
        # Introduce a single genotype discrepancy.
        other._snps.loc["rs1", "genotype"] = "CC"

        results = base.merge([other], discrepant_genotypes_threshold=0)

        for attr in (
            "discrepant_merge_positions",
            "discrepant_merge_genotypes",
            "discrepant_merge_positions_genotypes",
        ):
            self.assertEqual(len(getattr(base, attr)), 0)

        pd.testing.assert_frame_equal(
            base.snps, self.generic_snps(), check_exact=True
        )
        self.assert_results(results, [{}])
Пример #7
0
    def test_merge_chrom(self):
        """Check that ``merge(..., chrom="Y")`` only considers chromosome Y.

        Both objects start from the generic fixture extended with four
        chromosome Y SNPs. The second object is then given differences on
        other chromosomes (expected to be ignored) and on chromosome Y
        (expected to be merged or reported as discrepant).
        """
        s1 = SNPs("tests/input/generic.csv")
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df = pd.concat(
            [
                s1.snps,
                self.create_snp_df(
                    rsid=["rs100", "rs101", "rs102", "rs103"],
                    chrom=["Y", "Y", "Y", "Y"],
                    pos=[100, 101, 102, 103],
                    genotype=["A", np.nan, "A", "A"],
                ),
            ]
        )
        s1._snps = df.copy()
        s2 = SNPs()
        s2._build = 37
        s2._snps = df.copy()

        # set values for chrom that will be ignored (that would otherwise result in
        # identification of discrepant SNPs or updating genotype)
        s2._snps.loc["rs3", "pos"] = 1003  # discrepant position
        s2._snps.loc["rs4", "genotype"] = "AA"  # discrepant genotype
        s2._snps.loc["rs5", "genotype"] = "AA"

        # set values for chrom to be merged
        s2._snps.loc["rs100", "genotype"] = "T"  # discrepant genotype
        s2._snps.loc["rs101", "genotype"] = "A"
        s2._snps.loc["rs102", "pos"] = 1002  # discrepant position

        # set expected values for merge result
        df.loc["rs100", "genotype"] = np.nan  # discrepant genotype sets to np.nan
        df.loc["rs101", "genotype"] = "A"  # updates np.nan

        results = s1.merge([s2], chrom="Y")

        pd.testing.assert_frame_equal(s1.snps, df, check_exact=True)

        self.assert_results(
            results,
            [
                {
                    "merged": True,
                    "common_rsids": pd.Index(
                        ["rs100", "rs101", "rs102", "rs103"], name="rsid"
                    ),
                    "discrepant_position_rsids": pd.Index(["rs102"], name="rsid"),
                    "discrepant_genotype_rsids": pd.Index(["rs100"], name="rsid"),
                }
            ],
        )

        self.assertEqual(len(s1.discrepant_merge_positions), 1)
        self.assertEqual(len(s1.discrepant_merge_genotypes), 1)
Пример #8
0
 def test_source_snps(self):
     """Check that the merged source survives a save / reload round trip.

     A GRCh37 file is merged with a 23andMe file, saved into a temporary
     directory and loaded again. The reloaded object must report the same
     combined source and the same SNPs as the merged object.
     """
     with tempfile.TemporaryDirectory() as tmpdir:
         merged = SNPs("tests/input/GRCh37.csv", output_dir=tmpdir)
         self.assertEqual(merged.source, "generic")
         results = merged.merge((SNPs("tests/input/23andme.txt"),))
         self.assertEqual(merged.source, "generic, 23andMe")
         self.assertListEqual(merged._source, ["generic", "23andMe"])
         dest = os.path.join(tmpdir, "generic__23andMe_GRCh37.txt")
         self.assertEqual(merged.save(), dest)
         reloaded = SNPs(dest)
         self.assertEqual(reloaded.source, "generic, 23andMe")
         self.assertListEqual(reloaded._source, ["generic", "23andMe"])
         # Compare the reloaded frame against the merged one; comparing a
         # frame with itself can never fail and verifies nothing.
         pd.testing.assert_frame_equal(
             reloaded.snps, merged.snps, check_exact=True
         )
         self.assert_results(results, [{"merged": True}])
Пример #9
0
 def test_source_snps(self):
     """Check that the merged source survives a save / reload round trip.

     A GRCh37 file is merged with a 23andMe file, saved, and loaded again
     from the path ``save()`` returned. The reloaded object must report the
     same combined source and the same SNPs as the merged object.
     """
     merged = SNPs("tests/input/GRCh37.csv")
     self.assertEqual(merged.source, "generic")
     results = merged.merge((SNPs("tests/input/23andme.txt"),))
     self.assertEqual(merged.source, "generic, 23andMe")
     self.assertListEqual(merged._source, ["generic", "23andMe"])
     saved_path = merged.save()
     self.assertEqual(
         os.path.relpath(saved_path), f"output{os.sep}generic__23andMe_GRCh37.txt"
     )
     # Reload from the path that was actually written.
     reloaded = SNPs(saved_path)
     self.assertEqual(reloaded.source, "generic, 23andMe")
     self.assertListEqual(reloaded._source, ["generic", "23andMe"])
     # Compare the reloaded frame against the merged one; comparing a frame
     # with itself can never fail and verifies nothing.
     pd.testing.assert_frame_equal(reloaded.snps, merged.snps, check_exact=True)
     self.assert_results(results, [{"merged": True}])
Пример #10
0
 def test_merge_remap_false(self):
     """Merge GRCh37 into NCBI36 with ``remap=False`` and check discrepancies.

     Each discrepancy frame on the merged object is checked for its expected
     size and for an index equal to the matching entry in the merge results.
     The merged SNPs must equal the NCBI36 fixture with the genotype of
     ``rs11928389`` nulled out.
     """
     base = SNPs("tests/input/NCBI36.csv")
     results = base.merge([SNPs("tests/input/GRCh37.csv")], remap=False)

     # (attribute on the merged object, expected row count, results key)
     checks = (
         ("discrepant_merge_positions", 4, "discrepant_position_rsids"),
         ("discrepant_merge_genotypes", 1, "discrepant_genotype_rsids"),
         (
             "discrepant_merge_positions_genotypes",
             4,
             "discrepant_position_rsids",
         ),
     )
     for attr, expected_len, result_key in checks:
         self.assertEqual(len(getattr(base, attr)), expected_len)
         pd.testing.assert_index_equal(
             getattr(base, attr).index,
             results[0][result_key],
             check_exact=True,
             check_names=True,
         )

     expected_snps = self.snps_NCBI36()
     # discrepant genotype is set to null / NA
     expected_snps.loc["rs11928389", "genotype"] = np.nan
     pd.testing.assert_frame_equal(base.snps, expected_snps, check_exact=True)

     all_rsids = ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"]
     expected_result = {
         "merged": True,
         "common_rsids": pd.Index(all_rsids, name="rsid"),
         "discrepant_position_rsids": pd.Index(all_rsids, name="rsid"),
         "discrepant_genotype_rsids": pd.Index(["rs11928389"], name="rsid"),
     }
     self.assert_results(results, [expected_result])
Пример #11
0
 def f():
     """Merge GRCh37 into NCBI36 with default arguments.

     No discrepant positions or genotypes are expected, the merged SNPs must
     equal the NCBI36 fixture, and all four rsids must be reported as common.

     NOTE(review): ``self`` is not a parameter; presumably this function is
     a closure defined inside a test method. Confirm against the caller.
     """
     base = SNPs("tests/input/NCBI36.csv")
     other = SNPs("tests/input/GRCh37.csv")

     results = base.merge([other])

     for attr in ("discrepant_merge_positions", "discrepant_merge_genotypes"):
         self.assertEqual(len(getattr(base, attr)), 0)

     pd.testing.assert_frame_equal(
         base.snps, self.snps_NCBI36(), check_exact=True
     )

     shared_rsids = pd.Index(
         ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"],
         name="rsid",
     )
     self.assert_results(
         results, [{"merged": True, "common_rsids": shared_rsids}]
     )
Пример #12
0
    def test_merge_unphased(self):
        """Merge an object into one flagged as phased and check the flag clears.

        The target has ``_phased`` forced to ``True`` before the merge; after
        the merge ``phased`` must be false, the SNPs must equal the generic
        fixture and all eight rsids must be reported as common.
        """
        target = SNPs("tests/input/generic.csv")
        other = SNPs("tests/input/generic.csv")
        target._phased = True

        results = target.merge([other])

        self.assertFalse(target.phased)
        pd.testing.assert_frame_equal(
            target.snps, self.generic_snps(), check_exact=True
        )

        shared_rsids = pd.Index(
            ["rs1", "rs2", "rs3", "rs4", "rs5", "rs6", "rs7", "rs8"],
            name="rsid",
        )
        self.assert_results(
            results, [{"merged": True, "common_rsids": shared_rsids}]
        )
Пример #13
0
    def test_merging_files_discrepant_snps(self):
        """Merge two files generated from a table of expected discrepancies.

        ``tests/input/discrepant_snps.csv`` holds, per rsid, the position and
        genotype for each of two files, flags saying whether the position or
        genotype is discrepant, and the expected merged position and genotype.
        The two per-file views are written to a temporary directory, merged,
        and the outcome is compared against the expected columns.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            dest1 = os.path.join(tmpdir, "discrepant_snps1.csv")
            dest2 = os.path.join(tmpdir, "discrepant_snps2.csv")

            df = pd.read_csv(
                "tests/input/discrepant_snps.csv",
                skiprows=1,
                na_values="--",
                names=[
                    "rsid",
                    "chrom",
                    "pos_file1",
                    "pos_file2",
                    "genotype_file1",
                    "genotype_file2",
                    "discrepant_position",
                    "discrepant_genotype",
                    "expected_position",
                    "expected_genotype",
                ],
                index_col=0,
                dtype={
                    "chrom": object,
                    "pos_file1": np.uint32,
                    "pos_file2": np.uint32,
                    "discrepant_position": bool,
                    "discrepant_genotype": bool,
                    "expected_position": np.uint32,
                },
            )

            # Write each file's view of the table under common column names.
            per_file_columns = (
                (dest1, ["chrom", "pos_file1", "genotype_file1"]),
                (dest2, ["chrom", "pos_file2", "genotype_file2"]),
            )
            for dest, columns in per_file_columns:
                df[columns].to_csv(
                    dest,
                    na_rep="--",
                    header=["chromosome", "position", "genotype"],
                )

            s = SNPs()
            s.merge([SNPs(dest1), SNPs(dest2)])

            expected = df[
                [
                    "chrom",
                    "discrepant_position",
                    "discrepant_genotype",
                    "expected_position",
                    "expected_genotype",
                ]
            ]
            expected = expected.rename(
                columns={
                    "expected_position": "pos",
                    "expected_genotype": "genotype",
                }
            )
            # Route the expected frame through SNPs.sort() before comparing.
            expected_snps = SNPs()
            expected_snps._snps = expected
            expected_snps.sort()
            expected = expected_snps.snps

            # The flag columns are read with dtype=bool, so they can be used
            # directly as boolean masks (no ``== True`` comparison needed).
            pd.testing.assert_index_equal(
                s.discrepant_merge_positions.index,
                expected.loc[expected["discrepant_position"]].index,
            )

            pd.testing.assert_index_equal(
                s.discrepant_merge_genotypes.index,
                expected.loc[expected["discrepant_genotype"]].index,
            )

            pd.testing.assert_series_equal(s.snps["pos"], expected["pos"])
            pd.testing.assert_series_equal(s.snps["genotype"], expected["genotype"])