예제 #1
0
 def test_merge_invalid_file(self):
     # Merge the GRCh37 test file followed by tests/input/empty.txt into an
     # empty SNPs object; only the first source is expected to report "merged".
     target = SNPs()
     valid_source = SNPs("tests/input/GRCh37.csv")
     empty_source = SNPs("tests/input/empty.txt")
     merge_results = target.merge([valid_source, empty_source])

     # the target should hold exactly the GRCh37 fixture data
     pd.testing.assert_frame_equal(
         target.snps, self.snps_GRCh37(), check_exact=True
     )

     # one result entry per merged source, in the order they were supplied
     expected_results = [{"merged": True}, {}]
     self.assert_results(merge_results, expected_results)
예제 #2
0
 def test_merge_non_existent_file(self):
     # Merge a source built from a non-existent path followed by the GRCh37
     # test file; only the second source is expected to report "merged".
     target = SNPs()
     missing_source = SNPs("tests/input/non_existent_file.csv")
     valid_source = SNPs("tests/input/GRCh37.csv")
     merge_results = target.merge([missing_source, valid_source])

     # the target should hold exactly the GRCh37 fixture data
     pd.testing.assert_frame_equal(
         target.snps, self.snps_GRCh37(), check_exact=True
     )

     # one result entry per merged source, in the order they were supplied
     expected_results = [{}, {"merged": True}]
     self.assert_results(merge_results, expected_results)
예제 #3
0
    def test_save_snps_vcf_phased(self):
        # Round-trip phased VCF data: load it, save it as VCF using a test
        # reference sequence, then reload the output and check it is still
        # flagged as phased and matches the expected frame.

        # read phased data
        s = SNPs("tests/input/testvcf_phased.vcf")

        # setup resource to use test FASTA reference sequence
        r = Resources()
        r._reference_sequences["GRCh37"] = {}
        # gzip the plain test FASTA to tests/input/generic.fa.gz
        with open("tests/input/generic.fa", "rb") as f_in:
            with atomic_write("tests/input/generic.fa.gz",
                              mode="wb",
                              overwrite=True) as f_out:
                with gzip.open(f_out, "wb") as f_gzip:
                    shutil.copyfileobj(f_in, f_gzip)

        seq = ReferenceSequence(ID="1", path="tests/input/generic.fa.gz")

        r._reference_sequences["GRCh37"]["1"] = seq

        # Build the expected path with os.path.join so the comparison against
        # os.path.relpath() also holds where the path separator is not "/".
        expected_path = os.path.join("output", "vcf_GRCh37.vcf")

        # save phased data to VCF
        assert os.path.relpath(s.save_snps(vcf=True)) == expected_path
        # read saved VCF
        s = SNPs(expected_path)
        assert s.phased
        pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf())
예제 #4
0
 def test_merge_list(self):
     # Merge two copies of the same GRCh37 file, supplied as a list, into an
     # empty SNPs object and check the data, the source bookkeeping and the
     # per-source merge results.
     target = SNPs()
     first_source = SNPs("tests/input/GRCh37.csv")
     second_source = SNPs("tests/input/GRCh37.csv")
     merge_results = target.merge([first_source, second_source])

     pd.testing.assert_frame_equal(
         target.snps, self.snps_GRCh37(), check_exact=True
     )

     # both sources are recorded, as a joined string and as a list
     self.assertEqual(target.source, "generic, generic")
     self.assertListEqual(target._source, ["generic", "generic"])

     # the second merge is expected to report these rsids as common
     shared_rsids = pd.Index(
         ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"],
         name="rsid",
     )
     expected_results = [
         {"merged": True},
         {"merged": True, "common_rsids": shared_rsids},
     ]
     self.assert_results(merge_results, expected_results)
예제 #5
0
    def test_load_opensnp_datadump_file(self):
        # Write a small zip named like the openSNP datadump into a temporary
        # resources dir, then load two members from it and check that each
        # parses to the generic SNPs fixture.
        with tempfile.TemporaryDirectory() as tmpdir:
            # temporarily point the resources dir at the temporary directory
            self.resource._resources_dir = tmpdir
            try:
                # write test openSNP datadump zip
                with atomic_write(
                        os.path.join(tmpdir, "opensnp_datadump.current.zip"),
                        mode="wb",
                        overwrite=True,
                ) as f:
                    with zipfile.ZipFile(f, "w") as f_zip:
                        # same input file stored under two archive names
                        f_zip.write("tests/input/generic.csv",
                                    arcname="generic1.csv")
                        f_zip.write("tests/input/generic.csv",
                                    arcname="generic2.csv")

                snps1 = SNPs(
                    self.resource.load_opensnp_datadump_file("generic1.csv"))
                snps2 = SNPs(
                    self.resource.load_opensnp_datadump_file("generic2.csv"))

                pd.testing.assert_frame_equal(snps1.snps,
                                              self.generic_snps(),
                                              check_exact=True)
                pd.testing.assert_frame_equal(snps2.snps,
                                              self.generic_snps(),
                                              check_exact=True)
            finally:
                # Restore the resources dir even when an assertion above
                # fails, so the temporary path does not stay on self.resource
                # after the directory has been removed.
                self.resource._resources_dir = "resources"
예제 #6
0
 def test_save_snps_csv_phased(self):
     # Round-trip phased VCF data through the default save_snps() output and
     # check that the reloaded data is still flagged as phased.

     # Build the expected path with os.path.join so the comparison against
     # os.path.relpath() also holds where the path separator is not "/".
     expected_path = os.path.join("output", "vcf_GRCh37.csv")

     # read phased data
     s = SNPs("tests/input/testvcf_phased.vcf")
     # save phased data to CSV
     assert os.path.relpath(s.save_snps()) == expected_path
     # read saved CSV
     s = SNPs(expected_path)
     assert s.phased
     pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf())
예제 #7
0
 def test_save_source(self):
     # Save GRCh38 data with the default filename, reload the saved file and
     # check that build, source and SNP data come back as expected.
     original = SNPs("tests/input/GRCh38.csv")
     saved_relpath = os.path.relpath(original.save())
     self.assertEqual(saved_relpath, f"output{os.sep}generic_GRCh38.txt")

     reloaded = SNPs("output/generic_GRCh38.txt")
     self.assertEqual(reloaded.build, 38)
     self.assertTrue(reloaded.build_detected)
     self.assertEqual(reloaded.source, "generic")
     self.assertListEqual(reloaded._source, ["generic"])
     pd.testing.assert_frame_equal(
         reloaded.snps, self.snps_GRCh38(), check_exact=True
     )
예제 #8
0
 def f():
     # `self` is not a parameter of this function; it is resolved from the
     # surrounding scope.
     # Merge GRCh37 data into NCBI36 data without remapping and check the
     # `discrepant_snps` frame against the reported discrepant positions.
     target = SNPs("tests/input/NCBI36.csv")
     merge_results = target.merge(
         [SNPs("tests/input/GRCh37.csv")], remap=False
     )
     discrepant = target.discrepant_snps
     self.assertEqual(len(discrepant), 4)

     actual_index = discrepant.index
     expected_index = merge_results[0]["discrepant_position_rsids"]
     pd.testing.assert_index_equal(
         actual_index, expected_index, check_exact=True, check_names=True
     )
예제 #9
0
    def test_merge_exceed_discrepant_genotypes_threshold(self):
        # Merge two copies of the generic data where one genotype differs,
        # with a discrepant-genotypes threshold of 0; the expectation encoded
        # below is that nothing is merged and nothing is recorded as discrepant.
        base = SNPs("tests/input/generic.csv")
        other = SNPs("tests/input/generic.csv")
        other._snps.loc["rs1", "genotype"] = "CC"

        merge_results = base.merge([other], discrepant_genotypes_threshold=0)

        # none of the discrepant-merge frames should contain any rows
        for attr_name in (
                "discrepant_merge_positions",
                "discrepant_merge_genotypes",
                "discrepant_merge_positions_genotypes",
        ):
            self.assertEqual(len(getattr(base, attr_name)), 0)

        # the base data is unchanged and the single result is empty
        pd.testing.assert_frame_equal(
            base.snps, self.generic_snps(), check_exact=True
        )
        self.assert_results(merge_results, [{}])
예제 #10
0
 def test_save_source(self):
     # Save GRCh38 data into a temporary output directory, reload the saved
     # file and check that build, source and SNP data come back as expected.
     with tempfile.TemporaryDirectory() as out_dir:
         original = SNPs("tests/input/GRCh38.csv", output_dir=out_dir)
         expected_path = os.path.join(out_dir, "generic_GRCh38.txt")
         saved_path = original.save()
         self.assertEqual(saved_path, expected_path)

         reloaded = SNPs(expected_path)
         self.assertEqual(reloaded.build, 38)
         self.assertTrue(reloaded.build_detected)
         self.assertEqual(reloaded.source, "generic")
         self.assertListEqual(reloaded._source, ["generic"])
         pd.testing.assert_frame_equal(
             reloaded.snps, self.snps_GRCh38(), check_exact=True
         )
예제 #11
0
    def test_merge_chrom(self):
        # Merge restricted to chromosome "Y": differences introduced on other
        # chromosomes must be ignored, while differences on Y must be merged
        # or reported as discrepant.
        s1 = SNPs("tests/input/generic.csv")
        # pd.concat is used instead of DataFrame.append, which was deprecated
        # in pandas 1.4 and removed in pandas 2.0.
        df = pd.concat([
            s1.snps,
            self.create_snp_df(
                rsid=["rs100", "rs101", "rs102", "rs103"],
                chrom=["Y", "Y", "Y", "Y"],
                pos=[100, 101, 102, 103],
                genotype=["A", np.nan, "A", "A"],
            ),
        ])
        s1._snps = df.copy()
        s2 = SNPs()
        s2._build = 37
        s2._snps = df.copy()

        # set values for chrom that will be ignored (that would otherwise result in
        # identification of discrepant SNPs or updating genotype)
        s2._snps.loc["rs3", "pos"] = 1003  # discrepant position
        s2._snps.loc["rs4", "genotype"] = "AA"  # discrepant genotype
        s2._snps.loc["rs5", "genotype"] = "AA"

        # set values for chrom to be merged
        s2._snps.loc["rs100", "genotype"] = "T"  # discrepant genotype
        s2._snps.loc["rs101", "genotype"] = "A"
        s2._snps.loc["rs102", "pos"] = 1002  # discrepant position

        # set expected values for merge result
        df.loc["rs100",
               "genotype"] = np.nan  # discrepant genotype sets to np.nan
        df.loc["rs101", "genotype"] = "A"  # updates np.nan

        results = s1.merge([s2], chrom="Y")

        pd.testing.assert_frame_equal(s1.snps, df, check_exact=True)

        # only the Y-chromosome rsids should appear in the merge result
        self.assert_results(
            results,
            [{
                "merged":
                True,
                "common_rsids":
                pd.Index(["rs100", "rs101", "rs102", "rs103"], name="rsid"),
                "discrepant_position_rsids":
                pd.Index(["rs102"], name="rsid"),
                "discrepant_genotype_rsids":
                pd.Index(["rs100"], name="rsid"),
            }],
        )

        self.assertEqual(len(s1.discrepant_merge_positions), 1)
        self.assertEqual(len(s1.discrepant_merge_genotypes), 1)
0
 def test_source_snps(self):
     # Merge 23andMe data into generic data, save the result, reload it and
     # check that the combined source and the SNP data survive the round trip.
     merged = SNPs("tests/input/GRCh37.csv")
     self.assertEqual(merged.source, "generic")
     results = merged.merge((SNPs("tests/input/23andme.txt"),))
     self.assertEqual(merged.source, "generic, 23andMe")
     self.assertListEqual(merged._source, ["generic", "23andMe"])
     self.assertEqual(
         os.path.relpath(merged.save()),
         f"output{os.sep}generic__23andMe_GRCh37.txt",
     )

     # Keep the in-memory object under its own name so the reloaded data can
     # be compared against it; comparing a frame with itself can never fail.
     reloaded = SNPs("output/generic__23andMe_GRCh37.txt")
     self.assertEqual(reloaded.source, "generic, 23andMe")
     self.assertListEqual(reloaded._source, ["generic", "23andMe"])
     pd.testing.assert_frame_equal(reloaded.snps, merged.snps, check_exact=True)
     self.assert_results(results, [{"merged": True}])
예제 #13
0
 def test_source_snps(self):
     # Merge 23andMe data into generic data, save the result to a temporary
     # output directory, reload it and check that the combined source and the
     # SNP data survive the round trip.
     with tempfile.TemporaryDirectory() as tmpdir:
         merged = SNPs("tests/input/GRCh37.csv", output_dir=tmpdir)
         self.assertEqual(merged.source, "generic")
         results = merged.merge((SNPs("tests/input/23andme.txt"), ))
         self.assertEqual(merged.source, "generic, 23andMe")
         self.assertListEqual(merged._source, ["generic", "23andMe"])
         dest = os.path.join(tmpdir, "generic__23andMe_GRCh37.txt")
         self.assertEqual(merged.save(), dest)

         # Keep the in-memory object under its own name so the reloaded data
         # can be compared against it; comparing a frame with itself can
         # never fail.
         reloaded = SNPs(dest)
         self.assertEqual(reloaded.source, "generic, 23andMe")
         self.assertListEqual(reloaded._source, ["generic", "23andMe"])
         pd.testing.assert_frame_equal(reloaded.snps,
                                       merged.snps,
                                       check_exact=True)
         self.assert_results(results, [{"merged": True}])
예제 #14
0
 def test_merge_remap_false(self):
     # Merge GRCh37 data into NCBI36 data with remap=False and check the
     # discrepant-merge frames, the resulting SNP data and the merge results.
     target = SNPs("tests/input/NCBI36.csv")
     merge_results = target.merge(
         [SNPs("tests/input/GRCh37.csv")], remap=False
     )
     first_result = merge_results[0]

     # (attribute on the merged object, expected row count, key in the merge
     # result whose index the attribute's index must equal)
     frame_checks = (
         ("discrepant_merge_positions", 4, "discrepant_position_rsids"),
         ("discrepant_merge_genotypes", 1, "discrepant_genotype_rsids"),
         ("discrepant_merge_positions_genotypes", 4,
          "discrepant_position_rsids"),
     )
     for attr_name, expected_rows, result_key in frame_checks:
         self.assertEqual(len(getattr(target, attr_name)), expected_rows)
         pd.testing.assert_index_equal(
             getattr(target, attr_name).index,
             first_result[result_key],
             check_exact=True,
             check_names=True,
         )

     # discrepant genotype is set to null / NA
     expected_snps = self.snps_NCBI36()
     expected_snps.loc["rs11928389", "genotype"] = np.nan
     pd.testing.assert_frame_equal(target.snps, expected_snps, check_exact=True)

     common_rsids = pd.Index(
         ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"],
         name="rsid",
     )
     position_rsids = pd.Index(
         ["rs3094315", "rs2500347", "rsIndelTest", "rs11928389"],
         name="rsid",
     )
     genotype_rsids = pd.Index(["rs11928389"], name="rsid")
     self.assert_results(
         merge_results,
         [{
             "merged": True,
             "common_rsids": common_rsids,
             "discrepant_position_rsids": position_rsids,
             "discrepant_genotype_rsids": genotype_rsids,
         }],
     )
예제 #15
0
 def test_remap_invalid_assembly(self):
     # Remapping to assembly -1 should leave the build/assembly untouched and
     # report no remapped chromosomes.
     snps_obj = SNPs("tests/input/GRCh37.csv")
     remapped, not_remapped = snps_obj.remap(-1)

     self.assertEqual(snps_obj.build, 37)
     self.assertEqual(snps_obj.assembly, "GRCh37")
     self.assertEqual(len(remapped), 0)
     self.assertEqual(len(not_remapped), 2)
예제 #16
0
 def f():
     # `self` is not a parameter of this function; it is resolved from the
     # surrounding scope.
     # Save the generic data as CSV through save_snps() and re-parse the output.
     loaded = SNPs("tests/input/generic.csv")
     saved_relpath = os.path.relpath(loaded.save_snps(sep=","))
     self.assertEqual(saved_relpath, f"output{os.sep}generic_GRCh37.csv")
     self.run_parsing_tests("output/generic_GRCh37.csv", "generic")
예제 #17
0
 def test_save_snps_phased(self):
     # Save phased VCF data with the default save() output and re-parse the
     # saved file as phased data.

     # Build the expected path with os.path.join so the comparison against
     # os.path.relpath() also holds where the path separator is not "/".
     expected_path = os.path.join("output", "vcf_GRCh37.txt")

     # read phased data
     s = SNPs("tests/input/testvcf_phased.vcf")
     # save phased data to TSV
     self.assertEqual(os.path.relpath(s.save()), expected_path)
     # read saved TSV
     self.run_parsing_tests_vcf(expected_path, phased=True)
예제 #18
0
 def test_remap_snps_invalid_assembly(self):
     # Calling remap_snps() with assembly -1 should leave the build/assembly
     # untouched and report no remapped chromosomes.
     snps_obj = SNPs("tests/input/GRCh37.csv")
     remapped, not_remapped = snps_obj.remap_snps(-1)

     assert snps_obj.build == 37
     assert snps_obj.assembly == "GRCh37"
     assert len(remapped) == 0
     assert len(not_remapped) == 2
예제 #19
0
 def test_snps_unannotated_vcf(self):
     # VCF format reference: https://samtools.github.io/hts-specs/VCFv4.2.pdf
     # Per the original note, this covers homozygous, heterozygous,
     # multiallelic and phased snps, plus snps with a missing rsID.
     loaded = SNPs("tests/input/unannotated_testvcf.vcf")

     # the file must be recognised as a VCF and flagged as unannotated
     assert loaded.source == "vcf"
     assert loaded.unannotated_vcf
예제 #20
0
 def test_save_snps_csv_filename(self):
     # Save the generic data as CSV under an explicit filename and re-parse
     # the saved file.
     loaded = SNPs("tests/input/generic.csv")
     saved_relpath = os.path.relpath(loaded.save("generic.csv", sep=","))
     self.assertEqual(saved_relpath, f"output{os.sep}generic.csv")
     self.run_parsing_tests("output/generic.csv", "generic")
예제 #21
0
    def test_save_snps_vcf_false_positive_build(self):
        # Save test VCF data as VCF, overwrite the "snps v..." header line in
        # the output with a fixed source string, and check that the edited
        # file still passes the VCF parsing tests.
        with tempfile.TemporaryDirectory() as tmpdir1:
            snps = SNPs("tests/input/testvcf.vcf", output_dir=tmpdir1)

            r = Resources()
            r._reference_sequences["GRCh37"] = {}

            output = os.path.join(tmpdir1, "vcf_GRCh37.vcf")
            with tempfile.TemporaryDirectory() as tmpdir2:
                # gzip the test FASTA into a scratch dir for the reference
                dest = os.path.join(tmpdir2, "generic.fa.gz")
                gzip_file("tests/input/generic.fa", dest)

                seq = ReferenceSequence(ID="1", path=dest)

                r._reference_sequences["GRCh37"]["1"] = seq

                self.assertEqual(snps.save(vcf=True), output)

                # replacement for any line that mentions the snps version
                source_line = '##source="vcf; snps v1.2.3.post85.dev0+gb386302; https://pypi.org/project/snps/"\n'

                # build the rewritten content in one pass with str.join
                # rather than repeated string concatenation
                with open(output, "r") as f:
                    rewritten = "".join(
                        source_line if "snps v" in line else line
                        for line in f
                    )

                with open(output, "w") as f:
                    f.write(rewritten)

            self.run_parsing_tests_vcf(output)
예제 #22
0
 def f():
     # `self` is not a parameter of this function; it is resolved from the
     # surrounding scope.
     # not_null_snps() is expected to equal the generic fixture without "rs5".
     loaded = SNPs("tests/input/generic.csv")
     expected = self.generic_snps()
     expected.drop("rs5", inplace=True)
     actual = loaded.not_null_snps()
     pd.testing.assert_frame_equal(actual, expected, check_exact=True)
예제 #23
0
    def test_save_snps_vcf_discrepant_pos(self):
        # Move two SNPs to positions outside the test reference sequence, save
        # as VCF, and check that they are reported in discrepant_vcf_position
        # and are expected to be absent from the re-parsed output.
        loaded = SNPs("tests/input/testvcf.vcf")

        resources = Resources()
        resources._reference_sequences["GRCh37"] = {}

        with tempfile.TemporaryDirectory() as scratch_dir:
            # gzip the test FASTA into a scratch dir for the reference
            fasta_gz = os.path.join(scratch_dir, "generic.fa.gz")
            gzip_file("tests/input/generic.fa", fasta_gz)

            reference = ReferenceSequence(ID="1", path=fasta_gz)
            resources._reference_sequences["GRCh37"]["1"] = reference

            # create discrepant SNPs by setting positions outside reference sequence
            loaded._snps.loc["rs1", "pos"] = 0
            loaded._snps.loc["rs17", "pos"] = 118

            saved_relpath = os.path.relpath(loaded.save(vcf=True))
            self.assertEqual(saved_relpath, f"output{os.sep}vcf_GRCh37.vcf")

        # the two moved SNPs are the expected discrepant rows
        expected_discrepant = self.create_snp_df(
            rsid=["rs1", "rs17"],
            chrom=["1", "1"],
            pos=[0, 118],
            genotype=["AA", np.nan],
        )
        pd.testing.assert_frame_equal(
            loaded.discrepant_vcf_position,
            expected_discrepant,
            check_exact=True,
        )

        # the saved VCF is parsed against the fixture minus those two SNPs
        expected = self.generic_snps_vcf().drop(["rs1", "rs17"])
        self.run_parsing_tests_vcf("output/vcf_GRCh37.vcf", snps_df=expected)
예제 #24
0
 def test_appending_dfs(self):
     # Merge an SNPs object with itself and check the duplicate and
     # discrepant_XY frames each end up with the single row twice, while the
     # other two checked frames equal the empty SNPs frame.

     def single_row():
         # fresh one-row frame on every call so the attributes do not share
         # one object
         return self.create_snp_df(rsid=["rs1"],
                                   chrom=["1"],
                                   pos=[1],
                                   genotype=["AA"])

     snps_obj = SNPs()
     snps_obj._snps = single_row()
     snps_obj._duplicate = single_row()
     snps_obj._discrepant_XY = single_row()
     snps_obj.merge([snps_obj])

     doubled = self.create_snp_df(rsid=["rs1", "rs1"],
                                  chrom=["1", "1"],
                                  pos=[1, 1],
                                  genotype=["AA", "AA"])
     for attr_name in ("duplicate", "discrepant_XY"):
         pd.testing.assert_frame_equal(getattr(snps_obj, attr_name),
                                       doubled,
                                       check_exact=True)

     # these frames were never populated, so they should equal the empty frame
     for attr_name in ("heterozygous_MT", "discrepant_vcf_position"):
         pd.testing.assert_frame_equal(getattr(snps_obj, attr_name),
                                       get_empty_snps_dataframe(),
                                       check_exact=True)
예제 #25
0
 def test__lookup_build_with_snp_pos_None(self):
     # With a single SNP (rs3094315 at position 1), detect_build() is
     # expected to return a falsy value.
     snps_obj = SNPs()
     single_snp = self.create_snp_df(rsid=["rs3094315"],
                                     chrom=["1"],
                                     pos=[1],
                                     genotype=["AA"])
     snps_obj._snps = single_snp
     detected = snps_obj.detect_build()
     self.assertFalse(detected)
예제 #26
0
 def test_remap_37_to_37(self):
     # Remapping GRCh37 data to build 37 should report nothing remapped and
     # leave the build, assembly and SNP data as they were.
     snps_obj = SNPs("tests/input/GRCh37.csv")
     remapped, not_remapped = snps_obj.remap(37)

     self.assertEqual(snps_obj.build, 37)
     self.assertEqual(snps_obj.assembly, "GRCh37")
     self.assertEqual(len(remapped), 0)
     self.assertEqual(len(not_remapped), 2)
     pd.testing.assert_frame_equal(
         snps_obj.snps, self.snps_GRCh37(), check_exact=True
     )
예제 #27
0
 def test_save_snps_tsv_filename(self):
     # Save the generic TSV data under an explicit filename into a temporary
     # output directory and re-parse the saved file.
     with tempfile.TemporaryDirectory() as out_dir:
         loaded = SNPs("tests/input/generic.tsv", output_dir=out_dir)
         expected_path = os.path.join(out_dir, "generic.tsv")
         saved_path = loaded.save("generic.tsv", sep="\t")
         self.assertEqual(saved_path, expected_path)
         self.run_parsing_tests(expected_path, "generic")
예제 #28
0
 def f():
     # `self` is not a parameter of this function; it is resolved from the
     # surrounding scope.
     # Remap NCBI36 data to build 37 with parallelize=True and compare the
     # result with the GRCh37 fixture.
     snps_obj = SNPs("tests/input/NCBI36.csv", parallelize=True)
     remapped, not_remapped = snps_obj.remap(37)

     self.assertEqual(snps_obj.build, 37)
     self.assertEqual(snps_obj.assembly, "GRCh37")
     self.assertEqual(len(remapped), 2)
     self.assertEqual(len(not_remapped), 0)
     pd.testing.assert_frame_equal(
         snps_obj.snps, self.snps_GRCh37(), check_exact=True
     )
예제 #29
0
 def test_sex_Male_Y_chrom(self):
     # Simulate Y-chromosome SNPs, save them, reload the saved file and check
     # that the reported sex is "Male".
     simulated = self.simulate_snps(
         chrom="Y", pos_start=1, pos_max=59373566, pos_step=10000
     )
     saved_path = simulated.save_snps()
     reloaded = SNPs(saved_path)
     assert reloaded.sex == "Male"
예제 #30
0
 def f():
     # `self` is not a parameter of this function; it is resolved from the
     # surrounding scope.
     # Remap GRCh37 data to build 36 and compare the result with the NCBI36
     # fixture.
     snps_obj = SNPs("tests/input/GRCh37.csv")
     remapped, not_remapped = snps_obj.remap(36)

     self.assertEqual(snps_obj.build, 36)
     self.assertEqual(snps_obj.assembly, "NCBI36")
     self.assertEqual(len(remapped), 2)
     self.assertEqual(len(not_remapped), 0)
     pd.testing.assert_frame_equal(
         snps_obj.snps, self.snps_NCBI36(), check_exact=True
     )