예제 #1
0
    def test_merge_chrom(self):
        """Check that ``merge(..., chrom="Y")`` restricts merging to one chromosome.

        Both objects start from the same frame: the SNPs loaded from
        ``generic.csv`` plus four extra SNPs on chromosome Y. The second
        object is then altered in two groups: rs3/rs4/rs5, which the merge
        is expected to ignore, and rs100/rs101/rs102, which are expected to
        be merged and reported as common / discrepant.
        """
        s1 = SNPs("tests/input/generic.csv")
        # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` performs
        # the same row-wise concatenation and keeps the existing index.
        df = pd.concat([
            s1.snps,
            self.create_snp_df(
                rsid=["rs100", "rs101", "rs102", "rs103"],
                chrom=["Y", "Y", "Y", "Y"],
                pos=[100, 101, 102, 103],
                genotype=["A", np.nan, "A", "A"],
            ),
        ])
        s1._snps = df.copy()
        s2 = SNPs()
        s2._build = 37
        s2._snps = df.copy()

        # set values for chrom that will be ignored (that would otherwise result in
        # identification of discrepant SNPs or updating genotype)
        s2._snps.loc["rs3", "pos"] = 1003  # discrepant position
        s2._snps.loc["rs4", "genotype"] = "AA"  # discrepant genotype
        s2._snps.loc["rs5", "genotype"] = "AA"

        # set values for chrom to be merged
        s2._snps.loc["rs100", "genotype"] = "T"  # discrepant genotype
        s2._snps.loc["rs101", "genotype"] = "A"
        s2._snps.loc["rs102", "pos"] = 1002  # discrepant position

        # set expected values for merge result
        df.loc["rs100",
               "genotype"] = np.nan  # discrepant genotype sets to np.nan
        df.loc["rs101", "genotype"] = "A"  # updates np.nan

        results = s1.merge([s2], chrom="Y")

        pd.testing.assert_frame_equal(s1.snps, df, check_exact=True)

        self.assert_results(
            results,
            [{
                "merged":
                True,
                "common_rsids":
                pd.Index(["rs100", "rs101", "rs102", "rs103"], name="rsid"),
                "discrepant_position_rsids":
                pd.Index(["rs102"], name="rsid"),
                "discrepant_genotype_rsids":
                pd.Index(["rs100"], name="rsid"),
            }],
        )

        # exactly one discrepant position (rs102) and one discrepant
        # genotype (rs100) are expected to be recorded on the merged object
        self.assertEqual(len(s1.discrepant_merge_positions), 1)
        self.assertEqual(len(s1.discrepant_merge_genotypes), 1)
예제 #2
0
 def test_appending_dfs(self):
     """Merging an object with itself doubles its auxiliary SNP frames.

     ``duplicate`` and ``discrepant_XY`` each start with one row and are
     expected to hold that row twice after the merge, while
     ``heterozygous_MT`` and ``discrepant_vcf_position`` are expected to
     equal the empty SNPs dataframe.
     """

     def one_row():
         # Build a fresh single-SNP frame on every call so that no two
         # attributes share the same DataFrame object.
         return self.create_snp_df(rsid=["rs1"],
                                   chrom=["1"],
                                   pos=[1],
                                   genotype=["AA"])

     snps = SNPs()
     snps._snps = one_row()
     snps._duplicate = one_row()
     snps._discrepant_XY = one_row()

     snps.merge([snps])

     doubled = self.create_snp_df(rsid=["rs1", "rs1"],
                                  chrom=["1", "1"],
                                  pos=[1, 1],
                                  genotype=["AA", "AA"])
     pd.testing.assert_frame_equal(snps.duplicate, doubled, check_exact=True)
     pd.testing.assert_frame_equal(snps.discrepant_XY,
                                   doubled,
                                   check_exact=True)

     # These frames were never populated, so they should be empty.
     for attr in ("heterozygous_MT", "discrepant_vcf_position"):
         pd.testing.assert_frame_equal(getattr(snps, attr),
                                       get_empty_snps_dataframe(),
                                       check_exact=True)
예제 #3
0
 def test__lookup_build_with_snp_pos_None(self):
     """``detect_build()`` is falsy when rs3094315 is placed at position 1."""
     s = SNPs()
     s._snps = self.create_snp_df(
         rsid=["rs3094315"],
         chrom=["1"],
         pos=[1],
         genotype=["AA"],
     )
     detected = s.detect_build()
     self.assertFalse(detected)
예제 #4
0
파일: __init__.py 프로젝트: apriha/snps
    def simulate_snps(
        self,
        chrom="1",
        pos_start=1,
        pos_max=248140902,
        pos_step=100,
        genotype="AA",
        insert_nulls=True,
        null_snp_step=101,
        complement_genotype_one_chrom=False,
        complement_genotype_two_chroms=False,
        complement_snp_step=50,
    ):
        """Build a ``SNPs`` object holding evenly spaced synthetic SNPs.

        Parameters
        ----------
        chrom
            Value stored in the ``chrom`` column of every row.
        pos_start, pos_max, pos_step
            Arguments passed to ``np.arange`` to generate the ``pos`` column
            (as ``uint32``); ``pos_max`` is exclusive.
        genotype
            Value stored in the ``genotype`` column of every row.
        insert_nulls
            If true, every ``null_snp_step``-th row (starting with the first)
            has its genotype set to ``np.nan``.
        null_snp_step
            Row stride used when inserting null genotypes.
        complement_genotype_one_chrom, complement_genotype_two_chroms
            If set, ``self.complement_one_chrom`` or
            ``self.complement_two_chroms`` is applied to the genotype of
            every ``complement_snp_step``-th row. The two-chromosome flag
            takes precedence when both are set.
        complement_snp_step
            Row stride used when complementing genotypes.

        Returns
        -------
        SNPs
            A new object with ``_build`` set to 37 and ``_snps`` set to the
            generated frame, indexed by ``rsid`` labels ``rs1`` .. ``rsN``.
        """
        simulated = SNPs()
        simulated._build = 37

        positions = np.arange(pos_start, pos_max, pos_step, dtype=np.uint32)
        rsids = pd.Index(
            [f"rs{n}" for n in range(1, len(positions) + 1)],
            name="rsid",
        )

        frame = pd.DataFrame({"chrom": chrom}, index=rsids)
        frame["pos"] = positions
        frame["genotype"] = genotype

        if insert_nulls:
            null_rows = frame.index[::null_snp_step]
            frame.loc[null_rows, "genotype"] = np.nan

        # The stride is always evaluated, even when no complementing is
        # requested, mirroring the order of operations callers rely on.
        complement_rows = frame.index[::complement_snp_step]

        if complement_genotype_two_chroms:
            complement = self.complement_two_chroms
        elif complement_genotype_one_chrom:
            complement = self.complement_one_chrom
        else:
            complement = None

        if complement is not None:
            frame.loc[complement_rows, "genotype"] = frame.loc[
                complement_rows, "genotype"].apply(complement)

        simulated._snps = frame
        return simulated
예제 #5
0
    def test_save_snps_vcf_discrepant_pos(self):
        """Saving as VCF records rs1 and rs17 in ``discrepant_vcf_position``.

        rs1 and rs17 are moved to positions 0 and 118 before saving against
        a gzipped copy of ``generic.fa``; the written VCF is then checked
        against the generic VCF frame with those two SNPs dropped.
        """
        with tempfile.TemporaryDirectory() as out_dir:
            snps = SNPs("tests/input/testvcf.vcf", output_dir=out_dir)

            resources = Resources()
            resources._reference_sequences["GRCh37"] = {}

            vcf_path = os.path.join(out_dir, "vcf_GRCh37.vcf")
            with tempfile.TemporaryDirectory() as ref_dir:
                fasta_gz = os.path.join(ref_dir, "generic.fa.gz")
                gzip_file("tests/input/generic.fa", fasta_gz)

                resources._reference_sequences["GRCh37"]["1"] = (
                    ReferenceSequence(ID="1", path=fasta_gz))

                # create discrepant SNPs by setting positions outside
                # reference sequence
                for rsid, bad_pos in (("rs1", 0), ("rs17", 118)):
                    snps._snps.loc[rsid, "pos"] = bad_pos

                # ensure this is the right type after manual tweaking
                snps._snps = snps._snps.astype({"pos": np.uint32})

                saved_path = snps.save(vcf=True)
                self.assertEqual(saved_path, vcf_path)

            actual_discrepant = snps.discrepant_vcf_position
            expected_discrepant = self.create_snp_df(
                rsid=["rs1", "rs17"],
                chrom=["1", "1"],
                pos=[0, 118],
                genotype=["AA", np.nan],
            )
            pd.testing.assert_frame_equal(
                actual_discrepant,
                expected_discrepant,
                check_exact=True,
            )

            remaining = self.generic_snps_vcf().drop(["rs1", "rs17"])
            self.run_parsing_tests_vcf(vcf_path, snps_df=remaining)
예제 #6
0
    def test_merging_files_discrepant_snps(self):
        """Merge two files derived from ``discrepant_snps.csv`` and verify.

        The fixture table is split into two input files inside a temporary
        directory. After merging them, the discrepant position and genotype
        indexes and the resulting ``pos`` / ``genotype`` series are compared
        with the expectations stored in the same table.
        """
        column_names = [
            "rsid",
            "chrom",
            "pos_file1",
            "pos_file2",
            "genotype_file1",
            "genotype_file2",
            "discrepant_position",
            "discrepant_genotype",
            "expected_position",
            "expected_genotype",
        ]
        column_dtypes = {
            "chrom": object,
            "pos_file1": np.uint32,
            "pos_file2": np.uint32,
            "discrepant_position": bool,
            "discrepant_genotype": bool,
            "expected_position": np.uint32,
        }
        csv_header = ["chromosome", "position", "genotype"]

        with tempfile.TemporaryDirectory() as tmpdir:
            dest1 = os.path.join(tmpdir, "discrepant_snps1.csv")
            dest2 = os.path.join(tmpdir, "discrepant_snps2.csv")

            table = pd.read_csv(
                "tests/input/discrepant_snps.csv",
                skiprows=1,
                na_values="--",
                names=column_names,
                index_col=0,
                dtype=column_dtypes,
            )

            # Write one input file per source, each with the three columns
            # named by csv_header.
            per_file_columns = (
                (dest1, ["chrom", "pos_file1", "genotype_file1"]),
                (dest2, ["chrom", "pos_file2", "genotype_file2"]),
            )
            for dest, cols in per_file_columns:
                table[cols].to_csv(dest, na_rep="--", header=csv_header)

            merged = SNPs()
            merged.merge([SNPs(dest1), SNPs(dest2)])

            expected = table[[
                "chrom",
                "discrepant_position",
                "discrepant_genotype",
                "expected_position",
                "expected_genotype",
            ]].rename(columns={
                "expected_position": "pos",
                "expected_genotype": "genotype"
            })

            # Route the expected frame through SNPs.sort() so its row order
            # is produced the same way as for a SNPs object.
            sorter = SNPs()
            sorter._snps = expected
            sorter.sort()
            expected = sorter.snps

            pd.testing.assert_index_equal(
                merged.discrepant_merge_positions.index,
                expected.loc[expected["discrepant_position"] == True].index,
            )
            pd.testing.assert_index_equal(
                merged.discrepant_merge_genotypes.index,
                expected.loc[expected["discrepant_genotype"] == True].index,
            )

            pd.testing.assert_series_equal(merged.snps["pos"],
                                           expected["pos"])
            pd.testing.assert_series_equal(merged.snps["genotype"],
                                           expected["genotype"])
예제 #7
0
 def test__lookup_build_with_snp_pos_None(self):
     """``detect_build()`` is falsy for the discrepant-position fixture."""
     s = SNPs()
     s._snps = self.snps_discrepant_pos()
     detected = s.detect_build()
     assert not detected
예제 #8
0
    def test_merging_files_discrepant_snps(self):
        """Load two files derived from ``discrepant_snps.csv`` and verify.

        The fixture table is split into two input files, which are loaded
        through ``SNPsCollection``. The collection's discrepant position and
        genotype indexes and its ``pos`` / ``genotype`` series are compared
        with the expectations stored in the same table.

        The generated input files are written to a temporary directory
        rather than into ``tests/input``, so the test leaves no files
        behind in the source tree.
        """
        # Local imports keep this block self-contained.
        import os
        import tempfile

        df = pd.read_csv(
            "tests/input/discrepant_snps.csv",
            skiprows=1,
            na_values="--",
            names=[
                "rsid",
                "chrom",
                "pos_file1",
                "pos_file2",
                "genotype_file1",
                "genotype_file2",
                "discrepant_position",
                "discrepant_genotype",
                "expected_position",
                "expected_genotype",
            ],
            index_col=0,
            dtype={
                "chrom": object,
                "pos_file1": np.int64,
                "pos_file2": np.int64,
                "discrepant_position": bool,
                "discrepant_genotype": bool,
            },
        )

        df1 = df[["chrom", "pos_file1", "genotype_file1"]]
        df2 = df[["chrom", "pos_file2", "genotype_file2"]]

        # Everything that touches the collection stays inside the ``with``
        # block, in case the collection reads its input files lazily.
        with tempfile.TemporaryDirectory() as tmpdir:
            dest1 = os.path.join(tmpdir, "discrepant_snps1.csv")
            dest2 = os.path.join(tmpdir, "discrepant_snps2.csv")

            df1.to_csv(
                dest1,
                na_rep="--",
                header=["chromosome", "position", "genotype"],
            )

            df2.to_csv(
                dest2,
                na_rep="--",
                header=["chromosome", "position", "genotype"],
            )

            sc = SNPsCollection([dest1, dest2])

            expected = df[[
                "chrom",
                "discrepant_position",
                "discrepant_genotype",
                "expected_position",
                "expected_genotype",
            ]]
            expected = expected.rename(columns={
                "expected_position": "pos",
                "expected_genotype": "genotype"
            })
            # Route the expected frame through SNPs.sort_snps() so its row
            # order is produced the same way as for a SNPs object.
            expected_snps = SNPs()
            expected_snps._snps = expected
            expected_snps.sort_snps()
            expected = expected_snps.snps

            pd.testing.assert_index_equal(
                sc.discrepant_positions.index,
                expected.loc[expected["discrepant_position"] == True].index,
            )

            pd.testing.assert_index_equal(
                sc.discrepant_genotypes.index,
                expected.loc[expected["discrepant_genotype"] == True].index,
            )

            pd.testing.assert_series_equal(sc.snps["pos"], expected["pos"])
            pd.testing.assert_series_equal(sc.snps["genotype"],
                                           expected["genotype"])