Python DelayedDataFrame примеры, mbf_genomics.DelayedDataFrame Python примеры использования

Пример #1

0

Показать файл

    def test_annos_dependening_none(self):
        """Check that ``None`` entries returned by ``dep_annos`` are tolerated.

        ``B`` declares ``[None, A(), None]`` as its dependencies. Only ``B``
        is added to the frame; after annotating, both ``aa`` and ``ab`` must
        be present and ``ab`` must equal ``aa`` with ``"b"`` appended.
        """

        class A(Annotator):
            cache_name = "hello"
            columns = ["aa"]

            def calc(self, df):
                # The constant "a" for every row, aligned on the input index.
                column = self.columns[0]
                return pd.DataFrame({column: "a"}, index=df.index)

        class B(Annotator):
            cache_name = "hello2"
            columns = ["ab"]

            def dep_annos(self):
                return [None, A(), None]

            def calc(self, df):
                return df["aa"] + "b"

        def load():
            return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

        ddf = DelayedDataFrame("shu", load)
        ddf += B()
        ddf.annotate()
        for expected_column in ("ab", "aa"):
            assert expected_column in ddf.df.columns
        derived = ddf.df["aa"] + "b"
        assert (ddf.df["ab"] == derived).all()

Пример #2

0

Показать файл

Файл: test_comparisons.py Проект: MarcoMernberger/mbf_comparisons

    def test_ttest_paired(self):
        """Pin the p-values and FDRs of ``TTestPaired`` for group A vs. B.

        Columns are grouped by the first character of their name. All
        expected numbers are compared with an absolute tolerance of 1e-4.
        """
        tolerance = 1e-4
        frame = pd.DataFrame({
            "A.R1": [0, 0, 0, 0],
            "A.R2": [0, 0, 0, 0],
            "A.R3": [0, 0.001, 0.001, 0.001],
            "B.R1": [0.95, 0, 0.56, 0],
            "B.R2": [0.99, 0, 0.56, 0],
            "B.R3": [0.98, 0, 0.57, 0.5],
            "C.R1": [0.02, 0.73, 0.59, 0],
            "C.R2": [0.03, 0.75, 0.57, 0],
            "C.R3": [0.05, 0.7, 0.58, 1],
        })
        ddf = DelayedDataFrame("ex1", frame)
        # groupby needs sorted input; the key is the leading letter.
        groups_to_samples = {}
        for letter, members in itertools.groupby(
                sorted(frame.columns), lambda x: x[0]):
            groups_to_samples[letter] = list(members)

        comparisons = Comparisons(ddf, groups_to_samples)
        paired = comparisons.a_vs_b("A", "B", TTestPaired())
        force_load(ddf.add_annotator(paired))
        run_pipegraph()

        expected_p = (
            8.096338300746213e-07,
            0.42264973081037427,
            0.041378369826042816,
            0.42264973081037427,
        )
        for row, expected in enumerate(expected_p):
            observed = ddf.df[paired["p"]].iloc[row]
            assert observed == pytest.approx(expected, abs=tolerance)
        expected_fdr = [3.238535e-06, 4.226497e-01, 8.275674e-02, 4.226497e-01]
        assert ddf.df[paired["FDR"]].values == pytest.approx(
            expected_fdr, abs=tolerance)

Пример #3

0

Показать файл

Файл: test_comparisons.py Проект: MarcoMernberger/mbf_comparisons

    def test_edgeR(self):
        """Run ``EdgeRUnpaired`` (T vs. N) and pin the results for two genes.

        For PTHLH and PTGFR the log2FC, FDR and p columns are compared with
        stored values, and the direction of the change is cross-checked
        against the summed T and N columns.
        """
        df = self._get_tuch_data()

        ddf = DelayedDataFrame("ex1", df)
        gts = {
            "T": [name for name in df.columns if ".T" in name],
            "N": [name for name in df.columns if ".N" in name],
        }

        comparisons = Comparisons(ddf, gts)
        a = comparisons.a_vs_b("T", "N", EdgeRUnpaired())
        force_load(ddf.add_annotator(a))
        run_pipegraph()

        def gene_values(gene, key):
            # Values of the comparison column `key` for the rows of `gene`.
            rows = ddf.df[ddf.df.nameOfGene == gene]
            return rows[a[key]].values

        # These are from the last run - the manual has no simple a vs b
        # comparison. At least we'll notice if this changes.
        assert gene_values("PTHLH", "log2FC") == approx([4.003122])
        assert gene_values("PTHLH", "FDR") == approx([1.332336e-11])
        assert gene_values("PTHLH", "p") == approx([5.066397e-15])

        by_gene = ddf.df.set_index("nameOfGene")
        # NOTE(review): x[1] takes the second element of every gts entry -
        # presumably a column name after Comparisons processed gts; confirm.
        t_columns = [x[1] for x in gts["T"]]
        n_columns = [x[1] for x in gts["N"]]
        pthlh = by_gene.loc["PTHLH"]
        assert pthlh[t_columns].sum() > pthlh[n_columns].sum()

        assert gene_values("PTGFR", "log2FC") == approx([-5.127508])
        assert gene_values("PTGFR", "FDR") == approx([6.470885e-10])
        assert gene_values("PTGFR", "p") == approx([3.690970e-13])
        ptgfr = by_gene.loc["PTGFR"]
        assert ptgfr[t_columns].sum() < ptgfr[n_columns].sum()

Пример #4

0

Показать файл

    def test_annos_dependening(self):
        """Check that an annotator's dependency is added along with it.

        Only ``B`` is added to the frame, but ``B.dep_annos`` returns an
        ``A`` instance. After the pipegraph ran, both columns must exist and
        ``ab`` must equal ``aa`` with ``"b"`` appended.
        """

        class A(Annotator):
            cache_name = "hello"
            columns = ["aa"]

            def calc(self, df):
                column = self.columns[0]
                return pd.DataFrame({column: "a"}, index=df.index)

        class B(Annotator):
            cache_name = "hello2"
            columns = ["ab"]

            def dep_annos(self):
                return [A()]

            def calc(self, df):
                return df["aa"] + "b"

        def load():
            return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

        ddf = DelayedDataFrame("shu", load)
        ddf += B()
        # A throw-away job that depends on the annotation jobs.
        trigger = ppg.JobGeneratingJob("shu", lambda: 55)
        trigger.depends_on(ddf.annotate())
        ppg.run_pipegraph()
        for expected_column in ("ab", "aa"):
            assert expected_column in ddf.df.columns
        derived = ddf.df["aa"] + "b"
        assert (ddf.df["ab"] == derived).all()

Пример #5

0

Показать файл

Файл: test_comparisons.py Проект: MarcoMernberger/mbf_comparisons

    def test_edgeR_paired(self):
        """Run ``EdgeRPaired`` (T vs. N) and pin the results for two genes.

        The sample columns are taken in sorted order. For PTHLH and PTGFR
        the log2FC, FDR and p columns are compared with stored values, and
        the direction of the change is cross-checked against the summed T
        and N columns.
        """
        df = self._get_tuch_data()

        ddf = DelayedDataFrame("ex1", df)
        ordered_columns = sorted(df.columns)
        gts = {
            "T": [name for name in ordered_columns if ".T" in name],
            "N": [name for name in sorted(df.columns) if ".N" in name],
        }

        comparisons = Comparisons(ddf, gts)
        a = comparisons.a_vs_b("T", "N", EdgeRPaired())
        force_load(ddf.add_annotator(a))
        run_pipegraph()

        def gene_values(gene, key):
            # Values of the comparison column `key` for the rows of `gene`.
            rows = ddf.df[ddf.df.nameOfGene == gene]
            return rows[a[key]].values

        # These are from the last run - the manual has no simple a vs b
        # comparison. At least we'll notice if this changes.
        assert gene_values("PTHLH", "log2FC") == approx([3.97], abs=1e-3)
        assert gene_values("PTHLH", "FDR") == approx([4.27e-18])
        assert gene_values("PTHLH", "p") == approx([8.13e-22])

        by_gene = ddf.df.set_index("nameOfGene")
        # NOTE(review): x[1] takes the second element of every gts entry -
        # presumably a column name after Comparisons processed gts; confirm.
        t_columns = [x[1] for x in gts["T"]]
        n_columns = [x[1] for x in gts["N"]]
        pthlh = by_gene.loc["PTHLH"]
        assert pthlh[t_columns].sum() > pthlh[n_columns].sum()

        assert gene_values("PTGFR", "log2FC") == approx([-5.18], abs=1e-2)
        assert gene_values("PTGFR", "FDR") == approx([3.17e-19])
        assert gene_values("PTGFR", "p") == approx([3.01e-23])
        ptgfr = by_gene.loc["PTGFR"]
        assert ptgfr[t_columns].sum() < ptgfr[n_columns].sum()

Пример #6

0

Показать файл

    def test_nested_anno_dependencies(self):
        """Check that dependencies of dependencies end up in the frame.

        ``Nesting`` depends on a constant and on ``Nested``, which depends
        on a constant of its own. Only ``Nesting`` is added; all four
        columns are expected after the pipegraph ran.
        """

        class Nested(Annotator):
            columns = ["b"]

            def dep_annos(self):
                return [Constant("Nestedconst", 5)]

            def calc(self, df):
                return pd.Series([10] * len(df))

        class Nesting(Annotator):
            columns = ["a"]

            def dep_annos(self):
                return [Constant("Nestingconst", 5), Nested()]

            def calc(self, df):
                return pd.Series([15] * len(df))

        def load():
            return pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]})

        outer = Nesting()
        ddf = DelayedDataFrame("shu", load)
        ddf += outer
        ddf.write()
        ppg.run_pipegraph()
        expected_constants = (
            ("a", 15),
            ("b", 10),
            ("Nestedconst", 5),
            ("Nestingconst", 5),
        )
        for column, value in expected_constants:
            assert (ddf.df[column] == value).all()

Пример #7

0

Показать файл

    def test_filtering(self):
        """Check how annotator columns show up in a parent and a filtered frame.

        Annotators added to the parent (before or after filtering) must be
        visible in the child; an annotator added only to the child must not
        appear in the parent.
        """

        class A(Annotator):
            cache_name = "A"
            columns = ["aa"]

            def calc(self, df):
                column = self.columns[0]
                return pd.DataFrame({column: "a"}, index=df.index)

        class B(Annotator):
            cache_name = "B"
            columns = ["ab"]

            def dep_annos(self):
                return [A()]

            def calc(self, df):
                return df["aa"] + "b"

        def load():
            return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

        parent = DelayedDataFrame("shu", load)
        parent += Constant("C", "c")
        assert "C" in parent.df.columns
        child = parent.filter("sha", lambda df: df["A"] == 1)
        assert "C" in child.df.columns
        # Added to the parent after the child was created.
        parent += A()
        assert "aa" in parent.df.columns
        assert "aa" in child.df.columns
        # Added to the child only.
        child += B()
        assert "ab" in child.df.columns
        assert "ab" not in parent.df.columns

Пример #8

0

Показать файл

Файл: test_comparisons.py Проект: MarcoMernberger/mbf_comparisons

    def test_multi_plus_filter(self, clear_annotators):
        """Exercise ``filter`` on a Log2FC comparison with many filter specs.

        ``to_test`` pairs a filter specification with either the expected
        log2FC values of the filtered frame or the exception type that
        ``filter`` is expected to raise.
        """
        frame = pd.DataFrame({
            "a1": [1 / 0.99, 2 / 0.99, 3 / 0.99],
            "a2": [1 * 0.99, 2 * 0.99, 3 * 0.99],
            "b1": [2 * 0.99, 8 * 0.99, (16 * 3) * 0.99],
            "b2": [2 / 0.99, 8 / 0.99, (16 * 3) / 0.99],
            "delta": [10, 20, 30],
        })
        d = DelayedDataFrame("ex1", frame)
        comparisons = Comparisons(d, {"a": ["a1", "a2"], "b": ["b1", "b2"]})
        a = comparisons.a_vs_b("a", "b", Log2FC(), laplace_offset=0)
        anno1 = Constant("shu1", 5)
        anno2 = Constant("shu2", 5)  # noqa: F841
        anno3 = Constant("shu3", 5)  # noqa: F841
        error_types = (ValueError, KeyError)
        to_test = [
            (("log2FC", "==", -1.0), [-1.0]),
            (("log2FC", ">", -2.0), [-1.0]),
            (("log2FC", "<", -2.0), [-4.0]),
            (("log2FC", ">=", -2.0), [-1.0, -2.0]),
            (("log2FC", "<=", -2.0), [-2.0, -4.0]),
            (("log2FC", "|>", 2.0), [-4.0]),
            (("log2FC", "|<", 2.0), [-1.0]),
            (("log2FC", "|>=", 2.0), [-2.0, -4.0]),
            (("log2FC", "|<=", 2.0), [-1.0, -2.0]),
            ((a["log2FC"], "<", -2.0), [-4.0]),
            (("log2FC", "|", -2.0), ValueError),
            ([("log2FC", "|>=", 2.0), ("log2FC", "<=", 0)], [-2.0, -4.0]),
            ((anno1, ">=", 5), [-1, -2.0, -4.0]),
            (((anno1, 0), ">=", 5), [-1, -2.0, -4.0]),
            (("shu2", ">=", 5), [-1, -2.0, -4.0]),
            (("delta", ">", 10), [-2.0, -4.0]),
        ]
        if not ppg.inside_ppg():  # can't test for missing columns in ppg.
            to_test.extend([(("log2FC_no_such_column", "<", -2.0), KeyError)])

        filtered = {}
        for ii, (f, r) in enumerate(to_test):
            new_name = "new%i" % ii
            if r in error_types:
                with pytest.raises(r):
                    a.filter([f], new_name)
                continue
            # A single tuple is wrapped in a list; a list of tuples is
            # passed through unchanged.
            spec = [f] if isinstance(f, tuple) else f
            child = a.filter(spec, new_name)
            filtered[tuple(f)] = child
            assert child.name == new_name
            force_load(child.annotate(), child.name)

        force_load(d.add_annotator(a), "somethingsomethingjob")
        run_pipegraph()
        c = a["log2FC"]
        assert (d.df[c] == [-1.0, -2.0, -4.0]).all()
        for f, r in to_test:
            if r in error_types:
                continue
            try:
                assert filtered[tuple(f)].df[c].values == approx(r)
            except AssertionError:
                # Show which filter specification failed before re-raising.
                print(f)
                raise

Пример #9

0

Показать файл

 def test_annotator(self):
     """Add a ``Constant`` annotator and check its column after ``annotate``."""

     def load():
         return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

     ddf = DelayedDataFrame("shu", load)
     ddf += Constant("column", "value")
     ddf.annotate()
     annotated = ddf.df
     assert "column" in annotated.columns
     assert (annotated["column"] == "value").all()

Пример #10

0

Показать файл

 def test_annotator_basic(self):
     """Add a ``Constant`` annotator and check its column after a pipegraph run."""

     def load():
         return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

     ddf = DelayedDataFrame("shu", load)
     ddf += Constant("aa", "aa")
     annotation_jobs = ddf.annotate()
     force_load(annotation_jobs)
     ppg.run_pipegraph()
     assert (ddf.df["aa"] == "aa").all()

Пример #11

0

Показать файл

Файл: test_comparisons.py Проект: MarcoMernberger/mbf_comparisons

    def test_deseq2_with_and_without_additional_columns(self):
        """Compare DESeq2 results with and without the "other" samples.

        The same treated-vs-untreated comparison is created twice, once with
        ``include_other_samples_for_variance=True`` and once with ``False``;
        the resulting p and log2FC columns are expected to differ.
        """
        import mbf_sampledata

        sample_path = mbf_sampledata.get_sample_path(
            "mbf_comparisons/pasillaCount_deseq2.tsv.gz")
        counts = pd.read_csv(sample_path, sep=" ")
        counts.columns = [str(name) for name in counts.columns]
        print(counts.columns)
        # Two extra columns that duplicate existing samples.
        counts = counts.assign(
            treated_fake=counts.treated2fb,
            untreated_fake=counts.untreated2fb,
        )

        def group_without_3(prefix):
            # Columns with the given prefix that do not contain a "3".
            return [
                name for name in counts.columns
                if name.startswith(prefix) and "3" not in name
            ]

        gts = {
            "treated": group_without_3("treated"),
            "untreated": group_without_3("untreated"),
            "other": [name for name in counts.columns if "3" in name],
        }
        assert len(gts["other"]) == 2
        grouped_total = sum(len(members) for members in gts.values())
        # One column belongs to no group (original comment: GeneId).
        assert grouped_total + 1 == len(counts.columns)

        ddf = DelayedDataFrame("ex", counts)
        comparisons = Comparisons(ddf, gts)
        with_other = comparisons.a_vs_b(
            "treated",
            "untreated",
            DESeq2Unpaired(),
            include_other_samples_for_variance=True,
        )
        without_other = comparisons.a_vs_b(
            "treated",
            "untreated",
            DESeq2Unpaired(),
            include_other_samples_for_variance=False,
        )
        for comparison in (with_other, without_other):
            force_load(ddf.add_annotator(comparison))
        # NOTE(review): run_pipegraph() is not called here (it was commented
        # out in the original); confirm that this is intended.
        result = ddf.df
        print(result.head())
        result.to_csv("test.csv")
        # This is a fairly weak test, but it shows that it at least does
        # *something*.
        assert (result[with_other["p"]] != pytest.approx(
            result[without_other["p"]])).all()
        assert (result[with_other["log2FC"]] != pytest.approx(
            result[without_other["log2FC"]])).all()

Пример #12

0

Показать файл

Файл: test_genes_annotators.py Проект: MarcoMernberger/mbf_genomics

 def test_missing_external_genome(self):
     """Expect ``Description`` to fail when no genome is available.

     The pipegraph run must raise ``ppg.RuntimeError`` and the exception
     stored on the annotator's job must carry the expected message.
     """
     frame = pd.DataFrame({"gene_stable_id": ["a", "c", "b"]})
     ddf = DelayedDataFrame("ex", frame)
     description = genes.annotators.Description()
     ddf += description
     force_load(ddf.annotate())
     with pytest.raises(ppg.RuntimeError):
         ppg.run_pipegraph()
     failed_job = ddf.anno_jobs[description.get_cache_name()]
     message = str(failed_job.lfg.exception)
     assert "ddf had no .genome and no genome was passed to Description" in message

Пример #13

0

Показать файл

    def test_write(self):
        """Write a frame via the pipegraph and read the file back unchanged."""
        test_df = pd.DataFrame({"A": [1, 2]})
        ddf = DelayedDataFrame("shu", lambda: test_df)
        # The first element of write()'s result is used as the job here.
        write_job = ddf.write()[0]
        ppg.run_pipegraph()
        assert Path(write_job.filenames[0]).exists()
        reloaded = pd.read_csv(write_job.filenames[0], sep="\t")
        assert_frame_equal(reloaded, test_df)

Пример #14

0

Показать файл

Файл: test_comparisons.py Проект: MarcoMernberger/mbf_comparisons

 def test_simple(self):
     """Check a single-column a-vs-b ``Log2FC`` comparison.

     Note that ``Log2FC`` is passed as a class, not as an instance.
     """
     frame = pd.DataFrame({"a": [1, 2, 3], "b": [2, 8, 16 * 3]})
     d = DelayedDataFrame("ex1", frame)
     comparisons = Comparisons(d, {"a": ["a"], "b": ["b"]})
     comparison = comparisons.a_vs_b("a", "b", Log2FC, laplace_offset=0)
     assert d.has_annotator(comparison)
     force_load(d.add_annotator(comparison), "fl1")
     run_pipegraph()
     observed = d.df[comparison["log2FC"]]
     assert (observed == [-1.0, -2.0, -4.0]).all()

Пример #15

0

Показать файл

    def test_filteringC(self):
        """Filter on a column that comes from an annotator passed to ``filter``.

        ``LenAnno("C")`` is never added to the parent directly; it is only
        handed to ``filter``. Column ``C`` is nevertheless expected in both
        the parent and the child.
        """
        ppg.util.global_pipegraph.quiet = False

        def load():
            return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

        parent = DelayedDataFrame("shu", load)
        child = parent.filter(
            "sha", lambda df: df["C"] == 2, LenAnno("C"), set())
        child.write()
        ppg.run_pipegraph()
        assert "C" in parent.df
        assert "C" in child.df

Пример #16

0

Показать файл

    def test_create(self):
        """Check that ``df`` only exists after the load job has run."""
        test_df = pd.DataFrame({"A": [1, 2]})
        ddf = DelayedDataFrame("shu", lambda: test_df)
        # Nothing has been loaded before the pipegraph runs.
        assert not hasattr(ddf, "df")
        load_job = ddf.load()
        force_load(load_job, False)
        ppg.run_pipegraph()
        assert_frame_equal(ddf.df, test_df)
        assert ddf.non_annotator_columns == "A"

Пример #17

0

Показать файл

Файл: test_comparisons.py Проект: MarcoMernberger/mbf_comparisons

 def test_simple_from_anno_plus_column_pos(self):
     """Build the groups from ``(annotator, column position)`` pairs.

     Both groups are constants (5 and 10); the expected log2FC of -1 for
     every row is consistent with log2(5 / 10).
     """
     frame = pd.DataFrame({"a": [1, 2, 3], "b": [2, 8, 16 * 3]})
     d = DelayedDataFrame("ex1", frame)
     five = Constant("five", 5)
     ten = Constant("ten", 10)
     comparisons = Comparisons(d, {"a": [(five, 0)], "b": [(ten, 0)]})
     comparison = comparisons.a_vs_b("a", "b", Log2FC(), laplace_offset=0)
     force_load(d.add_annotator(comparison), "fl1")
     run_pipegraph()
     observed = d.df[comparison["log2FC"]]
     assert (observed == [-1, -1, -1]).all()

Пример #18

0

Показать файл

    def test_write_excel(self):
        """Write the frame as ``sha.xls`` and read it back with ``read_excel``."""
        test_df = pd.DataFrame({"A": [1, 2]})
        ddf = DelayedDataFrame("shu", lambda: test_df, result_dir="sha")
        # The result directory must exist right after construction.
        assert Path("sha").exists()
        assert_frame_equal(ddf.df, test_df)
        assert ddf.non_annotator_columns == "A"
        # The second element of write()'s result is used as the output path.
        output_path = ddf.write("sha.xls")[1]
        assert output_path.exists()
        reloaded = pd.read_excel(output_path)
        assert_frame_equal(reloaded, test_df)

Пример #19

0

Показать файл

 def test_annotator_coliding_with_non_anno_column(self):
     """Expect a failure when an annotator column already exists in the frame.

     The frame already has a column ``A``; adding ``Constant("A", ...)``
     must make the pipegraph run fail with a "were already present" error.
     """

     def load():
         frame = pd.DataFrame(
             {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
         )
         return frame.set_index("idx")

     ddf = DelayedDataFrame("shu", load)
     ddf += Constant("A", "aa")
     lj = ddf.anno_jobs["A"]
     trigger = ppg.JobGeneratingJob("shu", lambda: 55)
     trigger.depends_on(ddf.annotate())
     with pytest.raises(ppg.RuntimeError):
         ppg.run_pipegraph()
     message = str(lj().exception)
     assert "were already present" in message

Пример #20

0

Показать файл

    def test_filteringA(self):
        """Add an annotator to the parent after the child was created.

        Column ``C`` from ``LenAnno("C")`` is expected in both frames, with
        the value ``"C2"`` in every row.
        """
        ppg.util.global_pipegraph.quiet = False

        def load():
            return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

        parent = DelayedDataFrame("shu", load)
        child = parent.filter("sha", lambda df: df["A"] == 1)
        parent += LenAnno("C")
        child.write()
        ppg.run_pipegraph()
        assert "C" in child.df.columns
        assert "C" in parent.df.columns
        assert (child.df["C"] == "C2").all()
        assert (parent.df["C"] == "C2").all()

Пример #21

0

Показать файл

    def test_write(self):
        """Write a frame with a result directory and read the file back."""
        test_df = pd.DataFrame({"A": [1, 2]})
        ddf = DelayedDataFrame("shu", lambda: test_df, result_dir="sha")
        # The result directory must exist right after construction.
        assert Path("sha").exists()
        assert_frame_equal(ddf.df, test_df)
        assert ddf.non_annotator_columns == "A"
        # The second element of write()'s result is used as the output path.
        output_path = ddf.write()[1]
        assert "/sha" in str(output_path.parent)
        assert output_path.exists()
        reloaded = pd.read_csv(output_path, sep="\t")
        assert_frame_equal(reloaded, test_df)

Пример #22

0

Показать файл

    def test_write_excel2(self):
        """Write a 257-column frame as ``sha.xls`` and read it back as TSV.

        NOTE(review): the file is requested with an ``.xls`` name but read
        with ``read_csv(sep="\\t")`` - presumably the writer falls back to
        tab-separated text for this many columns; confirm.
        """
        # Columns A0 .. A256, each holding two ones.
        data = {"A%i" % i: [1, 1] for i in range(0, 257)}
        test_df = pd.DataFrame(data)

        ddf = DelayedDataFrame("shu", lambda: test_df, result_dir="sha")
        output_path = ddf.write("sha.xls")[1]
        assert output_path.exists()
        reloaded = pd.read_csv(output_path, sep="\t")
        assert_frame_equal(reloaded, test_df)

Пример #23

0

Показать файл

    def test_anno_returing_right_length_but_wrong_start_range_index(self):
        """Expect an "Index mismatch" error for a shifted ``RangeIndex``.

        The annotator returns three values, matching the frame's length, but
        on a ``RangeIndex`` that starts at 5.
        """
        ddf = DelayedDataFrame("shu", lambda: pd.DataFrame({"A": [1, 2, 3]}))

        class BadAnno(Annotator):
            columns = ["X"]

            def calc(self, df):
                shifted_index = pd.RangeIndex(5, 5 + 3)
                return pd.Series(["a", "b", "c"], index=shifted_index)

        ddf += BadAnno()
        force_load(ddf.annotate())
        lj = ddf.anno_jobs["X"]
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        message = str(lj().exception)
        assert "Index mismatch" in message

Пример #24

0

Показать файл

    def normed_ddf(self, input_ddf):
        """Return a ``DelayedDataFrame`` holding the normalized heatmap columns.

        The new frame is named ``<input name>_heatmap_<strategy name>`` and
        uses ``input_ddf.result_dir``. Its loading function selects the
        columns named by the second element of each ``self.columns`` entry
        from ``input_ddf.df`` and passes them, together with the column
        names, to ``self.normalization_strategy.calc``.

        Inside a pipegraph the dependencies are: one ``add_annotator`` result
        per non-None first element of ``self.columns``, the strategy's
        ``deps()``, ``input_ddf.load()`` and a ``FunctionInvariant`` on the
        strategy's ``calc``. Outside a pipegraph the list is empty.
        """

        def load():
            column_names = [ac[1] for ac in self.columns]
            selected = input_ddf.df[column_names]
            # calc gets its own copy of the name list, as before.
            return self.normalization_strategy.calc(selected, list(column_names))

        strategy = self.normalization_strategy
        output_name = input_ddf.name + "_heatmap_" + strategy.name
        deps = []
        if ppg.inside_ppg():
            # NOTE(review): annotators are added to self.ddf while the data
            # is read from input_ddf - confirm that this is intended.
            for ac in self.columns:
                if ac[0] is not None:
                    deps.append(self.ddf.add_annotator(ac[0]))
            deps.append(self.normalization_strategy.deps())
            deps.append(input_ddf.load())
            deps.append(
                ppg.FunctionInvariant(
                    output_name + '_calc', self.normalization_strategy.calc
                )
            )

        return DelayedDataFrame(output_name, load, deps, input_ddf.result_dir)

Пример #25

0

Показать файл

    def test_anno_not_returning_enough_rows_and_no_index_range_index_on_df(self):
        """Expect a "Length and index mismatch " error for a too-short result.

        The annotator returns a single row for a three-row frame.
        """

        class BrokenAnno(Annotator):
            columns = ["X"]

            def calc(self, df):
                return pd.DataFrame({"X": [1]})

        def load():
            return pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]})

        ddf = DelayedDataFrame("shu", load)
        ddf += BrokenAnno()
        lj = ddf.anno_jobs["X"]
        trigger = ppg.JobGeneratingJob("shu", lambda: 55)
        trigger.depends_on(ddf.annotate())
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        message = str(lj().exception)
        assert "Length and index mismatch " in message

Пример #26

0

Показать файл

Файл: test_comparisons.py Проект: MarcoMernberger/mbf_comparisons

    def test_volcano_plot(self):
        """Check the volcano QC plot of a filtered t-test comparison.

        ``prune_qc`` is called with a predicate that matches job ids
        containing "volcano". After the run exactly one non-pruned QC job is
        expected, and its first output file is compared as an image.
        """
        ppg.util.global_pipegraph.quiet = False
        import mbf_sampledata

        sample_path = mbf_sampledata.get_sample_path(
            "mbf_comparisons/pasillaCount_deseq2.tsv.gz")
        counts = pd.read_csv(sample_path, sep=" ")
        counts.columns = [str(name) for name in counts.columns]
        treated = [
            name for name in counts.columns if name.startswith("treated")
        ]
        untreated = [
            name for name in counts.columns if name.startswith("untreated")
        ]
        ddf = DelayedDataFrame("pasilla", counts)
        groups = {"treated": treated, "untreated": untreated}
        comp = Comparisons(ddf, groups).a_vs_b("treated", "untreated", TTest())
        comp.filter([("log2FC", "|>=", 2.0), ("FDR", "<=", 0.05)])
        prune_qc(lambda job: "volcano" in job.job_id)
        run_pipegraph()
        remaining = [job for job in list(get_qc_jobs()) if not job._pruned]
        print(remaining)
        assert len(remaining) == 1
        assert_image_equal(remaining[0].filenames[0])

Пример #27

0

Показать файл

    def test_filtering_result_dir(self):
        """Check that ``filter`` honours an explicit ``result_dir``."""
        counts = collections.Counter()

        # NOTE(review): neither `counts` nor class A is used by the
        # assertion below; both are kept as in the original.
        class A(Annotator):
            cache_name = "A"
            columns = ["aa"]

            def calc(self, df):
                counts["A"] += 1
                column = self.columns[0]
                return pd.DataFrame({column: "a"}, index=df.index)

        def load():
            return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

        parent = DelayedDataFrame("shu", load)
        child = parent.filter(
            "sha", lambda df: df["A"] == 1, result_dir="shu2")
        expected_dir = Path("shu2").absolute()
        assert child.result_dir.absolute() == expected_dir

Пример #28

0

Показать файл

Файл: test_comparisons.py Проект: MarcoMernberger/mbf_comparisons

    def test_deseq2(self):
        import mbf_sampledata

        pasilla_data = pd.read_csv(
            mbf_sampledata.get_sample_path(
                "mbf_comparisons/pasillaCount_deseq2.tsv.gz"),
            sep=" ",
        )
        # pasilla_data = pasilla_data.set_index('Gene')
        pasilla_data.columns = [str(x) for x in pasilla_data.columns]

        gts = {
            "treated":
            [x for x in pasilla_data.columns if x.startswith("treated")],
            "untreated":
            [x for x in pasilla_data.columns if x.startswith("untreated")],
        }
        ddf = DelayedDataFrame("ex", pasilla_data)
        c = Comparisons(ddf, gts)
        a = c.a_vs_b("treated", "untreated", DESeq2Unpaired())
        force_load(ddf.add_annotator(a))
        run_pipegraph()
        check = """# This is deseq2 version specific data- probably needs fixing if upgrading deseq2
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## FBgn0039155 453 -3.72 0.160 -23.2 1.63e-119 1.35e-115
## FBgn0029167 2165 -2.08 0.103 -20.3 1.43e-91 5.91e-88
## FBgn0035085 367 -2.23 0.137 -16.3 6.38e-60 1.75e-56
## FBgn0029896 258 -2.21 0.159 -13.9 5.40e-44 1.11e-40
## FBgn0034736 118 -2.56 0.185 -13.9 7.66e-44 1.26e-40
"""
        df = ddf.df.sort_values(a["FDR"])
        df = df.set_index("Gene")
        for row in check.split("\n"):
            row = row.strip()
            if row and not row[0] == "#":
                row = row.split()
                self.assertAlmostEqual(df.ix[row[0]][a["log2FC"]],
                                       float(row[2]),
                                       places=2)
                self.assertAlmostEqual(df.ix[row[0]][a["p"]],
                                       float(row[5]),
                                       places=2)
                self.assertAlmostEqual(df.ix[row[0]][a["FDR"]],
                                       float(row[6]),
                                       places=2)

Пример #29

0

Показать файл

Файл: test_comparisons.py Проект: MarcoMernberger/mbf_comparisons

    def test_double_comparison_with_different_strategies(self):
        """Run the same A-vs-B comparison with ``TTestPaired`` and ``TTest``.

        Both comparisons are added to one frame; their p-values and FDRs
        are compared against stored numbers with an absolute tolerance of
        1e-4.
        """
        tolerance = 1e-4
        frame = pd.DataFrame({
            "A.R1": [0, 0, 0, 0],
            "A.R2": [0, 0, 0, 0],
            "A.R3": [0, 0.001, 0.001, 0.001],
            "B.R1": [0.95, 0, 0.56, 0],
            "B.R2": [0.99, 0, 0.56, 0],
            "B.R3": [0.98, 0, 0.57, 0.5],
            "C.R1": [0.02, 0.73, 0.59, 0],
            "C.R2": [0.03, 0.75, 0.57, 0],
            "C.R3": [0.05, 0.7, 0.58, 1],
        })
        ddf = DelayedDataFrame("ex1", frame)
        # groupby needs sorted input; the key is the leading letter.
        groups_to_samples = {}
        for letter, members in itertools.groupby(
                sorted(frame.columns), lambda x: x[0]):
            groups_to_samples[letter] = list(members)

        comparisons = Comparisons(ddf, groups_to_samples)
        paired = comparisons.a_vs_b("A", "B", TTestPaired())
        force_load(ddf.add_annotator(paired))
        unpaired = comparisons.a_vs_b("A", "B", TTest())
        force_load(ddf.add_annotator(unpaired))
        run_pipegraph()

        def check_p(comparison, expected_values):
            # Compare the p column row by row against the stored values.
            for row, expected in enumerate(expected_values):
                observed = ddf.df[comparison["p"]].iloc[row]
                assert observed == pytest.approx(expected, abs=tolerance)

        check_p(paired, [
            8.096338300746213e-07,
            0.42264973081037427,
            0.041378369826042816,
            0.42264973081037427,
        ])
        assert ddf.df[paired["FDR"]].values == pytest.approx(
            [3.238535e-06, 4.226497e-01, 8.275674e-02, 4.226497e-01],
            abs=tolerance)

        # Values after the first were calculated with scipy to double check.
        check_p(unpaired, [
            8.096e-07,
            0.42264973081037427,
            0.04157730613277929,
            0.703158104919873,
        ])
        assert ddf.df[unpaired["FDR"]].values == pytest.approx(
            [3.238535e-06, 5.635329e-01, 8.315462e-02, 7.031581e-01],
            abs=tolerance)

Пример #30

0

Показать файл

    def test_write_mangle(self):
        """Write a frame through a mangling function and read the result back.

        The mangler drops column ``A`` and keeps only rows where ``B`` is
        ``"c"``; the written file must equal the mangled frame.
        """
        test_df = pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

        ddf = DelayedDataFrame("shu", lambda: test_df)
        assert_frame_equal(ddf.df, test_df)
        assert (ddf.non_annotator_columns == ["A", "B"]).all()

        def mangle(df):
            without_a = df.drop("A", axis=1)
            return without_a[without_a.B == "c"]

        # The second element of write()'s result is used as the output path.
        output_path = ddf.write("test.csv", mangle)[1]
        assert output_path.exists()
        reloaded = pd.read_csv(output_path, sep="\t")
        assert_frame_equal(reloaded, mangle(test_df))

Python DelayedDataFrame примеры использования