示例#1
0
    def test_grow2(self):
        """Growing by 50000 with on_overlap="merge" fuses newly overlapping regions.

        Three regions on chromosome "1" go in; the last two overlap once
        grown, so two regions are expected to come out.
        """
        def sample_data():
            # Column-oriented spelling of the three input intervals.
            return pd.DataFrame({
                "chr": ["1", "1", "1"],
                "start": [7774885, 8286026, 8323232],
                "stop": [7791673, 8298500, 8342008],
            })

        genome = get_genome_chr_length()
        source = regions.GenomicRegions("sharum", sample_data, [], genome)
        grown = source.convert(
            "agrown", regions.convert.grow(50000), on_overlap="merge"
        )
        force_load(grown.load())
        run_pipegraph()

        # The merged region spans from the second region's grown start to the
        # third region's grown stop.
        expected_starts = [7774885 - 50000, 8286026 - 50000]
        expected_stops = [7791673 + 50000, 8342008 + 50000]
        assert (grown.df["start"] == expected_starts).all()
        assert (grown.df["stop"] == expected_stops).all()
示例#2
0
    def test_random_same_number(self):
        """A custom conversion keeps the row count and only the columns it returns.

        The conversion shifts every start by one and drops the extra column;
        the source regions must still carry that column afterwards. Inside a
        pipegraph, the extra dependencies handed to convert() must show up as
        prerequisites of the converted regions' load job.
        """
        def sample_data():
            columns = {
                "chr": ["1", "2", "1"],
                "start": [10, 100, 1000],
                "stop": [12, 110, 1110],
                "column_that_will_disappear": ["A", "b", "c"],
            }
            return pd.DataFrame(columns)

        def convert(df):
            # Keep only the interval columns and move each start up by one.
            intervals = df[["chr", "start", "stop"]]
            return intervals.assign(start=intervals["start"] + 1)

        # NOTE(review): ("hello") is the plain string "hello", not a 1-tuple.
        deps = (
            [ppg.ParameterInvariant("shuParam", ("hello"))]
            if ppg.inside_ppg()
            else []
        )
        genome = get_genome_chr_length()
        source = regions.GenomicRegions("sharum", sample_data, [], genome)
        source.add_annotator(Constant("Constant", 5))
        source.annotate()
        shifted = source.convert("a+1", convert, dependencies=deps)
        force_load(shifted.load())
        assert all(dep in shifted.load().lfg.prerequisites for dep in deps)
        run_pipegraph()

        assert len(source.df) == len(shifted.df)
        assert (source.df["start"] == shifted.df["start"] - 1).all()
        assert "column_that_will_disappear" in source.df.columns
        assert "column_that_will_disappear" not in shifted.df.columns
示例#3
0
    def test_cookiecutter_summit(self):
        """cookie_summit(…, 220) yields 220-wide regions around each summit.

        Without drop_those_outside_chromosomes both regions are kept (the
        first one starting at 0); with it set to True only one region is
        expected to remain.
        """
        def sample_data():
            return pd.DataFrame(
                {"chr": ["1", "2"], "start": [0, 0], "stop": [200, 99 * 3]}
            )

        genome = get_genome_chr_length()
        source = regions.GenomicRegions("sharum", sample_data, [], genome)
        kept = source.convert(
            "cookie", regions.convert.cookie_summit(source.summit_annotator, 220)
        )
        dropped = source.convert(
            "cookieB",
            regions.convert.cookie_summit(
                source.summit_annotator,
                220,
                drop_those_outside_chromosomes=True,
            ),
        )
        force_load(kept.load())
        force_load(dropped.load())
        run_pipegraph()

        assert len(kept.df) == 2
        assert len(dropped.df) == 1
        # Midpoint of the second region, used for both of its expected bounds.
        second_summit = math.floor(99 * 3 / 2)
        assert (kept.df["start"] == [0, second_summit - 110]).all()
        assert (kept.df["stop"] == [100 + 110, second_summit + 110]).all()
示例#4
0
    def test_name_must_be_string(self):
        """convert() must raise ValueError when the new name is an int."""
        def sample_data():
            return pd.DataFrame(
                {"chr": ["1", "2"], "start": [0, 0], "stop": [200, 99 * 3]}
            )

        genome = get_genome_chr_length()
        source = regions.GenomicRegions("sharum", sample_data, [], genome)
        with pytest.raises(ValueError):
            # 123 is deliberately not a string.
            source.convert(123, regions.convert.shift(50))
示例#5
0
    def test_grow(self):
        """grow(12) widens every region by 12 on both sides.

        The first region starts at 10, so its grown start is expected to be 0
        rather than a negative coordinate.
        """
        def sample_data():
            columns = {
                "chr": ["1", "1", "2"],
                "start": [10, 100, 1000],
                "stop": [12, 110, 1110],
            }
            return pd.DataFrame(columns)

        genome = get_genome_chr_length()
        source = regions.GenomicRegions("sharum", sample_data, [], genome)
        grown = source.convert("agrown", regions.convert.grow(12))
        force_load(grown.load())
        run_pipegraph()

        expected_starts = [0, 100 - 12, 1000 - 12]
        expected_stops = [24, 110 + 12, 1110 + 12]
        assert (grown.df["start"] == expected_starts).all()
        assert (grown.df["stop"] == expected_stops).all()
示例#6
0
    def test_merge_connected_2(self):
        """merge_connected() leaves these three regions exactly as they were."""
        input_starts = [27897200, 27898600, 0]
        input_stops = [27897300, 27898700, 100]

        def sample_data():
            # Fresh lists per call so the frame never shares the outer ones.
            return pd.DataFrame({
                "chr": ["1", "1", "2"],
                "start": list(input_starts),
                "stop": list(input_stops),
            })

        genome = get_genome_chr_length()
        source = regions.GenomicRegions("sharum", sample_data, [], genome)
        merged = source.convert("agrown", regions.convert.merge_connected())
        force_load(merged.load())
        run_pipegraph()

        # Output is expected to equal the input coordinates.
        assert (merged.df["start"] == input_starts).all()
        assert (merged.df["stop"] == input_stops).all()
示例#7
0
    def test_summit(self):
        """summit() reduces each region to a one-base interval.

        For the inputs 0-200 and 0-297 the expected results are 100-101 and
        148-149.
        """
        def sample_data():
            return pd.DataFrame(
                {"chr": ["1", "2"], "start": [0, 0], "stop": [200, 99 * 3]}
            )

        genome = get_genome_chr_length()
        source = regions.GenomicRegions("sharum", sample_data, [], genome)
        summits = source.convert(
            "cookie", regions.convert.summit(source.summit_annotator)
        )
        force_load(summits.load())
        run_pipegraph()

        assert len(summits.df) == 2
        expected_starts = [100, 148]
        expected_stops = [101, 149]
        assert (summits.df["start"] == expected_starts).all()
        assert (summits.df["stop"] == expected_stops).all()
示例#8
0
    def test_raises_on_conversion_function_not_returning_df(self):
        """A conversion function that returns None must end in ValueError."""
        def sample_data():
            columns = {
                "chr": ["1", "2", "1"],
                "start": [10, 100, 1000],
                "stop": [12, 110, 1110],
            }
            return pd.DataFrame(columns)

        def convert(df):
            # Deliberately not a DataFrame.
            return None

        genome = get_genome_chr_length()
        source = regions.GenomicRegions("sharum", sample_data, [], genome)
        # All three steps sit inside the raises block, so the test passes
        # whichever of them raises the ValueError.
        # NOTE(review): the converted regions are discarded and it is the
        # source regions that get force-loaded - confirm this is intended.
        with pytest.raises(ValueError):
            source.convert("a+1", convert)
            force_load(source.load())
            run_pipegraph()
示例#9
0
    def test_cookie_cutter(self):
        """cookie_cutter(100) yields 100-wide regions and keeps the strand column.

        Expected starts are 50 and 98 for the inputs 0-200 and 0-297.
        """
        def sample_data():
            columns = {
                "chr": ["1", "2"],
                "start": [0, 0],
                "stop": [200, 99 * 3],
                "strand": [-1, 1],
            }
            return pd.DataFrame(columns)

        genome = get_genome_chr_length()
        source = regions.GenomicRegions("sharum", sample_data, [], genome)
        cut = source.convert("cookie", regions.convert.cookie_cutter(100))
        force_load(cut.load())
        run_pipegraph()

        expected_starts = [50, 98]
        expected_stops = [50 + 100, 98 + 100]
        assert (cut.df["start"] == expected_starts).all()
        assert (cut.df["stop"] == expected_stops).all()
        assert (cut.df["strand"] == [-1, 1]).all()
示例#10
0
    def test_liftover(self):
        """lift_over("hg19ToHg38") maps the sample regions to known coordinates."""
        def sample_data():
            columns = {
                "chr": ["1", "1", "2"],
                "start": [27897200, 27898600, 100000],
                "stop": [27897300, 27898700, 100100],
            }
            return pd.DataFrame(columns)

        genome = get_genome_chr_length()
        source = regions.GenomicRegions("sharum", sample_data, [], genome)
        lifted = source.convert("hg38", regions.convert.lift_over("hg19ToHg38"))
        force_load(lifted.load())
        run_pipegraph()

        # Reference values were produced with the UCSC web liftover utility:
        # http://genome.ucsc.edu/cgi-bin/hgLiftOver
        expected_starts = [27570689, 27572089, 100000]
        expected_stops = [27570789, 27572189, 100100]
        assert (lifted.df["start"] == expected_starts).all()
        assert (lifted.df["stop"] == expected_stops).all()
示例#11
0
    def test_overlap_genes_requires_two_genes(self):
        """Genes.overlap_genes() raises ValueError when given GenomicRegions."""
        def sample_data():
            return pd.DataFrame(
                {"chr": ["Chromosome"], "start": [1000], "stop": [1100]}
            )

        genome = get_genome()
        all_genes = genes.Genes(genome)
        plain_regions = regions.GenomicRegions("sha", sample_data, [], genome)
        for loadable in (all_genes, plain_regions):
            force_load(loadable.load())
        run_pipegraph()

        with pytest.raises(ValueError):
            all_genes.overlap_genes(plain_regions)
示例#12
0
    def test_raises_on_non_genome(self):
        """convert() accepts a genome as its third argument but not a string.

        The first call passes the genome object and must not raise; the
        second passes the string "hello" in the same position and must raise
        ValueError.
        """
        def sample_data():
            return pd.DataFrame({
                "chr": ["1", "2", "1"],
                "start": [10, 100, 1000],
                "stop": [12, 110, 1110],
            })

        def convert(df):
            # Return a new frame rather than slicing and updating in place:
            # an in-place "+=" on df[:] can write through to (or warn about)
            # the caller's frame.
            return df.assign(start=df["start"] + 1)

        genome = get_genome_chr_length()
        a = regions.GenomicRegions("sharum", sample_data, [], genome)
        a.convert("a+1", convert, genome)

        with pytest.raises(ValueError):
            a.convert("a+1b", convert, "hello")
示例#13
0
    def test_windows(self):
        """windows(99, flag) tiles each region into 99-wide windows.

        With the second argument False the short trailing window of the first
        region (198-200) is expected in the output; with True it is absent.
        """
        def sample_data():
            return pd.DataFrame(
                {"chr": ["1", "2"], "start": [0, 0], "stop": [200, 99 * 3]}
            )

        genome = get_genome_chr_length()
        source = regions.GenomicRegions("sharum", sample_data, [], genome)
        with_remainder = source.convert(
            "ndiwo", regions.convert.windows(99, False)
        )
        full_only = source.convert("ndiwo2", regions.convert.windows(99, True))
        force_load(with_remainder.load())
        force_load(full_only.load())
        run_pipegraph()

        assert (
            with_remainder.df["start"] == [0, 99, 99 * 2, 0, 99, 99 * 2]
        ).all()
        assert (
            with_remainder.df["stop"] == [99, 99 * 2, 200, 99, 99 * 2, 99 * 3]
        ).all()
        assert (full_only.df["start"] == [0, 99, 0, 99, 99 * 2]).all()
        assert (full_only.df["stop"] == [99, 99 * 2, 99, 99 * 2, 99 * 3]).all()
示例#14
0
    def test_liftover_filter_chr(self):
        """lift_over with filter_to_these_chromosomes=["1"] keeps one region.

        Only the chromosome "1" row is expected in the result, and its extra
        "copy" and "name" columns must come through (keep_name=True is set).
        """
        def sample_data():
            columns = {
                "chr": ["2", "1", "11_gl000202_random", "MT"],
                "start": [100, 27897200, 500, 100000],
                "stop": [10000, 27897300, 5000, 100100],
                "copy": ["D", "A", "B", "C"],
                "name": ["d", "a", "b", "c"],
            }
            return pd.DataFrame(columns)

        chromosome_lengths = {
            "1": 100000,
            "2": 100000,
            "11_gl000202_random": 100000,
            "MT": 100000,
            "11": 1000000,
        }
        genome = get_genome_chr_length(chromosome_lengths)
        source = regions.GenomicRegions("sharum", sample_data, [], genome)
        lift = regions.convert.lift_over(
            "hg19ToHg38",
            keep_name=True,
            filter_to_these_chromosomes=["1"],
        )
        lifted = source.convert("hg38", lift)
        force_load(lifted.load())
        run_pipegraph()

        # Reference values were produced with the UCSC web liftover utility:
        # http://genome.ucsc.edu/cgi-bin/hgLiftOver
        print(lifted.df)
        assert (lifted.df["start"] == [27570689]).all()
        assert (lifted.df["stop"] == [27570789]).all()
        assert (lifted.df["copy"] == ["A"]).all()
        assert (lifted.df["name"] == ["a"]).all()
        assert (lifted.df["chr"] == ["1"]).all()
示例#15
0
    def test_merge_connected(self):
        """merge_connected() fuses overlapping and touching regions per chromosome.

        The source regions are built with on_overlap="ignore" so the
        overlapping input is accepted. Thirteen input rows are expected to
        collapse into nine; for example 300-400 and 400-410 on chromosome "1"
        become 300-410.
        """
        def sample_data():
            columns = {
                "chr": [
                    "1", "1", "1", "1", "1",
                    "2",
                    "3", "3", "3",
                    "4", "4", "4",
                    "5",
                ],
                "start": [
                    10, 13, 110, 300, 400,
                    102,
                    5, 6, 6000,
                    10, 100, 200,
                    100,
                ],
                "stop": [
                    18, 100, 200, 400, 410,
                    1000,
                    5000, 4900, 6010,
                    100, 150, 300,
                    110,
                ],
            }
            return pd.DataFrame(columns)

        genome = get_genome_chr_length()
        source = regions.GenomicRegions(
            "sharum", sample_data, [], genome, on_overlap="ignore"
        )
        merged = source.convert("agrown", regions.convert.merge_connected())
        force_load(merged.load())
        run_pipegraph()

        expected_chr = ["1", "1", "1", "2", "3", "3", "4", "4", "5"]
        expected_starts = [10, 110, 300, 102, 5, 6000, 10, 200, 100]
        expected_stops = [100, 200, 410, 1000, 5000, 6010, 150, 300, 110]
        assert (merged.df["chr"] == expected_chr).all()
        assert (merged.df["start"] == expected_starts).all()
        assert (merged.df["stop"] == expected_stops).all()