def test_lineage():
    """Check lineage counts of unioned RDDs and the pipelines of their stages.

    Builds a flat union (rdd1, rdd2), a union of unions (rdd3), a nested
    union (rdd4) and a ``groupWith`` of the last two (rdd5).  It asserts the
    size of ``dep_lineage_counts`` on the unions, then asks the scheduler for
    a new stage per RDD and asserts the keys of ``stage.pipelines`` and the
    contents of ``stage.pipeline_edges``.

    The ``print``/``pprint`` calls are diagnostic output only.
    """
    Scope.reset()
    dc = DparkContext()

    # NOTE(review): each union's sources are built by a single comprehension;
    # the lineage assertions presumably depend on that shared call site, so
    # keep these expressions structurally as they are -- confirm against Scope.
    rdd1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(5)])
    assert len(rdd1.dep_lineage_counts) == 1

    rdd2 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(3)])
    rdd3 = rdd1.union(rdd2)
    assert len(rdd3.dep_lineage_counts) == 2

    rdd4 = dc.union([dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)]) for _ in range(4)])
    assert len(rdd4.dep_lineage_counts) == 1
    assert len(list(rdd4.dependencies)[0].rdd.dep_lineage_counts) == 1

    rdd5 = rdd3.groupWith(rdd4)

    # Expectations for any stage rooted at rdd3; they are checked twice below
    # (once on a fresh stage, once on rdd5's parent stage).
    rdd3_pipeline_ids = [rdd1.id, rdd2.id, rdd3.id]
    rdd3_edges = {
        ((-1, rdd1.id), (-1, rdd3.id)): 1,
        ((-1, rdd2.id), (-1, rdd3.id)): 1,
    }

    print("rdd1", rdd1.id, rdd1.dep_lineage_counts)
    stage = dc.scheduler.newStage(rdd1, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd1.id]
    assert stage.pipeline_edges == {}

    stage = dc.scheduler.newStage(rdd3, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(stage.pipelines.keys()) == rdd3_pipeline_ids
    assert stage.pipeline_edges == rdd3_edges

    stage = dc.scheduler.newStage(rdd4, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert list(stage.pipelines.keys()) == [rdd4.id]
    assert stage.pipeline_edges == {}

    print("rdd5", rdd5.id, rdd3.id, rdd4.id)
    stage = dc.scheduler.newStage(rdd5, None)
    pprint(stage.pipelines)
    pprint(stage.pipeline_edges)
    assert sorted(stage.pipelines.keys()) == [rdd5.id]
    # Every parent stage must feed rdd5's pipeline through exactly one edge key.
    assert sorted(stage.pipeline_edges) == sorted(
        ((s.id, s.rdd.id), (-1, rdd5.id)) for s in stage.parents
    )

    print('-' * 100)
    pprint(stage.get_pipeline_graph())

    for s in stage.parents:
        if s.rdd.id == rdd4.id:
            assert list(s.pipelines.keys()) == [rdd4.id]
            assert s.pipeline_edges == {}
        elif s.rdd.id == rdd3.id:
            assert sorted(s.pipelines.keys()) == rdd3_pipeline_ids
            assert s.pipeline_edges == rdd3_edges
        else:
            # Only rdd3 and rdd4 are expected as parents; report which RDD
            # showed up instead of failing with a bare ``assert False``.
            raise AssertionError(
                "unexpected parent stage for rdd %r" % (s.rdd.id,))

        pprint(s.get_pipeline_graph())
def get_rdd(self):
    """Return one RDD of parsed records covering every file in ``self.paths``.

    A new ``DparkContext`` is created on each call.  Every path is opened
    with ``textFile`` using ``splitSize=64 << 20``, the per-file RDDs are
    unioned, and each line is mapped through ``Weblog.from_line``.
    """
    ctx = DparkContext()
    # 64 << 20 == 67108864 (presumably bytes, i.e. 64 MiB -- confirm unit).
    per_file = [ctx.textFile(path, splitSize=64 << 20) for path in self.paths]
    merged = ctx.union(per_file)
    return merged.map(Weblog.from_line)
def test_call_graph_union():
    """Check the call graph the scheduler reports for a union of two unions.

    Two unions of two ``makeRDD`` sources each are unioned again; the graph
    returned by ``get_call_graph`` must equal the expected ``(nodes, edges)``
    pair.  ``fmt_call_graph`` is also invoked on the result, but its output
    is not asserted.
    """
    dc = DparkContext()
    Scope.reset()

    # NOTE(review): the expected node ids and edge weights presumably depend
    # on where these RDDs are created; keep the construction structure as is.
    r1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
    r2 = dc.union([dc.makeRDD([(3, 4)]) for _ in range(2)])
    rdd = r1.union(r2)

    dc.scheduler.current_scope = Scope.get("")
    g = dc.scheduler.get_call_graph(rdd)

    # The formatted graph is not inspected; the call only has to succeed.
    dc.scheduler.fmt_call_graph(g)

    expected_nodes = [0, 1, 2, 3, 4, 5]
    expected_edges = {
        (0, 1): 2,
        (1, 4): 1,
        (2, 3): 2,
        (3, 4): 1,
        (4, 5): 1,
    }
    assert g == (expected_nodes, expected_edges)
def test_call_graph_union():
    """Assert the scheduler's call graph for ``union(union(..), union(..))``.

    The graph from ``get_call_graph`` is compared against a fixed pair of
    node ids and weighted edges.  ``fmt_call_graph`` is run on the same graph
    as a smoke check; its return value is deliberately discarded.
    """
    dc = DparkContext()
    Scope.reset()

    # NOTE(review): node ids / edge weights below presumably follow from how
    # and where these RDDs are built -- do not restructure the construction.
    left = dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)])
    right = dc.union([dc.makeRDD([(3, 4)]) for _ in range(2)])
    combined = left.union(right)

    dc.scheduler.current_scope = Scope.get("")
    call_graph = dc.scheduler.get_call_graph(combined)

    # Smoke check only: formatting must not raise.
    dc.scheduler.fmt_call_graph(call_graph)

    assert call_graph == (
        [0, 1, 2, 3, 4, 5],
        {(0, 1): 2, (1, 4): 1, (2, 3): 2, (3, 4): 1, (4, 5): 1},
    )
def test_lineage():
    """Verify ``dep_lineage_counts`` of unions and the stage pipeline layout.

    Steps:
      1. Build rdd1/rdd2 (flat unions), rdd3 (union of both), rdd4 (nested
         union) and rdd5 (``rdd3.groupWith(rdd4)``), asserting the length of
         ``dep_lineage_counts`` along the way.
      2. Create a stage for rdd1, rdd3, rdd4 and rdd5 via
         ``dc.scheduler.newStage`` and assert ``pipelines`` keys and
         ``pipeline_edges`` for each.
      3. Re-check the same expectations on rdd5's parent stages.

    Printed output is for debugging only and is never asserted.
    """

    def dump(stage_):
        # Diagnostic helper: show what the assertions that follow inspect.
        pprint(stage_.pipelines)
        pprint(stage_.pipeline_edges)

    Scope.reset()
    dc = DparkContext()

    # NOTE(review): sources of each union come from one comprehension; the
    # lineage counts asserted here presumably rely on that -- keep as is.
    rdd1 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(5)])
    assert len(rdd1.dep_lineage_counts) == 1

    rdd2 = dc.union([dc.makeRDD([(1, 2)]) for _ in range(3)])
    rdd3 = rdd1.union(rdd2)
    assert len(rdd3.dep_lineage_counts) == 2

    rdd4 = dc.union(
        [dc.union([dc.makeRDD([(1, 2)]) for _ in range(2)]) for _ in range(4)])
    assert len(rdd4.dep_lineage_counts) == 1
    assert len(list(rdd4.dependencies)[0].rdd.dep_lineage_counts) == 1

    rdd5 = rdd3.groupWith(rdd4)

    # Shared expectations for a stage whose final RDD is rdd3.
    union_ids = [rdd1.id, rdd2.id, rdd3.id]
    union_edges = {
        ((-1, rdd1.id), (-1, rdd3.id)): 1,
        ((-1, rdd2.id), (-1, rdd3.id)): 1,
    }

    print("rdd1", rdd1.id, rdd1.dep_lineage_counts)
    stage = dc.scheduler.newStage(rdd1, None)
    dump(stage)
    assert list(stage.pipelines.keys()) == [rdd1.id]
    assert stage.pipeline_edges == {}

    stage = dc.scheduler.newStage(rdd3, None)
    dump(stage)
    assert sorted(stage.pipelines.keys()) == union_ids
    assert stage.pipeline_edges == union_edges

    stage = dc.scheduler.newStage(rdd4, None)
    dump(stage)
    assert list(stage.pipelines.keys()) == [rdd4.id]
    assert stage.pipeline_edges == {}

    print("rdd5", rdd5.id, rdd3.id, rdd4.id)
    stage = dc.scheduler.newStage(rdd5, None)
    dump(stage)
    assert sorted(stage.pipelines.keys()) == [rdd5.id]
    expected_edge_keys = sorted(
        ((parent.id, parent.rdd.id), (-1, rdd5.id)) for parent in stage.parents
    )
    assert sorted(stage.pipeline_edges) == expected_edge_keys

    print('-' * 100)
    pprint(stage.get_pipeline_graph())

    for s in stage.parents:
        parent_rdd_id = s.rdd.id
        if parent_rdd_id == rdd4.id:
            assert list(s.pipelines.keys()) == [rdd4.id]
            assert s.pipeline_edges == {}
        elif parent_rdd_id == rdd3.id:
            assert sorted(s.pipelines.keys()) == union_ids
            assert s.pipeline_edges == union_edges
        else:
            # A bare ``assert False`` would hide which stage was unexpected.
            raise AssertionError(
                "unexpected parent stage for rdd %r" % (parent_rdd_id,))

        pprint(s.get_pipeline_graph())
def m(x):
    """Identity mapper: return the record unchanged."""
    return x


def r(x, y):
    """Reducer passed to ``reduceByKey``: combine two values with ``+``."""
    return x + y


def src():
    """Create a new source RDD holding the single pair ``(1, 1)``.

    Reads the module-level ``dc``, which is bound further down; that is fine
    because the name is only looked up when ``src`` is called.
    """
    # Second argument is 2 (presumably the number of splits -- confirm).
    return dc.makeRDD([(1, 1)], 2)


dc = DparkContext("mesos")

# Two base RDDs: a plain source and a reduced source.
rdd1 = src()
rdd2 = src().reduceByKey(r)

# Level 1: three plain sources, kept as two lists that are concatenated below.
to_union_1_a = [src() for _ in range(2)]
to_union_1_b = [src()]

# Level 2: two unions over the same level-1 sources, plus the two base RDDs.
to_union_2_a = [dc.union(to_union_1_a + to_union_1_b) for _ in range(2)]
to_union_2_b = [rdd2, rdd1]

# Level 3: union everything from level 2, map and reduce it, then union the
# result with rdd2 once more.
to_union_3_a = [dc.union(to_union_2_a + to_union_2_b).map(m).reduceByKey(r)]
to_union_3_b = [rdd2]
rdd3 = dc.union(to_union_3_a + to_union_3_b)

# Self-join of the reduced source.
rdd4 = rdd2.join(rdd2)

# Run each RDD as its own action; the results are discarded.
rdd1.collect()
rdd2.collect()
rdd3.collect()
rdd4.collect()
def get_rdd(self):
    """Build the RDD of ``Weblog`` records for all of ``self.paths``.

    Creates a ``DparkContext``, reads each path as a text file with
    ``splitSize=64 << 20``, unions the resulting RDDs and maps every line
    through ``Weblog.from_line``.
    """
    context = DparkContext()
    sources = [
        context.textFile(path, splitSize=64 << 20)
        for path in self.paths
    ]
    return context.union(sources).map(Weblog.from_line)
# -*- coding: utf-8 -*-
from dpark import DparkContext

dc = DparkContext()


def get_rdd():
    """Return a new RDD containing the single pair ``(1, 1)``."""
    return dc.makeRDD([(1, 1)])


# Three inputs of different shape: a plain source, a union of two sources,
# and a source grouped by key.
rdd1 = get_rdd()
rdd2 = dc.union([get_rdd() for _ in range(2)])
rdd3 = get_rdd().groupByKey()

# Union all three and run the job; the collected result is discarded.
dc.union([rdd1, rdd2, rdd3]).collect()
def m(x):
    """Pass-through function used with ``map``."""
    return x


def r(x, y):
    """Add two values; used as the ``reduceByKey`` combiner."""
    return x + y


def src():
    """Return a fresh RDD built from ``[(1, 1)]`` via the module-level ``dc``."""
    return dc.makeRDD([(1, 1)], 2)


dc = DparkContext("mesos")

rdd1 = src()                   # plain source
rdd2 = src().reduceByKey(r)    # reduced source

# Inputs for the innermost union: 2 + 1 plain sources.
to_union_1_a = [src() for _ in range(2)]
to_union_1_b = [src()]

# Inputs for the middle union: two unions of the lists above, then rdd2 and
# rdd1 (in that order).
to_union_2_a = [dc.union(to_union_1_a + to_union_1_b) for _ in range(2)]
to_union_2_b = [rdd2, rdd1]

# Inputs for the outer union: the mapped-and-reduced middle union, then rdd2.
to_union_3_a = [
    dc.union(to_union_2_a + to_union_2_b).map(m).reduceByKey(r)
]
to_union_3_b = [rdd2]

rdd3 = dc.union(to_union_3_a + to_union_3_b)
rdd4 = rdd2.join(rdd2)

# Four separate actions, one per RDD; return values are not used.
rdd1.collect()
rdd2.collect()
rdd3.collect()
rdd4.collect()