def test_producer_consume_2_processes_inline(self):
    data = [1, 2, 3, 4, 5]
    workflow = Iterable(data) | Parallelize(two_split) | (
        Map(add100) | Map(add100)) | Join() | StoreAndPickle()
    workflow.run()
    # workflow refers to the StoreAndPickle() instance, the only leaf of the DAG
    actual = workflow.load()
    # need to sort the result because with symmetric parallelism order is not guaranteed
    self.assertEqual(sorted(actual), [d + 200 for d in data])
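# Hypothetical sketch of the helpers these tests rely on (the real two_split,
# ten_split and add100 are defined elsewhere in the test module). Assumption:
# Parallelize expects a callable that partitions the incoming items into one
# chunk per worker process; the assertions only require that add100 adds 100.
def _round_robin_split(items, n):
    items = list(items)
    return [items[i::n] for i in range(n)]

def two_split(items):
    return _round_robin_split(items, 2)

def ten_split(items):
    return _round_robin_split(items, 10)

def add100(x):
    return x + 100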
def test_producer_map_consume_with_3_process_plus_2_branches(self):
    data = [1, 2, 3, 4, 5]
    workflow = Iterable(data)
    sync_branch = workflow | StoreAndPickle()
    async_branch = workflow | SpawnThread() | Map(add100)
    async_branch1 = async_branch | StoreAndPickle()
    async_branch2 = async_branch | Map(add100) | SpawnThread() | StoreAndPickle()
    workflow.run()
    self.assertEqual(sync_branch.load(), data)
    self.assertEqual(async_branch1.load(), [d + 100 for d in data])
    self.assertEqual(async_branch2.load(), [d + 200 for d in data])
def test_iter_filter_map_list(self):
    producer = Iterable([1, 2, 3])
    flt = Filter(lambda x: x > 1)
    mapper = Map(lambda x: x + 10)
    tolist = AsList()
    producer.add_child(flt)
    flt.add_child(mapper)
    mapper.add_child(tolist)
    producer.run()
    self.assertEqual(tolist.list, [12, 13])
def main():
    """Demonstrate that for I/O-bound operations the Python interpreter releases the GIL.

    The idea is to implement two operations using pypelines: one slow I/O-bound
    communication followed by one slow CPU-bound computation.

    To emulate an inefficient I/O-bound communication we use an HTTP GET to an
    external web server that computes the Fibonacci value fib(n) very
    inefficiently: HTTPClient("http://127.0.0.1:12345/fib/32"). To run the web
    server, execute 'python fib_web.py' in a second shell.

    To emulate a long-running CPU-bound calculation we re-compute fib(n) with
    the same value of n: Map(compute_fib).

    Clearly this is a useless example, but it is easy to note that:
    1. If we run this pypeline synchronously, the total time is approximately
       the sum of the durations of the two operations.
    2. If we run this pypeline asynchronously, using asymmetric parallelism
       with one thread, the time is almost half of the synchronous case.

    This means that while the HTTP client is waiting for the result (the web
    server takes a while to compute fib(n)), the Python interpreter releases
    the GIL and the thread running compute_fib starts to execute."""
    # Execute: python fib_web.py
    print("Run Pypelines synchronously...")
    workflow = Repeat(lambda x: x > 10) | HTTPClient(
        "http://127.0.0.1:12345/fib/32") | Map(compute_fib) | StdOut()
    t1 = time.time()
    workflow.run()
    t2 = time.time()
    print("Took " + str(t2 - t1) + " seconds.")

    print(
        "Now, run Pypelines asynchronously using asymmetric parallelism with one thread, should be faster..."
    )
    workflow = Repeat(lambda x: x > 10) | HTTPClient(
        "http://127.0.0.1:12345/fib/32") | SpawnThread() | Map(
            compute_fib) | StdOut()
    t1 = time.time()
    workflow.run()
    t2 = time.time()
    print("Took " + str(t2 - t1) + " seconds.")
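# Hypothetical sketch of the deliberately inefficient, CPU-bound Fibonacci this
# example relies on (the real compute_fib and the fib_web.py server live
# elsewhere in the repository); the exponential recursion is what keeps one
# core busy for several seconds per call.
def naive_fib(n):
    if n < 2:
        return n
    return naive_fib(n - 1) + naive_fib(n - 2)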
def test_producer_map_consume_with_3_process(self):
    data = [1, 2, 3, 4, 5]
    workflow = Iterable(data) | SpawnThread() | Map(add100) | SpawnThread() | StoreAndPickle()
    workflow.run()
    # workflow refers to the StoreAndPickle() instance, the only leaf of the DAG
    actual = workflow.load()
    self.assertEqual(actual, [d + 100 for d in data])
def test_lambda_with_func_import(self):
    data = [1, 2, 3, 4, 5]
    workflow = Iterable(data) | SpawnProcess() | Map(lambda x: add100(x)) | StoreAndPickle()
    workflow.run()
    # workflow refers to the StoreAndPickle() instance, the only leaf of the DAG
    actual = workflow.load()
    self.assertEqual(actual, [d + 100 for d in data])
def test_query_search_for_leaf_on_two_branches(self):
    workflow = Iterable(range(10))
    branch1 = workflow | Map(lambda x: x + 1) | StdOut()
    branch2 = workflow | Filter(lambda x: x > 5) | Assert(self, [6, 7, 8, 9])
    self.assertEqual(workflow.query("Iterable/Map/StdOut").name(), "StdOut")
    self.assertEqual(workflow.query("Iterable/Filter/Assert").name(), "Assert")
def test_producer_consume_10_processes(self):
    data = range(20)
    parallel = Map(add100)
    workflow = Iterable(data) | Parallelize(
        ten_split) | parallel | Join() | StoreAndPickle()
    workflow.run()
    # workflow refers to the StoreAndPickle() instance, the only leaf of the DAG
    actual = workflow.load()
    # need to sort the result because with symmetric parallelism order is not guaranteed
    self.assertEqual(sorted(actual), [d + 100 for d in data])
def test_producer_map_consume_with_2_process(self):
    data = [1, 2, 3, 4, 5]
    # CAUTION!
    # Lambdas (e.g. Map(lambda x: x + 100)) are problematic with the standard
    # pickle module used by the multiprocessing lib. A possible solution is to
    # hook the import of pickle in the multiprocessing lib and substitute it
    # with dill.
    # See:
    # http://chimera.labs.oreilly.com/books/1230000000393/ch10.html#_solution_180
    workflow = Iterable(data) | SpawnProcess() | Map(lambda x: x + 100) | StoreAndPickle()
    workflow.run()
    # workflow refers to the StoreAndPickle() instance, the only leaf of the DAG
    actual = workflow.load()
    self.assertEqual(actual, [d + 100 for d in data])
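# A minimal sketch (not part of the test above) of why lambdas are a problem
# for the standard pickle module used by multiprocessing, and why dill is the
# usual workaround: pickle serializes a function by its qualified name, which
# a lambda does not have, while dill serializes the code object itself.
# Assumes dill is installed (pip install dill).
import pickle
import dill

double = lambda x: x * 2

try:
    pickle.dumps(double)
except (pickle.PicklingError, AttributeError) as exc:
    print("pickle cannot serialize the lambda:", exc)

restored = dill.loads(dill.dumps(double))  # dill round-trips the lambda
assert restored(21) == 42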
def test_dinasty_third_level(self):
    workflow = Iterable(range(1000)) | Map(lambda x: x + 1) | StdOut()
    self.assertEqual(workflow.dinasty(), "Iterable/Map/StdOut")
from pypelines import Map, Filter, Sum, StdOut
from pypelines.io import HTTPClient

workflow = HTTPClient(
    'http://www.gutenberg.org/cache/epub/1232/pg1232.txt',
    readlines=True) | Filter(lambda line: line != "") | Map(
        lambda line: line.split(' ')) | Map(
            lambda words: len(words)) | Sum() | StdOut()

workflow.run()
from pypelines import Map, Filter, Sum, StdOut, CountByKey, FlatMap, Sort, Head
from pypelines.io import HTTPClient, TextFile

workflow = HTTPClient(
    'http://www.gutenberg.org/cache/epub/1232/pg1232.txt',
    readlines=True) | Filter(lambda line: line != "")
savefile = workflow | TextFile("macchiavelli.txt")
wordcount = workflow | Map(lambda line: line.split(' ')) | Map(
    lambda words: len(words)) | Sum() | StdOut()
histogram = workflow | FlatMap(lambda line: line.split(' ')) | Filter(
    lambda word: word != "") | Map(lambda word: (word, 1)) | CountByKey() | Sort(
        key_func=lambda data: data[1], reverse=True) | Head(10) | StdOut()

workflow.run()
def test_leafs_3_nodes_dag(self):
    workflow = Iterable(range(10)) | Map(lambda x: x + 1) | StdOut()
    self.assertEqual([n.name() for n in workflow.leafs()], ["StdOut"])
def test_leafs_2_branches_balanced(self):
    workflow = Iterable(range(1000))
    branch1 = workflow | Map(lambda x: x + 1)
    branch2 = workflow | Filter(lambda x: x > 500)
    self.assertEqual([n.name() for n in workflow.leafs()], ["Map", "Filter"])
def test_depth_second_level_is_1(self):
    workflow = Iterable(range(1000)) | Map(lambda x: x + 1)
    self.assertEqual(workflow.depth(), 1)
def test_query_search_for_leaf_from_second_level(self):
    workflow = Iterable(range(10))
    mapper = workflow | Map(lambda x: x + 1)
    stdout = mapper | StdOut()
    self.assertEqual(mapper.query("Map/StdOut").name(), "StdOut")
def test_query_search_for_leaf(self):
    workflow = Iterable(range(1000))
    branch1 = workflow | Map(lambda x: x + 1) | StdOut()
    self.assertEqual(workflow.query("Iterable/Map/StdOut").name(), "StdOut")
def test_dinasty_third_level_2_two_branches(self):
    workflow = Iterable(range(1000))
    branch1 = workflow | Map(lambda x: x + 1) | StdOut()
    branch2 = workflow | Filter(lambda x: x > 500) | StdOut()
    self.assertEqual(branch1.dinasty(), "Iterable/Map/StdOut")
    self.assertEqual(branch2.dinasty(), "Iterable/Filter/StdOut")
def test_dinasty_compose_dag(self):
    sub_workflow = Map(lambda x: x + 1) | Map(lambda x: x + 1) | Map(
        lambda x: x + 1)
    workflow = Iterable(range(1000)) | sub_workflow | StdOut()
    self.assertEqual(workflow.dinasty(), "Iterable/Map/Map/Map/StdOut")
def test_depth_third_level_is_2(self):
    workflow = Iterable(range(1000)) | Map(lambda x: x + 1) | StdOut()
    self.assertEqual(workflow.depth(), 2)
def test_depth_third_level_2_two_branches(self):
    workflow = Iterable(range(1000))
    branch1 = workflow | Map(lambda x: x + 1) | StdOut()
    branch2 = workflow | Map(lambda x: x + 1) | StdOut()
    self.assertEqual(branch1.depth(), 2)
    self.assertEqual(branch2.depth(), 2)