def verify_PipelineManager_exceptions():
    """Exercise PipelineManager's error paths.

    Starting the same pipeline twice and asking about a pipeline that was
    never started must both raise ValueError.  Returns a dict with both
    raise flags and the first pipeline's start timestamp.
    """
    first_pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2'),
                               Job(u'worker_2'): Job(u'worker_3')})
    second_pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2')})
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    manager.start(first_pipeline)
    raised_on_restart = False
    raised_on_unknown = False
    try:
        # Re-starting an already-started pipeline must be rejected.
        manager.start(first_pipeline)
    except ValueError:
        raised_on_restart = True
    try:
        # Querying a pipeline this manager never started must be rejected.
        manager.finished(second_pipeline)
    except ValueError:
        raised_on_unknown = True
    manager.disconnect()
    return {'raise_1': raised_on_restart,
            'raise_2': raised_on_unknown,
            'started_at': first_pipeline.started_at}
def send_pipeline_and_wait_finished():
    """Start 10 pipelines, block until the manager reports all finished.

    Returns a dict with the last pipeline's duration, the wall-clock wait
    time and the manager's started/finished counters.
    """
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    submitted = []
    for index in range(10):
        new_pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2'),
                                 Job(u'worker_2'): Job(u'worker_3')},
                                data={'index': index})
        manager.start(new_pipeline)
        submitted.append(new_pipeline)
    assert manager.started_pipelines == 10
    assert manager.finished_pipelines == 0
    start = time()
    manager.finished(submitted[0])  # only for testing this method
    while manager.finished_pipelines < manager.started_pipelines:
        manager.update(0.5)
    end = time()
    manager.disconnect()
    # `new_pipeline` still holds the last pipeline created in the loop above.
    return {'duration': new_pipeline.duration,
            'real_duration': end - start,
            'finished_pipelines': manager.finished_pipelines,
            'started_pipelines': manager.started_pipelines}
def test_repr(self):
    """repr() must report the submitted and finished pipeline counts."""
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    fake_ids = [uuid4().hex for _ in range(10)]
    fake_ids_backup = fake_ids[:]
    # Stub the network layer: every start() consumes one fake pipeline id.
    manager.send_api_request = lambda x: None
    manager.get_api_reply = \
        lambda: {'pipeline id': fake_ids.pop()}
    pipelines = [Pipeline({Job('A', data={'index': number}): Job('B')})
                 for number in range(10)]
    for pipeline in pipelines:
        manager.start(pipeline)
    result = repr(manager)
    self.assertEqual(result, '<PipelineManager: 10 submitted, 0 finished>')
    messages = [
        'pipeline finished: id={}, duration=0.1'.format(pipeline_id)
        for pipeline_id in fake_ids_backup[:3]
    ]
    # Popped from the end: three "message available" polls, then stop.
    poll_results = [False, True, True, True]

    def fake_poll(timeout):
        return poll_results.pop()

    def fake_receive():
        return messages.pop()

    manager.broadcast_poll = fake_poll
    manager.broadcast_receive = fake_receive
    manager.update(0.1)
    result = repr(manager)
    self.assertEqual(result, '<PipelineManager: 10 submitted, 3 finished>')
def send_pipeline():
    """Submit one pipeline and report its id before and after submission."""
    pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2'),
                         Job(u'worker_2'): Job(u'worker_3')})
    manager = PipelineManager(api='tcp://localhost:5550',
                              broadcast='tcp://localhost:5551')
    id_before_start = pipeline.id
    returned_id = manager.start(pipeline)
    manager.disconnect()
    # Caller compares the pre-start id, start()'s return value and the
    # pipeline's id after submission.
    return id_before_start, returned_id, pipeline.id
def send_pipeline():
    """Submit one pipeline and report its id before and after submission."""
    pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2'),
                         Job(u'worker_2'): Job(u'worker_3')})
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    id_before_start = pipeline.id
    returned_id = manager.start(pipeline)
    manager.disconnect()
    # Caller compares the pre-start id, start()'s return value and the
    # pipeline's id after submission.
    return id_before_start, returned_id, pipeline.id
def test_repr(self):
    """repr() must track submitted vs finished pipeline counts."""
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    generated_ids = [uuid4().hex for _ in range(10)]
    generated_ids_copy = generated_ids[:]
    # Replace the network calls: start() pops one fake id per pipeline.
    manager.send_api_request = lambda x: None
    manager.get_api_reply = \
        lambda: {'pipeline id': generated_ids.pop()}
    pipelines = [Pipeline({Job('A', data={'index': number}): Job('B')})
                 for number in range(10)]
    for pipeline in pipelines:
        manager.start(pipeline)
    self.assertEqual(repr(manager),
                     '<PipelineManager: 10 submitted, 0 finished>')
    messages = ['pipeline finished: id={}, duration=0.1'.format(pipeline_id)
                for pipeline_id in generated_ids_copy[:3]]
    # Consumed from the end: three polls succeed, the fourth stops update().
    poll_answers = [False, True, True, True]

    def stub_poll(timeout):
        return poll_answers.pop()

    def stub_receive():
        return messages.pop()

    manager.broadcast_poll = stub_poll
    manager.broadcast_receive = stub_receive
    manager.update(0.1)
    self.assertEqual(repr(manager),
                     '<PipelineManager: 10 submitted, 3 finished>')
def send_pipeline():
    """Submit one pipeline; return its id before start, the id returned by
    start() and its id afterwards, so the caller can compare them."""
    pipeline = Pipeline({
        Job(u'worker_1'): Job(u'worker_2'),
        Job(u'worker_2'): Job(u'worker_3'),
    })
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    original_id = pipeline.id
    assigned_id = manager.start(pipeline)
    manager.disconnect()
    return original_id, assigned_id, pipeline.id
def test_should_return_all_pipelines(self):
    """`PipelineManager.pipelines` must expose every started pipeline."""
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    # Stub the network layer so no broker is needed.
    manager.send_api_request = lambda x: None
    manager.get_api_reply = lambda: {'pipeline id': uuid4().hex}
    iterations = 10
    started = []
    for number in range(iterations):
        new_pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2'),
                                 Job(u'worker_2'): Job(u'worker_3')},
                                data={'index': number})
        manager.start(new_pipeline)
        started.append(new_pipeline)
    self.assertEqual(set(manager.pipelines), set(started))
def main(): pipeline_definition = {Job('Downloader'): (Job('GetTextAndWords'), Job('GetLinks'))} urls = ['http://www.fsf.org', 'https://creativecommons.org', 'https://github.com', 'http://emap.fgv.br', 'https://twitter.com/turicas'] pipeline_manager = PipelineManager(api='tcp://127.0.0.1:5555', broadcast='tcp://127.0.0.1:5556') print 'Sending pipelines...' my_pipelines = [] for index, url in enumerate(urls): filename = '/tmp/{}.data'.format(index) data = json.dumps({'url': url}) with open(filename, 'w') as fp: fp.write(data) pipeline = Pipeline(pipeline_definition, data={'filename': filename}) pipeline_manager.start(pipeline) my_pipelines.append(pipeline) print ' Sent pipeline for url={}'.format(url) print 'Waiting for pipelines to finish...' pipelines_finished = 0 while pipelines_finished < len(urls): counter = 0 for pipeline in my_pipelines: if pipeline_manager.finished(pipeline): counter += 1 if counter != pipelines_finished: print ' # of finished pipelines: {}'.format(counter) pipelines_finished = counter durations = [str(pipeline.duration) for pipeline in my_pipelines] print 'Pipeline durations (in seconds) = {}'.format(', '.join(durations)) for index, url in enumerate(urls): filename = '/tmp/{}.data'.format(index) with open(filename) as fp: data = json.loads(fp.read()) print (' url={url}, download_duration={download_duration}, ' 'number_of_words={number_of_words}, ' 'number_of_links={number_of_links}'.format(**data))
def main():
    """Benchmark pipeline submission throughput and memory usage.

    Submits NUMBER_OF_PIPELINES trivial Dummy1->Dummy2 pipelines, logging
    per-submission latency and process memory (VMS/RSS) to a TSV file named
    after the pypelinin version given as argv[1], then waits until all
    pipelines finish, printing progress to stdout.  Python 2 script
    (`xrange`); requires psutil.
    """
    # Bind hot-loop callables to locals to avoid repeated attribute lookups.
    stdout_write = sys.stdout.write
    stdout_flush = sys.stdout.flush
    pipeline_manager = PipelineManager(api=ROUTER_API,
                                       broadcast=ROUTER_BROADCAST)
    pipeline_definition = {Job('Dummy1'): Job('Dummy2')}
    process = psutil.Process(os.getpid())
    version = sys.argv[1]  # pypelinin version label used in the output filename
    filename = 'test-{}_pipelines-pypelinin-{}.dat'.format(
        NUMBER_OF_PIPELINES, version)
    data = open(filename, 'w')
    my_pipelines = []
    for i in xrange(NUMBER_OF_PIPELINES):
        pipeline = Pipeline(pipeline_definition, data={'index': i})
        # Time only the start() call itself.
        start_time = time()
        pipeline_manager.start(pipeline)
        end_time = time()
        my_pipelines.append(pipeline)
        # NOTE(review): get_memory_info() is the legacy psutil (<2.0) API;
        # newer psutil renamed it to memory_info() — confirm pinned version.
        memory_info = process.get_memory_info()
        # Row: pipeline count, submission latency, virtual mem, resident mem.
        info = (i + 1, end_time - start_time, memory_info.vms, memory_info.rss)
        data.write('{}\t{}\t{}\t{}\n'.format(*info))
        if (i + 1) % UPDATE_INTERVAL == 0:
            # Carriage return rewrites the same progress line in place.
            stdout_write('\r{} out of {}'.format(i + 1, NUMBER_OF_PIPELINES))
            stdout_flush()
    stdout_write('\rfinished sending pipelines! \o/\n')
    stdout_write('Waiting for pipelines to finish...\n')
    pipelines_finished = 0
    finished = pipeline_manager.finished
    while pipelines_finished < NUMBER_OF_PIPELINES:
        finished(my_pipelines[0]) # just need one call to update state of all
        counter = [pipeline.finished for pipeline in my_pipelines].count(True)
        if counter != pipelines_finished:
            stdout_write('\r # of finished pipelines: {}/{}'.format(
                counter, NUMBER_OF_PIPELINES))
            stdout_flush()
            pipelines_finished = counter
    stdout_write('\n')
    data.close()
def verify_PipelineManager_exceptions():
    """Check both ValueError paths of PipelineManager: restarting an
    already-started pipeline and querying a never-started one."""
    started_pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2'),
                                 Job(u'worker_2'): Job(u'worker_3')})
    unknown_pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2')})
    manager = PipelineManager(api='tcp://localhost:5550',
                              broadcast='tcp://localhost:5551')
    manager.start(started_pipeline)
    raise_1, raise_2 = False, False
    try:
        manager.start(started_pipeline)  # double start must fail
    except ValueError:
        raise_1 = True
    try:
        manager.finished(unknown_pipeline)  # unknown pipeline must fail
    except ValueError:
        raise_2 = True
    manager.disconnect()
    return {'raise_1': raise_1,
            'raise_2': raise_2,
            'started_at': started_pipeline.started_at}
def verify_PipelineManager_exceptions():
    """Check both ValueError paths of PipelineManager: restarting an
    already-started pipeline and querying a never-started one."""
    started_pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2'),
                                 Job(u'worker_2'): Job(u'worker_3')})
    unknown_pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2')})
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    manager.start(started_pipeline)
    raise_1, raise_2 = False, False
    try:
        manager.start(started_pipeline)  # double start must fail
    except ValueError:
        raise_1 = True
    try:
        manager.finished(unknown_pipeline)  # unknown pipeline must fail
    except ValueError:
        raise_2 = True
    manager.disconnect()
    return {'raise_1': raise_1,
            'raise_2': raise_2,
            'started_at': started_pipeline.started_at}
def send_pipeline_and_wait_finished():
    """Start 10 three-worker pipelines and wait until all are finished.

    Returns the last pipeline's duration, the measured wall-clock wait and
    the manager's started/finished counters.
    """
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    all_pipelines = []
    for number in range(10):
        current = Pipeline({Job(u'worker_1'): Job(u'worker_2'),
                            Job(u'worker_2'): Job(u'worker_3')},
                           data={'index': number})
        manager.start(current)
        all_pipelines.append(current)
    assert manager.started_pipelines == 10
    assert manager.finished_pipelines == 0
    wait_started = time()
    manager.finished(all_pipelines[0])  # only for testing this method
    while manager.finished_pipelines < manager.started_pipelines:
        manager.update(0.5)
    wait_ended = time()
    manager.disconnect()
    # `current` is the last pipeline created in the loop above.
    return {'duration': current.duration,
            'real_duration': wait_ended - wait_started,
            'finished_pipelines': manager.finished_pipelines,
            'started_pipelines': manager.started_pipelines}
def main():
    """End-to-end demo: download several URLs through a pipeline, show live
    progress, then print the results each worker stored in /tmp/<index>.dat.

    Python 2 script (uses `print` statements); needs a running router at
    tcp://127.0.0.1:5555 (API) / :5556 (broadcast).
    """
    pipeline_definition = {Job('Downloader'): (Job('GetTextAndWords'),
                                               Job('GetLinks'))}
    urls = ['http://www.fsf.org', 'https://creativecommons.org',
            'http://emap.fgv.br', 'https://twitter.com/turicas',
            'http://www.pypln.org', 'http://www.zeromq.org',
            'http://www.python.org', 'http://www.mongodb.org',
            'http://github.com', 'http://pt.wikipedia.org']
    pipeline_manager = PipelineManager(api='tcp://127.0.0.1:5555',
                                       broadcast='tcp://127.0.0.1:5556')
    print 'Sending pipelines...'
    start_time = time()
    # NOTE(review): my_pipelines is never appended to — dead variable; the
    # code below reads pipeline_manager.pipelines instead.
    my_pipelines = []
    for index, url in enumerate(urls):
        # Each pipeline gets its input as JSON in a per-index temp file.
        filename = '/tmp/{}.dat'.format(index)
        data = json.dumps({'url': url})
        with open(filename, 'w') as fp:
            fp.write(data)
        pipeline = Pipeline(pipeline_definition, data={'filename': filename})
        pipeline_manager.start(pipeline)
        print ' Sent pipeline for url={}'.format(url)
    print
    print 'Waiting for pipelines to finish...'
    total_pipelines = pipeline_manager.started_pipelines
    finished_pipelines = 0
    while finished_pipelines < total_pipelines:
        # update() pumps broadcast messages for up to 0.5s per iteration.
        pipeline_manager.update(0.5)
        finished_pipelines = pipeline_manager.finished_pipelines
        # float() keeps the percentage exact under Python 2 integer division.
        percentual = 100 * (float(finished_pipelines) / total_pipelines)
        # '\r' rewrites the same progress line in place.
        sys.stdout.write('\rFinished pipelines: {}/{} ({:5.2f}%)'\
                         .format(finished_pipelines, total_pipelines,
                                 percentual))
        sys.stdout.flush()
    end_time = time()
    print '\rAll pipelines finished in {} seconds'.format(end_time - start_time)
    durations = [pipeline.duration for pipeline in pipeline_manager.pipelines]
    average_duration = sum(durations) / len(durations)
    print 'Average pipeline duration (seconds) = {} (min={}, max={})'\
          .format(average_duration, min(durations), max(durations))
    print
    print 'Some data saved by store:'
    for index, url in enumerate(urls):
        # Workers wrote their results back into the same per-index files.
        filename = '/tmp/{}.dat'.format(index)
        with open(filename) as fp:
            data = json.loads(fp.read())
        print (' url={url}, download_duration={download_duration}, '
               'number_of_words={number_of_words}, '
               'number_of_links={number_of_links}'.format(**data))
def send_pipeline_and_wait_finished():
    """Submit one pipeline, poll until it finishes and return both the
    duration reported by the pipeline and the measured wall-clock time."""
    import time
    pipeline = Pipeline({Job(u'worker_1'): Job(u'worker_2'),
                         Job(u'worker_2'): Job(u'worker_3')})
    manager = PipelineManager(api='tcp://localhost:5550',
                              broadcast='tcp://localhost:5551')
    manager.start(pipeline)
    polling_started = time.time()
    while not manager.finished(pipeline):
        time.sleep(0.1)  # avoid a tight busy-wait between polls
    polling_ended = time.time()
    manager.disconnect()
    return {'duration': pipeline.duration,
            'real_duration': polling_ended - polling_started}
def test_should_return_all_pipelines(self):
    """Every pipeline passed to start() must appear in `.pipelines`."""
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    # Fake the broker round-trip: each start() gets a random pipeline id.
    manager.send_api_request = lambda x: None
    manager.get_api_reply = lambda: {'pipeline id': uuid4().hex}
    iterations = 10
    expected = []
    for number in range(iterations):
        started = Pipeline({
            Job(u'worker_1'): Job(u'worker_2'),
            Job(u'worker_2'): Job(u'worker_3'),
        }, data={'index': number})
        manager.start(started)
        expected.append(started)
    self.assertEqual(set(manager.pipelines), set(expected))
def create_pipeline(data):
    """Build a Pipeline from `default_pipeline` carrying `data` and submit it
    to the router configured in settings."""
    pipeline_manager = PipelineManager(settings.ROUTER_API,
                                       settings.ROUTER_BROADCAST)
    new_pipeline = Pipeline(default_pipeline, data=data)
    pipeline_manager.start(new_pipeline)