def test_repr(self):
    # repr() must report how many pipelines were submitted and finished.
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    identifiers = [uuid4().hex for _ in range(10)]
    identifiers_snapshot = identifiers[:]
    # Stub the API layer: requests are swallowed, each reply hands back
    # one of the pre-generated pipeline ids (popped from the tail).
    manager.send_api_request = lambda request: None
    manager.get_api_reply = lambda: {'pipeline id': identifiers.pop()}
    submitted = [Pipeline({Job('A', data={'index': number}): Job('B')})
                 for number in range(10)]
    for pipeline in submitted:
        manager.start(pipeline)
    self.assertEqual(repr(manager),
                     '<PipelineManager: 10 submitted, 0 finished>')
    # Fake three "pipeline finished" broadcast messages; the final False
    # in poll_results stops the update loop.
    messages = ['pipeline finished: id={}, duration=0.1'.format(identifier)
                for identifier in identifiers_snapshot[:3]]
    poll_results = [False, True, True, True]
    manager.broadcast_poll = lambda timeout: poll_results.pop()
    manager.broadcast_receive = lambda: messages.pop()
    manager.update(0.1)
    self.assertEqual(repr(manager),
                     '<PipelineManager: 10 submitted, 3 finished>')
def send_pipeline_and_wait_finished():
    """Submit ten 3-stage pipelines, block until all finish, report timings."""
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    submitted = []
    for number in range(10):
        pipeline = Pipeline(
            {Job(u'worker_1'): Job(u'worker_2'),
             Job(u'worker_2'): Job(u'worker_3')},
            data={'index': number})
        manager.start(pipeline)
        submitted.append(pipeline)
    assert manager.started_pipelines == 10
    assert manager.finished_pipelines == 0
    start = time()
    manager.finished(submitted[0])  # only for testing this method
    while manager.finished_pipelines < manager.started_pipelines:
        manager.update(0.5)
    end = time()
    manager.disconnect()
    # NOTE(review): `pipeline` is the last one created by the loop above;
    # its duration is what gets reported — presumably intentional, confirm.
    return {'duration': pipeline.duration,
            'real_duration': end - start,
            'finished_pipelines': manager.finished_pipelines,
            'started_pipelines': manager.started_pipelines}
def test_repr(self):
    """Ensure repr() tracks submitted vs. finished pipeline counts."""
    pm = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    generated_ids = [uuid4().hex for _ in range(10)]
    saved_ids = list(generated_ids)
    # Replace the real API round-trip with stubs; every reply pops one
    # id off the end of generated_ids.
    pm.send_api_request = lambda request: None
    pm.get_api_reply = lambda: {'pipeline id': generated_ids.pop()}
    for position in range(10):
        pm.start(Pipeline({Job('A', data={'index': position}): Job('B')}))
    self.assertEqual(repr(pm),
                     '<PipelineManager: 10 submitted, 0 finished>')
    # Broadcast stubs: three finish notifications, then poll reports
    # nothing pending (False) so update() returns.
    pending = ['pipeline finished: id={}, duration=0.1'.format(one_id)
               for one_id in saved_ids[:3]]
    poll_answers = [False, True, True, True]

    def fake_poll(timeout):
        return poll_answers.pop()

    def fake_receive():
        return pending.pop()

    pm.broadcast_poll = fake_poll
    pm.broadcast_receive = fake_receive
    pm.update(0.1)
    self.assertEqual(repr(pm),
                     '<PipelineManager: 10 submitted, 3 finished>')
def main(): pipeline_definition = {Job('Downloader'): (Job('GetTextAndWords'), Job('GetLinks'))} urls = ['http://www.fsf.org', 'https://creativecommons.org', 'http://emap.fgv.br', 'https://twitter.com/turicas', 'http://www.pypln.org', 'http://www.zeromq.org', 'http://www.python.org', 'http://www.mongodb.org', 'http://github.com', 'http://pt.wikipedia.org'] pipeline_manager = PipelineManager(api='tcp://127.0.0.1:5555', broadcast='tcp://127.0.0.1:5556') print 'Sending pipelines...' start_time = time() my_pipelines = [] for index, url in enumerate(urls): filename = '/tmp/{}.dat'.format(index) data = json.dumps({'url': url}) with open(filename, 'w') as fp: fp.write(data) pipeline = Pipeline(pipeline_definition, data={'filename': filename}) pipeline_manager.start(pipeline) print ' Sent pipeline for url={}'.format(url) print print 'Waiting for pipelines to finish...' total_pipelines = pipeline_manager.started_pipelines finished_pipelines = 0 while finished_pipelines < total_pipelines: pipeline_manager.update(0.5) finished_pipelines = pipeline_manager.finished_pipelines percentual = 100 * (float(finished_pipelines) / total_pipelines) sys.stdout.write('\rFinished pipelines: {}/{} ({:5.2f}%)'\ .format(finished_pipelines, total_pipelines, percentual)) sys.stdout.flush() end_time = time() print '\rAll pipelines finished in {} seconds'.format(end_time - start_time) durations = [pipeline.duration for pipeline in pipeline_manager.pipelines] average_duration = sum(durations) / len(durations) print 'Average pipeline duration (seconds) = {} (min={}, max={})'\ .format(average_duration, min(durations), max(durations)) print print 'Some data saved by store:' for index, url in enumerate(urls): filename = '/tmp/{}.dat'.format(index) with open(filename) as fp: data = json.loads(fp.read()) print (' url={url}, download_duration={download_duration}, ' 'number_of_words={number_of_words}, ' 'number_of_links={number_of_links}'.format(**data))
def send_pipeline_and_wait_finished():
    """Run ten worker_1->worker_2->worker_3 pipelines to completion.

    Returns a dict with the last pipeline's duration, the wall-clock wait,
    and the manager's started/finished counters.
    """
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)
    started = []
    index = 0
    while index < 10:
        graph = {Job(u'worker_1'): Job(u'worker_2'),
                 Job(u'worker_2'): Job(u'worker_3')}
        pipeline = Pipeline(graph, data={'index': index})
        manager.start(pipeline)
        started.append(pipeline)
        index += 1
    assert manager.started_pipelines == 10
    assert manager.finished_pipelines == 0
    began_at = time()
    manager.finished(started[0])  # only for testing this method
    while manager.finished_pipelines < manager.started_pipelines:
        manager.update(0.5)
    ended_at = time()
    manager.disconnect()
    return {
        'duration': pipeline.duration,  # the loop's last pipeline
        'real_duration': ended_at - began_at,
        'finished_pipelines': manager.finished_pipelines,
        'started_pipelines': manager.started_pipelines,
    }