def run(self, job_name, mapper_spec, reducer_spec, input_reader_spec,
        output_writer_spec=None, mapper_params=None, reducer_params=None,
        shards=None):
    """Chain the full MapReduce: map, shuffle, reduce, then temp-file cleanup.

    Args:
        job_name: human-readable name used to label each child pipeline.
        mapper_spec: fully-qualified name of the map function.
        reducer_spec: fully-qualified name of the reduce function.
        input_reader_spec: input reader for the map stage.
        output_writer_spec: optional output writer for the reduce stage.
        mapper_params: optional dict of parameters for the map stage.
        reducer_params: optional dict of parameters for the reduce stage.
        shards: optional shard count for the map stage.

    Yields child pipelines; the overall result is the reducer's output.
    """
    # Map stage, then shuffle its output, then reduce the shuffled files.
    mapped = yield MapPipeline(job_name, mapper_spec, input_reader_spec,
                               params=mapper_params, shards=shards)
    shuffled = yield ShufflePipeline(job_name, mapped)
    reduced = yield ReducePipeline(job_name, reducer_spec, output_writer_spec,
                                   reducer_params, shuffled)

    # Only after the reducer has consumed them may the intermediate
    # map/shuffle files be deleted.
    with pipeline.After(reduced):
        intermediate = yield pipeline_common.Extend(mapped, shuffled)
        yield mapper_pipeline._CleanupPipeline(intermediate)

    yield pipeline_common.Return(reduced)
def run(self, job_name, filenames):
    """Shuffle *filenames* in three stages: hash, sort chunks, merge.

    Args:
        job_name: label propagated to each child pipeline.
        filenames: input files to shuffle.

    The hashed and sorted intermediates are deleted once the merge has
    completed; the merged files are the pipeline's result.
    """
    hashed = yield _HashPipeline(job_name, filenames)
    sorted_chunks = yield _SortChunksPipeline(job_name, hashed)
    merged = yield _MergePipeline(job_name, sorted_chunks)

    # Cleanup must wait until the merge no longer needs the intermediates.
    with pipeline.After(merged):
        intermediates = yield pipeline_common.Extend(hashed, sorted_chunks)
        yield mapper_pipeline._CleanupPipeline(intermediates)

    yield pipeline_common.Return(merged)
def run(self, job):
    """Poll a BigQuery job until it leaves the PENDING/RUNNING states.

    Args:
        job: BigQuery job id to check (in project ``config.project_id``).

    Re-schedules itself after a one-second delay while the job is still
    in flight; otherwise returns the final job status dict.
    """
    jobs = service.jobs()
    status = jobs.get(projectId=config.project_id, jobId=job).execute()
    job_state = status['status']['state']
    # Membership test instead of the repeated-equality chain.
    if job_state in ('PENDING', 'RUNNING'):
        # Fixed 1s poll interval; re-check only after the delay fires.
        delay = yield pipeline.common.Delay(seconds=1)
        with pipeline.After(delay):
            yield BqCheck(job)
    else:
        yield pipeline.common.Return(status)
def run(self, job_name, filenames):
    """Sort each input file with its own single-shard mapper job.

    Args:
        job_name: base name; each sort job is suffixed with its index.
        filenames: files to sort, one mapper job per file.

    After all sort jobs finish, their output files are collected as the
    result and the per-job temp output is cleaned up.
    """
    sort_mappers = []
    # enumerate() instead of indexing via range(len(...)).
    for index, filename in enumerate(filenames):
        sort_mapper = yield mapper_pipeline.MapperPipeline(
            # %s formats the int identically to str(index).
            "%s-shuffle-sort-%s" % (job_name, index),
            __name__ + "._sort_records_map",
            __name__ + "._BatchRecordsReader",
            None,
            {
                "files": [filename],
                "processing_rate": 1000000,
            },
            shards=1)
        sort_mappers.append(sort_mapper)

    # Wait for every sort job before touching their outputs.
    with pipeline.After(*sort_mappers):
        job_ids = yield pipeline_common.Append(
            *[mapper.job_id for mapper in sort_mappers])
        result = yield _CollectOutputFiles(job_ids)
        # Cleanup only once collection has finished with the files.
        with pipeline.After(result):
            yield _CleanupOutputFiles(job_ids)
        yield pipeline_common.Return(result)
def run(self, filenames):
    """Sort *filenames* with one single-shard mapper job, then collect output.

    Args:
        filenames: files to feed to the sort mapper.
    """
    sort_job = yield mapper_pipeline.MapperPipeline(
        "sort",
        __name__ + "._sort_records",
        __name__ + "._BatchRecordsReader",
        None,
        {
            "files": filenames,
            "processing_rate": 1000000,
        },
        shards=1)
    # TODO(user): delete _OutputFile entities after collect
    with pipeline.After(sort_job):
        yield _CollectOutputFiles(sort_job.job_id)
def run(self, job_name, filenames, shards=None):
    """Shuffle *filenames*, preferring the files shuffler service when present.

    Args:
        job_name: label propagated to each child pipeline.
        filenames: input files to shuffle.
        shards: optional shard count for the hash stage.

    Falls back to the hash -> sort -> merge implementation (with temp-file
    cleanup after the merge) when the shuffler service is unavailable.
    """
    if files.shuffler.available():
        # Fast path: delegate the whole shuffle to the service.
        yield _ShuffleServicePipeline(job_name, filenames)
        return

    hashed = yield _HashPipeline(job_name, filenames, shards=shards)
    sorted_chunks = yield _SortChunksPipeline(job_name, hashed)
    intermediates = [hashed, sorted_chunks]
    merged = yield _MergePipeline(job_name, sorted_chunks)

    # Delete intermediates only after the merge has consumed them.
    with pipeline.After(merged):
        all_temp = yield pipeline_common.Extend(*intermediates)
        yield mapper_pipeline._CleanupPipeline(all_temp)

    yield pipeline_common.Return(merged)
def run(self, job_name, params, parser_params, shards=8):
    """Run the crawl stages in sequence, then clean up intermediate files.

    Stages: extract domains -> fetch robots.txt -> build fetch sets ->
    fetch pages -> extract outlinks -> fetch content.

    Args:
        job_name: label propagated to each child pipeline.
        params: parameters for the domain-extraction mapreduce.
        parser_params: parameters for the outlink parser stage.
        shards: shard count for the sharded stages (default 8).

    NOTE(review): outlinks_files is not in the cleanup list and nothing is
    yielded via pipeline_common.Return — presumably intentional (the
    outlinks/results files are the job's output), but worth confirming.
    """
    domain_files = yield _ExactDomainMapreducePipeline(
        job_name, params=params, shard_count=shards)
    robots_files = yield _RobotsFetchPipeline(job_name, domain_files, shards)
    fetch_set_files = yield _FetchSetsBufferPipeline(job_name, robots_files)
    page_files = yield _FetchPagePipeline(job_name, fetch_set_files, shards)
    outlinks_files = yield _ExtractOutlinksPipeline(
        job_name, page_files, parser_params, shards)
    results_files = yield _FetchContentPipeline(job_name, outlinks_files, shards)

    # Intermediates deleted once the final content fetch has completed.
    intermediates = [
        domain_files,
        robots_files,
        fetch_set_files,
        page_files,
    ]
    with pipeline.After(results_files):
        all_temp = yield pipeline_common.Extend(*intermediates)
        yield mapper_pipeline._CleanupPipeline(all_temp)
def run(self, job):
    """Poll a BigQuery job, reporting progress via status-page messages.

    Args:
        job: BigQuery job id to check (in project ``bqproject``).

    While the job is PENDING/RUNNING, posts a warning badge and retries
    after one second; once it settles, posts a success badge with a link
    to this pipeline and returns the final status dict.
    """
    jobs = service.jobs()
    status = jobs.get(projectId=bqproject, jobId=job).execute()
    # Hoist the nested lookup: it was evaluated four times before.
    job_state = status['status']['state']
    if job_state in ('PENDING', 'RUNNING'):
        message(
            self.root_pipeline_id,
            '<span class="label label-warning">{{ status }}</span> bq://jobs/{{ job }}',
            job=job,
            status=job_state.lower())
        # Fixed 1s poll interval; re-check only after the delay fires.
        delay = yield pipeline.common.Delay(seconds=1)
        with pipeline.After(delay):
            yield BqCheck(job)
    else:
        message(
            self.root_pipeline_id,
            '<span class="label label-success">{{ status }}</span> bq://jobs/{{ job }} <a href="{{ base_path }}/status?root={{ root_pipeline_id }}#pipeline-{{ pipeline_id }}">pipeline</a>',
            job=job,
            status=job_state.lower(),
            base_path=self.base_path,
            pipeline_id=self.pipeline_id)
        yield pipeline.common.Return(status)