def run(self, job_name, reducer_spec, output_writer_spec, params,
        bucket_name, filenames, combiner_spec=None, shards=None):
  """Runs the reduce stage over shuffled files in the given GCS bucket."""
  filenames_only = (
      util.strip_prefix_from_items("/%s/" % bucket_name, filenames))
  new_params = dict(params or {})
  new_params.update({
      "input_reader": {
          "bucket_name": bucket_name,
          "objects": filenames_only,
      }
  })
  if combiner_spec:
    new_params.update({
        "combiner_spec": combiner_spec,
    })
  if shards is None:
    shards = len(filenames)
  yield mapper_pipeline.MapperPipeline(
      job_name + "-reduce",
      reducer_spec,
      __name__ + "._ReducerReader",
      output_writer_spec,
      new_params,
      shards=shards)
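# Illustrative only: with bucket_name="my-bucket" and
# filenames=["/my-bucket/shuffle-out-0", "/my-bucket/shuffle-out-1"]
# (hypothetical values), the reduce stage above strips the bucket prefix and
# starts MapperPipeline with roughly these params:
#
#   {
#       "input_reader": {
#           "bucket_name": "my-bucket",
#           "objects": ["shuffle-out-0", "shuffle-out-1"],
#       },
#       # plus "combiner_spec": ... when a combiner was supplied
#   }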
def run(self, job_name, bucket_name, filenames):
  """Sorts each hashed GCS chunk, then collects and cleans up outputs."""
  sort_mappers = []
  for i in range(len(filenames)):
    filenames_only = util.strip_prefix_from_items("/%s/" % bucket_name,
                                                  filenames[i])
    sort_mapper = yield mapper_pipeline.MapperPipeline(
        "%s-shuffle-sort-%s" % (job_name, str(i)),
        __name__ + "._sort_records_map",
        __name__ + "._BatchGCSRecordsReader",
        None,
        {
            "input_reader": {
                "bucket_name": bucket_name,
                "objects": filenames_only,
            },
        },
        shards=1)
    sort_mappers.append(sort_mapper)
  with pipeline.After(*sort_mappers):
    job_ids = yield pipeline_common.Append(
        *[mapper.job_id for mapper in sort_mappers])
    result = yield _CollectOutputFiles(job_ids)
    with pipeline.After(result):
      yield _CleanupOutputFiles(job_ids)
    yield pipeline_common.Return(result)
def run(self, job_name, reducer_spec, output_writer_spec, params, filenames):
  """Runs the reduce stage over merged key/values files."""
  new_params = dict(params or {})
  new_params.update({"files": filenames})
  yield mapper_pipeline.MapperPipeline(
      job_name + "-reduce",
      reducer_spec,
      __name__ + ".KeyValuesReader",
      output_writer_spec,
      new_params)
def run(self, job_name, filenames):
  """Hash-partitions the input record files, one mapper shard per file."""
  yield mapper_pipeline.MapperPipeline(
      job_name + "-shuffle-hash",
      __name__ + "._hashing_map",
      input_readers.__name__ + ".RecordsReader",
      output_writer_spec=__name__ + "._HashingBlobstoreOutputWriter",
      params={'files': filenames},
      shards=len(filenames))
def run(self, job_name, filenames):
  """Merges the sorted files into key/values records in blobstore."""
  yield mapper_pipeline.MapperPipeline(
      job_name + "-shuffle-merge",
      __name__ + "._merge_map",
      __name__ + "._MergingReader",
      output_writer_spec=(
          output_writers.__name__ + ".BlobstoreRecordsOutputWriter"),
      params={'files': filenames},
      shards=len(filenames))
def run(self, job_name, filenames):
  """Merges the sorted files, capping the values batched per key."""
  yield mapper_pipeline.MapperPipeline(
      job_name + "-shuffle-merge",
      __name__ + "._merge_map",
      __name__ + "._MergingReader",
      output_writer_spec=(
          output_writers.__name__ + ".BlobstoreRecordsOutputWriter"),
      params={
          _MergingReader.FILES_PARAM: filenames,
          _MergingReader.MAX_VALUES_COUNT_PARAM: self._MAX_VALUES_COUNT,
          _MergingReader.MAX_VALUES_SIZE_PARAM: self._MAX_VALUES_SIZE,
      },
      shards=len(filenames))
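# Note (an assumption about the enclosing class, which is not shown in these
# snippets): _MAX_VALUES_COUNT and _MAX_VALUES_SIZE are class attributes that
# bound how many values, and how many total bytes of values, _MergingReader
# batches together for a single key before passing them to the map function.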
def run(self, filenames):
  """Sorts the given record files with a single-shard sort job."""
  mapper = yield mapper_pipeline.MapperPipeline(
      "sort",
      __name__ + "._sort_records",
      __name__ + "._BatchRecordsReader",
      None,
      {
          "files": filenames,
          "processing_rate": 1000000,
      },
      shards=1)
  with pipeline.After(mapper):
    yield _CollectOutputFiles(mapper.job_id)
def run(self, job_name, bucket_name, filenames, shards=None):
  """Hash-partitions GCS record files across the requested shard count."""
  if shards is None:
    shards = len(filenames)
  yield mapper_pipeline.MapperPipeline(
      job_name + "-shuffle-hash",
      __name__ + "._hashing_map",
      input_readers.__name__ + "._GoogleCloudStorageRecordInputReader",
      output_writer_spec=__name__ + "._HashingBlobstoreOutputWriter",
      params={
          "input_reader": {
              "bucket_name": bucket_name,
              "objects": filenames,
          },
      },
      shards=shards)
def run(self, job_name, bucket_name, filenames):
  """Merges sorted GCS files into key/values records in the same bucket."""
  yield mapper_pipeline.MapperPipeline(
      job_name + "-shuffle-merge",
      __name__ + "._merge_map",
      __name__ + "._MergingReader",
      output_writer_spec=(
          output_writers.__name__ + "._GoogleCloudStorageRecordOutputWriter"),
      params={
          _MergingReader.FILES_PARAM: filenames,
          _MergingReader.MAX_VALUES_COUNT_PARAM: self._MAX_VALUES_COUNT,
          _MergingReader.MAX_VALUES_SIZE_PARAM: self._MAX_VALUES_SIZE,
          "output_writer": {
              "bucket_name": bucket_name,
          },
      },
      shards=len(filenames))
def run(self, job_name, reducer_spec, output_writer_spec, params, filenames,
        combiner_spec=None):
  """Runs the reduce stage, with an optional combiner, over the files."""
  new_params = dict(params or {})
  new_params.update({"files": filenames})
  if combiner_spec:
    new_params.update({
        "combiner_spec": combiner_spec,
    })
  yield mapper_pipeline.MapperPipeline(
      job_name + "-reduce",
      reducer_spec,
      __name__ + "._ReducerReader",
      output_writer_spec,
      new_params)
def run(self, job_name, bucket_name, filenames, shards=None):
  """Hash-partitions GCS record files, writing hashed output back to GCS."""
  filenames_only = (
      util.strip_prefix_from_items("/%s/" % bucket_name, filenames))
  if shards is None:
    shards = len(filenames)
  yield mapper_pipeline.MapperPipeline(
      job_name + "-shuffle-hash",
      __name__ + "._hashing_map",
      input_readers.__name__ + "._GoogleCloudStorageRecordInputReader",
      output_writer_spec=__name__ + "._HashingGCSOutputWriter",
      params={
          "input_reader": {
              "bucket_name": bucket_name,
              "objects": filenames_only,
          },
          "output_writer": {
              "bucket_name": bucket_name,
          },
      },
      shards=shards)
def run(self, job_name, filenames):
  """Sorts each hashed file with its own single-shard sort job."""
  sort_mappers = []
  for i in range(len(filenames)):
    filename = filenames[i]
    sort_mapper = yield mapper_pipeline.MapperPipeline(
        "%s-shuffle-sort-%s" % (job_name, str(i)),
        __name__ + "._sort_records_map",
        __name__ + "._BatchRecordsReader",
        None,
        {
            "files": [filename],
            "processing_rate": 1000000,
        },
        shards=1)
    sort_mappers.append(sort_mapper)
  with pipeline.After(*sort_mappers):
    job_ids = yield pipeline_common.Append(
        *[mapper.job_id for mapper in sort_mappers])
    result = yield _CollectOutputFiles(job_ids)
    with pipeline.After(result):
      yield _CleanupOutputFiles(job_ids)
    yield pipeline_common.Return(result)
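# A minimal sketch (not part of the snippets above) of how these stages are
# typically chained by a coordinating pipeline: hash the raw records, sort
# each hashed chunk, then merge the sorted chunks for the reducer. The class
# names _HashPipeline, _SortChunksPipeline, and _MergePipeline are assumptions
# standing in for whichever classes define the run() methods shown above, and
# pipeline_base.PipelineBase is assumed to be the library's generator-pipeline
# base class; no imports are shown because the surrounding module is assumed
# to provide them.
class _ShuffleSketchPipeline(pipeline_base.PipelineBase):

  def run(self, job_name, bucket_name, filenames):
    # Each yield starts a child pipeline; later stages consume the
    # filenames returned by earlier ones.
    hashed = yield _HashPipeline(job_name, bucket_name, filenames)
    sorted_files = yield _SortChunksPipeline(job_name, bucket_name, hashed)
    yield _MergePipeline(job_name, bucket_name, sorted_files)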