def run(self,
        job_name,
        mapper_spec,
        reducer_spec,
        input_reader_spec,
        output_writer_spec=None,
        mapper_params=None,
        reducer_params=None,
        shards=None,
        combiner_spec=None):
   """Yield the map, shuffle and reduce child pipelines in sequence.

   Each stage receives the value sent back for the previous stage's yield.
   Once the reduce stage has been yielded, the map and shuffle results are
   combined with `pipeline_common.Extend` and handed to
   `mapper_pipeline._CleanupPipeline`. The final yield wraps the reduce
   result in `pipeline_common.Return`.

   Args:
     job_name: forwarded to every stage.
     mapper_spec: forwarded to MapPipeline.
     reducer_spec: forwarded to ReducePipeline.
     input_reader_spec: forwarded to MapPipeline.
     output_writer_spec: forwarded to ReducePipeline.
     mapper_params: forwarded to MapPipeline as `params`.
     reducer_params: forwarded to ReducePipeline.
     shards: forwarded to MapPipeline as `shards`.
     combiner_spec: forwarded to ReducePipeline as `combiner_spec`.
   """
   map_output = yield MapPipeline(
       job_name,
       mapper_spec,
       input_reader_spec,
       params=mapper_params,
       shards=shards)
   shuffle_output = yield ShufflePipeline(job_name, map_output)
   reduce_output = yield ReducePipeline(
       job_name,
       reducer_spec,
       output_writer_spec,
       reducer_params,
       shuffle_output,
       combiner_spec=combiner_spec)

   # Results of the two intermediate stages, in the order they were produced.
   intermediates = [map_output, shuffle_output]

   # NOTE(review): presumably `After` delays the cleanup children until the
   # reduce stage has finished -- confirm against the pipeline library.
   with pipeline.After(reduce_output):
     to_clean = yield pipeline_common.Extend(*intermediates)
     yield mapper_pipeline._CleanupPipeline(to_clean)

   yield pipeline_common.Return(reduce_output)
示例#2
0
 def run(self,
         job_name,
         mapper_spec,
         reducer_spec,
         input_reader_spec,
         output_writer_spec=None,
         mapper_params=None,
         reducer_params=None,
         shards=None,
         combiner_spec=None):
     """Chain MapPipeline, ShufflePipeline and ReducePipeline.

     The value sent back for each yielded stage is passed to the next
     stage. After the reduce stage is yielded, the map and shuffle results
     are combined via `pipeline_common.Extend` and passed to
     `mapper_pipeline._CleanupPipeline`; the last yield is
     `pipeline_common.Return` of the reduce result.

     Args:
       job_name: forwarded to all three stages.
       mapper_spec: forwarded to MapPipeline.
       reducer_spec: forwarded to ReducePipeline.
       input_reader_spec: forwarded to MapPipeline.
       output_writer_spec: forwarded to ReducePipeline.
       mapper_params: forwarded to MapPipeline as `params`.
       reducer_params: forwarded to ReducePipeline.
       shards: forwarded to MapPipeline as `shards`.
       combiner_spec: forwarded to ReducePipeline as `combiner_spec`.
     """
     mapped = yield MapPipeline(
         job_name, mapper_spec, input_reader_spec,
         params=mapper_params, shards=shards)
     shuffled = yield ShufflePipeline(
         job_name, mapped)
     reduced = yield ReducePipeline(
         job_name, reducer_spec, output_writer_spec, reducer_params,
         shuffled, combiner_spec=combiner_spec)

     stage_results = [mapped, shuffled]

     # NOTE(review): cleanup is yielded inside `After(reduced)`; presumably
     # this orders it behind the reduce stage -- confirm with the library.
     with pipeline.After(reduced):
         leftovers = yield pipeline_common.Extend(*stage_results)
         yield mapper_pipeline._CleanupPipeline(leftovers)

     yield pipeline_common.Return(reduced)
示例#3
0
 def run(self, job_name, filenames):
   """Yield the hash, sort-chunks and merge child pipelines in sequence.

   Each stage is given the value sent back for the previous stage's yield.
   After the merge stage is yielded, the hash and sort results are combined
   with `pipeline_common.Extend` and passed to
   `mapper_pipeline._CleanupPipeline`. The final yield is
   `pipeline_common.Return` of the merge result.

   Args:
     job_name: forwarded to every stage.
     filenames: forwarded to _HashPipeline.
   """
   hashed = yield _HashPipeline(job_name, filenames)
   chunk_sorted = yield _SortChunksPipeline(job_name, hashed)
   merged = yield _MergePipeline(job_name, chunk_sorted)

   intermediates = [hashed, chunk_sorted]

   # NOTE(review): presumably `After` holds the cleanup back until the merge
   # stage has finished -- confirm against the pipeline library.
   with pipeline.After(merged):
     to_clean = yield pipeline_common.Extend(*intermediates)
     yield mapper_pipeline._CleanupPipeline(to_clean)

   yield pipeline_common.Return(merged)
示例#4
0
 def run(self, job_name, filenames):
     """Chain _HashPipeline, _SortChunksPipeline and _MergePipeline.

     The value sent back for each yielded stage feeds the next stage. Once
     the merge stage is yielded, the hash and sort results are combined via
     `pipeline_common.Extend` and given to
     `mapper_pipeline._CleanupPipeline`; the last yield is
     `pipeline_common.Return` of the merge result.

     Args:
       job_name: forwarded to all three stages.
       filenames: forwarded to _HashPipeline.
     """
     hash_result = yield _HashPipeline(job_name, filenames)
     sort_result = yield _SortChunksPipeline(job_name, hash_result)
     merge_result = yield _MergePipeline(job_name, sort_result)

     stage_results = [hash_result, sort_result]

     # NOTE(review): cleanup is yielded inside `After(merge_result)`;
     # presumably this orders it behind the merge -- confirm with the library.
     with pipeline.After(merge_result):
         leftovers = yield pipeline_common.Extend(*stage_results)
         yield mapper_pipeline._CleanupPipeline(leftovers)

     yield pipeline_common.Return(merge_result)
示例#5
0
  def run(self, job_name, filenames, shards=None):
    """Yield the shuffle service pipeline, or a hash/sort/merge fallback.

    When `files.shuffler.available()` is true, a single
    `_ShuffleServicePipeline` is yielded and nothing else. Otherwise the
    hash, sort-chunks and merge stages are yielded in sequence, the hash and
    sort results are combined with `pipeline_common.Extend` and passed to
    `mapper_pipeline._CleanupPipeline`, and the final yield is
    `pipeline_common.Return` of the merge result.

    Args:
      job_name: forwarded to every yielded stage.
      filenames: forwarded to _ShuffleServicePipeline or _HashPipeline.
      shards: forwarded to _HashPipeline as `shards`; not used when the
        shuffle service branch is taken.
    """
    if files.shuffler.available():
      yield _ShuffleServicePipeline(job_name, filenames)
      return

    hashed = yield _HashPipeline(job_name, filenames, shards=shards)
    chunk_sorted = yield _SortChunksPipeline(job_name, hashed)
    merged = yield _MergePipeline(job_name, chunk_sorted)

    # NOTE(review): presumably `After` holds the cleanup back until the merge
    # stage has finished -- confirm against the pipeline library.
    with pipeline.After(merged):
      to_clean = yield pipeline_common.Extend(hashed, chunk_sorted)
      yield mapper_pipeline._CleanupPipeline(to_clean)

    yield pipeline_common.Return(merged)
示例#6
0
  def run(self, job_name, mapper_params, filenames, shards=None):
    """Yield the hash, sort-chunks and merge stages for files in a bucket.

    The bucket name is read from `mapper_params["bucket_name"]` and
    `filenames` is passed through `_strip_bucket_name` before hashing. Each
    later stage receives the value sent back for the previous yield. After
    the merge stage is yielded, the hash and sort results are combined with
    `pipeline_common.Extend` and passed to
    `mapper_pipeline._CleanupPipeline`. The final yield is
    `pipeline_common.Return` of the merge result.

    Args:
      job_name: forwarded to every stage.
      mapper_params: mapping that must contain the key "bucket_name".
      filenames: passed to _strip_bucket_name together with the bucket name.
      shards: forwarded to _HashPipeline as `shards`.
    """
    bucket = mapper_params["bucket_name"]
    bare_names = _strip_bucket_name(bucket, filenames)

    hashed = yield _HashPipeline(
        job_name, bucket, bare_names, shards=shards)
    chunk_sorted = yield _SortChunksPipeline(job_name, hashed)
    merged = yield _MergePipeline(job_name, chunk_sorted)

    # NOTE(review): presumably `After` holds the cleanup back until the merge
    # stage has finished -- confirm against the pipeline library.
    with pipeline.After(merged):
      to_clean = yield pipeline_common.Extend(hashed, chunk_sorted)
      yield mapper_pipeline._CleanupPipeline(to_clean)

    yield pipeline_common.Return(merged)
  def run(self, job_name, filenames, shards=None):
    """Run hash/sort/merge locally unless the shuffle service is available.

    If `files.shuffler.available()` is false, the hash, sort-chunks and
    merge stages are yielded in sequence; the hash and sort results are then
    combined via `pipeline_common.Extend` and given to
    `mapper_pipeline._CleanupPipeline`, and the last yield is
    `pipeline_common.Return` of the merge result. If it is true, only a
    `_ShuffleServicePipeline` is yielded.

    Args:
      job_name: forwarded to every yielded stage.
      filenames: forwarded to _HashPipeline or _ShuffleServicePipeline.
      shards: forwarded to _HashPipeline as `shards`; the shuffle service
        branch does not use it.
    """
    if not files.shuffler.available():
      hash_result = yield _HashPipeline(job_name, filenames, shards=shards)
      sort_result = yield _SortChunksPipeline(job_name, hash_result)
      merge_result = yield _MergePipeline(job_name, sort_result)

      # NOTE(review): cleanup is yielded inside `After(merge_result)`;
      # presumably this orders it behind the merge -- confirm with the library.
      with pipeline.After(merge_result):
        leftovers = yield pipeline_common.Extend(hash_result, sort_result)
        yield mapper_pipeline._CleanupPipeline(leftovers)

      yield pipeline_common.Return(merge_result)
    else:
      yield _ShuffleServicePipeline(job_name, filenames)
示例#8
0
    def run(self, job_name, mapper_params, filenames, shards=None):
        """Chain the hash, sort-chunks and merge stages for bucket files.

        Reads the bucket name from `mapper_params["bucket_name"]`, runs
        `filenames` through `_strip_bucket_name`, then yields the three
        stages, feeding each the value sent back for the previous yield.
        Once the merge stage is yielded, the hash and sort results are
        combined via `pipeline_common.Extend` and given to
        `mapper_pipeline._CleanupPipeline`; the last yield is
        `pipeline_common.Return` of the merge result.

        Args:
          job_name: forwarded to all three stages.
          mapper_params: mapping that must contain the key "bucket_name".
          filenames: passed to _strip_bucket_name with the bucket name.
          shards: forwarded to _HashPipeline as `shards`.
        """
        bucket = mapper_params["bucket_name"]
        stripped = _strip_bucket_name(bucket, filenames)

        hash_result = yield _HashPipeline(
            job_name, bucket, stripped, shards=shards)
        sort_result = yield _SortChunksPipeline(job_name, hash_result)
        merge_result = yield _MergePipeline(job_name, sort_result)

        # NOTE(review): cleanup is yielded inside `After(merge_result)`;
        # presumably this orders it behind the merge -- confirm with the
        # pipeline library.
        with pipeline.After(merge_result):
            leftovers = yield pipeline_common.Extend(
                hash_result, sort_result)
            yield mapper_pipeline._CleanupPipeline(leftovers)

        yield pipeline_common.Return(merge_result)