def run(self,
         job_name,
         mapper_spec,
         reducer_spec,
         input_reader_spec,
         output_writer_spec=None,
         mapper_params=None,
         reducer_params=None,
         shards=None):
   map_pipeline = yield MapPipeline(job_name,
                                    mapper_spec,
                                    input_reader_spec,
                                    params=mapper_params,
                                    shards=shards)
   shuffler_pipeline = yield ShufflePipeline(job_name, map_pipeline)
   reducer_pipeline = yield ReducePipeline(job_name,
                                           reducer_spec,
                                           output_writer_spec,
                                           reducer_params,
                                           shuffler_pipeline)
   with pipeline.After(reducer_pipeline):
     all_temp_files = yield pipeline_common.Extend(
         map_pipeline, shuffler_pipeline)
     yield mapper_pipeline._CleanupPipeline(all_temp_files)
   yield pipeline_common.Return(reducer_pipeline)
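These snippets are all generator-style run() methods from the Google App Engine mapreduce / pipeline libraries. For orientation, a pipeline such as the MapreducePipeline above is normally kicked off with start() and its result read back from its default output slot once it has completed, the same pattern the test in Example #8 uses for MapperPipeline. A minimal sketch, assuming the class shown is mapreduce.mapreduce_pipeline.MapreducePipeline; the handler and entity names below are placeholders, not taken from these examples:

from mapreduce import mapreduce_pipeline

# Hypothetical wiring: example_map, example_reduce and ExampleEntity are
# placeholders for this sketch only.
p = mapreduce_pipeline.MapreducePipeline(
    "example_job",
    mapper_spec="main.example_map",
    reducer_spec="main.example_reduce",
    input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
    mapper_params={"entity_kind": "main.ExampleEntity"},
    shards=8)
p.start()

# Later (for example in a status handler), rehydrate the pipeline by id and
# read the reducer's output file list, as the test in Example #8 does.
finished = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
if finished.has_finalized:
  output_files = finished.outputs.default.value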
Example #2
 def run(self, job_name, filenames):
     hashed_files = yield _HashPipeline(job_name, filenames)
     sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
     merged_files = yield _MergePipeline(job_name, sorted_files)
     with pipeline.After(merged_files):
         all_temp_files = yield pipeline_common.Extend(
             hashed_files, sorted_files)
         yield mapper_pipeline._CleanupPipeline(all_temp_files)
     yield pipeline_common.Return(merged_files)
Example #3
 def run(self, job_name, filenames):
   hashed_files = yield _HashPipeline(job_name, filenames)
   sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
   merged_files = yield _MergePipeline(job_name, sorted_files)
   with pipeline.After(merged_files):
     all_temp_files = yield pipeline_common.Extend(
         hashed_files, sorted_files)
     yield mapper_pipeline._CleanupPipeline(all_temp_files)
   yield pipeline_common.Return(merged_files)
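Examples #2 and #3 hand every intermediate filename (possibly as a list of lists, which the test in Example #8 exercises directly) to mapper_pipeline._CleanupPipeline once the merged output exists. That class is not shown in this listing; a minimal sketch of what such a cleanup child pipeline could look like, assuming the deprecated google.appengine.api.files API these examples use, might be:

from google.appengine.api import files
from mapreduce import base_handler


class CleanupSketchPipeline(base_handler.PipelineBase):
  """Illustrative sketch only, not the library's actual _CleanupPipeline.

  Accepts a filename, a list of filenames, or arbitrarily nested lists, and
  deletes each file best-effort. After it runs, files.open on any of the
  deleted names raises files.Error, which is what the test in Example #8
  asserts.
  """

  def delete_file_or_list(self, filename_or_list):
    if isinstance(filename_or_list, list):
      for item in filename_or_list:
        self.delete_file_or_list(item)
    else:
      try:
        files.delete(filename_or_list)
      except files.Error:
        pass  # best-effort cleanup; ignore files that are already gone

  def run(self, temp_files):
    self.delete_file_or_list(temp_files)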
Example #4
  def run(self, job_name, shuffler_params, filenames, shards=None):
    if files.shuffler.available():
      yield _ShuffleServicePipeline(job_name, filenames)
    else:
      hashed_files = yield _HashGSPipeline(job_name, filenames, shards=shards)
      sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
      temp_files = [hashed_files, sorted_files]

      merged_files = yield _MergeGSPipeline(job_name, sorted_files)

      with pipeline.After(merged_files):
        all_temp_files = yield pipeline_common.Extend(*temp_files)
        yield mapper_pipeline._CleanupPipeline(all_temp_files)

      yield pipeline_common.Return(merged_files)
Example #5
  def run(self, job_name, filenames, shards=None):
    if files.shuffler.available():
      yield _ShuffleServicePipeline(job_name, filenames)
    else:
      hashed_files = yield _HashPipeline(job_name, filenames, shards=shards)
      sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
      temp_files = [hashed_files, sorted_files]

      merged_files = yield _MergePipeline(job_name, sorted_files)

      with pipeline.After(merged_files):
        all_temp_files = yield pipeline_common.Extend(*temp_files)
        yield mapper_pipeline._CleanupPipeline(all_temp_files)

      yield pipeline_common.Return(merged_files)
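Both branches above defer cleanup with pipeline.After, so the temporary hashed and sorted files are only deleted once the merged output actually exists. A stripped-down illustration of that ordering primitive, with hypothetical child pipelines (the pipeline import path varies between releases of the library):

import logging

from mapreduce import base_handler
from mapreduce.lib import pipeline  # some releases ship a top-level "pipeline" package instead


class LogStep(base_handler.PipelineBase):
  """Hypothetical child pipeline that only logs a message."""

  def run(self, message):
    logging.info(message)


class OrderingExample(base_handler.PipelineBase):

  def run(self):
    first = yield LogStep("runs first")
    # Anything yielded inside this block is scheduled only after `first`
    # completes, the same way the shuffle examples hold back
    # _CleanupPipeline until the merge step has finished.
    with pipeline.After(first):
      yield LogStep("runs second")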
Example #6
 def run(self,
         job_name,
         params,
         parser_params,
         shards=8):
   extract_domain_files = yield _ExactDomainMapreducePipeline(job_name,
       params=params,
       shard_count=shards)
   robots_files = yield _RobotsFetchPipeline(job_name, extract_domain_files, shards)
   fetch_set_buffer_files = yield _FetchSetsBufferPipeline(job_name, robots_files)
   fetch_files = yield _FetchPagePipeline(job_name, fetch_set_buffer_files, shards)
   outlinks_files = yield _ExtractOutlinksPipeline(job_name, fetch_files, parser_params, shards)
   results_files = yield _FetchContentPipeline(job_name, outlinks_files, shards)
   temp_files = [extract_domain_files, robots_files, fetch_set_buffer_files, fetch_files]
   with pipeline.After(results_files):
     all_temp_files = yield pipeline_common.Extend(*temp_files)
     yield mapper_pipeline._CleanupPipeline(all_temp_files)
Example #7
  def run(self, job_name, filenames, shards=None, combine_spec=None):
    hashed_files = yield _HashPipeline(job_name, filenames, shards=shards)
    sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
    temp_files = [hashed_files, sorted_files]

    if combine_spec:
      sorted_files = yield _CombinePipeline(
          job_name, sorted_files, combine_spec)
      temp_files.append(sorted_files)

    merged_files = yield _MergePipeline(job_name, sorted_files)

    with pipeline.After(merged_files):
      all_temp_files = yield pipeline_common.Extend(*temp_files)
      yield mapper_pipeline._CleanupPipeline(all_temp_files)

    yield pipeline_common.Return(merged_files)
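Example #7 optionally inserts a combine stage between the sort and merge steps. The exact format of combine_spec is not visible in these snippets; elsewhere in the mapreduce library a combiner is conventionally a generator that receives the key, the newly read values, and any previously combined values, so under that assumption a combiner handler might look like:

def sum_combiner(key, new_values, previously_combined_values):
  """Hypothetical combiner: collapses all values seen for a key into one sum.

  The (key, new_values, previously_combined_values) signature is the
  convention used by the mapreduce library's combiner support; whether
  _CombinePipeline above expects exactly this shape is not shown here.
  """
  total = sum(int(value) for value in previously_combined_values)
  total += sum(int(value) for value in new_values)
  yield total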
Example #8
    def testCleanup_ListOfLists(self):
        """Tests cleaning up a list of file lists."""
        # Prepare test data
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        # Run map
        p = mapper_pipeline.MapperPipeline(
            "test",
            handler_spec=__name__ + ".test_map",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ +
            ".KeyValueBlobstoreOutputWriter",
            params={
                "input_reader": {
                    "entity_kind": __name__ + ".TestEntity",
                },
            },
        )
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        finished_map = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)

        # Can open files
        file_list = finished_map.outputs.default.value
        self.assertTrue(len(file_list) > 0)
        for name in file_list:
            files.open(name, "r").read(0)

        grouped_list = [file_list]

        # Cleanup
        cleanup = mapper_pipeline._CleanupPipeline(grouped_list)
        cleanup.start()
        test_support.execute_until_empty(self.taskqueue)

        # Cannot open files
        for name in file_list:
            self.assertRaises(files.Error, files.open, name, "r")
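The test above assumes a TestEntity model and a test_map handler defined at module level, neither of which appears in this listing. A minimal sketch of what they could look like, given that the job writes through KeyValueBlobstoreOutputWriter (which expects the mapper to yield key/value pairs):

from google.appengine.ext import db


class TestEntity(db.Model):
  """Hypothetical sketch of the entity kind the test maps over."""
  data = db.TextProperty()


def test_map(entity):
  """Hypothetical map handler: emits one (key, value) pair per entity."""
  yield (entity.data, "")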
Example #9
  def testCleanup_ListOfLists(self):
    """Tests cleaning up a list of file lists."""
    # Prepare test data
    entity_count = 200

    for i in range(entity_count):
      TestEntity(data=str(i)).put()
      TestEntity(data=str(i)).put()

    # Run map
    p = mapper_pipeline.MapperPipeline(
        "test",
        handler_spec=__name__ + ".test_map",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=
            output_writers.__name__ + ".KeyValueBlobstoreOutputWriter",
        params={
            "input_reader": {
                "entity_kind": __name__ + ".TestEntity",
                },
            },
        )
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    finished_map = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)

    # Can open files
    file_list = finished_map.outputs.default.value
    self.assertTrue(len(file_list) > 0)
    for name in file_list:
      files.open(name, "r").read(0)

    grouped_list = [file_list]

    # Cleanup
    cleanup = mapper_pipeline._CleanupPipeline(grouped_list)
    cleanup.start()
    test_support.execute_until_empty(self.taskqueue)

    # Cannot open files
    for name in file_list:
      self.assertRaises(files.Error, files.open, name, "r")
Example #10
 def run(self, job_name, params, parser_params, shards=8):
     extract_domain_files = yield _ExactDomainMapreducePipeline(
         job_name, params=params, shard_count=shards)
     robots_files = yield _RobotsFetchPipeline(job_name,
                                               extract_domain_files, shards)
     fetch_set_buffer_files = yield _FetchSetsBufferPipeline(
         job_name, robots_files)
     fetch_files = yield _FetchPagePipeline(job_name,
                                            fetch_set_buffer_files, shards)
     outlinks_files = yield _ExtractOutlinksPipeline(
         job_name, fetch_files, parser_params, shards)
     results_files = yield _FetchContentPipeline(job_name, outlinks_files,
                                                 shards)
     temp_files = [
         extract_domain_files, robots_files, fetch_set_buffer_files,
         fetch_files
     ]
     with pipeline.After(results_files):
         all_temp_files = yield pipeline_common.Extend(*temp_files)
         yield mapper_pipeline._CleanupPipeline(all_temp_files)