def testHandlerSerialization(self):
    """Test serializable handler works with MR and shard retry."""
    entity_count = 10

    for _ in range(entity_count):
      TestEntity(int_property=-1).put()

    # Force handler to serialize on every call.
    parameters.config._SLICE_DURATION_SEC = 0

    control.start_map(
        "test_map",
        __name__ + ".SerializableHandler",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count=1,
        base_path="/mapreduce_base_path")

    task_run_counts = test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(
        task_run_counts[handlers.MapperWorkerCallbackHandler],
        # Shard retries + one per entity + one to exhaust input reader + one for
        # finalization.
        SerializableHandler.FAILURES_INDUCED_BY_INSTANCE + entity_count + 1 + 1)
    vals = [e.int_property for e in TestEntity.all()]
    vals.sort()
    # SerializableHandler updates int_property to be incremental from 0 to 9.
    self.assertEquals(range(10), vals)
  def _run_test(self, num_shards, num_files):
    bucket_name = "testing"
    object_prefix = "file-"
    job_name = "test_map"
    input_class = (input_readers.__name__ + "." +
                   input_readers._GoogleCloudStorageInputReader.__name__)

    expected_content = self.create_test_content(bucket_name,
                                                object_prefix,
                                                num_files)

    control.start_map(
        job_name,
        __name__ + "." + "_input_reader_memory_mapper",
        input_class,
        {
            "input_reader": {
                "bucket_name": bucket_name,
                "objects": [object_prefix + "*"]
            },
        },
        shard_count=num_shards)

    test_support.execute_until_empty(self.taskqueue)
    # list.sort() returns None, so compare sorted copies instead.
    self.assertEqual(sorted(expected_content), sorted(_memory_mapper_data))
def start_map(
    fbl,
    name,
    handler_spec,
    entity_kind,
    filters=None,
    handle_batch_size=None,
    output_writer_spec=None,
    output_writer=None,
    queue='slow-queue',
    extra_mapper_params=None,
    randomize_tokens=True,
):
    filters = filters or []
    output_writer = output_writer or {}
    extra_mapper_params = extra_mapper_params or {}
    mapper_params = {
        'entity_kind': entity_kind,
        'handle_batch_size': handle_batch_size,
        'filters': filters,
        'output_writer': output_writer,
    }
    mapper_params.update(get_fblookup_params(fbl, randomize_tokens=randomize_tokens))
    mapper_params.update(extra_mapper_params)
    control.start_map(
        name=name,
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec=handler_spec,
        output_writer_spec=output_writer_spec,
        shard_count=16,  # since we want to stick it in the slow-queue, and don't care how fast it executes
        queue_name=queue,
        mapper_parameters=mapper_params,
    )
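A call into this wrapper might look like the following sketch (the handler path and entity kind are placeholders, not values from the original project, and an fbl lookup object is assumed to already exist in the caller):

# Hypothetical usage of the wrapper above; the dotted paths are placeholders.
start_map(
    fbl,  # request-scoped FB lookup object, assumed to be in scope
    name='Resave All Users',
    handler_spec='users.user_tasks.map_resave_user',
    entity_kind='users.users.User',
    queue='slow-queue',
)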
Example #4
 def get(self):
     mapreduce_params = {
           'entity_kind': 'models.Topic',
     }
     
     control.start_map("DeleteOldUpdates", "mapjob.keep_thirty_updates", "mapreduce.input_readers.DatastoreInputReader", mapreduce_params, 2)
     self.response.out.write("ok")
Example #5
    def testHugeTaskUseDatastore(self):
        """Test map job with huge parameter values."""
        input_data = [str(i) for i in range(100)]

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for record in input_data:
                    w.write(record)

        control.start_map(
            "test_map",
            __name__ + ".TestHandler",
            input_readers.__name__ + ".GoogleCloudStorageRecordInputReader",
            {
                "input_reader": {
                    "bucket_name": bucket_name,
                    "objects": [test_filename],
                    # the parameter can't be compressed and wouldn't fit into
                    # taskqueue payload
                    "huge_parameter": random_string(900000)
                }
            },
            shard_count=4,
            base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(100, len(TestHandler.processed_entites))
        self.assertEquals([], model._HugeTaskPayload.all().fetch(100))
Example #6
    def testRecordsReader(self):
        """End-to-end test for records reader."""
        input_data = [str(i) for i in range(100)]

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for record in input_data:
                    w.write(record)

        control.start_map("test_map",
                          __name__ + ".TestHandler",
                          input_readers.__name__ +
                          ".GoogleCloudStorageRecordInputReader", {
                              "input_reader": {
                                  "bucket_name": bucket_name,
                                  "objects": [test_filename]
                              }
                          },
                          shard_count=4,
                          base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(100, len(TestHandler.processed_entites))
Example #7
    def testHandlerSerialization(self):
        """Test serializable handler works with MR and shard retry."""
        entity_count = 10

        for _ in range(entity_count):
            TestEntity(int_property=-1).put()

        # Force handler to serialize on every call.
        parameters.config._SLICE_DURATION_SEC = 0

        control.start_map(
            "test_map",
            __name__ + ".SerializableHandler",
            "mapreduce.input_readers.DatastoreInputReader", {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            shard_count=1,
            base_path="/mapreduce_base_path")

        task_run_counts = test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(
            task_run_counts[handlers.MapperWorkerCallbackHandler],
            # Shard retries + one per entity + one to exhaust input reader
            SerializableHandler.FAILURES_INDUCED_BY_INSTANCE + entity_count +
            1)
        vals = [e.int_property for e in TestEntity.all()]
        vals.sort()
        # SerializableHandler updates int_property to be incremental from 0 to 9.
        self.assertEquals(range(10), vals)
    def testHandlerSerialization(self):
        """Test serializable handler works with MR and shard retry."""
        entity_count = 10

        for _ in range(entity_count):
            TestEntity(int_property=-1).put()

        # Force handler to serialize on every call.
        handlers._SLICE_DURATION_SEC = 0

        control.start_map(
            "test_map",
            __name__ + ".SerializableHandler",
            "mapreduce.input_readers.DatastoreInputReader",
            {"entity_kind": __name__ + "." + TestEntity.__name__},
            shard_count=1,
            base_path="/mapreduce_base_path",
        )

        task_run_counts = test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(
            task_run_counts[handlers.MapperWorkerCallbackHandler],
            entity_count + 1 + SerializableHandler.TASKS_CONSUMED_BY_RETRY,
        )
        vals = [e.int_property for e in TestEntity.all()]
        vals.sort()
        # SerializableHandler updates int_property to be incremental from 0 to 9.
        self.assertEquals(range(10), vals)
def start_map(
    fbl,
    name,
    handler_spec,
    entity_kind,
    filters=None,
    handle_batch_size=None,
    output_writer_spec=None,
    output_writer=None,
    queue='slow-queue',
    extra_mapper_params=None,
    randomize_tokens=True,
):
    filters = filters or []
    output_writer = output_writer or {}
    extra_mapper_params = extra_mapper_params or {}
    mapper_params = {
        'entity_kind': entity_kind,
        'handle_batch_size': handle_batch_size,
        'filters': filters,
        'output_writer': output_writer,
    }
    mapper_params.update(
        get_fblookup_params(fbl, randomize_tokens=randomize_tokens))
    mapper_params.update(extra_mapper_params)
    control.start_map(
        name=name,
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec=handler_spec,
        output_writer_spec=output_writer_spec,
        shard_count=16,  # since we want to stick it in the slow-queue, and don't care how fast it executes
        queue_name=queue,
        mapper_parameters=mapper_params,
    )
Example #10
File: mr.py  Project: n054/catapult
 def get(self):
     # TODO(qyearsley): Add test coverage. See catapult:#1346.
     name = "Update test deprecation status."
     handler = "dashboard.mr.DeprecateTestsMapper"
     reader = "mapreduce.input_readers.DatastoreInputReader"
     mapper_parameters = {"entity_kind": ("dashboard.models.graph_data.TestMetadata"), "filters": []}
     mr_control.start_map(name, handler, reader, mapper_parameters)
Example #11
 def get(self):
     name = 'Update test deprecation status.'
     handler = ('dashboard.mr.DeprecateTestsMapper')
     reader = 'mapreduce.input_readers.DatastoreInputReader'
     mapper_parameters = {
         'entity_kind': ('dashboard.models.graph_data.TestMetadata'),
         'filters': [],
     }
     mr_control.start_map(name, handler, reader, mapper_parameters)
Example #12
File: mr.py  Project: joelagnel/catapult
 def get(self):
   name = 'Update anomalies with units.'
   handler = ('dashboard.mr.StoreUnitsInAnomalyEntity')
   reader = 'mapreduce.input_readers.DatastoreInputReader'
   mapper_parameters = {
       'entity_kind': ('dashboard.models.graph_data.Anomaly'),
       'filters': [],
   }
   mr_control.start_map(name, handler, reader, mapper_parameters)
 def get(self):
     control.start_map(
         name='Compute User-Event Stats',
         reader_spec='mapreduce.input_readers.DatastoreInputReader',
         handler_spec='users.user_event_tasks.map_compute_user_stats',
         mapper_parameters={'entity_kind': 'users.users.User'},
         queue_name='fast-queue',
         shard_count=5,
     )
Example #15
    def get(self):
        mapreduce_params = {
            'entity_kind': 'models.Topic',
        }

        control.start_map("DeleteOldUpdates", "mapjob.keep_thirty_updates",
                          "mapreduce.input_readers.DatastoreInputReader",
                          mapreduce_params, 2)
        self.response.out.write("ok")
Example #16
 def get(self):
   name = 'Update anomalies with units.'
   handler = ('dashboard.mr.StoreUnitsInAnomalyEntity')
   reader = 'mapreduce.input_readers.DatastoreInputReader'
   mapper_parameters = {
       'entity_kind': ('dashboard.models.graph_data.Anomaly'),
       'filters': [],
   }
   mr_control.start_map(name, handler, reader, mapper_parameters)
Example #17
 def get(self):
     # TODO(qyearsley): Add test coverage. See http://crbug.com/447432
     name = 'Update test deprecation status.'
     handler = ('dashboard.mr.DeprecateTestsMapper')
     reader = 'mapreduce.input_readers.DatastoreInputReader'
     mapper_parameters = {
         'entity_kind': ('dashboard.models.graph_data.Test'),
         'filters': [('has_rows', '=', True), ('deprecated', '=', False)],
     }
     mr_control.start_map(name, handler, reader, mapper_parameters)
Example #18
def delete_all_questions():
    logging.info("Delete all existing questions")
    ctrl.start_map("Delete all Question entities", 
                   'locql.delete_entity', 
                   'mapreduce.input_readers.DatastoreKeyInputReader', 
                   {'entity_kind': 'locql.Question'})
    ctrl.start_map("Delete all TermStat entities", 
                   'locql.delete_entity', 
                   'mapreduce.input_readers.DatastoreKeyInputReader',
                   {'entity_kind': 'locql.TermStat'})
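The 'locql.delete_entity' handler referenced above is not part of this listing. With DatastoreKeyInputReader the mapper receives datastore keys rather than full entities, so a deletion mapper for this library typically looks like the sketch below (the body is an assumption; only the spec string matches the call above):

from mapreduce import operation as op

def delete_entity(key):
    """Mapper for DatastoreKeyInputReader: called once per datastore key."""
    # Yield a delete mutation; the framework batches and applies it.
    yield op.db.Delete(key)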
Example #19
 def get(self):
     # TODO(qyearsley): Add test coverage. See catapult:#1346.
     name = 'Update test deprecation status.'
     handler = ('dashboard.mr.DeprecateTestsMapper')
     reader = 'mapreduce.input_readers.DatastoreInputReader'
     mapper_parameters = {
         'entity_kind': ('dashboard.models.graph_data.TestMetadata'),
         'filters': [('has_rows', '=', True), ('deprecated', '=', False)],
     }
     mr_control.start_map(name, handler, reader, mapper_parameters)
Example #20
 def get(self):
     table = 'rankings.cities.City'
     control.start_map(
         name='Delete %s' % table,
         reader_spec='mapreduce.input_readers.DatastoreInputReader',
         handler_spec='dancedeets.servlets.tools.delete_table',
         mapper_parameters={
             'entity_kind': table,
         },
     )
Example #21
 def get(self):
     control.start_map(
         name='Fixup Events',
         reader_spec='mapreduce.input_readers.DatastoreInputReader',
         handler_spec='dancedeets.servlets.tools.resave_object',
         mapper_parameters={
             'entity_kind': 'dancedeets.events.eventdata.DBEvent',
         },
         shard_count=16,
     )
Example #22
 def get(self):
     table = self.request.get('table') # users.users.User or events.eventdata.DBEvent or ...
     control.start_map(
         name='Resave %s' % table,
         reader_spec='mapreduce.input_readers.DatastoreInputReader',
         handler_spec='servlets.tools.resave_table',
         mapper_parameters={
             'entity_kind': table,
         },
     )
 def get(self):
   mr_control.start_map(
    self.request.get("name"),
    self.request.get("reader_spec", "your_mapreduce.map"),
    self.request.get("reader_parameters",
                     "mapreduce.input_readers.DatastoreInputReader"),
    { "entity_kind": self.request.get("entity_kind", "models.YourModel"),
      "processing_rate": int(self.request.get("processing_rate", 100)) },
    mapreduce_parameters={"done_callback": self.request.get("done_callback",
                                                            None) } )
   self.response.out.write("MapReduce scheduled");
def begin_user_ranking_calculations():
    control.start_map(
        name='Compute City Rankings by Users',
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec='dancedeets.rankings.rankings.count_user_for_city',
        mapper_parameters={'entity_kind': 'dancedeets.users.users.User'},
        queue_name='fast-queue',
        shard_count=16,
        _app=USER_FOR_CITY_RANKING,
    )
    _compute_summary(expiry=5 * 60)  # 5 minutes
Example #28
 def get(self):
     table = self.request.get(
         'table')  # users.users.User or events.eventdata.DBEvent or ...
     control.start_map(
         name='Resave %s' % table,
         reader_spec='mapreduce.input_readers.DatastoreInputReader',
         handler_spec='dancedeets.servlets.tools.resave_table',
         mapper_parameters={
             'entity_kind': table,
         },
     )
Example #29
def start_map(name, params=None, eta=None, countdown=None):
  for config in status.MapReduceYaml.to_dict(status.get_mapreduce_yaml()):
    if config.get('name') == name:
      break

  # Add the mapreduce specific parameters to the params dictionary
  config['mapper_params'].update(params if params else {})

  control.start_map(config['name'], config['mapper_handler'],
                    config['mapper_input_reader'], config['mapper_params'],
                    eta=eta,
                    countdown=countdown)
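With the job definition coming from mapreduce.yaml, a caller only supplies the job name plus any parameter overrides. A hypothetical invocation (the job name and parameters below are placeholders):

# "daily_cleanup" would have to be a job name defined in mapreduce.yaml.
start_map('daily_cleanup',
          params={'entity_kind': 'models.MyModel'},
          countdown=60)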
Example #30
def mr_import_fc_file(blob_key):
    logging.info('mr_import_fc_file start bk:%s' % (blob_key))
    mr_ctl.start_map(
        name='Import a fortune file',
        handler_spec='main.mr_import_fc_line',
        reader_spec='mapreduce.input_readers.BlobstoreLineInputReader',
        mapper_parameters={
            # FIXME add blob key to done callback
            'done_callback': '/mr_done',
            'blob_keys': str(blob_key),
        },
    )
    logging.info('mr_import_fc_file end')
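The 'main.mr_import_fc_line' handler is not shown in this listing. BlobstoreLineInputReader hands the mapper (byte_offset, line) tuples, so a minimal sketch of such a line handler might look like this (the body is an assumption for illustration; the real handler would create fortune entities):

from mapreduce import operation as op

def mr_import_fc_line(offset_and_line):
    """Mapper for BlobstoreLineInputReader: receives (byte_offset, line) tuples."""
    _offset, line = offset_and_line
    if not line.strip():
        return
    # The real handler would build and store an entity from the line here;
    # this sketch just counts non-empty lines.
    yield op.counters.Increment('fortune-lines-imported')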
Example #31
    def get(self):
        processing_rate = 3
        shard_count = 2

        control.start_map(
            "Iterate over all Dummy objects in the DB",  # this an arbitrary description string
            "tasks.mapper_function",  # this is the function that will bne
            "mapreduce.input_readers.DatastoreInputReader",
            {
                "entity_kind": "dummy.Dummy",  # the model that you will iterate over
                "processing_rate": processing_rate,  # how many entities will each shard process
            },
            shard_count=shard_count,  # how many shards will be created by every MR controller
            queue_name="default",  # the name of the queue that will be used for this MR's jobs, I used default to minimize config
        )
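The "tasks.mapper_function" referenced above is called once per dummy.Dummy entity by the DatastoreInputReader. A minimal sketch of such a mapper (the counter name and body are assumptions; only the module path matches the spec string above):

from mapreduce import operation as op

def mapper_function(entity):
    """Called once per dummy.Dummy entity handed out by the DatastoreInputReader."""
    # Process the entity here, then bump a counter visible in the mapreduce status UI.
    yield op.counters.Increment('dummy-entities-processed')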
Example #32
File: pipes.py  Project: stucox/djangae
    def run(self,
            job_name,
            handler_spec,
            input_reader_spec,
            output_writer_spec=None,
            params=None,
            shards=None):
        """
            Overwriting this method allows us to pass the base_path properly, I know it's stupid but I think
            this is the cleanest way that still gives us a working Pipeline that we can chain
        """
        if shards is None:
            shards = parameters.config.SHARD_COUNT

        mapreduce_id = control.start_map(
            job_name,
            handler_spec,
            input_reader_spec,
            params or {},
            mapreduce_parameters={
                "done_callback": self.get_callback_url(),
                "done_callback_method": "GET",
                "pipeline_id": self.pipeline_id,
                "base_path": BASE_PATH,
            },
            shard_count=shards,
            output_writer_spec=output_writer_spec,
            queue_name=self.queue_name,
        )
        self.fill(self.outputs.job_id, mapreduce_id)
        self.set_status(console_url="%s/detail?mapreduce_id=%s" %
                        ((parameters.config.BASE_PATH, mapreduce_id)))
    def testMultipleShards(self):
        entity_count = 1000

        for _ in range(entity_count):
            TestEntity().put()

        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".test_handler_yield_key_str",
            DATASTORE_READER_NAME, {
                "entity_kind": __name__ + "." + TestEntity.__name__,
                "output_sharding": "input",
            },
            shard_count=4,
            base_path="/mapreduce_base_path",
            output_writer_spec=BLOBSTORE_WRITER_NAME)

        test_support.execute_until_empty(self.taskqueue)

        mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
        filenames = output_writers.BlobstoreOutputWriter.get_filenames(
            mapreduce_state)
        self.assertEqual(4, len(set(filenames)))

        file_lengths = []
        for filename in filenames:
            self.assertTrue(filename.startswith("/blobstore/"))
            self.assertFalse(filename.startswith("/blobstore/writable:"))

            with files.open(filename, "r") as f:
                data = f.read(10000000)
                file_lengths.append(len(data.strip().split("\n")))

        self.assertEqual(1000, sum(file_lengths))
  def _runTest(self, num_shards):
    entity_count = 1000
    bucket_name = "bucket"
    job_name = "test_map"

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        job_name,
        __name__ + ".test_handler_yield_key_str",
        DATASTORE_READER_NAME,
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "output_writer": {
                "bucket_name": bucket_name,
            },
        },
        shard_count=num_shards,
        output_writer_spec=self.WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)
    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = self.WRITER_CLS.get_filenames(mapreduce_state)

    self.assertEqual(num_shards, len(filenames))
    total_entries = 0
    for shard in range(num_shards):
      self.assertTrue(filenames[shard].startswith("/%s/%s" % (bucket_name,
                                                              job_name)))
      data = cloudstorage.open(filenames[shard]).read()
      # strip() is used to remove the last newline of each file so that split()
      # does not return extraneous empty entries.
      total_entries += len(data.strip().split("\n"))
    self.assertEqual(entity_count, total_entries)
Example #35
def begin_event_ranking_calculations(vertical):
    filters = [('verticals', '=', vertical)]

    control.start_map(
        name='Compute City Rankings by %s Events' % vertical,
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec='dancedeets.rankings.rankings.count_event_for_city',
        mapper_parameters={
            'entity_kind': 'dancedeets.events.eventdata.DBEvent',
            'filters': filters,
        },
        queue_name='fast-queue',
        shard_count=16,
        _app=_get_app_id(EVENT_FOR_CITY_RANKING, vertical),
    )
    _compute_summary(expiry=5 * 60)  # 5 minutes
    def testHugeTaskUseDatastore(self):
        """Test map job with huge parameter values."""
        input_file = files.blobstore.create()
        input_data = [str(i) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for record in input_data:
                    w.write(record)
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".TestHandler",
            "mapreduce.input_readers.RecordsReader",
            {
                "file": input_file,
                # the parameter can't be compressed and wouldn't fit into
                # taskqueue payload
                "huge_parameter": random_string(900000)
            },
            shard_count=4,
            base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(100, len(TestHandler.processed_entites))
        self.assertEquals([], util._HugeTaskPayload.all().fetch(100))
    def testStartMap_RaisingHooks(self):
        """Tests that MR can be scheduled with a dummy hook class installed.

    The dummy hook class raises NotImplementedError for all method calls so the
    default scheduling logic should be used.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
        TestEntity().put()

        shard_count = 4
        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".test_handler",
            "mapreduce.input_readers.DatastoreInputReader", {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            shard_count,
            mapreduce_parameters={"foo": "bar"},
            base_path="/mapreduce_base_path",
            queue_name="crazy-queue",
            hooks_class_name=hooks.__name__ + "." + hooks.Hooks.__name__)

        self.validate_map_started(mapreduce_id)
  def testMultipleShards(self):
    entity_count = 1000

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        DATASTORE_READER_NAME,
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "output_sharding": "input",
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=BLOBSTORE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.BlobstoreOutputWriter.get_filenames(
        mapreduce_state)
    self.assertEqual(4, len(filenames))

    file_lengths = []
    for filename in filenames:
      self.assertTrue(filename.startswith("/blobstore/"))
      self.assertFalse(filename.startswith("/blobstore/writable:"))

      with files.open(filename, "r") as f:
        data = f.read(10000000)
        file_lengths.append(len(data.strip().split("\n")))

    self.assertEqual(1000, sum(file_lengths))
    def testStartMap_Eta(self):
        """Test that MR can be scheduled into the future.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
        TestEntity().put()

        # MR should be scheduled into the future.
        eta = datetime.datetime.utcnow() + datetime.timedelta(hours=1)

        shard_count = 4
        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".test_handler",
            "mapreduce.input_readers.DatastoreInputReader", {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            shard_count,
            mapreduce_parameters={"foo": "bar"},
            base_path="/mapreduce_base_path",
            queue_name=self.QUEUE_NAME,
            eta=eta)

        task_eta = self.validate_map_started(mapreduce_id)
        self.assertEquals(eta.strftime("%Y/%m/%d %H:%M:%S"), task_eta)
    def testStartMap_Hooks(self):
        """Tests that MR can be scheduled with a hook class installed.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
        TestEntity().put()

        shard_count = 4
        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".test_handler",
            "mapreduce.input_readers.DatastoreInputReader", {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            shard_count,
            mapreduce_parameters={"foo": "bar"},
            base_path="/mapreduce_base_path",
            queue_name="crazy-queue",
            hooks_class_name=__name__ + "." + TestHooks.__name__)

        self.assertTrue(mapreduce_id)
        task, queue_name = TestHooks.enqueue_kickoff_task_calls[0]
        self.assertEqual(
            "/mapreduce_base_path/kickoffjob_callback/" + mapreduce_id,
            task.url)
        self.assertEqual("crazy-queue", queue_name)
    def testStartMap_Countdown(self):
        """Test that MR can be scheduled into the future.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
        TestEntity().put()

        # MR should be scheduled into the future.
        now_sec = long(time.time())

        shard_count = 4
        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".test_handler",
            "mapreduce.input_readers.DatastoreInputReader", {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            shard_count,
            mapreduce_parameters={"foo": "bar"},
            base_path="/mapreduce_base_path",
            queue_name=self.QUEUE_NAME,
            countdown=1000)

        task_eta = self.validate_map_started(mapreduce_id)
        eta_sec = time.mktime(time.strptime(task_eta, "%Y/%m/%d %H:%M:%S"))
        self.assertTrue(now_sec + 1000 <= eta_sec)
    def _runTest(self, num_shards):
        entity_count = 1000
        bucket_name = "bucket"
        job_name = "test_map"

        for _ in range(entity_count):
            TestEntity().put()

        mapreduce_id = control.start_map(
            job_name,
            __name__ + ".test_handler_yield_key_str",
            DATASTORE_READER_NAME, {
                "entity_kind": __name__ + "." + TestEntity.__name__,
                "output_writer": {
                    "bucket_name": bucket_name,
                },
            },
            shard_count=num_shards,
            output_writer_spec=self.WRITER_NAME)

        test_support.execute_until_empty(self.taskqueue)
        mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
        filenames = self.WRITER_CLS.get_filenames(mapreduce_state)

        self.assertEqual(num_shards, len(set(filenames)))
        total_entries = 0
        for shard in range(num_shards):
            self.assertTrue(filenames[shard].startswith(
                "/%s/%s" % (bucket_name, job_name)))
            data = cloudstorage.open(filenames[shard]).read()
            # strip() is used to remove the last newline of each file so that split()
            # does not return extraneous empty entries.
            total_entries += len(data.strip().split("\n"))
        self.assertEqual(entity_count, total_entries)
    def testDedicatedParams(self):
        entity_count = 1000

        for _ in range(entity_count):
            TestEntity().put()

        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".test_handler_yield_key_str",
            DATASTORE_READER_NAME, {
                "input_reader": {
                    "entity_kind": __name__ + "." + TestEntity.__name__,
                },
                "output_writer": {
                    "filesystem": "gs",
                    "gs_bucket_name": "bucket",
                },
            },
            shard_count=4,
            base_path="/mapreduce_base_path",
            output_writer_spec=FILE_WRITER_NAME)

        test_support.execute_until_empty(self.taskqueue)

        mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
        filenames = output_writers.FileOutputWriter.get_filenames(
            mapreduce_state)
        self.assertEqual(1, len(filenames))
        self.assertTrue(filenames[0].startswith("/gs/bucket/"))

        with files.open(filenames[0], "r") as f:
            data = f.read(10000000)
            self.assertEquals(1000, len(data.strip().split("\n")))
  def testHugeTaskUseDatastore(self):
    """Test map job with huge parameter values."""
    input_file = files.blobstore.create()
    input_data = [str(i) for i in range(100)]

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for record in input_data:
          w.write(record)
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        "mapreduce.input_readers.RecordsReader",
        {
            "file": input_file,
            # the parameter can't be compressed and wouldn't fit into
            # taskqueue payload
            "huge_parameter": random_string(900000)
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))
    self.assertEquals([], model._HugeTaskPayload.all().fetch(100))
  def testSingleShard(self):
    entity_count = 1000

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=BLOBSTORE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.BlobstoreOutputWriter.get_filenames(
        mapreduce_state)
    self.assertEqual(1, len(filenames))
    blob_name = filenames[0]
    self.assertTrue(blob_name.startswith("/blobstore/"))
    self.assertFalse(blob_name.startswith("/blobstore/writable:"))

    with files.open(blob_name, "r") as f:
      data = f.read(10000000)
      self.assertEquals(1000, len(data.strip().split("\n")))
  def testRecordsReader(self):
    """End-to-end test for records reader."""
    input_file = files.blobstore.create()
    input_data = [str(i) for i in range(100)]

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for record in input_data:
          w.write(record)
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        "mapreduce.input_readers.RecordsReader",
        {
            "file": input_file
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))
Example #47
    def run(self, job_name, handler_spec, input_reader_spec, output_writer_spec=None, params=None, shards=None):
        """
            Overwriting this method allows us to pass the base_path properly, I know it's stupid but I think
            this is the cleanest way that still gives us a working Pipeline that we can chain
        """
        if shards is None:
          shards = parameters.config.SHARD_COUNT

        mapreduce_id = control.start_map(
            job_name,
            handler_spec,
            input_reader_spec,
            params or {},
            mapreduce_parameters={
                "done_callback": self.get_callback_url(),
                "done_callback_method": "GET",
                "pipeline_id": self.pipeline_id,
                "base_path": BASE_PATH,
            },
            shard_count=shards,
            output_writer_spec=output_writer_spec,
            queue_name=self.queue_name,
            )
        self.fill(self.outputs.job_id, mapreduce_id)
        self.set_status(console_url="%s/detail?mapreduce_id=%s" % (
            (parameters.config.BASE_PATH, mapreduce_id)))
Example #48
 def post(self):
   """ Generate data sets here """
   if self.request.get("generate"):
      # For the SDK, only generate 1k entries or fewer
     num_entries = int(self.request.get("num_entries"))
     user = self.request.get("user")
     name = self.request.get("name")
     char_per_word = int(self.request.get("char_per_word"))
     entries_pp = int(self.request.get("entries_per_pipe"))
     route = gen_data(num_entries, user, name, char_per_word, entries_pp) 
     self.redirect('/data/wc')
     # pipeline seems broken
     #self.redirect(route)
   elif self.request.get("delete"):
     name = self.request.get("name") 
     dataset = WCDataSet.get_by_key_name(name)
     num_entries = dataset.num_entries
     mapreduce_id = control.start_map(
           name="Word removal",
           handler_spec="data.wordcount.delete_dataset",
           reader_spec="mapreduce.input_readers.DatastoreInputReader",
           mapper_parameters={
               "entity_kind": "data.wordcount." + get_word_class(num_entries),
               "processing_rate": 200
           },
           shard_count=64,
           mapreduce_parameters={model.MapreduceSpec.PARAM_DONE_CALLBACK: 
                      '/data/wc/delete_callback'},
           queue_name="default",
         )
     dataset.state = "Deleting" 
     dataset.mr_id = mapreduce_id
     dataset.put()
     self.redirect('/data/wc')
  def testStartMap_RaisingHooks(self):
    """Tests that MR can be scheduled with a dummy hook class installed.

    The dummy hook class raises NotImplementedError for all method calls so the
    default scheduling logic should be used.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
    TestEntity().put()

    shard_count = 4
    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count,
        mapreduce_parameters={"foo": "bar"},
        base_path="/mapreduce_base_path",
        queue_name="crazy-queue",
        hooks_class_name=hooks.__name__+"."+hooks.Hooks.__name__)

    self.validate_map_started(mapreduce_id)
  def testStartMap_Hooks(self):
    """Tests that MR can be scheduled with a hook class installed.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
    TestEntity().put()

    shard_count = 4
    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count,
        mapreduce_parameters={"foo": "bar"},
        base_path="/mapreduce_base_path",
        queue_name="crazy-queue",
        hooks_class_name=__name__+"."+TestHooks.__name__)

    self.assertTrue(mapreduce_id)
    task, queue_name = TestHooks.enqueue_kickoff_task_calls[0]
    self.assertEquals("/mapreduce_base_path/kickoffjob_callback", task.url)
    self.assertEquals("crazy-queue", queue_name)
  def testStartMap_Eta(self):
    """Test that MR can be scheduled into the future.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
    TestEntity().put()

    # MR should be scheduled into the future.
    eta = datetime.datetime.utcnow() + datetime.timedelta(hours=1)

    shard_count = 4
    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count,
        mapreduce_parameters={"foo": "bar"},
        base_path="/mapreduce_base_path",
        queue_name=self.QUEUE_NAME,
        eta=eta)

    task_eta = self.validate_map_started(mapreduce_id)
    self.assertEquals(eta.strftime("%Y/%m/%d %H:%M:%S"), task_eta)
  def testStartMap_Countdown(self):
    """Test that MR can be scheduled into the future.

    Most of start_map functionality is already tested by handlers_test.
    Just a smoke test is enough.
    """
    TestEntity().put()

    # MR should be scheduled into the future.
    now_sec = long(time.time())

    shard_count = 4
    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count,
        mapreduce_parameters={"foo": "bar"},
        base_path="/mapreduce_base_path",
        queue_name=self.QUEUE_NAME,
        countdown=1000)

    task_eta = self.validate_map_started(mapreduce_id)
    eta_sec = time.mktime(time.strptime(task_eta, "%Y/%m/%d %H:%M:%S"))
    self.assertTrue(now_sec + 1000 <= eta_sec)
  def testDedicatedParams(self):
    entity_count = 1000

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            "output_writer": {
                "filesystem": "gs",
                "gs_bucket_name": "bucket",
            },
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=FILE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.FileOutputWriter.get_filenames(mapreduce_state)
    self.assertEqual(1, len(filenames))
    self.assertTrue(filenames[0].startswith("/gs/bucket/"))

    with files.open(filenames[0], "r") as f:
      data = f.read(10000000)
      self.assertEquals(1000, len(data.strip().split("\n")))
Example #56
 def get(self):
     token_nickname = self.request.get('token_nickname', None)
     mapper_params = {
         'entity_kind': 'dancedeets.events.eventdata.DBEvent',
         'handle_batch_size': 20,
         'filters': [('search_time_period', '=', dates.TIME_FUTURE)],
         'token_nickname': token_nickname,
     }
     control.start_map(
         name='Post Future Japan Events',
         reader_spec='mapreduce.input_readers.DatastoreInputReader',
         handler_spec='dancedeets.pubsub.pubsub_tasks.map_post_jp_event',
          shard_count=8,  # since we want to stick it in the slow-queue, and don't care how fast it executes
         queue_name='fast-queue',
         mapper_parameters=mapper_params,
     )
Example #57
def mr_delete_bad_sources():
    mapper_params = {
        'entity_kind': 'event_scraper.thing_db.Source',
        'output_writer': {
            'mime_type': 'text/plain',
            'bucket_name': 'dancedeets-hrd.appspot.com',
        },
    }
    control.start_map(
        name='Delete Bad Sources',
        reader_spec='mapreduce.input_readers.DatastoreInputReader',
        handler_spec='event_scraper.thing_scraper.delete_bad_source',
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        shard_count=8,
        queue_name='fast-queue',
        mapper_parameters=mapper_params,
    )
Example #58
def UpdateUserTestBeaconCounts(request):
  """Starts mapreducer.UserTestBeaconCount."""
  mr_id = control.start_map(
      'UserTest beacon_count update',
      'base.mapreducer.UserTestBeaconCount',
      'mapreduce.input_readers.DatastoreInputReader',
      {'entity_kind': 'models.user_test.Test'})
  return http.HttpResponse('Started MR w/ ID:%s' % mr_id)
Example #59
    def testLotsOfNdbEntities(self):
        entity_count = 1000

        for _ in range(entity_count):
            NdbTestEntity().put()

        control.start_map(
            "test_map",
            __name__ + ".TestHandler",
            input_readers.__name__ + ".DatastoreInputReader", {
                "entity_kind": __name__ + "." + NdbTestEntity.__name__,
            },
            shard_count=4,
            base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(entity_count, len(TestHandler.processed_entites))
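Several of the tests above assert against TestHandler.processed_entites. The real test modules define that handler themselves; a minimal equivalent sketch is:

class TestHandler(object):
    """Collects every entity passed to the mapper (sketch; the tests define their own)."""

    processed_entites = []  # attribute name kept exactly as referenced in the assertions above

    def __call__(self, entity):
        TestHandler.processed_entites.append(entity)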