    def test_split_input(self):
        SHARD_COUNT = 10
        BATCH_SIZE = 2
        mapper_spec = model.MapperSpec(
            "FooHandler",
            "mapreduce_utils.DatastoreQueryInputReader",
            {
                "input_reader": {
                    "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
                    "batch_size": BATCH_SIZE,
                }
            },
            SHARD_COUNT)

        def num_expected():
            # Each reader covers one batch, so the expected reader count is
            # the number of batches needed for the whole data set (rounded
            # up), capped at SHARD_COUNT.
            batch_size = min(len(self.dataSet), BATCH_SIZE)
            num_batches = int(math.ceil(len(self.dataSet) / float(batch_size)))
            return min(num_batches, SHARD_COUNT)

        ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
        self.assertEqual(3, num_expected())  # 1-3, 3-5, 5-None
        self.assertEqual(3, len(ds_input_readers))

        # batch_size = dataSet bigger half
        BATCH_SIZE = int(math.ceil(len(self.dataSet) / 2.0))
        mapper_spec = model.MapperSpec(
            "FooHandler",
            "mapreduce_utils.DatastoreQueryInputReader",
            {
                "input_reader": {
                    "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
                    "batch_size": BATCH_SIZE,
                }
            },
            SHARD_COUNT)
        ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
        self.assertEqual(2, num_expected())  # 1-4, 4-None
        self.assertEqual(2, len(ds_input_readers))

        # batch_size > dataSet itself
        BATCH_SIZE = len(self.dataSet) * 2
        mapper_spec = model.MapperSpec(
            "FooHandler",
            "mapreduce_utils.DatastoreQueryInputReader",
            {
                "input_reader": {
                    "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
                    "batch_size": BATCH_SIZE,
                }
            },
            SHARD_COUNT)
        ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
        self.assertEqual(1, num_expected())  # 1-None
        self.assertEqual(1, len(ds_input_readers))
    def testToJson(self):
        mapper_spec = model.MapperSpec(self.TEST_HANDLER, self.TEST_READER,
                                       {"entity_kind": self.ENTITY_KIND}, 8)
        self.assertEqual(self.default_json, mapper_spec.to_json())

        mapper_spec = model.MapperSpec(self.TEST_HANDLER,
                                       self.TEST_READER,
                                       {"entity_kind": self.ENTITY_KIND},
                                       8,
                                       output_writer_spec=self.TEST_WRITER)
        d = dict(self.default_json)
        d["mapper_output_writer"] = self.TEST_WRITER
        self.assertEqual(d, mapper_spec.to_json())
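
For reference, a hedged sketch of the shape self.default_json is expected to
have (the "mapper_output_writer" key is confirmed by the test itself; the
remaining key names are assumptions about MapperSpec.to_json):

    default_json = {
        "mapper_handler_spec": self.TEST_HANDLER,
        "mapper_input_reader": self.TEST_READER,
        "mapper_params": {"entity_kind": self.ENTITY_KIND},
        "mapper_shard_count": 8,
    }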
Example #3
    def handle(self):
        """Handles start request."""
        # Mapper spec as form arguments.
        mapreduce_name = self._get_required_param("name")
        mapper_input_reader_spec = self._get_required_param(
            "mapper_input_reader")
        mapper_handler_spec = self._get_required_param("mapper_handler")
        mapper_output_writer_spec = self.request.get("mapper_output_writer")
        mapper_params = self._get_params("mapper_params_validator",
                                         "mapper_params.")
        params = self._get_params("params_validator", "params.")

        # Set some mapper param defaults if not present.
        mapper_params["processing_rate"] = int(
            mapper_params.get("processing_rate")
            or model._DEFAULT_PROCESSING_RATE_PER_SEC)
        queue_name = mapper_params["queue_name"] = mapper_params.get(
            "queue_name", "default")

        # Validate the Mapper spec, handler, and input reader.
        mapper_spec = model.MapperSpec(
            mapper_handler_spec,
            mapper_input_reader_spec,
            mapper_params,
            int(mapper_params.get("shard_count", model._DEFAULT_SHARD_COUNT)),
            output_writer_spec=mapper_output_writer_spec)

        mapreduce_id = type(self)._start_map(mapreduce_name,
                                             mapper_spec,
                                             params,
                                             base_path=self.base_path(),
                                             queue_name=queue_name,
                                             _app=mapper_params.get("_app"))
        self.json_response["mapreduce_id"] = mapreduce_id
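
For illustration, a hedged sketch of the request this handler consumes (the
field names come from the _get_required_param and _get_params calls above;
the URL path and the concrete values are assumptions):

    # POST <base_path>/command/start_job
    #   name=count_entities
    #   mapper_input_reader=mapreduce.input_readers.DatastoreInputReader
    #   mapper_handler=main.process_entity
    #   mapper_params.entity_kind=main.MyEntity
    #   mapper_params.shard_count=8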
    def test_with_filter_factory(self):
        SHARD_COUNT = 10
        FF_PATH = \
            "test_mapreduce_utils.DatastoreQueryInputReaderTest." \
            "simple_parametrized_filter_factory"

        params = {
            "input_reader": {
                "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
                "filter_factory_spec": {
                    "name": FF_PATH,
                    "args": ["B"]
                }
            }
        }
        mapper_spec = model.MapperSpec(
            "FooHandler",
            "mapreduce_utils.DatastoreQueryInputReader",
            params,
            SHARD_COUNT)

        ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
        got = reduce(operator.add,
            (list(reader) for reader in ds_input_readers))
        self.assertEqual(2, len(got))
        data1, data2 = filter(lambda i: i['type'] == "B", self.dataSet)
        got.sort(key=lambda i: i.name)
        self.assertDictEqual(data1, db.to_dict(got.pop(0)))
        self.assertDictEqual(data2, db.to_dict(got.pop(0)))
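
A hedged sketch of what the referenced filter factory might look like (its
real body is not shown on this page; the assumed return shape mirrors the
"filters" parameter used in test_with_query_filters below):

    @staticmethod
    def simple_parametrized_filter_factory(type_value):
        # Turn the factory args (["B"] above) into datastore query filters.
        return [("type", "=", type_value)]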
Example #5
    def testGeneratorWithKeyRange(self):
        """Test DjangoModelInputReader as generator using KeyRanges."""
        expected_entities = []
        for i in range(0, 100):
            entity = TestModel(test_property=i)
            entity.save()
            expected_entities.append(entity)

        params = {
            "entity_kind": ENTITY_KIND,
        }
        mapper_spec = model.MapperSpec(
            "FooHandler",
            "djangoappengine.mapreduce.input_readers.DjangoModelInputReader",
            params, 1)

        input_ranges = DjangoModelInputReader.split_input(mapper_spec)

        entities = []
        for query_range in input_ranges:
            for entity in query_range:
                entities.append(entity)

        self.assertEqual(100, len(entities))
        self.assertEqual(expected_entities, entities)
Example #6
def start_map(name,
              handler_spec,
              reader_spec,
              mapper_parameters,
              shard_count=_DEFAULT_SHARD_COUNT,
              output_writer_spec=None,
              mapreduce_parameters=None,
              base_path=base_handler._DEFAULT_BASE_PATH,
              queue_name="default",
              eta=None,
              countdown=None,
              hooks_class_name=None,
              _app=None,
              transactional=False):
    """Start a new, mapper-only mapreduce.

  Args:
    name: mapreduce name. Used only for display purposes.
    handler_spec: fully qualified name of mapper handler function/class to call.
    reader_spec: fully qualified name of mapper reader to use
    mapper_parameters: dictionary of parameters to pass to mapper. These are
      mapper-specific and also used for reader initialization.
    shard_count: number of shards to create.
    output_writer_spec: fully qualified name of the output writer to use,
      if any.
    mapreduce_parameters: dictionary of mapreduce parameters relevant to the
      whole job.
    base_path: base path of mapreduce library handler specified in app.yaml.
      "/mapreduce" by default.
    queue_name: executor queue name to be used for mapreduce tasks.
    eta: Absolute time when the MR should execute. May not be specified
        if 'countdown' is also supplied. This may be timezone-aware or
        timezone-naive.
    countdown: Time in seconds into the future that this MR should execute.
        Defaults to zero.
    hooks_class_name: fully qualified name of a hooks.Hooks subclass.
    transactional: Specifies if job should be started as a part of already
      opened transaction.

  Returns:
    mapreduce id as string.
  """
    if not shard_count:
        shard_count = _DEFAULT_SHARD_COUNT
    mapper_spec = model.MapperSpec(handler_spec,
                                   reader_spec,
                                   mapper_parameters,
                                   shard_count,
                                   output_writer_spec=output_writer_spec)

    return handlers.StartJobHandler._start_map(
        name,
        mapper_spec,
        mapreduce_parameters or {},
        base_path=base_path,
        queue_name=queue_name,
        eta=eta,
        countdown=countdown,
        hooks_class_name=hooks_class_name,
        _app=_app,
        transactional=transactional)
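
A minimal usage sketch (the handler and entity paths are hypothetical;
DatastoreInputReader is the stock reader shipped with the library):

    mapreduce_id = start_map(
        name="count_entities",
        handler_spec="main.process_entity",
        reader_spec="mapreduce.input_readers.DatastoreInputReader",
        mapper_parameters={"entity_kind": "main.MyEntity"},
        shard_count=8)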
Example #7
 def testValidate_NoEntityFails(self):
     """Test validate function raises exception with no entity parameter."""
     params = {}
     mapper_spec = model.MapperSpec(
         "FooHandler",
         "djangoappengine.mapreduce.input_readers.DjangoModelInputReader",
         params, 1)
     self.assertRaises(input_readers.BadReaderParamsError,
                       DjangoModelInputReader.validate, mapper_spec)
Example #8
 def testValidate_Passes(self):
     """Test validate function accepts valid parameters."""
     params = {
         "entity_kind": ENTITY_KIND,
     }
     mapper_spec = model.MapperSpec(
         "FooHandler",
         "djangoappengine.mapreduce.input_readers.DjangoModelInputReader",
         params, 1)
     DjangoModelInputReader.validate(mapper_spec)
 def create_mapper_spec(self,
                        output_writer_spec=BLOBSTORE_WRITER_NAME,
                        params=None):
     params = params or {}
     mapper_spec = model.MapperSpec(
         "FooHandler",
         "mapreduce.input_readers.DatastoreInputReader",
         params,
         10,
         output_writer_spec=output_writer_spec)
     return mapper_spec
Example #10
 def testGetTaskHeaders(self):
     mr_spec = model.MapreduceSpec(name="foo",
                                   mapreduce_id="foo_id",
                                   mapper_spec=model.MapperSpec(
                                       "foo", "foo", {}, 8).to_json())
     task = taskqueue.Task(url="/relative_url",
                           headers=util._get_task_headers(
                               mr_spec.mapreduce_id))
     self.assertEqual("foo_id", task.headers[util._MR_ID_TASK_HEADER])
     self.assertEqual("v7.foo-module.foo.appspot.com", task.headers["Host"])
     self.assertEqual("v7.foo-module", task.target)
Example #11
 def testValidate_BadEntityKind(self):
     """Test validate function with bad entity kind."""
     params = {
         "entity_kind": "foo",
     }
     mapper_spec = model.MapperSpec(
         "FooHandler",
         "djangoappengine.mapreduce.input_readers.DjangoModelInputReader",
         params, 1)
     self.assertRaises(input_readers.BadReaderParamsError,
                       DjangoModelInputReader.validate, mapper_spec)
Example #12
  def _get_mapper_spec(self):
    """Converts self to model.MapperSpec."""
    # pylint: disable=g-import-not-at-top
    from mapreduce import model

    return model.MapperSpec(
        handler_spec=util._obj_to_path(self.mapper),
        input_reader_spec=util._obj_to_path(self.input_reader_cls),
        params=self._get_mapper_params(),
        shard_count=self.shard_count,
        output_writer_spec=util._obj_to_path(self.output_writer_cls))
Example #13
 def testValidate_BadNamespace(self):
     """Test validate function with bad namespace."""
     params = {
         "entity_kind": ENTITY_KIND,
         "namespace": 'namespace',
     }
     mapper_spec = model.MapperSpec(
         "FooHandler",
         "djangoappengine.mapreduce.input_readers.DjangoModelInputReader",
         params, 1)
     self.assertRaises(input_readers.BadReaderParamsError,
                       DjangoModelInputReader.validate, mapper_spec)
Example #14
    def create_mapper_spec(self, output_params=None):
        """Create a Mapper specification using the GoogleCloudStorageOutputWriter.

    The specification generated uses a dummy handler and input reader. The
    number of shards is 10 (some number greater than 1).

    Args:
      output_params: parameters for the output writer.

    Returns:
      a model.MapperSpec with default settings and specified output_params.
    """
        return model.MapperSpec("DummyHandler",
                                "DummyInputReader",
                                {"output_writer": output_params or {}},
                                self.NUM_SHARDS,
                                output_writer_spec=self.WRITER_NAME)
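
A usage sketch, assuming the writer's standard "bucket_name" parameter (the
bucket value is hypothetical):

    spec = self.create_mapper_spec(
        output_params={"bucket_name": "test-bucket"})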
    def test_world(self):
        SHARD_COUNT = 10

        mapper_spec = model.MapperSpec(
            "FooHandler",
            "mapreduce_utils.DatastoreQueryInputReader",
            {
                "input_reader": {
                    "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
                }
            },
            SHARD_COUNT)

        ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
        got = reduce(operator.add,
            (list(reader) for reader in ds_input_readers))
        self.assertEqual(len(self.dataSet), len(got))
Example #16
def start_map(name,
              handler_spec,
              reader_spec,
              reader_parameters,
              shard_count,
              mapreduce_parameters=None,
              base_path="/mapreduce",
              queue_name="default",
              eta=None,
              countdown=None,
              _app=None):
    """Start a new, mapper-only mapreduce.

  Args:
    name: mapreduce name. Used only for display purposes.
    handler_spec: fully qualified name of mapper handler function/class to call.
    reader_spec: fully qualified name of mapper reader to use
    reader_parameters: dictionary of parameters to pass to reader. These are
      reader-specific.
    shard_count: number of shards to create.
    mapreduce_parameters: dictionary of mapreduce parameters relevant to the
      whole job.
    base_path: base path of mapreduce library handler specified in app.yaml.
      "/mapreduce" by default.
    queue_name: executor queue name to be used for mapreduce tasks.
    eta: Absolute time when the MR should execute. May not be specified
        if 'countdown' is also supplied. This may be timezone-aware or
        timezone-naive.
    countdown: Time in seconds into the future that this MR should execute.
        Defaults to zero.

  Returns:
    mapreduce id as string.
  """
    mapper_spec = model.MapperSpec(handler_spec, reader_spec,
                                   reader_parameters, shard_count)

    return handlers.StartJobHandler._start_map(name,
                                               mapper_spec,
                                               mapreduce_parameters or {},
                                               base_path=base_path,
                                               queue_name=queue_name,
                                               eta=eta,
                                               countdown=countdown,
                                               _app=_app)
    def test_with_query_filters(self):
        SHARD_COUNT = 10
        mapper_spec = model.MapperSpec(
            "FooHandler",
            "mapreduce_utils.DatastoreQueryInputReader",
            {
                "input_reader": {
                    "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
                    "filters": [("type", "=", "C")],
                }
            },
            SHARD_COUNT)

        ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
        got = reduce(operator.add,
            (list(reader) for reader in ds_input_readers))
        self.assertEqual(3, len(got))
        data1, data2, data3 = filter(lambda i: i['type'] == "C", self.dataSet)
        got.sort(key=lambda i: i.name)
        self.assertDictEqual(data1, db.to_dict(got.pop(0)))
        self.assertDictEqual(data2, db.to_dict(got.pop(0)))
        self.assertDictEqual(data3, db.to_dict(got.pop(0)))
    def testFindAllByMapreduceState(self):
        mr_state = model.MapreduceState.create_new("mapreduce-id")
        mr_state.mapreduce_spec = model.MapreduceSpec(
            "mapreduce", "mapreduce-id",
            model.MapperSpec("handler", "input-reader", {},
                             shard_count=304).to_json())
        mr_state.put()
        for i in range(304):
            model.ShardState.create_new("mapreduce-id", i).put()

        @db.transactional(xg=False)
        def non_xg_tx():
            # Open a single unrelated entity group to ensure
            # find_all_by_mapreduce_state does not attempt to use the
            # outer transaction.
            mr_state2 = model.MapreduceState.create_new(
                "unrelated-mapreduce-id")
            mr_state2.put()
            shard_states = model.ShardState.find_all_by_mapreduce_state(
                mr_state)
            for i, ss in enumerate(shard_states):
                self.assertEqual(i, ss.shard_number)

        non_xg_tx()
Example #19
def start_map(name,
              handler_spec,
              reader_spec,
              mapper_parameters,
              shard_count=None,
              output_writer_spec=None,
              mapreduce_parameters=None,
              base_path=None,
              queue_name=None,
              eta=None,
              countdown=None,
              hooks_class_name=None,
              _app=None,
              in_xg_transaction=False):
    """Start a new, mapper-only mapreduce.

  Deprecated! Use map_job.start instead.

  If a value can be specified both from an explicit argument and from
  a dictionary, the value from the explicit argument wins.

  Args:
    name: mapreduce name. Used only for display purposes.
    handler_spec: fully qualified name of mapper handler function/class to call.
    reader_spec: fully qualified name of mapper reader to use
    mapper_parameters: dictionary of parameters to pass to mapper. These are
      mapper-specific and also used for reader/writer initialization.
      Should have format {"input_reader": {}, "output_writer":{}}. Old
      deprecated style does not have sub dictionaries.
    shard_count: number of shards to create.
    mapreduce_parameters: dictionary of mapreduce parameters relevant to the
      whole job.
    base_path: base path of mapreduce library handler specified in app.yaml.
      "/mapreduce" by default.
    queue_name: taskqueue queue name to be used for mapreduce tasks.
      see util.get_queue_name.
    eta: absolute time when the MR should execute. May not be specified
      if 'countdown' is also supplied. This may be timezone-aware or
      timezone-naive.
    countdown: time in seconds into the future that this MR should execute.
      Defaults to zero.
    hooks_class_name: fully qualified name of a hooks.Hooks subclass.
    in_xg_transaction: controls what transaction scope to use to start this MR
      job. If True, there has to be an already opened cross-group transaction
      scope. MR will use one entity group from it.
      If False, MR will create an independent transaction to start the job
      regardless of any existing transaction scopes.

  Returns:
    mapreduce id as string.
  """
    if shard_count is None:
        shard_count = parameters.config.SHARD_COUNT

    if mapper_parameters:
        mapper_parameters = dict(mapper_parameters)

    # Make sure this old API fills all parameters with default values.
    mr_params = map_job.JobConfig._get_default_mr_params()
    if mapreduce_parameters:
        mr_params.update(mapreduce_parameters)

    # Override default values if user specified them as arguments.
    if base_path:
        mr_params["base_path"] = base_path
    mr_params["queue_name"] = util.get_queue_name(queue_name)

    mapper_spec = model.MapperSpec(handler_spec,
                                   reader_spec,
                                   mapper_parameters,
                                   shard_count,
                                   output_writer_spec=output_writer_spec)

    if in_xg_transaction and not db.is_in_transaction():
        logging.warning("Expects an opened xg transaction to start mapreduce "
                        "when transactional is True.")

    return handlers.StartJobHandler._start_map(
        name,
        mapper_spec,
        mr_params,
        # TODO(user): Now that "queue_name" is part of mr_params.
        # Remove all the other ways to get queue_name after one release.
        queue_name=mr_params["queue_name"],
        eta=eta,
        countdown=countdown,
        hooks_class_name=hooks_class_name,
        _app=_app,
        in_xg_transaction=in_xg_transaction)
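
A hedged usage sketch of the in_xg_transaction path (the entity key and the
job names are hypothetical; db.transactional(xg=True) is the real decorator
for cross-group transactions):

    @db.transactional(xg=True)
    def start_job_in_tx(parent_key):
        # Touch an entity group inside the XG transaction, then start the
        # job so its kickoff is committed as part of that transaction.
        db.get(parent_key)
        return start_map(
            "my_job",
            "main.process_entity",
            "mapreduce.input_readers.DatastoreInputReader",
            {"input_reader": {"entity_kind": "main.MyEntity"}},
            in_xg_transaction=True)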
Example #20
def start_map(name,
              handler_spec,
              reader_spec,
              mapper_parameters,
              shard_count=_DEFAULT_SHARD_COUNT,
              output_writer_spec=None,
              mapreduce_parameters=None,
              base_path=None,
              queue_name=None,
              eta=None,
              countdown=None,
              hooks_class_name=None,
              _app=None,
              transactional=False,
              transactional_parent=None):
  """Start a new, mapper-only mapreduce.

  Args:
    name: mapreduce name. Used only for display purposes.
    handler_spec: fully qualified name of mapper handler function/class to call.
    reader_spec: fully qualified name of mapper reader to use
    mapper_parameters: dictionary of parameters to pass to mapper. These are
      mapper-specific and also used for reader initialization.
    shard_count: number of shards to create.
    mapreduce_parameters: dictionary of mapreduce parameters relevant to the
      whole job.
    base_path: base path of mapreduce library handler specified in app.yaml.
      "/mapreduce" by default.
    queue_name: executor queue name to be used for mapreduce tasks. If
      unspecified it will be the "default" queue or inherit the queue of
      the currently running request.
    eta: absolute time when the MR should execute. May not be specified
      if 'countdown' is also supplied. This may be timezone-aware or
      timezone-naive.
    countdown: time in seconds into the future that this MR should execute.
      Defaults to zero.
    hooks_class_name: fully qualified name of a hooks.Hooks subclass.
    transactional: specifies if job should be started as a part of already
      opened transaction.
    transactional_parent: specifies the entity which is already a part of
      transaction. Child entity will be used to store task payload if mapreduce
      specification is too big.

  Returns:
    mapreduce id as string.
  """
  if not shard_count:
    shard_count = _DEFAULT_SHARD_COUNT
  if base_path is None:
    base_path = base_handler._DEFAULT_BASE_PATH

  if mapper_parameters:
    mapper_parameters = dict(mapper_parameters)
  if mapreduce_parameters:
    mapreduce_parameters = dict(mapreduce_parameters)

  mapper_spec = model.MapperSpec(handler_spec,
                                 reader_spec,
                                 mapper_parameters,
                                 shard_count,
                                 output_writer_spec=output_writer_spec)

  if transactional and not transactional_parent:
    # We should really fail here, but there might be some customers
    # of this code that wouldn't like this.
    # This will cause problems only for huge job definitions.
    logging.error(
        "transactional_parent should be specified for transactional starts. "
        "Your job will fail to start if the mapreduce specification is too "
        "big.")

  return handlers.StartJobHandler._start_map(
      name,
      mapper_spec,
      mapreduce_parameters or {},
      base_path=base_path,
      queue_name=queue_name,
      eta=eta,
      countdown=countdown,
      hooks_class_name=hooks_class_name,
      _app=_app,
      transactional=transactional,
      parent_entity=transactional_parent)
Example #21
def start_map(name,
              handler_spec,
              reader_spec,
              mapper_parameters,
              shard_count=None,
              output_writer_spec=None,
              mapreduce_parameters=None,
              base_path=None,
              queue_name=None,
              eta=None,
              countdown=None,
              hooks_class_name=None,
              _app=None,
              in_xg_transaction=False):
    """Start a new, mapper-only mapreduce.

  Args:
    name: mapreduce name. Used only for display purposes.
    handler_spec: fully qualified name of mapper handler function/class to call.
    reader_spec: fully qualified name of mapper reader to use
    mapper_parameters: dictionary of parameters to pass to mapper. These are
      mapper-specific and also used for reader initialization.
    shard_count: number of shards to create.
    mapreduce_parameters: dictionary of mapreduce parameters relevant to the
      whole job.
    base_path: base path of mapreduce library handler specified in app.yaml.
      "/mapreduce" by default.
    queue_name: taskqueue queue name to be used for mapreduce tasks.
      see util.get_queue_name.
    eta: absolute time when the MR should execute. May not be specified
      if 'countdown' is also supplied. This may be timezone-aware or
      timezone-naive.
    countdown: time in seconds into the future that this MR should execute.
      Defaults to zero.
    hooks_class_name: fully qualified name of a hooks.Hooks subclass.
    in_xg_transaction: controls what transaction scope to use to start this MR
      job. If True, there has to be an already opened cross-group transaction
      scope. MR will use one entity group from it.
      If False, MR will create an independent transaction to start the job
      regardless of any existing transaction scopes.

  Returns:
    mapreduce id as string.
  """
    if shard_count is None:
        shard_count = parameters.config.SHARD_COUNT
    if base_path is None:
        base_path = parameters.config.BASE_PATH

    if mapper_parameters:
        mapper_parameters = dict(mapper_parameters)
    if mapreduce_parameters:
        mapreduce_parameters = dict(mapreduce_parameters)
        if "base_path" not in mapreduce_parameters:
            mapreduce_parameters["base_path"] = base_path
    else:
        mapreduce_parameters = {"base_path": base_path}

    mapper_spec = model.MapperSpec(handler_spec,
                                   reader_spec,
                                   mapper_parameters,
                                   shard_count,
                                   output_writer_spec=output_writer_spec)

    if in_xg_transaction and not db.is_in_transaction():
        logging.warning("Expects an opened xg transaction to start mapreduce "
                        "when transactional is True.")

    return handlers.StartJobHandler._start_map(
        name,
        mapper_spec,
        mapreduce_parameters,
        queue_name=util.get_queue_name(queue_name),
        eta=eta,
        countdown=countdown,
        hooks_class_name=hooks_class_name,
        _app=_app,
        in_xg_transaction=in_xg_transaction)