Example #1
    def _schedule_shards(cls, spec, input_readers, output_writers, queue_name,
                         base_path):
        """Prepares shard states and schedules their execution.

        Args:
          spec: mapreduce specification as MapreduceSpec.
          input_readers: list of InputReaders describing shard splits.
          output_writers: list of OutputWriters, one per input reader.
          queue_name: The queue to run this job on.
          base_path: The base url path of mapreduce callbacks.
        """
        assert len(input_readers) == len(output_writers)
        # Note: it's safe to re-attempt this handler because:
        # - shard state has deterministic and unique key.
        # - _schedule_slice will fall back gracefully if a task already exists.
        shard_states = []
        for shard_number, input_reader in enumerate(input_readers):
            shard_state = model.ShardState.create_new(spec.mapreduce_id,
                                                      shard_number)
            shard_state.shard_description = str(input_reader)
            shard_states.append(shard_state)

        # Retrieves already existing shards.
        existing_shard_states = db.get(shard.key() for shard in shard_states)
        existing_shard_keys = set(shard.key()
                                  for shard in existing_shard_states
                                  if shard is not None)

        # Put only the shards that do not already exist.
        db.put((shard for shard in shard_states
                if shard.key() not in existing_shard_keys),
               config=util.create_datastore_write_config(spec))

        # Give each shard some quota to start with.
        processing_rate = int(
            spec.mapper.params.get("processing_rate")
            or model._DEFAULT_PROCESSING_RATE_PER_SEC)
        quota_refill = processing_rate / len(shard_states)
        quota_manager = quota.QuotaManager(memcache.Client())
        for shard_state in shard_states:
            quota_manager.put(shard_state.shard_id, quota_refill)

        # Schedule shard tasks.
        for shard_number, (input_reader, output_writer) in enumerate(
                zip(input_readers, output_writers)):
            shard_id = model.ShardState.shard_id_from_number(
                spec.mapreduce_id, shard_number)
            MapperWorkerCallbackHandler._schedule_slice(
                shard_states[shard_number],
                model.TransientShardState(base_path,
                                          spec,
                                          shard_id,
                                          0,
                                          input_reader,
                                          output_writer=output_writer),
                queue_name=queue_name)
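
Example #1 primes each shard with an equal share of the mapper's processing rate before any slice runs. The standalone sketch below (not part of the mapreduce library; DEFAULT_RATE is an assumed placeholder) shows just that arithmetic, using explicit floor division to match the Python 2 `/` on ints above.

# Illustrative sketch only: mirrors the quota split in Example #1 outside the
# library. DEFAULT_RATE is an assumed placeholder, not the model's real default.
DEFAULT_RATE = 1000

def initial_quota_per_shard(processing_rate, num_shards):
    # Split the job-level rate evenly; `//` matches the Python 2 integer
    # division of `processing_rate / len(shard_states)` in the handler.
    rate = int(processing_rate or DEFAULT_RATE)
    return rate // num_shards

# A rate of 1000 ops/sec across 3 shards primes each shard with 333 quota
# units; the remainder (1000 - 3 * 333 = 1) is simply not handed out.
assert initial_quota_per_shard(1000, 3) == 333
assert initial_quota_per_shard(None, 4) == 250  # falls back to the default
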
Example #2
    def _schedule_shards(cls, spec, input_readers, output_writers, queue_name,
                         base_path):
        """Prepares shard states and schedules their execution.

        Args:
          spec: mapreduce specification as MapreduceSpec.
          input_readers: list of InputReaders describing shard splits.
          output_writers: list of OutputWriters, one per input reader.
          queue_name: The queue to run this job on.
          base_path: The base url path of mapreduce callbacks.
        """
        assert len(input_readers) == len(output_writers)
        # Note: it's safe to re-attempt this handler because:
        # - shard state has deterministic and unique key.
        # - _schedule_slice will fall back gracefully if a task already exists.
        shard_states = []
        for shard_number, input_reader in enumerate(input_readers):
            shard_state = model.ShardState.create_new(spec.mapreduce_id,
                                                      shard_number)
            shard_state.shard_description = str(input_reader)
            shard_states.append(shard_state)

        # Retrieves already existing shards.
        existing_shard_states = db.get(shard.key() for shard in shard_states)
        existing_shard_keys = set(shard.key()
                                  for shard in existing_shard_states
                                  if shard is not None)

        # Put only the shards that do not already exist.
        db.put((shard for shard in shard_states
                if shard.key() not in existing_shard_keys),
               config=util.create_datastore_write_config(spec))

        for shard_number, (input_reader, output_writer) in enumerate(
                zip(input_readers, output_writers)):
            shard_id = model.ShardState.shard_id_from_number(
                spec.mapreduce_id, shard_number)
            MapperWorkerCallbackHandler._schedule_slice(
                model.TransientShardState(base_path,
                                          spec,
                                          shard_id,
                                          0,
                                          input_reader,
                                          output_writer=output_writer),
                queue_name=queue_name)
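
Example #2 is a variant of the same handler without the quota priming step, and `_schedule_slice` receives only the `TransientShardState`. Both versions rely on the idempotence trick noted in their comments: shard state keys are deterministic, so a retried request fetches what already exists and writes only the missing entities. Below is a rough standalone illustration of that pattern with a plain dict standing in for the datastore; `get_multi` and `put_missing` are hypothetical helpers, not the `db` module's API.

# Illustrative sketch only: the "write only what is missing" idea from both
# examples, with a plain dict standing in for the datastore. get_multi and
# put_missing are hypothetical helpers, not part of the db module.
_store = {}

def get_multi(keys):
    # Like a batch get: missing entities come back as None.
    return [_store.get(k) for k in keys]

def put_missing(entities, key_fn):
    # Write only entities whose key is not already stored, so a retried
    # request is a no-op for shards created by an earlier attempt.
    keys = [key_fn(e) for e in entities]
    existing = set(k for k, found in zip(keys, get_multi(keys))
                   if found is not None)
    for key, entity in zip(keys, entities):
        if key not in existing:
            _store[key] = entity

# Deterministic shard ids mean a second attempt writes nothing new.
shards = [{"shard_id": "job1-%d" % n} for n in range(4)]
put_missing(shards, key_fn=lambda s: s["shard_id"])
put_missing(shards, key_fn=lambda s: s["shard_id"])  # retry: no duplicates
assert len(_store) == 4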