Пример #1
0
    def validate(cls, mapper_spec):
        """Validates the mapper spec and its blob key parameters.

        Args:
          mapper_spec: The MapperSpec for this InputReader. Its params must
              contain cls.BLOB_KEYS_PARAM, holding either a sequence of blob
              keys or a single string of comma-separated blob keys.

        Raises:
          BadReaderParamsError: the spec names a different input reader class,
              the blob keys parameter is missing or empty, more than
              cls._MAX_BLOB_KEYS_COUNT keys are given, or no BlobInfo can be
              found for one of the keys.
        """
        if mapper_spec.input_reader_class() != cls:
            raise BadReaderParamsError("Mapper input reader class mismatch")
        params = mapper_spec.params
        if cls.BLOB_KEYS_PARAM not in params:
            # The message names the plural parameter, matching the other
            # errors raised by this method.
            raise BadReaderParamsError(
                "Must specify 'blob_keys' for mapper input")

        blob_keys = params[cls.BLOB_KEYS_PARAM]
        if isinstance(blob_keys, basestring):
            # This is a mechanism to allow multiple blob keys (which do not contain
            # commas) in a single string. It may go away.
            blob_keys = blob_keys.split(",")
        # Test for emptiness before counting so that a None value is reported
        # as a parameter error instead of failing inside len().
        if not blob_keys:
            raise BadReaderParamsError(
                "No 'blob_keys' specified for mapper input")
        if len(blob_keys) > cls._MAX_BLOB_KEYS_COUNT:
            raise BadReaderParamsError("Too many 'blob_keys' for mapper input")
        for blob_key in blob_keys:
            # Every key must resolve to an existing blob.
            blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key))
            if not blob_info:
                raise BadReaderParamsError(
                    "Could not find blobinfo for key %s" % blob_key)
Пример #2
0
    def split_input(cls, mapper_spec):
        """Builds the line readers that together cover every configured blob.

        Every blob is cut into the same number of consecutive position ranges.
        The last range of a blob always ends at the blob's size, so it absorbs
        any remainder left over by the integer division.

        Args:
          mapper_spec: The mapper specification to split from. Its params must
              hold one or more blob keys under cls.BLOB_KEYS_PARAM, either as
              a sequence or as one comma-separated string.

        Returns:
          A list of BlobstoreLineInputReader instances, one per range.
        """
        keys = mapper_spec.params[cls.BLOB_KEYS_PARAM]
        if isinstance(keys, basestring):
            # A single string may carry several keys separated by commas
            # (the keys themselves contain no commas). This may go away.
            keys = keys.split(",")

        # Look up the size of every blob before deciding on the layout.
        size_by_key = {}
        for key in keys:
            size_by_key[key] = blobstore.BlobInfo.get(
                blobstore.BlobKey(key)).size

        capped_shards = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
        # A quotient of zero falls back to one shard per blob.
        per_blob = (capped_shards // len(keys)) or 1

        readers = []
        for key, size in size_by_key.items():
            step = size // per_blob
            # Leading ranges are exactly `step` wide; the closing range runs
            # to the end of the blob.
            spans = [(step * n, step * (n + 1)) for n in xrange(per_blob - 1)]
            spans.append((step * (per_blob - 1), size))
            for start, stop in spans:
                readers.append(
                    BlobstoreLineInputReader.from_json({
                        cls.BLOB_KEY_PARAM: key,
                        cls.INITIAL_POSITION_PARAM: start,
                        cls.END_POSITION_PARAM: stop,
                    }))
        return readers
Пример #3
0
    def validate(cls, mapper_spec):
        """Checks that the mapper spec targets this reader and an existing blob.

        Args:
          mapper_spec: The MapperSpec for this InputReader. Its params must
              contain a blob key under cls.BLOB_KEY_PARAM.

        Raises:
          BadReaderParamsError: the spec names a different input reader class,
              the blob key parameter is missing, or no BlobInfo can be found
              for the given key.
        """
        if mapper_spec.input_reader_class() != cls:
            raise BadReaderParamsError("Mapper input reader class mismatch")

        reader_params = mapper_spec.params
        if cls.BLOB_KEY_PARAM not in reader_params:
            raise BadReaderParamsError(
                "Must specify 'blob_key' for mapper input")

        key = reader_params[cls.BLOB_KEY_PARAM]
        # A successful lookup is all that is required; the info is not used.
        if blobstore.BlobInfo.get(blobstore.BlobKey(key)):
            return
        raise BadReaderParamsError("Could not find blobinfo for key %s" % key)
Пример #4
0
    def split_input(cls, mapper_spec):
        """Returns a list of input readers covering every blob in the spec.

        Each blob is divided into shards_per_blob consecutive position ranges,
        where shards_per_blob is the capped shard count divided by the number
        of blob keys (at least 1). The last range of each blob ends at the
        blob's size, so it absorbs any remainder of the integer division.

        Args:
          mapper_spec: The mapper specification to split from. Must contain
              'blob_keys' parameter with one or more blob keys, given either
              as a sequence or as a single comma-separated string.

        Returns:
          A list of BlobstoreLineInputReaders corresponding to the specified
          shards.

        Raises:
          BadReaderParamsError: required parameters are missing or invalid,
              including a blob key for which no BlobInfo can be found.
        """
        if mapper_spec.input_reader_class() != cls:
            raise BadReaderParamsError("Mapper input reader class mismatch")
        params = mapper_spec.params
        if cls.BLOB_KEYS_PARAM not in params:
            raise BadReaderParamsError(
                "Must specify 'blob_keys' for mapper input")

        blob_keys = params[cls.BLOB_KEYS_PARAM]
        if isinstance(blob_keys, basestring):
            # This is a mechanism to allow multiple blob keys (which do not contain
            # commas) in a single string. It may go away.
            blob_keys = blob_keys.split(",")
        # Test for emptiness before counting so that a None value is reported
        # as a parameter error instead of failing inside len().
        if not blob_keys:
            raise BadReaderParamsError(
                "No 'blob_keys' specified for mapper input")
        if len(blob_keys) > cls._MAX_BLOB_KEYS_COUNT:
            raise BadReaderParamsError("Too many 'blob_keys' for mapper input")

        blob_sizes = {}
        for blob_key in blob_keys:
            blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key))
            if not blob_info:
                # Report an unknown key as a parameter error rather than
                # failing with AttributeError on the size lookup below.
                raise BadReaderParamsError(
                    "Could not find blobinfo for key %s" % blob_key)
            blob_sizes[blob_key] = blob_info.size

        shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
        shards_per_blob = shard_count // len(blob_keys)
        if shards_per_blob == 0:
            # More blobs than shards: still give every blob one reader.
            shards_per_blob = 1

        chunks = []
        for blob_key, blob_size in blob_sizes.items():
            blob_chunk_size = blob_size // shards_per_blob
            # All but the last range are exactly blob_chunk_size wide.
            for i in xrange(shards_per_blob - 1):
                chunks.append(
                    BlobstoreLineInputReader.from_json({
                        cls.BLOB_KEY_PARAM:
                        blob_key,
                        cls.INITIAL_POSITION_PARAM:
                        blob_chunk_size * i,
                        cls.END_POSITION_PARAM:
                        blob_chunk_size * (i + 1)
                    }))
            # The last range runs to the end of the blob.
            chunks.append(
                BlobstoreLineInputReader.from_json({
                    cls.BLOB_KEY_PARAM:
                    blob_key,
                    cls.INITIAL_POSITION_PARAM:
                    blob_chunk_size * (shards_per_blob - 1),
                    cls.END_POSITION_PARAM:
                    blob_size
                }))
        return chunks
Пример #5
0
    def split_input(cls, mapper_spec):
        """Returns a list of input readers covering every blob in the spec.

        Each blob is divided into shards_per_blob consecutive position ranges,
        where shards_per_blob is the capped shard count divided by the number
        of blob keys (at least 1). The last range of each blob ends at the
        blob's size, so it absorbs any remainder of the integer division.

        Args:
          mapper_spec: The mapper specification to split from. Its params must
              contain 'blob_keys': either a single blob key string or a
              sequence of blob keys.

        Returns:
          A list of BlobstoreLineInputReaders corresponding to the specified
          shards.

        Raises:
          BadReaderParamsError: if required parameters are missing or invalid,
              including a blob key for which no BlobInfo can be found.
        """
        if mapper_spec.input_reader_class() != cls:
            raise BadReaderParamsError("Mapper input reader class mismatch")
        params = mapper_spec.params
        if "blob_keys" not in params:
            raise BadReaderParamsError(
                "Must specify 'blob_keys' for mapper input")

        blob_keys = params["blob_keys"]
        if isinstance(blob_keys, basestring):
            # A bare string is treated as exactly one blob key.
            blob_keys = [blob_keys]
        # Test for emptiness before counting so that a None value is reported
        # as a parameter error instead of failing inside len().
        if not blob_keys:
            raise BadReaderParamsError(
                "No 'blob_keys' specified for mapper input")
        if len(blob_keys) > cls._MAX_BLOB_KEYS_COUNT:
            raise BadReaderParamsError("Too many 'blob_keys' for mapper input")

        blob_sizes = {}
        for blob_key in blob_keys:
            blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key))
            if not blob_info:
                # Report an unknown key as a parameter error rather than
                # failing with AttributeError on the size lookup below.
                raise BadReaderParamsError(
                    "Could not find blobinfo for key %s" % blob_key)
            blob_sizes[blob_key] = blob_info.size

        shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
        shards_per_blob = shard_count // len(blob_keys)
        if shards_per_blob == 0:
            # More blobs than shards: still give every blob one reader.
            shards_per_blob = 1

        chunks = []
        for blob_key, blob_size in blob_sizes.items():
            blob_chunk_size = blob_size // shards_per_blob
            # All but the last range are exactly blob_chunk_size wide.
            for i in xrange(shards_per_blob - 1):
                chunks.append(
                    BlobstoreLineInputReader.from_json({
                        "blob_key":
                        blob_key,
                        "initial_position":
                        blob_chunk_size * i,
                        "end_position":
                        blob_chunk_size * (i + 1)
                    }))
            # The last range starts where the previous one ended. The start
            # must be derived from the per-blob shard count, not the total
            # shard count, or it lands past the preceding range whenever the
            # two differ.
            chunks.append(
                BlobstoreLineInputReader.from_json({
                    "blob_key":
                    blob_key,
                    "initial_position":
                    blob_chunk_size * (shards_per_blob - 1),
                    "end_position":
                    blob_size
                }))
        return chunks