def validate(cls, mapper_spec): """Validates mapper spec and all mapper parameters. Args: mapper_spec: The MapperSpec for this InputReader. Raises: BadReaderParamsError: required parameters are missing or invalid. """ if mapper_spec.input_reader_class() != cls: raise BadReaderParamsError("Mapper input reader class mismatch") params = mapper_spec.params if cls.BLOB_KEYS_PARAM not in params: raise BadReaderParamsError( "Must specify 'blob_key' for mapper input") blob_keys = params[cls.BLOB_KEYS_PARAM] if isinstance(blob_keys, basestring): # This is a mechanism to allow multiple blob keys (which do not contain # commas) in a single string. It may go away. blob_keys = blob_keys.split(",") if len(blob_keys) > cls._MAX_BLOB_KEYS_COUNT: raise BadReaderParamsError("Too many 'blob_keys' for mapper input") if not blob_keys: raise BadReaderParamsError( "No 'blob_keys' specified for mapper input") for blob_key in blob_keys: blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key)) if not blob_info: raise BadReaderParamsError( "Could not find blobinfo for key %s" % blob_key)
def split_input(cls, mapper_spec): """Returns a list of shard_count input_spec_shards for input_spec. Args: mapper_spec: The mapper specification to split from. Must contain 'blob_keys' parameter with one or more blob keys. Returns: A list of BlobstoreInputReaders corresponding to the specified shards. """ params = mapper_spec.params blob_keys = params[cls.BLOB_KEYS_PARAM] if isinstance(blob_keys, basestring): # This is a mechanism to allow multiple blob keys (which do not contain # commas) in a single string. It may go away. blob_keys = blob_keys.split(",") blob_sizes = {} for blob_key in blob_keys: blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key)) blob_sizes[blob_key] = blob_info.size shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count) shards_per_blob = shard_count // len(blob_keys) if shards_per_blob == 0: shards_per_blob = 1 chunks = [] for blob_key, blob_size in blob_sizes.items(): blob_chunk_size = blob_size // shards_per_blob for i in xrange(shards_per_blob - 1): chunks.append( BlobstoreLineInputReader.from_json({ cls.BLOB_KEY_PARAM: blob_key, cls.INITIAL_POSITION_PARAM: blob_chunk_size * i, cls.END_POSITION_PARAM: blob_chunk_size * (i + 1) })) chunks.append( BlobstoreLineInputReader.from_json({ cls.BLOB_KEY_PARAM: blob_key, cls.INITIAL_POSITION_PARAM: blob_chunk_size * (shards_per_blob - 1), cls.END_POSITION_PARAM: blob_size })) return chunks
def validate(cls, mapper_spec): """Validates mapper spec and all mapper parameters. Args: mapper_spec: The MapperSpec for this InputReader. Raises: BadReaderParamsError: required parameters are missing or invalid. """ if mapper_spec.input_reader_class() != cls: raise BadReaderParamsError("Mapper input reader class mismatch") params = mapper_spec.params if cls.BLOB_KEY_PARAM not in params: raise BadReaderParamsError( "Must specify 'blob_key' for mapper input") blob_key = params[cls.BLOB_KEY_PARAM] blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key)) if not blob_info: raise BadReaderParamsError("Could not find blobinfo for key %s" % blob_key)
def split_input(cls, mapper_spec): """Returns a list of shard_count input_spec_shards for input_spec. Args: mapper_spec: The mapper specification to split from. Must contain 'blob_keys' parameter with one or more blob keys. Returns: A list of BlobstoreInputReaders corresponding to the specified shards. Raises: BadReaderParamsError: required parameters are missing or invalid. """ if mapper_spec.input_reader_class() != cls: raise BadReaderParamsError("Mapper input reader class mismatch") params = mapper_spec.params if cls.BLOB_KEYS_PARAM not in params: raise BadReaderParamsError( "Must specify 'blob_keys' for mapper input") blob_keys = params[cls.BLOB_KEYS_PARAM] if isinstance(blob_keys, basestring): # This is a mechanism to allow multiple blob keys (which do not contain # commas) in a single string. It may go away. blob_keys = blob_keys.split(",") if len(blob_keys) > cls._MAX_BLOB_KEYS_COUNT: raise BadReaderParamsError("Too many 'blob_keys' for mapper input") if not blob_keys: raise BadReaderParamsError( "No 'blob_keys' specified for mapper input") blob_sizes = {} for blob_key in blob_keys: blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key)) blob_sizes[blob_key] = blob_info.size shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count) shards_per_blob = shard_count // len(blob_keys) if shards_per_blob == 0: shards_per_blob = 1 chunks = [] for blob_key, blob_size in blob_sizes.items(): blob_chunk_size = blob_size // shards_per_blob for i in xrange(shards_per_blob - 1): chunks.append( BlobstoreLineInputReader.from_json({ cls.BLOB_KEY_PARAM: blob_key, cls.INITIAL_POSITION_PARAM: blob_chunk_size * i, cls.END_POSITION_PARAM: blob_chunk_size * (i + 1) })) chunks.append( BlobstoreLineInputReader.from_json({ cls.BLOB_KEY_PARAM: blob_key, cls.INITIAL_POSITION_PARAM: blob_chunk_size * (shards_per_blob - 1), cls.END_POSITION_PARAM: blob_size })) return chunks
def split_input(cls, mapper_spec): """Returns a list of shard_count input_spec_shards for input_spec. Args: mapper_spec: The mapper specification to split from. Returns: A list of BlobstoreInputReaders corresponding to the specified shards. Raises: BadReaderParamsError if required parameters are missing or invalid. """ if mapper_spec.input_reader_class() != cls: raise BadReaderParamsError("Mapper input reader class mismatch") params = mapper_spec.params if "blob_keys" not in params: raise BadReaderParamsError( "Must specify 'blob_keys' for mapper input") blob_keys = params["blob_keys"] if isinstance(blob_keys, basestring): blob_keys = [blob_keys] if len(blob_keys) > cls._MAX_BLOB_KEYS_COUNT: raise BadReaderParamsError("Too many 'blob_keys' for mapper input") if not blob_keys: raise BadReaderParamsError( "No 'blob_keys' specified for mapper input") blob_sizes = {} for blob_key in blob_keys: blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key)) blob_sizes[blob_key] = blob_info.size shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count) shards_per_blob = shard_count // len(blob_keys) if shards_per_blob == 0: shards_per_blob = 1 chunks = [] for blob_key, blob_size in blob_sizes.items(): blob_chunk_size = blob_size // shards_per_blob for i in xrange(shards_per_blob - 1): chunks.append( BlobstoreLineInputReader.from_json({ "blob_key": blob_key, "initial_position": blob_chunk_size * i, "end_position": blob_chunk_size * (i + 1) })) chunks.append( BlobstoreLineInputReader.from_json({ "blob_key": blob_key, "initial_position": blob_chunk_size * (shard_count - 1), "end_position": blob_size })) return chunks