def validate(cls, mapper_spec):
    """Validates mapper specification.

    Checks that the spec names this writer class, that the requested
    output sharding mode is supported, and that the filesystem choice
    and its bucket parameter are consistent.

    Args:
      mapper_spec: an instance of model.MapperSpec to validate.

    Raises:
      errors.BadWriterParamsError: if any part of the specification is
        invalid.
    """
    if mapper_spec.output_writer_class() != cls:
        raise errors.BadWriterParamsError("Output writer class mismatch")

    sharding = cls._get_output_sharding(mapper_spec=mapper_spec)
    if sharding not in (cls.OUTPUT_SHARDING_NONE,
                        cls.OUTPUT_SHARDING_INPUT_SHARDS):
        raise errors.BadWriterParamsError(
            "Invalid output_sharding value: %s" % sharding)

    writer_params = _get_params(mapper_spec)
    fs = cls._get_filesystem(mapper_spec)
    if fs not in files.FILESYSTEMS:
        raise errors.BadWriterParamsError(
            "Filesystem '%s' is not supported. Should be one of %s" %
            (fs, files.FILESYSTEMS))

    if fs == files.GS_FILESYSTEM:
        # The bucket name is mandatory when writing to Google Storage.
        if cls.GS_BUCKET_NAME_PARAM not in writer_params:
            raise errors.BadWriterParamsError(
                "%s is required for Google store filesystem" %
                cls.GS_BUCKET_NAME_PARAM)
    elif writer_params.get(cls.GS_BUCKET_NAME_PARAM) is not None:
        # A bucket name is meaningless for any other filesystem.
        raise errors.BadWriterParamsError(
            "%s can only be provided for Google store filesystem" %
            cls.GS_BUCKET_NAME_PARAM)
def validate(cls, mapper_spec):
    """Validates mapper specification.

    Args:
      mapper_spec: an instance of model.MapperSpec to validate.

    Raises:
      errors.BadWriterParamsError: on writer class mismatch or an
        unsupported output_sharding value.
    """
    if mapper_spec.output_writer_class() != cls:
        raise errors.BadWriterParamsError("Output writer class mismatch")

    # NOTE(review): this calls the module-level _get_output_sharding,
    # while a sibling variant uses cls._get_output_sharding — confirm
    # which helper is intended here.
    sharding_mode = _get_output_sharding(mapper_spec=mapper_spec)
    if sharding_mode not in (cls.OUTPUT_SHARDING_NONE,
                             cls.OUTPUT_SHARDING_INPUT_SHARDS):
        raise errors.BadWriterParamsError(
            "Invalid output_sharding value: %s" % sharding_mode)
def _generate_filename(cls, writer_spec, name, job_id, num, retry): """Generates a filename for a shard / retry count. Args: writer_spec: specification dictionary for the output writer. name: name of the job. job_id: the ID number assigned to the job. num: shard number. retry: the retry number. Returns: a string containing the filename. Raises: BadWriterParamsError if the template contains any errors such as invalid syntax or contains unknown substitution placeholders. """ naming_format = writer_spec.get(cls.NAMING_FORMAT_PARAM, cls.DEFAULT_NAMING_FORMAT) template = string.Template(naming_format) try: return template.substitute(name=name, id=job_id, num=num, retry=retry) except ValueError, error: raise errors.BadWriterParamsError("Naming template is bad, %s" % (error))
def validate(cls, mapper_spec):
    """Validates mapper specification.

    Args:
      mapper_spec: an instance of model.MapperSpec to validate.

    Raises:
      BadWriterParamsError: when Output writer class mismatch, or when
        the required bucket name parameter is absent.
    """
    if mapper_spec.output_writer_class() != cls:
        raise errors.BadWriterParamsError("Output writer class mismatch")

    writer_params = output_writers._get_params(mapper_spec)
    if cls.BUCKET_NAME_PARAM not in writer_params:
        raise errors.BadWriterParamsError(
            "%s is required for the _HashingGCSOutputWriter" %
            cls.BUCKET_NAME_PARAM)
def _generate_filename(cls, writer_spec, name, job_id, num, attempt=None, seg_index=None): """Generates a filename for a particular output. Args: writer_spec: specification dictionary for the output writer. name: name of the job. job_id: the ID number assigned to the job. num: shard number. attempt: the shard attempt number. seg_index: index of the seg. None means the final output. Returns: a string containing the filename. Raises: BadWriterParamsError if the template contains any errors such as invalid syntax or contains unknown substitution placeholders. """ naming_format = cls._TMP_FILE_NAMING_FORMAT if seg_index is None: naming_format = writer_spec.get(cls.NAMING_FORMAT_PARAM, cls.DEFAULT_NAMING_FORMAT) template = string.Template(naming_format) try: if seg_index is None: return template.substitute(name=name, id=job_id, num=num) else: return template.substitute(name=name, id=job_id, num=num, attempt=attempt, seg=seg_index) except ValueError, error: raise errors.BadWriterParamsError("Naming template is bad, %s" % (error))
def validate(cls, mapper_spec):
    """Validates mapper specification.

    Args:
      mapper_spec: an instance of model.MapperSpec to validate.

    Raises:
      errors.BadWriterParamsError: if the spec names a different writer
        class than this one.
    """
    if cls != mapper_spec.output_writer_class():
        raise errors.BadWriterParamsError("Output writer class mismatch")
def _create_file(cls, filesystem, filename, mime_type, **kwargs):
    """Creates a file and returns its created filename."""
    if filesystem == files.BLOBSTORE_FILESYSTEM:
        return files.blobstore.create(mime_type, filename)
    if filesystem == files.GS_FILESYSTEM:
        # Google Storage paths must carry the "/gs/" prefix.
        return files.gs.create("/gs/%s" % filename, mime_type, **kwargs)
    raise errors.BadWriterParamsError(
        "Filesystem '%s' is not supported" % filesystem)
def validate(cls, mapper_spec):
    """Validates mapper specification.

    Args:
      mapper_spec: an instance of model.MapperSpec to validate.

    Raises:
      errors.BadWriterParamsError: if output_sharding is explicitly set;
        this writer manages sharding itself and forbids the parameter.
    """
    writer_params = _get_params(mapper_spec)
    if cls.OUTPUT_SHARDING_PARAM in writer_params:
        raise errors.BadWriterParamsError(
            "output_sharding should not be specified for %s" % cls.__name__)
    super(FileRecordsOutputWriter, cls).validate(mapper_spec)
def _get_finalized_filename(cls, fs, create_filename, request_filename): """Returns the finalized filename for the created filename.""" if fs == "blobstore": return files.blobstore.get_file_name( files.blobstore.get_blob_key(create_filename)) elif fs == "gs": return "/gs/" + request_filename else: raise errors.BadWriterParamsError( "Filesystem '%s' is not supported" % fs)
def validate(cls, mapper_spec):
    """Validate mapper specification.

    Args:
      mapper_spec: an instance of model.MapperSpec.

    Raises:
      BadWriterParamsError: if the specification is invalid for any
        reason such as missing the bucket name or providing an invalid
        bucket name.
    """
    writer_spec = _get_params(mapper_spec, allow_old=False)

    # Bucket name is required and must pass cloudstorage's own checks.
    if cls.BUCKET_NAME_PARAM not in writer_spec:
        raise errors.BadWriterParamsError(
            "%s is required for Google Cloud Storage" %
            cls.BUCKET_NAME_PARAM)
    try:
        cloudstorage.validate_bucket_name(
            writer_spec[cls.BUCKET_NAME_PARAM])
    # "except E as e" replaces the Python-2-only "except E, e" form;
    # identical behavior, also parses under Python 3.
    except ValueError as error:
        raise errors.BadWriterParamsError("Bad bucket name, %s" % (error))
def _get_params(mapper_spec, allowed_keys=None, allow_old=True): """Obtain output writer parameters. Utility function for output writer implementation. Fetches parameters from mapreduce specification giving appropriate usage warnings. Args: mapper_spec: The MapperSpec for the job allowed_keys: set of all allowed keys in parameters as strings. If it is not None, then parameters are expected to be in a separate "output_writer" subdictionary of mapper_spec parameters. allow_old: Allow parameters to exist outside of the output_writer subdictionary for compatability. Returns: mapper parameters as dict Raises: BadWriterParamsError: if parameters are invalid/missing or not allowed. """ if "output_writer" not in mapper_spec.params: message = ("Output writer's parameters should be specified in " "output_writer subdictionary.") if not allow_old or allowed_keys: raise errors.BadWriterParamsError(message) params = mapper_spec.params params = dict((str(n), v) for n, v in params.iteritems()) else: if not isinstance(mapper_spec.params.get("output_writer"), dict): raise errors.BadWriterParamsError( "Output writer parameters should be a dictionary") params = mapper_spec.params.get("output_writer") params = dict((str(n), v) for n, v in params.iteritems()) if allowed_keys: params_diff = set(params.keys()) - allowed_keys if params_diff: raise errors.BadWriterParamsError( "Invalid output_writer parameters: %s" % ",".join(params_diff)) return params
def validate(cls, job_config):
    """Validates relevant parameters.

    This method can validate fields which it deems relevant.

    Args:
      job_config: an instance of map_job.JobConfig.

    Raises:
      errors.BadWriterParamsError: required parameters are missing or
        invalid.
    """
    configured_cls = job_config.output_writer_cls
    if configured_cls != cls:
        raise errors.BadWriterParamsError(
            "Expect output writer class %r, got %r." %
            (cls, configured_cls))
class _GoogleCloudStorageOutputWriterBase(_GoogleCloudStorageBase):
    """Base class for GCS writers directly interacting with GCS.

    Base class for both _GoogleCloudStorageOutputWriter and
    GoogleCloudStorageConsistentOutputWriter.

    This class is expected to be subclassed with a writer that applies
    formatting to user-level records.

    Subclasses need to define to_json, from_json, create, finalize and
    _get_write_buffer methods.

    See _GoogleCloudStorageBase for config options.
    """

    # Default template for the final output file.
    _DEFAULT_NAMING_FORMAT = "$name/$id/output-$num"

    # All temporary seg files live under this tmp directory.
    _MR_TMP = "gae_mr_tmp"
    _TMP_FILE_NAMING_FORMAT = (
        _MR_TMP + "/$name/$id/attempt-$attempt/output-$num/seg-$seg")

    @classmethod
    def _generate_filename(cls, writer_spec, name, job_id, num,
                           attempt=None, seg_index=None):
        """Generates a filename for a particular output.

        Args:
          writer_spec: specification dictionary for the output writer.
          name: name of the job.
          job_id: the ID number assigned to the job.
          num: shard number.
          attempt: the shard attempt number.
          seg_index: index of the seg. None means the final output.

        Returns:
          a string containing the filename.

        Raises:
          BadWriterParamsError: if the template contains any errors such
            as invalid syntax or contains unknown substitution
            placeholders.
        """
        # Temporary segs use the fixed tmp template; only the final
        # output honors the user-supplied naming format.
        naming_format = cls._TMP_FILE_NAMING_FORMAT
        if seg_index is None:
            naming_format = writer_spec.get(cls.NAMING_FORMAT_PARAM,
                                            cls._DEFAULT_NAMING_FORMAT)

        template = string.Template(naming_format)
        try:
            if seg_index is None:
                return template.substitute(name=name, id=job_id, num=num)
            else:
                return template.substitute(name=name, id=job_id, num=num,
                                           attempt=attempt,
                                           seg=seg_index)
        # "except E as e" replaces the Python-2-only "except E, e" form;
        # identical behavior, also parses under Python 3.
        except ValueError as error:
            raise errors.BadWriterParamsError(
                "Naming template is bad, %s" % (error))
        except KeyError as error:
            raise errors.BadWriterParamsError(
                "Naming template '%s' has extra "
                "mappings, %s" % (naming_format, error))
""" writer_spec = _get_params(mapper_spec, allow_old=False) if cls.BUCKET_NAME_PARAM not in writer_spec: raise errors.BadWriterParamsError( "%s is required for Google Cloud Storage" % cls.BUCKET_NAME_PARAM) try: cloudstorage.validate_bucket_name( writer_spec[cls.BUCKET_NAME_PARAM]) except ValueError, error: raise errors.BadWriterParamsError("Bad bucket name, %s" % (error)) if writer_spec.get(cls._NO_DUPLICATE, False) not in (True, False): raise errors.BadWriterParamsError("No duplicate must a boolean.") cls._generate_filename(writer_spec, "name", "id", 0) cls._generate_filename(writer_spec, "name", "id", 0, 1, 0) @classmethod def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None): """Inherit docs.""" writer_spec = _get_params(mr_spec.mapper, allow_old=False) seg_index = None if writer_spec.get(cls._NO_DUPLICATE, False): seg_index = 0 key = cls._generate_filename(writer_spec, mr_spec.name,
class _GoogleCloudStorageOutputWriter(OutputWriter):
    """Output writer to Google Cloud Storage using the cloudstorage library.

    This class is expected to be subclassed with a writer that applies
    formatting to user-level records.

    Required configuration in the mapper_spec.output_writer dictionary.
      BUCKET_NAME_PARAM: name of the bucket to use (with no extra
        delimiters or suffixes such as directories. Directories/prefixes
        can be specified as part of the NAMING_FORMAT_PARAM).

    Optional configuration in the mapper_spec.output_writer dictionary:
      ACL_PARAM: acl to apply to new files, else bucket default used.
      NAMING_FORMAT_PARAM: prefix format string for the new files (there
        is no required starting slash, expected formats would look like
        "directory/basename...", any starting slash will be treated as
        part of the file name) that should use the following
        substitutions:
          $name - the name of the job
          $id - the id assigned to the job
          $num - the shard number
          $retry - the retry count for this shard
        If there is more than one shard $num must be used. An arbitrary
        suffix may be applied by the writer.
      CONTENT_TYPE_PARAM: mime type to apply on the files. If not
        provided, Google Cloud Storage will apply its default.
    """

    BUCKET_NAME_PARAM = "bucket_name"
    ACL_PARAM = "acl"
    NAMING_FORMAT_PARAM = "naming_format"
    CONTENT_TYPE_PARAM = "content_type"
    DEFAULT_NAMING_FORMAT = "$name-$id-output-$num-retry-$retry"

    _ACCOUNT_ID_PARAM = "account_id"
    _JSON_PICKLE = "pickle"

    def __init__(self, streaming_buffer, writer_spec=None):
        """Initialize a GoogleCloudStorageOutputWriter instance.

        Args:
          streaming_buffer: an instance of writable buffer from
            cloudstorage_api.
          writer_spec: the specification for the writer, useful for
            subclasses.
        """
        self._streaming_buffer = streaming_buffer

    @classmethod
    def _generate_filename(cls, writer_spec, name, job_id, num, retry):
        """Generates a filename for a shard / retry count.

        Args:
          writer_spec: specification dictionary for the output writer.
          name: name of the job.
          job_id: the ID number assigned to the job.
          num: shard number.
          retry: the retry number.

        Returns:
          a string containing the filename.

        Raises:
          BadWriterParamsError: if the template contains any errors such
            as invalid syntax or contains unknown substitution
            placeholders.
        """
        naming_format = writer_spec.get(cls.NAMING_FORMAT_PARAM,
                                        cls.DEFAULT_NAMING_FORMAT)
        template = string.Template(naming_format)
        try:
            return template.substitute(name=name, id=job_id, num=num,
                                       retry=retry)
        # "except E as e" replaces the Python-2-only "except E, e" form;
        # identical behavior, also parses under Python 3.
        except ValueError as error:
            raise errors.BadWriterParamsError(
                "Naming template is bad, %s" % (error))
        except KeyError as error:
            raise errors.BadWriterParamsError(
                "Naming template '%s' has extra "
                "mappings, %s" % (naming_format, error))
def validate(cls, mapper_spec):
    """Inherit docs.

    Additionally checks that the no-duplicate flag, if present, is a
    real boolean before delegating to the base class validation.
    """
    writer_spec = cls.get_params(mapper_spec, allow_old=False)
    if writer_spec.get(cls._NO_DUPLICATE, False) not in (True, False):
        # Fixed error-message grammar ("must a" -> "must be a").
        raise errors.BadWriterParamsError("No duplicate must be a boolean.")
    super(_GoogleCloudStorageOutputWriter, cls).validate(mapper_spec)
class _GoogleCloudStorageOutputWriter(OutputWriter):
    """Output writer to Google Cloud Storage using the cloudstorage library.

    This class is expected to be subclassed with a writer that applies
    formatting to user-level records.

    Required configuration in the mapper_spec.output_writer dictionary.
      BUCKET_NAME_PARAM: name of the bucket to use (with no extra
        delimiters or suffixes such as directories. Directories/prefixes
        can be specified as part of the NAMING_FORMAT_PARAM).

    Optional configuration in the mapper_spec.output_writer dictionary:
      ACL_PARAM: acl to apply to new files, else bucket default used.
      NAMING_FORMAT_PARAM: prefix format string for the new files (there
        is no required starting slash, expected formats would look like
        "directory/basename...", any starting slash will be treated as
        part of the file name) that should use the following
        substitutions:
          $name - the name of the job
          $id - the id assigned to the job
          $num - the shard number
        If there is more than one shard $num must be used. An arbitrary
        suffix may be applied by the writer.
      CONTENT_TYPE_PARAM: mime type to apply on the files. If not
        provided, Google Cloud Storage will apply its default.
      _NO_DUPLICATE: if True, slice recovery logic will be used to ensure
        output files has no duplicates. Every shard should have only one
        final output in user specified location. But it may produce many
        smaller files (named "seg") due to slice recovery. These segs
        live in a tmp directory and should be combined and renamed to the
        final location. In current impl, they are not combined.
    """

    BUCKET_NAME_PARAM = "bucket_name"
    ACL_PARAM = "acl"
    NAMING_FORMAT_PARAM = "naming_format"
    CONTENT_TYPE_PARAM = "content_type"
    _NO_DUPLICATE = "no_duplicate"

    # Default template for the final output file.
    DEFAULT_NAMING_FORMAT = "$name/$id/output-$num"

    # All temporary seg files live under this tmp directory.
    _MR_TMP = "gae_mr_tmp"
    _TMP_FILE_NAMING_FORMAT = (
        _MR_TMP + "/$name/$id/attempt-$attempt/output-$num/seg-$seg")

    _ACCOUNT_ID_PARAM = "account_id"

    # Keys used in serialized writer state / file metadata.
    _SEG_PREFIX = "seg_prefix"
    _LAST_SEG_INDEX = "last_seg_index"
    _JSON_GCS_BUFFER = "buffer"
    _JSON_SEG_INDEX = "seg_index"
    _JSON_NO_DUP = "no_dup"
    _VALID_LENGTH = "x-goog-meta-gae-mr-valid-length"

    def __init__(self, streaming_buffer, writer_spec=None):
        """Initialize a GoogleCloudStorageOutputWriter instance.

        Args:
          streaming_buffer: an instance of writable buffer from
            cloudstorage_api.
          writer_spec: the specification for the writer.
        """
        self._streaming_buffer = streaming_buffer
        self._no_dup = False
        if writer_spec:
            self._no_dup = writer_spec.get(self._NO_DUPLICATE, False)
        if self._no_dup:
            # Seg filenames end in "...seg-<index>" (see
            # _TMP_FILE_NAMING_FORMAT); recover the index from the
            # buffer's name.
            self._seg_index = int(streaming_buffer.name.rsplit("-", 1)[1])
            self._seg_valid_length = 0

    @classmethod
    def _generate_filename(cls, writer_spec, name, job_id, num,
                           attempt=None, seg_index=None):
        """Generates a filename for a particular output.

        Args:
          writer_spec: specification dictionary for the output writer.
          name: name of the job.
          job_id: the ID number assigned to the job.
          num: shard number.
          attempt: the shard attempt number.
          seg_index: index of the seg. None means the final output.

        Returns:
          a string containing the filename.

        Raises:
          BadWriterParamsError: if the template contains any errors such
            as invalid syntax or contains unknown substitution
            placeholders.
        """
        # Temporary segs use the fixed tmp template; only the final
        # output honors the user-supplied naming format.
        naming_format = cls._TMP_FILE_NAMING_FORMAT
        if seg_index is None:
            naming_format = writer_spec.get(cls.NAMING_FORMAT_PARAM,
                                            cls.DEFAULT_NAMING_FORMAT)

        template = string.Template(naming_format)
        try:
            if seg_index is None:
                return template.substitute(name=name, id=job_id, num=num)
            else:
                return template.substitute(name=name, id=job_id, num=num,
                                           attempt=attempt,
                                           seg=seg_index)
        # "except E as e" replaces the Python-2-only "except E, e" form;
        # identical behavior, also parses under Python 3.
        except ValueError as error:
            raise errors.BadWriterParamsError(
                "Naming template is bad, %s" % (error))
        except KeyError as error:
            raise errors.BadWriterParamsError(
                "Naming template '%s' has extra "
                "mappings, %s" % (naming_format, error))