Exemplo n.º 1
0
    def validate(cls, mapper_spec):
        """Checks that a mapper specification can be used with this writer.

        The checks run in this order: the spec must name this class as its
        output writer, the output sharding mode must be one of the two
        supported values, the filesystem must be listed in files.FILESYSTEMS,
        and the bucket name parameter must be present for the Google store
        filesystem and absent (or None) for every other filesystem.

        Args:
          mapper_spec: an instance of model.MapperSpec to validate.

        Raises:
          errors.BadWriterParamsError: if any of the checks above fails.
        """
        writer_class = mapper_spec.output_writer_class()
        if writer_class != cls:
            raise errors.BadWriterParamsError("Output writer class mismatch")

        sharding = cls._get_output_sharding(mapper_spec=mapper_spec)
        sharding_unsupported = (
            sharding != cls.OUTPUT_SHARDING_NONE
            and sharding != cls.OUTPUT_SHARDING_INPUT_SHARDS)
        if sharding_unsupported:
            raise errors.BadWriterParamsError(
                "Invalid output_sharding value: %s" % sharding)

        writer_params = _get_params(mapper_spec)
        fs_name = cls._get_filesystem(mapper_spec)
        if fs_name not in files.FILESYSTEMS:
            raise errors.BadWriterParamsError(
                "Filesystem '%s' is not supported. Should be one of %s" %
                (fs_name, files.FILESYSTEMS))

        # The bucket name is mandatory for Google store and forbidden elsewhere.
        bucket_key = cls.GS_BUCKET_NAME_PARAM
        if fs_name == files.GS_FILESYSTEM:
            if bucket_key not in writer_params:
                raise errors.BadWriterParamsError(
                    "%s is required for Google store filesystem" % bucket_key)
        elif writer_params.get(bucket_key) is not None:
            raise errors.BadWriterParamsError(
                "%s can only be provided for Google store filesystem" %
                bucket_key)
Exemplo n.º 2
0
    def _generate_filename(cls, writer_spec, name, job_id, num, retry):
        """Generates a filename for a shard / retry count.

    Args:
      writer_spec: specification dictionary for the output writer.
      name: name of the job.
      job_id: the ID number assigned to the job.
      num: shard number.
      retry: the retry number.

    Returns:
      a string containing the filename.

    Raises:
      BadWriterParamsError if the template contains any errors such as invalid
        syntax or contains unknown substitution placeholders.
    """
        naming_format = writer_spec.get(cls.NAMING_FORMAT_PARAM,
                                        cls.DEFAULT_NAMING_FORMAT)
        template = string.Template(naming_format)
        try:
            # Check that template doesn't use undefined mappings and is formatted well
            return template.substitute(name=name,
                                       id=job_id,
                                       num=num,
                                       retry=retry)
        except ValueError, error:
            raise errors.BadWriterParamsError("Naming template is bad, %s" %
                                              (error))
Exemplo n.º 3
0
    def validate(cls, mapper_spec):
        """Checks that the mapper specification targets this writer class.

        Args:
          mapper_spec: an instance of model.MapperSpec to validate.

        Raises:
          errors.BadWriterParamsError: if the output writer class reported by
            mapper_spec is not this class.
        """
        configured_writer = mapper_spec.output_writer_class()
        if configured_writer != cls:
            raise errors.BadWriterParamsError("Output writer class mismatch")
Exemplo n.º 4
0
 def _create_file(cls, filesystem, filename, mime_type, **kwargs):
     """Creates a file on the given filesystem and returns the create result.

     Args:
       filesystem: filesystem identifier; compared against
         files.BLOBSTORE_FILESYSTEM and files.GS_FILESYSTEM.
       filename: name of the file to create; for the GS filesystem it is
         prefixed with "/gs/".
       mime_type: mime type passed to the underlying create call.
       **kwargs: extra keyword arguments, forwarded only to files.gs.create.

     Returns:
       Whatever the underlying create call returns.

     Raises:
       errors.BadWriterParamsError: if filesystem is neither of the two
         supported values.
     """
     if filesystem == files.BLOBSTORE_FILESYSTEM:
         return files.blobstore.create(mime_type, filename)
     if filesystem == files.GS_FILESYSTEM:
         gs_path = "/gs/%s" % filename
         return files.gs.create(gs_path, mime_type, **kwargs)
     raise errors.BadWriterParamsError(
         "Filesystem '%s' is not supported" % filesystem)
Exemplo n.º 5
0
 def _get_finalized_filename(cls, fs, create_filename, request_filename):
     """Returns the finalized filename for the created filename."""
     if fs == "blobstore":
         return files.blobstore.get_file_name(
             files.blobstore.get_blob_key(create_filename))
     elif fs == "gs":
         return "/gs/" + request_filename
     else:
         raise errors.BadWriterParamsError(
             "Filesystem '%s' is not supported" % fs)
Exemplo n.º 6
0
    def validate(cls, mapper_spec):
        """Rejects an explicit output sharding, then runs the parent checks.

        Args:
          mapper_spec: an instance of model.MapperSpec to validate.

        Raises:
          errors.BadWriterParamsError: if the writer parameters contain
            cls.OUTPUT_SHARDING_PARAM.
        """
        sharding_key = cls.OUTPUT_SHARDING_PARAM
        writer_params = _get_params(mapper_spec)
        if sharding_key in writer_params:
            raise errors.BadWriterParamsError(
                "output_sharding should not be specified for %s" %
                cls.__name__)
        # Remaining validation is delegated to the parent class.
        super(FileRecordsOutputWriter, cls).validate(mapper_spec)
Exemplo n.º 7
0
def _get_params(mapper_spec, allowed_keys=None, allow_old=True):
    """Obtain output writer parameters.

  Utility function for output writer implementation. Fetches parameters
  from mapreduce specification giving appropriate usage warnings.

  Args:
    mapper_spec: The MapperSpec for the job
    allowed_keys: set of all allowed keys in parameters as strings. If it is not
      None, then parameters are expected to be in a separate "output_writer"
      subdictionary of mapper_spec parameters.
    allow_old: Allow parameters to exist outside of the output_writer
      subdictionary for compatability.

  Returns:
    mapper parameters as dict

  Raises:
    BadWriterParamsError: if parameters are invalid/missing or not allowed.
  """
    if "output_writer" not in mapper_spec.params:
        message = ("Output writer's parameters should be specified in "
                   "output_writer subdictionary.")
        if not allow_old or allowed_keys:
            raise errors.BadWriterParamsError(message)
        params = mapper_spec.params
        params = dict((str(n), v) for n, v in params.iteritems())
    else:
        if not isinstance(mapper_spec.params.get("output_writer"), dict):
            raise errors.BadWriterParamsError(
                "Output writer parameters should be a dictionary")
        params = mapper_spec.params.get("output_writer")
        params = dict((str(n), v) for n, v in params.iteritems())
        if allowed_keys:
            params_diff = set(params.keys()) - allowed_keys
            if params_diff:
                raise errors.BadWriterParamsError(
                    "Invalid output_writer parameters: %s" %
                    ",".join(params_diff))
    return params
Exemplo n.º 8
0
    def validate(cls, mapper_spec):
        """Validate mapper specification.

        Reads the writer parameters with the legacy top-level layout disabled
        (allow_old=False), requires cls.BUCKET_NAME_PARAM to be present and
        passes its value to cloudstorage.validate_bucket_name.

        Args:
          mapper_spec: an instance of model.MapperSpec.

        Raises:
          BadWriterParamsError: if the specification is invalid for any reason
            such as missing the bucket name or providing an invalid bucket
            name.
        """
        writer_spec = _get_params(mapper_spec, allow_old=False)

        # Bucket Name is required
        if cls.BUCKET_NAME_PARAM not in writer_spec:
            raise errors.BadWriterParamsError(
                "%s is required for Google Cloud Storage" %
                cls.BUCKET_NAME_PARAM)
        try:
            cloudstorage.validate_bucket_name(
                writer_spec[cls.BUCKET_NAME_PARAM])
        except ValueError as error:
            # "except ... as" is valid on Python 2.6+ and Python 3; the old
            # comma form is a syntax error on Python 3.
            raise errors.BadWriterParamsError("Bad bucket name, %s" % error)
Exemplo n.º 9
0
class _GoogleCloudStorageOutputWriter(OutputWriter):
    """Output writer to Google Cloud Storage using the cloudstorage library.

    This class is expected to be subclassed with a writer that applies
    formatting to user-level records.

    Required configuration in the mapper_spec.output_writer dictionary:
      BUCKET_NAME_PARAM: name of the bucket to use (with no extra delimiters
        or suffixes such as directories. Directories/prefixes can be specified
        as part of the NAMING_FORMAT_PARAM).

    Optional configuration in the mapper_spec.output_writer dictionary:
      ACL_PARAM: acl to apply to new files, else bucket default used.
      NAMING_FORMAT_PARAM: prefix format string for the new files (there is no
        required starting slash, expected formats would look like
        "directory/basename...", any starting slash will be treated as part of
        the file name) that should use the following substitutions:
          $name - the name of the job
          $id - the id assigned to the job
          $num - the shard number
          $retry - the retry count for this shard
        If there is more than one shard $num must be used. An arbitrary suffix
        may be applied by the writer.
      CONTENT_TYPE_PARAM: mime type to apply on the files. If not provided,
        Google Cloud Storage will apply its default.
    """

    # Supported parameters
    BUCKET_NAME_PARAM = "bucket_name"
    ACL_PARAM = "acl"
    NAMING_FORMAT_PARAM = "naming_format"
    CONTENT_TYPE_PARAM = "content_type"

    # Default settings
    DEFAULT_NAMING_FORMAT = "$name-$id-output-$num-retry-$retry"

    # Internal parameters
    _ACCOUNT_ID_PARAM = "account_id"
    _JSON_FILENAME = "filename"
    _JSON_GCS_BUFFER = "buffer"

    # writer_spec only used by subclasses, pylint: disable=unused-argument
    def __init__(self, streaming_buffer, filename, writer_spec=None):
        """Initialize a GoogleCloudStorageOutputWriter instance.

        Args:
          streaming_buffer: an instance of writable buffer from
            cloudstorage_api.
          filename: the GCS client filename this writer is writing to.
          writer_spec: the specification for the writer, useful for
            subclasses; it is not stored by this base implementation.
        """
        self._streaming_buffer = streaming_buffer
        self._filename = filename

    @classmethod
    def _generate_filename(cls, writer_spec, name, job_id, num, retry):
        """Generates a filename for a shard / retry count.

        The naming template is read from writer_spec under
        NAMING_FORMAT_PARAM, falling back to DEFAULT_NAMING_FORMAT, and is
        expanded with string.Template.

        Args:
          writer_spec: specification dictionary for the output writer.
          name: name of the job.
          job_id: the ID number assigned to the job.
          num: shard number.
          retry: the retry number.

        Returns:
          a string containing the filename.

        Raises:
          BadWriterParamsError: if the template contains any errors such as
            invalid syntax or unknown substitution placeholders.
        """
        naming_format = writer_spec.get(cls.NAMING_FORMAT_PARAM,
                                        cls.DEFAULT_NAMING_FORMAT)
        template = string.Template(naming_format)
        try:
            # substitute() is strict: it raises ValueError for a malformed
            # template and KeyError for a placeholder that is not supplied.
            return template.substitute(name=name,
                                       id=job_id,
                                       num=num,
                                       retry=retry)
        except ValueError as error:
            # "except ... as" is valid on Python 2.6+ and Python 3; the old
            # comma form is a syntax error on Python 3.
            raise errors.BadWriterParamsError("Naming template is bad, %s" %
                                              error)
        except KeyError as error:
            raise errors.BadWriterParamsError("Naming template '%s' has extra "
                                              "mappings, %s" %
                                              (naming_format, error))