Example #1
    def test_uri_parsing(self):
        self.assertEqual(is_uri('notauri!'), False)
        self.assertEqual(is_uri('they://did/the/monster/mash'), True)
        self.assertEqual(is_s3_uri('s3://a/uri'), True)
        self.assertEqual(is_s3_uri('s3n://a/uri'), True)
        self.assertEqual(is_s3_uri('hdfs://a/uri'), False)
        self.assertEqual(parse_s3_uri('s3://bucket/loc'), ('bucket', 'loc'))
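
All of these examples revolve around mrjob's URI helpers, which none of the snippets define. A minimal sketch consistent with the assertions in Example #1 (an assumption for illustration, not mrjob's actual implementation) might be:

    from urllib.parse import urlparse

    def is_uri(path):
        # anything of the form scheme://... counts as a URI here
        return '://' in path

    def is_s3_uri(uri):
        # accept s3:// and s3n://, but not hdfs:// etc.
        return urlparse(uri).scheme in ('s3', 's3n')

    def parse_s3_uri(uri):
        # 's3://bucket/loc' -> ('bucket', 'loc')
        if not is_s3_uri(uri):
            raise ValueError('not an S3 URI: %r' % uri)
        parts = urlparse(uri)
        return parts.netloc, parts.path.lstrip('/')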
Example #2
    def _s3_ls(self, uri):
        """Helper for ls(); doesn't bother with globbing or directories"""
        bucket_name, key_name = parse_s3_uri(uri)

        bucket = self.get_bucket(bucket_name)
        for key in bucket.list(key_name):
            yield s3_key_to_uri(key)
Example #3
    def _get_s3_key(self, uri):
        """Get the boto3 s3.Object matching the given S3 uri, or
        return None if that key doesn't exist.

        uri is an S3 URI: ``s3://foo/bar``
        """
        bucket_name, key_name = parse_s3_uri(uri)
        return self.get_bucket(bucket_name).Object(key_name)
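
Note that boto3's Object() is lazy: the call above doesn't talk to S3 at all, so an object is returned even when the key doesn't exist. A hypothetical caller that needs an existence check (fs stands for whatever object exposes the method above) might do:

    import botocore.exceptions

    key = fs._get_s3_key('s3://walrus/data/foo')
    try:
        key.load()  # HEAD request; raises ClientError if the key is missing
        exists = True
    except botocore.exceptions.ClientError as ex:
        if ex.response['Error']['Code'] == '404':
            exists = False
        else:
            raise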
Example #4
File: s3.py Project: LXiong/mrjob
    def _s3_ls(self, uri):
        """Helper for ls(); doesn't bother with globbing or directories"""
        s3_conn = self.make_s3_conn()
        bucket_name, key_name = parse_s3_uri(uri)

        bucket = s3_conn.get_bucket(bucket_name, validate=VALIDATE_BUCKET)
        for key in bucket.list(key_name):
            yield s3_key_to_uri(key)
Example #5
    def make_s3_key(self, uri):
        """Create the given S3 key, and return the corresponding
        boto Key object.

        uri is an S3 URI: ``s3://foo/bar``
        """
        bucket_name, key_name = parse_s3_uri(uri)

        return self.get_bucket(bucket_name).new_key(key_name)
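
A hypothetical use of make_s3_key() with boto 2's Key API (fs again stands for the object exposing the method; the bucket and key names are made up):

    key = fs.make_s3_key('s3://walrus/tmp/hello.txt')
    key.set_contents_from_string('hello, world\n')  # upload the contents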
Example #6
    def get_s3_keys(self, uri):
        """Get a stream of boto Key objects for each key inside
        the given dir on S3.

        uri is an S3 URI: ``s3://foo/bar``
        """
        bucket_name, key_prefix = parse_s3_uri(uri)
        bucket = self.get_bucket(bucket_name)
        for key in bucket.list(key_prefix):
            yield key
Example #7
    def ls(self, path_glob):
        """Recursively list files on S3.

        This doesn't list "directories" unless there's actually a
        corresponding key ending with a '/' (which is weird and confusing;
        don't make S3 keys ending in '/')

        To list a directory, path_glob must end with a trailing
        slash (foo and foo/ are different on S3)
        """

        log.debug("ls %s", path_glob)

        # clean up the base uri to ensure we pass boto an s3:// URI
        # just in case we get passed s3n://
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        # Check if we're only going to get results by using a / on the end
        uris = self._s3_ls(base_uri)
        try:
            first = next(uris)
            uris = chain([first], uris)
        except (boto.exception.S3ResponseError, StopIteration):
            try:
                uris = self._s3_ls(base_uri.rstrip("/") + "/")
            except (boto.exception.S3ResponseError, StopIteration):
                return

        prev_uri = None
        for uri in uris:
            uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri))

            # enforce globbing
            if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
                continue

            # If there are keys /data and /data/my_file then we consider there
            # to be a file /data, overriding there being a directory called
            # /data containing a file my_file. We discard /data/my_file.
            if prev_uri is not None and uri.startswith(prev_uri):
                continue

            yield uri
            prev_uri = uri.rstrip("/") + "/"
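
GLOB_RE is not shown in any of these examples. The code above relies on just two properties: it matches only when path_glob actually contains a wildcard, and group(1) is the literal prefix before the first wildcard. A sketch satisfying both (an assumption, not mrjob's actual pattern):

    import re

    # match only if the path contains a glob character;
    # group(1) is everything before the first wildcard
    GLOB_RE = re.compile(r'^([^\[\*\?]*)[\[\*\?]')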
Example #8
    def test_cleanup(self):
        runner = EMRJobRunner(conf_paths=[], s3_sync_wait_time=0.01)

        # add some mock data and change last_modified
        remote_input_path = 's3://walrus/data/'
        self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n',
                                          'data/bar': 'bar\n',
                                          'data/qux': 'qux\n'}})

        s3_conn = runner.make_s3_conn()
        bucket_name, key_name = parse_s3_uri(remote_input_path)
        bucket = s3_conn.get_bucket(bucket_name)

        key_foo = bucket.get_key('data/foo')
        key_bar = bucket.get_key('data/bar')
        key_qux = bucket.get_key('data/qux')
        key_bar.last_modified = datetime.now() - timedelta(days=45)
        key_qux.last_modified = datetime.now() - timedelta(hours=50)

        # make sure keys are there
        assert isinstance(key_foo, MockKey)
        assert isinstance(key_bar, MockKey)
        assert isinstance(key_qux, MockKey)

        s3_cleanup(remote_input_path, timedelta(days=30), dry_run=True,
                   conf_paths=[])

        # dry-run shouldn't delete anything
        assert isinstance(key_foo, MockKey)
        assert isinstance(key_bar, MockKey)
        assert isinstance(key_qux, MockKey)

        s3_cleanup(remote_input_path, timedelta(days=30), conf_paths=[])

        key_foo = bucket.get_key('data/foo')
        key_bar = bucket.get_key('data/bar')
        key_qux = bucket.get_key('data/qux')

        # make sure key_bar is deleted
        assert isinstance(key_foo, MockKey)
        self.assertEqual(key_bar, None)
        assert isinstance(key_qux, MockKey)

        s3_cleanup(remote_input_path, timedelta(hours=48), conf_paths=[])

        key_foo = bucket.get_key('data/foo')
        key_bar = bucket.get_key('data/bar')
        key_qux = bucket.get_key('data/qux')

        # make sure key_qux is deleted
        assert isinstance(key_foo, MockKey)
        self.assertEqual(key_bar, None)
        self.assertEqual(key_qux, None)
Example #9
    def get_s3_key(self, uri, s3_conn=None):
        """Get the boto Key object matching the given S3 uri, or
        return None if that key doesn't exist.

        uri is an S3 URI: ``s3://foo/bar``

        You may optionally pass in an existing s3 connection through
        ``s3_conn``.
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()
        bucket_name, key_name = parse_s3_uri(uri)

        return s3_conn.get_bucket(bucket_name).get_key(key_name)
Example #10
    def make_s3_key(self, uri, s3_conn=None):
        """Create the given S3 key, and return the corresponding
        boto Key object.

        uri is an S3 URI: ``s3://foo/bar``

        You may optionally pass in an existing S3 connection through
        ``s3_conn``.
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()
        bucket_name, key_name = parse_s3_uri(uri)

        return s3_conn.get_bucket(bucket_name).new_key(key_name)
Example #11
    def get_s3_keys(self, uri, s3_conn=None):
        """Get a stream of boto Key objects for each key inside
        the given dir on S3.

        uri is an S3 URI: ``s3://foo/bar``

        You may optionally pass in an existing S3 connection through s3_conn
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()

        bucket_name, key_prefix = parse_s3_uri(uri)
        bucket = s3_conn.get_bucket(bucket_name)
        for key in bucket.list(key_prefix):
            yield key
Example #12
    def make_s3_key(self, uri, s3_conn=None):
        """Create the given S3 key, and return the corresponding
        boto Key object.

        uri is an S3 URI: ``s3://foo/bar``

        You may optionally pass in an existing S3 connection through
        ``s3_conn``.
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()
        bucket_name, key_name = parse_s3_uri(uri)

        return s3_conn.get_bucket(
            bucket_name, validate=VALIDATE_BUCKET).new_key(key_name)
Example #13
    def get_s3_keys(self, uri, s3_conn=None):
        """Get a stream of boto Key objects for each key inside
        the given dir on S3.

        uri is an S3 URI: ``s3://foo/bar``

        You may optionally pass in an existing S3 connection through s3_conn
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()

        bucket_name, key_prefix = parse_s3_uri(uri)
        bucket = _get_bucket(s3_conn, bucket_name)
        for key in bucket.list(key_prefix):
            yield key
Example #14
    def mkdir(self, dest):
        """Make a directory. This doesn't actually create directories on S3
        (because there is no such thing), but it will create the corresponding
        bucket if it doesn't exist.
        """
        bucket_name, key_name = parse_s3_uri(dest)

        client = self.make_s3_client()

        try:
            client.head_bucket(Bucket=bucket_name)
        except botocore.exceptions.ClientError as ex:
            if _client_error_status(ex) != 404:
                raise

            self.create_bucket(bucket_name)
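
_client_error_status() is also not defined in these examples; presumably it digs the HTTP status out of a botocore ClientError. A minimal sketch under that assumption:

    def _client_error_status(ex):
        # HTTP status of the failed request, e.g. 404 for a missing bucket
        return ex.response.get('ResponseMetadata', {}).get('HTTPStatusCode')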
Example #15
File: s3.py Project: Yelp/mrjob
    def mkdir(self, dest):
        """Make a directory. This doesn't actually create directories on S3
        (because there is no such thing), but it will create the corresponding
        bucket if it doesn't exist.
        """
        bucket_name, key_name = parse_s3_uri(dest)

        client = self.make_s3_client()

        try:
            client.head_bucket(Bucket=bucket_name)
        except botocore.exceptions.ClientError as ex:
            if _client_error_status(ex) != 404:
                raise

            self.create_bucket(bucket_name)
Example #16
    def ls(self, path_glob):
        """Recursively list files on S3.

        *path_glob* can include ``?`` to match single characters or
        ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
        ``/``.

        .. versionchanged:: 0.5.0

            You no longer need a trailing slash to list "directories" on S3;
            both ``ls('s3://b/dir')`` and ``ls('s3://b/dir/')`` will list
            all keys starting with ``dir/``.
        """

        # clean up the base uri to ensure we pass boto an s3:// URI
        # just in case we get passed s3n://
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_s3_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        bucket = self.get_bucket(bucket_name)
        for key in bucket.list(base_name):
            uri = "%s://%s/%s" % (scheme, bucket_name, key.name)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob) or
                    fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri
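
The dir_glob is what makes the trailing slash optional: on its own, path_glob would never match keys under the "directory". A small illustration (not mrjob code):

    import fnmatch

    path_glob = 's3://b/dir'        # what the caller passed
    dir_glob = path_glob + '/*'     # added by ls()

    fnmatch.fnmatchcase('s3://b/dir/key', path_glob)  # False
    fnmatch.fnmatchcase('s3://b/dir/key', dir_glob)   # True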
Example #17
    def ls(self, path_glob):
        """Recursively list files on S3.

        *path_glob* can include ``?`` to match single characters or
        ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
        ``/``.

        .. versionchanged:: 0.5.0

            You no longer need a trailing slash to list "directories" on S3;
            both ``ls('s3://b/dir')`` and ``ls('s3://b/dir/')`` will list
            all keys starting with ``dir/``.
        """

        # clean up the base uri to ensure we pass boto an s3:// URI
        # just in case we get passed s3n://
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_s3_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        bucket = self.get_bucket(bucket_name)
        for key in bucket.list(base_name):
            uri = "%s://%s/%s" % (scheme, bucket_name, key.name)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob)
                    or fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri
Example #18
    def get_s3_key(self, uri):
        """Get the boto Key object matching the given S3 uri, or
        return None if that key doesn't exist.

        uri is an S3 URI: ``s3://foo/bar``
        """
        bucket_name, key_name = parse_s3_uri(uri)

        try:
            bucket = self.get_bucket(bucket_name)
        except boto.exception.S3ResponseError as e:
            if e.status != 404:
                raise e
            key = None
        else:
            key = bucket.get_key(key_name)

        return key
Example #19
    def get_s3_key(self, uri):
        """Get the boto Key object matching the given S3 uri, or
        return None if that key doesn't exist.

        uri is an S3 URI: ``s3://foo/bar``
        """
        bucket_name, key_name = parse_s3_uri(uri)

        try:
            bucket = self.get_bucket(bucket_name)
        except boto.exception.S3ResponseError as e:
            if e.status != 404:
                raise e
            key = None
        else:
            key = bucket.get_key(key_name)

        return key
Example #20
    def get_s3_key(self, uri, s3_conn=None):
        """Get the boto Key object matching the given S3 uri, or
        return None if that key doesn't exist.

        uri is an S3 URI: ``s3://foo/bar``

        You may optionally pass in an existing s3 connection through
        ``s3_conn``.
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()
        bucket_name, key_name = parse_s3_uri(uri)

        try:
            bucket = s3_conn.get_bucket(bucket_name)
        except boto.exception.S3ResponseError as e:
            if e.status != 404:
                raise e
            key = None
        else:
            key = bucket.get_key(key_name)

        return key
Example #21
    def get_s3_key(self, uri, s3_conn=None):
        """Get the boto Key object matching the given S3 uri, or
        return None if that key doesn't exist.

        uri is an S3 URI: ``s3://foo/bar``

        You may optionally pass in an existing s3 connection through
        ``s3_conn``.
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()
        bucket_name, key_name = parse_s3_uri(uri)

        try:
            bucket = s3_conn.get_bucket(bucket_name)
        except boto.exception.S3ResponseError as e:
            if e.status != 404:
                raise e
            key = None
        else:
            key = bucket.get_key(key_name)

        return key
Example #22
    def _ls(self, path_glob):
        """Helper method for :py:meth:`ls`; yields tuples of
        ``(uri, key)`` where *key* is the corresponding boto3 s3.ObjectSummary.
        """
        # clean up the base uri to ensure we pass boto3 an s3:// URI
        # (not s3n://)
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_s3_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        try:
            bucket = self.get_bucket(bucket_name)
        except botocore.exceptions.ClientError as ex:
            if _client_error_status(ex) == 404:  # treat nonexistent as empty
                return
            raise

        for key in bucket.objects.filter(Prefix=base_name):
            uri = "%s://%s/%s" % (scheme, bucket_name, key.key)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob)
                    or fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri, key
Example #23
    def _ls(self, path_glob):
        """Helper method for :py:meth:`ls`; yields tuples of
        ``(uri, key)`` where *key* is the corresponding boto3 s3.ObjectSummary.
        """
        # clean up the base uri to ensure we pass boto3 an s3:// URI
        # (not s3n://)
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_s3_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        try:
            bucket = self.get_bucket(bucket_name)
        except botocore.exceptions.ClientError as ex:
            if _client_error_status(ex) == 404:  # treat nonexistent as empty
                return
            raise

        for key in bucket.objects.filter(Prefix=base_name):
            uri = "%s://%s/%s" % (scheme, bucket_name, key.key)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob) or
                    fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri, key
Example #24
    def get_s3_folder_keys(self, uri, s3_conn=None):
        """.. deprecated:: 0.4.0

        Background: EMR used to fake directories on S3 by creating special
        ``*_$folder$`` keys in S3. That is no longer true, so this method is
        deprecated.

        For example if your job outputs ``s3://walrus/tmp/output/part-00000``,
        EMR will also create these keys:

        - ``s3://walrus/tmp_$folder$``
        - ``s3://walrus/tmp/output_$folder$``

        If you want to grant another Amazon user access to your files so they
        can use them in S3, you must grant read access on the actual keys,
        plus any ``*_$folder$`` keys that "contain" your keys; otherwise
        EMR will error out with a permissions error.

        This gets all the ``*_$folder$`` keys associated with the given URI,
        as boto Key objects.

        This does not support globbing.

        You may optionally pass in an existing S3 connection through
        ``s3_conn``.
        """
        log.warning(
            'get_s3_folder_keys() is deprecated and will be removed in v0.5.0')

        if not s3_conn:
            s3_conn = self.make_s3_conn()

        bucket_name, key_name = parse_s3_uri(uri)
        bucket = _get_bucket(s3_conn, bucket_name)

        dirs = key_name.split('/')
        for i in range(len(dirs)):
            folder_name = '/'.join(dirs[:i]) + '_$folder$'
            key = bucket.get_key(folder_name)
            if key:
                yield key
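
Concretely, for the URI from the docstring (illustration only):

    uri = 's3://walrus/tmp/output/part-00000'
    # parse_s3_uri(uri) -> ('walrus', 'tmp/output/part-00000')
    dirs = 'tmp/output/part-00000'.split('/')
    ['/'.join(dirs[:i]) + '_$folder$' for i in range(len(dirs))]
    # -> ['_$folder$', 'tmp_$folder$', 'tmp/output_$folder$']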
Example #25
    def ls(self, path_glob):
        """Recursively list files on S3.

        This doesn't list "directories" unless there's actually a
        corresponding key ending with a '/' (which is weird and confusing;
        don't make S3 keys ending in '/')

        To list a directory, path_glob must end with a trailing
        slash (foo and foo/ are different on S3)
        """

        # clean up the base uri to ensure we pass boto an s3:// URI
        # just in case we get passed s3n://
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # if it's a "file" (doesn't end with /), just check if it exists
        if not glob_match and not path_glob.endswith('/'):
            uri = path_glob
            if self.get_s3_key(uri):
                yield uri
            return

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        for uri in self._s3_ls(base_uri):
            uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri))

            # enforce globbing
            if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
                continue

            yield uri
Example #26
    def ls(self, path_glob):
        """Recursively list files on S3.

        This doesn't list "directories" unless there's actually a
        corresponding key ending with a '/' (which is weird and confusing;
        don't make S3 keys ending in '/')

        To list a directory, path_glob must end with a trailing
        slash (foo and foo/ are different on S3)
        """

        # clean up the base uri to ensure we pass boto an s3:// URI
        # just in case we get passed s3n://
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # if it's a "file" (doesn't end with /), just check if it exists
        if not glob_match and not path_glob.endswith('/'):
            uri = path_glob
            if self.get_s3_key(uri):
                yield uri
            return

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        for uri in self._s3_ls(base_uri):
            uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri))

            # enforce globbing
            if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
                continue

            yield uri
Example #27
def s3_cleanup(glob_path, time_old, dry_run=False, conf_path=None):
    """Delete all files older than *time_old* in *path*.
       If *dry_run* is ``True``, then just log the files that need to be
       deleted without actually deleting them
       """
    runner = EMRJobRunner(conf_path=conf_path)
    s3_conn = runner.make_s3_conn()

    log.info("Deleting all files in %s that are older than %s" % (glob_path, time_old))

    for path in runner.ls(glob_path):
        bucket_name, key_name = parse_s3_uri(path)
        bucket = s3_conn.get_bucket(bucket_name)

        for key in bucket.list(key_name):
            last_modified = iso8601_to_datetime(key.last_modified)
            age = datetime.utcnow() - last_modified
            if age > time_old:
                # Delete it
                log.info("Deleting %s; is %s old" % (key.name, age))
                if not dry_run:
                    key.delete()
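
A hypothetical invocation of s3_cleanup() as defined above, previewing (dry run) what would be deleted under a made-up bucket:

    from datetime import timedelta

    s3_cleanup('s3://walrus/tmp/', timedelta(days=30), dry_run=True)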
Example #28
def s3_cleanup(glob_path, time_old, dry_run=False, conf_paths=None):
    """Delete all files older than *time_old* in *path*.
       If *dry_run* is ``True``, then just log the files that need to be
       deleted without actually deleting them
       """
    runner = EMRJobRunner(conf_paths=conf_paths)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path in runner.ls(glob_path):
        bucket_name, key_name = parse_s3_uri(path)
        bucket = runner.fs.get_bucket(bucket_name)

        for key in bucket.list(key_name):
            last_modified = iso8601_to_datetime(key.last_modified)
            age = datetime.utcnow() - last_modified
            if age > time_old:
                # Delete it
                log.info('Deleting %s; is %s old' % (key.name, age))
                if not dry_run:
                    key.delete()
Example #29
    def get_s3_folder_keys(self, uri, s3_conn=None):
        """.. deprecated:: 0.4.0

        Background: EMR used to fake directories on S3 by creating special
        ``*_$folder$`` keys in S3. That is no longer true, so this method is
        deprecated.

        For example if your job outputs ``s3://walrus/tmp/output/part-00000``,
        EMR will also create these keys:

        - ``s3://walrus/tmp_$folder$``
        - ``s3://walrus/tmp/output_$folder$``

        If you want to grant another Amazon user access to your files so they
        can use them in S3, you must grant read access on the actual keys,
        plus any ``*_$folder$`` keys that "contain" your keys; otherwise
        EMR will error out with a permissions error.

        This gets all the ``*_$folder$`` keys associated with the given URI,
        as boto Key objects.

        This does not support globbing.

        You may optionally pass in an existing S3 connection through
        ``s3_conn``.
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()

        bucket_name, key_name = parse_s3_uri(uri)
        bucket = s3_conn.get_bucket(bucket_name)

        dirs = key_name.split('/')
        for i in range(len(dirs)):
            folder_name = '/'.join(dirs[:i]) + '_$folder$'
            key = bucket.get_key(folder_name)
            if key:
                yield key
Example #30
def _s3_cleanup(glob_path, time_old, dry_run=False, **runner_kwargs):
    """Delete all files older than *time_old* in *path*.

    If *dry_run* is true, then just log the files that need to be
    deleted without actually deleting them
    """
    runner = EMRJobRunner(**runner_kwargs)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path in runner.fs.ls(glob_path):
        bucket_name, key_name = parse_s3_uri(path)
        bucket = runner.fs.get_bucket(bucket_name)

        for key in bucket.list(key_name):
            last_modified = iso8601_to_datetime(key.last_modified)
            age = datetime.utcnow() - last_modified
            if age > time_old:
                # Delete it
                log.info('Deleting %s; is %s old' % (key.name, age))
                if not dry_run:
                    key.delete()
Example #31
    def get_s3_folder_keys(self, uri, s3_conn=None):
        """Background: S3 is even less of a filesystem than HDFS in that it
        doesn't have directories. EMR fakes directories by creating special
        ``*_$folder$`` keys in S3.

        For example if your job outputs ``s3://walrus/tmp/output/part-00000``,
        EMR will also create these keys:

        - ``s3://walrus/tmp_$folder$``
        - ``s3://walrus/tmp/output_$folder$``

        If you want to grant another Amazon user access to your files so they
        can use them in S3, you must grant read access on the actual keys,
        plus any ``*_$folder$`` keys that "contain" your keys; otherwise
        EMR will error out with a permissions error.

        This gets all the ``*_$folder$`` keys associated with the given URI,
        as boto Key objects.

        This does not support globbing.

        You may optionally pass in an existing S3 connection through
        ``s3_conn``.
        """
        if not s3_conn:
            s3_conn = self.make_s3_conn()

        bucket_name, key_name = parse_s3_uri(uri)
        bucket = s3_conn.get_bucket(bucket_name)

        dirs = key_name.split('/')
        for i in range(len(dirs)):
            folder_name = '/'.join(dirs[:i]) + '_$folder$'
            key = bucket.get_key(folder_name)
            if key:
                yield key
Example #32
    def simulate_progress(self, jobflow_id, now=None):
        """Simulate progress on the given job flow. This is automatically
        run when we call describe_jobflow().

        :type jobflow_id: str
        :param jobflow_id: fake job flow ID
        :type now: :py:class:`datetime.datetime`
        :param now: alternate time to use as the current time (should be UTC)
        """
        if now is None:
            now = datetime.utcnow()

        if self.simulation_iterator:
            try:
                next(self.simulation_iterator)
            except StopIteration:
                raise AssertionError(
                    'Simulated progress too many times; bailing out')

        job_flow = self.mock_emr_job_flows[jobflow_id]

        # if job is STARTING, move it along to WAITING
        if job_flow.state == 'STARTING':
            job_flow.state = 'WAITING'
            job_flow.startdatetime = to_iso8601(now)
            # instances are now provisioned and running
            for ig in job_flow.instancegroups:
                ig.instancerunningcount = ig.instancerequestcount

        # if job is done, don't advance it
        if job_flow.state in ('COMPLETED', 'TERMINATED', 'FAILED'):
            return

        # if SHUTTING_DOWN, finish shutting down
        if job_flow.state == 'SHUTTING_DOWN':
            if job_flow.reason == 'Shut down as step failed':
                job_flow.state = 'FAILED'
            else:
                job_flow.state = 'TERMINATED'
            job_flow.enddatetime = to_iso8601(now)
            return

        # if a step is currently running, advance it
        steps = getattr(job_flow, 'steps', None) or []

        for step_num, step in enumerate(steps):
            # skip steps that are already done
            if step.state in ('COMPLETED', 'FAILED', 'CANCELLED'):
                continue
            if step.name in ('Setup Hadoop Debugging', ):
                step.state = 'COMPLETED'
                continue

            # allow steps to get stuck
            if getattr(step, 'mock_no_progress', None):
                return

            # found currently running step! going to handle it, then exit
            if step.state == 'PENDING':
                step.state = 'RUNNING'
                step.startdatetime = to_iso8601(now)
                return

            assert step.state == 'RUNNING'
            step.enddatetime = to_iso8601(now)

            # check if we're supposed to have an error
            if (jobflow_id, step_num) in self.mock_emr_failures:
                step.state = 'FAILED'
                reason = self.mock_emr_failures[(jobflow_id, step_num)]
                if reason:
                    job_flow.reason = reason
                if step.actiononfailure == 'TERMINATE_JOB_FLOW':
                    job_flow.state = 'SHUTTING_DOWN'
                    if not reason:
                        job_flow.reason = 'Shut down as step failed'
                return

            step.state = 'COMPLETED'

            # create fake output if we're supposed to write to S3
            output_uri = self._get_step_output_uri(step)
            if output_uri and is_s3_uri(output_uri):
                mock_output = self.mock_emr_output.get(
                    (jobflow_id, step_num)) or ['']

                bucket_name, key_name = parse_s3_uri(output_uri)

                # write output to S3
                for i, bytes in enumerate(mock_output):
                    add_mock_s3_data(self.mock_s3_fs, {
                        bucket_name: {key_name + 'part-%05d' % i: bytes}})
            elif (jobflow_id, step_num) in self.mock_emr_output:
                raise AssertionError(
                    "can't use output for job flow ID %s, step %d "
                    "(it doesn't output to S3)" %
                    (jobflow_id, step_num))

            # done!
            return

        # no pending steps. shut down job if appropriate
        if job_flow.keepjobflowalivewhennosteps == 'true':
            job_flow.state = 'WAITING'
            job_flow.reason = 'Waiting for steps to run'
        else:
            job_flow.state = 'COMPLETED'
            job_flow.reason = 'Steps Completed'
Example #33
    def simulate_progress(self, jobflow_id, now=None):
        """Simulate progress on the given job flow. This is automatically
        run when we call describe_jobflow().

        :type jobflow_id: str
        :param jobflow_id: fake job flow ID
        :type now: :py:class:`datetime.datetime`
        :param now: alternate time to use as the current time (should be UTC)
        """
        if now is None:
            now = datetime.datetime.utcnow()

        if self.simulation_steps_left <= 0:
            raise AssertionError("Simulated progress too many times; bailing out")
        self.simulation_steps_left -= 1

        job_flow = self.mock_emr_job_flows[jobflow_id]

        # if job is STARTING, move it along to WAITING
        if job_flow.state == "STARTING":
            job_flow.state = "WAITING"
            job_flow.startdatetime = to_iso8601(now)

        # if job is done, don't advance it
        if job_flow.state in ("COMPLETED", "TERMINATED", "FAILED"):
            return

        # if SHUTTING_DOWN, finish shutting down
        if job_flow.state == "SHUTTING_DOWN":
            if job_flow.reason == "Shut down as step failed":
                job_flow.state = "FAILED"
            else:
                job_flow.state = "TERMINATED"
            job_flow.enddatetime = to_iso8601(now)
            return

        # if a step is currently running, advance it
        for step_num, step in enumerate(job_flow.steps):
            # skip steps that are already done
            if step.state in ("COMPLETED", "FAILED", "CANCELLED"):
                continue
            if step.name in ("Setup Hadoop Debugging",):
                step.state = "COMPLETED"
                continue

            # found currently running step! going to handle it, then exit
            if step.state == "PENDING":
                step.state = "RUNNING"
                step.startdatetime = to_iso8601(now)
                return

            assert step.state == "RUNNING"
            step.enddatetime = to_iso8601(now)

            # check if we're supposed to have an error
            if (jobflow_id, step_num) in self.mock_emr_failures:
                step.state = "FAILED"
                reason = self.mock_emr_failures[(jobflow_id, step_num)]
                if reason:
                    job_flow.reason = reason
                if step.actiononfailure == "TERMINATE_JOB_FLOW":
                    job_flow.state = "SHUTTING_DOWN"
                    if not reason:
                        job_flow.reason = "Shut down as step failed"
                return

            step.state = "COMPLETED"

            # create fake output if we're supposed to write to S3
            output_uri = self._get_step_output_uri(step)
            if output_uri and is_s3_uri(output_uri):
                mock_output = self.mock_emr_output.get((jobflow_id, step_num)) or [""]

                bucket_name, key_name = parse_s3_uri(output_uri)

                # write output to S3
                for i, bytes in enumerate(mock_output):
                    add_mock_s3_data(self.mock_s3_fs, {bucket_name: {key_name + "part-%05d" % i: bytes}})
            elif (jobflow_id, step_num) in self.mock_emr_output:
                raise AssertionError(
                    "can't use output for job flow ID %s, step %d " "(it doesn't output to S3)" % (jobflow_id, step_num)
                )

            # done!
            return

        # no pending steps. shut down job if appropriate
        if job_flow.keepjobflowalivewhennosteps == "true":
            job_flow.state = "WAITING"
            job_flow.reason = "Waiting for steps to run"
        else:
            job_flow.state = "COMPLETED"
            job_flow.reason = "Steps Completed"
Example #34
    def test_parse_s3_uri(self):
        self.assertEqual(parse_s3_uri('s3://bucket/loc'), ('bucket', 'loc'))
Example #35
    def _simulate_progress(self, cluster_id, now=None):
        """Simulate progress on the given cluster. This is automatically
        run when we call :py:meth:`describe_step`, and, when the cluster is
        ``TERMINATING``, :py:meth:`describe_cluster`.

        :type cluster_id: str
        :param cluster_id: fake cluster ID
        :type now: :py:class:`datetime.datetime`
        :param now: alternate time to use as the current time (should be UTC)
        """
        # TODO: this doesn't actually update steps to CANCELLED when
        # cluster is shut down
        if now is None:
            now = _boto3_now()

        cluster = self.mock_emr_clusters[cluster_id]

        # allow clusters to get stuck
        if cluster.get('_DelayProgressSimulation', 0) > 0:
            cluster['_DelayProgressSimulation'] -= 1
            return

        # this code is pretty loose about updating StateChangeReason
        # (for the cluster, instance groups, and steps). Add this as needed.

        # if job is STARTING, move it along to BOOTSTRAPPING
        if cluster['Status']['State'] == 'STARTING':
            cluster['Status']['State'] = 'BOOTSTRAPPING'

            # master now has a hostname
            cluster['MasterPublicDnsName'] = 'master.%s.mock' % cluster['Id']

            # instances are now provisioned
            for ig in cluster['_InstanceGroups']:
                ig['RunningInstanceCount'] = ig['RequestedInstanceCount']
                ig['Status']['State'] = 'BOOTSTRAPPING'

            return

        # if job is TERMINATING, move along to terminated
        if cluster['Status']['State'] == 'TERMINATING':
            code = cluster['Status']['StateChangeReason'].get('Code')
            if code and code.endswith('_FAILURE'):
                cluster['Status']['State'] = 'TERMINATED_WITH_ERRORS'
            else:
                cluster['Status']['State'] = 'TERMINATED'

            return

        # if job is done, nothing to do
        if cluster['Status']['State'] in ('TERMINATED',
                                          'TERMINATED_WITH_ERRORS'):
            return

        # if job is BOOTSTRAPPING, move it along to RUNNING and continue
        if cluster['Status']['State'] == 'BOOTSTRAPPING':
            cluster['Status']['State'] = 'RUNNING'
            for ig in cluster['_InstanceGroups']:
                ig['Status']['State'] = 'RUNNING'

        # at this point, should be RUNNING or WAITING
        assert cluster['Status']['State'] in ('RUNNING', 'WAITING')

        # simulate self-termination
        if cluster_id in self.mock_emr_self_termination:
            cluster['Status']['State'] = 'TERMINATING'
            cluster['Status']['StateChangeReason'] = dict(
                Code='INSTANCE_FAILURE',
                Message='The master node was terminated. ',  # sic
            )

            for step in cluster['_Steps']:
                if step['Status']['State'] in ('PENDING', 'RUNNING'):
                    step['Status']['State'] = 'CANCELLED'  # not INTERRUPTED

            return

        # try to find the next step, and advance it

        for step_num, step in enumerate(cluster['_Steps']):
            # skip steps that are already done
            if step['Status']['State'] in ('COMPLETED', 'FAILED', 'CANCELLED',
                                           'INTERRUPTED'):
                continue

            # found currently running step! handle it, then exit

            # start PENDING step
            if step['Status']['State'] == 'PENDING':
                step['Status']['State'] = 'RUNNING'
                step['Status']['Timeline']['StartDateTime'] = now
                return

            assert step['Status']['State'] == 'RUNNING'

            # check if we're supposed to have an error
            if (cluster_id, step_num) in self.mock_emr_failures:
                step['Status']['State'] = 'FAILED'

                if step['ActionOnFailure'] in ('TERMINATE_CLUSTER',
                                               'TERMINATE_JOB_FLOW'):

                    cluster['Status']['State'] = 'TERMINATING'
                    cluster['Status']['StateChangeReason']['Code'] = (
                        'STEP_FAILURE')
                    cluster['Status']['StateChangeReason']['Message'] = (
                        'Shut down as step failed')

                    for step in cluster['_Steps']:
                        if step['Status']['State'] in ('PENDING', 'RUNNING'):
                            step['Status']['State'] = 'CANCELLED'

                return

            # complete step
            step['Status']['State'] = 'COMPLETED'
            step['Status']['Timeline']['EndDateTime'] = now

            # create fake output if we're supposed to write to S3
            output_uri = self._get_step_output_uri(step['Config']['Args'])
            if output_uri and is_s3_uri(output_uri):
                mock_output = self.mock_emr_output.get(
                    (cluster_id, step_num)) or [b'']

                bucket_name, key_name = parse_s3_uri(output_uri)

                # write output to S3
                for i, part in enumerate(mock_output):
                    add_mock_s3_data(
                        self.mock_s3_fs,
                        {bucket_name: {
                            key_name + 'part-%05d' % i: part
                        }})
            elif (cluster_id, step_num) in self.mock_emr_output:
                raise AssertionError(
                    "can't use output for cluster ID %s, step %d "
                    "(it doesn't output to S3)" % (cluster_id, step_num))

            # done!
            # if this is the last step, continue to autotermination code, below
            if step_num < len(cluster['_Steps']) - 1:
                return

        # no pending steps. should we wait, or shut down?
        if cluster['AutoTerminate']:
            cluster['Status']['State'] = 'TERMINATING'
            cluster['Status']['StateChangeReason']['Code'] = (
                'ALL_STEPS_COMPLETED')
            cluster['Status']['StateChangeReason']['Message'] = (
                'Steps Completed')
        else:
            # just wait
            cluster['Status']['State'] = 'WAITING'
            cluster['Status']['StateChangeReason'] = {}

        return
Example #36
    def simulate_progress(self, jobflow_id, now=None):
        """Simulate progress on the given job flow. This is automatically
        run when we call describe_jobflow().

        :type jobflow_id: str
        :param jobflow_id: fake job flow ID
        :type now: :py:class:`datetime.datetime`
        :param now: alternate time to use as the current time (should be UTC)
        """
        if now is None:
            now = datetime.utcnow()

        if self.simulation_iterator:
            try:
                next(self.simulation_iterator)
            except StopIteration:
                raise AssertionError(
                    'Simulated progress too many times; bailing out')

        job_flow = self.mock_emr_job_flows[jobflow_id]

        # if job is STARTING, move it along to WAITING
        if job_flow.state == 'STARTING':
            job_flow.state = 'WAITING'
            job_flow.startdatetime = to_iso8601(now)
            # instances are now provisioned and running
            for ig in job_flow.instancegroups:
                ig.instancerunningcount = ig.instancerequestcount

        # if job is done, don't advance it
        if job_flow.state in ('COMPLETED', 'TERMINATED', 'FAILED'):
            return

        # if SHUTTING_DOWN, finish shutting down
        if job_flow.state == 'SHUTTING_DOWN':
            if job_flow.reason == 'Shut down as step failed':
                job_flow.state = 'FAILED'
            else:
                job_flow.state = 'TERMINATED'
            job_flow.enddatetime = to_iso8601(now)
            return

        # if a step is currently running, advance it
        steps = getattr(job_flow, 'steps', None) or []

        for step_num, step in enumerate(steps):
            # skip steps that are already done
            if step.state in ('COMPLETED', 'FAILED', 'CANCELLED'):
                continue
            if step.name in ('Setup Hadoop Debugging', ):
                step.state = 'COMPLETED'
                continue

            # allow steps to get stuck
            if getattr(step, 'mock_no_progress', None):
                return

            # found currently running step! going to handle it, then exit
            if step.state == 'PENDING':
                step.state = 'RUNNING'
                step.startdatetime = to_iso8601(now)
                return

            assert step.state == 'RUNNING'
            step.enddatetime = to_iso8601(now)

            # check if we're supposed to have an error
            if (jobflow_id, step_num) in self.mock_emr_failures:
                step.state = 'FAILED'
                reason = self.mock_emr_failures[(jobflow_id, step_num)]
                if reason:
                    job_flow.reason = reason
                if step.actiononfailure == 'TERMINATE_JOB_FLOW':
                    job_flow.state = 'SHUTTING_DOWN'
                    if not reason:
                        job_flow.reason = 'Shut down as step failed'
                return

            step.state = 'COMPLETED'

            # create fake output if we're supposed to write to S3
            output_uri = self._get_step_output_uri(step)
            if output_uri and is_s3_uri(output_uri):
                mock_output = self.mock_emr_output.get(
                    (jobflow_id, step_num)) or ['']

                bucket_name, key_name = parse_s3_uri(output_uri)

                # write output to S3
                for i, bytes in enumerate(mock_output):
                    add_mock_s3_data(self.mock_s3_fs, {
                        bucket_name: {key_name + 'part-%05d' % i: bytes}})
            elif (jobflow_id, step_num) in self.mock_emr_output:
                raise AssertionError(
                    "can't use output for job flow ID %s, step %d "
                    "(it doesn't output to S3)" %
                    (jobflow_id, step_num))

            # done!
            return

        # no pending steps. shut down job if appropriate
        if job_flow.keepjobflowalivewhennosteps == 'true':
            job_flow.state = 'WAITING'
            job_flow.reason = 'Waiting for steps to run'
        else:
            job_flow.state = 'COMPLETED'
            job_flow.reason = 'Steps Completed'