Example #1
def _my_urlsplit(url):
    """This is a hack to prevent the regular urlsplit from splitting around question marks.

    A question mark (?) in a URL typically indicates the start of a
    querystring, and the standard library's urlsplit function handles the
    querystring separately.  Unfortunately, question marks can also appear
    _inside_ the actual URL for some schemes, such as S3.

    Replaces question marks with newlines prior to splitting.  This is safe because:

    1. The standard library's urlsplit completely ignores newlines
    2. Raw newlines will never occur in innocuous URLs.  They are always URL-encoded.

    See Also
    --------
    https://github.com/python/cpython/blob/3.7/Lib/urllib/parse.py
    https://github.com/RaRe-Technologies/smart_open/issues/285
    """
    if '?' not in url:
        return urlsplit(url, allow_fragments=False)

    sr = urlsplit(url.replace('?', '\n'), allow_fragments=False)
    SplitResult = collections.namedtuple('SplitResult',
                                         'scheme netloc path query fragment')
    return SplitResult(sr.scheme, sr.netloc, sr.path.replace('\n', '?'), '',
                       '')
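A minimal sketch of the difference (hypothetical URL; the expected outputs assume an interpreter where urlsplit passes raw newlines through -- later security fixes such as bpo-43882 strip newlines and tabs during parsing, which silently defeats this trick):

from urllib.parse import urlsplit

url = 's3://mybucket/my_key_with?question_mark'

# the stock urlsplit treats everything after '?' as the querystring
sr = urlsplit(url, allow_fragments=False)
print(sr.path, '|', sr.query)    # /my_key_with | question_mark

# _my_urlsplit keeps the '?' inside the path instead
print(_my_urlsplit(url).path)    # /my_key_with?question_mark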
Example #2
def _parse_uri(uri_as_string):
    """
    Parse the given URI from a string.

    Supported URI schemes are:

      * file
      * hdfs
      * http
      * https
      * s3
      * s3a
      * s3n
      * s3u
      * webhdfs

    s3, s3a and s3n are treated the same way; s3u is s3 but without SSL.

    Valid URI examples::

      * s3://my_bucket/my_key
      * s3://my_key:my_secret@my_bucket/my_key
      * s3://my_key:my_secret@my_server:my_port@my_bucket/my_key
      * hdfs:///path/file
      * hdfs://path/file
      * webhdfs://host:port/path/file
      * ./local/path/file
      * ~/local/path/file
      * local/path/file
      * ./local/path/file.gz
      * file:///home/user/file
      * file:///home/user/file.bz2
      * [ssh|scp|sftp]://username@host//path/file
      * [ssh|scp|sftp]://username@host/path/file

    """
    if os.name == 'nt':
        # urlsplit doesn't work on Windows -- it parses the drive as the scheme...
        if '://' not in uri_as_string:
            # no protocol given => assume a local file
            uri_as_string = 'file://' + uri_as_string
    parsed_uri = urlsplit(uri_as_string, allow_fragments=False)

    if parsed_uri.scheme == "hdfs":
        return _parse_uri_hdfs(parsed_uri)
    elif parsed_uri.scheme == "webhdfs":
        return _parse_uri_webhdfs(parsed_uri)
    elif parsed_uri.scheme in smart_open_s3.SUPPORTED_SCHEMES:
        return _parse_uri_s3x(parsed_uri)
    elif parsed_uri.scheme == 'file':
        return _parse_uri_file(parsed_uri.netloc + parsed_uri.path)
    elif parsed_uri.scheme in ('', None):
        return _parse_uri_file(uri_as_string)
    elif parsed_uri.scheme.startswith('http'):
        return Uri(scheme=parsed_uri.scheme, uri_path=uri_as_string)
    elif parsed_uri.scheme in smart_open_ssh.SCHEMES:
        return _parse_uri_ssh(parsed_uri)
    else:
        raise NotImplementedError("unknown URI scheme %r in %r" %
                                  (parsed_uri.scheme, uri_as_string))
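The _parse_uri_* helpers live elsewhere in the module. As a hedged sketch (the real Uri type in smart_open carries more fields than shown here), the HDFS helper could mirror the hdfs branch of the class-based parser in Example #3:

import collections

# illustrative stand-in for smart_open's richer Uri type
Uri = collections.namedtuple('Uri', 'scheme uri_path')

def _parse_uri_hdfs(parsed_uri):
    # hdfs://host/path and hdfs:///path both normalize to an absolute path
    uri_path = parsed_uri.netloc + parsed_uri.path
    uri_path = '/' + uri_path.lstrip('/')
    if uri_path == '/':
        raise RuntimeError('invalid HDFS URI: %r' % (parsed_uri,))
    return Uri(scheme='hdfs', uri_path=uri_path)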
Example #3
    def __init__(self, uri, default_scheme="file"):
        """
        Assume `default_scheme` if no scheme given in `uri`.

        """
        if os.name == 'nt':
            # urlsplit doesn't work on Windows -- it parses the drive as the scheme...
            if '://' not in uri:
                # no protocol given => assume a local file
                uri = 'file://' + uri
        parsed_uri = urlsplit(uri)
        self.scheme = parsed_uri.scheme if parsed_uri.scheme else default_scheme

        if self.scheme == "hdfs":
            self.uri_path = parsed_uri.netloc + parsed_uri.path
            self.uri_path = "/" + self.uri_path.lstrip("/")

            if not self.uri_path:
                raise RuntimeError("invalid HDFS URI: %s" % uri)
        elif self.scheme == "webhdfs":
            self.uri_path = parsed_uri.netloc + "/webhdfs/v1" + parsed_uri.path
            if parsed_uri.query:
                self.uri_path += "?" + parsed_uri.query

            if not self.uri_path:
                raise RuntimeError("invalid WebHDFS URI: %s" % uri)
        elif self.scheme in ("s3", "s3n"):
            self.bucket_id = (parsed_uri.netloc + parsed_uri.path).split('@')
            self.key_id = None

            if len(self.bucket_id) == 1:
                # URI without credentials: s3://bucket/object
                self.bucket_id, self.key_id = self.bucket_id[0].split('/', 1)
                # "None" credentials are interpreted as "look for credentials in other locations" by boto
                self.access_id, self.access_secret = None, None
            elif len(self.bucket_id) == 2 and len(self.bucket_id[0].split(':')) == 2:
                # URI in full format: s3://key:secret@bucket/object
                # access key id: [A-Z0-9]{20}
                # secret access key: [A-Za-z0-9/+=]{40}
                acc, self.bucket_id = self.bucket_id
                self.access_id, self.access_secret = acc.split(':')
                self.bucket_id, self.key_id = self.bucket_id.split('/', 1)
            else:
                # anything else (e.g. more than one '@') is an invalid URI
                # Bucket names must be at least 3 and no more than 63 characters long.
                # Bucket names must be a series of one or more labels.
                # Adjacent labels are separated by a single period (.).
                # Bucket names can contain lowercase letters, numbers, and hyphens.
                # Each label must start and end with a lowercase letter or a number.
                raise RuntimeError("invalid S3 URI: %s" % uri)
        elif self.scheme == 'file':
            self.uri_path = parsed_uri.netloc + parsed_uri.path

            # '~/tmp' may be expanded to '/Users/username/tmp'
            self.uri_path = os.path.expanduser(self.uri_path)

            if not self.uri_path:
                raise RuntimeError("invalid file URI: %s" % uri)
        else:
            raise NotImplementedError("unknown URI scheme %r in %r" % (self.scheme, uri))
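Tracing the credential-splitting logic above on a full-format URI (hypothetical values):

parts = 'my_key:my_secret@my_bucket/my_object'.split('@')
acc, rest = parts                           # two parts -> full format
access_id, access_secret = acc.split(':')   # ('my_key', 'my_secret')
bucket_id, key_id = rest.split('/', 1)      # ('my_bucket', 'my_object')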
Example #4
    def __init__(self, uri, default_scheme="file"):
        """
        Assume `default_scheme` if no scheme given in `uri`.

        """
        if os.name == 'nt':
            # urlsplit doesn't work on Windows -- it parses the drive as the scheme...
            if '://' not in uri:
                # no protocol given => assume a local file
                uri = 'file://' + uri
        parsed_uri = urlsplit(uri)
        self.scheme = parsed_uri.scheme if parsed_uri.scheme else default_scheme

        if self.scheme == "hdfs":
            self.uri_path = parsed_uri.netloc + parsed_uri.path
            self.uri_path = "/" + self.uri_path.lstrip("/")

            if not self.uri_path:
                raise RuntimeError("invalid HDFS URI: %s" % uri)
        elif self.scheme == "webhdfs":
            self.uri_path = parsed_uri.netloc + "/webhdfs/v1" + parsed_uri.path

            if not self.uri_path:
                raise RuntimeError("invalid WebHDFS URI: %s" % uri)
        elif self.scheme in ("s3", "s3n"):
            self.bucket_id = (parsed_uri.netloc + parsed_uri.path).split('@')
            self.key_id = None

            if len(self.bucket_id) == 1:
                # URI without credentials: s3://bucket/object
                self.bucket_id, self.key_id = self.bucket_id[0].split('/', 1)
                # "None" credentials are interpreted as "look for credentials in other locations" by boto
                self.access_id, self.access_secret = None, None
            elif len(self.bucket_id) == 2 and len(
                    self.bucket_id[0].split(':')) == 2:
                # URI in full format: s3://key:secret@bucket/object
                # access key id: [A-Z0-9]{20}
                # secret access key: [A-Za-z0-9/+=]{40}
                acc, self.bucket_id = self.bucket_id
                self.access_id, self.access_secret = acc.split(':')
                self.bucket_id, self.key_id = self.bucket_id.split('/', 1)
            else:
                # anything else (e.g. more than one '@') is an invalid URI
                # Bucket names must be at least 3 and no more than 63 characters long.
                # Bucket names must be a series of one or more labels.
                # Adjacent labels are separated by a single period (.).
                # Bucket names can contain lowercase letters, numbers, and hyphens.
                # Each label must start and end with a lowercase letter or a number.
                raise RuntimeError("invalid S3 URI: %s" % uri)
        elif self.scheme == 'file':
            self.uri_path = parsed_uri.netloc + parsed_uri.path

            if not self.uri_path:
                raise RuntimeError("invalid file URI: %s" % uri)
        else:
            raise NotImplementedError("unknown URI scheme %r in %r" %
                                      (self.scheme, uri))
Example #5
def read_sample_csv(filename_to_read, inConn):
    splitInputDir = urlsplit(filename_to_read, allow_fragments=False)
    if inConn is None:
        inConn = get_objectstore_conn()
    inbucket = inConn.get_bucket(splitInputDir.netloc)
    kr = inbucket.get_key(splitInputDir.path)
    assert kr is not None, 'Unable to read file. File may be absent'
    with smart_open.smart_open(kr, 'r') as fin:
        data = pn.read_csv(fin, header=0, error_bad_lines=False,
                           dtype='str').fillna('NA')
    return data
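A hedged usage sketch (get_objectstore_conn() and the bucket/key must already exist; pn is presumably pandas imported under that alias). Note that splitInputDir.path keeps its leading slash, so the key is looked up with that slash included:

df = read_sample_csv('s3://my-bucket/samples/data.csv', None)
print(df.head())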
Example #6
def _parse_uri(uri_as_string):
    """
    Parse the given URI from a string.

    Supported URI schemes are "file", "s3", "s3n", "s3u" and "hdfs".

      * s3 and s3n are treated the same way.
      * s3u is s3 but without SSL.

    Valid URI examples::

      * s3://my_bucket/my_key
      * s3://my_key:my_secret@my_bucket/my_key
      * s3://my_key:my_secret@my_server:my_port@my_bucket/my_key
      * hdfs:///path/file
      * hdfs://path/file
      * webhdfs://host:port/path/file
      * ./local/path/file
      * ~/local/path/file
      * local/path/file
      * ./local/path/file.gz
      * file:///home/user/file
      * file:///home/user/file.bz2
    """
    if os.name == 'nt':
        # urlsplit doesn't work on Windows -- it parses the drive as the scheme...
        if '://' not in uri_as_string:
            # no protocol given => assume a local file
            uri_as_string = 'file://' + uri_as_string
    parsed_uri = urlsplit(uri_as_string, allow_fragments=False)

    if parsed_uri.scheme == "hdfs":
        return _parse_uri_hdfs(parsed_uri)
    elif parsed_uri.scheme == "webhdfs":
        return _parse_uri_webhdfs(parsed_uri)
    elif parsed_uri.scheme in ("s3", "s3n", "s3u"):
        return _parse_uri_s3x(parsed_uri)
    elif parsed_uri.scheme in ('file', '', None):
        return _parse_uri_file(parsed_uri)
    elif parsed_uri.scheme.startswith('http'):
        return Uri(scheme=parsed_uri.scheme, uri_path=uri_as_string)
    else:
        raise NotImplementedError(
            "unknown URI scheme %r in %r" % (parsed_uri.scheme, uri_as_string)
        )
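A few hedged invocations showing where the dispatch above routes (assumes the _parse_uri_* helpers and Uri type from the surrounding module):

_parse_uri('s3://my_bucket/my_key')          # -> _parse_uri_s3x(...)
_parse_uri('hdfs:///path/file')              # -> _parse_uri_hdfs(...)
_parse_uri('./local/path/file.gz')           # -> _parse_uri_file(...), empty scheme
_parse_uri('https://example.com/file.csv')   # -> Uri(scheme='https', uri_path=...)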
Example #8
def HttpOpenRead(parsed_uri, mode='r', **kwargs):
    if parsed_uri.scheme not in ('http', 'https'):
        raise TypeError("can only process http/https urls")
    if mode not in ('r', 'rb'):
        raise NotImplementedError('Streaming write to http not supported')

    url = parsed_uri.uri_path

    response = HttpReadStream(url, **kwargs)

    fname = urlsplit(url, allow_fragments=False).path.split('/')[-1]

    if fname.endswith('.gz'):
        #  Gzip needs a seek-able filehandle, so we need to buffer it.
        buffer = make_closing(io.BytesIO)(response.binary_content())
        return compression_wrapper(buffer, fname, mode)
    else:
        return compression_wrapper(response, fname, mode)
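A hedged usage sketch (HttpReadStream, compression_wrapper and the Uri produced by _parse_uri come from the surrounding module; the URL is hypothetical):

uri = _parse_uri('https://example.com/exports/data.csv.gz')
fin = HttpOpenRead(uri, mode='rb')   # '.gz' suffix -> buffered and gunzipped
first_line = fin.readline()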
Example #9
def write_avro_context_manager(data,
                               filename_to_write,
                               inConn=None,
                               num_lines=100):
    avroSchemaOut = gen_schema(data)
    schema = avro.schema.parse(avroSchemaOut)
    dictRes = data.to_dict(orient='records')
    splitInputDir = urlsplit(filename_to_write, allow_fragments=False)
    if inConn is None:
        inConn = get_objectstore_conn()
    inbucket = inConn.get_bucket(splitInputDir.netloc)
    kw = inbucket.get_key(splitInputDir.path, validate=False)
    assert kw is not None, "Unable to get avro key to write"
    with smart_open.smart_open(kw, 'wb') as foutd:
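        # DataFileWriter calls flush() on its file object while writing;
        # smart_open's streaming S3 writer presumably cannot flush
        # mid-upload, hence the no-op stub below (our reading of the intent).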
        foutd.flush = lambda: None
        with avro.datafile.DataFileWriter(foutd, avro.io.DatumWriter(),
                                          schema) as writer_contextManager:
            for row in dictRes:
                writer_contextManager.append(row)
Example #10
    def __init__(self, uri, default_scheme="file"):
        """
        Assume `default_scheme` if no scheme given in `uri`.

        """
        if os.name == 'nt':
            # urlsplit doesn't work on Windows -- it parses the drive as the scheme...
            if '://' not in uri:
                # no protocol given => assume a local file
                uri = 'file://' + uri
        parsed_uri = urlsplit(uri, allow_fragments=False)
        self.scheme = parsed_uri.scheme if parsed_uri.scheme else default_scheme

        if self.scheme == "hdfs":
            self.uri_path = parsed_uri.netloc + parsed_uri.path
            self.uri_path = "/" + self.uri_path.lstrip("/")

            if not self.uri_path:
                raise RuntimeError("invalid HDFS URI: %s" % uri)
        elif self.scheme == "webhdfs":
            self.uri_path = parsed_uri.netloc + "/webhdfs/v1" + parsed_uri.path
            if parsed_uri.query:
                self.uri_path += "?" + parsed_uri.query

            if not self.uri_path:
                raise RuntimeError("invalid WebHDFS URI: %s" % uri)
        elif self.scheme in ("s3", "s3n", "s3u"):
            self.bucket_id = (parsed_uri.netloc + parsed_uri.path).split('@')
            self.key_id = None
            self.port = 443
            self.host = boto.config.get('s3', 'host', 's3.amazonaws.com')
            self.ordinary_calling_format = False
            if len(self.bucket_id) == 1:
                # URI without credentials: s3://bucket/object
                self.bucket_id, self.key_id = self.bucket_id[0].split('/', 1)
                # "None" credentials are interpreted as "look for credentials in other locations" by boto
                self.access_id, self.access_secret = None, None
            elif len(self.bucket_id) == 2 and len(
                    self.bucket_id[0].split(':')) == 2:
                # URI in full format: s3://key:secret@bucket/object
                # access key id: [A-Z0-9]{20}
                # secret access key: [A-Za-z0-9/+=]{40}
                acc, self.bucket_id = self.bucket_id
                self.access_id, self.access_secret = acc.split(':')
                self.bucket_id, self.key_id = self.bucket_id.split('/', 1)
            elif len(self.bucket_id) == 3 and len(
                    self.bucket_id[0].split(':')) == 2:
                # or URI in extended format: s3://key:secret@server[:port]@bucket/object
                acc, server, self.bucket_id = self.bucket_id
                self.access_id, self.access_secret = acc.split(':')
                self.bucket_id, self.key_id = self.bucket_id.split('/', 1)
                server = server.split(':')
                self.ordinary_calling_format = True
                self.host = server[0]
                if len(server) == 2:
                    self.port = int(server[1])
            else:
                # anything else (e.g. more than two '@') is an invalid URI
                # Bucket names must be at least 3 and no more than 63 characters long.
                # Bucket names must be a series of one or more labels.
                # Adjacent labels are separated by a single period (.).
                # Bucket names can contain lowercase letters, numbers, and hyphens.
                # Each label must start and end with a lowercase letter or a number.
                raise RuntimeError("invalid S3 URI: %s" % uri)
        elif self.scheme == 'file':
            self.uri_path = parsed_uri.netloc + parsed_uri.path

            # '~/tmp' may be expanded to '/Users/username/tmp'
            self.uri_path = os.path.expanduser(self.uri_path)

            if not self.uri_path:
                raise RuntimeError("invalid file URI: %s" % uri)
        elif self.scheme.startswith('http'):
            self.uri_path = uri
        elif self.scheme == 'gs':
            self.uri_path = uri
        else:
            raise NotImplementedError("unknown URI scheme %r in %r" %
                                      (self.scheme, uri))
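Tracing the extended format s3://key:secret@server[:port]@bucket/object with hypothetical values:

parts = 'key:secret@minio.local:9000@bucket/dir/obj'.split('@')
acc, server, rest = parts                   # three parts -> extended format
access_id, access_secret = acc.split(':')
bucket_id, key_id = rest.split('/', 1)      # ('bucket', 'dir/obj')
host, port = server.split(':')              # custom endpoint and port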
Example #11
    def test_1_basic(self):
        print('--- running S3Connection tests ---')
        c = S3Connection()
        # create a new, empty bucket
        bucket_name = 'test-%d' % int(time.time())
        bucket = c.create_bucket(bucket_name)
        # now try a get_bucket call and see if it's really there
        bucket = c.get_bucket(bucket_name)
        # test logging
        logging_bucket = c.create_bucket(bucket_name + '-log')
        logging_bucket.set_as_logging_target()
        bucket.enable_logging(target_bucket=logging_bucket,
                              target_prefix=bucket.name)
        bucket.disable_logging()
        c.delete_bucket(logging_bucket)
        k = bucket.new_key('foobar')
        s1 = 'This is a test of file upload and download'
        s2 = 'This is a second string to test file upload and download'
        k.set_contents_from_string(s1)
        fp = open('foobar', 'wb')
        # now get the contents from s3 to a local file
        k.get_contents_to_file(fp)
        fp.close()
        fp = open('foobar')
        # check to make sure content read from s3 is identical to original
        assert s1 == fp.read(), 'corrupted file'
        fp.close()
        # test generated URLs
        url = k.generate_url(3600)
        file = urlopen(url)
        assert s1 == file.read().decode('utf-8'), 'invalid URL %s' % url
        url = k.generate_url(3600, force_http=True)
        file = urlopen(url)
        assert s1 == file.read().decode('utf-8'), 'invalid URL %s' % url
        url = k.generate_url(3600,
                             force_http=True,
                             headers={'x-amz-x-token': 'XYZ'})
        file = urlopen(url)
        assert s1 == file.read().decode('utf-8'), 'invalid URL %s' % url
        rh = {'response-content-disposition': 'attachment; filename="foo.txt"'}
        url = k.generate_url(60, response_headers=rh)
        file = urlopen(url)
        assert s1 == file.read().decode('utf-8'), 'invalid URL %s' % url
        # test whether ampersands and to-be-escaped characters work in the header filename
        rh = {
            'response-content-disposition':
            'attachment; filename="foo&z%20ar&ar&zar&bar.txt"'
        }
        url = k.generate_url(60, response_headers=rh, force_http=True)
        file = urlopen(url)
        assert s1 == file.read().decode('utf-8'), 'invalid URL %s' % url
        # overwrite foobar contents with a PUT
        url = k.generate_url(3600,
                             'PUT',
                             force_http=True,
                             policy='private',
                             reduced_redundancy=True)
        up = urlsplit(url)
        con = http_client.HTTPConnection(up.hostname, up.port)
        con.request("PUT", up.path + '?' + up.query, body="hello there")
        resp = con.getresponse()
        assert 200 == resp.status
        assert b"hello there" == k.get_contents_as_string()
        bucket.delete_key(k)
        # test a few variations on get_all_keys - first load some data
        # for the first one, let's override the content type
        phony_mimetype = 'application/x-boto-test'
        headers = {'Content-Type': phony_mimetype}
        k.name = 'foo/bar'
        k.set_contents_from_string(s1, headers)
        k.name = 'foo/bas'
        size = k.set_contents_from_filename('foobar')
        assert size == 42
        k.name = 'foo/bat'
        k.set_contents_from_string(s1)
        k.name = 'fie/bar'
        k.set_contents_from_string(s1)
        k.name = 'fie/bas'
        k.set_contents_from_string(s1)
        k.name = 'fie/bat'
        k.set_contents_from_string(s1)
        # try resetting the contents to another value
        md5 = k.md5
        k.set_contents_from_string(s2)
        assert k.md5 != md5
        os.unlink('foobar')
        all_keys = bucket.get_all_keys()
        assert len(all_keys) == 6
        rs = bucket.get_all_keys(prefix='foo')
        assert len(rs) == 3
        rs = bucket.get_all_keys(prefix='', delimiter='/')
        assert len(rs) == 2
        rs = bucket.get_all_keys(maxkeys=5)
        assert len(rs) == 5
        # test the lookup method
        k = bucket.lookup('foo/bar')
        assert isinstance(k, bucket.key_class)
        assert k.content_type == phony_mimetype
        k = bucket.lookup('notthere')
        assert k is None
        # try some metadata stuff
        k = bucket.new_key('has_metadata')
        mdkey1 = 'meta1'
        mdval1 = 'This is the first metadata value'
        k.set_metadata(mdkey1, mdval1)
        mdkey2 = 'meta2'
        mdval2 = 'This is the second metadata value'
        k.set_metadata(mdkey2, mdval2)
        # try a unicode metadata value
        mdval3 = u'föö'
        mdkey3 = 'meta3'
        k.set_metadata(mdkey3, mdval3)
        k.set_contents_from_string(s1)
        k = bucket.lookup('has_metadata')
        assert k.get_metadata(mdkey1) == mdval1
        assert k.get_metadata(mdkey2) == mdval2
        assert k.get_metadata(mdkey3) == mdval3
        k = bucket.new_key('has_metadata')
        k.get_contents_as_string()
        assert k.get_metadata(mdkey1) == mdval1
        assert k.get_metadata(mdkey2) == mdval2
        assert k.get_metadata(mdkey3) == mdval3
        bucket.delete_key(k)
        # test list and iterator
        rs1 = bucket.list()
        num_iter = 0
        for r in rs1:
            num_iter = num_iter + 1
        rs = bucket.get_all_keys()
        num_keys = len(rs)
        assert num_iter == num_keys
        # try a key with a funny character
        k = bucket.new_key('testnewline\n')
        k.set_contents_from_string('This is a test')
        rs = bucket.get_all_keys()
        assert len(rs) == num_keys + 1
        bucket.delete_key(k)
        rs = bucket.get_all_keys()
        assert len(rs) == num_keys
        # try some acl stuff
        bucket.set_acl('public-read')
        policy = bucket.get_acl()
        assert len(policy.acl.grants) == 2
        bucket.set_acl('private')
        policy = bucket.get_acl()
        assert len(policy.acl.grants) == 1
        k = bucket.lookup('foo/bar')
        k.set_acl('public-read')
        policy = k.get_acl()
        assert len(policy.acl.grants) == 2
        k.set_acl('private')
        policy = k.get_acl()
        assert len(policy.acl.grants) == 1
        # try the convenience methods for grants
        bucket.add_user_grant(
            'FULL_CONTROL',
            'c1e724fbfa0979a4448393c59a8c055011f739b6d102fb37a65f26414653cd67')
        try:
            bucket.add_email_grant('foobar', '*****@*****.**')
        except S3PermissionsError:
            pass
        # now try to create an RRS key
        k = bucket.new_key('reduced_redundancy')
        k.set_contents_from_string('This key has reduced redundancy',
                                   reduced_redundancy=True)

        # now try to inject a response header
        data = k.get_contents_as_string(
            response_headers={'response-content-type': 'foo/bar'})
        assert k.content_type == 'foo/bar'

        # now delete all keys in bucket
        for k in bucket:
            if k.name == 'reduced_redundancy':
                assert k.storage_class == 'REDUCED_REDUNDANCY'
            bucket.delete_key(k)
        # now delete bucket
        time.sleep(5)
        c.delete_bucket(bucket)
        print('--- tests completed ---')