Example #1
    def test_s3_iter_bucket_moto(self):
        """Does s3_iter_bucket work correctly?"""
        conn = boto.connect_s3()
        conn.create_bucket("mybucket")
        mybucket = conn.get_bucket("mybucket")

        # first, create some keys in the bucket
        expected = {}
        for key_no in range(200):
            key_name = "mykey%s" % key_no
            with smart_open.smart_open("s3://mybucket/%s" % key_name, 'wb') as fout:
                content = '\n'.join("line%i%i" % (key_no, line_no) for line_no in range(10)).encode('utf8')
                fout.write(content)
                expected[key_name] = content

        # read all keys + their content back, in parallel, using s3_iter_bucket
        result = dict(smart_open.s3_iter_bucket(mybucket))
        self.assertEqual(expected, result)

        # read some of the keys back, in parallel, using s3_iter_bucket
        result = dict(smart_open.s3_iter_bucket(mybucket, accept_key=lambda fname: fname.endswith('4')))
        self.assertEqual(result, dict((k, c) for k, c in expected.items() if k.endswith('4')))

        # read only a limited number of keys back, in parallel, using s3_iter_bucket
        result = dict(smart_open.s3_iter_bucket(mybucket, key_limit=10))
        self.assertEqual(len(result), min(len(expected), 10))

        for workers in [1, 4, 8, 16, 64]:
            self.assertEqual(dict(smart_open.s3_iter_bucket(mybucket, workers=workers)), expected)
Example #2
    def test_deprecated_top_level_s3_iter_bucket(self):
        populate_bucket()
        with self.assertLogs(smart_open.logger.name, level='WARN') as cm:
            # invoking once will generate a warning
            smart_open.s3_iter_bucket(BUCKET_NAME)
            # invoking again will not (to reduce spam)
            smart_open.s3_iter_bucket(BUCKET_NAME)
            # verify only one output
            assert len(cm.output) == 1
            # verify the suggested new import is in the warning
            assert "from smart_open.s3 import iter_bucket as s3_iter_bucket" in cm.output[0]
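
The warning checked above spells out the replacement import path. A minimal sketch of using it, assuming a smart_open release in which iter_bucket lives in the smart_open.s3 submodule and accepts a bucket name (the bucket and prefix below are placeholders):

# suggested replacement for the deprecated top-level helper
from smart_open.s3 import iter_bucket as s3_iter_bucket

for key_name, content in s3_iter_bucket('mybucket', prefix='mykey'):
    print(key_name, len(content))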
Example #3
import json

from smart_open import s3_iter_bucket
from tqdm import tqdm


def load_dir_jls(conn, path):
    # get_bucket_key is a project-local helper that splits an s3:// URL
    # into a (bucket_name, key_prefix) pair
    bucket, prefix = get_bucket_key(path)

    bucket = conn.get_bucket(bucket)
    for name, content in tqdm(
            s3_iter_bucket(bucket,
                           prefix=prefix,
                           accept_key=lambda name: name.endswith('.jl'))):
        # each .jl key holds JSON Lines: one JSON document per line
        for line in content.decode('utf8').split('\n'):
            if line:
                yield json.loads(line)
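
For context, a hypothetical call site for load_dir_jls; the connection setup and the s3:// URL are assumptions, and get_bucket_key is presumed to split the URL as described above:

import boto

# iterate over the parsed JSON Lines records under the given prefix
conn = boto.connect_s3()
for record in load_dir_jls(conn, 's3://mybucket/exports/'):
    print(record)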
Example #4
from smart_open import s3_iter_bucket


def getRecordFromBucket(BUCKET, PREFIX):
    records = []
    for key, file in s3_iter_bucket(BUCKET, prefix=PREFIX,
                                    accept_key=lambda key: 'part' in key):
        # each matching key holds newline-delimited records; split()
        # always returns a list, so it can extend records directly
        records += file.decode('utf-8').strip().split('\n')
    return records
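
A hedged usage sketch, assuming BUCKET is a boto bucket object as in the surrounding examples and that the 'part' keys under the prefix hold newline-delimited records (the bucket and prefix below are placeholders):

records = getRecordFromBucket(mybucket, 'output/2020/')
print(len(records), 'records loaded')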
Example #5
    def test_s3_iter_bucket_mock(self, mock_pool):
        """Is s3_iter_bucket called correctly?"""
        attrs = {"name": "fileA", "get_contents_as_string.return_value": "contentA"}
        mykey = mock.Mock(spec=["name", "get_contents_as_string"])
        mykey.configure_mock(**attrs)

        attrs = {"list.return_value": [mykey]}
        mybucket = mock.Mock(spec=["list"])
        mybucket.configure_mock(**attrs)

        for key, content in smart_open.s3_iter_bucket(mybucket):
            mock_pool.Pool.assert_called_with(processes=16)
            mock_pool.Pool().imap_unordered.assert_called_with()

        mock_pool.Pool.assert_called_with(processes=16)
        self.assertTrue(mock_pool.Pool().imap_unordered.called)
Example #6
    def test_s3_iter_bucket_with_SSLError_moto(self):
        attrs = {
            "name": "fileA",
            "get_contents_as_string.return_value": b"contentA"
        }
        mykey = mock.Mock(spec=["name", "get_contents_as_string"])
        mykey.configure_mock(**attrs)

        attrs = {"list.return_value": [mykey]}
        mybucket = mock.Mock(spec=["list"])
        mybucket.configure_mock(**attrs)

        # when get_contents_as_string always raises SSLError
        mykey.get_contents_as_string.side_effect = SSLError
        self.assertRaises(SSLError,
                          lambda x: next(smart_open.s3_iter_bucket(x)),
                          mybucket)

        # when get_contents_as_string raises SSLError only once, we can still recover
        mykey.get_contents_as_string.side_effect = [SSLError, b"contentA"]
        key, content = next(smart_open.s3_iter_bucket(mybucket))
        self.assertEqual(key, mykey)
        self.assertEqual(content, b"contentA")

        # when get_contents_as_string fails up to three times, we can still recover
        mykey.get_contents_as_string.side_effect = [
            SSLError, SSLError, SSLError, b"contentA"
        ]
        key, content = next(smart_open.s3_iter_bucket(mybucket))
        self.assertEqual(key, mykey)
        self.assertEqual(content, b"contentA")

        # but not more than three times ....
        mykey.get_contents_as_string.side_effect = [
            SSLError, SSLError, SSLError, SSLError, b"contentA"
        ]
        self.assertRaises(SSLError,
                          lambda x: next(smart_open.s3_iter_bucket(x)),
                          mybucket)

        # unless you specify more retries ....
        mykey.get_contents_as_string.side_effect = [
            SSLError, SSLError, SSLError, SSLError, b"contentA"
        ]
        key, content = next(smart_open.s3_iter_bucket(mybucket, retries=4))
        self.assertEqual(key, mykey)
        self.assertEqual(content, b"contentA")

        # any other exception fails immediately and is never retried
        mykey.get_contents_as_string.side_effect = [Exception, b"contentA"]
        self.assertRaises(Exception,
                          lambda x: next(smart_open.s3_iter_bucket(x)),
                          mybucket)
Example #7
def s3_get_articles(issue, bucket, workers=None):
    """Read a newspaper issue from S3 and return the articles it contains.

    :param issue: the newspaper issue
    :type issue: an instance of `impresso_commons.path.IssueDir`
    :param bucket: the input s3 bucket
    :type bucket: `boto.s3.bucket.Bucket`
    :param workers: number of workers for s3_iter_bucket; if None, defaults to the number of detected CPUs.
    :return: a list of articles (dictionaries)

    NB: Content items with type = "ad" (advertisement) are filtered out.
    """
    nb_workers = _get_cores() if workers is None else workers
    issue_data = list(
        s3_iter_bucket(bucket, prefix=issue.path, workers=nb_workers))
    # the issue prefix matches a single key; take that key's content
    issue_data = issue_data[0][1]
    issue_json = json.loads(issue_data.decode('utf-8'))
    articles = [
        item for item in issue_json["i"] if item["m"]["tp"] == "article"
    ]
    return articles
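
A hypothetical invocation; issue stands in for an impresso_commons.path.IssueDir instance and bucket for a boto bucket, both created by the caller (workers=4 is an arbitrary choice):

articles = s3_get_articles(issue, bucket, workers=4)
print(len(articles), 'articles kept after filtering')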
Example #8
import sys

from boto.s3.connection import S3Connection
from smart_open import s3_iter_bucket

# credentials are passed on the command line: <access_key> <secret_key>
conn = S3Connection(sys.argv[1], sys.argv[2])
bucket = conn.get_bucket('s3-acpcontent')

for key, content in s3_iter_bucket(bucket, accept_key=lambda key: key.endswith('.json')):
    print(key, len(content))
Example #9
def smart_walk(path,
               load=False,
               raise_exc=False,
               accept_path=const_true,
               **s3_args):
    """Walk a local directory tree or an s3:// prefix, optionally loading file contents."""

    # TODO spaghetti code - disentangle!

    url = urlparse(to_str(path)) if isinstance(path, (bytes, str)) else path
    path = url.path if url.path.endswith(
        os.path.sep) else url.path + os.path.sep

    if url.scheme == "s3":
        try:
            import boto

            bucket = boto.connect_s3().get_bucket(url.hostname, validate=True)
        except Exception as exc:
            LOGGER.exception(exc)
            if raise_exc:
                raise exc
            return

        path = path[1:]

        if not load:
            try:
                for obj in bucket.list(prefix=path):
                    if accept_path(obj.key):
                        yield smart_url(scheme="s3",
                                        hostname=url.hostname,
                                        path=obj.key), None

            except Exception as exc:
                LOGGER.exception(exc)
                if raise_exc:
                    raise exc

            return

        try:
            from smart_open import s3_iter_bucket
        except ImportError as exc:
            LOGGER.error("<smart_open> library must be importable")
            LOGGER.exception(exc)
            if raise_exc:
                raise exc
            return

        try:
            for key, content in s3_iter_bucket(bucket,
                                               prefix=path,
                                               accept_key=accept_path,
                                               **s3_args):
                yield smart_url(scheme="s3",
                                hostname=bucket.name,
                                path=key.key), content

        except Exception as exc:
            LOGGER.exception(exc)
            if raise_exc:
                raise exc

        return

    path = os.path.abspath(path)

    for sub_dir, _, file_paths in os.walk(path):
        for file_path in file_paths:
            file_path = os.path.join(sub_dir, file_path)

            if not accept_path(file_path):
                continue

            url = smart_url(scheme="file", path=file_path)

            if not load:
                yield url, None
                continue

            try:
                with open(file_path, "rb") as file_obj:
                    yield url, file_obj.read()

            except Exception as exc:
                LOGGER.exception(exc)
                if raise_exc:
                    raise exc
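
A minimal sketch of driving smart_walk over an S3 prefix; the URL is a placeholder, and load=True requests the content bytes alongside each URL:

# content is None when load=False; with load=True it holds the raw bytes
for url, content in smart_walk('s3://mybucket/some/prefix/', load=True):
    print(url, 0 if content is None else len(content))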