Example #1
    def concatenate_from(self,
                         other_locs: List['BaseFileUrl']) -> Optional[int]:
        if not all([
                isinstance(loc, S3FileUrl) and loc.bucket == self.bucket
                for loc in other_locs
        ]):
            logger.warning(
                "Concatenating data locally - this may be slow for large data sets"
            )
            return super().concatenate_from(other_locs)
        job = S3Concat(
            self.bucket,
            self.key,
            session=self._boto3_session,
            # We want one file at the end--S3Concat's other
            # job in life is concatting small log files
            # into larger ones, where a minimum file size
            # is a hint for when to stop combining files
            # together.
            min_file_size=None)
        for loc in other_locs:
            assert isinstance(loc, S3FileUrl)  # keep mypy happy
            # Add the file; this can be called multiple times to add files from other directories
            job.add_file(loc.key)

        out = job.concat()
        # with min_file_size=None, this should produce only a single output file
        assert len(out) == 1

        return None
Example #2
def test_concat_gzip_content():
    """Create 2 gzip files, then use s3concat to create a single gzip file
    To test, un-compress and read contents of the concat'd file
    """
    import gzip
    import tempfile
    session = boto3.session.Session()
    s3 = session.client('s3')
    # Need to create the bucket since this is in Moto's 'virtual' AWS account
    s3.create_bucket(Bucket='my-bucket')

    file1 = tempfile.NamedTemporaryFile()
    with gzip.open(file1.name, 'wb') as f:
        f.write(b"file 1 contents\n")
    s3.upload_file(file1.name, 'my-bucket', 'some_folder/thing1.gz')

    file2 = tempfile.NamedTemporaryFile()
    with gzip.open(file2.name, 'wb') as f:
        f.write(b"file 2 contents\n")
    s3.upload_file(file2.name, 'my-bucket', 'some_folder/thing2.gz')

    concat = S3Concat('my-bucket', 'all_data.gz', None, session=session)
    concat.add_files('some_folder')
    concat.concat()

    all_data_file = tempfile.NamedTemporaryFile()

    s3.download_file('my-bucket', 'all_data.gz', all_data_file.name)

    with gzip.open(all_data_file.name, 'rb') as f:
        content_output = f.read()

    assert content_output == b'file 1 contents\nfile 2 contents\n'
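Note: the test snippets above and below rely on module-level imports and on Moto's S3 mock being active, which the excerpts do not show. A minimal sketch of that setup, assuming the package's documented import path and moto's decorator-based mock (named mock_s3 in moto 4.x, mock_aws in 5.x):

import boto3
from moto import mock_s3          # assumption: moto 4.x; use mock_aws on moto 5.x
from s3_concat import S3Concat    # assumption: import path of the s3-concat package

@mock_s3
def test_concat_gzip_content():
    ...  # test body as shown above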
Example #3
def test_concat_text_file():
    session = boto3.session.Session()
    s3 = session.client('s3')
    # Need to create the bucket since this is in Moto's 'virtual' AWS account
    s3.create_bucket(Bucket='my-bucket')
    s3.put_object(
        Bucket='my-bucket',
        Key='some_folder/thing1.json',
        Body=b'Thing1\n',
    )
    s3.put_object(
        Bucket='my-bucket',
        Key='some_folder/thing2.json',
        Body=b'Thing2\n',
    )

    concat = S3Concat('my-bucket', 'all_things.json', None, session=session)
    concat.add_files('some_folder')
    concat.concat()

    concat_output = s3.get_object(
        Bucket='my-bucket',
        Key='all_things.json'
    )['Body'].read().decode('utf-8')

    assert concat_output == 'Thing1\nThing2\n'
Example #4
def concat_json(bucket, path_to_concat, concatenated_file, min_file_size=None):
    """Concatenate the files under path_to_concat into concatenated_file (JSON content type)."""
    if data_exists(bucket, path_to_concat):
        job = S3Concat(
            bucket, concatenated_file, min_file_size, content_type="application/json",
        )
        job.add_files(path_to_concat)
        job.concat(small_parts_threads=4)
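The data_exists helper is not defined in this snippet. A minimal sketch of one plausible implementation, assuming it only checks whether any objects exist under the given prefix:

import boto3

def data_exists(bucket, path_to_concat):
    # Hypothetical helper: list at most one key under the prefix;
    # any match means there is data to concatenate.
    s3 = boto3.client('s3')
    resp = s3.list_objects_v2(Bucket=bucket, Prefix=path_to_concat, MaxKeys=1)
    return resp.get('KeyCount', 0) > 0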
Example #5
def test_add_file():
    session = boto3.session.Session()
    s3 = session.client('s3')
    # Need to create the bucket since this is in Moto's 'virtual' AWS account
    s3.create_bucket(Bucket='my-bucket')
    s3.put_object(
        Bucket='my-bucket',
        Key='some_folder/thing1.json',
        Body=b'{"foo": "Test File Contents"}',
    )

    tar = S3Concat('my-bucket', 'all_data.json', '10MB', session=session)
    tar.add_file('some_folder/thing1.json')

    # all_files stores (key, size-in-bytes) tuples; 29 == len(b'{"foo": "Test File Contents"}')
    assert tar.all_files == [('some_folder/thing1.json', 29)]