def concatenate_from(self, other_locs: List['BaseFileUrl']) -> Optional[int]:
    if not all(isinstance(loc, S3FileUrl) and loc.bucket == self.bucket
               for loc in other_locs):
        logger.warning(
            "Concatenating data locally - this may be slow for large data sets"
        )
        return super().concatenate_from(other_locs)
    job = S3Concat(
        self.bucket,
        self.key,
        session=self._boto3_session,
        # We want one file at the end--S3Concat's other
        # job in life is concatenating small log files
        # into larger ones, where a minimum file size
        # is a hint for when to stop combining files
        # together.
        min_file_size=None)
    for loc in other_locs:
        assert isinstance(loc, S3FileUrl)  # keep mypy happy
        # add_file() can be called repeatedly, including for keys
        # that live under other directories
        job.add_file(loc.key)
    out = job.concat()
    # With min_file_size=None, concat() should produce a single output file
    assert len(out) == 1
    return None
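
# A minimal usage sketch for concatenate_from(). The S3FileUrl
# constructor shown here (parsing an s3:// URL), the bucket, and the
# keys are illustrative assumptions, not taken from the source.
def example_concatenate_from():
    target = S3FileUrl('s3://my-bucket/combined/output.json')
    parts = [
        S3FileUrl('s3://my-bucket/parts/part-0.json'),
        S3FileUrl('s3://my-bucket/parts/part-1.json'),
    ]
    # Every part shares the target's bucket, so the method takes the
    # S3Concat fast path; a part in another bucket would trigger the
    # slower local fallback with a warning.
    target.concatenate_from(parts)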
def test_concat_gzip_content():
    """Create 2 gzip files, then use s3concat to create a single gzip file

    To test, uncompress and read the contents of the concatenated file
    """
    import gzip
    import tempfile

    session = boto3.session.Session()
    s3 = session.client('s3')
    # Need to create the bucket since this is in Moto's 'virtual' AWS account
    s3.create_bucket(Bucket='my-bucket')

    file1 = tempfile.NamedTemporaryFile()
    with gzip.open(file1.name, 'wb') as f:
        f.write(b"file 1 contents\n")
    s3.upload_file(file1.name, 'my-bucket', 'some_folder/thing1.gz')

    file2 = tempfile.NamedTemporaryFile()
    with gzip.open(file2.name, 'wb') as f:
        f.write(b"file 2 contents\n")
    s3.upload_file(file2.name, 'my-bucket', 'some_folder/thing2.gz')

    concat = S3Concat('my-bucket', 'all_data.gz', None, session=session)
    concat.add_files('some_folder')
    concat.concat()

    all_data_file = tempfile.NamedTemporaryFile()
    s3.download_file('my-bucket', 'all_data.gz', all_data_file.name)
    with gzip.open(all_data_file.name, 'rb') as f:
        content_output = f.read()

    assert content_output == b'file 1 contents\nfile 2 contents\n'
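
# Why byte-level concatenation of .gz objects yields a valid archive:
# the gzip format allows multiple members back-to-back in one stream,
# and Python's gzip module reads across member boundaries. A quick
# standalone check of that property, no S3 involved:
def example_gzip_multi_member():
    import gzip
    part1 = gzip.compress(b"file 1 contents\n")
    part2 = gzip.compress(b"file 2 contents\n")
    # gzip.decompress() consumes every member in a multi-member stream
    assert gzip.decompress(part1 + part2) == b"file 1 contents\nfile 2 contents\n"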
def test_concat_text_file():
    session = boto3.session.Session()
    s3 = session.client('s3')
    # Need to create the bucket since this is in Moto's 'virtual' AWS account
    s3.create_bucket(Bucket='my-bucket')

    s3.put_object(
        Bucket='my-bucket',
        Key='some_folder/thing1.json',
        Body=b'Thing1\n',
    )
    s3.put_object(
        Bucket='my-bucket',
        Key='some_folder/thing2.json',
        Body=b'Thing2\n',
    )

    concat = S3Concat('my-bucket', 'all_things.json', None, session=session)
    concat.add_files('some_folder')
    concat.concat()

    concat_output = s3.get_object(
        Bucket='my-bucket',
        Key='all_things.json',
    )['Body'].read().decode('utf-8')

    assert concat_output == 'Thing1\nThing2\n'
def concat_json(bucket, path_to_concat, concatenated_file, min_file_size=None):
    if data_exists(bucket, path_to_concat):
        job = S3Concat(
            bucket,
            concatenated_file,
            min_file_size,
            content_type="application/json",
        )
        job.add_files(path_to_concat)
        # small_parts_threads parallelizes the client-side stitching of
        # parts too small for S3's multipart copy
        job.concat(small_parts_threads=4)
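
# data_exists() is an external helper in the source; a plausible
# stand-in is sketched below, and the bucket, prefix, and output key in
# the usage comment are illustrative assumptions.
def data_exists(bucket, prefix):
    """Stand-in (assumed behavior): true if any object sits under prefix."""
    import boto3
    resp = boto3.client('s3').list_objects_v2(
        Bucket=bucket, Prefix=prefix, MaxKeys=1)
    return resp.get('KeyCount', 0) > 0

# Example: merge every JSON part under a prefix into one object; with
# min_file_size left as None, a single output file is produced.
# concat_json('my-bucket', 'exports/2024/', 'exports/2024-combined.json')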
def test_add_file():
    session = boto3.session.Session()
    s3 = session.client('s3')
    # Need to create the bucket since this is in Moto's 'virtual' AWS account
    s3.create_bucket(Bucket='my-bucket')

    s3.put_object(
        Bucket='my-bucket',
        Key='some_folder/thing1.json',
        Body=b'{"foo": "Test File Contents"}',
    )

    tar = S3Concat('my-bucket', 'all_data.json', '10MB', session=session)
    tar.add_file('some_folder/thing1.json')

    # all_files records (key, size); 29 == len(b'{"foo": "Test File Contents"}')
    assert tar.all_files == [('some_folder/thing1.json', 29)]
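
# In contrast with concatenate_from() above, where min_file_size=None
# forces one output, a size string such as '10MB' tells S3Concat to keep
# grouping inputs until each output file reaches that minimum, so
# concat() may return several keys. A sketch under those assumptions
# (bucket and prefix names are hypothetical):
def example_min_file_size():
    session = boto3.session.Session()
    job = S3Concat('my-bucket', 'logs/combined', '10MB', session=session)
    job.add_files('logs/raw')
    return job.concat()  # list of created keys; possibly more than one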