def test_put_sse(self):
    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket('mybucket')
    s3_client.put(self.tempFilePath, 's3://mybucket/putMe', encrypt_key=True)
    self.assertTrue(s3_client.exists('s3://mybucket/putMe'))

def test_sanity_test_table_task(self, mock_config):
    mock_config.get_config.return_value.get.return_value = AWS_ACCESS_KEY
    t = TestSanityTestDynamoDBTableTask()

    # mock s3 location for writing output token
    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket('mybucket')

    # create table
    table_name = 'dynamo_table1'
    schema = [HashKey('my_hash', data_type=STRING)]
    indexes = [
        AllIndex('IndexName', parts=[
            HashKey('my_hash', data_type=STRING),
            RangeKey('range_index', data_type=NUMBER)
        ])
    ]
    throughput = {'read': 2, 'write': 4}
    client = DynamoDBClient(aws_access_key_id=AWS_ACCESS_KEY,
                            aws_secret_access_key=AWS_SECRET_KEY)
    client.create_table(table_name, schema, throughput, indexes=indexes)

    # pass the callable and its arguments separately; calling luigi.build
    # inline would raise before assertRaises could catch the exception
    self.assertRaises(DynamoDBTaskException,
                      luigi.build, [t], local_scheduler=True)

def test_put_string_sse(self):
    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket('mybucket')
    s3_client.put_string("SOMESTRING", 's3://mybucket/putString', encrypt_key=True)
    self.assertTrue(s3_client.exists('s3://mybucket/putString'))

class CreateRandomData(Task):
    nrows = IntParameter(default=20)
    ncolumns = IntParameter(default=4)
    csv_string = None
    s3_client = S3Client(aws_access_key_id=ACCESS_KEY,
                         aws_secret_access_key=ACCESS_SECRET)

    def requires(self):
        return []

    def output(self):
        s3_filepath = 's3://' + BUCKET + '/random_numbers.csv'
        return S3Target(s3_filepath)

    def run(self):
        df = DataFrame(np.random.rand(self.nrows, self.ncolumns),
                       columns=ALPHABET[0:self.ncolumns])
        df.index.name = 'index'
        output = df.to_string()
        conn = S3Connection(ACCESS_KEY, ACCESS_SECRET)
        bucket = conn.get_bucket(BUCKET)
        k = Key(bucket)  # renamed from `file` to avoid shadowing the builtin
        k.key = 'random_numbers.csv'
        k.set_contents_from_string(output)

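# A minimal usage sketch for the task above (assumes BUCKET, ACCESS_KEY,
# ACCESS_SECRET, and ALPHABET are defined at module level, as the class
# expects):
#
#   luigi.build([CreateRandomData(nrows=100, ncolumns=3)],
#               local_scheduler=True)
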
def test_copy_dir(self):
    """
    Test copying 20 files from one folder to another
    """
    n = 20
    copy_part_size = (1024 ** 2) * 5
    # Note we can't test the multipart copy due to moto issue #526,
    # so here I have to keep the file size smaller than the copy_part_size
    file_size = 5000

    s3_dir = 's3://mybucket/copydir/'
    file_contents = b"a" * file_size
    tmp_file = tempfile.NamedTemporaryFile(mode='wb', delete=True)
    tmp_file_path = tmp_file.name
    tmp_file.write(file_contents)
    tmp_file.flush()

    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket('mybucket')

    for i in range(n):
        file_path = s3_dir + str(i)
        s3_client.put_multipart(tmp_file_path, file_path)
        self.assertTrue(s3_client.exists(file_path))

    s3_dest = 's3://mybucket/copydir_new/'
    s3_client.copy(s3_dir, s3_dest, threads=10, part_size=copy_part_size)

    for i in range(n):
        original_size = s3_client.get_key(s3_dir + str(i)).size
        copy_size = s3_client.get_key(s3_dest + str(i)).size
        self.assertEqual(original_size, copy_size)

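# Background for the sizes above: S3 requires every multipart part except the
# last to be at least 5 MiB, which is why copy_part_size is (1024 ** 2) * 5
# and the test file is kept well below that threshold.
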
def create_target(self, format=None, **kwargs):
    client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    client.s3.create_bucket('mybucket')
    return S3Target('s3://mybucket/test_file', client=client, format=format, **kwargs)

def _get_s3_client(self):
    if not hasattr(self, "client"):
        self.client = S3Client(
            luigi.configuration.get_config().get('s3', 'aws_access_key_id'),
            luigi.configuration.get_config().get('s3', 'aws_secret_access_key'))
    return self.client

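# The lookup above reads credentials from the [s3] section of the luigi
# config file (e.g. luigi.cfg); a minimal example with placeholder values:
#
#   [s3]
#   aws_access_key_id = YOUR_ACCESS_KEY
#   aws_secret_access_key = YOUR_SECRET_KEY
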
def load(client, loadDate, bucket):
    print("loading AdUnits")
    adUnitsMap = getAllAdUnits(client)
    parentedList = addParents(adUnitsMap)

    # remove any existing local files
    for auFile in glob.glob(CONFIG['local']['prefix'] + '*'):
        os.remove(auFile)

    # write data to .CSV
    outFile = (os.path.dirname(CONFIG['local']['prefix']) + "/" +
               CONFIG['local']['name'] + "_" +
               loadDate.strftime(CONFIG['local']['date_format']) + "." +
               CONFIG['local']['format'])
    written = writeAsCsv(parentedList, outFile)

    # copy to S3
    s3File = (bucket + CONFIG['s3']['folder'] +
              loadDate.strftime(CONFIG['s3']['date_format']) +
              CONFIG['s3']['file'])
    # should get authentication data from server boto config
    s3Client = S3Client()
    s3Client.put(outFile, s3File)

    print(str(len(adUnitsMap)) + " adUnit Map size. " +
          str(len(parentedList)) + " parented List size.")

    # save to DB
    saved = saveToDb(parentedList)
    return len(parentedList)

def test_read_iterator_long(self):
    # write a file that is 5X the boto buffersize
    # to test line buffering
    old_buffer = key.Key.BufferSize
    key.Key.BufferSize = 2
    try:
        tempf = tempfile.NamedTemporaryFile(mode='wb', delete=False)
        temppath = tempf.name
        firstline = ''.zfill(key.Key.BufferSize * 5) + os.linesep
        contents = firstline + 'line two' + os.linesep + 'line three'
        tempf.write(contents.encode('utf-8'))
        tempf.close()

        client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
        client.s3.create_bucket('mybucket')
        client.put(temppath, 's3://mybucket/largetempfile')
        t = S3Target('s3://mybucket/largetempfile', client=client)
        with t.open() as read_file:
            lines = [line for line in read_file]
    finally:
        key.Key.BufferSize = old_buffer

    self.assertEqual(3, len(lines))
    self.assertEqual(firstline, lines[0])
    self.assertEqual("line two" + os.linesep, lines[1])
    self.assertEqual("line three", lines[2])

def _run_remote_temp_upload_test(self, file_size):
    file_contents = b"a" * file_size
    tmp_file = tempfile.NamedTemporaryFile(mode='wb', delete=True)
    tmp_file_path = tmp_file.name
    tmp_file.write(file_contents)
    tmp_file.flush()

    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket('mybucket')
    s3_path = 's3://mybucket/remote_test_file'
    t = S3Target(s3_path, client=s3_client, remote_temp_write=True,
                 boto3_session_kwargs={
                     'region_name': 'us-east-1',
                     'aws_access_key_id': AWS_ACCESS_KEY,
                     'aws_secret_access_key': AWS_SECRET_KEY
                 })
    with open(tmp_file_path, 'rb') as source_file:
        with t.open('w') as write_file:
            for line in source_file:
                write_file.write(line)

    self.assertTrue(s3_client.exists(s3_path))

    # renamed from file_size to avoid shadowing the parameter
    local_size = os.path.getsize(tmp_file.name)
    key_size = s3_client.get_key(s3_path).size
    self.assertEqual(local_size, key_size)
    tmp_file.close()

def test_remove(self):
    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket('mybucket')

    self.assertRaises(
        S3ResponseError,
        lambda: s3_client.remove('s3://bucketdoesnotexist/file'))

    self.assertFalse(s3_client.remove('s3://mybucket/doesNotExist'))

    s3_client.put(self.tempFilePath, 's3://mybucket/existingFile0')
    self.assertTrue(s3_client.remove('s3://mybucket/existingFile0'))
    self.assertFalse(s3_client.exists('s3://mybucket/existingFile0'))

    self.assertRaises(InvalidDeleteException,
                      lambda: s3_client.remove('s3://mybucket/'))
    self.assertRaises(InvalidDeleteException,
                      lambda: s3_client.remove('s3://mybucket'))

    s3_client.put(self.tempFilePath, 's3://mybucket/removemedir/file')
    self.assertRaises(
        InvalidDeleteException,
        lambda: s3_client.remove('s3://mybucket/removemedir', recursive=False))

    # test that the marker file created by Hadoop S3 Native FileSystem is removed
    s3_client.put(self.tempFilePath, 's3://mybucket/removemedir/file')
    s3_client.put_string("", 's3://mybucket/removemedir_$folder$')
    self.assertTrue(s3_client.remove('s3://mybucket/removemedir'))
    self.assertFalse(s3_client.exists('s3://mybucket/removemedir_$folder$'))

def test_read(self):
    client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    client.s3.create_bucket('mybucket')
    client.put(self.tempFilePath, 's3://mybucket/tempfile')
    t = S3Target('s3://mybucket/tempfile', client=client)
    read_file = t.open()
    file_str = read_file.read()
    self.assertEqual(self.tempFileContents, file_str)

def test_del(self):
    client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    client.s3.create_bucket('mybucket')
    t = S3Target('s3://mybucket/test_del', client=client)
    p = t.open('w')
    p.write('test\n')
    del p
    self.assertFalse(t.exists())

def test_close(self):
    client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    client.s3.create_bucket('mybucket')
    t = S3Target('s3://mybucket/test_file', client=client)
    p = t.open('w')
    p.write('test\n')
    self.assertFalse(t.exists())
    p.close()
    self.assertTrue(t.exists())

def run(self):
    """
    Generate and print a URL where we can download the graph.
    """
    s3_client = S3Client()
    s3_key = s3_client.get_key(self.s3_path)
    download_url = s3_key.generate_url(expires_in=self.url_expires_in)
    logger.info('DOWNLOAD GRAPH AT: %s', download_url)

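# Note: boto's Key.generate_url returns a time-limited presigned URL; the
# expires_in argument is the lifetime in seconds from when the URL is
# generated.
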
def test_list_key(self):
    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket('mybucket')
    s3_client.put_string("", 's3://mybucket/hello/frank')
    s3_client.put_string("", 's3://mybucket/hello/world')
    self.assertEqual(
        [True, True],
        [x.exists() for x in s3_client.list('s3://mybucket/hello', return_key=True)])

def test_list(self):
    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket('mybucket')
    s3_client.put_string("", 's3://mybucket/hello/frank')
    s3_client.put_string("", 's3://mybucket/hello/world')
    self.assertEqual(['frank', 'world'],
                     list(s3_client.list('s3://mybucket/hello')))

def test_get_as_string(self):
    # put a file on s3 first
    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket('mybucket')
    s3_client.put(self.tempFilePath, 's3://mybucket/putMe')

    contents = s3_client.get_as_string('s3://mybucket/putMe')

    self.assertEqual(contents, self.tempFileContents)

def _read_schema_file(self):
    s3Client = S3Client()
    if not s3Client.exists(self.s3_schema_path()):
        # call the method on self; the bare name would raise a NameError
        raise Exception("No schema file located at %s. Cannot set Redshift columns."
                        % self.s3_schema_path())
    else:
        logger.info("Found schema file %s", self.s3_schema_path())
        schema_key = s3Client.get_key(self.s3_schema_path())
        return schema_key.get_contents_as_string()

def test_write_cleanup_with_error(self):
    client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    client.s3.create_bucket('mybucket')
    t = S3Target('s3://mybucket/test_cleanup2', client=client)
    try:
        with t.open('w'):
            raise Exception('something broke')
    except Exception:
        pass
    self.assertFalse(t.exists())

def test_gzip(self):
    client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    client.s3.create_bucket('mybucket')
    t = S3Target('s3://mybucket/gzip_test', luigi.format.Gzip, client=client)
    p = t.open('w')
    test_data = b'test'  # Gzip format writes bytes
    p.write(test_data)
    self.assertFalse(t.exists())
    p.close()
    self.assertTrue(t.exists())

def test_write_cleanup_no_close(self):
    client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    client.s3.create_bucket('mybucket')
    t = S3Target('s3://mybucket/test_cleanup', client=client)

    def context():
        f = t.open('w')
        f.write('stuff')

    context()
    gc.collect()
    self.assertFalse(t.exists())

def test_init_with_environment_variables(self):
    os.environ['AWS_ACCESS_KEY_ID'] = 'foo'
    os.environ['AWS_SECRET_ACCESS_KEY'] = 'bar'
    # Don't read any existing config
    old_config_paths = configuration.LuigiConfigParser._config_paths
    configuration.LuigiConfigParser._config_paths = [tempfile.mktemp()]

    s3_client = S3Client()
    configuration.LuigiConfigParser._config_paths = old_config_paths

    self.assertEqual(s3_client.s3.aws_access_key_id, 'foo')
    self.assertEqual(s3_client.s3.aws_secret_access_key, 'bar')

def test_mkdir(self):
    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket('mybucket')

    self.assertTrue(s3_client.isdir('s3://mybucket'))
    s3_client.mkdir('s3://mybucket')

    s3_client.mkdir('s3://mybucket/dir')
    self.assertTrue(s3_client.isdir('s3://mybucket/dir'))

    self.assertRaises(MissingParentDirectory,
                      s3_client.mkdir, 's3://mybucket/dir/foo/bar', parents=False)
    self.assertFalse(s3_client.isdir('s3://mybucket/dir/foo/bar'))

def convert(lines, configuration):
    access_key_id = str(configuration['aws_access_key_id'])
    secret_access_key = str(configuration['aws_secret_access_key'])
    bucket = str(configuration['bucket'])
    cfg_filename = str(configuration.get('output_file', ''))

    s3_client = S3Client(access_key_id, secret_access_key)
    targets = {}

    for line in lines:
        try:
            data = json.loads(line)
        except Exception as e:
            raise Exception(errors.PARSING_ERROR % (line, e))

        if 'type' not in data:
            raise Exception(errors.MISSING_KEY_ERROR % ('type', line))

        data_type = data['type']
        if data_type == 'RECORD':
            if 'stream' not in data:
                raise Exception(errors.MISSING_KEY_ERROR % ('stream', line))

            filename = cfg_filename
            if filename == "":
                filename = data['stream'] + '.json'
            target_path = 's3://{bucket}/{filename}'.format(
                bucket=bucket, filename=filename)

            record = data['record']
            # open one target per output path and keep it cached
            if target_path not in targets:
                target = S3Target(target_path, client=s3_client)
                targets[target_path] = {
                    'target': target,
                    'file': target.open('w')
                }
            targets[target_path]['file'].write(json.dumps(record) + '\n')
        else:
            l.WARN(errors.UNEXPECTED_MESSAGE_TYPE % (data['type'], data))

    for target_path in targets:
        targets[target_path]['file'].close()

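# A sketch of the input this converter expects: one JSON message per line in
# the Singer style, where RECORD messages carry 'stream' and 'record' fields
# (the values below are made up for illustration):
#
#   {"type": "RECORD", "stream": "users", "record": {"id": 1, "name": "Ann"}}
#   {"type": "RECORD", "stream": "users", "record": {"id": 2, "name": "Bob"}}
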
def test_is_dir(self):
    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket('mybucket')
    self.assertTrue(s3_client.is_dir('s3://mybucket'))

    s3_client.put(self.tempFilePath, 's3://mybucket/tempdir0_$folder$')
    self.assertTrue(s3_client.is_dir('s3://mybucket/tempdir0'))

    s3_client.put(self.tempFilePath, 's3://mybucket/tempdir1/')
    self.assertTrue(s3_client.is_dir('s3://mybucket/tempdir1'))

    s3_client.put(self.tempFilePath, 's3://mybucket/key')
    self.assertFalse(s3_client.is_dir('s3://mybucket/key'))

def test_get(self):
    # put a file on s3 first
    s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    s3_client.s3.create_bucket('mybucket')
    s3_client.put(self.tempFilePath, 's3://mybucket/putMe')

    tmp_file = tempfile.NamedTemporaryFile(delete=True)
    tmp_file_path = tmp_file.name

    s3_client.get('s3://mybucket/putMe', tmp_file_path)
    self.assertEqual(tmp_file.read(), self.tempFileContents)

    tmp_file.close()

def test_gzip_works_and_cleans_up(self):
    client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    client.s3.create_bucket('mybucket')
    t = S3Target('s3://mybucket/gzip_test', luigi.format.Gzip, client=client)

    test_data = b'123testing'
    with t.open('w') as f:
        f.write(test_data)

    with t.open() as f:
        result = f.read()

    self.assertEqual(test_data, result)

def setUp(self, mock_config):
    f = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    # bytes literal: the file is opened in binary mode
    self.tempFileContents = (
        b"I'm a temporary file for testing\nAnd this is the second line\n"
        b"This is the third.")
    f.write(self.tempFileContents)
    f.close()
    self.tempFilePath = f.name
    self.file_name = f.name[f.name.rindex('/') + 1:]
    self.local_path = f.name[:f.name.rindex('/')]

    self.s3_client = S3Client(AWS_ACCESS_KEY, AWS_SECRET_KEY)
    bucket = self.s3_client.s3.create_bucket('bucket')
    k = Key(bucket)
    k.key = 'key/%s' % self.file_name

    mock_config.get_config.return_value.get.return_value = AWS_ACCESS_KEY

def is_empty(self):
    s3 = boto3.resource('s3')
    (bucket, key) = S3Client._path_to_bucket_and_key(self.path)
    return s3.ObjectSummary(bucket, key).size == 0

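# Note: ObjectSummary is lazy, so reading .size here triggers a HEAD request;
# if the key does not exist, botocore raises a ClientError (404), so callers
# may want an existence check first.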