def test_blank_region(self):
    # A blank region should behave the same as no region at all: the
    # endpoints carry no region component.
    job_runner = EMRJobRunner(conf_path=False, aws_region='')

    emr_endpoint = job_runner.make_emr_conn().endpoint
    assert_equal(emr_endpoint, 'elasticmapreduce.amazonaws.com')

    s3_endpoint = job_runner.make_s3_conn().endpoint
    assert_equal(s3_endpoint, 's3.amazonaws.com')

    # the runner still reports the region as the empty string
    assert_equal(job_runner._aws_region, '')
def test_explicit_endpoints(self):
    # Both a region and explicit endpoints are supplied; the connections
    # are expected to report the explicit endpoints.
    job_runner = EMRJobRunner(
        conf_path=False,
        aws_region='EU',
        s3_endpoint='s3-proxy',
        emr_endpoint='emr-proxy',
    )

    emr_endpoint = job_runner.make_emr_conn().endpoint
    assert_equal(emr_endpoint, 'emr-proxy')

    s3_endpoint = job_runner.make_s3_conn().endpoint
    assert_equal(s3_endpoint, 's3-proxy')
def test_no_region(self):
    # With no region given, the endpoints carry no region component and
    # the runner reports its region as the empty string.
    job_runner = EMRJobRunner(conf_path=False)

    emr_endpoint = job_runner.make_emr_conn().endpoint
    assert_equal(emr_endpoint, 'elasticmapreduce.amazonaws.com')

    s3_endpoint = job_runner.make_s3_conn().endpoint
    assert_equal(s3_endpoint, 's3.amazonaws.com')

    assert_equal(job_runner._aws_region, '')
def test_cleanup(self):
    # s3_cleanup() should delete only the keys older than the cutoff it is
    # given, and should delete nothing at all on a dry run.
    runner = EMRJobRunner(conf_paths=[], s3_sync_wait_time=0.01)

    # add some mock data and change last_modified
    remote_input_path = 's3://walrus/data/'
    self.add_mock_s3_data({
        'walrus': {
            'data/foo': 'foo\n',
            'data/bar': 'bar\n',
            'data/qux': 'qux\n',
        }
    })

    s3_conn = runner.make_s3_conn()
    bucket_name, _ = parse_s3_uri(remote_input_path)
    bucket = s3_conn.get_bucket(bucket_name)

    def fetch_keys():
        # Look the keys up again on every call so that assertions see the
        # bucket's current contents, not objects fetched before a cleanup.
        return (bucket.get_key('data/foo'),
                bucket.get_key('data/bar'),
                bucket.get_key('data/qux'))

    key_foo, key_bar, key_qux = fetch_keys()

    # make sure keys are there before touching their timestamps
    assert isinstance(key_foo, MockKey)
    assert isinstance(key_bar, MockKey)
    assert isinstance(key_qux, MockKey)

    # s3_cleanup() measures age against datetime.utcnow(), so back-date in
    # UTC as well; with local time the 50h-vs-48h case would depend on the
    # machine's UTC offset.
    now = datetime.utcnow()
    key_bar.last_modified = now - timedelta(days=45)
    key_qux.last_modified = now - timedelta(hours=50)

    s3_cleanup(remote_input_path, timedelta(days=30), dry_run=True,
               conf_paths=[])

    # dry-run shouldn't delete anything (re-fetch, otherwise we would only
    # be re-checking the local references obtained above)
    key_foo, key_bar, key_qux = fetch_keys()
    assert isinstance(key_foo, MockKey)
    assert isinstance(key_bar, MockKey)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(days=30), conf_paths=[])

    # make sure key_bar is deleted
    key_foo, key_bar, key_qux = fetch_keys()
    assert isinstance(key_foo, MockKey)
    self.assertEqual(key_bar, None)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(hours=48), conf_paths=[])

    # make sure key_qux is deleted
    key_foo, key_bar, key_qux = fetch_keys()
    assert isinstance(key_foo, MockKey)
    self.assertEqual(key_bar, None)
    self.assertEqual(key_qux, None)
def test_cleanup(self):
    # s3_cleanup() should delete only the keys older than the cutoff it is
    # given, and should delete nothing at all on a dry run.
    runner = EMRJobRunner(conf_paths=[], s3_sync_wait_time=0.01)

    # add some mock data and change last_modified
    remote_input_path = 's3://walrus/data/'
    self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n',
                                      'data/bar': 'bar\n',
                                      'data/qux': 'qux\n'}})

    s3_conn = runner.make_s3_conn()
    bucket_name, _ = parse_s3_uri(remote_input_path)
    bucket = s3_conn.get_bucket(bucket_name)

    def fetch_keys():
        # Look the keys up again on every call so that assertions see the
        # bucket's current contents, not objects fetched before a cleanup.
        return (bucket.get_key('data/foo'),
                bucket.get_key('data/bar'),
                bucket.get_key('data/qux'))

    key_foo, key_bar, key_qux = fetch_keys()

    # make sure keys are there before touching their timestamps
    assert isinstance(key_foo, MockKey)
    assert isinstance(key_bar, MockKey)
    assert isinstance(key_qux, MockKey)

    # s3_cleanup() measures age against datetime.utcnow(), so back-date in
    # UTC as well; with local time the 50h-vs-48h case would depend on the
    # machine's UTC offset.
    now = datetime.utcnow()
    key_bar.last_modified = now - timedelta(days=45)
    key_qux.last_modified = now - timedelta(hours=50)

    s3_cleanup(remote_input_path, timedelta(days=30), dry_run=True,
               conf_paths=[])

    # dry-run shouldn't delete anything (re-fetch, otherwise we would only
    # be re-checking the local references obtained above)
    key_foo, key_bar, key_qux = fetch_keys()
    assert isinstance(key_foo, MockKey)
    assert isinstance(key_bar, MockKey)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(days=30), conf_paths=[])

    # make sure key_bar is deleted
    key_foo, key_bar, key_qux = fetch_keys()
    assert isinstance(key_foo, MockKey)
    self.assertEqual(key_bar, None)
    assert isinstance(key_qux, MockKey)

    s3_cleanup(remote_input_path, timedelta(hours=48), conf_paths=[])

    # make sure key_qux is deleted
    key_foo, key_bar, key_qux = fetch_keys()
    assert isinstance(key_foo, MockKey)
    self.assertEqual(key_bar, None)
    self.assertEqual(key_qux, None)
def s3_cleanup(glob_path, time_old, dry_run=False, conf_path=None):
    """Delete the S3 keys found under *glob_path* that are older than
    *time_old*.

    A key's age is the difference between ``datetime.utcnow()`` and its
    parsed ``last_modified`` timestamp; the key is deleted when that age
    is greater than *time_old*.

    If *dry_run* is ``True``, the keys that would be deleted are only
    logged and nothing is removed.

    *conf_path* is handed through to :py:class:`EMRJobRunner`.
    """
    emr_runner = EMRJobRunner(conf_path=conf_path)
    connection = emr_runner.make_s3_conn()

    log.info("Deleting all files in %s that are older than %s" %
             (glob_path, time_old))

    for uri in emr_runner.ls(glob_path):
        bucket_name, key_prefix = parse_s3_uri(uri)
        listing = connection.get_bucket(bucket_name).list(key_prefix)

        for s3_key in listing:
            modified_at = iso8601_to_datetime(s3_key.last_modified)
            age = datetime.utcnow() - modified_at

            expired = age > time_old
            if not expired:
                continue

            # logged on dry runs too, so they report what would be removed
            log.info("Deleting %s; is %s old" % (s3_key.name, age))
            if dry_run:
                continue
            s3_key.delete()
def s3_cleanup(glob_path, time_old, dry_run=False, conf_paths=None):
    """Delete the S3 keys found under *glob_path* that are older than
    *time_old*.

    A key's age is the difference between ``datetime.utcnow()`` and its
    parsed ``last_modified`` timestamp; the key is deleted when that age
    is greater than *time_old*.

    If *dry_run* is ``True``, the keys that would be deleted are only
    logged and nothing is removed.

    *conf_paths* is handed through to :py:class:`EMRJobRunner`.
    """
    emr_runner = EMRJobRunner(conf_paths=conf_paths)
    connection = emr_runner.make_s3_conn()

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for uri in emr_runner.ls(glob_path):
        bucket_name, key_prefix = parse_s3_uri(uri)
        listing = connection.get_bucket(bucket_name).list(key_prefix)

        for s3_key in listing:
            modified_at = iso8601_to_datetime(s3_key.last_modified)
            age = datetime.utcnow() - modified_at

            expired = age > time_old
            if not expired:
                continue

            # logged on dry runs too, so they report what would be removed
            log.info('Deleting %s; is %s old' % (s3_key.name, age))
            if dry_run:
                continue
            s3_key.delete()
def test_ap_southeast_1(self):
    # For ap-southeast-1 the S3 endpoint is region-specific, while
    # building an EMR connection is expected to raise.
    job_runner = EMRJobRunner(conf_path=False, aws_region='ap-southeast-1')

    s3_endpoint = job_runner.make_s3_conn().endpoint
    assert_equal(s3_endpoint, 's3-ap-southeast-1.amazonaws.com')

    assert_raises(Exception, job_runner.make_emr_conn)
def test_us_west_1(self):
    # For us-west-1 both the EMR and the S3 endpoints are expected to
    # include the region name.
    job_runner = EMRJobRunner(conf_path=False, aws_region='us-west-1')

    emr_endpoint = job_runner.make_emr_conn().endpoint
    assert_equal(emr_endpoint, 'us-west-1.elasticmapreduce.amazonaws.com')

    s3_endpoint = job_runner.make_s3_conn().endpoint
    assert_equal(s3_endpoint, 's3-us-west-1.amazonaws.com')