def test_s3(self, boto3, raw_config_parser, config_exists):
    """The s3 property must lazily build a boto3 client using the credentials."""
    config_exists.return_value = True
    remote = RemoteDatasets()
    remote.credentials = dict(test=42)
    self.assertIsNotNone(remote.s3)
    # credentials are expanded as keyword arguments to boto3.client
    boto3.client.assert_called_once_with('s3', test=42)
    # NOTE: removed dead trailing `remote.client = remote.s3` — it was an
    # assignment after the last assertion that nothing ever checked.
def test_delete(self, bucket, s3, config_exists, print_):
    """Deleting a key must call S3's delete_object with bucket and key."""
    bucket.return_value = 'serenata-de-amor-data'
    subject = RemoteDatasets()
    subject.delete('42.csv')
    client = s3.return_value
    client.delete_object.assert_called_once_with(
        Bucket='serenata-de-amor-data',
        Key='42.csv'
    )
class Datasets:
    """
    This is a wrapper for three different classes that together handle the
    datasets (locally and remotely).

    Datasets takes one argument: the path to the local directory of the
    dataset files (e.g. data/ or /tmp/serenata-data). The argument is
    optional and the default value is data/ (following the default usage
    in the main repo, serenata-de-amor).

    The remote part of the class expects to find Amazon credentials in a
    config.ini file with an Amazon section (e.g. config.ini.exemple).

    Inside this object there are three main objects: local, remote, and
    downloader:

    * `Datasets.local` handles listing all local datasets through the
      property `Datasets.local.all` (hint: it's a generator) and deleting
      local datasets with the method `Datasets.local.delete(filename)`;

    * `Datasets.remote` has the `Datasets.remote.all` property (hint: it's
      also a generator) and the `Datasets.remote.delete(filename)` method,
      just like its local equivalent; in addition this object offers the
      `Datasets.remote.upload(file_path)` method to upload a local file to
      the remote bucket; `Datasets.remote` does not handle downloads
      because `boto3` does not support `asyncio` and we prefer to use
      async tasks to allow the download of more than one file in parallel;

    * `Datasets.downloader` implements an async manager to download files
      from the remote bucket. Its `Datasets.downloader.download(files)`
      takes the path for a single file (str) as argument, or an iterable
      of paths (str).

    Yet this wrapper implements the `Datasets.upload_all()` method to
    upload all local datasets that are not present in the remote bucket.

    :param local_directory: (str) path to local directory of the datasets
    """

    def __init__(self, local_directory=None):
        if not local_directory:
            local_directory = 'data'

        self.local = LocalDatasets(local_directory)
        self.remote = RemoteDatasets()
        self.downloader = Downloader(
            local_directory,
            bucket=self.remote.bucket,
            **self.remote.credentials
        )

    @property
    def pending(self):
        """Files that are in the local datasets but not in S3."""
        local = set(self.local.all)
        remote = set(self.remote.all)
        yield from (local - remote)

    def upload_all(self):
        """Upload every local dataset that is missing from the remote bucket."""
        for file_name in self.pending:
            full_path = os.path.join(self.local.directory, file_name)
            self.remote.upload(full_path)
def test_upload(self, bucket, s3, config_exists, print_):
    """Uploading must pass source path, bucket and key to S3's upload_file."""
    bucket.return_value = 'serenata-de-amor-data'
    source = '/root/serenata/data/42.csv'
    subject = RemoteDatasets()
    subject.upload(source)
    s3.return_value.upload_file.assert_called_once_with(
        source, 'serenata-de-amor-data', '42.csv'
    )
def __init__(self, local_directory=None):
    """Wire up the local, remote and downloader handlers.

    :param local_directory: (str) path to the local datasets directory;
        any falsy value falls back to 'data'.
    """
    directory = local_directory or 'data'
    self.local = LocalDatasets(directory)
    self.remote = RemoteDatasets()
    self.downloader = Downloader(
        directory,
        bucket=self.remote.bucket,
        **self.remote.credentials
    )
def test_init_with_old_config(self, raw_config_parser, boto3, print_, config_exists):
    """A region still prefixed with the service (s3-*) must print a warning."""
    raw_config_parser.return_value.get.return_value = 's3-test'
    RemoteDatasets()
    warning = (
        'It looks like you have an old version of the config.ini file. We '
        'do not need anymore the service (s3) appended to the region '
        '(sa-east-1). Please update your config.ini replacing regions '
        'like `s3-sa-east-1` by `sa-east-1`.'
    )
    print_.assert_called_once_with(warning)
def __init__(self, local_directory=None):
    """Build the local, remote and downloader helpers for this wrapper."""
    if not local_directory:
        local_directory = 'data'
    remote = RemoteDatasets()
    self.local = LocalDatasets(local_directory)
    self.remote = remote
    self.downloader = Downloader(local_directory,
                                 bucket=remote.bucket,
                                 **remote.credentials)
def test_successful_init(self, raw_config_parser, boto3, config_exists):
    """With a complete config, bucket and credentials must be loaded."""
    raw_config_parser.return_value.get.side_effect = (
        'YOUR_ACCESS_KEY',
        'YOUR_SECRET_KEY',
        'sa-east-1',
        'serenata-de-amor-data',
    )
    subject = RemoteDatasets()
    self.assertEqual('serenata-de-amor-data', subject.bucket)
    self.assertEqual(
        {
            'aws_access_key_id': 'YOUR_ACCESS_KEY',
            'aws_secret_access_key': 'YOUR_SECRET_KEY',
            'region_name': 'sa-east-1',
        },
        subject.credentials,
    )
def test_bucket(self, raw_config_parser, config_exists):
    """The bucket name must come straight from the config parser."""
    config_exists.return_value = True
    raw_config_parser.return_value.get.return_value = "42"
    self.assertEqual("42", RemoteDatasets().bucket)
def test_bucket_no_section(self, raw_config_parser, config_exists):
    """A config file missing the expected section must yield no bucket."""
    config_exists.return_value = True
    raw_config_parser.return_value.get.side_effect = NoSectionError('foo')
    self.assertIsNone(RemoteDatasets().bucket)
def test_bucket_no_config(self, config_exists):
    """Without a config file there is no bucket at all."""
    config_exists.return_value = False
    self.assertIsNone(RemoteDatasets().bucket)
def test_init_without_config(self, print_, config_exist):
    """Missing config leaves s3 and bucket unset and warns the user."""
    config_exist.return_value = False
    subject = RemoteDatasets()
    self.assertIsNone(subject.s3)
    self.assertIsNone(subject.bucket)
    self.assertTrue(print_.called)
def test_config_exists_when_it_is_a_file(self, is_file, exists):
    """config_exists is truthy when the path exists and is a regular file."""
    exists.return_value, is_file.return_value = True, True
    self.assertTrue(RemoteDatasets().config_exists)
def test_all(self, bucket, s3, config_exists):
    """The `all` property must yield the keys listed in the bucket."""
    contents = [{'Key': 'file1.xz'}, {'Key': 'file2.xz'}]
    s3.return_value.list_objects.return_value = {'Contents': contents}
    bucket.return_value = 'bucket'
    subject = RemoteDatasets()
    self.assertEqual(('file1.xz', 'file2.xz'), tuple(subject.all))
def test_config_exists_when_it_doesnt_exist(self, is_file, exists):
    """config_exists is falsy when the path is absent."""
    exists.return_value, is_file.return_value = False, False
    self.assertFalse(RemoteDatasets().config_exists)
class Datasets:
    """Wrapper around the three classes that manage datasets locally and
    remotely.

    The constructor takes the path to the local directory holding the
    dataset files (e.g. data/ or /tmp/serenata-data); when omitted it
    defaults to data/, the usual layout of the main serenata-de-amor repo.

    Amazon credentials are expected in a config.ini file with an Amazon
    section (e.g. config.ini.exemple).

    Three collaborators live inside this object:

    * `Datasets.local` — lists every local dataset via the generator
      property `Datasets.local.all` and removes files with
      `Datasets.local.delete(filename)`;

    * `Datasets.remote` — mirrors the local API with the generator
      property `Datasets.remote.all` and `Datasets.remote.delete(filename)`,
      plus `Datasets.remote.upload(file_path)` to push a local file to the
      bucket; downloads are not handled here because `boto3` lacks
      `asyncio` support and we prefer async tasks so several files can be
      fetched in parallel;

    * `Datasets.downloader` — an async download manager whose
      `Datasets.downloader.download(files)` accepts a single path (str) or
      an iterable of paths (str).

    This wrapper also exposes `Datasets.upload_all()`, which uploads every
    local dataset still missing from the remote bucket.

    :param local_directory: (str) path to local directory of the datasets
    """

    def __init__(self, local_directory=None):
        local_directory = local_directory or 'data'
        self.local = LocalDatasets(local_directory)
        self.remote = RemoteDatasets()
        self.downloader = Downloader(
            local_directory,
            bucket=self.remote.bucket,
            **self.remote.credentials
        )

    @property
    def pending(self):
        """Files that are in the local datasets but not in S3."""
        missing = set(self.local.all) - set(self.remote.all)
        yield from missing

    def upload_all(self):
        """Upload every local dataset absent from the remote bucket."""
        for file_name in self.pending:
            self.remote.upload(os.path.join(self.local.directory, file_name))