Exemplo n.º 1
0
 def test_s3(self, boto3, raw_config_parser, config_exists):
     """The s3 property should build a boto3 client using the stored
     credentials as keyword arguments."""
     config_exists.return_value = True
     remote = RemoteDatasets()
     remote.credentials = dict(test=42)
     self.assertIsNotNone(remote.s3)
     boto3.client.assert_called_once_with('s3', test=42)
     # Removed dead trailing `remote.client = remote.s3`: it ran after
     # every assertion and nothing checked it — leftover debugging code.
Exemplo n.º 2
0
 def test_delete(self, bucket, s3, config_exists, print_):
     """Deleting a remote dataset should issue a single delete_object
     call against the configured bucket with the file name as key."""
     bucket.return_value = 'serenata-de-amor-data'
     subject = RemoteDatasets()
     subject.delete('42.csv')
     expected = dict(Bucket='serenata-de-amor-data', Key='42.csv')
     s3.return_value.delete_object.assert_called_once_with(**expected)
Exemplo n.º 3
0
class Datasets:
    """
    This is a wrapper for three different classes that together handle the
    datasets (locally and remotely).

    The Datasets class takes one argument: the path to the local directory of
    the dataset files (e.g. data/ or /tmp/serenata-data). The argument is
    optional and the default value is data/ (following the default usage in
    the main repo, serenata-de-amor).

    The remote part of the class expects to find Amazon credentials in a
    config.ini file with an Amazon section (e.g. config.ini.example).

    Inside this object there are three main objects: local, remote, and
    downloader:

    * `Datasets.local` handles listing all local datasets through the property
      `Datasets.local.all` (hint: it's a generator) and deleting local datasets
      with the method `Datasets.local.delete(filename)`;

    * `Datasets.remote` has the `Datasets.remote.all` property (hint: it's also
      a generator) and the `Datasets.remote.delete(filename)` method just like
      its local equivalent; in addition to them this object offers the
      `Datasets.remote.upload(file_path)` method to upload a local file to the
      remote bucket; `Datasets.remote` does not handle downloads because
      `boto3` does not support `asyncio` and we prefer to use async tasks to
      allow the download of more than one file in parallel;

    * `Datasets.downloader` implements an async manager to download files from
      the remote bucket. Its `Datasets.downloader.download(files)` method takes
      the path for a single file (str) as argument or an iterable of paths
      (str).

    Additionally this wrapper implements the `Datasets.upload_all()` method to
    upload all local datasets that are not present in the remote bucket.

    :param local_directory: (str) path to local directory of the datasets
    """
    def __init__(self, local_directory=None):
        # Fall back to the conventional data/ directory when no path is given
        # (also covers an explicit empty string).
        if not local_directory:
            local_directory = 'data'

        self.local = LocalDatasets(local_directory)
        self.remote = RemoteDatasets()
        self.downloader = Downloader(local_directory,
                                     bucket=self.remote.bucket,
                                     **self.remote.credentials)

    @property
    def pending(self):
        """Files that are in the local datasets but not in S3."""
        local = set(self.local.all)
        remote = set(self.remote.all)
        yield from (local - remote)

    def upload_all(self):
        """Upload every local dataset that is not yet in the remote bucket."""
        for file_name in self.pending:
            full_path = os.path.join(self.local.directory, file_name)
            self.remote.upload(full_path)
Exemplo n.º 4
0
 def test_upload(self, bucket, s3, config_exists, print_):
     """Uploading a local file should forward the full path, the bucket
     name and the base name (key) to boto3's upload_file."""
     bucket.return_value = 'serenata-de-amor-data'
     file_path = '/root/serenata/data/42.csv'
     subject = RemoteDatasets()
     subject.upload(file_path)
     upload_file = s3.return_value.upload_file
     upload_file.assert_called_once_with(
         file_path, 'serenata-de-amor-data', '42.csv')
Exemplo n.º 5
0
    def __init__(self, local_directory=None):
        """
        :param local_directory: (str) path to the local directory of the
            dataset files; defaults to data/
        """
        # Fall back to the conventional data/ directory when no path is given
        # (also covers an explicit empty string).
        if not local_directory:
            local_directory = 'data'

        self.local = LocalDatasets(local_directory)
        self.remote = RemoteDatasets()
        # NOTE(review): assumes RemoteDatasets always exposes a bucket and a
        # credentials dict, even when no config file is present — confirm.
        self.downloader = Downloader(local_directory,
                                     bucket=self.remote.bucket,
                                     **self.remote.credentials)
Exemplo n.º 6
0
 def test_init_with_old_config(self, raw_config_parser, boto3, print_, config_exists):
     """A region still carrying the old service prefix (e.g. s3-test)
     should make the constructor print a migration warning."""
     raw_config_parser.return_value.get.return_value = 's3-test'
     RemoteDatasets()
     warning = (
         'It looks like you have an old version of the config.ini file. We '
         'do not need anymore the service (s3) appended to the region '
         '(sa-east-1). Please update your config.ini replacing regions '
         'like `s3-sa-east-1` by `sa-east-1`.'
     )
     print_.assert_called_once_with(warning)
Exemplo n.º 7
0
    def __init__(self, local_directory=None):
        """
        :param local_directory: (str) path to the local directory of the
            dataset files; defaults to data/
        """
        # Fall back to the conventional data/ directory when no path is given
        # (also covers an explicit empty string).
        if not local_directory:
            local_directory = 'data'

        self.local = LocalDatasets(local_directory)
        self.remote = RemoteDatasets()
        # NOTE(review): assumes RemoteDatasets always exposes a bucket and a
        # credentials dict, even when no config file is present — confirm.
        self.downloader = Downloader(
            local_directory,
            bucket=self.remote.bucket,
            **self.remote.credentials
        )
Exemplo n.º 8
0
 def test_successful_init(self, raw_config_parser, boto3, config_exists):
     """With a complete config file the constructor should load both the
     bucket name and the AWS credentials."""
     values = ('YOUR_ACCESS_KEY', 'YOUR_SECRET_KEY', 'sa-east-1',
               'serenata-de-amor-data')
     raw_config_parser.return_value.get.side_effect = values
     subject = RemoteDatasets()
     self.assertEqual('serenata-de-amor-data', subject.bucket)
     self.assertEqual({
         'aws_access_key_id': 'YOUR_ACCESS_KEY',
         'aws_secret_access_key': 'YOUR_SECRET_KEY',
         'region_name': 'sa-east-1'
     }, subject.credentials)
Exemplo n.º 9
0
 def test_bucket(self, raw_config_parser, config_exists):
     """The bucket property should mirror the value read from config.ini."""
     config_exists.return_value = True
     raw_config_parser.return_value.get.return_value = "42"
     self.assertEqual("42", RemoteDatasets().bucket)
Exemplo n.º 10
0
 def test_bucket_no_section(self, raw_config_parser, config_exists):
     """A config file missing the expected section should result in no
     bucket rather than an error."""
     config_exists.return_value = True
     raw_config_parser.return_value.get.side_effect = NoSectionError('foo')
     subject = RemoteDatasets()
     self.assertIsNone(subject.bucket)
Exemplo n.º 11
0
 def test_bucket_no_config(self, config_exists):
     """Without a config file there is no bucket to report."""
     config_exists.return_value = False
     self.assertIsNone(RemoteDatasets().bucket)
Exemplo n.º 12
0
 def test_init_without_config(self, print_, config_exist):
     """Missing config file: no s3 client, no bucket, and a warning
     should be printed."""
     config_exist.return_value = False
     subject = RemoteDatasets()
     self.assertIsNone(subject.s3)
     self.assertIsNone(subject.bucket)
     self.assertTrue(print_.called)
Exemplo n.º 13
0
 def test_config_exists_when_it_is_a_file(self, is_file, exists):
     """config_exists should be truthy when the path exists and is a
     regular file."""
     exists.return_value = True
     is_file.return_value = True
     self.assertTrue(RemoteDatasets().config_exists)
Exemplo n.º 14
0
 def test_all(self, bucket, s3, config_exists):
     """The all property should yield the object keys returned by the
     bucket listing."""
     contents = [{'Key': 'file1.xz'}, {'Key': 'file2.xz'}]
     s3.return_value.list_objects.return_value = {'Contents': contents}
     bucket.return_value = 'bucket'
     subject = RemoteDatasets()
     self.assertEqual(('file1.xz', 'file2.xz'), tuple(subject.all))
Exemplo n.º 15
0
 def test_config_exists_when_it_doesnt_exist(self, is_file, exists):
     """config_exists should be falsy when the path does not exist."""
     exists.return_value = False
     is_file.return_value = False
     self.assertFalse(RemoteDatasets().config_exists)
Exemplo n.º 16
0
class Datasets:
    """
    This is a wrapper for three different classes that together handle the
    datasets (locally and remotely).

    The Datasets class takes one argument: the path to the local directory of
    the dataset files (e.g. data/ or /tmp/serenata-data). The argument is
    optional and the default value is data/ (following the default usage in
    the main repo, serenata-de-amor).

    The remote part of the class expects to find Amazon credentials in a
    config.ini file with an Amazon section (e.g. config.ini.example).

    Inside this object there are three main objects: local, remote, and
    downloader:

    * `Datasets.local` handles listing all local datasets through the property
      `Datasets.local.all` (hint: it's a generator) and deleting local datasets
      with the method `Datasets.local.delete(filename)`;

    * `Datasets.remote` has the `Datasets.remote.all` property (hint: it's also
      a generator) and the `Datasets.remote.delete(filename)` method just like
      its local equivalent; in addition to them this object offers the
      `Datasets.remote.upload(file_path)` method to upload a local file to the
      remote bucket; `Datasets.remote` does not handle downloads because
      `boto3` does not support `asyncio` and we prefer to use async tasks to
      allow the download of more than one file in parallel;

    * `Datasets.downloader` implements an async manager to download files from
      the remote bucket. Its `Datasets.downloader.download(files)` method takes
      the path for a single file (str) as argument or an iterable of paths
      (str).

    Additionally this wrapper implements the `Datasets.upload_all()` method to
    upload all local datasets that are not present in the remote bucket.

    :param local_directory: (str) path to local directory of the datasets
    """

    def __init__(self, local_directory=None):
        # Fall back to the conventional data/ directory when no path is given
        # (also covers an explicit empty string).
        if not local_directory:
            local_directory = 'data'

        self.local = LocalDatasets(local_directory)
        self.remote = RemoteDatasets()
        self.downloader = Downloader(
            local_directory,
            bucket=self.remote.bucket,
            **self.remote.credentials
        )

    @property
    def pending(self):
        """Files that are in the local datasets but not in S3."""
        local = set(self.local.all)
        remote = set(self.remote.all)
        yield from (local - remote)

    def upload_all(self):
        """Upload every local dataset that is not yet in the remote bucket."""
        for file_name in self.pending:
            full_path = os.path.join(self.local.directory, file_name)
            self.remote.upload(full_path)