Exemplo n.º 1
0
class Datasets:
    """
    Facade over the three helper objects that, together, manage the dataset
    files both on the local disk and in the remote bucket.

    The class takes a single, optional argument: the path to the local
    directory holding the dataset files (e.g. data/ or /tmp/serenata-data).
    When it is omitted (or falsy) data/ is used, following the default usage
    in the main repo, serenata-de-amor.

    The remote helper expects to find Amazon credentials in a config.ini
    file with an Amazon section (e.g. config.ini.exemple).

    An instance exposes three main objects: local, remote, and downloader:

    * `Datasets.local` lists all local datasets through the property
      `Datasets.local.all` (hint: it's a generator) and deletes local
      datasets with the method `Datasets.local.delete(filename)`;

    * `Datasets.remote` offers the `Datasets.remote.all` property (hint: it's
      also a generator) and the `Datasets.remote.delete(filename)` method,
      just like its local equivalent; in addition, its
      `Datasets.remote.upload(file_path)` method uploads a local file to the
      remote bucket; `Datasets.remote` does not handle downloads because
      `boto3` does not support `asyncio` and async tasks are preferred so
      that more than one file can be downloaded in parallel;

    * `Datasets.downloader` implements an async manager to download files
      from the remote bucket. Its `Datasets.downloader.download(files)`
      method takes either the path of a single file (str) or an iterable of
      paths (str).

    On top of that, this wrapper implements `Datasets.pending` (local
    datasets missing from the remote bucket) and `Datasets.upload_all()`,
    which uploads every pending dataset.

    :param local_directory: (str) path to local directory of the datasets
    """
    def __init__(self, local_directory=None):
        # Any falsy value (None, empty string) falls back to the default.
        directory = local_directory or 'data'

        self.local = LocalDatasets(directory)
        self.remote = RemoteDatasets()

        # The downloader reuses the bucket and credentials of the remote
        # helper; the credentials are expanded into keyword arguments.
        remote = self.remote
        self.downloader = Downloader(
            directory, bucket=remote.bucket, **remote.credentials
        )

    @property
    def pending(self):
        """Yield the local dataset names that are not present in S3."""
        # Local names are collected first, then the remote ones.
        missing = set(self.local.all) - set(self.remote.all)
        for name in missing:
            yield name

    def upload_all(self):
        """Upload every pending local dataset to the remote bucket."""
        for name in self.pending:
            path = os.path.join(self.local.directory, name)
            self.remote.upload(path)
Exemplo n.º 2
0
 def test_upload(self, bucket, s3, config_exists, print_):
     """Check that `upload` passes the path, bucket and file name on."""
     # NOTE(review): `bucket`, `s3`, `config_exists` and `print_` are
     # presumably mocks injected by patch decorators outside this snippet.
     bucket_name = 'serenata-de-amor-data'
     local_path = '/root/serenata/data/42.csv'
     bucket.return_value = bucket_name

     datasets = RemoteDatasets()
     datasets.upload(local_path)

     # Exactly one upload call is expected: full local path, bucket name
     # and the bare file name as the last argument.
     upload_file = s3.return_value.upload_file
     upload_file.assert_called_once_with(local_path, bucket_name, '42.csv')
Exemplo n.º 3
0
class Datasets:
    """
    Wrapper around three different classes that together take care of the
    datasets, locally and remotely.

    Its single argument is the path to the local directory of the dataset
    files (e.g. data/ or /tmp/serenata-data). The argument is optional and
    defaults to data/ (the default usage in the main repo,
    serenata-de-amor).

    The remote part of the class expects to find Amazon credentials in a
    config.ini file with an Amazon section (e.g. config.ini.exemple).

    Each instance holds three main objects: local, remote, and downloader:

    * `Datasets.local` handles listing all local datasets through the
      property `Datasets.local.all` (hint: it's a generator) and deleting
      local datasets with the method `Datasets.local.delete(filename)`;

    * `Datasets.remote` has the `Datasets.remote.all` property (hint: it's
      also a generator) and the `Datasets.remote.delete(filename)` method,
      mirroring its local equivalent; it additionally offers the
      `Datasets.remote.upload(file_path)` method to upload a local file to
      the remote bucket; it does not handle downloads because `boto3` does
      not support `asyncio`, and async tasks are preferred to allow
      downloading more than one file in parallel;

    * `Datasets.downloader` implements an async manager to download files
      from the remote bucket. Its `Datasets.downloader.download(files)`
      method takes the path of a single file (str) or an iterable of paths
      (str) as argument.

    Finally, the wrapper itself provides the `Datasets.pending` property and
    the `Datasets.upload_all()` method, which uploads all local datasets
    that are not present in the remote bucket.

    :param local_directory: (str) path to local directory of the datasets
    """

    def __init__(self, local_directory=None):
        # None and any other falsy value select the default directory.
        base_directory = local_directory if local_directory else 'data'

        self.local = LocalDatasets(base_directory)
        self.remote = RemoteDatasets()

        # Bucket is read before the credentials, which are then expanded
        # into keyword arguments for the downloader.
        bucket = self.remote.bucket
        credentials = self.remote.credentials
        self.downloader = Downloader(
            base_directory, bucket=bucket, **credentials
        )

    @property
    def pending(self):
        """Generate the names of local datasets that are missing in S3."""
        on_disk = set(self.local.all)
        in_bucket = set(self.remote.all)
        only_on_disk = on_disk - in_bucket
        for dataset in only_on_disk:
            yield dataset

    def upload_all(self):
        """Send each pending dataset from the local directory to remote."""
        for dataset in self.pending:
            source = os.path.join(self.local.directory, dataset)
            self.remote.upload(source)