Example #1
    def from_s3_csv(cls, bucket, key, aws_access_key_id=None, aws_secret_access_key=None,
                    **csvargs):
        """
        Create a ``parsons table`` from a key in an S3 bucket.

        `Args:`
            bucket: str
                The S3 bucket.
            key: str
                The S3 key
            aws_access_key_id: str
                Required if not included as an environment variable.
            aws_secret_access_key: str
                Required if not included as an environment variable.
            \**csvargs: kwargs
                ``csv_reader`` optional arguments
        `Returns:`
            `parsons.Table` object
        """  # noqa: W605

        from parsons import S3
        s3 = S3(aws_access_key_id, aws_secret_access_key)
        file_obj = s3.get_file(bucket, key)

        if files.compression_type_for_path(key) == 'zip':
            file_obj = files.zip_archive.unzip_archive(file_obj)

        return cls(petl.fromcsv(file_obj, **csvargs))
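
A minimal usage sketch of the method above. The bucket and key names are hypothetical placeholders, and the credentials are assumed to come from the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables:

from parsons import Table

# Read a CSV object from S3 straight into a Parsons Table.
# 'my-bucket' and 'path/to/people.csv' are placeholder values.
tbl = Table.from_s3_csv('my-bucket', 'path/to/people.csv')
print(tbl.num_rows)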
Example #2
    def to_s3_csv(self, bucket, key, aws_access_key_id=None,
                  aws_secret_access_key=None, compression=None, encoding=None,
                  errors='strict', write_header=True, public_url=False,
                  public_url_expires=3600, **csvargs):
        """
        Writes the table to an s3 object as a CSV

        `Args:`
            bucket: str
                The s3 bucket to upload to
            key: str
                The s3 key to name the file. If it ends in '.gz' or '.zip', the file will be
                compressed.
            aws_access_key_id: str
                Required if not included as an environment variable
            aws_secret_access_key: str
                Required if not included as an environment variable
            compression: str
                The compression type for the s3 object. Currently "None", "zip" and "gzip" are
                supported. If specified, will override the key suffix.
            encoding: str
                The CSV encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                The encoding error handling scheme; ``'strict'`` (the default) raises an
                error if one is encountered
            write_header: boolean
                Include header in output
            public_url: boolean
                Create a public link to the file
            public_url_expires: int
                The time, in seconds, until the url expires if ``public_url`` is set to
                ``True``. Defaults to 3600.
            \**csvargs: kwargs
                ``csv_writer`` optional arguments
        `Returns:`
            Public url if ``public_url`` is set to ``True``; otherwise ``None``.
        """  # noqa: W605

        compression = compression or files.compression_type_for_path(key)

        csv_name = files.extract_file_name(key, include_suffix=False) + '.csv'

        # Save the CSV as a temp file
        local_path = self.to_csv(temp_file_compression=compression,
                                 encoding=encoding,
                                 errors=errors,
                                 write_header=write_header,
                                 csv_name=csv_name,
                                 **csvargs)

        # Put the file on S3
        from parsons import S3
        self.s3 = S3(aws_access_key_id=aws_access_key_id,
                     aws_secret_access_key=aws_secret_access_key)
        self.s3.put_file(bucket, key, local_path)

        if public_url:
            return self.s3.get_url(bucket, key, expires_in=public_url_expires)
        else:
            return None
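
A short usage sketch, assuming the same environment-variable credentials; the bucket name and key are placeholders. The '.gz' suffix selects gzip compression via compression_type_for_path:

from parsons import Table

tbl = Table([{'id': 1, 'name': 'Jane'}, {'id': 2, 'name': 'Joe'}])

# Upload a gzip-compressed CSV and get back a time-limited public link
# (expires after the default 3600 seconds).
url = tbl.to_s3_csv('my-bucket', 'exports/people.csv.gz', public_url=True)
print(url)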
Example #3
    def to_sftp_csv(self,
                    remote_path,
                    host,
                    username,
                    password,
                    port=22,
                    encoding=None,
                    compression=None,
                    errors='strict',
                    write_header=True,
                    rsa_private_key_file=None,
                    **csvargs):
        """
        Writes the table to a CSV file on a remote SFTP server

        `Args:`
            remote_path: str
                The remote path of the file. If it ends in '.gz', the file will be compressed.
            host: str
                The remote host
            username: str
                The username to access the SFTP server
            password: str
                The password to access the SFTP server
            port: int
                The port number of the SFTP server
            compression: str
                The compression type for the csv. Currently "None", "zip" and "gzip" are
                supported. If specified, will override the remote_path suffix.
            encoding: str
                The CSV encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                The encoding error handling scheme; ``'strict'`` (the default) raises an
                error if one is encountered
            write_header: boolean
                Include header in output
            rsa_private_key_file: str
                Absolute path to a private RSA key used
                to authenticate the SFTP connection
            \**csvargs: kwargs
                ``csv_writer`` optional arguments
        """  # noqa: W605

        from parsons.sftp import SFTP

        sftp = SFTP(host, username, password, port, rsa_private_key_file)

        compression = compression or files.compression_type_for_path(remote_path)

        local_path = self.to_csv(temp_file_compression=compression,
                                 encoding=encoding,
                                 errors=errors,
                                 write_header=write_header,
                                 **csvargs)
        sftp.put_file(local_path, remote_path)
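
A sketch of a call to to_sftp_csv; the host, credentials, and remote path are placeholders, and the '.gz' suffix selects gzip compression:

from parsons import Table

tbl = Table([{'id': 1, 'name': 'Jane'}])

# Write the table to the remote server as a gzip-compressed CSV over SFTP.
tbl.to_sftp_csv('/uploads/people.csv.gz', 'sftp.example.com',
                'my-user', 'my-password')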
Example #4
    def from_s3_csv(cls,
                    bucket,
                    key,
                    from_manifest=False,
                    aws_access_key_id=None,
                    aws_secret_access_key=None,
                    **csvargs):
        """
        Create a ``parsons table`` from a key in an S3 bucket.

        `Args:`
            bucket: str
                The S3 bucket.
            key: str
                The S3 key
            from_manifest: bool
                If True, treats `key` as a manifest file and loads all urls into a `parsons.Table`.
                Defaults to False.
            aws_access_key_id: str
                Required if not included as an environment variable.
            aws_secret_access_key: str
                Required if not included as an environment variable.
            \**csvargs: kwargs
                ``csv_reader`` optional arguments
        `Returns:`
            `parsons.Table` object
        """  # noqa: W605

        from parsons.aws import S3
        s3 = S3(aws_access_key_id, aws_secret_access_key)

        if from_manifest:
            with open(s3.get_file(bucket, key)) as fd:
                manifest = json.load(fd)

            s3_keys = [x["url"] for x in manifest["entries"]]

        else:
            s3_keys = [f"s3://{bucket}/{key}"]

        tbls = []
        for key in s3_keys:
            # TODO handle urls that end with '/', i.e. urls that point to "folders"
            _, _, bucket_, key_ = key.split("/", 3)
            file_ = s3.get_file(bucket_, key_)
            if files.compression_type_for_path(key_) == 'zip':
                file_ = files.zip_archive.unzip_archive(file_)

            tbls.append(petl.fromcsv(file_, **csvargs))

        return cls(petl.cat(*tbls))
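
A sketch of the manifest variant. The manifest object is assumed to be JSON with an "entries" list whose items carry s3:// "url" values, as the code above expects (e.g. a manifest produced by a Redshift UNLOAD); bucket and key are placeholders:

from parsons import Table

# Load every file listed in the manifest and concatenate them into one Table.
tbl = Table.from_s3_csv('my-bucket', 'unload/manifest', from_manifest=True)
print(tbl.num_rows)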
Example #5
def test_compression_type_for_path():
    assert files.compression_type_for_path('some/file') is None
    assert files.compression_type_for_path('some/file.csv') is None
    assert files.compression_type_for_path('some/file.csv.gz') == 'gzip'
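
The suffix detection tested here is what drives the compression handling in the to_*_csv methods above. A small sketch of direct use, assuming `files` is parsons.utilities.files:

from parsons.utilities import files

print(files.compression_type_for_path('report.csv.gz'))  # 'gzip'
print(files.compression_type_for_path('report.csv'))     # None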
Example #6
File: tofrom.py  Project: rdhyee/parsons
    def to_gcs_csv(self,
                   bucket_name,
                   blob_name,
                   app_creds=None,
                   project=None,
                   compression=None,
                   encoding=None,
                   errors='strict',
                   write_header=True,
                   public_url=False,
                   public_url_expires=60,
                   **csvargs):
        """
        Writes the table to a Google Cloud Storage blob as a CSV.

        `Args:`
            bucket_name: str
                The bucket to upload to
            blob_name: str
                The blob to name the file. If it ends in '.gz' or '.zip', the file will be
                compressed.
            app_creds: str
                A credentials json string or a path to a json file. Not required
                if ``GOOGLE_APPLICATION_CREDENTIALS`` env variable set.
            project: str
                The project which the client is acting on behalf of. If not passed
                then will use the default inferred environment.
            compression: str
                The compression type for the csv. Currently "None", "zip" and "gzip" are
                supported. If specified, will override the blob_name suffix.
            encoding: str
                The CSV encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                The encoding error handling scheme; ``'strict'`` (the default) raises an
                error if one is encountered
            write_header: boolean
                Include header in output
            public_url: boolean
                Create a public link to the file
            public_url_expires: int
                The time, in minutes, until the url expires if ``public_url`` is set to
                ``True``. Defaults to 60.
            \**csvargs: kwargs
                ``csv_writer`` optional arguments
        `Returns:`
            Public url if ``public_url`` is set to ``True``; otherwise ``None``.
        """  # noqa: W605

        compression = compression or files.compression_type_for_path(blob_name)

        csv_name = files.extract_file_name(blob_name,
                                           include_suffix=False) + '.csv'

        # Save the CSV as a temp file
        local_path = self.to_csv(temp_file_compression=compression,
                                 encoding=encoding,
                                 errors=errors,
                                 write_header=write_header,
                                 csv_name=csv_name,
                                 **csvargs)

        from parsons.google.google_cloud_storage import GoogleCloudStorage
        gcs = GoogleCloudStorage(app_creds=app_creds, project=project)
        gcs.put_blob(bucket_name, blob_name, local_path)

        if public_url:
            return gcs.get_url(bucket_name,
                               blob_name,
                               expires_in=public_url_expires)
        else:
            return None
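
A usage sketch for to_gcs_csv; the bucket and blob names are placeholders, and credentials are assumed to come from the GOOGLE_APPLICATION_CREDENTIALS environment variable. The '.zip' suffix selects zip compression:

from parsons import Table

tbl = Table([{'id': 1, 'name': 'Jane'}])

# Upload a zip-compressed CSV to Google Cloud Storage and return a signed
# URL that expires after the default 60 minutes.
url = tbl.to_gcs_csv('my-gcs-bucket', 'exports/people.csv.zip', public_url=True)
print(url)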