Example #1
    def execute(self, context):
        # use the super() call to list all files in a Google Cloud Storage bucket
        files = super(GoogleCloudStorageToS3Operator, self).execute(context)
        s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

        if not self.replace:
            # if we are not replacing -> list all files in the S3 bucket
            # and only keep those files which are present in
            # Google Cloud Storage and not in S3
            bucket_name, _ = S3Hook.parse_s3_url(self.dest_s3_key)
            existing_files = s3_hook.list_keys(bucket_name)
            files = set(files) - set(existing_files)

        if files:
            hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to
            )

            for file in files:
                file_bytes = hook.download(self.bucket, file)

                dest_key = self.dest_s3_key + file
                self.log.info("Saving file to %s", dest_key)

                s3_hook.load_bytes(file_bytes,
                                   key=dest_key,
                                   replace=self.replace)

            self.log.info("All done, uploaded %d files to S3", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to S3")

        return files
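For orientation, here is a minimal usage sketch of this operator inside a DAG, assuming Airflow 1.10-era import paths, pre-configured "google_cloud_default" and "aws_default" connections, and placeholder bucket names:

from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.gcs_to_s3 import GoogleCloudStorageToS3Operator

with DAG("gcs_to_s3_example",
         start_date=datetime(2021, 1, 1),
         schedule_interval="@daily") as dag:
    sync_gcs_to_s3 = GoogleCloudStorageToS3Operator(
        task_id="sync_gcs_to_s3",
        bucket="my-gcs-bucket",                    # source GCS bucket
        prefix="exports/",                         # only list keys under this prefix
        google_cloud_storage_conn_id="google_cloud_default",
        dest_aws_conn_id="aws_default",
        dest_s3_key="s3://my-s3-bucket/exports/",  # destination prefix; file names are appended
        replace=False,                             # skip files already present in S3
    )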
Example #2
 def check_for_url(self, s3url):
     """
     check if the s3url exists
     :param s3url: S3 url
     :type s3url:str
     :return: bool
     """
     bucket, key = S3Hook.parse_s3_url(s3url)
     s3hook = S3Hook(aws_conn_id=self.aws_conn_id)
     if not s3hook.check_for_bucket(bucket_name=bucket):
         raise AirflowException(
             "The input S3 Bucket {} does not exist ".format(bucket))
     if key and not s3hook.check_for_key(key=key, bucket_name=bucket)\
        and not s3hook.check_for_prefix(
             prefix=key, bucket_name=bucket, delimiter='/'):
         # check if s3 key exists in the case user provides a single file
         # or if s3 prefix exists in the case user provides a prefix for files
         raise AirflowException("The input S3 Key "
                                "or Prefix {} does not exist in the Bucket {}"
                                .format(s3url, bucket))
     return True
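A minimal sketch of the S3Hook helpers this validator leans on, assuming an Airflow 1.10-era import path and a configured "aws_default" connection (the bucket and key below are placeholders):

from airflow.hooks.S3_hook import S3Hook

# parse_s3_url splits a full S3 URL into its (bucket, key) parts
bucket, key = S3Hook.parse_s3_url("s3://my-bucket/path/to/file.csv")
# bucket == "my-bucket", key == "path/to/file.csv"

hook = S3Hook(aws_conn_id="aws_default")
if hook.check_for_bucket(bucket_name=bucket) and hook.check_for_key(key=key, bucket_name=bucket):
    print("the object exists")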
Example #3
    def execute(self, context):
        # use the super() call to list all files in a Google Cloud Storage bucket
        files = super().execute(context)
        s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

        if not self.replace:
            # if we are not replacing -> list all files in the S3 bucket
            # and only keep those files which are present in
            # Google Cloud Storage and not in S3
            bucket_name, prefix = S3Hook.parse_s3_url(self.dest_s3_key)
            # look for the bucket and the prefix to avoid look into
            # parent directories/keys
            existing_files = s3_hook.list_keys(bucket_name, prefix=prefix)
            # in case no files exist, fall back to an empty list to avoid errors
            existing_files = existing_files if existing_files is not None else []
            # remove the prefix for the existing files to allow the match
            existing_files = [file.replace(prefix, '', 1) for file in existing_files]
            files = list(set(files) - set(existing_files))

        if files:
            hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to
            )

            for file in files:
                file_bytes = hook.download(self.bucket, file)

                dest_key = self.dest_s3_key + file
                self.log.info("Saving file to %s", dest_key)

                s3_hook.load_bytes(file_bytes,
                                   key=dest_key,
                                   replace=self.replace)

            self.log.info("All done, uploaded %d files to S3", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to S3")

        return files
Example #4
    def check_s3_url(self, s3url):
        """
        Check if an S3 URL exists

        :param s3url: S3 url
        :type s3url: str
        :rtype: bool
        """
        bucket, key = S3Hook.parse_s3_url(s3url)
        if not self.s3_hook.check_for_bucket(bucket_name=bucket):
            raise AirflowException(
                "The input S3 Bucket {} does not exist ".format(bucket))
        if key and not self.s3_hook.check_for_key(key=key, bucket_name=bucket)\
           and not self.s3_hook.check_for_prefix(
                prefix=key, bucket_name=bucket, delimiter='/'):
            # check if s3 key exists in the case user provides a single file
            # or if s3 prefix exists in the case user provides multiple files in
            # a prefix
            raise AirflowException("The input S3 Key "
                                   "or Prefix {} does not exist in the Bucket {}"
                                   .format(s3url, bucket))
        return True
Example #5
    def __init__(self, aws_conn_id, s3_bucket, s3_key, execution_date,
                 cass_cluster, *args, **kwargs):

        super(TargetDBWrite, self).__init__(*args, **kwargs)

        self.aws_conn_id = aws_conn_id
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.execution_date = execution_date
        self.s3_hook = S3Hook(self.aws_conn_id)
        aws_session = self.s3_hook.get_credentials()
        self.spark = SparkSession.builder.appName(
            's3_to_cassandra').getOrCreate()
        self.sc = self.spark.sparkContext
        hadoop_conf = self.sc._jsc.hadoopConfiguration()
        hadoop_conf.set("fs.s3.impl",
                        "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
        # logging.info(f'CREDENTIALS : {aws_session}')
        hadoop_conf.set("fs.s3.awsAccessKeyId", aws_session[0])
        hadoop_conf.set("fs.s3.awsSecretAccessKey", aws_session[1])
        cluster = Cluster(cass_cluster)
        self.session = cluster.connect()
Example #6
 def execute(self, context):
     """
     redshift_conn_id: Redshift cluster connection info.
     aws_credentials_id: credentials needed to make the AWS connection.
     s3_bucket: S3 bucket holding the source files to copy from.
     """
     self.log.info('StageToRedshiftOperator not implemented yet')
     hook = S3Hook(self.aws_credentials_id)
     bucket = self.s3bucket
     keys = hook.list_keys(bucket)
     aws_hook = AwsHook(self.aws_credentials_id)
     credentials = aws_hook.get_credentials()
     session = Session(aws_access_key_id=credentials.access_key,
                       aws_secret_access_key=credentials.secret_key)
     keys = os.listdir('/home/workspace/uk-traffic')
     for key in keys:
         session.resource('s3').Bucket(bucket).upload_file('/home/workspace/uk-traffic/' + key, key, 
                                                           ExtraArgs={'ACL': 'public-read'})
     keys2 = os.listdir('/home/workspace/uk-accident')
     for key in keys2:
         session.resource('s3').Bucket(bucket).upload_file('/home/workspace/uk-accident/' + key, key, 
                                                           ExtraArgs={'ACL': 'public-read'})
Example #7
    def execute(self, context):
        self.log.info('Retrieving credentials')
        s3_hook = S3Hook(self.s3_conn_id)

        # render macros to variables
        rendered_s3_bucket = self.s3_bucket.format(**context)
        rendered_s3_directory = self.s3_directory.format(**context)
        rendered_local_directory = self.local_directory.format(**context)

        # save file to S3
        self.log.info('Saving local directory to S3')
        local_file_list = os.listdir(rendered_local_directory)
        for local_file in local_file_list:
            rendered_s3_key = rendered_s3_directory + local_file
            rendered_local_file = rendered_local_directory + local_file
            self.log.info(rendered_s3_key)
            s3_hook.load_file(filename=rendered_local_file,
                              bucket_name=rendered_s3_bucket,
                              key=rendered_s3_key,
                              replace=self.replace)
        self.log.info('Saved {} local files to bucket {}'.format(
            len(local_file_list), rendered_s3_bucket))
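A small, self-contained sketch of the macro-rendering trick used above: str.format(**context) substitutes Airflow template variables such as ds into the templated strings (the context dict is faked here; in a real task it comes from Airflow):

# context normally comes from Airflow; faked here for illustration
context = {"ds": "2021-01-01", "ds_nodash": "20210101"}

s3_directory = "uploads/{ds}/"
rendered_s3_directory = s3_directory.format(**context)
print(rendered_s3_directory)  # uploads/2021-01-01/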
Example #8
    def check_s3_url(self, s3url):
        """
        Check if an S3 URL exists

        :param s3url: S3 url
        :type s3url: str
        :rtype: bool
        """
        bucket, key = S3Hook.parse_s3_url(s3url)
        if not self.s3_hook.check_for_bucket(bucket_name=bucket):
            raise AirflowException(
                "The input S3 Bucket {} does not exist ".format(bucket))
        if key and not self.s3_hook.check_for_key(key=key, bucket_name=bucket)\
           and not self.s3_hook.check_for_prefix(
                prefix=key, bucket_name=bucket, delimiter='/'):
            # check if s3 key exists in the case user provides a single file
            # or if s3 prefix exists in the case user provides multiple files in
            # a prefix
            raise AirflowException("The input S3 Key "
                                   "or Prefix {} does not exist in the Bucket {}"
                                   .format(s3url, bucket))
        return True
Example #9
    def execute(self, context):
        """
        This function executes the transfer from the email server (via imap) into s3.

        :param context: The context while executing.
        :type context: dict
        """
        self.log.info(
            'Transferring mail attachment %s from mail server via imap to s3 key %s...',
            self.imap_attachment_name, self.s3_key)

        with ImapHook(imap_conn_id=self.imap_conn_id) as imap_hook:
            imap_mail_attachments = imap_hook.retrieve_mail_attachments(
                name=self.imap_attachment_name,
                mail_folder=self.imap_mail_folder,
                check_regex=self.imap_check_regex,
                latest_only=True)

        s3_hook = S3Hook(aws_conn_id=self.s3_conn_id)
        s3_hook.load_bytes(bytes_data=imap_mail_attachments[0][1],
                           key=self.s3_key,
                           replace=self.s3_overwrite)
Example #10
    def execute(self, context):
        self.log.info("Going to execute CSV to Json Operator")
        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)

        self.log.info("Downloading S3 File")
        with NamedTemporaryFile() as csv_file:
            source_obj = s3_hook.get_key(self.csv_key, self.csv_bucket)
            with open(csv_file.name, "wb") as opened_csv_file:
                source_obj.download_fileobj(opened_csv_file)

            with open(csv_file.name, "r") as opened_csv_file:
                reader = csv.DictReader(opened_csv_file)
                json_data = self.python_callable(reader)

                try:
                    first_row = next(json_data)
                except StopIteration:
                    self.log.info("Callable didn't return any rows")
                    return False

                self.log.info("Uploading to S3")
                rows = itertools.chain([first_row], json_data)
                with NamedTemporaryFile() as final_file:
                    with open(final_file.name, "w",
                              encoding="utf-8") as opened_final_file:
                        for row in rows:
                            opened_final_file.write(
                                json.dumps(row, ensure_ascii=False))
                            opened_final_file.write("\n")
                        opened_final_file.flush()

                    s3_hook.load_file(
                        filename=final_file.name,
                        key=self.json_key,
                        bucket_name=self.json_bucket,
                        replace=True,
                    )
                self.log.info("Finished executing CSV to JSON Operator")
                return True
Example #11
    def execute(self, context):
        hook = AutopilotHook(http_conn_id=self.autopilot_conn_id)

        results = []

        if self.ids:
            for id in self.ids:
                id_endpoint = "{}/{}".format(self.autopilot_resource, id)

                if self.contacts:
                    results += self.get_all_contacts(hook,
                                                     id_endpoint,
                                                     data=self.payload)
                else:
                    results += self.get(hook, id_endpoint, data=self.payload)
        elif self.contacts:
            results += self.get_all_contacts(hook,
                                             self.autopilot_resource,
                                             data=self.payload)
        else:
            results += self.get(hook,
                                self.autopilot_resource,
                                results_field=self.results_field,
                                data=self.payload)

        with NamedTemporaryFile("w") as tmp:
            for result in results:
                tmp.write(json.dumps(result) + '\n')

            tmp.flush()

            dest_s3 = S3Hook(s3_conn_id=self.s3_conn_id)

            dest_s3.load_file(filename=tmp.name,
                              key=self.s3_key,
                              bucket_name=self.s3_bucket,
                              replace=True)

            dest_s3.connection.close()
Example #12
    def __init__(self,
                 redshift_conn_id: str = "",
                 aws_credentials_id: str = "",
                 target_table: str = "",
                 s3_bucket: Optional[str] = None,
                 s3_key: Optional[str] = None,
                 json_path: Optional[str] = None,
                 ignore_headers: int = 1,
                 delimiter: str = ',',
                 *args, **kwargs):

        super(StageToRedshiftOperator, self).__init__(*args, **kwargs)

        self.redshift_conn_id = redshift_conn_id
        self.aws_credentials_id = aws_credentials_id
        self.target_table = target_table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.json_path = json_path
        self.ignore_headers = ignore_headers
        self.delimiter = delimiter
        self.s3_hook = S3Hook(aws_conn_id=aws_credentials_id)
Example #13
    def setUp(self):
        hook = SSHHook(ssh_conn_id='ssh_default')
        s3_hook = S3Hook('aws_default')
        hook.no_host_key_check = True
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE,
            'provide_context': True
        }
        dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
        dag.schedule_interval = '@once'

        self.hook = hook
        self.s3_hook = s3_hook

        self.ssh_client = self.hook.get_conn()
        self.sftp_client = self.ssh_client.open_sftp()

        self.dag = dag
        self.s3_bucket = BUCKET
        self.sftp_path = SFTP_PATH
        self.s3_key = S3_KEY
Example #14
    def execute(self, context):
        self.log.info('S3DataExistsOperator')
        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)
        rendered_prefix = self.prefix.format(**context)

        success = s3_hook.check_for_bucket(self.bucket)
        if success:
            self.log.info("Found the bucket: {}".format(self.bucket))
        else:
            self.log.info("Invalid bucket: {}".format(self.bucket))
            raise FileNotFoundError("No S3 bucket named {}".format(
                self.bucket))

        success = s3_hook.check_for_prefix(prefix=rendered_prefix,
                                           delimiter='/',
                                           bucket_name=self.bucket)
        if success:
            self.log.info("Found the prefix: {}".format(rendered_prefix))
        else:
            self.log.info("Invalid prefix: {}".format(rendered_prefix))
            raise FileNotFoundError("No prefix named {}/{} ".format(
                self.bucket, rendered_prefix))
Example #15
    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
        credentials = self.s3.get_credentials()
        unload_options = '\n\t\t\t'.join(self.unload_options)

        self.log.info("Retrieving headers from %s.%s...", self.schema, self.table)

        columns_query = """SELECT column_name
                            FROM information_schema.columns
                            WHERE table_schema = '{0}'
                            AND   table_name = '{1}'
                            ORDER BY ordinal_position
                        """.format(self.schema, self.table)

        cursor = self.hook.get_conn().cursor()
        cursor.execute(columns_query)
        rows = cursor.fetchall()
        columns = [row[0] for row in rows]
        column_names = ', '.join("\\'{0}\\'".format(c) for c in columns)
        column_castings = ', '.join("CAST({0} AS text) AS {0}".format(c)
                                    for c in columns)

        unload_query = """
                        UNLOAD ('SELECT {0}
                        UNION ALL
                        SELECT {1} FROM {2}.{3}
                        ORDER BY 1 DESC')
                        TO 's3://{4}/{5}/{3}_'
                        with
                        credentials 'aws_access_key_id={6};aws_secret_access_key={7}'
                        {8};
                        """.format(column_names, column_castings, self.schema, self.table,
                                   self.s3_bucket, self.s3_key, credentials.access_key,
                                   credentials.secret_key, unload_options)

        self.log.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")
Example #16
def copy_events_from_s3_to_redshift(*args, **kwargs):
    table = kwargs['params']['table']
    hook = S3Hook(aws_conn_id='aws_credentials')
    redshift_hook = PostgresHook('redshift')

    # get Variables
    log_data = Variable.get('LOG_DATA')
    arn_iam_role = Variable.get('iam_role')
    region = Variable.get('region')
    log_jsonpath = Variable.get('LOG_JSONPATH')
    logging.info(f"Copying from s3 {log_data} to redshift table {table}")

    # format the COPY_SQL string
    sql_stmt = create_tables.COPY_SQL.format(
        table,
        log_data,
        arn_iam_role,
        region,
        log_jsonpath
    )
    logging.info(f"COPY SQL statement is: {sql_stmt}")
    redshift_hook.run(sql_stmt)
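The COPY_SQL template itself is not part of this snippet; a hypothetical Redshift template consistent with the five positional format() arguments above (table, S3 path, IAM role ARN, region, JSONPaths file) might look like this:

# hypothetical illustration only -- not the original create_tables.COPY_SQL
COPY_SQL = """
COPY {}
FROM '{}'
IAM_ROLE '{}'
REGION '{}'
FORMAT AS JSON '{}'
"""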
Example #17
    def build_copy(self):
        a_key, s_key = S3Hook(s3_conn_id=self.s3_conn_id).get_credentials()
        snowflake_destination = ''

        if self.database:
            snowflake_destination += '{}.'.format(self.database)

        if self.schema:
            snowflake_destination += '{}.'.format(self.schema)

        snowflake_destination += self.table

        fmt_str = {
            'snowflake_destination': snowflake_destination,
            's3_bucket': self.s3_bucket,
            's3_key': self.s3_key,
            'aws_access_key_id': a_key,
            'aws_secret_access_key': s_key,
            'file_format_name': self.file_format_name
        }

        return self.copy.format(**fmt_str)
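The self.copy template is defined elsewhere in that operator; a hypothetical Snowflake COPY statement wired to the same format keys could look like this:

# hypothetical illustration only -- not the operator's actual template
copy = """
COPY INTO {snowflake_destination}
FROM 's3://{s3_bucket}/{s3_key}'
CREDENTIALS = (AWS_KEY_ID='{aws_access_key_id}' AWS_SECRET_KEY='{aws_secret_access_key}')
FILE_FORMAT = (FORMAT_NAME = '{file_format_name}');
"""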
Example #18
    def __init__(
        self,
        s3_bucket,
        s3_key,
        schema,
        table,
        sql=None,
        druid_ingest_spec=None,
        unload_options=tuple(),
        include_header=False,
        autocommit=False,
        aws_conn_id="aws_default",
        redshift_conn_id="postgres_default",
        druid_conn_id="druid_ingest_default",
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.aws_conn_id = aws_conn_id
        self.redshift_conn_id = redshift_conn_id
        self.druid_conn_id = druid_conn_id
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.schema = schema
        self.table = table
        self.sql = sql
        self.druid_ingest_spec = druid_ingest_spec
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.include_header = include_header

        self.pg_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)

        if self.include_header and "PARALLEL OFF" not in [
            uo.upper().strip() for uo in unload_options
        ]:
            self.unload_options = list(unload_options) + ["PARALLEL OFF"]
Example #19
    def execute(self, context):
        facebook_hook = FacebookAdsHook(
            access_token=self.access_token, facebook_ads_conn_id=self.facebook_conn_id
        )
        s3_hook = S3Hook(self.aws_conn_id)

        self.log.info("Fetch API since: %s", str(self.since))
        self.log.info("Fetch API until: %s", str(self.until))
        self.log.info("Breakdowns: %s", str(self.breakdowns))
        self.log.info("Fields: %s", str(self.insight_fields))

        time_range = {"since": self.since, "until": self.until}

        file_name = "/tmp/{key}.jsonl".format(key=uuid.uuid4().hex)

        with open(file_name, "w") as insight_file:
            for account_id in self.account_ids:
                insights = facebook_hook.insights(
                    account_id,
                    self.insight_fields,
                    self.breakdowns,
                    time_range,
                    self.time_increment,
                    self.level,
                    self.limit,
                )

                if len(insights) > 0:
                    for insight in insights:
                        insight_file.write(json.dumps(insight) + "\n")

                s3_hook.load_file(
                    filename=file_name,
                    key=self.s3_key,
                    bucket_name=self.s3_bucket,
                    replace=True,
                )
        os.remove(file_name)
Example #20
    def execute(self, context):
        """
        Executes the operator logic
        :param context:
        """

        self.log.info('StagepayToRedshiftOperator execute')
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(self.aws_credentials_id)
        credentials = self.s3.get_credentials()

        self.log.info("Clearing data from destination Redshift table")
        redshift.run("DELETE FROM {}".format(self.table))

        self.log.info("Copying data from S3 to Redshift")
        rendered_key = self.s3_key.format(**context)
        s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
        self.log.info('StagepayToRedshiftOperator s3_path: ' + s3_path)
        formatted_sql = StagepayToRedshiftOperator.copy_query.format(
            self.table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.format_as_json
        )
        redshift.run(formatted_sql, self.autocommit)
Example #21
def monitor_S3_key(**context):
    """
    S3 monitor will log metrics for the target key, collecting the following metrics:
    - size (MB)
    - content type (MIME type)
    - last modified timestamp
    - metadata associated with the key
    - parts count
    - storage class
    """
    s3_hook = S3Hook(aws_conn_id=AWS_CONN_ID)

    target_path = context["target_s3_path"]
    basename = context["path_basename"]
    log_metric("target file", target_path)

    boto3_key_object = s3_hook.get_key(key=target_path)

    key_metrics = {
        "{}-size(MB)".format(basename): boto3_key_object.content_length / MB,
        "{}-content_type".format(basename): boto3_key_object.content_type,
        "{}-last_modified".format(basename): str(boto3_key_object.last_modified),
        "{}-metadata".format(basename): boto3_key_object.metadata,
        "{}-parts_count".format(basename): boto3_key_object.parts_count,
    }

    key_metrics["{}-storage_class".format(basename)] = (
        boto3_key_object.storage_class
        if boto3_key_object.storage_class else "s3 standard")

    for metric_name, value in key_metrics.items():
        log_metric(metric_name, value)

    context["ti"].xcom_push("{}_key_metrics".format(basename), key_metrics)
Example #22
def upload(**kwargs):
    """ Function to upload all of the output files from Kneaddata and Humann2 and their temporary files """
    s3 = S3Hook()
    output_dir = os.path.abspath('output')
    files = os.listdir(output_dir)
    file_base = kwargs['ti'].xcom_pull(task_ids="parse_filename")

    # upload the top-level output files (directories are skipped)
    for file_name in files:
        if not os.path.isdir(os.path.join(output_dir, file_name)):
            s3.load_file(os.path.join(output_dir, file_name),
                         os.path.join('output', file_name),
                         bucket_name='airflow-project',
                         replace=True)

    # upload the files from the Humann2 temporary directory
    temp_dir = os.path.join(output_dir,
                            file_base + '_kneaddata_paired_humann2_temp')
    for file_name in os.listdir(temp_dir):
        s3.load_file(os.path.join(temp_dir, file_name),
                     os.path.join('output', file_name),
                     bucket_name='airflow-project',
                     replace=True)
Example #23
    def test_execute(self, mock_hook, mock_hook2):
        mock_hook.return_value.list.return_value = MOCK_FILES
        mock_hook.return_value.download.return_value = b"testing"
        mock_hook2.return_value.list.return_value = MOCK_FILES

        operator = GoogleCloudStorageToS3Operator(task_id=TASK_ID,
                                                  bucket=GCS_BUCKET,
                                                  prefix=PREFIX,
                                                  delimiter=DELIMITER,
                                                  dest_aws_conn_id=None,
                                                  dest_s3_key=S3_BUCKET)
        # create dest bucket
        hook = S3Hook(aws_conn_id=None)
        b = hook.get_bucket('bucket')
        b.create()
        b.put_object(Key=MOCK_FILES[0], Body=b'testing')

        # we expect MOCK_FILES[1:] to be uploaded
        # and all MOCK_FILES to be present at the S3 bucket
        uploaded_files = operator.execute(None)
        self.assertEqual(sorted(MOCK_FILES[1:]), sorted(uploaded_files))
        self.assertEqual(sorted(MOCK_FILES),
                         sorted(hook.list_keys('bucket', delimiter='/')))
Example #24
    def execute(self, context):
        postgres_hook = PostgresHook(postgres_conn_id=self._postgres_conn_id)
        s3_hook = S3Hook(aws_conn_id=self._s3_conn_id)

        with postgres_hook.get_cursor() as cursor:
            cursor.execute(self._query)
            results = cursor.fetchall()
            headers = [_[0] for _ in cursor.description]

        data_buffer = io.StringIO()
        csv_writer = csv.writer(data_buffer,
                                quoting=csv.QUOTE_ALL,
                                lineterminator=os.linesep)
        csv_writer.writerow(headers)
        csv_writer.writerows(results)
        data_buffer_binary = io.BytesIO(data_buffer.getvalue().encode())

        s3_hook.load_file_obj(
            file_obj=data_buffer_binary,
            bucket_name=self._s3_bucket,
            key=self._s3_key,
            replace=True,
        )
Example #25
    def _setup_dest_conn(self, dest_conn_id, results_bucket_name,
                         results_dest_name):
        """
        Set up the results connection: retrieve the S3 connection and make sure we have location details (bucket, filename).
        :param dest_conn_id:
        :param results_bucket_name:
        :param results_dest_name:
        """
        conn = BaseHook._get_connection_from_env(dest_conn_id)
        if conn.conn_type != 's3':
            raise AttributeError(
                "Only s3 is allowed as a results destination, not {0}".format(
                    conn.conn_type))

        self.dest_conn = S3Hook(aws_conn_id=dest_conn_id)
        self.dest_conn_id = dest_conn_id

        if results_bucket_name is None or results_dest_name is None:
            raise AttributeError(
                "Specify bucket name and key name to store results")

        self.results_bucket_name = results_bucket_name
        self.results_dest_name = results_dest_name
Example #26
def export_variable():
    session = settings.Session()
    s3_hook = S3Hook()
    s3_client = s3_hook.get_conn()
    query = session.query(Variable)
    allrows = query.all()
    k = ["key", "val", "is_encrypted", "description"]
    if len(allrows) > 0:
        outfileStr = ""
        f = StringIO(outfileStr)
        w = csv.DictWriter(f, k)
        for y in allrows:
            w.writerow({
                k[0]: y.key,
                k[1]: y.get_val(),
                k[2]: y.is_encrypted,
                k[3]: None
            })
        outkey = S3_KEY + 'variable.csv'
        s3_client.put_object(Bucket=S3_BUCKET, Key=outkey, Body=f.getvalue())
    session.close()

    return "OK"
Example #27
    def _fetch_file_names(self):

        self.s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)

        self.bucket = self.s3_hook.get_bucket(self.s3_bucket)
        if not self.bucket:
            raise AirflowException("Bucket Does Not Exist")

        s3_keys = self.s3_hook.list_keys(bucket_name=self.s3_bucket, prefix="m&")

        if s3_keys is not None and len(s3_keys) > 0:
            self.s3_path = s3_keys[random.randint(0, len(s3_keys) - 1)]
            key_breaks = self.s3_path.split(".")
            index_files = key_breaks[0].split("&")
            table_name = index_files[1]
            primary_key = index_files[2]
            self.src_table = table_name + "_staging"
            self.dest_table = table_name
            self.src_keys = [primary_key]
            self.dest_keys = [primary_key]
            return True

        return False
Example #28
def listS3BucketKeys():

    # set necessary Airflow Variables and store them in metastore DB
    Variable.set("s3_bucket", "udacity-dend")
    Variable.set("s3_prefix", "data-pipelines")

    # instantiate S3Hook Class
    # Airflow's S3Hook Docs: https://bit.ly/2B2tHN7
    sampleHook = S3Hook(aws_conn_id='aws_credentials')

    # retrieve Variable values from metastore
    s3_bucket = Variable.get("s3_bucket")
    s3_prefix = Variable.get("s3_prefix")

    # print message
    logging.info(f'Listing Keys from S3 Bucket: {s3_bucket}/{s3_prefix}')

    # use S3Hook's list_keys() method to return a list of bucket keys
    s3KeyList = sampleHook.list_keys(s3_bucket, prefix=s3_prefix)

    # iterate over the returned keys and print each one
    for key in s3KeyList:
        logging.info(f"- S3://{s3_bucket}/{key}")
Example #29
def upload_to_s3(**kwargs):
    """
    Generates a CSV that is then uploaded to S3 using the S3Hook.

    This is meant to imitate the first step of a traditional ETL DAG: ingesting
    data from some external source.

    This shows how this can be done with an arbitrary python script.

    """

    df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)),
                      columns=['col_a', 'col_b', 'col_c', 'col_d'])

    df.to_csv('test_data.csv', index=False)

    hook = S3Hook(aws_conn_id='astronomer-s3')

    hook.load_file(bucket_name='astronomer-workflows-dev',
                   key='test_data.csv',
                   filename='test_data.csv',
                   replace=True)
Example #30
    def execute(self, context):
        hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = s3.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)
        table = f'{self.schema}.{self.table}' if self.schema is not None else self.table

        copy_query = """
            COPY {table}
            FROM 's3://{s3_bucket}/{s3_key}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(table=table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
Example #31
def gather_posts_html(**kwargs):
    print('About to gather post index html')
    http_hook = HttpHook(method='GET', http_conn_id=http_local_posts_conn_id)
    res = http_hook.run(post_index_endpoint, headers=headers)

    print('Finished gathering post index html')

    # with the response, now we insert into the bucket
    execution_time = dt.datetime.fromisoformat(kwargs['ts'])
    print(type(execution_time))
    print(execution_time)
    formatted_execution_time = execution_time.strftime('%Y%m%d-%H%M%S')
    key = f"indexes/{formatted_execution_time}-posts.html"

    with tempfile.NamedTemporaryFile() as temp:
        print(temp)
        temp.write(res.content)
        print(f"Writing {temp.name} to html to s3 with key {key}")
        temp.seek(0)
        print(res.content)
        s3_hook = S3Hook(aws_conn_id='s3_posts_html')
        s3_hook.load_file(temp.name, key, bucket_name=posts_bucket_name)
        print('Finished writing html to s3')
Example #32
    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
        credentials = self.s3.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self.hook.run(copy_query)
        self.log.info("COPY command complete...")
Example #33
    def execute(self, context):
        self.hook = JdbcHook(jdbc_conn_id=self.snowflake_conn_id)
        self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)

        sql = self.pre_sql
        if self.drop_and_create:
            sql += self._build_pre_sql()

        s3_bucket, s3_key = self.s3.parse_s3_url(self.data_s3_key)
        if s3_bucket != S3_BUCKET:
            raise ValueError(
                'For Snowflake loads the S3 bucket must be {}. Got: {}'.format(
                    S3_BUCKET, s3_bucket))
        copy_sql = """
            COPY INTO {table}
            FROM @airflow.{stage}/{s3_key};
        """.format(
            table=self.table,
            stage=self.stage,
            s3_key=s3_key,
        )
        sql.append(copy_sql)
        self.hook.run(['BEGIN;'] + sql + ['COMMIT;'])
Example #34
def upload_files_to_s3(s3_conn_id: str,
                       s3_bucket: str,
                       max_connections: int = 10,
                       **context) -> str:
    results: List[str] = []
    result_map: Union[Iterator, List] = []
    templates_dict: Dict[str, str] = context.get("templates_dict", {})
    filepaths: str = templates_dict.get("filepaths", "").strip()

    def upload_file(filepath_and_hook: Tuple[str, S3Hook],
                    bucket: str = s3_bucket):
        return _upload_file(filepath_and_hook, bucket)

    if filepaths:
        log.info(f"Connecting to s3 connection: {s3_conn_id}")
        hook = S3Hook(s3_conn_id, verify=False)
        filepath_list = filepaths.split(",")
        paths = [(os.path.abspath(fp), hook) for fp in filepath_list]
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_connections) as executor:
            result_map = executor.map(upload_file, paths)
    results = [r for r in result_map if r is not None]
    return ",".join(results)
Example #35
 def test_parse_s3_url(self):
     parsed = S3Hook.parse_s3_url(self.s3_test_url)
     self.assertEqual(parsed,
                      ("test", "this/is/not/a-real-key.txt"),
                      "Incorrect parsing of the s3 url")
Example #36
 def __init__(self, conn_id):
     S3Hook.__init__(self, s3_conn_id=conn_id)