Example #1
    def execute(self, context):
        # use the super method to list all files in an Azure Data Lake path
        files = super(AdlsToGoogleCloudStorageOperator, self).execute(context)
        g_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> list all files in the GCS path
            # and only keep those files which are present in
            # ADLS and not in Google Cloud Storage
            bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
            existing_files = g_hook.list(bucket=bucket_name, prefix=prefix)
            files = set(files) - set(existing_files)

        if files:
            hook = AzureDataLakeHook(
                azure_data_lake_conn_id=self.azure_data_lake_conn_id
            )

            for obj in files:
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    hook.download_file(local_path=f.name, remote_path=obj)
                    f.flush()
                    dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)
                    dest_path = os.path.join(dest_gcs_prefix, obj)
                    self.log.info("Saving file to %s", dest_path)

                    g_hook.upload(bucket=dest_gcs_bucket, object=dest_path, filename=f.name)

            self.log.info("All done, uploaded %d files to GCS", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to GCS")

        return files
Example #2
 def poke(self, context):
     self.log.info('Sensor checks existence of objects: %s, %s',
                   self.bucket, self.prefix)
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_conn_id,
         delegate_to=self.delegate_to)
     return bool(hook.list(self.bucket, prefix=self.prefix))
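
A minimal wiring sketch for a sensor with this poke logic, assuming it is the contrib GoogleCloudStoragePrefixSensor from Airflow 1.10; the DAG id, bucket, and prefix below are placeholders:

from datetime import datetime

from airflow import DAG
from airflow.contrib.sensors.gcs_sensor import GoogleCloudStoragePrefixSensor

with DAG('example_gcs_prefix_sensor',
         start_date=datetime(2019, 1, 1),
         schedule_interval='@daily') as dag:
    # Succeeds as soon as at least one object under the prefix exists
    wait_for_files = GoogleCloudStoragePrefixSensor(
        task_id='wait_for_files',
        bucket='my-bucket',              # placeholder bucket
        prefix='incoming/2019-01-01/',   # placeholder prefix
        google_cloud_conn_id='google_cloud_default',
    )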
Example #3
    def execute(self, context):

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )

        if '*' in self.source_object:
            wildcard_position = self.source_object.index('*')
            objects = hook.list(self.source_bucket,
                                prefix=self.source_object[:wildcard_position],
                                delimiter=self.source_object[wildcard_position + 1:])
            for source_object in objects:
                self.log.info('Executing copy of gs://{0}/{1} to '
                              'gs://{2}/{3}/{1}'.format(self.source_bucket,
                                                        source_object,
                                                        self.destination_bucket,
                                                        self.destination_object))
                hook.copy(self.source_bucket, source_object,
                          self.destination_bucket, "{}/{}".format(self.destination_object,
                                                                  source_object))
                if self.move_object:
                    hook.delete(self.source_bucket, source_object)

        else:
            self.log.info('Executing copy: %s, %s, %s, %s', self.source_bucket,
                          self.source_object,
                          self.destination_bucket or self.source_bucket,
                          self.destination_object or self.source_object)
            hook.copy(self.source_bucket, self.source_object,
                      self.destination_bucket, self.destination_object)

            if self.move_object:
                hook.delete(self.source_bucket, self.source_object)
Example #4
    def execute(self, context):
        gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                                          delegate_to=self.delegate_to)
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        schema_fields = self.schema_fields if self.schema_fields else json.loads(
            gcs_hook.download(self.bucket, self.schema_object))
        source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                       for source_object in self.source_objects]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(self.max_id_key, self.destination_project_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            logging.info('Loaded BQ data with max {}.{}={}'.format(self.destination_project_dataset_table, self.max_id_key, max_id))
            return max_id
Example #5
 def execute(self, context):
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_storage_conn_id
     )
     hook.insert_object_acl(bucket=self.bucket, object_name=self.object_name,
                            entity=self.entity, role=self.role,
                            generation=self.generation, user_project=self.user_project)
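
The same ACL call can also be made directly against the hook from any callable; a short sketch assuming the 'allUsers'/'READER' entity and role used elsewhere in these examples, with placeholder bucket, object, and connection names:

from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

def make_object_public(bucket, object_name):
    # Grant read access on a single object to all users (placeholder connection id)
    hook = GoogleCloudStorageHook(google_cloud_storage_conn_id='google_cloud_default')
    hook.insert_object_acl(bucket=bucket, object_name=object_name,
                           entity='allUsers', role='READER')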
Example #6
    def execute(self, context):

        self.log.info('Executing copy - Source_Bucket: %s, Source_directory: %s, '
                      'Destination_bucket: %s, Destination_directory: %s',
                      self.source_bucket, self.source_object,
                      self.destination_bucket or self.source_bucket,
                      self.destination_directory or self.source_object)

        hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                                      delegate_to=self.delegate_to)

        self.log.info('Getting list of the files to copy. Source Bucket: %s; Source Object: %s',
                      self.source_bucket, self.source_object)

        # Create a list of objects to copy from Source bucket. The function uses prefix keyword to pass the name of
        # the object to copy.
        self.files_to_copy = hook.list(bucket=self.source_bucket, prefix=self.source_object,
                                       delimiter=self.source_files_delimiter)

        # Log the names of all objects to be copied
        self.log.info('Files to copy: %s', self.files_to_copy)

        if self.files_to_copy:
            for file_to_copy in self.files_to_copy:
                self.log.info('Source_Bucket: %s, Source_Object: %s, '
                              'Destination_bucket: %s, Destination_Directory: %s',
                              self.source_bucket, file_to_copy,
                              self.destination_bucket or self.source_bucket,
                              self.destination_directory + file_to_copy)
                hook.copy(self.source_bucket, file_to_copy,
                          self.destination_bucket, self.destination_directory + file_to_copy)
        else:
            self.log.info('No Files to copy.')
Example #7
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.gcs_schema_object:

            gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)

            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(gcs_hook.download(
                gcs_bucket,
                gcs_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_empty_table(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_id=self.table_id,
            schema_fields=schema_fields,
            time_partitioning=self.time_partitioning,
            labels=self.labels
        )
Example #8
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.schema_object \
                and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(gcs_hook.download(
                self.bucket,
                self.schema_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                       for source_object in self.source_objects]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_external_table(
            external_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            compression=self.compression,
            skip_leading_rows=self.skip_leading_rows,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            src_fmt_configs=self.src_fmt_configs,
            labels=self.labels
        )
Example #9
    def execute(self, context):
        # use the super method to list all files in a Google Cloud Storage bucket
        files = super(GoogleCloudStorageToS3Operator, self).execute(context)
        s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

        if not self.replace:
            # if we are not replacing -> list all files in the S3 bucket
            # and only keep those files which are present in
            # Google Cloud Storage and not in S3
            bucket_name, _ = S3Hook.parse_s3_url(self.dest_s3_key)
            existing_files = s3_hook.list_keys(bucket_name)
            files = set(files) - set(existing_files)

        if files:
            hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to
            )

            for file in files:
                file_bytes = hook.download(self.bucket, file)

                dest_key = self.dest_s3_key + file
                self.log.info("Saving file to %s", dest_key)

                s3_hook.load_bytes(file_bytes,
                                   key=dest_key,
                                   replace=self.replace)

            self.log.info("All done, uploaded %d files to S3", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to S3")

        return files
Example #10
 def _upload_to_gcs(self, files_to_upload):
     """
     Upload all of the file splits (and optionally the schema .json file) to
     Google Cloud Storage.
     """
     hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                                   delegate_to=self.delegate_to)
     for object_name, tmp_file_handle in files_to_upload.items():
         hook.upload(self.bucket, object_name, tmp_file_handle.name, 'application/json')
Example #11
 def execute(self, context):
     logging.info('Executing download: %s, %s, %s', self.bucket, self.object, self.filename)
     hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                                   delegate_to=self.delegate_to)
     file_bytes = hook.download(self.bucket, self.object, self.filename)
     if self.store_to_xcom_key:
         if sys.getsizeof(file_bytes) < 48000:
             context['ti'].xcom_push(key=self.store_to_xcom_key, value=file_bytes)
         else:
             raise RuntimeError('The size of the downloaded file is too large to push to XCom!')
     print(file_bytes)
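
When store_to_xcom_key is set, a downstream task can pull the downloaded bytes back out of XCom; a hypothetical consumer sketch (the task id and key below are placeholders, not defined by the operator above):

def consume_downloaded_bytes(**context):
    # Pull the bytes pushed by the download task under its store_to_xcom_key
    file_bytes = context['ti'].xcom_pull(task_ids='download_file', key='my_xcom_key')
    if file_bytes is not None:
        print('Downloaded %d bytes from GCS' % len(file_bytes))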
Example #12
 def apply_validate_fn(*args, **kwargs):
     prediction_path = kwargs["templates_dict"]["prediction_path"]
     scheme, bucket, obj, _, _ = urlsplit(prediction_path)
     if scheme != "gs" or not bucket or not obj:
         raise ValueError("Wrong format prediction_path: %s",
                          prediction_path)
     summary = os.path.join(obj.strip("/"),
                            "prediction.summary.json")
     gcs_hook = GoogleCloudStorageHook()
     summary = json.loads(gcs_hook.download(bucket, summary))
     return validate_fn(summary)
Example #13
 def _upload_to_gcs(self, files_to_upload):
     """
     Upload all of the file splits (and optionally the schema .json file) to
     Google Cloud Storage.
     """
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
         delegate_to=self.delegate_to)
     for tmp_file in files_to_upload:
         hook.upload(self.bucket, tmp_file.get('file_name'),
                     tmp_file.get('file_handle').name,
                     mime_type=tmp_file.get('file_mime_type'))
Example #14
    def execute(self, context):

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )

        self.log.info('Getting list of the files. Bucket: %s; Delimiter: %s; Prefix: %s',
                      self.bucket, self.delimiter, self.prefix)

        return hook.list(bucket=self.bucket,
                         prefix=self.prefix,
                         delimiter=self.delimiter)
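
Since execute returns the result of hook.list, the object names land in XCom under the default return_value key; a hypothetical downstream callable (the task id is a placeholder):

def process_listed_objects(**context):
    # Returned lists are stored in XCom, so downstream tasks can iterate over them
    object_names = context['ti'].xcom_pull(task_ids='list_gcs_objects') or []
    for object_name in object_names:
        print('Found object: %s' % object_name)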
Example #15
    def execute(self, context):
        """
        Uploads the file to Google Cloud Storage
        """
        hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)

        hook.upload(
            bucket=self.bucket,
            object=self.dst,
            mime_type=self.mime_type,
            filename=self.src)
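
A minimal instantiation sketch, assuming this execute belongs to the contrib FileToGoogleCloudStorageOperator; every path, bucket, and connection id below is a placeholder:

from airflow.contrib.operators.file_to_gcs import FileToGoogleCloudStorageOperator

upload_report = FileToGoogleCloudStorageOperator(
    task_id='upload_report',
    src='/tmp/report.csv',       # local file to upload
    dst='reports/report.csv',    # destination object name
    bucket='my-bucket',
    mime_type='text/csv',
    google_cloud_storage_conn_id='google_cloud_default',
    dag=dag,                     # assumes an existing DAG object named `dag`
)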
Example #16
 def execute(self, context):
     gcp_text_to_speech_hook = GCPTextToSpeechHook(gcp_conn_id=self.gcp_conn_id)
     result = gcp_text_to_speech_hook.synthesize_speech(
         input_data=self.input_data,
         voice=self.voice,
         audio_config=self.audio_config,
         retry=self.retry,
         timeout=self.timeout,
     )
     with NamedTemporaryFile() as temp_file:
         temp_file.write(result.audio_content)
         # flush so the full audio content is on disk before uploading by filename
         temp_file.flush()
         cloud_storage_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcp_conn_id)
         cloud_storage_hook.upload(
             bucket=self.target_bucket_name, object=self.target_filename, filename=temp_file.name
         )
Example #17
    def execute(self, context):
        if self.labels is not None:
            self.labels.update(
                {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')}
            )

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )

        hook.create_bucket(bucket_name=self.bucket_name,
                           storage_class=self.storage_class,
                           location=self.location,
                           project_id=self.project_id,
                           labels=self.labels)
Example #18
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.schema_object \
                                  and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(gcs_hook.download(
                self.bucket,
                self.schema_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                       for source_object in self.source_objects]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            schema_update_options=self.schema_update_options,
            src_fmt_configs=self.src_fmt_configs,
            time_partitioning=self.time_partitioning)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(
                self.max_id_key,
                self.destination_project_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            self.log.info(
                'Loaded BQ data with max %s.%s=%s',
                self.destination_project_dataset_table, self.max_id_key, max_id
            )
            return max_id
Example #19
class GoogleCloudBucketHelper(object):
    """GoogleCloudStorageHook helper class to download GCS object."""
    GCS_PREFIX_LENGTH = 5

    def __init__(self,
                 gcp_conn_id='google_cloud_default',
                 delegate_to=None):
        self._gcs_hook = GoogleCloudStorageHook(gcp_conn_id, delegate_to)

    def google_cloud_to_local(self, file_name):
        """
        Checks whether the file specified by file_name is stored in Google Cloud
        Storage (GCS), if so, downloads the file and saves it locally. The full
        path of the saved file will be returned. Otherwise the local file_name
        will be returned immediately.

        :param file_name: The full path of input file.
        :type file_name: str
        :return: The full path of local file.
        :rtype: str
        """
        if not file_name.startswith('gs://'):
            return file_name

        # Extract bucket_id and object_id by first removing the 'gs://' prefix
        # and then splitting the remainder on the path delimiter '/'.
        path_components = file_name[self.GCS_PREFIX_LENGTH:].split('/')
        if len(path_components) < 2:
            raise Exception(
                'Invalid Google Cloud Storage (GCS) object path: {}'
                .format(file_name))

        bucket_id = path_components[0]
        object_id = '/'.join(path_components[1:])
        local_file = '/tmp/dataflow{}-{}'.format(str(uuid.uuid4())[:8],
                                                 path_components[-1])
        self._gcs_hook.download(bucket_id, object_id, local_file)

        if os.stat(local_file).st_size > 0:
            return local_file
        raise Exception(
            'Failed to download Google Cloud Storage (GCS) object: {}'
            .format(file_name))
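
A short usage sketch for the helper above; the GCS path is a placeholder:

helper = GoogleCloudBucketHelper(gcp_conn_id='google_cloud_default')

# Returns a local /tmp/dataflow... copy for gs:// inputs,
# or the given path unchanged for local files.
local_script = helper.google_cloud_to_local('gs://my-bucket/dataflow/wordcount.py')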
Example #20
    def execute(self, context):
        # use the super method to list all files in a Google Cloud Storage bucket
        files = super().execute(context)
        s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

        if not self.replace:
            # if we are not replacing -> list all files in the S3 bucket
            # and only keep those files which are present in
            # Google Cloud Storage and not in S3
            bucket_name, prefix = S3Hook.parse_s3_url(self.dest_s3_key)
            # look for the bucket and the prefix to avoid looking into
            # parent directories/keys
            existing_files = s3_hook.list_keys(bucket_name, prefix=prefix)
            # in case no files exist, use an empty list to avoid errors
            existing_files = existing_files if existing_files is not None else []
            # remove the prefix for the existing files to allow the match
            existing_files = [file.replace(prefix, '', 1) for file in existing_files]
            files = list(set(files) - set(existing_files))

        if files:
            hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to
            )

            for file in files:
                file_bytes = hook.download(self.bucket, file)

                dest_key = self.dest_s3_key + file
                self.log.info("Saving file to %s", dest_key)

                s3_hook.load_bytes(file_bytes,
                                   key=dest_key,
                                   replace=self.replace)

            self.log.info("All done, uploaded %d files to S3", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to S3")

        return files
Example #21
    def execute(self, context):

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )
        log_message = 'Executing copy of gs://{0}/{1} to gs://{2}/{3}'

        if self.wildcard in self.source_object:
            prefix, delimiter = self.source_object.split(self.wildcard, 1)
            objects = hook.list(self.source_bucket, prefix=prefix, delimiter=delimiter)

            for source_object in objects:
                if self.destination_object is None:
                    destination_object = source_object
                else:
                    destination_object = source_object.replace(prefix,
                                                               self.destination_object, 1)
                self.log.info(
                    log_message.format(self.source_bucket, source_object,
                                       self.destination_bucket, destination_object)
                )

                hook.rewrite(self.source_bucket, source_object,
                             self.destination_bucket, destination_object)
                if self.move_object:
                    hook.delete(self.source_bucket, source_object)

        else:
            self.log.info(
                log_message.format(self.source_bucket, self.source_object,
                                   self.destination_bucket or self.source_bucket,
                                   self.destination_object or self.source_object)
            )
            hook.rewrite(self.source_bucket, self.source_object,
                         self.destination_bucket, self.destination_object)

            if self.move_object:
                hook.delete(self.source_bucket, self.source_object)
Example #22
    def execute(self, context):
        self.log.info('Exporting data to Cloud Storage bucket %s', self.bucket)

        if self.overwrite_existing and self.namespace:
            gcs_hook = GoogleCloudStorageHook(self.cloud_storage_conn_id)
            objects = gcs_hook.list(self.bucket, prefix=self.namespace)
            for o in objects:
                gcs_hook.delete(self.bucket, o)

        ds_hook = DatastoreHook(self.datastore_conn_id, self.delegate_to)
        result = ds_hook.export_to_storage_bucket(bucket=self.bucket,
                                                  namespace=self.namespace,
                                                  entity_filter=self.entity_filter,
                                                  labels=self.labels)
        operation_name = result['name']
        result = ds_hook.poll_operation_until_done(operation_name,
                                                   self.polling_interval_in_seconds)

        state = result['metadata']['common']['state']
        if state != 'SUCCESSFUL':
            raise AirflowException('Operation failed: result={}'.format(result))

        return result
Example #23
    def execute(self, context):

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )

        if self.destination_bucket is None:
            self.log.warning(
                'destination_bucket is None. Defaulting it to source_bucket (%s)',
                self.source_bucket)
            self.destination_bucket = self.source_bucket

        if WILDCARD in self.source_object:
            total_wildcards = self.source_object.count(WILDCARD)
            if total_wildcards > 1:
                error_msg = "Only one wildcard '*' is allowed in source_object parameter. " \
                            "Found {} in {}.".format(total_wildcards, self.source_object)

                raise AirflowException(error_msg)

            prefix, delimiter = self.source_object.split(WILDCARD, 1)
            objects = hook.list(self.source_bucket, prefix=prefix, delimiter=delimiter)

            for source_object in objects:
                if self.destination_object is None:
                    destination_object = source_object
                else:
                    destination_object = source_object.replace(prefix,
                                                               self.destination_object, 1)

                self._copy_single_object(hook=hook, source_object=source_object,
                                         destination_object=destination_object)
        else:
            self._copy_single_object(hook=hook, source_object=self.source_object,
                                     destination_object=self.destination_object)
Example #24
    def __init__(self):
        """
        Attempt to create hook with airflow[gcp_api].
        """
        remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
        self.hook = None

        try:
            from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
            self.hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=remote_conn_id)
        except Exception:
            logging.error(
                'Could not create a GoogleCloudStorageHook with connection id '
                '"{}". Please make sure that airflow[gcp_api] is installed '
                'and the GCS connection exists.'.format(remote_conn_id))
Example #25
def move_error_file_func(**context):
    filename, filepath = context['ti'].xcom_pull(task_ids='create_file')
    conn = GoogleCloudStorageHook()
    target_bucket = os.getenv('UPLOAD_GCS_BUCKET_NAME')
    target_object = 'moved/' + filename
    conn.upload(target_bucket, target_object, filepath)
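
move_error_file_func assumes an upstream task with id create_file has returned a (filename, filepath) pair; a hypothetical sketch of that upstream callable and of the wiring (operator names and the DAG object are placeholders):

from airflow.operators.python_operator import PythonOperator

def create_file_func(**context):
    # Returning a tuple pushes it to XCom for move_error_file_func to pull
    filename = 'error_report.csv'
    filepath = '/tmp/' + filename
    with open(filepath, 'w') as f:
        f.write('id,message\n')
    return filename, filepath

create_file = PythonOperator(task_id='create_file',
                             python_callable=create_file_func,
                             provide_context=True,
                             dag=dag)  # assumes an existing DAG object named `dag`
move_error_file = PythonOperator(task_id='move_error_file',
                                 python_callable=move_error_file_func,
                                 provide_context=True,
                                 dag=dag)
create_file >> move_error_file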
Example #26
 def _upload_to_gcs(self, files_to_upload):
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
         delegate_to=self.delegate_to)
     for object_name, tmp_file_handle in files_to_upload.items():
         hook.upload(self.bucket, object_name, tmp_file_handle.name, 'application/json')
Example #27
def compileactivity():
    hook = GoogleCloudStorageHook()

    github_response = hook.download(bucket=GOOGLE_STORAGE_BUCKET,
                                    object=GITHUB_OUTPUT_FILENAME)
    strava_response = hook.download(bucket=GOOGLE_STORAGE_BUCKET,
                                    object=STRAVA_OUTPUT_FILENAME)

    github_response_json = json.loads(github_response.decode("utf-8"))
    strava_response_json = json.loads(strava_response.decode("utf-8"))

    cleaned_github_commits = list(
        map(
            lambda item: {
                'created_at': dateutil.parser.parse(
                    item['commit']['committer']['date']).isoformat(),
                'username': item['committer']['login'],
                'url': item['html_url'],
                'sha': item['sha'],
                'message': item['commit']['message'],
                'repo': item['repository']['full_name'],
            }, github_response_json['items']))

    cleaned_github_commits.sort(
        key=lambda x: dateutil.parser.parse(x['created_at']), reverse=True)

    cleaned_strava_activity = list(
        map(
            lambda ride: {
                'created_at': dateutil.parser.parse(
                    ride['start_date']).isoformat(),
                'name': ride['name'],
                'distance_miles': round(ride['distance'] / 1609.34, 2),
                'type': ride['type'],
                'elapsed_time_seconds': ride['elapsed_time'],
            }, strava_response_json))

    cleaned_strava_activity.sort(
        key=lambda x: dateutil.parser.parse(x['created_at']), reverse=True)

    public_activity = {
        'github': cleaned_github_commits[0:4],
        'strava': cleaned_strava_activity[0:4],
    }

    hook = GoogleCloudStorageHook()

    with tempfile.NamedTemporaryFile(prefix="gcs-local") as file:
        file.write(json.dumps(public_activity).encode('utf-8'))
        file.flush()

        hook.upload(bucket=GOOGLE_STORAGE_BUCKET,
                    filename=file.name,
                    object=OUTPUT_FILENAME,
                    mime_type='application/json')
        hook.insert_object_acl(
            bucket=GOOGLE_STORAGE_BUCKET,
            object_name=OUTPUT_FILENAME,
            entity='allUsers',
            role='READER',
        )
Example #28
 def execute(self, context):
     gcshook = GoogleCloudStorageHook(self.gcp_conn_id)
     self.log.info(gcshook.list("testcovidlinh"))
Example #29
    def execute(self, context):

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )
        log_message = 'Executing copy of gs://{0}/{1} to gs://{2}/{3}'

        if self.wildcard in self.source_object:
            prefix, delimiter = self.source_object.split(self.wildcard, 1)
            objects = hook.list(self.source_bucket, prefix=prefix, delimiter=delimiter)

            for source_object in objects:
                if self.destination_object is None:
                    destination_object = source_object
                else:
                    destination_object = source_object.replace(prefix,
                                                               self.destination_object, 1)
                self.log.info(
                    log_message.format(self.source_bucket, source_object,
                                       self.destination_bucket, destination_object)
                )

                hook.copy(self.source_bucket, source_object,
                          self.destination_bucket, destination_object)
                if self.move_object:
                    hook.delete(self.source_bucket, source_object)

        else:
            self.log.info(
                log_message.format(self.source_bucket, self.source_object,
                                   self.destination_bucket or self.source_bucket,
                                   self.destination_object or self.source_object)
            )
            hook.copy(self.source_bucket, self.source_object,
                      self.destination_bucket, self.destination_object)

            if self.move_object:
                hook.delete(self.source_bucket, self.source_object)
Example #30
    def execute(self, context):
        # use the super method to list all the files in an S3 bucket/key
        files = super().execute(context)

        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.dest_gcs_conn_id,
            delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> list all files in the GCS bucket
            # and only keep those files which are present in
            # S3 and not in Google Cloud Storage
            bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
            existing_files_prefixed = gcs_hook.list(
                bucket_name, prefix=object_prefix)

            existing_files = []

            if existing_files_prefixed:
                # Remove the object prefix itself, an empty directory was found
                if object_prefix in existing_files_prefixed:
                    existing_files_prefixed.remove(object_prefix)

                # Remove the object prefix from all object string paths
                for f in existing_files_prefixed:
                    if f.startswith(object_prefix):
                        existing_files.append(f[len(object_prefix):])
                    else:
                        existing_files.append(f)

            files = list(set(files) - set(existing_files))
            if len(files) > 0:
                self.log.info(
                    '%s files are going to be synced: %s.', len(files), files
                )
            else:
                self.log.info(
                    'There are no new files to sync. Have a nice day!')

        if files:
            hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

            for file in files:
                # GCS hook builds its own in-memory file so we have to create
                # and pass the path
                file_object = hook.get_key(file, self.bucket)
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    file_object.download_fileobj(f)
                    f.flush()

                    dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(
                        self.dest_gcs)
                    # There will always be a '/' before file because it is
                    # enforced at instantiation time
                    dest_gcs_object = dest_gcs_object_prefix + file

                    # Sync is sequential and the hook already logs too much
                    # so skip this for now
                    # self.log.info(
                    #     'Saving file {0} from S3 bucket {1} in GCS bucket {2}'
                    #     ' as object {3}'.format(file, self.bucket,
                    #                             dest_gcs_bucket,
                    #                             dest_gcs_object))

                    gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name)

            self.log.info(
                "All done, uploaded %d files to Google Cloud Storage",
                len(files))
        else:
            self.log.info(
                'In sync, no files needed to be uploaded to Google Cloud '
                'Storage')

        return files
Example #31
 def poke(self, context):
     hook = GoogleCloudStorageHook()
     return self.is_bucket_updated(len(hook.list(self.bucket, prefix=self.prefix)))
Example #32
def build_export_dag(dag_id,
                     web3_provider_uri,
                     web3_provider_uri_archival,
                     output_bucket,
                     start_date,
                     chain='ethereum',
                     notification_emails=None,
                     schedule_interval='0 0 * * *',
                     export_max_workers=10,
                     export_batch_size=10,
                     **kwargs):

    default_dag_args = {
        "depends_on_past": False,
        "start_date": start_date,
        "email_on_failure": True,
        "email_on_retry": True,
        "retries": 5,
        "retry_delay": timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [
            email.strip() for email in notification_emails.split(',')
        ]

    export_daofork_traces_option = kwargs.get('export_daofork_traces_option')
    export_genesis_traces_option = kwargs.get('export_genesis_traces_option')
    export_blocks_and_transactions_toggle = kwargs.get(
        'export_blocks_and_transactions_toggle')
    export_receipts_and_logs_toggle = kwargs.get(
        'export_receipts_and_logs_toggle')
    export_contracts_toggle = kwargs.get('export_contracts_toggle')
    export_tokens_toggle = kwargs.get('export_tokens_toggle')
    extract_token_transfers_toggle = kwargs.get(
        'extract_token_transfers_toggle')
    export_traces_toggle = kwargs.get('export_traces_toggle')

    dag = DAG(
        dag_id,
        # schedule_interval defaults to daily at midnight UTC ('0 0 * * *')
        schedule_interval=schedule_interval,
        default_args=default_dag_args,
    )

    if output_bucket is None:
        raise ValueError("You must set OUTPUT_BUCKET environment variable")

    # Export
    def export_path(directory, date):
        return "export/{directory}/block_date={block_date}/".format(
            directory=directory, block_date=date.strftime("%Y-%m-%d"))

    cloud_storage_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id="google_cloud_default")

    def copy_to_export_path(file_path, export_path):
        logging.info('Calling copy_to_export_path({}, {})'.format(
            file_path, export_path))
        filename = os.path.basename(file_path)
        upload_to_gcs(gcs_hook=cloud_storage_hook,
                      bucket=output_bucket,
                      object=export_path + filename,
                      filename=file_path)

    def copy_from_export_path(export_path, file_path):
        logging.info('Calling copy_from_export_path({}, {})'.format(
            export_path, file_path))
        filename = os.path.basename(file_path)
        cloud_storage_hook.download(bucket=output_bucket,
                                    object=export_path + filename,
                                    filename=file_path)

    def get_block_range(tempdir, date):
        logging.info('Calling get_block_range_for_date({}, {}, ...)'.format(
            web3_provider_uri, date))
        get_block_range_for_date.callback(provider_uri=web3_provider_uri,
                                          date=date,
                                          output=os.path.join(
                                              tempdir, "blocks_meta.txt"))

        with open(os.path.join(tempdir,
                               "blocks_meta.txt")) as block_range_file:
            block_range = block_range_file.read()
            start_block, end_block = block_range.split(",")

        return int(start_block), int(end_block)

    def export_blocks_and_transactions_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            start_block, end_block = get_block_range(tempdir, execution_date)

            logging.info(
                'Calling export_blocks_and_transactions({}, {}, {}, {}, {}, ...)'
                .format(start_block, end_block, export_batch_size,
                        web3_provider_uri, export_max_workers))

            export_blocks_and_transactions.callback(
                start_block=start_block,
                end_block=end_block,
                batch_size=export_batch_size,
                provider_uri=web3_provider_uri,
                max_workers=export_max_workers,
                blocks_output=os.path.join(tempdir, "blocks.csv"),
                transactions_output=os.path.join(tempdir, "transactions.csv"),
            )

            copy_to_export_path(os.path.join(tempdir, "blocks_meta.txt"),
                                export_path("blocks_meta", execution_date))

            copy_to_export_path(os.path.join(tempdir, "blocks.csv"),
                                export_path("blocks", execution_date))

            copy_to_export_path(os.path.join(tempdir, "transactions.csv"),
                                export_path("transactions", execution_date))

    def export_receipts_and_logs_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("transactions", execution_date),
                                  os.path.join(tempdir, "transactions.csv"))

            logging.info('Calling extract_csv_column(...)')
            extract_csv_column.callback(
                input=os.path.join(tempdir, "transactions.csv"),
                output=os.path.join(tempdir, "transaction_hashes.txt"),
                column="hash",
            )

            logging.info(
                'Calling export_receipts_and_logs({}, ..., {}, {}, ...)'.
                format(export_batch_size, web3_provider_uri,
                       export_max_workers))
            export_receipts_and_logs.callback(
                batch_size=export_batch_size,
                transaction_hashes=os.path.join(tempdir,
                                                "transaction_hashes.txt"),
                provider_uri=web3_provider_uri,
                max_workers=export_max_workers,
                receipts_output=os.path.join(tempdir, "receipts.csv"),
                logs_output=os.path.join(tempdir, "logs.json"),
            )

            copy_to_export_path(os.path.join(tempdir, "receipts.csv"),
                                export_path("receipts", execution_date))
            copy_to_export_path(os.path.join(tempdir, "logs.json"),
                                export_path("logs", execution_date))

    def export_contracts_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("receipts", execution_date),
                                  os.path.join(tempdir, "receipts.csv"))

            logging.info('Calling extract_csv_column(...)')
            extract_csv_column.callback(
                input=os.path.join(tempdir, "receipts.csv"),
                output=os.path.join(tempdir, "contract_addresses.txt"),
                column="contract_address",
            )

            logging.info('Calling export_contracts({}, ..., {}, {})'.format(
                export_batch_size, export_max_workers, web3_provider_uri))
            export_contracts.callback(
                batch_size=export_batch_size,
                contract_addresses=os.path.join(tempdir,
                                                "contract_addresses.txt"),
                output=os.path.join(tempdir, "contracts.json"),
                max_workers=export_max_workers,
                provider_uri=web3_provider_uri,
            )

            copy_to_export_path(os.path.join(tempdir, "contracts.json"),
                                export_path("contracts", execution_date))

    def export_tokens_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("contracts", execution_date),
                                  os.path.join(tempdir, "contracts.json"))

            logging.info('Calling filter_items(...)')
            filter_items.callback(
                input=os.path.join(tempdir, "contracts.json"),
                output=os.path.join(tempdir, "token_contracts.json"),
                predicate="item['is_erc20'] or item['is_erc721']",
            )

            logging.info('Calling extract_field(...)')
            extract_field.callback(
                input=os.path.join(tempdir, "token_contracts.json"),
                output=os.path.join(tempdir, "token_addresses.txt"),
                field="address",
            )

            logging.info('Calling export_tokens(..., {}, {})'.format(
                export_max_workers, web3_provider_uri))
            export_tokens.callback(
                token_addresses=os.path.join(tempdir, "token_addresses.txt"),
                output=os.path.join(tempdir, "tokens.csv"),
                max_workers=export_max_workers,
                provider_uri=web3_provider_uri,
            )

            copy_to_export_path(os.path.join(tempdir, "tokens.csv"),
                                export_path("tokens", execution_date))

    def extract_token_transfers_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("logs", execution_date),
                                  os.path.join(tempdir, "logs.json"))

            logging.info(
                'Calling extract_token_transfers(..., {}, ..., {})'.format(
                    export_batch_size, export_max_workers))
            extract_token_transfers.callback(
                logs=os.path.join(tempdir, "logs.json"),
                batch_size=export_batch_size,
                output=os.path.join(tempdir, "token_transfers.csv"),
                max_workers=export_max_workers,
            )

            copy_to_export_path(
                os.path.join(tempdir, "token_transfers.csv"),
                export_path("token_transfers", execution_date),
            )

    def export_traces_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            start_block, end_block = get_block_range(tempdir, execution_date)

            logging.info(
                'Calling export_traces({}, {}, {}, ...,{}, {}, {}, {})'.format(
                    start_block, end_block, export_batch_size,
                    export_max_workers, web3_provider_uri_archival,
                    export_genesis_traces_option,
                    export_daofork_traces_option))
            export_traces.callback(
                start_block=start_block,
                end_block=end_block,
                batch_size=export_batch_size,
                output=os.path.join(tempdir, "traces.csv"),
                max_workers=export_max_workers,
                provider_uri=web3_provider_uri_archival,
                genesis_traces=export_genesis_traces_option,
                daofork_traces=export_daofork_traces_option,
            )

            copy_to_export_path(os.path.join(tempdir, "traces.csv"),
                                export_path("traces", execution_date))

    def add_export_task(toggle, task_id, python_callable, dependencies=None):
        if toggle:
            operator = python_operator.PythonOperator(
                task_id=task_id,
                python_callable=python_callable,
                provide_context=True,
                execution_timeout=timedelta(hours=15),
                dag=dag,
            )
            if dependencies is not None and len(dependencies) > 0:
                for dependency in dependencies:
                    if dependency is not None:
                        dependency >> operator
            return operator
        else:
            return None

    MEGABYTE = 1024 * 1024

    # Helps avoid OverflowError: https://stackoverflow.com/questions/47610283/cant-upload-2gb-to-google-cloud-storage
    # https://developers.google.com/api-client-library/python/guide/media_upload#resumable-media-chunked-upload
    def upload_to_gcs(gcs_hook,
                      bucket,
                      object,
                      filename,
                      mime_type='application/octet-stream'):
        service = gcs_hook.get_conn()

        if os.path.getsize(filename) > 10 * MEGABYTE:
            media = MediaFileUpload(filename, mime_type, resumable=True)

            try:
                request = service.objects().insert(bucket=bucket,
                                                   name=object,
                                                   media_body=media)
                response = None
                while response is None:
                    status, response = request.next_chunk()
                    if status:
                        logging.info("Uploaded %d%%." %
                                     int(status.progress() * 100))

                return True
            except errors.HttpError as ex:
                if ex.resp['status'] == '404':
                    return False
                raise
        else:
            media = MediaFileUpload(filename, mime_type)

            try:
                service.objects().insert(bucket=bucket,
                                         name=object,
                                         media_body=media).execute()
                return True
            except errors.HttpError as ex:
                if ex.resp['status'] == '404':
                    return False
                raise

    # Operators
    export_blocks_and_transactions_operator = add_export_task(
        export_blocks_and_transactions_toggle,
        "export_blocks_and_transactions",
        export_blocks_and_transactions_command,
    )

    export_receipts_and_logs_operator = add_export_task(
        export_receipts_and_logs_toggle,
        "export_receipts_and_logs",
        export_receipts_and_logs_command,
        dependencies=[export_blocks_and_transactions_operator],
    )

    export_contracts_operator = add_export_task(
        export_contracts_toggle,
        "export_contracts",
        export_contracts_command,
        dependencies=[export_receipts_and_logs_operator],
    )

    export_tokens_operator = add_export_task(
        export_tokens_toggle,
        "export_tokens",
        export_tokens_command,
        dependencies=[export_contracts_operator],
    )

    extract_token_transfers_operator = add_export_task(
        extract_token_transfers_toggle,
        "extract_token_transfers",
        extract_token_transfers_command,
        dependencies=[export_receipts_and_logs_operator],
    )

    export_traces_operator = add_export_task(export_traces_toggle,
                                             "export_traces",
                                             export_traces_command)

    def get_boolean_env_variable(env_variable_name, default=True):
        raw_env = os.environ.get(env_variable_name)
        if raw_env is None or len(raw_env) == 0:
            return default
        else:
            return raw_env.lower() in ["true", "yes"]

    return dag
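
A hypothetical sketch of registering a DAG from this factory at module import time; the provider URIs and dag id are placeholders, not taken from the original, while OUTPUT_BUCKET mirrors the environment variable the factory's error message refers to:

import os
from datetime import datetime

# build_export_dag is assumed to be importable from the module defined above
ethereum_export_dag = build_export_dag(
    dag_id='ethereum_export_dag',
    web3_provider_uri='https://mainnet.infura.io/v3/<project-id>',
    web3_provider_uri_archival='https://mainnet.infura.io/v3/<project-id>',
    output_bucket=os.environ.get('OUTPUT_BUCKET'),
    start_date=datetime(2018, 7, 1),
    export_blocks_and_transactions_toggle=True,
    export_receipts_and_logs_toggle=True,
    export_traces_toggle=True,
)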
Example #33
 def execute(self, context):
     logging.info('Executing download: %s, %s, %s', self.bucket, self.object, self.filename)
     hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                                   delegate_to=self.delegate_to)
     print(hook.download(self.bucket, self.object, self.filename))
Example #34
    def execute(self, context):

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        log_message = 'Executing copy of gs://{0}/{1} to gs://{2}/{3}'

        if self.wildcard in self.source_object:
            prefix, delimiter = self.source_object.split(self.wildcard, 1)
            objects = hook.list(self.source_bucket,
                                prefix=prefix,
                                delimiter=delimiter)

            for source_object in objects:
                if self.last_modified_time is not None:
                    # Skip objects that were not modified after last_modified_time
                    if not hook.is_updated_after(self.source_bucket, source_object,
                                                 self.last_modified_time):
                        continue
                if self.destination_object is None:
                    destination_object = source_object
                else:
                    destination_object = source_object.replace(
                        prefix, self.destination_object, 1)
                self.log.info(
                    log_message.format(self.source_bucket, source_object,
                                       self.destination_bucket,
                                       destination_object))

                hook.rewrite(self.source_bucket, source_object,
                             self.destination_bucket, destination_object)
                if self.move_object:
                    hook.delete(self.source_bucket, source_object)

        else:
            if self.last_modified_time is not None:
                # Nothing to copy if the object was not modified after last_modified_time
                if not hook.is_updated_after(self.source_bucket,
                                             self.source_object,
                                             self.last_modified_time):
                    return

            self.log.info(
                log_message.format(
                    self.source_bucket, self.source_object,
                    self.destination_bucket or self.source_bucket,
                    self.destination_object or self.source_object))
            hook.rewrite(self.source_bucket, self.source_object,
                         self.destination_bucket, self.destination_object)

            if self.move_object:
                hook.delete(self.source_bucket, self.source_object)
Example #35
    "email_on_failure": True,
    "email_on_retry": True,
    "retries": 0,
    "retry_delay": timedelta(minutes=5),
    "dagrun_timeout": timedelta(hours=4),
}

# use a less than desirable method of generating the service account name
IS_DEV = environ.get("DEPLOY_ENVIRONMENT") != "prod"
ENVIRONMENT = "dev" if IS_DEV else "prod"

PRIO_ADMIN_CONN = "google_cloud_prio_admin"
PRIO_A_CONN = "google_cloud_prio_a"
PRIO_B_CONN = "google_cloud_prio_b"

PROJECT_ADMIN = GoogleCloudStorageHook(PRIO_ADMIN_CONN).project_id
PROJECT_A = GoogleCloudStorageHook(PRIO_A_CONN).project_id
PROJECT_B = GoogleCloudStorageHook(PRIO_B_CONN).project_id

SERVICE_ACCOUNT_ADMIN = "prio-admin-runner@{}.iam.gserviceaccount.com".format(
    PROJECT_ADMIN
)
SERVICE_ACCOUNT_A = "prio-runner-{}-a@{}.iam.gserviceaccount.com".format(
    ENVIRONMENT, PROJECT_A
)
SERVICE_ACCOUNT_B = "prio-runner-{}-b@{}.iam.gserviceaccount.com".format(
    ENVIRONMENT, PROJECT_B
)

BUCKET_PRIVATE_A = "moz-fx-prio-{}-a-private".format(ENVIRONMENT)
BUCKET_PRIVATE_B = "moz-fx-prio-{}-b-private".format(ENVIRONMENT)
def build_export_dag(dag_id,
                     provider_uris,
                     provider_uris_archival,
                     output_bucket,
                     cloud_provider,
                     export_start_date,
                     notification_emails=None,
                     export_schedule_interval='0 0 * * *',
                     export_max_workers=10,
                     export_batch_size=10,
                     export_max_active_runs=None,
                     **kwargs):
    default_dag_args = {
        "depends_on_past": False,
        "start_date": export_start_date,
        "email_on_failure": True,
        "email_on_retry": True,
        "retries": 5,
        "retry_delay": timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [
            email.strip() for email in notification_emails.split(',')
        ]

    export_daofork_traces_option = kwargs.get('export_daofork_traces_option')
    export_genesis_traces_option = kwargs.get('export_genesis_traces_option')
    export_blocks_and_transactions_toggle = kwargs.get(
        'export_blocks_and_transactions_toggle')
    export_receipts_and_logs_toggle = kwargs.get(
        'export_receipts_and_logs_toggle')
    extract_contracts_toggle = kwargs.get('extract_contracts_toggle')
    extract_tokens_toggle = kwargs.get('extract_tokens_toggle')
    extract_token_transfers_toggle = kwargs.get(
        'extract_token_transfers_toggle')
    export_traces_toggle = kwargs.get('export_traces_toggle')

    if export_max_active_runs is None:
        export_max_active_runs = configuration.conf.getint(
            'core', 'max_active_runs_per_dag')

    dag = DAG(dag_id,
              schedule_interval=export_schedule_interval,
              default_args=default_dag_args,
              max_active_runs=export_max_active_runs)

    if cloud_provider == 'aws':
        from airflow.hooks.S3_hook import S3Hook
        cloud_storage_hook = S3Hook(aws_conn_id="aws_default")
    else:
        from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
        cloud_storage_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id="google_cloud_default")

    # Export
    def export_path(directory, date):
        return "export/{directory}/block_date={block_date}/".format(
            directory=directory, block_date=date.strftime("%Y-%m-%d"))

    def copy_to_export_path(file_path, export_path):
        logging.info('Calling copy_to_export_path({}, {})'.format(
            file_path, export_path))
        filename = os.path.basename(file_path)

        if cloud_provider == 'aws':
            cloud_storage_hook.load_file(filename=file_path,
                                         bucket_name=output_bucket,
                                         key=export_path + filename,
                                         replace=True,
                                         encrypt=False)
        else:
            upload_to_gcs(gcs_hook=cloud_storage_hook,
                          bucket=output_bucket,
                          object=export_path + filename,
                          filename=file_path)

    def copy_from_export_path(export_path, file_path):
        logging.info('Calling copy_from_export_path({}, {})'.format(
            export_path, file_path))
        filename = os.path.basename(file_path)
        if cloud_provider == 'aws':
            # boto3.s3.Object
            s3_object = cloud_storage_hook.get_key(bucket_name=output_bucket,
                                                   key=export_path + filename)
            s3_object.download_file(file_path)
        else:
            download_from_gcs(bucket=output_bucket,
                              object=export_path + filename,
                              filename=file_path)

    def get_block_range(tempdir, date, provider_uri):
        logging.info('Calling get_block_range_for_date({}, {}, ...)'.format(
            provider_uri, date))
        get_block_range_for_date.callback(provider_uri=provider_uri,
                                          date=date,
                                          output=os.path.join(
                                              tempdir, "blocks_meta.txt"))

        with open(os.path.join(tempdir,
                               "blocks_meta.txt")) as block_range_file:
            block_range = block_range_file.read()
            start_block, end_block = block_range.split(",")

        return int(start_block), int(end_block)

    def export_blocks_and_transactions_command(execution_date, provider_uri,
                                               **kwargs):
        with TemporaryDirectory() as tempdir:
            start_block, end_block = get_block_range(tempdir, execution_date,
                                                     provider_uri)

            logging.info(
                'Calling export_blocks_and_transactions({}, {}, {}, {}, {}, ...)'
                .format(start_block, end_block, export_batch_size,
                        provider_uri, export_max_workers))

            export_blocks_and_transactions.callback(
                start_block=start_block,
                end_block=end_block,
                batch_size=export_batch_size,
                provider_uri=provider_uri,
                max_workers=export_max_workers,
                blocks_output=os.path.join(tempdir, "blocks.csv"),
                transactions_output=os.path.join(tempdir, "transactions.csv"),
            )

            copy_to_export_path(os.path.join(tempdir, "blocks_meta.txt"),
                                export_path("blocks_meta", execution_date))

            copy_to_export_path(os.path.join(tempdir, "blocks.csv"),
                                export_path("blocks", execution_date))

            copy_to_export_path(os.path.join(tempdir, "transactions.csv"),
                                export_path("transactions", execution_date))

    def export_receipts_and_logs_command(execution_date, provider_uri,
                                         **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("transactions", execution_date),
                                  os.path.join(tempdir, "transactions.csv"))

            logging.info('Calling extract_csv_column(...)')
            extract_csv_column.callback(
                input=os.path.join(tempdir, "transactions.csv"),
                output=os.path.join(tempdir, "transaction_hashes.txt"),
                column="hash",
            )

            logging.info(
                'Calling export_receipts_and_logs({}, ..., {}, {}, ...)'.
                format(export_batch_size, provider_uri, export_max_workers))
            export_receipts_and_logs.callback(
                batch_size=export_batch_size,
                transaction_hashes=os.path.join(tempdir,
                                                "transaction_hashes.txt"),
                provider_uri=provider_uri,
                max_workers=export_max_workers,
                receipts_output=os.path.join(tempdir, "receipts.csv"),
                logs_output=os.path.join(tempdir, "logs.json"),
            )

            copy_to_export_path(os.path.join(tempdir, "receipts.csv"),
                                export_path("receipts", execution_date))
            copy_to_export_path(os.path.join(tempdir, "logs.json"),
                                export_path("logs", execution_date))

    def extract_contracts_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("traces", execution_date),
                                  os.path.join(tempdir, "traces.csv"))

            logging.info('Calling extract_contracts(..., {}, {})'.format(
                export_batch_size, export_max_workers))
            extract_contracts.callback(
                traces=os.path.join(tempdir, "traces.csv"),
                output=os.path.join(tempdir, "contracts.json"),
                batch_size=export_batch_size,
                max_workers=export_max_workers,
            )

            copy_to_export_path(os.path.join(tempdir, "contracts.json"),
                                export_path("contracts", execution_date))

    def extract_tokens_command(execution_date, provider_uri, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("contracts", execution_date),
                                  os.path.join(tempdir, "contracts.json"))

            logging.info('Calling extract_tokens(..., {}, {})'.format(
                export_max_workers, provider_uri))
            extract_tokens.callback(
                contracts=os.path.join(tempdir, "contracts.json"),
                output=os.path.join(tempdir, "tokens.csv"),
                max_workers=export_max_workers,
                provider_uri=provider_uri,
            )

            copy_to_export_path(os.path.join(tempdir, "tokens.csv"),
                                export_path("tokens", execution_date))

    def extract_token_transfers_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("logs", execution_date),
                                  os.path.join(tempdir, "logs.json"))

            logging.info(
                'Calling extract_token_transfers(..., {}, ..., {})'.format(
                    export_batch_size, export_max_workers))
            extract_token_transfers.callback(
                logs=os.path.join(tempdir, "logs.json"),
                batch_size=export_batch_size,
                output=os.path.join(tempdir, "token_transfers.csv"),
                max_workers=export_max_workers,
            )

            copy_to_export_path(
                os.path.join(tempdir, "token_transfers.csv"),
                export_path("token_transfers", execution_date),
            )

    def export_traces_command(execution_date, provider_uri, **kwargs):
        with TemporaryDirectory() as tempdir:
            start_block, end_block = get_block_range(tempdir, execution_date,
                                                     provider_uri)

            logging.info(
                'Calling export_traces({}, {}, {}, ...,{}, {}, {}, {})'.format(
                    start_block, end_block, export_batch_size,
                    export_max_workers, provider_uri,
                    export_genesis_traces_option,
                    export_daofork_traces_option))
            export_traces.callback(
                start_block=start_block,
                end_block=end_block,
                batch_size=export_batch_size,
                output=os.path.join(tempdir, "traces.csv"),
                max_workers=export_max_workers,
                provider_uri=provider_uri,
                genesis_traces=export_genesis_traces_option,
                daofork_traces=export_daofork_traces_option,
            )

            copy_to_export_path(os.path.join(tempdir, "traces.csv"),
                                export_path("traces", execution_date))

    def add_export_task(toggle, task_id, python_callable, dependencies=None):
        if toggle:
            operator = python_operator.PythonOperator(
                task_id=task_id,
                python_callable=python_callable,
                provide_context=True,
                execution_timeout=timedelta(hours=15),
                dag=dag,
            )
            if dependencies is not None and len(dependencies) > 0:
                for dependency in dependencies:
                    if dependency is not None:
                        dependency >> operator
            return operator
        else:
            return None

    # Operators

    export_blocks_and_transactions_operator = add_export_task(
        export_blocks_and_transactions_toggle,
        "export_blocks_and_transactions",
        add_provider_uri_fallback_loop(export_blocks_and_transactions_command,
                                       provider_uris),
    )

    export_receipts_and_logs_operator = add_export_task(
        export_receipts_and_logs_toggle,
        "export_receipts_and_logs",
        add_provider_uri_fallback_loop(export_receipts_and_logs_command,
                                       provider_uris),
        dependencies=[export_blocks_and_transactions_operator],
    )

    extract_token_transfers_operator = add_export_task(
        extract_token_transfers_toggle,
        "extract_token_transfers",
        extract_token_transfers_command,
        dependencies=[export_receipts_and_logs_operator],
    )

    export_traces_operator = add_export_task(
        export_traces_toggle, "export_traces",
        add_provider_uri_fallback_loop(export_traces_command,
                                       provider_uris_archival))

    extract_contracts_operator = add_export_task(
        extract_contracts_toggle,
        "extract_contracts",
        extract_contracts_command,
        dependencies=[export_traces_operator],
    )

    extract_tokens_operator = add_export_task(
        extract_tokens_toggle,
        "extract_tokens",
        add_provider_uri_fallback_loop(extract_tokens_command, provider_uris),
        dependencies=[extract_contracts_operator],
    )

    return dag
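For context, a hedged sketch of how a factory like build_export_dag above is typically invoked from a DAG definition file; every value below (URIs, bucket, dates, toggles) is illustrative and not taken from the source.
from datetime import datetime

# Hypothetical invocation of the build_export_dag factory above; all values are
# made up. Assigning into globals() is the usual way to let the Airflow
# scheduler discover a dynamically built DAG.
globals()['ethereum_export_dag'] = build_export_dag(
    dag_id='ethereum_export_dag',
    provider_uris=['https://mainnet-node.example.org'],
    provider_uris_archival=['https://archive-node.example.org'],
    output_bucket='my-export-bucket',
    cloud_provider='gcp',
    export_start_date=datetime(2019, 1, 1),
    export_blocks_and_transactions_toggle=True,
    export_receipts_and_logs_toggle=True,
    extract_contracts_toggle=True,
    extract_tokens_toggle=True,
    extract_token_transfers_toggle=True,
    export_traces_toggle=True,
)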
Example #37
 def poke(self, context):
     self.log.info('Sensor checks existence of: %s, %s', self.bucket, self.object)
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_conn_id,
         delegate_to=self.delegate_to)
     return hook.exists(self.bucket, self.object)
Example #38
 def poke(self, context):
     hook = GoogleCloudStorageHook()
     return self.is_bucket_updated(len(hook.list(self.bucket, prefix=self.prefix)))
Example #39
class GCSLog(object):
    """
    Utility class for reading and writing logs in GCS. Requires
    airflow[gcp_api] and setting the REMOTE_BASE_LOG_FOLDER and
    REMOTE_LOG_CONN_ID configuration options in airflow.cfg.
    """
    def __init__(self):
        """
        Attempt to create hook with airflow[gcp_api].
        """
        remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
        self.hook = None

        try:
            from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
            self.hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=remote_conn_id)
        except:
            logging.error(
                'Could not create a GoogleCloudStorageHook with connection id '
                '"{}". Please make sure that airflow[gcp_api] is installed '
                'and the GCS connection exists.'.format(remote_conn_id))

    def read(self, remote_log_location, return_error=False):
        """
        Returns the log found at the remote_log_location.

        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param return_error: if True, returns a string error message if an
            error occurs. Otherwise returns '' when an error occurs.
        :type return_error: bool
        """
        if self.hook:
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                return self.hook.download(bkt, blob).decode()
            except:
                pass

        # raise/return error if we get here
        err = 'Could not read logs from {}'.format(remote_log_location)
        logging.error(err)
        return err if return_error else ''

    def write(self, log, remote_log_location, append=True):
        """
        Writes the log to the remote_log_location. Fails silently if no hook
        was created.

        :param log: the log to write to the remote_log_location
        :type log: string
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool

        """
        if self.hook:
            if append:
                old_log = self.read(remote_log_location)
                log = old_log + '\n' + log

            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                from tempfile import NamedTemporaryFile
                with NamedTemporaryFile(mode='w+') as tmpfile:
                    tmpfile.write(log)
                    # Force the file to be flushed, since we're doing the
                    # upload from within the file context (it hasn't been
                    # closed).
                    tmpfile.flush()
                    self.hook.upload(bkt, blob, tmpfile.name)
            except:
                # raise/return error if we get here
                logging.error('Could not write logs to {}'.format(remote_log_location))

    def parse_gcs_url(self, gsurl):
        """
        Given a Google Cloud Storage URL (gs://<bucket>/<blob>), returns a
        tuple containing the corresponding bucket and blob.
        """
        # Python 3
        try:
            from urllib.parse import urlparse
        # Python 2
        except ImportError:
            from urlparse import urlparse

        parsed_url = urlparse(gsurl)
        if not parsed_url.netloc:
            raise AirflowException('Please provide a bucket name')
        else:
            bucket = parsed_url.netloc
            blob = parsed_url.path.strip('/')
            return (bucket, blob)
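A minimal usage sketch for the GCSLog utility above, assuming airflow[gcp_api] is installed and the configured REMOTE_LOG_CONN_ID connection exists; the bucket and log path are made up.
# Hypothetical usage of the GCSLog class above; the log location is made up.
gcs_log = GCSLog()
remote_location = 'gs://my-log-bucket/my_dag/my_task/2018-01-01T00:00:00/1.log'

# parse_gcs_url() splits the URL into (bucket, blob):
#   ('my-log-bucket', 'my_dag/my_task/2018-01-01T00:00:00/1.log')
bucket, blob = gcs_log.parse_gcs_url(remote_location)

# Append a line to the remote log, then read the whole log back.
gcs_log.write('task finished', remote_location, append=True)
print(gcs_log.read(remote_location, return_error=True))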
Example #40
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and \
                self.schema_object and \
                self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(
                gcs_hook.download(self.bucket,
                                  self.schema_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        source_uris = [
            'gs://{}/{}'.format(self.bucket, source_object)
            for source_object in self.source_objects
        ]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        if self.external_table:
            cursor.create_external_table(
                external_project_dataset_table=self.
                destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                compression=self.compression,
                skip_leading_rows=self.skip_leading_rows,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                src_fmt_configs=self.src_fmt_configs)
        else:
            cursor.run_load(destination_project_dataset_table=self.
                            destination_project_dataset_table,
                            schema_fields=schema_fields,
                            source_uris=source_uris,
                            source_format=self.source_format,
                            create_disposition=self.create_disposition,
                            skip_leading_rows=self.skip_leading_rows,
                            write_disposition=self.write_disposition,
                            field_delimiter=self.field_delimiter,
                            max_bad_records=self.max_bad_records,
                            quote_character=self.quote_character,
                            ignore_unknown_values=self.ignore_unknown_values,
                            allow_quoted_newlines=self.allow_quoted_newlines,
                            allow_jagged_rows=self.allow_jagged_rows,
                            schema_update_options=self.schema_update_options,
                            src_fmt_configs=self.src_fmt_configs,
                            time_partitioning=self.time_partitioning)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(
                self.max_id_key, self.destination_project_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            self.log.info('Loaded BQ data with max %s.%s=%s',
                          self.destination_project_dataset_table,
                          self.max_id_key, max_id)
            return max_id
Example #41
 def poke(self, context):
     self.log.info('Sensor checks existence of: %s, %s', self.bucket, self.object)
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_conn_id,
         delegate_to=self.delegate_to)
     return hook.exists(self.bucket, self.object)
Example #42
class GoogleDisplayVideo360SDFToBigQueryOperator(GoogleMarketingPlatformBaseOperator):
    """Make a request to SDF API and upload the data to BQ."""

    DEFAULT_SDF_TABLE_NAMES = {
        'LINE_ITEM': 'SDFLineItem',
        'AD_GROUP': 'SDFAdGroup',
        'AD': 'SDFAd',
        'INSERTION_ORDER': 'SDFInsertionOrder',
        'CAMPAIGN': 'SDFCampaign'
    }

    SDF_API_RESPONSE_KEYS = {
        'LINE_ITEM': 'lineItems',
        'AD_GROUP': 'adGroups',
        'AD': 'ads',
        'INSERTION_ORDER': 'insertionOrders',
        'CAMPAIGN': 'campaigns'
    }

    def __init__(self,
                 gcp_conn_id='google_cloud_default',
                 gcs_bucket=None,
                 schema=None,
                 bq_dataset=None,
                 write_disposition=None,
                 cloud_project_id=None,
                 file_types=None,
                 filter_ids=None,
                 api_version=None,
                 filter_type=None,
                 table_names=DEFAULT_SDF_TABLE_NAMES,
                 sdf_api_response_keys=SDF_API_RESPONSE_KEYS,
                 *args,
                 **kwargs):
        super(GoogleDisplayVideo360SDFToBigQueryOperator, self).__init__(*args, **kwargs)
        self.gcp_conn_id = gcp_conn_id
        self.service = None
        self.hook = None
        self.bq_hook = None
        self.gcs_hook = None
        self.gcs_bucket = gcs_bucket
        self.schema = schema
        self.bq_dataset = bq_dataset
        self.write_disposition = write_disposition
        self.cloud_project_id = cloud_project_id
        self.file_types = file_types
        self.filter_ids = filter_ids
        self.api_version = api_version
        self.filter_type = filter_type
        self.table_names = table_names
        self.sdf_api_response_keys = sdf_api_response_keys

    def execute(self, context):
        if self.hook is None:
            self.hook = GoogleDisplayVideo360Hook(gcp_conn_id=self.gcp_conn_id)
        if self.bq_hook is None:
            self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id)
        if self.gcs_hook is None:
            self.gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcp_conn_id)

        request_body = {'fileTypes': self.file_types, 'filterType': self.filter_type, 'filterIds': self.filter_ids,
                        'version': self.api_version}

        logger.info('Request body: %s ' % request_body)
        request = self.hook.get_service().sdf().download(body=request_body)
        response = request.execute()

        for file_type in self.file_types:
            temp_file = None
            try:
                logger.info('Uploading SDF to GCS')
                temp_file = tempfile.NamedTemporaryFile(delete=False)
                response_key = self.sdf_api_response_keys.get(file_type)
                temp_file.write(response[response_key].encode('utf-8'))
                temp_file.close()
                filename = '%d_%s_%s_%s.json' % (time.time() * 1e+9, randint(
                    1, 1000000), response_key, 'sdf')
                self.gcs_hook.upload(self.gcs_bucket, filename, temp_file.name)
                logger.info('SDF upload to GCS complete')
            finally:
                if temp_file:
                    temp_file.close()
                    os.unlink(temp_file.name)

            sdf_file = 'gs://%s/%s' % (self.gcs_bucket, filename)

            bq_table = self.table_names.get(file_type)
            bq_table = '%s.%s' % (self.bq_dataset, bq_table)
            schema = SDF_VERSIONED_SCHEMA_TYPES.get(self.api_version).get(file_type)
            try:
                bq_base_cursor = self.bq_hook.get_conn().cursor()
                logger.info('Uploading SDF to BigQuery')
                bq_base_cursor.run_load(
                    destination_project_dataset_table=bq_table,
                    schema_fields=schema,
                    source_uris=[sdf_file],
                    source_format='CSV',
                    skip_leading_rows=1,
                    write_disposition=self.write_disposition)
            finally:
                logger.info('Deleting SDF from GCS')
                self.gcs_hook.delete(self.gcs_bucket, filename)
Example #43
    def execute(self, context):
        ga_conn = GoogleAnalyticsHook(self.google_analytics_conn_id)
        gcs_conn = GoogleCloudStorageHook(self.gcs_conn_id)
        try:
            since_formatted = datetime.strptime(self.since, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
        except:
            since_formatted = str(self.since)
        try:
            until_formatted = datetime.strptime(self.until, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
        except:
            until_formatted = str(self.until)
        report = ga_conn.get_analytics_report(self.view_id,
                                              since_formatted,
                                              until_formatted,
                                              self.sampling_level,
                                              self.dimensions,
                                              self.metrics,
                                              self.page_size,
                                              self.include_empty_rows)

        columnHeader = report.get('columnHeader', {})
        # Right now all dimensions are hardcoded to varchar(255), will need a map if any non-varchar dimensions are used in the future
        # Unfortunately the API does not send back types for Dimensions like it does for Metrics (yet..)
        dimensionHeaders = [
            {'name': header.replace('ga:', ''), 'type': 'varchar(255)'}
            for header
            in columnHeader.get('dimensions', [])
        ]
        metricHeaders = [
            {'name': entry.get('name').replace('ga:', ''),
             'type': self.metricMap.get(entry.get('type'), 'varchar(255)')}
            for entry
            in columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
        ]

        with NamedTemporaryFile("w") as ga_file:
            rows = report.get('data', {}).get('rows', [])

            for row_counter, row in enumerate(rows):
                root_data_obj = {}
                dimensions = row.get('dimensions', [])
                metrics = row.get('metrics', [])

                for index, dimension in enumerate(dimensions):
                    header = dimensionHeaders[index].get('name').lower()
                    root_data_obj[header] = dimension

                for metric in metrics:
                    data = {}
                    data.update(root_data_obj)

                    for index, value in enumerate(metric.get('values', [])):
                        header = metricHeaders[index].get('name').lower()
                        data[header] = value

                    data['viewid'] = self.view_id
                    data['timestamp'] = self.since

                    # skip the trailing newline after the final row
                    ga_file.write(json.dumps(data) + ('' if row_counter == len(rows) - 1 else '\n'))

            # flush buffered rows to disk before handing the temp file to GCS
            ga_file.flush()
            gcs_conn.upload(self.gcs_bucket,
                            self.gcs_objname,
                            ga_file.name)
Example #44
class GoogleDisplayVideo360DownloadReportOperator(GoogleMarketingPlatformBaseOperator):
    """Downloads a Display & Video 360 report into Google Cloud Storage.

    Attributes:
      report_url: The Google Cloud Storage url where the latest report is stored.
          (templated)
      destination_bucket: The destination Google cloud storage bucket where the
          report should be written to. (templated)
      destination_object: The destination name of the object in the destination
          Google cloud storage bucket. (templated)
          If the destination points to an existing folder, the report will be
          written under the specified folder.
      gcp_conn_id: The connection ID to use when fetching connection info.
      delegate_to: The account to impersonate, if any.

    XComs:
      destination_bucket: The Google cloud storage bucket the report was written
          to.
      destination_object: The Google cloud storage URI for the report.
    """

    template_fields = ['report_url', 'destination_bucket', 'destination_object']

    def __init__(self,
                 report_url,
                 destination_bucket,
                 destination_object=None,
                 chunk_size=5 * 1024 * 1024,
                 gcp_conn_id='google_cloud_default',
                 delegate_to=None,
                 *args,
                 **kwargs):
        super(GoogleDisplayVideo360DownloadReportOperator, self).__init__(*args, **kwargs)
        self.report_url = report_url
        self.destination_bucket = destination_bucket
        self.destination_object = destination_object
        self.chunk_size = chunk_size
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.hook = None

    @staticmethod
    def _download_report(source_url, destination_file, chunk_size):
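        # Learn the total report size via a HEAD request, then fetch the body in
        # Range-delimited chunks of `chunk_size` bytes, appending each to the file.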
        response = requests.head(source_url)
        content_length = int(response.headers['Content-Length'])

        start_byte = 0
        while start_byte < content_length:
            end_byte = start_byte + chunk_size - 1
            if end_byte >= content_length:
                end_byte = content_length - 1

            headers = {'Range': 'bytes=%s-%s' % (start_byte, end_byte)}
            response = requests.get(source_url, stream=True, headers=headers)
            chunk = response.raw.read()
            destination_file.write(chunk)
            start_byte = end_byte + 1
        destination_file.close()

    @staticmethod
    def _get_destination_uri(destination_object, report_url):
        report_file_name = urlparse(report_url).path.split('/')[2]

        if destination_object is None:
            return report_file_name

        if destination_object.endswith('/'):
            return destination_object + report_file_name

        return destination_object

    def execute(self, context):
        if self.hook is None:
            self.hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.gcp_conn_id,
                delegate_to=self.delegate_to)

        temp_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            # TODO(efolgar): Directly stream to storage instead of temp file
            self._download_report(self.report_url, temp_file, self.chunk_size)
            destination_object_name = self._get_destination_uri(
                self.destination_object, self.report_url)
            self.hook.upload(
                bucket=self.destination_bucket,
                object=destination_object_name,
                filename=temp_file.name,
                multipart=True)

            context['task_instance'].xcom_push(
                'destination_bucket', self.destination_bucket)
            context['task_instance'].xcom_push(
                'destination_object', destination_object_name)
        finally:
            temp_file.close()
            os.unlink(temp_file.name)
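A hedged sketch of wiring the operator above into a DAG and consuming the XComs it pushes; the task ids, bucket, and report_url template are assumptions for illustration.
# Hypothetical DAG wiring for GoogleDisplayVideo360DownloadReportOperator above;
# `dag` and the upstream 'create_report' task are assumed to exist.
download_report = GoogleDisplayVideo360DownloadReportOperator(
    task_id='download_dv360_report',
    report_url="{{ task_instance.xcom_pull('create_report', key='report_url') }}",
    destination_bucket='my-dv360-reports',
    destination_object='dv360/',
    gcp_conn_id='google_cloud_default',
    dag=dag,
)

# Downstream tasks can recover the upload location from XCom:
#   task_instance.xcom_pull('download_dv360_report', key='destination_bucket')
#   task_instance.xcom_pull('download_dv360_report', key='destination_object')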
Example #45
 def poke(self, context):
     self.logger.info('Sensor checks existence of: %s, %s', self.bucket, self.object)
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_conn_id,
         delegate_to=self.delegate_to)
     return hook.is_updated_after(self.bucket, self.object, self.ts_func(context))
Example #46
def build_export_dag(dag_id,
                     provider_uris,
                     output_bucket,
                     export_start_date,
                     notification_emails=None,
                     export_schedule_interval='0 0 * * *',
                     export_max_workers=10,
                     export_batch_size=10,
                     export_max_active_runs=None,
                     provider_uris_shuffle=False,
                     **kwargs):
    default_dag_args = {
        "depends_on_past": False,
        "start_date": export_start_date,
        "email_on_failure": True,
        "email_on_retry": True,
        "retries": 5,
        "retry_delay": timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [
            email.strip() for email in notification_emails.split(',')
        ]

    if export_max_active_runs is None:
        export_max_active_runs = configuration.conf.getint(
            'core', 'max_active_runs_per_dag')

    dag = DAG(dag_id,
              schedule_interval=export_schedule_interval,
              default_args=default_dag_args,
              max_active_runs=export_max_active_runs)

    from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
    cloud_storage_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id="google_cloud_default")

    # Export
    def export_path(directory, date):
        return "export/{directory}/block_date={block_date}/".format(
            directory=directory, block_date=date.strftime("%Y-%m-%d"))

    def copy_to_export_path(file_path, export_path):
        logging.info('Calling copy_to_export_path({}, {})'.format(
            file_path, export_path))
        filename = os.path.basename(file_path)

        upload_to_gcs(gcs_hook=cloud_storage_hook,
                      bucket=output_bucket,
                      object=export_path + filename,
                      filename=file_path)

    def copy_from_export_path(export_path, file_path):
        logging.info('Calling copy_from_export_path({}, {})'.format(
            export_path, file_path))
        filename = os.path.basename(file_path)
        download_from_gcs(bucket=output_bucket,
                          object=export_path + filename,
                          filename=file_path)

    def get_block_range(tempdir, date, provider_uri):
        logging.info('Calling get_block_range_for_date({}, {}, ...)'.format(
            provider_uri, date))
        get_block_range_for_date.callback(provider_uri=provider_uri,
                                          date=date,
                                          output=os.path.join(
                                              tempdir, "blocks_meta.txt"))

        with open(os.path.join(tempdir,
                               "blocks_meta.txt")) as block_range_file:
            block_range = block_range_file.read()
            start_block, end_block = block_range.split(",")

        return int(start_block), int(end_block)

    def export_blocks_command(execution_date, provider_uri, **kwargs):
        with TemporaryDirectory() as tempdir:
            start_block, end_block = get_block_range(tempdir, execution_date,
                                                     provider_uri)

            logging.info(
                'Calling export_blocks({}, {}, {}, {}, {}, ...)'.format(
                    start_block, end_block, export_batch_size, provider_uri,
                    export_max_workers))

            export_blocks.callback(
                start_block=start_block,
                end_block=end_block,
                provider_uri=provider_uri,
                max_workers=export_max_workers,
                blocks_output=os.path.join(tempdir, "blocks.json"),
                transactions_output=os.path.join(tempdir, "transactions.json"),
                actions_output=os.path.join(tempdir, "actions.json"),
            )

            copy_to_export_path(os.path.join(tempdir, "blocks_meta.txt"),
                                export_path("blocks_meta", execution_date))

            copy_to_export_path(os.path.join(tempdir, "blocks.json"),
                                export_path("blocks", execution_date))

            copy_to_export_path(os.path.join(tempdir, "transactions.json"),
                                export_path("transactions", execution_date))

            copy_to_export_path(os.path.join(tempdir, "actions.json"),
                                export_path("actions", execution_date))

    def add_export_task(toggle, task_id, python_callable, dependencies=None):
        if toggle:
            operator = python_operator.PythonOperator(
                task_id=task_id,
                python_callable=python_callable,
                provide_context=True,
                execution_timeout=timedelta(hours=48),
                dag=dag,
            )
            if dependencies is not None and len(dependencies) > 0:
                for dependency in dependencies:
                    if dependency is not None:
                        dependency >> operator
            return operator
        else:
            return None

    # Operators

    export_blocks_operator = add_export_task(
        True,
        "export_blocks",
        add_provider_uri_fallback_loop(export_blocks_command, provider_uris,
                                       provider_uris_shuffle),
    )

    return dag
Example #47
class GoogleCampaignManagerDownloadReportOperator(BaseOperator):
    """Downloads a Campaign Manager report into Google Cloud Storage.

  Attributes:
    report_id: The DCM report ID with which the report file is associated.
        (templated)
    file_id: The DCM file ID of the report file to download. (templated)
    destination_bucket: The destination Google cloud storage bucket where the
        report should be written to. (templated)
    destination_object: The destination name of the object in the destination
        Google cloud storage bucket. (templated)
        If the destination points to an existing folder, the report will be
        written under the specified folder.
    gcp_conn_id: The connection ID to use when fetching connection info.
    delegate_to: The account to impersonate, if any.

  XComs:
    destination_bucket: The Google cloud storage bucket the report was written
        to.
    destination_object: The Google cloud storage URI for the report.
  """

    template_fields = [
        'report_id', 'file_id', 'destination_bucket', 'destination_object'
    ]

    def __init__(self,
                 report_id,
                 file_id,
                 destination_bucket,
                 destination_object=None,
                 gcp_conn_id='google_cloud_default',
                 chunk_size=5 * 1024 * 1024,
                 delegate_to=None,
                 *args,
                 **kwargs):
        super(GoogleCampaignManagerDownloadReportOperator,
              self).__init__(*args, **kwargs)
        self.file_id = file_id
        self.report_id = report_id
        self.destination_bucket = destination_bucket
        self.destination_object = destination_object
        self.chunk_size = chunk_size
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.gcs_hook = None
        self.cm_hook = None

    def _download_report(self, report_id, file_id, destination_file,
                         chunk_size):
        file_metadata = self.cm_hook.get_service().files().get(
            reportId=report_id, fileId=file_id).execute()

        if file_metadata['status'] != 'REPORT_AVAILABLE':
            msg = 'File with ID = %s and Report ID = %s not available, status = %s.' % (
                file_id, report_id, file_metadata['status'])
            raise Exception(msg)

        request = self.cm_hook.get_service().files().get_media(
            reportId=report_id, fileId=file_id)

        downloader = http.MediaIoBaseDownload(destination_file,
                                              request,
                                              chunksize=chunk_size)

        download_finished = False
        while not download_finished:
            _, download_finished = downloader.next_chunk()

        return file_metadata['fileName']

    def _get_destination_uri(self, destination_object, report_file_name):
        report_file_name = '%s.csv.gz' % report_file_name

        if destination_object is None:
            return report_file_name

        if destination_object.endswith('/'):
            return destination_object + report_file_name

        return destination_object

    def execute(self, context):
        if self.gcs_hook is None:
            self.gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.gcp_conn_id,
                delegate_to=self.delegate_to)
        if self.cm_hook is None:
            self.cm_hook = GoogleCampaignManagerHook(
                gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to)

        temp_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            report_file_name = self._download_report(self.report_id,
                                                     self.file_id, temp_file,
                                                     self.chunk_size)

            destination_object_name = self._get_destination_uri(
                self.destination_object, report_file_name)

            self.gcs_hook.upload(bucket=self.destination_bucket,
                                 object=destination_object_name,
                                 filename=temp_file.name,
                                 gzip=True,
                                 multipart=True)

            context['task_instance'].xcom_push('destination_bucket',
                                               self.destination_bucket)
            context['task_instance'].xcom_push('destination_object',
                                               destination_object_name)
        finally:
            temp_file.close()
            os.unlink(temp_file.name)
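A brief sketch of how the destination object name is resolved for the Campaign Manager operator above; every literal below is made up.
# Hypothetical instantiation of the operator above; `dag` is assumed to exist
# and the ids/bucket are illustrative only.
download_cm_report = GoogleCampaignManagerDownloadReportOperator(
    task_id='download_cm_report',
    report_id='12345678',
    file_id='87654321',
    destination_bucket='my-cm-reports',
    destination_object='campaign_manager/',  # trailing slash: report lands under this folder
    dag=dag,
)

# _get_destination_uri() appends '.csv.gz' to the report file name and then:
#   destination_object is None        -> '<report_file_name>.csv.gz'
#   destination_object ends with '/'  -> 'campaign_manager/<report_file_name>.csv.gz'
#   otherwise                         -> destination_object is used verbatim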
Example #48
cloud_provider = read_var("cloud_provider",
                          var_prefix="icon_",
                          required=False,
                          cloud_provider="gcp")

output_bucket = read_var("output_bucket", var_prefix="icon_", required=True)

if cloud_provider == "aws":
    from airflow.hooks.S3_hook import S3Hook

    cloud_storage_hook = S3Hook(aws_conn_id="aws_default")
if cloud_provider == "gcp":
    from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
    from iconetl_airflow.build_export_dag import upload_to_gcs

    cloud_storage_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id="google_cloud_default")

default_dag_args = {
    "depends_on_past": False,
    "start_date": start_date,
}

dag = DAG(
    "icon_generate_nightly_reports",
    schedule_interval="30 1 * * *",
    default_args=default_dag_args,
)

reports_folder = os.path.dirname(
    os.path.realpath(__file__)) + "/nightly_report_scripts"
Example #49
class GeotabToBigQueryOperator(BaseOperator):

    @apply_defaults
    def __init__(self,
                 gcs_conn_id,
                 gcs_bucket_name,
                 bq_project_name,
                 bq_dataset_name,
                 bq_table_name,
                 bq_table_schema,
                 update_info_dataset_id,
                 update_info_table_id,
                 geotab_conn_id,
                 geotab_data_type_name,
                 partition_column = None,
                 is_append_mode = True,
                 add_snapshot_time_column = False,
                 selected_column_list = [],
                 fields_preprocessing_map = [],
                 parse_data_field = False,
                 *args,
                 **kwargs):
        super(GeotabToBigQueryOperator, self).__init__(*args, **kwargs)
        self.gcs_conn_id = gcs_conn_id
        self.gcs_bucket_name = gcs_bucket_name
        self.bq_project_name = bq_project_name
        self.bq_dataset_name = bq_dataset_name
        self.bq_table_name = bq_table_name
        self.bq_table_schema = bq_table_schema
        self.update_info_dataset_id = update_info_dataset_id
        self.update_info_table_id = update_info_table_id
        self.geotab_conn_id = geotab_conn_id
        self.geotab_data_type_name = geotab_data_type_name
        self.partition_column = partition_column
        self.is_append_mode = is_append_mode
        self.add_snapshot_time_column = add_snapshot_time_column
        self.selected_column_list = selected_column_list
        self.fields_preprocessing_map = fields_preprocessing_map
        self.parse_data_field = parse_data_field

    def execute(self, context):
        self.log.info(f"start execute")
        try:
            self.init()
            self.calc_patch_interval()
            self.get_geotab_data()
            self.do_preprocess()
            self.write_to_csv()
            self.send_to_gcs()
            self.push_to_bigquery()
            self.set_last_updated_time()
        except Exception as e:
            self.log.exception(e)
            raise
        finally:
            self.clean_up()

    def init(self):
        self.log.info(f"init() is started")

        # bucket connection
        self.gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcs_conn_id, delegate_to=None)

        # bigquery connection
        self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcs_conn_id, use_legacy_sql=False)
        bq_conn = self.bq_hook.get_conn()
        self.bq_cursor = bq_conn.cursor()

        # geotab connection
        self.geotab_hook = GeotabHook(geotab_conn_id=self.geotab_conn_id)

        params = self.geotab_hook.get_connection(self.geotab_conn_id)
        self.log.info(f"login: "******", password: "******", schema: " + params.schema)

    def calc_patch_interval(self):
        self.log.info(f"calc_patch_interval() is started")
        self.interval_start_at = self.get_bigquery_last_updated_time()
        self.log.info(f"last updated: [{self.interval_start_at}]")
        # add 1 second to interval_start_at for [ )
        self.interval_start_at = self.add_seconds(self.interval_start_at, 1)
        self.interval_end_at = self.calc_interval_end_time(self.interval_start_at, PATCH_INTERVAL_DAY)
        self.log.info(f"patch interval range: [{self.interval_start_at}, {self.interval_end_at})")

    def get_geotab_data(self):
        self.log.info(f"get_geotab_data() is started")
        date_params= { 'fromDate': self.interval_start_at, 'toDate': self.interval_end_at}
        if self.geotab_data_type_name == 'DeviceStatusInfo':
            self.geotab_json_data = self.geotab_hook.get(type_name=self.geotab_data_type_name)
        else:
            self.geotab_json_data = self.geotab_hook.get(type_name=self.geotab_data_type_name, params=date_params)

    def do_preprocess(self):
        self.log.info(f"do_preprocess() is started")
        self.geotab_df = pd.DataFrame(self.geotab_json_data)
        # refine columns
        if len(self.selected_column_list) > 0:
            self.geotab_df = self.geotab_df[self.selected_column_list]
        # parse json
        for replace_set in self.fields_preprocessing_map:
            if len(replace_set) != 3:
                continue
            self.geotab_df[replace_set[2]]=self.geotab_df[replace_set[0]].map(lambda s: s[replace_set[1]])
        # add snapshotAt column
        if self.add_snapshot_time_column:
            self.geotab_df.insert(loc=0, column='snapshotAt', value=self.interval_end_at)
        #
        if self.parse_data_field:
            self.parse_data_field_for_customdata()

    def parse_data_field_for_customdata(self):
        self.geotab_df['device']=self.geotab_df['device'].map(lambda s: s['id'])
        self.geotab_df['data'] = self.geotab_df['data'].apply(lambda x: base64.b64decode(x).hex())

        customdata_name = ["Pkt Sequence", "Pkt Type", "Temperature", "Humidity", "PM1.0(1st)", "PM1.0(2nd)", "PM2.5(1st)", "PM2.5(2nd)", "PM10(1st)", "PM10(2nd)", "CO(1st)", "CO(2nd)", "CO2(1st)", "TBD(1st)", "TBD(2nd)"]
        # customdata_unit = ["[-]", "[-]", "[℃]", "[%]", "[mg/㎥]", "[mg/㎥]", "[mg/㎥]", "[mg/㎥]", "[mg/㎥]", "[mg/㎥]", "[ppm]", "[ppm]", "[ppm]", "[minute]", "[index]"]
        customdata_byte = [2,1,2,2,2,2,2,2,2,2,2,2,2,1,1]
        # decode each row's fixed-width hex payload into one dict per row
        list_ = []
        for index, row in self.geotab_df.iterrows():
            customdata_set = {}
            idx_to = 0
            for i in range(len(customdata_name)):
                idx_from = idx_to
                idx_to = idx_from + customdata_byte[i] * 2
                customdata_set[customdata_name[i]] = int(row['data'][idx_from:idx_to], 16)

            # apply the unit scaling per row
            customdata_set["Temperature"] = customdata_set["Temperature"] * 0.1 - 100
            customdata_set["Humidity"] *= 0.1
            customdata_set["CO(1st)"] *= 0.1
            customdata_set["CO(2nd)"] *= 0.1
            list_.append(customdata_set)

        self.geotab_df = pd.merge(self.geotab_df.loc[:, self.geotab_df.columns != 'data'],
                                  pd.DataFrame(list_, columns=customdata_name),
                                  left_index=True, right_index=True)
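    # A hedged worked example of the fixed-width hex decoding above, using a
    # made-up payload. Field widths come from customdata_byte: Pkt Sequence is
    # 2 bytes (4 hex chars), Pkt Type 1 byte, Temperature 2 bytes, and so on.
    #   sample_hex = "0001" + "01" + "04d2" + ...   # remaining fields omitted
    #   int("0001", 16) -> 1        (Pkt Sequence)
    #   int("01", 16)   -> 1        (Pkt Type)
    #   int("04d2", 16) -> 1234     (raw Temperature)
    #   1234 * 0.1 - 100 -> 23.4    (Temperature after the scaling applied above)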

    def write_to_csv(self):
        self.log.info(f"write_to_csv() is started")
        self.csv_file_name = self.geotab_data_type_name + '.csv'
        self.geotab_df.to_csv(self.csv_file_name, header=None, index=False)

    def send_to_gcs(self):
        self.log.info(f"send_to_gcs() is started")
        self.gcs_hook.upload(self.gcs_bucket_name, self.csv_file_name, self.csv_file_name)

    def push_to_bigquery(self):
        self.log.info(f"push_to_bigquery() is started")
        tp_dictionary = None

        if self.partition_column is not None:
            tp = bigquery.table.TimePartitioning()
            tp.expiration_ms = None
            tp.field = self.partition_column
            tp_dictionary = tp.to_api_repr()
        
        # check table existence
        is_table_exist = False
        if self.bq_hook.table_exists(self.bq_project_name, self.bq_dataset_name, self.bq_table_name):
            is_table_exist = True

        create_disposition='CREATE_IF_NEEDED'
        write_disposition='WRITE_TRUNCATE'
        if self.is_append_mode and is_table_exist:
            create_disposition='CREATE_NEVER'
            write_disposition='WRITE_APPEND'

        self.bq_cursor.run_load(
            destination_project_dataset_table=self.bq_table_id(self.bq_project_name, self.bq_dataset_name, self.bq_table_name),
            schema_fields=self.bq_table_schema,
            source_uris=[self.gs_uri(self.gcs_bucket_name, self.csv_file_name)],
            create_disposition=create_disposition,
            write_disposition=write_disposition,
            max_bad_records=0,
            allow_quoted_newlines=True,
            field_delimiter=',',
            src_fmt_configs={'nullMarker': 'NULL'},
            time_partitioning=tp_dictionary
        )

    def set_last_updated_time(self):
        self.log.info(f"set_last_updated_time() is started")
        self.set_bigquery_last_updated_time()

    def clean_up(self):
        self.log.info(f"clean_up() is started")
        if os.path.isfile(self.csv_file_name):
            os.remove(self.csv_file_name)
        #self.gcs_hook.delete(self.gcs_bucket_name, self.csv_file_name)

    ###############################################################################
    # helper
    def add_seconds(self, target_time, second_value):
        time_seconds_added = dt.datetime.strptime(target_time, '%Y-%m-%d %H:%M:%S')
        time_seconds_added = time_seconds_added + dt.timedelta(seconds=second_value)
        return time_seconds_added.strftime('%Y-%m-%d %H:%M:%S')

    def calc_interval_end_time(self, interval_start_time, interval_day):
        start_time = dt.datetime.strptime(interval_start_time, '%Y-%m-%d %H:%M:%S').replace(tzinfo = dt.timezone.utc)
        end_time = dt.datetime.now(dt.timezone.utc)
        if interval_day >= 1:
            end_time = start_time + dt.timedelta(days=interval_day)
        # subtract 1 second to interval_end_at for [ )
        end_time = end_time + dt.timedelta(seconds=-1)

        current_time = dt.datetime.now(dt.timezone.utc)
        if (end_time > current_time):
            end_time = current_time

        interval_end_time =  end_time.strftime('%Y-%m-%d %H:%M:%S')
        return interval_end_time

    def bq_table_id(self, project, dataset, table):
        return f"{project}:{dataset}.{table}"

    def gs_uri(self, bucket, file_key):
        return f"gs://{bucket}/{file_key}"

    def replace_escape_char(self, file_name):
        with open(file_name, 'r') as file:
            data = file.read()
        data = data.replace("'", "\"")
        with open(file_name, 'w') as file:
            file.write(data)

    def get_bigquery_last_updated_time(self):
        bq_query = self.get_last_updated_time_value_query(self.update_info_dataset_id, self.update_info_table_id,
                                                          self.bq_dataset_name, self.bq_table_name)
        self.log.info(f"get_bigquery_last_updated_time() - debug query: {bq_query}")
        bq_conn = self.bq_hook.get_conn()
        temp_bq_cursor = bq_conn.cursor()
        temp_bq_cursor.execute(bq_query)
        last_updated_time_value_row = temp_bq_cursor.fetchone()
        if last_updated_time_value_row is None:
            self.log.info(f"there is no bigquery table: {self.bq_table_name}")
            return PATCH_START_TIME
        else:
            last_updated = last_updated_time_value_row[0]
            self.log.info(f"get bigquery last updated time value: {last_updated}")
            last_updated_formatted = dt.datetime.strptime(last_updated, '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S')
            return last_updated_formatted

    def set_bigquery_last_updated_time(self):
        # first, check the inserted data
        bq_query_get = self.get_last_updated_time_value_query(self.update_info_dataset_id, self.update_info_table_id,
                                                              self.bq_dataset_name, self.bq_table_name)
        bq_conn_get = self.bq_hook.get_conn()
        bq_cursor_get = bq_conn_get.cursor()
        bq_cursor_get.execute(bq_query_get)
        last_updated_time_value_row = bq_cursor_get.fetchone()
       
        # insert or update last_updated time
        last_updated_formatted = dt.datetime.strptime(self.interval_end_at, '%Y-%m-%d %H:%M:%S').strftime('%Y%m%d%H%M%S')
        bq_query_set = ''
        if last_updated_time_value_row is None:
            self.log.info(f"insert last_updated: {self.bq_table_name}")
            bq_query_set = self.insert_last_updated_time_value_query(self.update_info_dataset_id, self.update_info_table_id,
                                                                     self.bq_dataset_name, self.bq_table_name, last_updated_formatted)
        else:
            self.log.info(f"update last_updated: {self.bq_table_name}")
            bq_query_set = self.update_last_updated_time_value_query(self.update_info_dataset_id, self.update_info_table_id,
                                                                     self.bq_dataset_name, self.bq_table_name, last_updated_formatted)
        bq_conn_set = self.bq_hook.get_conn()
        bq_cursor_set = bq_conn_set.cursor()
        bq_cursor_set.execute(bq_query_set)

    # queries for last_updated column
    def get_last_updated_time_value_query(self, ref_dataset, ref_table, target_dataset, target_table):
        return f"SELECT last_updated FROM {ref_dataset}.{ref_table} WHERE dataset_id = '{target_dataset}' AND table_id = '{target_table}';"

    def insert_last_updated_time_value_query(self, ref_dataset, ref_table, target_dataset, target_table, last_updated_value):
        return f"INSERT INTO {ref_dataset}.{ref_table} (dataset_id, table_id, last_updated) VALUES ('{target_dataset}', '{target_table}', '{last_updated_value}');"

    def update_last_updated_time_value_query(self, ref_dataset, ref_table, target_dataset, target_table, last_updated_value):
        return f"UPDATE {ref_dataset}.{ref_table} SET last_updated = '{last_updated_value}' WHERE dataset_id = '{target_dataset}' AND table_id = '{target_table}';"
Example #50
def build_export_dag(
    dag_id,
    provider_uris,
    output_bucket,
    export_start_date,
    export_end_date=None,
    notification_emails=None,
    export_schedule_interval='0 0 * * *',
    export_max_workers=5,
    export_max_active_runs=None,
    gzip=False,
):
    """Build Export DAG"""
    default_dag_args = {
        "depends_on_past": False,
        "start_date": export_start_date,
        "end_date": export_end_date,
        "email_on_failure": True,
        "email_on_retry": False,
        "retries": 10,
        "retry_delay": timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [
            email.strip() for email in notification_emails.split(',')
        ]

    if export_max_active_runs is None:
        export_max_active_runs = configuration.conf.getint(
            'core', 'max_active_runs_per_dag')

    dag = DAG(dag_id,
              schedule_interval=export_schedule_interval,
              default_args=default_dag_args,
              max_active_runs=export_max_active_runs)

    from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
    cloud_storage_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id="google_cloud_default")

    def import_zilliqaetl():
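        # Deferred import: zilliqaetl is only needed when the task actually runs,
        # so importing here keeps DAG parsing light; the globals() assignments
        # below expose the CLI callbacks to the other task callables in this file.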
        from zilliqaetl.cli import (
            get_ds_block_range_for_date,
            get_tx_block_range_for_date,
            export_ds_blocks,
            export_tx_blocks,
        )
        globals()['get_ds_block_range_for_date'] = get_ds_block_range_for_date
        globals()['get_tx_block_range_for_date'] = get_tx_block_range_for_date
        globals()['export_ds_blocks'] = export_ds_blocks
        globals()['export_tx_blocks'] = export_tx_blocks

    # Export
    def export_path(directory, date):
        return "export/{directory}/block_date={block_date}/".format(
            directory=directory, block_date=date.strftime("%Y-%m-%d"))

    def copy_to_export_path(file_path,
                            export_path,
                            upload_empty_if_not_exist=True):
        logging.info('Calling copy_to_export_path({}, {})'.format(
            file_path, export_path))
        filename = os.path.basename(file_path)
        if gzip:
            filename = Path(file_path).stem + '.gz'

        if not os.path.exists(file_path):
            if upload_empty_if_not_exist:
                open(file_path, mode='a').close()
            else:
                raise ValueError('File {} does not exist'.format(file_path))

        upload_to_gcs(gcs_hook=cloud_storage_hook,
                      bucket=output_bucket,
                      object=export_path + filename,
                      filename=file_path,
                      gzip=gzip)

    def get_ds_block_range(tempdir, date, provider_uri):
        logging.info('Calling get_ds_block_range_for_date({}, {}, ...)'.format(
            provider_uri, date))
        get_ds_block_range_for_date.callback(provider_uri=provider_uri,
                                             date=date,
                                             output=os.path.join(
                                                 tempdir,
                                                 "ds_block_range.txt"))

        with open(os.path.join(tempdir,
                               "ds_block_range.txt")) as block_range_file:
            block_range = block_range_file.read()
            start_block, end_block = block_range.split(",")

        return int(start_block), int(end_block)

    def get_tx_block_range(tempdir, date, provider_uri):
        logging.info('Calling get_tx_block_range_for_date({}, {}, ...)'.format(
            provider_uri, date))
        get_tx_block_range_for_date.callback(provider_uri=provider_uri,
                                             date=date,
                                             output=os.path.join(
                                                 tempdir,
                                                 "tx_block_range.txt"))

        with open(os.path.join(tempdir,
                               "tx_block_range.txt")) as block_range_file:
            block_range = block_range_file.read()
            start_block, end_block = block_range.split(",")

        return int(start_block), int(end_block)

    def export_ds_blocks_command(execution_date, provider_uri, **kwargs):
        import_zilliqaetl()
        with TemporaryDirectory() as tempdir:
            start_block, end_block = get_ds_block_range(
                tempdir, execution_date, provider_uri)

            logging.info('Calling export_ds_blocks({}, {}, {}, {}, {})'.format(
                start_block, end_block, provider_uri, export_max_workers,
                tempdir))

            export_ds_blocks.callback(start_block=start_block,
                                      end_block=end_block,
                                      provider_uri=provider_uri,
                                      max_workers=export_max_workers,
                                      output_dir=tempdir,
                                      output_format='json')

            copy_to_export_path(os.path.join(tempdir, "ds_block_range.txt"),
                                export_path("ds_block_range", execution_date))

            copy_to_export_path(os.path.join(tempdir, "ds_blocks.json"),
                                export_path("ds_blocks", execution_date))

    def export_tx_blocks_command(execution_date, provider_uri, **kwargs):
        import_zilliqaetl()
        with TemporaryDirectory() as tempdir:
            start_block, end_block = get_tx_block_range(
                tempdir, execution_date, provider_uri)

            logging.info('Calling export_tx_blocks({}, {}, {}, {}, {})'.format(
                start_block, end_block, provider_uri, export_max_workers,
                tempdir))

            export_tx_blocks.callback(start_block=start_block,
                                      end_block=end_block,
                                      provider_uri=provider_uri,
                                      max_workers=export_max_workers,
                                      output_dir=tempdir,
                                      output_format='json',
                                      rate_limit=3)

            copy_to_export_path(os.path.join(tempdir, "tx_block_range.txt"),
                                export_path("tx_block_range", execution_date))

            copy_to_export_path(os.path.join(tempdir, "tx_blocks.json"),
                                export_path("tx_blocks", execution_date))

            copy_to_export_path(os.path.join(tempdir, "transactions.json"),
                                export_path("transactions", execution_date))

            copy_to_export_path(os.path.join(tempdir, "transitions.json"),
                                export_path("transitions", execution_date))

            copy_to_export_path(os.path.join(tempdir, "event_logs.json"),
                                export_path("event_logs", execution_date))

            copy_to_export_path(os.path.join(tempdir, "exceptions.json"),
                                export_path("exceptions", execution_date))

    def add_export_task(toggle, task_id, python_callable, dependencies=None):
        if toggle:
            operator = python_operator.PythonOperator(
                task_id=task_id,
                python_callable=python_callable,
                provide_context=True,
                execution_timeout=timedelta(hours=48),
                dag=dag,
            )
            if dependencies is not None and len(dependencies) > 0:
                for dependency in dependencies:
                    if dependency is not None:
                        dependency >> operator
            return operator
        else:
            return None

    # Operators

    add_export_task(
        True,
        "export_ds_blocks",
        add_provider_uri_fallback_loop(export_ds_blocks_command,
                                       provider_uris),
    )

    add_export_task(
        True, "export_tx_blocks",
        add_provider_uri_fallback_loop(export_tx_blocks_command,
                                       provider_uris))

    return dag
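A hedged usage sketch, not in the original example, of instantiating the factory at module level so the scheduler discovers the DAG. Every argument value below is a placeholder; `provider_uris` is assumed to be an iterable of endpoints, as implied by `add_provider_uri_fallback_loop`.

from datetime import datetime

# All values below are placeholders.
ZILLIQA_EXPORT_DAG = build_export_dag(
    dag_id='zilliqa_export',
    provider_uris=['https://api.zilliqa.com'],   # assumed: list of fallback endpoints
    output_bucket='my-export-bucket',
    export_start_date=datetime(2021, 1, 1),
    notification_emails='alerts@example.com',    # comma-separated string, per the split(',') above
    export_max_workers=10,
    gzip=True,
)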
Example #51
0
def list_objects(bucket=None):
    hook = GoogleCloudStorageHook()
    storage_objects = hook.list(bucket)

    return storage_objects
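A hedged sketch, not in the original, of exposing list_objects as a DAG task. The PythonOperator import path is the Airflow 1.x one matching the contrib-era hooks in this listing; the bucket name and `dag` object are placeholders.

from airflow.operators.python_operator import PythonOperator

# `dag` is assumed to be defined elsewhere in the DAG file.
list_gcs_objects = PythonOperator(
    task_id='list_gcs_objects',
    python_callable=list_objects,
    op_kwargs={'bucket': 'my-source-bucket'},
    dag=dag,
)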
Example #52
0
    def execute(self, context):

        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)

        if '*' in self.source_object:
            wildcard_position = self.source_object.index('*')
            objects = hook.list(
                self.source_bucket,
                prefix=self.source_object[:wildcard_position],
                delimiter=self.source_object[wildcard_position + 1:])

            for source_object in objects:
                if self.destination_object:
                    destination_object = "{}/{}".format(
                        self.destination_object,
                        source_object[wildcard_position:])
                else:
                    destination_object = source_object
                self.log.info('Executing copy of gs://{0}/{1} to '
                              'gs://{2}/{3}'.format(self.source_bucket,
                                                    source_object,
                                                    self.destination_bucket,
                                                    destination_object))

                hook.copy(self.source_bucket, source_object,
                          self.destination_bucket, destination_object)
                if self.move_object:
                    hook.delete(self.source_bucket, source_object)

        else:
            self.log.info('Executing copy of gs://{0}/{1} to '
                          'gs://{2}/{3}'.format(
                              self.source_bucket, self.source_object,
                              self.destination_bucket or self.source_bucket,
                              self.destination_object or self.source_object))
            hook.copy(self.source_bucket, self.source_object,
                      self.destination_bucket, self.destination_object)

            if self.move_object:
                hook.delete(self.source_bucket, self.source_object)
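The method above appears to be the execute() of the contrib GCS-to-GCS copy operator; a hedged usage sketch under that assumption, with placeholder buckets, paths, and `dag` object.

from airflow.contrib.operators.gcs_to_gcs import \
    GoogleCloudStorageToGoogleCloudStorageOperator

copy_backups = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id='copy_backups',
    source_bucket='my-source-bucket',
    source_object='backups/*.avro',           # wildcard takes the list-and-copy branch above
    destination_bucket='my-archive-bucket',
    destination_object='archived/backups',    # each match is rewritten under this prefix
    move_object=False,                        # True would delete the source after copying
    dag=dag,
)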
Example #53
0
 def execute(self, context):
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_storage_conn_id
     )
     hook.insert_bucket_acl(bucket_name=self.bucket, entity=self.entity, role=self.role,
                            user_project=self.user_project)
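A hedged sketch, not from the original, of invoking the same hook method directly, for instance from a PythonOperator callable. The bucket, entity, and role values are placeholders, and the keyword name of the bucket argument varies between the Airflow versions represented in this listing (bucket_name here, bucket in the later ACL snippet).

def grant_bucket_read_acl(**_):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id='google_cloud_default')
    # 'allAuthenticatedUsers' and 'READER' are standard GCS bucket ACL values;
    # use bucket= instead of bucket_name= on older hook versions.
    hook.insert_bucket_acl(bucket_name='my-export-bucket',
                           entity='allAuthenticatedUsers',
                           role='READER')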
Example #54
0
    def execute(self, context):
        # use the super method to list all the files in an S3 bucket/key
        files = super().execute(context)

        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.dest_gcs_conn_id,
            delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> list all files in the GCS bucket
            # and only keep those files which are present in
            # S3 and not in Google Cloud Storage
            bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
            existing_files_prefixed = gcs_hook.list(bucket_name,
                                                    prefix=object_prefix)

            existing_files = []

            if existing_files_prefixed:
                # Remove the object prefix itself, an empty directory was found
                if object_prefix in existing_files_prefixed:
                    existing_files_prefixed.remove(object_prefix)

                # Remove the object prefix from all object string paths
                for f in existing_files_prefixed:
                    if f.startswith(object_prefix):
                        existing_files.append(f[len(object_prefix):])
                    else:
                        existing_files.append(f)

            files = list(set(files) - set(existing_files))
            if len(files) > 0:
                self.log.info('%s files are going to be synced: %s.',
                              len(files), files)
            else:
                self.log.info(
                    'There are no new files to sync. Have a nice day!')

        if files:
            hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

            for file in files:
                # GCS hook builds its own in-memory file so we have to create
                # and pass the path
                file_object = hook.get_key(file, self.bucket)
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    file_object.download_fileobj(f)
                    f.flush()

                    dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(
                        self.dest_gcs)
                    # There will always be a '/' before file because it is
                    # enforced at instantiation time
                    dest_gcs_object = dest_gcs_object_prefix + file

                    # Sync is sequential and the hook already logs too much
                    # so skip this for now
                    # self.log.info(
                    #     'Saving file {0} from S3 bucket {1} in GCS bucket {2}'
                    #     ' as object {3}'.format(file, self.bucket,
                    #                             dest_gcs_bucket,
                    #                             dest_gcs_object))

                    gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name)

            self.log.info(
                "All done, uploaded %d files to Google Cloud Storage",
                len(files))
        else:
            self.log.info(
                'In sync, no files needed to be uploaded to Google Cloud '
                'Storage')

        return files
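The method above is the execute() of the contrib S3-to-GCS sync operator; a hedged usage sketch under that assumption, with placeholder connection ids, bucket, prefix, and `dag` object.

from airflow.contrib.operators.s3_to_gcs_operator import \
    S3ToGoogleCloudStorageOperator

s3_to_gcs_sync = S3ToGoogleCloudStorageOperator(
    task_id='s3_to_gcs_sync',
    bucket='my-s3-bucket',
    prefix='exports/',
    aws_conn_id='aws_default',
    dest_gcs_conn_id='google_cloud_default',
    dest_gcs='gs://my-gcs-bucket/exports/',   # trailing '/' expected, per the comment above
    replace=False,                            # only copy keys missing from GCS
    dag=dag,
)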
Example #55
0
 def __init__(self,
              gcp_conn_id='google_cloud_default',
              delegate_to=None):
     self._gcs_hook = GoogleCloudStorageHook(gcp_conn_id, delegate_to)
Example #56
0
 def _get_gcs_hook(self):
     if self.gcs_hook is None:
         self.gcs_hook = GoogleCloudStorageHook(
             google_cloud_storage_conn_id=self.gcs_conn_id,
             delegate_to=self.delegate_to)
     return self.gcs_hook
Example #57
0
 def execute(self, context):
     logging.info('Executing download: %s, %s, %s', self.bucket, self.object, self.filename)
     hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.google_cloud_storage_conn_id)
     print(hook.download(self.bucket, self.object, self.filename))
Example #58
0
def check_gcs_file_exists(file_name, google_cloud_conn_id, bucket):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=google_cloud_conn_id)
    return hook.exists(bucket, file_name)
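A hedged sketch, not in the original, of using check_gcs_file_exists to short-circuit a DAG branch when the expected object is missing; the connection id, bucket, file name, and `dag` object are placeholders.

from airflow.operators.python_operator import ShortCircuitOperator

check_daily_extract = ShortCircuitOperator(
    task_id='check_daily_extract_exists',
    python_callable=check_gcs_file_exists,
    op_kwargs={
        'file_name': 'extracts/latest/data.csv',
        'google_cloud_conn_id': 'google_cloud_default',
        'bucket': 'my-landing-bucket',
    },
    dag=dag,
)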
Example #59
0
 def execute(self, context):
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_storage_conn_id
     )
     hook.insert_bucket_acl(bucket=self.bucket, entity=self.entity, role=self.role,
                            user_project=self.user_project)
Example #60
0
 def __init__(self, gcp_conn_id='google_cloud_default', delegate_to=None):
     self._gcs_hook = GoogleCloudStorageHook(gcp_conn_id, delegate_to)