def execute(self, context):
    # use the super to list all files in an Azure Data Lake path
    files = super(AdlsToGoogleCloudStorageOperator, self).execute(context)
    g_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)

    if not self.replace:
        # if we are not replacing -> list all files in the ADLS path
        # and only keep those files which are present in
        # ADLS and not in Google Cloud Storage
        bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
        existing_files = g_hook.list(bucket=bucket_name, prefix=prefix)
        files = set(files) - set(existing_files)

    if files:
        hook = AzureDataLakeHook(
            azure_data_lake_conn_id=self.azure_data_lake_conn_id
        )

        for obj in files:
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                hook.download_file(local_path=f.name, remote_path=obj)
                f.flush()
                dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)
                dest_path = os.path.join(dest_gcs_prefix, obj)
                self.log.info("Saving file to %s", dest_path)

                g_hook.upload(bucket=dest_gcs_bucket, object=dest_path, filename=f.name)

        self.log.info("All done, uploaded %d files to GCS", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to GCS")

    return files
def poke(self, context):
    self.log.info('Sensor checks existence of objects: %s, %s', self.bucket, self.prefix)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_conn_id,
        delegate_to=self.delegate_to)
    return bool(hook.list(self.bucket, prefix=self.prefix))
def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to
    )

    if '*' in self.source_object:
        wildcard_position = self.source_object.index('*')
        objects = hook.list(self.source_bucket,
                            prefix=self.source_object[:wildcard_position],
                            delimiter=self.source_object[wildcard_position + 1:])
        for source_object in objects:
            self.log.info('Executing copy of gs://{0}/{1} to '
                          'gs://{2}/{3}/{1}'.format(self.source_bucket, source_object,
                                                    self.destination_bucket,
                                                    self.destination_object, source_object))
            hook.copy(self.source_bucket, source_object,
                      self.destination_bucket,
                      "{}/{}".format(self.destination_object, source_object))
            if self.move_object:
                hook.delete(self.source_bucket, source_object)
    else:
        self.log.info('Executing copy: %s, %s, %s, %s',
                      self.source_bucket, self.source_object,
                      self.destination_bucket or self.source_bucket,
                      self.destination_object or self.source_object)
        hook.copy(self.source_bucket, self.source_object,
                  self.destination_bucket, self.destination_object)
        if self.move_object:
            hook.delete(self.source_bucket, self.source_object)
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    schema_fields = self.schema_fields if self.schema_fields else json.loads(
        gcs_hook.download(self.bucket, self.schema_object))
    source_uris = map(lambda schema_object: 'gs://{}/{}'.format(self.bucket, schema_object),
                      self.source_objects)
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.run_load(
        destination_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        create_disposition=self.create_disposition,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
        field_delimiter=self.field_delimiter)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, self.destination_project_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        logging.info('Loaded BQ data with max {}.{}={}'.format(
            self.destination_project_dataset_table, self.max_id_key, max_id))
        return max_id
def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id
    )

    hook.insert_object_acl(bucket=self.bucket,
                           object_name=self.object_name,
                           entity=self.entity,
                           role=self.role,
                           generation=self.generation,
                           user_project=self.user_project)
def execute(self, context):
    self.log.info('Executing copy - Source_Bucket: %s, Source_directory: %s, '
                  'Destination_bucket: %s, Destination_directory: %s',
                  self.source_bucket, self.source_object,
                  self.destination_bucket or self.source_bucket,
                  self.destination_directory or self.source_object)

    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)

    self.log.info('Getting list of the files to copy. Source Bucket: %s; Source Object: %s',
                  self.source_bucket, self.source_object)

    # Create a list of objects to copy from the source bucket. The function uses the
    # prefix keyword to pass the name of the object to copy.
    self.files_to_copy = hook.list(bucket=self.source_bucket,
                                   prefix=self.source_object,
                                   delimiter=self.source_files_delimiter)

    # Log the names of all objects to be copied
    self.log.info('Files to copy: %s', self.files_to_copy)

    if self.files_to_copy is not None:
        for file_to_copy in self.files_to_copy:
            self.log.info('Source_Bucket: %s, Source_Object: %s, '
                          'Destination_bucket: %s, Destination_Directory: %s',
                          self.source_bucket, file_to_copy,
                          self.destination_bucket or self.source_bucket,
                          self.destination_directory + file_to_copy)
            hook.copy(self.source_bucket, file_to_copy,
                      self.destination_bucket, self.destination_directory + file_to_copy)
    else:
        self.log.info('No Files to copy.')
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.gcs_schema_object:
        gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            gcs_bucket, gcs_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    cursor.create_empty_table(
        project_id=self.project_id,
        dataset_id=self.dataset_id,
        table_id=self.table_id,
        schema_fields=schema_fields,
        time_partitioning=self.time_partitioning,
        labels=self.labels
    )
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.schema_object \
            and self.source_format != 'DATASTORE_BACKUP':
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                   for source_object in self.source_objects]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    cursor.create_external_table(
        external_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        compression=self.compression,
        skip_leading_rows=self.skip_leading_rows,
        field_delimiter=self.field_delimiter,
        max_bad_records=self.max_bad_records,
        quote_character=self.quote_character,
        allow_quoted_newlines=self.allow_quoted_newlines,
        allow_jagged_rows=self.allow_jagged_rows,
        src_fmt_configs=self.src_fmt_configs,
        labels=self.labels
    )
def execute(self, context):
    # use the super method to list all files in a Google Cloud Storage bucket
    files = super(GoogleCloudStorageToS3Operator, self).execute(context)
    s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

    if not self.replace:
        # if we are not replacing -> list all files in the S3 bucket
        # and only keep those files which are present in
        # Google Cloud Storage and not in S3
        bucket_name, _ = S3Hook.parse_s3_url(self.dest_s3_key)
        existing_files = s3_hook.list_keys(bucket_name)
        files = set(files) - set(existing_files)

    if files:
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )

        for file in files:
            file_bytes = hook.download(self.bucket, file)
            dest_key = self.dest_s3_key + file
            self.log.info("Saving file to %s", dest_key)
            s3_hook.load_bytes(file_bytes, key=dest_key, replace=self.replace)

        self.log.info("All done, uploaded %d files to S3", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to S3")

    return files
def _upload_to_gcs(self, files_to_upload):
    """
    Upload all of the file splits (and optionally the schema .json file) to
    Google cloud storage.
    """
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    for object, tmp_file_handle in files_to_upload.items():
        hook.upload(self.bucket, object, tmp_file_handle.name, 'application/json')
def execute(self, context):
    logging.info('Executing download: %s, %s, %s', self.bucket, self.object, self.filename)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    file_bytes = hook.download(self.bucket, self.object, self.filename)

    if self.store_to_xcom_key:
        if sys.getsizeof(file_bytes) < 48000:
            context['ti'].xcom_push(key=self.store_to_xcom_key, value=file_bytes)
        else:
            raise RuntimeError('The size of the downloaded file is too large to push to XCom!')

    print(file_bytes)
def apply_validate_fn(*args, **kwargs):
    prediction_path = kwargs["templates_dict"]["prediction_path"]
    scheme, bucket, obj, _, _ = urlsplit(prediction_path)
    if scheme != "gs" or not bucket or not obj:
        raise ValueError("Wrong format prediction_path: %s", prediction_path)
    summary = os.path.join(obj.strip("/"), "prediction.summary.json")
    gcs_hook = GoogleCloudStorageHook()
    summary = json.loads(gcs_hook.download(bucket, summary))
    return validate_fn(summary)
def _upload_to_gcs(self, files_to_upload):
    """
    Upload all of the file splits (and optionally the schema .json file) to
    Google cloud storage.
    """
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    for tmp_file in files_to_upload:
        hook.upload(self.bucket, tmp_file.get('file_name'),
                    tmp_file.get('file_handle').name,
                    mime_type=tmp_file.get('file_mime_type'))
def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to
    )

    self.log.info('Getting list of the files. Bucket: %s; Delimiter: %s; Prefix: %s',
                  self.bucket, self.delimiter, self.prefix)

    return hook.list(bucket=self.bucket, prefix=self.prefix, delimiter=self.delimiter)
def execute(self, context): """ Uploads the file to Google cloud storage """ hook = GoogleCloudStorageHook( google_cloud_storage_conn_id=self.google_cloud_storage_conn_id, delegate_to=self.delegate_to) hook.upload( bucket=self.bucket, object=self.dst, mime_type=self.mime_type, filename=self.src)
def execute(self, context):
    gcp_text_to_speech_hook = GCPTextToSpeechHook(gcp_conn_id=self.gcp_conn_id)

    result = gcp_text_to_speech_hook.synthesize_speech(
        input_data=self.input_data,
        voice=self.voice,
        audio_config=self.audio_config,
        retry=self.retry,
        timeout=self.timeout,
    )

    with NamedTemporaryFile() as temp_file:
        temp_file.write(result.audio_content)
        cloud_storage_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcp_conn_id)
        cloud_storage_hook.upload(
            bucket=self.target_bucket_name, object=self.target_filename, filename=temp_file.name
        )
def execute(self, context):
    if self.labels is not None:
        self.labels.update(
            {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')}
        )

    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to
    )

    hook.create_bucket(bucket_name=self.bucket_name,
                       storage_class=self.storage_class,
                       location=self.location,
                       project_id=self.project_id,
                       labels=self.labels)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.schema_object \
            and self.source_format != 'DATASTORE_BACKUP':
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                   for source_object in self.source_objects]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()
    cursor.run_load(
        destination_project_dataset_table=self.destination_project_dataset_table,
        schema_fields=schema_fields,
        source_uris=source_uris,
        source_format=self.source_format,
        create_disposition=self.create_disposition,
        skip_leading_rows=self.skip_leading_rows,
        write_disposition=self.write_disposition,
        field_delimiter=self.field_delimiter,
        max_bad_records=self.max_bad_records,
        quote_character=self.quote_character,
        allow_quoted_newlines=self.allow_quoted_newlines,
        allow_jagged_rows=self.allow_jagged_rows,
        schema_update_options=self.schema_update_options,
        src_fmt_configs=self.src_fmt_configs,
        time_partitioning=self.time_partitioning)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, self.destination_project_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        self.log.info(
            'Loaded BQ data with max %s.%s=%s',
            self.destination_project_dataset_table, self.max_id_key, max_id
        )
        return max_id
class GoogleCloudBucketHelper(object):
    """GoogleCloudStorageHook helper class to download GCS object."""
    GCS_PREFIX_LENGTH = 5

    def __init__(self, gcp_conn_id='google_cloud_default', delegate_to=None):
        self._gcs_hook = GoogleCloudStorageHook(gcp_conn_id, delegate_to)

    def google_cloud_to_local(self, file_name):
        """
        Checks whether the file specified by file_name is stored in Google Cloud
        Storage (GCS), if so, downloads the file and saves it locally. The full
        path of the saved file will be returned. Otherwise the local file_name
        will be returned immediately.

        :param file_name: The full path of input file.
        :type file_name: str
        :return: The full path of local file.
        :type: str
        """
        if not file_name.startswith('gs://'):
            return file_name

        # Extracts bucket_id and object_id by first removing 'gs://' prefix and
        # then split the remaining by path delimiter '/'.
        path_components = file_name[self.GCS_PREFIX_LENGTH:].split('/')
        if len(path_components) < 2:
            raise Exception(
                'Invalid Google Cloud Storage (GCS) object path: {}'.format(file_name))

        bucket_id = path_components[0]
        object_id = '/'.join(path_components[1:])
        local_file = '/tmp/dataflow{}-{}'.format(str(uuid.uuid4())[:8],
                                                 path_components[-1])
        self._gcs_hook.download(bucket_id, object_id, local_file)

        if os.stat(local_file).st_size > 0:
            return local_file
        raise Exception(
            'Failed to download Google Cloud Storage (GCS) object: {}'.format(file_name))
def execute(self, context):
    # use the super method to list all files in a Google Cloud Storage bucket
    files = super().execute(context)
    s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

    if not self.replace:
        # if we are not replacing -> list all files in the S3 bucket
        # and only keep those files which are present in
        # Google Cloud Storage and not in S3
        bucket_name, prefix = S3Hook.parse_s3_url(self.dest_s3_key)
        # look for the bucket and the prefix to avoid looking into
        # parent directories/keys
        existing_files = s3_hook.list_keys(bucket_name, prefix=prefix)
        # in case no files exist, return an empty array to avoid errors
        existing_files = existing_files if existing_files is not None else []
        # remove the prefix for the existing files to allow the match
        existing_files = [file.replace(prefix, '', 1) for file in existing_files]
        files = list(set(files) - set(existing_files))

    if files:
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )

        for file in files:
            file_bytes = hook.download(self.bucket, file)
            dest_key = self.dest_s3_key + file
            self.log.info("Saving file to %s", dest_key)
            s3_hook.load_bytes(file_bytes, key=dest_key, replace=self.replace)

        self.log.info("All done, uploaded %d files to S3", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to S3")

    return files
def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to
    )

    log_message = 'Executing copy of gs://{0}/{1} to gs://{2}/{3}'

    if self.wildcard in self.source_object:
        prefix, delimiter = self.source_object.split(self.wildcard, 1)
        objects = hook.list(self.source_bucket, prefix=prefix, delimiter=delimiter)

        for source_object in objects:
            if self.destination_object is None:
                destination_object = source_object
            else:
                destination_object = source_object.replace(prefix,
                                                           self.destination_object, 1)

            self.log.info(
                log_message.format(self.source_bucket, source_object,
                                   self.destination_bucket, destination_object)
            )

            hook.rewrite(self.source_bucket, source_object,
                         self.destination_bucket, destination_object)
            if self.move_object:
                hook.delete(self.source_bucket, source_object)
    else:
        self.log.info(
            log_message.format(self.source_bucket, self.source_object,
                               self.destination_bucket or self.source_bucket,
                               self.destination_object or self.source_object)
        )
        hook.rewrite(self.source_bucket, self.source_object,
                     self.destination_bucket, self.destination_object)
        if self.move_object:
            hook.delete(self.source_bucket, self.source_object)
def execute(self, context):
    self.log.info('Exporting data to Cloud Storage bucket ' + self.bucket)

    if self.overwrite_existing and self.namespace:
        gcs_hook = GoogleCloudStorageHook(self.cloud_storage_conn_id)
        objects = gcs_hook.list(self.bucket, prefix=self.namespace)
        for o in objects:
            gcs_hook.delete(self.bucket, o)

    ds_hook = DatastoreHook(self.datastore_conn_id, self.delegate_to)
    result = ds_hook.export_to_storage_bucket(bucket=self.bucket,
                                              namespace=self.namespace,
                                              entity_filter=self.entity_filter,
                                              labels=self.labels)
    operation_name = result['name']
    result = ds_hook.poll_operation_until_done(operation_name,
                                               self.polling_interval_in_seconds)

    state = result['metadata']['common']['state']
    if state != 'SUCCESSFUL':
        raise AirflowException('Operation failed: result={}'.format(result))

    return result
def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to
    )

    if self.destination_bucket is None:
        self.log.warning(
            'destination_bucket is None. Defaulting it to source_bucket (%s)',
            self.source_bucket)
        self.destination_bucket = self.source_bucket

    if WILDCARD in self.source_object:
        total_wildcards = self.source_object.count(WILDCARD)
        if total_wildcards > 1:
            error_msg = "Only one wildcard '*' is allowed in source_object parameter. " \
                        "Found {} in {}.".format(total_wildcards, self.source_object)
            raise AirflowException(error_msg)

        prefix, delimiter = self.source_object.split(WILDCARD, 1)
        objects = hook.list(self.source_bucket, prefix=prefix, delimiter=delimiter)

        for source_object in objects:
            if self.destination_object is None:
                destination_object = source_object
            else:
                destination_object = source_object.replace(prefix,
                                                           self.destination_object, 1)

            self._copy_single_object(hook=hook, source_object=source_object,
                                     destination_object=destination_object)
    else:
        self._copy_single_object(hook=hook, source_object=self.source_object,
                                 destination_object=self.destination_object)
def __init__(self): """ Attempt to create hook with airflow[gcp_api]. """ remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID') self.hook = None try: from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook self.hook = GoogleCloudStorageHook( google_cloud_storage_conn_id=remote_conn_id) except: logging.error( 'Could not create a GoogleCloudStorageHook with connection id ' '"{}". Please make sure that airflow[gcp_api] is installed ' 'and the GCS connection exists.'.format(remote_conn_id))
def move_error_file_func(**context):
    filename, filepath = context['ti'].xcom_pull(task_ids='create_file')
    conn = GoogleCloudStorageHook()
    target_bucket = os.getenv('UPLOAD_GCS_BUCKET_NAME')
    target_object = 'moved/' + filename
    conn.upload(target_bucket, target_object, filepath)
def _upload_to_gcs(self, files_to_upload):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    for object, tmp_file_handle in files_to_upload.items():
        hook.upload(self.bucket, object, tmp_file_handle.name, 'application/json')
def compileactivity():
    hook = GoogleCloudStorageHook()
    github_response = hook.download(bucket=GOOGLE_STORAGE_BUCKET, object=GITHUB_OUTPUT_FILENAME)
    strava_response = hook.download(bucket=GOOGLE_STORAGE_BUCKET, object=STRAVA_OUTPUT_FILENAME)
    github_response_json = json.loads(github_response.decode("utf-8"))
    strava_response_json = json.loads(strava_response.decode("utf-8"))

    cleaned_github_commits = list(
        map(
            lambda item: {
                'created_at': dateutil.parser.parse(item['commit']['committer']['date']).isoformat(),
                'username': item['committer']['login'],
                'url': item['html_url'],
                'sha': item['sha'],
                'message': item['commit']['message'],
                'repo': item['repository']['full_name']
            },
            github_response_json['items']))
    cleaned_github_commits.sort(
        key=lambda x: dateutil.parser.parse(x['created_at']), reverse=True)

    cleaned_strava_activity = list(
        map(
            lambda ride: {
                'created_at': dateutil.parser.parse(ride['start_date']).isoformat(),
                'name': ride['name'],
                'distance_miles': round(ride['distance'] / 1609.34, 2),
                'type': ride['type'],
                'elapsed_time_seconds': ride['elapsed_time']
            },
            strava_response_json))
    cleaned_strava_activity.sort(
        key=lambda x: dateutil.parser.parse(x['created_at']), reverse=True)

    public_activity = {
        'github': cleaned_github_commits[0:4],
        'strava': cleaned_strava_activity[0:4],
    }

    hook = GoogleCloudStorageHook()
    with tempfile.NamedTemporaryFile(prefix="gcs-local") as file:
        file.write(json.dumps(public_activity).encode('utf-8'))
        file.flush()
        hook.upload(bucket=GOOGLE_STORAGE_BUCKET,
                    filename=file.name,
                    object=OUTPUT_FILENAME,
                    mime_type='application/json')
    hook.insert_object_acl(
        bucket=GOOGLE_STORAGE_BUCKET,
        object_name=OUTPUT_FILENAME,
        entity='allUsers',
        role='READER',
    )
def execute(self, context):
    gcshook = GoogleCloudStorageHook(self.gcp_conn_id)
    self.log.info(gcshook.list("testcovidlinh"))
def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to
    )

    log_message = 'Executing copy of gs://{0}/{1} to gs://{2}/{3}'

    if self.wildcard in self.source_object:
        prefix, delimiter = self.source_object.split(self.wildcard, 1)
        objects = hook.list(self.source_bucket, prefix=prefix, delimiter=delimiter)

        for source_object in objects:
            if self.destination_object is None:
                destination_object = source_object
            else:
                destination_object = source_object.replace(prefix,
                                                           self.destination_object, 1)

            self.log.info(
                log_message.format(self.source_bucket, source_object,
                                   self.destination_bucket, destination_object)
            )

            hook.copy(self.source_bucket, source_object,
                      self.destination_bucket, destination_object)
            if self.move_object:
                hook.delete(self.source_bucket, source_object)
    else:
        self.log.info(
            log_message.format(self.source_bucket, self.source_object,
                               self.destination_bucket or self.source_bucket,
                               self.destination_object or self.source_object)
        )
        hook.copy(self.source_bucket, self.source_object,
                  self.destination_bucket, self.destination_object)
        if self.move_object:
            hook.delete(self.source_bucket, self.source_object)
def execute(self, context):
    # use the super method to list all the files in an S3 bucket/key
    files = super().execute(context)

    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.dest_gcs_conn_id,
        delegate_to=self.delegate_to)

    if not self.replace:
        # if we are not replacing -> list all files in the GCS bucket
        # and only keep those files which are present in
        # S3 and not in Google Cloud Storage
        bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
        existing_files_prefixed = gcs_hook.list(bucket_name, prefix=object_prefix)

        existing_files = []

        if existing_files_prefixed:
            # Remove the object prefix itself; an empty directory was found
            if object_prefix in existing_files_prefixed:
                existing_files_prefixed.remove(object_prefix)

            # Remove the object prefix from all object string paths
            for f in existing_files_prefixed:
                if f.startswith(object_prefix):
                    existing_files.append(f[len(object_prefix):])
                else:
                    existing_files.append(f)

        files = list(set(files) - set(existing_files))
        if len(files) > 0:
            self.log.info('%s files are going to be synced: %s.', len(files), files)
        else:
            self.log.info('There are no new files to sync. Have a nice day!')

    if files:
        hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

        for file in files:
            # GCS hook builds its own in-memory file, so we have to create
            # and pass the path
            file_object = hook.get_key(file, self.bucket)
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                file_object.download_fileobj(f)
                f.flush()

                dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(self.dest_gcs)
                # There will always be a '/' before file because it is
                # enforced at instantiation time
                dest_gcs_object = dest_gcs_object_prefix + file

                # Sync is sequential and the hook already logs too much,
                # so skip this for now
                # self.log.info(
                #     'Saving file {0} from S3 bucket {1} in GCS bucket {2}'
                #     ' as object {3}'.format(file, self.bucket,
                #                             dest_gcs_bucket,
                #                             dest_gcs_object))

                gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name)

        self.log.info("All done, uploaded %d files to Google Cloud Storage", len(files))
    else:
        self.log.info('In sync, no files needed to be uploaded to Google Cloud Storage')

    return files
def poke(self, context):
    hook = GoogleCloudStorageHook()
    return self.is_bucket_updated(len(hook.list(self.bucket, prefix=self.prefix)))
def build_export_dag(dag_id,
                     web3_provider_uri,
                     web3_provider_uri_archival,
                     output_bucket,
                     start_date,
                     chain='ethereum',
                     notification_emails=None,
                     schedule_interval='0 0 * * *',
                     export_max_workers=10,
                     export_batch_size=10,
                     **kwargs):
    default_dag_args = {
        "depends_on_past": False,
        "start_date": start_date,
        "email_on_failure": True,
        "email_on_retry": True,
        "retries": 5,
        "retry_delay": timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [email.strip() for email in notification_emails.split(',')]

    export_daofork_traces_option = kwargs.get('export_daofork_traces_option')
    export_genesis_traces_option = kwargs.get('export_genesis_traces_option')
    export_blocks_and_transactions_toggle = kwargs.get('export_blocks_and_transactions_toggle')
    export_receipts_and_logs_toggle = kwargs.get('export_receipts_and_logs_toggle')
    export_contracts_toggle = kwargs.get('export_contracts_toggle')
    export_tokens_toggle = kwargs.get('export_tokens_toggle')
    extract_token_transfers_toggle = kwargs.get('extract_token_transfers_toggle')
    export_traces_toggle = kwargs.get('export_traces_toggle')

    dag = DAG(
        dag_id,
        # Daily at 1am
        schedule_interval=schedule_interval,
        default_args=default_dag_args,
    )

    if output_bucket is None:
        raise ValueError("You must set OUTPUT_BUCKET environment variable")

    # Export
    def export_path(directory, date):
        return "export/{directory}/block_date={block_date}/".format(
            directory=directory, block_date=date.strftime("%Y-%m-%d"))

    cloud_storage_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id="google_cloud_default")

    def copy_to_export_path(file_path, export_path):
        logging.info('Calling copy_to_export_path({}, {})'.format(file_path, export_path))
        filename = os.path.basename(file_path)
        upload_to_gcs(gcs_hook=cloud_storage_hook,
                      bucket=output_bucket,
                      object=export_path + filename,
                      filename=file_path)

    def copy_from_export_path(export_path, file_path):
        logging.info('Calling copy_from_export_path({}, {})'.format(export_path, file_path))
        filename = os.path.basename(file_path)
        cloud_storage_hook.download(bucket=output_bucket,
                                    object=export_path + filename,
                                    filename=file_path)

    def get_block_range(tempdir, date):
        logging.info('Calling get_block_range_for_date({}, {}, ...)'.format(web3_provider_uri, date))
        get_block_range_for_date.callback(provider_uri=web3_provider_uri,
                                          date=date,
                                          output=os.path.join(tempdir, "blocks_meta.txt"))

        with open(os.path.join(tempdir, "blocks_meta.txt")) as block_range_file:
            block_range = block_range_file.read()
            start_block, end_block = block_range.split(",")

        return int(start_block), int(end_block)

    def export_blocks_and_transactions_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            start_block, end_block = get_block_range(tempdir, execution_date)

            logging.info('Calling export_blocks_and_transactions({}, {}, {}, {}, {}, ...)'.format(
                start_block, end_block, export_batch_size, web3_provider_uri, export_max_workers))
            export_blocks_and_transactions.callback(
                start_block=start_block,
                end_block=end_block,
                batch_size=export_batch_size,
                provider_uri=web3_provider_uri,
                max_workers=export_max_workers,
                blocks_output=os.path.join(tempdir, "blocks.csv"),
                transactions_output=os.path.join(tempdir, "transactions.csv"),
            )

            copy_to_export_path(os.path.join(tempdir, "blocks_meta.txt"),
                                export_path("blocks_meta", execution_date))
            copy_to_export_path(os.path.join(tempdir, "blocks.csv"),
                                export_path("blocks", execution_date))
            copy_to_export_path(os.path.join(tempdir, "transactions.csv"),
                                export_path("transactions", execution_date))

    def export_receipts_and_logs_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("transactions", execution_date),
                                  os.path.join(tempdir, "transactions.csv"))

            logging.info('Calling extract_csv_column(...)')
            extract_csv_column.callback(
                input=os.path.join(tempdir, "transactions.csv"),
                output=os.path.join(tempdir, "transaction_hashes.txt"),
                column="hash",
            )

            logging.info('Calling export_receipts_and_logs({}, ..., {}, {}, ...)'.format(
                export_batch_size, web3_provider_uri, export_max_workers))
            export_receipts_and_logs.callback(
                batch_size=export_batch_size,
                transaction_hashes=os.path.join(tempdir, "transaction_hashes.txt"),
                provider_uri=web3_provider_uri,
                max_workers=export_max_workers,
                receipts_output=os.path.join(tempdir, "receipts.csv"),
                logs_output=os.path.join(tempdir, "logs.json"),
            )

            copy_to_export_path(os.path.join(tempdir, "receipts.csv"),
                                export_path("receipts", execution_date))
            copy_to_export_path(os.path.join(tempdir, "logs.json"),
                                export_path("logs", execution_date))

    def export_contracts_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("receipts", execution_date),
                                  os.path.join(tempdir, "receipts.csv"))

            logging.info('Calling extract_csv_column(...)')
            extract_csv_column.callback(
                input=os.path.join(tempdir, "receipts.csv"),
                output=os.path.join(tempdir, "contract_addresses.txt"),
                column="contract_address",
            )

            logging.info('Calling export_contracts({}, ..., {}, {})'.format(
                export_batch_size, export_max_workers, web3_provider_uri))
            export_contracts.callback(
                batch_size=export_batch_size,
                contract_addresses=os.path.join(tempdir, "contract_addresses.txt"),
                output=os.path.join(tempdir, "contracts.json"),
                max_workers=export_max_workers,
                provider_uri=web3_provider_uri,
            )

            copy_to_export_path(os.path.join(tempdir, "contracts.json"),
                                export_path("contracts", execution_date))

    def export_tokens_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("contracts", execution_date),
                                  os.path.join(tempdir, "contracts.json"))

            logging.info('Calling filter_items(...)')
            filter_items.callback(
                input=os.path.join(tempdir, "contracts.json"),
                output=os.path.join(tempdir, "token_contracts.json"),
                predicate="item['is_erc20'] or item['is_erc721']",
            )

            logging.info('Calling extract_field(...)')
            extract_field.callback(
                input=os.path.join(tempdir, "token_contracts.json"),
                output=os.path.join(tempdir, "token_addresses.txt"),
                field="address",
            )

            logging.info('Calling export_tokens(..., {}, {})'.format(
                export_max_workers, web3_provider_uri))
            export_tokens.callback(
                token_addresses=os.path.join(tempdir, "token_addresses.txt"),
                output=os.path.join(tempdir, "tokens.csv"),
                max_workers=export_max_workers,
                provider_uri=web3_provider_uri,
            )

            copy_to_export_path(os.path.join(tempdir, "tokens.csv"),
                                export_path("tokens", execution_date))

    def extract_token_transfers_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("logs", execution_date),
                                  os.path.join(tempdir, "logs.json"))

            logging.info('Calling extract_token_transfers(..., {}, ..., {})'.format(
                export_batch_size, export_max_workers))
            extract_token_transfers.callback(
                logs=os.path.join(tempdir, "logs.json"),
                batch_size=export_batch_size,
                output=os.path.join(tempdir, "token_transfers.csv"),
                max_workers=export_max_workers,
            )

            copy_to_export_path(
                os.path.join(tempdir, "token_transfers.csv"),
                export_path("token_transfers", execution_date),
            )

    def export_traces_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            start_block, end_block = get_block_range(tempdir, execution_date)

            logging.info('Calling export_traces({}, {}, {}, ...,{}, {}, {}, {})'.format(
                start_block, end_block, export_batch_size, export_max_workers,
                web3_provider_uri_archival, export_genesis_traces_option,
                export_daofork_traces_option))
            export_traces.callback(
                start_block=start_block,
                end_block=end_block,
                batch_size=export_batch_size,
                output=os.path.join(tempdir, "traces.csv"),
                max_workers=export_max_workers,
                provider_uri=web3_provider_uri_archival,
                genesis_traces=export_genesis_traces_option,
                daofork_traces=export_daofork_traces_option,
            )

            copy_to_export_path(os.path.join(tempdir, "traces.csv"),
                                export_path("traces", execution_date))

    def add_export_task(toggle, task_id, python_callable, dependencies=None):
        if toggle:
            operator = python_operator.PythonOperator(
                task_id=task_id,
                python_callable=python_callable,
                provide_context=True,
                execution_timeout=timedelta(hours=15),
                dag=dag,
            )
            if dependencies is not None and len(dependencies) > 0:
                for dependency in dependencies:
                    if dependency is not None:
                        dependency >> operator
            return operator
        else:
            return None

    MEGABYTE = 1024 * 1024

    # Helps avoid OverflowError: https://stackoverflow.com/questions/47610283/cant-upload-2gb-to-google-cloud-storage
    # https://developers.google.com/api-client-library/python/guide/media_upload#resumable-media-chunked-upload
    def upload_to_gcs(gcs_hook, bucket, object, filename, mime_type='application/octet-stream'):
        service = gcs_hook.get_conn()

        if os.path.getsize(filename) > 10 * MEGABYTE:
            media = MediaFileUpload(filename, mime_type, resumable=True)

            try:
                request = service.objects().insert(bucket=bucket, name=object, media_body=media)
                response = None
                while response is None:
                    status, response = request.next_chunk()
                    if status:
                        logging.info("Uploaded %d%%." % int(status.progress() * 100))
                return True
            except errors.HttpError as ex:
                if ex.resp['status'] == '404':
                    return False
                raise
        else:
            media = MediaFileUpload(filename, mime_type)

            try:
                service.objects().insert(bucket=bucket, name=object, media_body=media).execute()
                return True
            except errors.HttpError as ex:
                if ex.resp['status'] == '404':
                    return False
                raise

    # Operators
    export_blocks_and_transactions_operator = add_export_task(
        export_blocks_and_transactions_toggle,
        "export_blocks_and_transactions",
        export_blocks_and_transactions_command,
    )

    export_receipts_and_logs_operator = add_export_task(
        export_receipts_and_logs_toggle,
        "export_receipts_and_logs",
        export_receipts_and_logs_command,
        dependencies=[export_blocks_and_transactions_operator],
    )

    export_contracts_operator = add_export_task(
        export_contracts_toggle,
        "export_contracts",
        export_contracts_command,
        dependencies=[export_receipts_and_logs_operator],
    )

    export_tokens_operator = add_export_task(
        export_tokens_toggle,
        "export_tokens",
        export_tokens_command,
        dependencies=[export_contracts_operator],
    )

    extract_token_transfers_operator = add_export_task(
        extract_token_transfers_toggle,
        "extract_token_transfers",
        extract_token_transfers_command,
        dependencies=[export_receipts_and_logs_operator],
    )

    export_traces_operator = add_export_task(export_traces_toggle,
                                             "export_traces",
                                             export_traces_command)

    def get_boolean_env_variable(env_variable_name, default=True):
        raw_env = os.environ.get(env_variable_name)
        if raw_env is None or len(raw_env) == 0:
            return default
        else:
            return raw_env.lower() in ["true", "yes"]

    return dag
def execute(self, context):
    logging.info('Executing download: %s, %s, %s', self.bucket, self.object, self.filename)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    print(hook.download(self.bucket, self.object, self.filename))
def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)

    log_message = 'Executing copy of gs://{0}/{1} to gs://{2}/{3}'

    if self.wildcard in self.source_object:
        prefix, delimiter = self.source_object.split(self.wildcard, 1)
        objects = hook.list(self.source_bucket, prefix=prefix, delimiter=delimiter)

        for source_object in objects:
            if self.last_modified_time is not None:
                # Check to see if object was modified after last_modified_time
                if hook.is_updated_after(self.source_bucket, source_object,
                                         self.last_modified_time):
                    pass
                else:
                    continue

            if self.destination_object is None:
                destination_object = source_object
            else:
                destination_object = source_object.replace(prefix,
                                                           self.destination_object, 1)

            self.log.info(log_message.format(self.source_bucket, source_object,
                                             self.destination_bucket, destination_object))

            hook.rewrite(self.source_bucket, source_object,
                         self.destination_bucket, destination_object)
            if self.move_object:
                hook.delete(self.source_bucket, source_object)
    else:
        if self.last_modified_time is not None:
            if hook.is_updated_after(self.source_bucket, self.source_object,
                                     self.last_modified_time):
                pass
            else:
                return

        self.log.info(log_message.format(self.source_bucket, self.source_object,
                                         self.destination_bucket or self.source_bucket,
                                         self.destination_object or self.source_object))
        hook.rewrite(self.source_bucket, self.source_object,
                     self.destination_bucket, self.destination_object)
        if self.move_object:
            hook.delete(self.source_bucket, self.source_object)
"email_on_failure": True, "email_on_retry": True, "retries": 0, "retry_delay": timedelta(minutes=5), "dagrun_timeout": timedelta(hours=4), } # use a less than desirable method of generating the service account name IS_DEV = environ.get("DEPLOY_ENVIRONMENT") != "prod" ENVIRONMENT = "dev" if IS_DEV else "prod" PRIO_ADMIN_CONN = "google_cloud_prio_admin" PRIO_A_CONN = "google_cloud_prio_a" PRIO_B_CONN = "google_cloud_prio_b" PROJECT_ADMIN = GoogleCloudStorageHook(PRIO_ADMIN_CONN).project_id PROJECT_A = GoogleCloudStorageHook(PRIO_A_CONN).project_id PROJECT_B = GoogleCloudStorageHook(PRIO_B_CONN).project_id SERVICE_ACCOUNT_ADMIN = "prio-admin-runner@{}.iam.gserviceaccount.com".format( PROJECT_ADMIN ) SERVICE_ACCOUNT_A = "prio-runner-{}-a@{}.iam.gserviceaccount.com".format( ENVIRONMENT, PROJECT_A ) SERVICE_ACCOUNT_B = "prio-runner-{}-b@{}.iam.gserviceaccount.com".format( ENVIRONMENT, PROJECT_B ) BUCKET_PRIVATE_A = "moz-fx-prio-{}-a-private".format(ENVIRONMENT) BUCKET_PRIVATE_B = "moz-fx-prio-{}-b-private".format(ENVIRONMENT)
def build_export_dag(dag_id,
                     provider_uris,
                     provider_uris_archival,
                     output_bucket,
                     cloud_provider,
                     export_start_date,
                     notification_emails=None,
                     export_schedule_interval='0 0 * * *',
                     export_max_workers=10,
                     export_batch_size=10,
                     export_max_active_runs=None,
                     **kwargs):
    default_dag_args = {
        "depends_on_past": False,
        "start_date": export_start_date,
        "email_on_failure": True,
        "email_on_retry": True,
        "retries": 5,
        "retry_delay": timedelta(minutes=5)
    }

    if notification_emails and len(notification_emails) > 0:
        default_dag_args['email'] = [email.strip() for email in notification_emails.split(',')]

    export_daofork_traces_option = kwargs.get('export_daofork_traces_option')
    export_genesis_traces_option = kwargs.get('export_genesis_traces_option')
    export_blocks_and_transactions_toggle = kwargs.get('export_blocks_and_transactions_toggle')
    export_receipts_and_logs_toggle = kwargs.get('export_receipts_and_logs_toggle')
    extract_contracts_toggle = kwargs.get('extract_contracts_toggle')
    extract_tokens_toggle = kwargs.get('extract_tokens_toggle')
    extract_token_transfers_toggle = kwargs.get('extract_token_transfers_toggle')
    export_traces_toggle = kwargs.get('export_traces_toggle')

    if export_max_active_runs is None:
        export_max_active_runs = configuration.conf.getint('core', 'max_active_runs_per_dag')

    dag = DAG(dag_id,
              schedule_interval=export_schedule_interval,
              default_args=default_dag_args,
              max_active_runs=export_max_active_runs)

    if cloud_provider == 'aws':
        from airflow.hooks.S3_hook import S3Hook
        cloud_storage_hook = S3Hook(aws_conn_id="aws_default")
    else:
        from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
        cloud_storage_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id="google_cloud_default")

    # Export
    def export_path(directory, date):
        return "export/{directory}/block_date={block_date}/".format(
            directory=directory, block_date=date.strftime("%Y-%m-%d"))

    def copy_to_export_path(file_path, export_path):
        logging.info('Calling copy_to_export_path({}, {})'.format(file_path, export_path))
        filename = os.path.basename(file_path)

        if cloud_provider == 'aws':
            cloud_storage_hook.load_file(filename=file_path,
                                         bucket_name=output_bucket,
                                         key=export_path + filename,
                                         replace=True,
                                         encrypt=False)
        else:
            upload_to_gcs(gcs_hook=cloud_storage_hook,
                          bucket=output_bucket,
                          object=export_path + filename,
                          filename=file_path)

    def copy_from_export_path(export_path, file_path):
        logging.info('Calling copy_from_export_path({}, {})'.format(export_path, file_path))
        filename = os.path.basename(file_path)
        if cloud_provider == 'aws':
            # boto3.s3.Object
            s3_object = cloud_storage_hook.get_key(bucket_name=output_bucket,
                                                   key=export_path + filename)
            s3_object.download_file(file_path)
        else:
            download_from_gcs(bucket=output_bucket,
                              object=export_path + filename,
                              filename=file_path)

    def get_block_range(tempdir, date, provider_uri):
        logging.info('Calling get_block_range_for_date({}, {}, ...)'.format(provider_uri, date))
        get_block_range_for_date.callback(provider_uri=provider_uri,
                                          date=date,
                                          output=os.path.join(tempdir, "blocks_meta.txt"))

        with open(os.path.join(tempdir, "blocks_meta.txt")) as block_range_file:
            block_range = block_range_file.read()
            start_block, end_block = block_range.split(",")

        return int(start_block), int(end_block)

    def export_blocks_and_transactions_command(execution_date, provider_uri, **kwargs):
        with TemporaryDirectory() as tempdir:
            start_block, end_block = get_block_range(tempdir, execution_date, provider_uri)

            logging.info('Calling export_blocks_and_transactions({}, {}, {}, {}, {}, ...)'.format(
                start_block, end_block, export_batch_size, provider_uri, export_max_workers))
            export_blocks_and_transactions.callback(
                start_block=start_block,
                end_block=end_block,
                batch_size=export_batch_size,
                provider_uri=provider_uri,
                max_workers=export_max_workers,
                blocks_output=os.path.join(tempdir, "blocks.csv"),
                transactions_output=os.path.join(tempdir, "transactions.csv"),
            )

            copy_to_export_path(os.path.join(tempdir, "blocks_meta.txt"),
                                export_path("blocks_meta", execution_date))
            copy_to_export_path(os.path.join(tempdir, "blocks.csv"),
                                export_path("blocks", execution_date))
            copy_to_export_path(os.path.join(tempdir, "transactions.csv"),
                                export_path("transactions", execution_date))

    def export_receipts_and_logs_command(execution_date, provider_uri, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("transactions", execution_date),
                                  os.path.join(tempdir, "transactions.csv"))

            logging.info('Calling extract_csv_column(...)')
            extract_csv_column.callback(
                input=os.path.join(tempdir, "transactions.csv"),
                output=os.path.join(tempdir, "transaction_hashes.txt"),
                column="hash",
            )

            logging.info('Calling export_receipts_and_logs({}, ..., {}, {}, ...)'.format(
                export_batch_size, provider_uri, export_max_workers))
            export_receipts_and_logs.callback(
                batch_size=export_batch_size,
                transaction_hashes=os.path.join(tempdir, "transaction_hashes.txt"),
                provider_uri=provider_uri,
                max_workers=export_max_workers,
                receipts_output=os.path.join(tempdir, "receipts.csv"),
                logs_output=os.path.join(tempdir, "logs.json"),
            )

            copy_to_export_path(os.path.join(tempdir, "receipts.csv"),
                                export_path("receipts", execution_date))
            copy_to_export_path(os.path.join(tempdir, "logs.json"),
                                export_path("logs", execution_date))

    def extract_contracts_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("traces", execution_date),
                                  os.path.join(tempdir, "traces.csv"))

            logging.info('Calling extract_contracts(..., {}, {})'.format(
                export_batch_size, export_max_workers))
            extract_contracts.callback(
                traces=os.path.join(tempdir, "traces.csv"),
                output=os.path.join(tempdir, "contracts.json"),
                batch_size=export_batch_size,
                max_workers=export_max_workers,
            )

            copy_to_export_path(os.path.join(tempdir, "contracts.json"),
                                export_path("contracts", execution_date))

    def extract_tokens_command(execution_date, provider_uri, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("contracts", execution_date),
                                  os.path.join(tempdir, "contracts.json"))

            logging.info('Calling extract_tokens(..., {}, {})'.format(
                export_max_workers, provider_uri))
            extract_tokens.callback(
                contracts=os.path.join(tempdir, "contracts.json"),
                output=os.path.join(tempdir, "tokens.csv"),
                max_workers=export_max_workers,
                provider_uri=provider_uri,
            )

            copy_to_export_path(os.path.join(tempdir, "tokens.csv"),
                                export_path("tokens", execution_date))

    def extract_token_transfers_command(execution_date, **kwargs):
        with TemporaryDirectory() as tempdir:
            copy_from_export_path(export_path("logs", execution_date),
                                  os.path.join(tempdir, "logs.json"))

            logging.info('Calling extract_token_transfers(..., {}, ..., {})'.format(
                export_batch_size, export_max_workers))
            extract_token_transfers.callback(
                logs=os.path.join(tempdir, "logs.json"),
                batch_size=export_batch_size,
                output=os.path.join(tempdir, "token_transfers.csv"),
                max_workers=export_max_workers,
            )

            copy_to_export_path(
                os.path.join(tempdir, "token_transfers.csv"),
                export_path("token_transfers", execution_date),
            )

    def export_traces_command(execution_date, provider_uri, **kwargs):
        with TemporaryDirectory() as tempdir:
            start_block, end_block = get_block_range(tempdir, execution_date, provider_uri)

            logging.info('Calling export_traces({}, {}, {}, ...,{}, {}, {}, {})'.format(
                start_block, end_block, export_batch_size, export_max_workers,
                provider_uri, export_genesis_traces_option, export_daofork_traces_option))
            export_traces.callback(
                start_block=start_block,
                end_block=end_block,
                batch_size=export_batch_size,
                output=os.path.join(tempdir, "traces.csv"),
                max_workers=export_max_workers,
                provider_uri=provider_uri,
                genesis_traces=export_genesis_traces_option,
                daofork_traces=export_daofork_traces_option,
            )

            copy_to_export_path(os.path.join(tempdir, "traces.csv"),
                                export_path("traces", execution_date))

    def add_export_task(toggle, task_id, python_callable, dependencies=None):
        if toggle:
            operator = python_operator.PythonOperator(
                task_id=task_id,
                python_callable=python_callable,
                provide_context=True,
                execution_timeout=timedelta(hours=15),
                dag=dag,
            )
            if dependencies is not None and len(dependencies) > 0:
                for dependency in dependencies:
                    if dependency is not None:
                        dependency >> operator
            return operator
        else:
            return None

    # Operators
    export_blocks_and_transactions_operator = add_export_task(
        export_blocks_and_transactions_toggle,
        "export_blocks_and_transactions",
        add_provider_uri_fallback_loop(export_blocks_and_transactions_command, provider_uris),
    )

    export_receipts_and_logs_operator = add_export_task(
        export_receipts_and_logs_toggle,
        "export_receipts_and_logs",
        add_provider_uri_fallback_loop(export_receipts_and_logs_command, provider_uris),
        dependencies=[export_blocks_and_transactions_operator],
    )

    extract_token_transfers_operator = add_export_task(
        extract_token_transfers_toggle,
        "extract_token_transfers",
        extract_token_transfers_command,
        dependencies=[export_receipts_and_logs_operator],
    )

    export_traces_operator = add_export_task(
        export_traces_toggle,
        "export_traces",
        add_provider_uri_fallback_loop(export_traces_command, provider_uris_archival))

    extract_contracts_operator = add_export_task(
        extract_contracts_toggle,
        "extract_contracts",
        extract_contracts_command,
        dependencies=[export_traces_operator],
    )

    extract_tokens_operator = add_export_task(
        extract_tokens_toggle,
        "extract_tokens",
        add_provider_uri_fallback_loop(extract_tokens_command, provider_uris),
        dependencies=[extract_contracts_operator],
    )

    return dag
def poke(self, context):
    self.log.info('Sensor checks existence of : %s, %s', self.bucket, self.object)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_conn_id,
        delegate_to=self.delegate_to)
    return hook.exists(self.bucket, self.object)
class GCSLog(object):
    """
    Utility class for reading and writing logs in GCS. Requires
    airflow[gcp_api] and setting the REMOTE_BASE_LOG_FOLDER and
    REMOTE_LOG_CONN_ID configuration options in airflow.cfg.
    """
    def __init__(self):
        """
        Attempt to create hook with airflow[gcp_api].
        """
        remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
        self.hook = None

        try:
            from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
            self.hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=remote_conn_id)
        except:
            logging.error(
                'Could not create a GoogleCloudStorageHook with connection id '
                '"{}". Please make sure that airflow[gcp_api] is installed '
                'and the GCS connection exists.'.format(remote_conn_id))

    def read(self, remote_log_location, return_error=False):
        """
        Returns the log found at the remote_log_location.

        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param return_error: if True, returns a string error message if an
            error occurs. Otherwise returns '' when an error occurs.
        :type return_error: bool
        """
        if self.hook:
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                return self.hook.download(bkt, blob).decode()
            except:
                pass

        # raise/return error if we get here
        err = 'Could not read logs from {}'.format(remote_log_location)
        logging.error(err)
        return err if return_error else ''

    def write(self, log, remote_log_location, append=True):
        """
        Writes the log to the remote_log_location. Fails silently if no hook
        was created.

        :param log: the log to write to the remote_log_location
        :type log: string
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool
        """
        if self.hook:
            if append:
                old_log = self.read(remote_log_location)
                log = old_log + '\n' + log

            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                from tempfile import NamedTemporaryFile
                with NamedTemporaryFile(mode='w+') as tmpfile:
                    tmpfile.write(log)
                    # Force the file to be flushed, since we're doing the
                    # upload from within the file context (it hasn't been
                    # closed).
                    tmpfile.flush()
                    self.hook.upload(bkt, blob, tmpfile.name)
            except:
                # raise/return error if we get here
                logging.error('Could not write logs to {}'.format(remote_log_location))

    def parse_gcs_url(self, gsurl):
        """
        Given a Google Cloud Storage URL (gs://<bucket>/<blob>), returns a
        tuple containing the corresponding bucket and blob.
        """
        # Python 3
        try:
            from urllib.parse import urlparse
        # Python 2
        except ImportError:
            from urlparse import urlparse

        parsed_url = urlparse(gsurl)
        if not parsed_url.netloc:
            raise AirflowException('Please provide a bucket name')
        else:
            bucket = parsed_url.netloc
            blob = parsed_url.path.strip('/')
            return (bucket, blob)
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and \
            self.schema_object and \
            self.source_format != 'DATASTORE_BACKUP':
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(
            gcs_hook.download(self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = [
        'gs://{}/{}'.format(self.bucket, source_object)
        for source_object in self.source_objects
    ]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    if self.external_table:
        cursor.create_external_table(
            external_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            compression=self.compression,
            skip_leading_rows=self.skip_leading_rows,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            src_fmt_configs=self.src_fmt_configs)
    else:
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            schema_update_options=self.schema_update_options,
            src_fmt_configs=self.src_fmt_configs,
            time_partitioning=self.time_partitioning)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, self.destination_project_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        self.log.info('Loaded BQ data with max %s.%s=%s',
                      self.destination_project_dataset_table, self.max_id_key, max_id)
        return max_id
class GoogleDisplayVideo360SDFToBigQueryOperator(GoogleMarketingPlatformBaseOperator):
    """Make a request to SDF API and upload the data to BQ."""

    DEFAULT_SDF_TABLE_NAMES = {
        'LINE_ITEM': 'SDFLineItem',
        'AD_GROUP': 'SDFAdGroup',
        'AD': 'SDFAd',
        'INSERTION_ORDER': 'SDFInsertionOrder',
        'CAMPAIGN': 'SDFCampaign'
    }

    SDF_API_RESPONSE_KEYS = {
        'LINE_ITEM': 'lineItems',
        'AD_GROUP': 'adGroups',
        'AD': 'ads',
        'INSERTION_ORDER': 'insertionOrders',
        'CAMPAIGN': 'campaigns'
    }

    def __init__(self,
                 gcp_conn_id='google_cloud_default',
                 gcs_bucket=None,
                 schema=None,
                 bq_dataset=None,
                 write_disposition=None,
                 cloud_project_id=None,
                 file_types=None,
                 filter_ids=None,
                 api_version=None,
                 filter_type=None,
                 table_names=DEFAULT_SDF_TABLE_NAMES,
                 sdf_api_response_keys=SDF_API_RESPONSE_KEYS,
                 *args,
                 **kwargs):
        super(GoogleDisplayVideo360SDFToBigQueryOperator, self).__init__(*args, **kwargs)
        self.gcp_conn_id = gcp_conn_id
        self.service = None
        self.hook = None
        self.bq_hook = None
        self.gcs_hook = None
        self.gcs_bucket = gcs_bucket
        self.schema = schema
        self.bq_dataset = bq_dataset
        self.write_disposition = write_disposition
        self.cloud_project_id = cloud_project_id
        self.file_types = file_types
        self.filter_ids = filter_ids
        self.api_version = api_version
        self.filter_type = filter_type
        self.table_names = table_names
        self.sdf_api_response_keys = sdf_api_response_keys

    def execute(self, context):
        if self.hook is None:
            self.hook = GoogleDisplayVideo360Hook(gcp_conn_id=self.gcp_conn_id)
        if self.bq_hook is None:
            self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id)
        if self.gcs_hook is None:
            self.gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcp_conn_id)

        request_body = {'fileTypes': self.file_types,
                        'filterType': self.filter_type,
                        'filterIds': self.filter_ids,
                        'version': self.api_version}

        logger.info('Request body: %s ' % request_body)
        request = self.hook.get_service().sdf().download(body=request_body)
        response = request.execute()

        for file_type in self.file_types:
            temp_file = None
            try:
                logger.info('Uploading SDF to GCS')
                temp_file = tempfile.NamedTemporaryFile(delete=False)
                response_key = self.sdf_api_response_keys.get(file_type)
                temp_file.write(response[response_key].encode('utf-8'))
                temp_file.close()
                filename = '%d_%s_%s_%s.json' % (time.time() * 1e+9,
                                                 randint(1, 1000000),
                                                 response_key,
                                                 'sdf')
                self.gcs_hook.upload(self.gcs_bucket, filename, temp_file.name)
                logger.info('SDF upload to GCS complete')
            finally:
                if temp_file:
                    temp_file.close()
                    os.unlink(temp_file.name)

            sdf_file = 'gs://%s/%s' % (self.gcs_bucket, filename)

            bq_table = self.table_names.get(file_type)
            bq_table = '%s.%s' % (self.bq_dataset, bq_table)

            schema = SDF_VERSIONED_SCHEMA_TYPES.get(self.api_version).get(file_type)
            try:
                bq_base_cursor = self.bq_hook.get_conn().cursor()
                logger.info('Uploading SDF to BigQuery')
                bq_base_cursor.run_load(
                    destination_project_dataset_table=bq_table,
                    schema_fields=schema,
                    source_uris=[sdf_file],
                    source_format='CSV',
                    skip_leading_rows=1,
                    write_disposition=self.write_disposition)
            finally:
                logger.info('Deleting SDF from GCS')
                self.gcs_hook.delete(self.gcs_bucket, filename)
def execute(self, context):
    ga_conn = GoogleAnalyticsHook(self.google_analytics_conn_id)
    gcs_conn = GoogleCloudStorageHook(self.gcs_conn_id)
    try:
        since_formatted = datetime.strptime(self.since, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
    except ValueError:
        since_formatted = str(self.since)
    try:
        until_formatted = datetime.strptime(self.until, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
    except ValueError:
        until_formatted = str(self.until)
    report = ga_conn.get_analytics_report(self.view_id,
                                          since_formatted,
                                          until_formatted,
                                          self.sampling_level,
                                          self.dimensions,
                                          self.metrics,
                                          self.page_size,
                                          self.include_empty_rows)

    columnHeader = report.get('columnHeader', {})
    # Right now all dimensions are hardcoded to varchar(255); a type map will be needed
    # if any non-varchar dimensions are used in the future.
    # Unfortunately the API does not send back types for Dimensions like it does for Metrics (yet..)
    dimensionHeaders = [
        {'name': header.replace('ga:', ''), 'type': 'varchar(255)'}
        for header in columnHeader.get('dimensions', [])
    ]
    metricHeaders = [
        {'name': entry.get('name').replace('ga:', ''),
         'type': self.metricMap.get(entry.get('type'), 'varchar(255)')}
        for entry in columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
    ]

    with NamedTemporaryFile("w") as ga_file:
        rows = report.get('data', {}).get('rows', [])

        for row_counter, row in enumerate(rows):
            root_data_obj = {}
            dimensions = row.get('dimensions', [])
            metrics = row.get('metrics', [])

            for index, dimension in enumerate(dimensions):
                header = dimensionHeaders[index].get('name').lower()
                root_data_obj[header] = dimension

            for metric in metrics:
                data = {}
                data.update(root_data_obj)

                for index, value in enumerate(metric.get('values', [])):
                    header = metricHeaders[index].get('name').lower()
                    data[header] = value

                data['viewid'] = self.view_id
                data['timestamp'] = self.since

                # only omit the newline after the very last row
                ga_file.write(json.dumps(data) +
                              ('' if row_counter == len(rows) - 1 else '\n'))

        # flush buffered rows to disk before handing the path to the GCS hook
        ga_file.flush()
        gcs_conn.upload(self.gcs_bucket, self.gcs_objname, ga_file.name)
class GoogleDisplayVideo360DownloadReportOperator(GoogleMarketingPlatformBaseOperator): """Downloads a Display & Video 360 report into Google Cloud Storage. Attributes: report_url: The Google Cloud Storage url where the latest report is stored. (templated) destination_bucket: The destination Google cloud storage bucket where the report should be written to. (templated) destination_object: The destination name of the object in the destination Google cloud storage bucket. (templated) If the destination points to an existing folder, the report will be written under the specified folder. gcp_conn_id: The connection ID to use when fetching connection info. delegate_to: The account to impersonate, if any. XComs: destination_bucket: The Google cloud storage bucket the report was written to. destination_object: The Google cloud storage URI for the report. """ template_fields = ['report_url', 'destination_bucket', 'destination_object'] def __init__(self, report_url, destination_bucket, destination_object=None, chunk_size=5 * 1024 * 1024, gcp_conn_id='google_cloud_default', delegate_to=None, *args, **kwargs): super(GoogleDisplayVideo360DownloadReportOperator, self).__init__(*args, **kwargs) self.report_url = report_url self.destination_bucket = destination_bucket self.destination_object = destination_object self.chunk_size = chunk_size self.gcp_conn_id = gcp_conn_id self.delegate_to = delegate_to self.hook = None @staticmethod def _download_report(source_url, destination_file, chunk_size): response = requests.head(source_url) content_length = int(response.headers['Content-Length']) start_byte = 0 while start_byte < content_length: end_byte = start_byte + chunk_size - 1 if end_byte >= content_length: end_byte = content_length - 1 headers = {'Range': 'bytes=%s-%s' % (start_byte, end_byte)} response = requests.get(source_url, stream=True, headers=headers) chunk = response.raw.read() destination_file.write(chunk) start_byte = end_byte + 1 destination_file.close() @staticmethod def _get_destination_uri(destination_object, report_url): report_file_name = urlparse(report_url).path.split('/')[2] if destination_object is None: return report_file_name if destination_object.endswith('/'): return destination_object + report_file_name return destination_object def execute(self, context): if self.hook is None: self.hook = GoogleCloudStorageHook( google_cloud_storage_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to) temp_file = tempfile.NamedTemporaryFile(delete=False) try: # TODO(efolgar): Directly stream to storage instead of temp file self._download_report(self.report_url, temp_file, self.chunk_size) destination_object_name = self._get_destination_uri( self.destination_object, self.report_url) self.hook.upload( bucket=self.destination_bucket, object=destination_object_name, filename=temp_file.name, multipart=True) context['task_instance'].xcom_push( 'destination_bucket', self.destination_bucket) context['task_instance'].xcom_push( 'destination_object', destination_object_name) finally: temp_file.close() os.unlink(temp_file.name)
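Because report_url is a template field, it is typically pulled from an upstream task's XCom. A usage sketch, assuming an upstream task id of get_latest_report and an XCom key of report_url (both placeholders), with a placeholder destination bucket:

# Hypothetical wiring for GoogleDisplayVideo360DownloadReportOperator.
download_report = GoogleDisplayVideo360DownloadReportOperator(
    task_id='download_dv360_report',
    report_url="{{ task_instance.xcom_pull('get_latest_report', key='report_url') }}",
    destination_bucket='example-reports-bucket',  # placeholder
    destination_object='dv360/',                  # trailing '/' keeps the report file name
    gcp_conn_id='google_cloud_default',
    dag=dag)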
def poke(self, context):
    self.log.info('Sensor checks existence of: %s, %s', self.bucket, self.object)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_conn_id,
        delegate_to=self.delegate_to)
    return hook.is_updated_after(self.bucket, self.object, self.ts_func(context))
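If this poke() belongs to a GoogleCloudStorageObjectUpdatedSensor-style sensor (a guess based on is_updated_after and ts_func), a usage sketch could look like this; the import path, bucket, and object name are assumptions.

# Hypothetical "object updated" sensor usage; ts_func defaults to the run's execution_date.
from airflow.contrib.sensors.gcs_sensor import GoogleCloudStorageObjectUpdatedSensor

wait_for_refresh = GoogleCloudStorageObjectUpdatedSensor(
    task_id='wait_for_refresh',
    bucket='example-bucket',            # placeholder
    object='exports/latest/data.csv',   # placeholder
    google_cloud_conn_id='google_cloud_default',
    poke_interval=300,
    dag=dag)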
def build_export_dag(dag_id, provider_uris, output_bucket, export_start_date, notification_emails=None, export_schedule_interval='0 0 * * *', export_max_workers=10, export_batch_size=10, export_max_active_runs=None, provider_uris_shuffle=False, **kwargs): default_dag_args = { "depends_on_past": False, "start_date": export_start_date, "email_on_failure": True, "email_on_retry": True, "retries": 5, "retry_delay": timedelta(minutes=5) } if notification_emails and len(notification_emails) > 0: default_dag_args['email'] = [ email.strip() for email in notification_emails.split(',') ] if export_max_active_runs is None: export_max_active_runs = configuration.conf.getint( 'core', 'max_active_runs_per_dag') dag = DAG(dag_id, schedule_interval=export_schedule_interval, default_args=default_dag_args, max_active_runs=export_max_active_runs) from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook cloud_storage_hook = GoogleCloudStorageHook( google_cloud_storage_conn_id="google_cloud_default") # Export def export_path(directory, date): return "export/{directory}/block_date={block_date}/".format( directory=directory, block_date=date.strftime("%Y-%m-%d")) def copy_to_export_path(file_path, export_path): logging.info('Calling copy_to_export_path({}, {})'.format( file_path, export_path)) filename = os.path.basename(file_path) upload_to_gcs(gcs_hook=cloud_storage_hook, bucket=output_bucket, object=export_path + filename, filename=file_path) def copy_from_export_path(export_path, file_path): logging.info('Calling copy_from_export_path({}, {})'.format( export_path, file_path)) filename = os.path.basename(file_path) download_from_gcs(bucket=output_bucket, object=export_path + filename, filename=file_path) def get_block_range(tempdir, date, provider_uri): logging.info('Calling get_block_range_for_date({}, {}, ...)'.format( provider_uri, date)) get_block_range_for_date.callback(provider_uri=provider_uri, date=date, output=os.path.join( tempdir, "blocks_meta.txt")) with open(os.path.join(tempdir, "blocks_meta.txt")) as block_range_file: block_range = block_range_file.read() start_block, end_block = block_range.split(",") return int(start_block), int(end_block) def export_blocks_command(execution_date, provider_uri, **kwargs): with TemporaryDirectory() as tempdir: start_block, end_block = get_block_range(tempdir, execution_date, provider_uri) logging.info( 'Calling export_blocks({}, {}, {}, {}, {}, ...)'.format( start_block, end_block, export_batch_size, provider_uri, export_max_workers)) export_blocks.callback( start_block=start_block, end_block=end_block, provider_uri=provider_uri, max_workers=export_max_workers, blocks_output=os.path.join(tempdir, "blocks.json"), transactions_output=os.path.join(tempdir, "transactions.json"), actions_output=os.path.join(tempdir, "actions.json"), ) copy_to_export_path(os.path.join(tempdir, "blocks_meta.txt"), export_path("blocks_meta", execution_date)) copy_to_export_path(os.path.join(tempdir, "blocks.json"), export_path("blocks", execution_date)) copy_to_export_path(os.path.join(tempdir, "transactions.json"), export_path("transactions", execution_date)) copy_to_export_path(os.path.join(tempdir, "actions.json"), export_path("actions", execution_date)) def add_export_task(toggle, task_id, python_callable, dependencies=None): if toggle: operator = python_operator.PythonOperator( task_id=task_id, python_callable=python_callable, provide_context=True, execution_timeout=timedelta(hours=48), dag=dag, ) if dependencies is not None and len(dependencies) > 0: for dependency in 
dependencies: if dependency is not None: dependency >> operator return operator else: return None # Operators export_blocks_operator = add_export_task( True, "export_blocks", add_provider_uri_fallback_loop(export_blocks_command, provider_uris, provider_uris_shuffle), ) return dag
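A sketch of how the DAG factory above might be used from a DAG file; the Variable names, provider URI, and bucket name are placeholders.

# Hypothetical DAG-file usage of build_export_dag.
from datetime import datetime
from airflow.models import Variable

globals()['example_export_dag'] = build_export_dag(
    dag_id='example_export_dag',
    provider_uris=Variable.get('provider_uris', 'https://node.example.com').split(','),
    output_bucket=Variable.get('output_bucket', 'example-export-bucket'),
    export_start_date=datetime(2020, 1, 1),
    export_schedule_interval='0 1 * * *',
    export_max_workers=10,
    export_batch_size=10)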
class GoogleCampaignManagerDownloadReportOperator(BaseOperator): """Downloads a Campaign Manager report into Google Cloud Storage. Attributes: report_id: The DCM report ID with which the report file is associated with. (templated) file_id: The DCM file ID of the report file to download. (templated) destination_bucket: The destination Google cloud storage bucket where the report should be written to. (templated) destination_object: The destination name of the object in the destination Google cloud storage bucket. (templated) If the destination points to an existing folder, the report will be written under the specified folder. gcp_conn_id: The connection ID to use when fetching connection info. delegate_to: The account to impersonate, if any. XComs: destination_bucket: The Google cloud storage bucket the report was written to. destination_object: The Google cloud storage URI for the report. """ template_fields = [ 'report_id', 'file_id', 'destination_bucket', 'destination_object' ] def __init__(self, report_id, file_id, destination_bucket, destination_object=None, gcp_conn_id='google_cloud_default', chunk_size=5 * 1024 * 1024, delegate_to=None, *args, **kwargs): super(GoogleCampaignManagerDownloadReportOperator, self).__init__(*args, **kwargs) self.file_id = file_id self.report_id = report_id self.destination_bucket = destination_bucket self.destination_object = destination_object self.chunk_size = chunk_size self.gcp_conn_id = gcp_conn_id self.delegate_to = delegate_to self.gcs_hook = None self.cm_hook = None def _download_report(self, report_id, file_id, destination_file, chunk_size): file_metadata = self.cm_hook.get_service().files().get( reportId=report_id, fileId=file_id).execute() if file_metadata['status'] != 'REPORT_AVAILABLE': msg = 'File with ID = %s and Report ID = %s not available, status = %s.' % ( file_id, report_id, file_metadata['status']) raise Exception(msg) request = self.cm_hook.get_service().files().get_media( reportId=report_id, fileId=file_id) downloader = http.MediaIoBaseDownload(destination_file, request, chunksize=chunk_size) download_finished = False while not download_finished: _, download_finished = downloader.next_chunk() return file_metadata['fileName'] def _get_destination_uri(self, destination_object, report_file_name): report_file_name = '%s.csv.gz' % report_file_name if destination_object is None: return report_file_name if destination_object.endswith('/'): return destination_object + report_file_name return destination_object def execute(self, context): if self.gcs_hook is None: self.gcs_hook = GoogleCloudStorageHook( google_cloud_storage_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to) if self.cm_hook is None: self.cm_hook = GoogleCampaignManagerHook( gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to) temp_file = tempfile.NamedTemporaryFile(delete=False) try: report_file_name = self._download_report(self.report_id, self.file_id, temp_file, self.chunk_size) destination_object_name = self._get_destination_uri( self.destination_object, report_file_name) self.gcs_hook.upload(bucket=self.destination_bucket, object=destination_object_name, filename=temp_file.name, gzip=True, multipart=True) context['task_instance'].xcom_push('destination_bucket', self.destination_bucket) context['task_instance'].xcom_push('destination_object', destination_object_name) finally: temp_file.close() os.unlink(temp_file.name)
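A usage sketch for the operator above; report_id and file_id would normally come from an upstream task's XCom, and the task id, XCom keys, and bucket name here are placeholders.

# Hypothetical wiring for GoogleCampaignManagerDownloadReportOperator.
download_cm_report = GoogleCampaignManagerDownloadReportOperator(
    task_id='download_cm_report',
    report_id="{{ task_instance.xcom_pull('run_report', key='report_id') }}",
    file_id="{{ task_instance.xcom_pull('run_report', key='file_id') }}",
    destination_bucket='example-reports-bucket',  # placeholder
    destination_object='campaign_manager/',       # trailing '/' keeps the generated file name
    gcp_conn_id='google_cloud_default',
    dag=dag)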
cloud_provider = read_var("cloud_provider", var_prefix="icon_", required=False, cloud_provider="gcp") output_bucket = read_var("output_bucket", var_prefix="icon_", required=True) if cloud_provider == "aws": from airflow.hooks.S3_hook import S3Hook cloud_storage_hook = S3Hook(aws_conn_id="aws_default") if cloud_provider == "gcp": from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook from iconetl_airflow.build_export_dag import upload_to_gcs cloud_storage_hook = GoogleCloudStorageHook( google_cloud_storage_conn_id="google_cloud_default") default_dag_args = { "depends_on_past": False, "start_date": start_date, } dag = DAG( "icon_generate_nightly_reports", schedule_interval="30 1 * * *", default_args=default_dag_args, ) reports_folder = os.path.dirname( os.path.realpath(__file__)) + "/nightly_report_scripts"
class GeotabToBigQueryOperator(BaseOperator):

    @apply_defaults
    def __init__(self,
                 gcs_conn_id,
                 gcs_bucket_name,
                 bq_project_name,
                 bq_dataset_name,
                 bq_table_name,
                 bq_table_schema,
                 update_info_dataset_id,
                 update_info_table_id,
                 geotab_conn_id,
                 geotab_data_type_name,
                 partition_column=None,
                 is_append_mode=True,
                 add_snapshot_time_column=False,
                 selected_column_list=[],
                 fields_preprocessing_map=[],
                 parse_data_field=False,
                 *args, **kwargs):
        super(GeotabToBigQueryOperator, self).__init__(*args, **kwargs)
        self.gcs_conn_id = gcs_conn_id
        self.gcs_bucket_name = gcs_bucket_name
        self.bq_project_name = bq_project_name
        self.bq_dataset_name = bq_dataset_name
        self.bq_table_name = bq_table_name
        self.bq_table_schema = bq_table_schema
        self.update_info_dataset_id = update_info_dataset_id
        self.update_info_table_id = update_info_table_id
        self.geotab_conn_id = geotab_conn_id
        self.geotab_data_type_name = geotab_data_type_name
        self.partition_column = partition_column
        self.is_append_mode = is_append_mode
        self.add_snapshot_time_column = add_snapshot_time_column
        self.selected_column_list = selected_column_list
        self.fields_preprocessing_map = fields_preprocessing_map
        self.parse_data_field = parse_data_field

    def execute(self, context):
        self.log.info("start execute")
        try:
            self.init()
            self.calc_patch_interval()
            self.get_geotab_data()
            self.do_preprocess()
            self.write_to_csv()
            self.send_to_gcs()
            self.push_to_bigquery()
            self.set_last_updated_time()
        except Exception as e:
            self.log.exception(e)
            raise
        finally:
            self.clean_up()

    def init(self):
        self.log.info("init() is started")
        # bucket connection
        self.gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcs_conn_id,
                                               delegate_to=None)
        # bigquery connection
        self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcs_conn_id, use_legacy_sql=False)
        bq_conn = self.bq_hook.get_conn()
        self.bq_cursor = bq_conn.cursor()
        # geotab connection
        self.geotab_hook = GeotabHook(geotab_conn_id=self.geotab_conn_id)
        params = self.geotab_hook.get_connection(self.geotab_conn_id)
        # never log login/password; the schema (database) name is enough for debugging
        self.log.info(f"geotab connection schema: {params.schema}")

    def calc_patch_interval(self):
        self.log.info("calc_patch_interval() is started")
        self.interval_start_at = self.get_bigquery_last_updated_time()
        self.log.info(f"last updated: [{self.interval_start_at}]")
        # add 1 second to interval_start_at for [ )
        self.interval_start_at = self.add_seconds(self.interval_start_at, 1)
        self.interval_end_at = self.calc_inverval_end_time(self.interval_start_at, PATCH_INTERVAL_DAY)
        self.log.info(f"patch interval range: [{self.interval_start_at}, {self.interval_end_at})")

    def get_geotab_data(self):
        self.log.info("get_geotab_data() is started")
        date_params = {
            'fromDate': self.interval_start_at,
            'toDate': self.interval_end_at}
        # use == for string comparison; 'is' only checks object identity
        if self.geotab_data_type_name == 'DeviceStatusInfo':
            self.geotab_json_data = self.geotab_hook.get(type_name=self.geotab_data_type_name)
        else:
            self.geotab_json_data = self.geotab_hook.get(type_name=self.geotab_data_type_name,
                                                         params=date_params)

    def do_preprocess(self):
        self.log.info("do_preprocess() is started")
        self.geotab_df = pd.DataFrame(self.geotab_json_data)
        # refine columns
        if len(self.selected_column_list) > 0:
            self.geotab_df = self.geotab_df[self.selected_column_list]
        # parse json
        for replace_set in self.fields_preprocessing_map:
            if len(replace_set) != 3:
                continue
            self.geotab_df[replace_set[2]] = self.geotab_df[replace_set[0]].map(lambda s: s[replace_set[1]])
        # add snapshotAt column
        if self.add_snapshot_time_column:
            self.geotab_df.insert(loc=0, column='snapshotAt', value=self.interval_end_at)
        #
        if self.parse_data_field:
            self.parse_data_field_for_customdata()

    def parse_data_field_for_customdata(self):
        self.geotab_df['device'] = self.geotab_df['device'].map(lambda s: s['id'])
        self.geotab_df['data'] = self.geotab_df['data'].apply(lambda x: base64.b64decode(x).hex())

        customdata_name = ["Pkt Sequence", "Pkt Type", "Temperature", "Humidity",
                           "PM1.0(1st)", "PM1.0(2nd)", "PM2.5(1st)", "PM2.5(2nd)",
                           "PM10(1st)", "PM10(2nd)", "CO(1st)", "CO(2nd)", "CO2(1st)",
                           "TBD(1st)", "TBD(2nd)"]
        # customdata_unit = ["[-]", "[-]", "[℃]", "[%]", "[mg/㎥]", "[mg/㎥]", "[mg/㎥]", "[mg/㎥]",
        #                    "[mg/㎥]", "[mg/㎥]", "[ppm]", "[ppm]", "[ppm]", "[minute]", "[index]"]
        customdata_byte = [2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1]

        list_ = []
        for index, row in self.geotab_df.iterrows():
            # build a fresh dict per row; reusing one dict would make every
            # appended entry reference the same (last) values
            customdata_set = {}
            idx_to = 0
            for i in range(len(customdata_name)):
                idx_from = idx_to
                idx_to = idx_from + customdata_byte[i] * 2
                customdata_set[customdata_name[i]] = int(row['data'][idx_from:idx_to], 16)
            # apply scaling before the row is stored
            customdata_set["Temperature"] = customdata_set["Temperature"] * 0.1 - 100
            customdata_set["Humidity"] *= 0.1
            customdata_set["CO(1st)"] *= 0.1
            customdata_set["CO(2nd)"] *= 0.1
            list_.append(customdata_set)

        self.geotab_df = pd.merge(self.geotab_df.loc[:, self.geotab_df.columns != 'data'],
                                  pd.DataFrame(list_, columns=customdata_name),
                                  left_index=True, right_index=True)

    def write_to_csv(self):
        self.log.info("write_to_csv() is started")
        self.csv_file_name = self.geotab_data_type_name + '.csv'
        self.geotab_df.to_csv(self.csv_file_name, header=None, index=False)

    def send_to_gcs(self):
        self.log.info("send_to_gcs() is started")
        self.gcs_hook.upload(self.gcs_bucket_name, self.csv_file_name, self.csv_file_name)

    def push_to_bigquery(self):
        self.log.info("push_to_bigquery() is started")
        tp_dictionary = None
        if self.partition_column is not None:
            tp = bigquery.table.TimePartitioning()
            tp.expiration_ms = None
            tp.field = self.partition_column
            tp_dictionary = tp.to_api_repr()

        # check table existence
        is_table_exist = False
        if self.bq_hook.table_exists(self.bq_project_name, self.bq_dataset_name, self.bq_table_name):
            is_table_exist = True

        create_disposition = 'CREATE_IF_NEEDED'
        write_disposition = 'WRITE_TRUNCATE'
        if self.is_append_mode and is_table_exist:
            create_disposition = 'CREATE_NEVER'
            write_disposition = 'WRITE_APPEND'

        self.bq_cursor.run_load(
            destination_project_dataset_table=self.bq_table_id(self.bq_project_name,
                                                               self.bq_dataset_name,
                                                               self.bq_table_name),
            schema_fields=self.bq_table_schema,
            source_uris=self.gs_uri(self.gcs_bucket_name, self.csv_file_name),
            create_disposition=create_disposition,
            write_disposition=write_disposition,
            max_bad_records=0,
            allow_quoted_newlines=True,
            field_delimiter=',',
            src_fmt_configs={'nullMarker': 'NULL'},
            time_partitioning=tp_dictionary
        )

    def set_last_updated_time(self):
        self.log.info("set_last_updated_time() is started")
        self.set_bigquery_last_updated_time()

    def clean_up(self):
        self.log.info("clean_up() is started")
        # clean_up() runs in the finally block, so the CSV may not have been written yet
        if getattr(self, 'csv_file_name', None) and os.path.isfile(self.csv_file_name):
            os.remove(self.csv_file_name)
        # self.gcs_hook.delete(self.gcs_bucket_name, self.csv_file_name)

    ###########################################################################
    # helper
    def add_seconds(self, target_time, second_value):
        time_seconds_added = dt.datetime.strptime(target_time, '%Y-%m-%d %H:%M:%S')
        time_seconds_added = time_seconds_added + dt.timedelta(seconds=second_value)
        return time_seconds_added.strftime('%Y-%m-%d %H:%M:%S')

    def calc_inverval_end_time(self, interval_start_time, interval_day):
        start_time = dt.datetime.strptime(interval_start_time, '%Y-%m-%d %H:%M:%S').replace(tzinfo=dt.timezone.utc)
        end_time = dt.datetime.now(dt.timezone.utc)
        if interval_day >= 1:
            end_time = start_time + dt.timedelta(days=interval_day)
            # subtract 1 second from interval_end_at for [ )
            end_time = end_time + dt.timedelta(seconds=-1)
        current_time = dt.datetime.now(dt.timezone.utc)
        if end_time > current_time:
            end_time = current_time
        interval_end_time = end_time.strftime('%Y-%m-%d %H:%M:%S')
        return interval_end_time

    def bq_table_id(self, project, dataset, table):
        return f"{project}:{dataset}.{table}"

    def gs_uri(self, bucket, file_key):
        return f"gs://{bucket}/{file_key}"

    def replace_escape_char(self, file_name):
        with open(file_name, 'r') as file:
            data = file.read()
        data = data.replace("'", "\"")
        with open(file_name, 'w') as file:
            file.write(data)

    def get_bigquery_last_updated_time(self):
        bq_query = self.get_last_updated_time_value_query(self.update_info_dataset_id, self.update_info_table_id,
                                                          self.bq_dataset_name, self.bq_table_name)
        self.log.info(f"get_bigquery_last_updated_time() - debug query: {bq_query}")
        bq_conn = self.bq_hook.get_conn()
        temp_bq_cursor = bq_conn.cursor()
        temp_bq_cursor.execute(bq_query)
        last_updated_time_value_row = temp_bq_cursor.fetchone()
        if last_updated_time_value_row is None:
            self.log.info(f"there is no bigquery table: {self.bq_table_name}")
            return PATCH_START_TIME
        else:
            last_updated = last_updated_time_value_row[0]
            self.log.info(f"get bigquery last updated time value: {last_updated}")
            last_updated_formatted = dt.datetime.strptime(last_updated, '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S')
            return last_updated_formatted

    def set_bigquery_last_updated_time(self):
        # first, check the inserted data
        bq_query_get = self.get_last_updated_time_value_query(self.update_info_dataset_id, self.update_info_table_id,
                                                              self.bq_dataset_name, self.bq_table_name)
        bq_conn_get = self.bq_hook.get_conn()
        bq_cursor_get = bq_conn_get.cursor()
        bq_cursor_get.execute(bq_query_get)
        last_updated_time_value_row = bq_cursor_get.fetchone()

        # insert or update last_updated time
        last_updated_formatted = dt.datetime.strptime(self.interval_end_at,
                                                      '%Y-%m-%d %H:%M:%S').strftime('%Y%m%d%H%M%S')
        bq_query_set = ''
        if last_updated_time_value_row is None:
            self.log.info(f"insert last_updated: {self.bq_table_name}")
            bq_query_set = self.insert_last_updated_time_value_query(self.update_info_dataset_id,
                                                                     self.update_info_table_id,
                                                                     self.bq_dataset_name, self.bq_table_name,
                                                                     last_updated_formatted)
        else:
            self.log.info(f"update last_updated: {self.bq_table_name}")
            bq_query_set = self.update_last_updated_time_value_query(self.update_info_dataset_id,
                                                                     self.update_info_table_id,
                                                                     self.bq_dataset_name, self.bq_table_name,
                                                                     last_updated_formatted)
        bq_conn_set = self.bq_hook.get_conn()
        bq_cursor_set = bq_conn_set.cursor()
        bq_cursor_set.execute(bq_query_set)

    # queries for last_updated column
    def get_last_updated_time_value_query(self, ref_dataset, ref_table, target_dataset, target_table):
        return (f"SELECT last_updated FROM {ref_dataset}.{ref_table} "
                f"WHERE dataset_id = '{target_dataset}' AND table_id = '{target_table}';")

    def insert_last_updated_time_value_query(self, ref_dataset, ref_table, target_dataset, target_table,
                                             last_updated_value):
        return (f"INSERT INTO {ref_dataset}.{ref_table} (dataset_id, table_id, last_updated) "
                f"VALUES ('{target_dataset}', '{target_table}', '{last_updated_value}');")

    def update_last_updated_time_value_query(self, ref_dataset, ref_table, target_dataset, target_table,
                                             last_updated_value):
        return (f"UPDATE {ref_dataset}.{ref_table} SET last_updated = '{last_updated_value}' "
                f"WHERE dataset_id = '{target_dataset}' AND table_id = '{target_table}';")
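A sketch of how the Geotab operator might be instantiated; the connection IDs, project/dataset/table names, type name, and schema below are placeholders invented for illustration.

# Hypothetical instantiation of GeotabToBigQueryOperator.
geotab_device_to_bq = GeotabToBigQueryOperator(
    task_id='geotab_device_to_bq',
    gcs_conn_id='google_cloud_default',
    gcs_bucket_name='example-geotab-staging',   # placeholder
    bq_project_name='example-project',          # placeholder
    bq_dataset_name='geotab',                   # placeholder
    bq_table_name='device',                     # placeholder
    bq_table_schema=[
        {'name': 'snapshotAt', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
        {'name': 'id', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
    ],
    update_info_dataset_id='etl_metadata',      # placeholder
    update_info_table_id='last_updated',        # placeholder
    geotab_conn_id='geotab_default',
    geotab_data_type_name='Device',             # placeholder Geotab type name
    add_snapshot_time_column=True,
    dag=dag)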
def build_export_dag( dag_id, provider_uris, output_bucket, export_start_date, export_end_date=None, notification_emails=None, export_schedule_interval='0 0 * * *', export_max_workers=5, export_max_active_runs=None, gzip=False, ): """Build Export DAG""" default_dag_args = { "depends_on_past": False, "start_date": export_start_date, "end_date": export_end_date, "email_on_failure": True, "email_on_retry": False, "retries": 10, "retry_delay": timedelta(minutes=5) } if notification_emails and len(notification_emails) > 0: default_dag_args['email'] = [ email.strip() for email in notification_emails.split(',') ] if export_max_active_runs is None: export_max_active_runs = configuration.conf.getint( 'core', 'max_active_runs_per_dag') dag = DAG(dag_id, schedule_interval=export_schedule_interval, default_args=default_dag_args, max_active_runs=export_max_active_runs) from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook cloud_storage_hook = GoogleCloudStorageHook( google_cloud_storage_conn_id="google_cloud_default") def import_zilliqaetl(): from zilliqaetl.cli import ( get_ds_block_range_for_date, get_tx_block_range_for_date, export_ds_blocks, export_tx_blocks, ) globals()['get_ds_block_range_for_date'] = get_ds_block_range_for_date globals()['get_tx_block_range_for_date'] = get_tx_block_range_for_date globals()['export_ds_blocks'] = export_ds_blocks globals()['export_tx_blocks'] = export_tx_blocks # Export def export_path(directory, date): return "export/{directory}/block_date={block_date}/".format( directory=directory, block_date=date.strftime("%Y-%m-%d")) def copy_to_export_path(file_path, export_path, upload_empty_if_not_exist=True): logging.info('Calling copy_to_export_path({}, {})'.format( file_path, export_path)) filename = os.path.basename(file_path) if gzip: filename = Path(file_path).stem + '.gz' if not os.path.exists(file_path): if upload_empty_if_not_exist: open(file_path, mode='a').close() else: raise ValueError('File {} does not exist'.format(file_path)) upload_to_gcs(gcs_hook=cloud_storage_hook, bucket=output_bucket, object=export_path + filename, filename=file_path, gzip=gzip) def get_ds_block_range(tempdir, date, provider_uri): logging.info('Calling get_ds_block_range_for_date({}, {}, ...)'.format( provider_uri, date)) get_ds_block_range_for_date.callback(provider_uri=provider_uri, date=date, output=os.path.join( tempdir, "ds_block_range.txt")) with open(os.path.join(tempdir, "ds_block_range.txt")) as block_range_file: block_range = block_range_file.read() start_block, end_block = block_range.split(",") return int(start_block), int(end_block) def get_tx_block_range(tempdir, date, provider_uri): logging.info('Calling get_tx_block_range_for_date({}, {}, ...)'.format( provider_uri, date)) get_tx_block_range_for_date.callback(provider_uri=provider_uri, date=date, output=os.path.join( tempdir, "tx_block_range.txt")) with open(os.path.join(tempdir, "tx_block_range.txt")) as block_range_file: block_range = block_range_file.read() start_block, end_block = block_range.split(",") return int(start_block), int(end_block) def export_ds_blocks_command(execution_date, provider_uri, **kwargs): import_zilliqaetl() with TemporaryDirectory() as tempdir: start_block, end_block = get_ds_block_range( tempdir, execution_date, provider_uri) logging.info('Calling export_ds_blocks({}, {}, {}, {}, {})'.format( start_block, end_block, provider_uri, export_max_workers, tempdir)) export_ds_blocks.callback(start_block=start_block, end_block=end_block, provider_uri=provider_uri, 
max_workers=export_max_workers, output_dir=tempdir, output_format='json') copy_to_export_path(os.path.join(tempdir, "ds_block_range.txt"), export_path("ds_block_range", execution_date)) copy_to_export_path(os.path.join(tempdir, "ds_blocks.json"), export_path("ds_blocks", execution_date)) def export_tx_blocks_command(execution_date, provider_uri, **kwargs): import_zilliqaetl() with TemporaryDirectory() as tempdir: start_block, end_block = get_tx_block_range( tempdir, execution_date, provider_uri) logging.info('Calling export_tx_blocks({}, {}, {}, {}, {})'.format( start_block, end_block, provider_uri, export_max_workers, tempdir)) export_tx_blocks.callback(start_block=start_block, end_block=end_block, provider_uri=provider_uri, max_workers=export_max_workers, output_dir=tempdir, output_format='json', rate_limit=3) copy_to_export_path(os.path.join(tempdir, "tx_block_range.txt"), export_path("tx_block_range", execution_date)) copy_to_export_path(os.path.join(tempdir, "tx_blocks.json"), export_path("tx_blocks", execution_date)) copy_to_export_path(os.path.join(tempdir, "transactions.json"), export_path("transactions", execution_date)) copy_to_export_path(os.path.join(tempdir, "transitions.json"), export_path("transitions", execution_date)) copy_to_export_path(os.path.join(tempdir, "event_logs.json"), export_path("event_logs", execution_date)) copy_to_export_path(os.path.join(tempdir, "exceptions.json"), export_path("exceptions", execution_date)) def add_export_task(toggle, task_id, python_callable, dependencies=None): if toggle: operator = python_operator.PythonOperator( task_id=task_id, python_callable=python_callable, provide_context=True, execution_timeout=timedelta(hours=48), dag=dag, ) if dependencies is not None and len(dependencies) > 0: for dependency in dependencies: if dependency is not None: dependency >> operator return operator else: return None # Operators add_export_task( True, "export_ds_blocks", add_provider_uri_fallback_loop(export_ds_blocks_command, provider_uris), ) add_export_task( True, "export_tx_blocks", add_provider_uri_fallback_loop(export_tx_blocks_command, provider_uris)) return dag
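A DAG-file usage sketch for the factory above; the provider URI, bucket, and dates are placeholders.

# Hypothetical DAG-file usage of the Zilliqa build_export_dag.
from datetime import datetime

globals()['zilliqa_export_dag'] = build_export_dag(
    dag_id='zilliqa_export_dag',
    provider_uris=['https://api.zilliqa.com'],   # placeholder endpoint
    output_bucket='example-zilliqa-export',      # placeholder
    export_start_date=datetime(2021, 1, 1),
    export_schedule_interval='0 2 * * *',
    export_max_workers=5,
    gzip=True)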
def list_objects(bucket=None):
    hook = GoogleCloudStorageHook()
    storage_objects = hook.list(bucket)

    return storage_objects
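A sketch of how the helper above could be exposed as a task; the DAG, task id, and bucket name are placeholders.

# Hypothetical task that logs the listing returned by list_objects().
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG('example_list_bucket', start_date=datetime(2021, 1, 1), schedule_interval=None)

def log_bucket_contents():
    for name in list_objects(bucket='example-bucket'):  # placeholder bucket
        print(name)

list_bucket = PythonOperator(
    task_id='list_bucket',
    python_callable=log_bucket_contents,
    dag=dag)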
def execute(self, context): hook = GoogleCloudStorageHook( google_cloud_storage_conn_id=self.google_cloud_storage_conn_id, delegate_to=self.delegate_to) if '*' in self.source_object: wildcard_position = self.source_object.index('*') objects = hook.list( self.source_bucket, prefix=self.source_object[:wildcard_position], delimiter=self.source_object[wildcard_position + 1:]) for source_object in objects: if self.destination_object: destination_object = "{}/{}".format( self.destination_object, source_object[wildcard_position:]) else: destination_object = source_object self.log.info('Executing copy of gs://{0}/{1} to ' 'gs://{2}/{3}'.format(self.source_bucket, source_object, self.destination_bucket, destination_object)) hook.copy(self.source_bucket, source_object, self.destination_bucket, destination_object) if self.move_object: hook.delete(self.source_bucket, source_object) else: self.log.info('Executing copy of gs://{0}/{1} to ' 'gs://{2}/{3}'.format( self.source_bucket, self.source_object, self.destination_bucket or self.source_bucket, self.destination_object or self.source_object)) hook.copy(self.source_bucket, self.source_object, self.destination_bucket, self.destination_object) if self.move_object: hook.delete(self.source_bucket, self.source_object)
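Assuming this execute() comes from the contrib GoogleCloudStorageToGoogleCloudStorageOperator, a wildcard usage sketch follows; bucket names are placeholders. With these values, an object data/sales/jan.avro would be copied to backup/jan.avro because the part after the '*' position is appended to destination_object.

# Hypothetical wildcard copy between two GCS buckets.
from airflow.contrib.operators.gcs_to_gcs import GoogleCloudStorageToGoogleCloudStorageOperator

copy_sales_files = GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id='copy_sales_files',
    source_bucket='example-source-bucket',       # placeholder
    source_object='data/sales/*.avro',
    destination_bucket='example-backup-bucket',  # placeholder
    destination_object='backup',
    move_object=False,                           # True would delete the source objects after copy
    dag=dag)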
def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id
    )

    hook.insert_bucket_acl(bucket_name=self.bucket, entity=self.entity, role=self.role,
                           user_project=self.user_project)
def execute(self, context):
    # use the super method to list all the files in an S3 bucket/key
    files = super().execute(context)

    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.dest_gcs_conn_id,
        delegate_to=self.delegate_to)

    if not self.replace:
        # if we are not replacing -> list all files in the GCS bucket
        # and only keep those files which are present in
        # S3 and not in Google Cloud Storage
        bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
        existing_files_prefixed = gcs_hook.list(bucket_name, prefix=object_prefix)

        existing_files = []

        if existing_files_prefixed:
            # Remove the object prefix itself, an empty directory was found
            if object_prefix in existing_files_prefixed:
                existing_files_prefixed.remove(object_prefix)

            # Remove the object prefix from all object string paths
            for f in existing_files_prefixed:
                if f.startswith(object_prefix):
                    existing_files.append(f[len(object_prefix):])
                else:
                    existing_files.append(f)

        files = list(set(files) - set(existing_files))
        if len(files) > 0:
            self.log.info('%s files are going to be synced: %s.', len(files), files)
        else:
            self.log.info('There are no new files to sync. Have a nice day!')

    if files:
        hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

        for file in files:
            # GCS hook builds its own in-memory file so we have to create
            # and pass the path
            file_object = hook.get_key(file, self.bucket)
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                file_object.download_fileobj(f)
                f.flush()

                dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(self.dest_gcs)
                # There will always be a '/' before file because it is
                # enforced at instantiation time
                dest_gcs_object = dest_gcs_object_prefix + file

                # Sync is sequential and the hook already logs too much
                # so skip this for now
                # self.log.info(
                #     'Saving file {0} from S3 bucket {1} in GCS bucket {2}'
                #     ' as object {3}'.format(file, self.bucket,
                #                             dest_gcs_bucket,
                #                             dest_gcs_object))

                gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name)

        self.log.info("All done, uploaded %d files to Google Cloud Storage", len(files))
    else:
        self.log.info('In sync, no files needed to be uploaded to Google Cloud Storage')

    return files
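Assuming this execute() belongs to the contrib S3ToGoogleCloudStorageOperator, a usage sketch follows; the S3 bucket, prefix, and GCS destination are placeholders, and dest_gcs is expected to end with '/'.

# Hypothetical S3-to-GCS sync task.
from airflow.contrib.operators.s3_to_gcs_operator import S3ToGoogleCloudStorageOperator

s3_to_gcs = S3ToGoogleCloudStorageOperator(
    task_id='s3_to_gcs',
    bucket='example-s3-bucket',                               # placeholder
    prefix='exports/2020-01-01/',                             # placeholder
    aws_conn_id='aws_default',
    dest_gcs='gs://example-gcs-bucket/exports/2020-01-01/',   # placeholder, must end with '/'
    dest_gcs_conn_id='google_cloud_default',
    replace=False,
    dag=dag)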
def __init__(self, gcp_conn_id='google_cloud_default', delegate_to=None):
    self._gcs_hook = GoogleCloudStorageHook(gcp_conn_id, delegate_to)
def _get_gcs_hook(self):
    if self.gcs_hook is None:
        self.gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.gcs_conn_id,
            delegate_to=self.delegate_to)
    return self.gcs_hook
def execute(self, context):
    logging.info('Executing download: %s, %s, %s', self.bucket, self.object, self.filename)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id)
    print(hook.download(self.bucket, self.object, self.filename))
def check_gcs_file_exists(file_name, google_cloud_conn_id, bucket):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=google_cloud_conn_id)
    return hook.exists(bucket, file_name)
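A sketch of using the helper above as a guard task; the object name and bucket are placeholders, and downstream tasks are skipped when the object is missing.

# Hypothetical short-circuit guard built on check_gcs_file_exists().
from functools import partial
from airflow.operators.python_operator import ShortCircuitOperator

file_present = ShortCircuitOperator(
    task_id='file_present',
    python_callable=partial(check_gcs_file_exists,
                            'exports/latest/data.csv',   # placeholder object name
                            'google_cloud_default',
                            'example-bucket'),           # placeholder bucket
    dag=dag)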
def execute(self, context):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id
    )

    hook.insert_bucket_acl(bucket=self.bucket, entity=self.entity, role=self.role,
                           user_project=self.user_project)