def execute(self, context: "Context") -> list: hook = GCSHook( gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, impersonation_chain=self.impersonation_chain, ) self.log.info( 'Getting list of the files. Bucket: %s; Delimiter: %s; Prefix: %s', self.bucket, self.delimiter, self.prefix, ) StorageLink.persist( context=context, task_instance=self, uri=self.bucket, project_id=hook.project_id, ) return hook.list(bucket_name=self.bucket, prefix=self.prefix, delimiter=self.delimiter)
def execute(self, context: "Context"): hook = DataprocMetastoreHook( gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain) self.log.info("Exporting metadata from Dataproc Metastore service: %s", self.service_id) hook.export_metadata( destination_gcs_folder=self.destination_gcs_folder, project_id=self.project_id, region=self.region, service_id=self.service_id, request_id=self.request_id, database_dump_type=self.database_dump_type, retry=self.retry, timeout=self.timeout, metadata=self.metadata, ) metadata_export = self._wait_for_export_metadata(hook) self.log.info("Metadata from service %s exported successfully", self.service_id) DataprocMetastoreLink.persist(context=context, task_instance=self, url=METASTORE_EXPORT_LINK) uri = self._get_uri_from_destination( MetadataExport.to_dict(metadata_export)["destination_gcs_uri"]) StorageLink.persist(context=context, task_instance=self, uri=uri) return MetadataExport.to_dict(metadata_export)
def execute(self, context: "Context") -> None: hook = GCSHook( gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain, ) StorageLink.persist( context=context, task_instance=self, uri=self.bucket, project_id=hook.project_id, ) hook.insert_bucket_acl( bucket_name=self.bucket, entity=self.entity, role=self.role, user_project=self.user_project )
def execute(self, context: "Context") -> None: hook = GCSHook( gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, impersonation_chain=self.impersonation_chain, ) StorageLink.persist( context=context, task_instance=self, uri=self._get_uri(self.destination_bucket, self.destination_object), project_id=hook.project_id, ) hook.sync( source_bucket=self.source_bucket, destination_bucket=self.destination_bucket, source_object=self.source_object, destination_object=self.destination_object, recursive=self.recursive, delete_extra_files=self.delete_extra_files, allow_overwrite=self.allow_overwrite, )
def execute(self, context: "Context") -> None: hook = GCSHook( gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, impersonation_chain=self.impersonation_chain, ) StorageLink.persist( context=context, task_instance=self, uri=self.bucket_name, project_id=self.project_id or hook.project_id, ) try: hook.create_bucket( bucket_name=self.bucket_name, resource=self.resource, storage_class=self.storage_class, location=self.location, project_id=self.project_id, labels=self.labels, ) except Conflict: # HTTP 409 self.log.warning("Bucket %s already exists", self.bucket_name)
def execute(self, context: 'Context') -> dict: self.log.info('Exporting data to Cloud Storage bucket %s', self.bucket) if self.overwrite_existing and self.namespace: gcs_hook = GCSHook(self.cloud_storage_conn_id, impersonation_chain=self.impersonation_chain) objects = gcs_hook.list(self.bucket, prefix=self.namespace) for obj in objects: gcs_hook.delete(self.bucket, obj) ds_hook = DatastoreHook( gcp_conn_id=self.datastore_conn_id, delegate_to=self.delegate_to, impersonation_chain=self.impersonation_chain, ) result = ds_hook.export_to_storage_bucket( bucket=self.bucket, namespace=self.namespace, entity_filter=self.entity_filter, labels=self.labels, project_id=self.project_id, ) operation_name = result['name'] result = ds_hook.poll_operation_until_done( operation_name, self.polling_interval_in_seconds) state = result['metadata']['common']['state'] if state != 'SUCCESSFUL': raise AirflowException(f'Operation failed: result={result}') StorageLink.persist( context=context, task_instance=self, uri= f"{self.bucket}/{result['response']['outputUrl'].split('/')[3]}", project_id=self.project_id or ds_hook.project_id, ) return result
class DataprocMetastoreExportMetadataOperator(BaseOperator):
    """
    Exports metadata from a service.

    :param destination_gcs_folder: A Cloud Storage URI of a folder, in the format
        ``gs://<bucket_name>/<path_inside_bucket>``. A sub-folder ``<export_folder>`` containing exported
        files will be created below it.
    :param project_id: Required. The ID of the Google Cloud project that the service belongs to.
    :param region: Required. The ID of the Google Cloud region that the service belongs to.
    :param service_id: Required. The ID of the metastore service, which is used as the final component of
        the metastore service's name. This value must be between 2 and 63 characters long inclusive, begin
        with a letter, end with a letter or number, and consist of alphanumeric ASCII characters or
        hyphens.

        This corresponds to the ``service_id`` field on the ``request`` instance; if ``request`` is
        provided, this should not be set.
    :param request_id: Optional. A unique id used to identify the request.
    :param retry: Optional. Designation of what errors, if any, should be retried.
    :param timeout: Optional. The timeout for this request.
    :param metadata: Optional. Strings which should be sent along with the request as metadata.
    :param gcp_conn_id: The connection ID to use connecting to Google Cloud.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
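
    Example usage (an illustrative sketch; the task ID, project, region, service ID and destination
    folder below are placeholder values, not values taken from this module):

    .. code-block:: python

        export_metadata = DataprocMetastoreExportMetadataOperator(
            task_id="export_metadata",
            project_id="example-project",
            region="europe-west1",
            service_id="example-metastore-service",
            destination_gcs_folder="gs://example-bucket/metastore-export",
        )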
""" template_fields: Sequence[str] = ( 'project_id', 'impersonation_chain', ) operator_extra_links = (DataprocMetastoreLink(), StorageLink()) def __init__( self, *, destination_gcs_folder: str, project_id: str, region: str, service_id: str, request_id: Optional[str] = None, database_dump_type: Optional[DatabaseDumpSpec] = None, retry: Optional[Retry] = None, timeout: Optional[float] = None, metadata: Sequence[Tuple[str, str]] = (), gcp_conn_id: str = "google_cloud_default", impersonation_chain: Optional[Union[str, Sequence[str]]] = None, **kwargs, ) -> None: super().__init__(**kwargs) self.destination_gcs_folder = destination_gcs_folder self.project_id = project_id self.region = region self.service_id = service_id self.request_id = request_id self.database_dump_type = database_dump_type self.retry = retry self.timeout = timeout self.metadata = metadata self.gcp_conn_id = gcp_conn_id self.impersonation_chain = impersonation_chain def execute(self, context: "Context"): hook = DataprocMetastoreHook( gcp_conn_id=self.gcp_conn_id, impersonation_chain=self.impersonation_chain ) self.log.info("Exporting metadata from Dataproc Metastore service: %s", self.service_id) hook.export_metadata( destination_gcs_folder=self.destination_gcs_folder, project_id=self.project_id, region=self.region, service_id=self.service_id, request_id=self.request_id, database_dump_type=self.database_dump_type, retry=self.retry, timeout=self.timeout, metadata=self.metadata, ) metadata_export = self._wait_for_export_metadata(hook) self.log.info("Metadata from service %s exported successfully", self.service_id) DataprocMetastoreLink.persist(context=context, task_instance=self, url=METASTORE_EXPORT_LINK) uri = self._get_uri_from_destination(MetadataExport.to_dict(metadata_export)["destination_gcs_uri"]) StorageLink.persist(context=context, task_instance=self, uri=uri) return MetadataExport.to_dict(metadata_export) def _get_uri_from_destination(self, destination_uri: str): return destination_uri[5:] if destination_uri.startswith("gs://") else destination_uri def _wait_for_export_metadata(self, hook: DataprocMetastoreHook): """ Workaround to check that export was created successfully. We discovered a issue to parse result to MetadataExport inside the SDK """ for time_to_wait in exponential_sleep_generator(initial=10, maximum=120): sleep(time_to_wait) service = hook.get_service( region=self.region, project_id=self.project_id, service_id=self.service_id, retry=self.retry, timeout=self.timeout, metadata=self.metadata, ) activities: MetadataManagementActivity = service.metadata_management_activity metadata_export: MetadataExport = activities.metadata_exports[0] if metadata_export.state == MetadataExport.State.SUCCEEDED: return metadata_export if metadata_export.state == MetadataExport.State.FAILED: raise AirflowException( f"Exporting metadata from Dataproc Metastore {metadata_export.name} FAILED" )
class GCSSynchronizeBucketsOperator(BaseOperator):
    """
    Synchronizes the contents of the buckets or bucket's directories in Google Cloud Storage.

    Parameters ``source_object`` and ``destination_object`` describe the root sync directory. If they are
    not passed, the entire bucket will be synchronized. If they are passed, they should point
    to directories.

    .. note::
        The synchronization of individual files is not supported. Only entire directories can be
        synchronized.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:GCSSynchronizeBuckets`

    :param source_bucket: The name of the bucket containing the source objects.
    :param destination_bucket: The name of the bucket containing the destination objects.
    :param source_object: The root sync directory in the source bucket.
    :param destination_object: The root sync directory in the destination bucket.
    :param recursive: If True, subdirectories will be considered.
    :param allow_overwrite: if True, the files will be overwritten if a mismatched file is found.
        By default, overwriting files is not allowed.
    :param delete_extra_files: if True, deletes additional files from the source that are not found
        in the destination. By default, extra files are not deleted.

        .. note::
            This option can delete data quickly if you specify the wrong source/destination combination.

    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud.
    :param delegate_to: The account to impersonate using domain-wide delegation of authority,
        if any. For this to work, the service account making the request must have
        domain-wide delegation enabled.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
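
    Example usage (an illustrative sketch; the bucket names and paths below are placeholders,
    not values taken from this module):

    .. code-block:: python

        sync_buckets = GCSSynchronizeBucketsOperator(
            task_id="sync_buckets",
            source_bucket="example-source-bucket",
            source_object="backups/",
            destination_bucket="example-destination-bucket",
            destination_object="backups/",
            delete_extra_files=False,
            allow_overwrite=False,
        )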
""" template_fields: Sequence[str] = ( 'source_bucket', 'destination_bucket', 'source_object', 'destination_object', 'recursive', 'delete_extra_files', 'allow_overwrite', 'gcp_conn_id', 'delegate_to', 'impersonation_chain', ) operator_extra_links = (StorageLink(), ) def __init__( self, *, source_bucket: str, destination_bucket: str, source_object: Optional[str] = None, destination_object: Optional[str] = None, recursive: bool = True, delete_extra_files: bool = False, allow_overwrite: bool = False, gcp_conn_id: str = 'google_cloud_default', delegate_to: Optional[str] = None, impersonation_chain: Optional[Union[str, Sequence[str]]] = None, **kwargs, ) -> None: super().__init__(**kwargs) self.source_bucket = source_bucket self.destination_bucket = destination_bucket self.source_object = source_object self.destination_object = destination_object self.recursive = recursive self.delete_extra_files = delete_extra_files self.allow_overwrite = allow_overwrite self.gcp_conn_id = gcp_conn_id self.delegate_to = delegate_to self.impersonation_chain = impersonation_chain def execute(self, context: "Context") -> None: hook = GCSHook( gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to, impersonation_chain=self.impersonation_chain, ) StorageLink.persist( context=context, task_instance=self, uri=self._get_uri(self.destination_bucket, self.destination_object), project_id=hook.project_id, ) hook.sync( source_bucket=self.source_bucket, destination_bucket=self.destination_bucket, source_object=self.source_object, destination_object=self.destination_object, recursive=self.recursive, delete_extra_files=self.delete_extra_files, allow_overwrite=self.allow_overwrite, ) def _get_uri(self, gcs_bucket: str, gcs_object: Optional[str]) -> str: if gcs_object and gcs_object[-1] == "/": gcs_object = gcs_object[:-1] return f"{gcs_bucket}/{gcs_object}" if gcs_object else gcs_bucket
def execute(self, context: "Context") -> List[str]: # Define intervals and prefixes. try: timespan_start = context["data_interval_start"] timespan_end = context["data_interval_end"] except KeyError: timespan_start = pendulum.instance(context["execution_date"]) following_execution_date = context["dag"].following_schedule( context["execution_date"]) if following_execution_date is None: timespan_end = None else: timespan_end = pendulum.instance(following_execution_date) if timespan_end is None: # Only possible in Airflow before 2.2. self.log.warning( "No following schedule found, setting timespan end to max %s", timespan_end) timespan_end = DateTime.max elif timespan_start >= timespan_end: # Airflow 2.2 sets start == end for non-perodic schedules. self.log.warning( "DAG schedule not periodic, setting timespan end to max %s", timespan_end) timespan_end = DateTime.max timespan_start = timespan_start.in_timezone(timezone.utc) timespan_end = timespan_end.in_timezone(timezone.utc) source_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix( self.source_prefix, timespan_start, ) destination_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix( self.destination_prefix, timespan_start, ) source_hook = GCSHook( gcp_conn_id=self.source_gcp_conn_id, impersonation_chain=self.source_impersonation_chain, ) destination_hook = GCSHook( gcp_conn_id=self.destination_gcp_conn_id, impersonation_chain=self.destination_impersonation_chain, ) StorageLink.persist( context=context, task_instance=self, uri=self.destination_bucket, project_id=destination_hook.project_id, ) # Fetch list of files. blobs_to_transform = source_hook.list_by_timespan( bucket_name=self.source_bucket, prefix=source_prefix_interp, timespan_start=timespan_start, timespan_end=timespan_end, ) with TemporaryDirectory() as temp_input_dir, TemporaryDirectory( ) as temp_output_dir: temp_input_dir_path = Path(temp_input_dir) temp_output_dir_path = Path(temp_output_dir) # TODO: download in parallel. for blob_to_transform in blobs_to_transform: destination_file = temp_input_dir_path / blob_to_transform destination_file.parent.mkdir(parents=True, exist_ok=True) try: source_hook.download( bucket_name=self.source_bucket, object_name=blob_to_transform, filename=str(destination_file), chunk_size=self.chunk_size, num_max_attempts=self.download_num_attempts, ) except GoogleCloudError: if self.download_continue_on_fail: continue raise self.log.info("Starting the transformation") cmd = [self.transform_script] if isinstance( self.transform_script, str) else self.transform_script cmd += [ str(temp_input_dir_path), str(temp_output_dir_path), timespan_start.replace(microsecond=0).isoformat(), timespan_end.replace(microsecond=0).isoformat(), ] with subprocess.Popen(args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) as process: self.log.info("Process output:") if process.stdout: for line in iter(process.stdout.readline, b''): self.log.info( line.decode(self.output_encoding).rstrip()) process.wait() if process.returncode: raise AirflowException( f"Transform script failed: {process.returncode}") self.log.info( "Transformation succeeded. Output temporarily located at %s", temp_output_dir_path) files_uploaded = [] # TODO: upload in parallel. 
for upload_file in temp_output_dir_path.glob("**/*"): if upload_file.is_dir(): continue upload_file_name = str( upload_file.relative_to(temp_output_dir_path)) if self.destination_prefix is not None: upload_file_name = f"{destination_prefix_interp}/{upload_file_name}" self.log.info("Uploading file %s to %s", upload_file, upload_file_name) try: destination_hook.upload( bucket_name=self.destination_bucket, object_name=upload_file_name, filename=str(upload_file), chunk_size=self.chunk_size, num_max_attempts=self.upload_num_attempts, ) files_uploaded.append(str(upload_file_name)) except GoogleCloudError: if self.upload_continue_on_fail: continue raise return files_uploaded
class GCSTimeSpanFileTransformOperator(BaseOperator):
    """
    Determines a list of objects that were added or modified at a GCS source location during a specific
    time-span, copies them to a temporary location on the local file system, runs a transform on these
    files as specified by the transformation script and uploads the output to the destination bucket.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:GCSTimeSpanFileTransformOperator`

    The locations of the source and the destination files in the local filesystem are provided as
    the first and second arguments to the transformation script. The time-span is passed to the
    transform script as the third and fourth arguments as UTC ISO 8601 strings.

    The transformation script is expected to read the data from source, transform it and write the output
    to the local destination file.

    :param source_bucket: The bucket to fetch data from. (templated)
    :param source_prefix: Prefix string which filters objects whose name begin with this prefix.
        Can interpolate execution date and time components. (templated)
    :param source_gcp_conn_id: The connection ID to use connecting to Google Cloud
        to download files to be processed.
    :param source_impersonation_chain: Optional service account to impersonate using short-term
        credentials (to download files to be processed), or chained list of accounts required to get
        the access_token of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    :param destination_bucket: The bucket to write data to. (templated)
    :param destination_prefix: Prefix string for the upload location.
        Can interpolate execution date and time components. (templated)
    :param destination_gcp_conn_id: The connection ID to use connecting to Google Cloud
        to upload processed files.
    :param destination_impersonation_chain: Optional service account to impersonate using short-term
        credentials (to upload processed files), or chained list of accounts required to get
        the access_token of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    :param transform_script: location of the executable transformation script or list of arguments
        passed to subprocess, e.g. ``['python', 'script.py', 10]``. (templated)
    :param chunk_size: The size of a chunk of data when downloading or uploading (in bytes).
        This must be a multiple of 256 KB (per the Google Cloud Storage API specification).
    :param download_continue_on_fail: With this set to true, if a download fails, the task does not
        error out but will still continue.
    :param upload_chunk_size: The size of a chunk of data when uploading (in bytes).
        This must be a multiple of 256 KB (per the Google Cloud Storage API specification).
    :param upload_continue_on_fail: With this set to true, if an upload fails, the task does not
        error out but will still continue.
    :param upload_num_attempts: Number of attempts to try to upload a single file.
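
    Example usage (an illustrative sketch; the buckets, prefixes and script path below are
    placeholders, not values taken from this module; the prefixes use ``strftime`` codes that the
    operator interpolates with the timespan start):

    .. code-block:: python

        transform_files = GCSTimeSpanFileTransformOperator(
            task_id="transform_files",
            source_bucket="example-source-bucket",
            source_prefix="raw/%Y-%m-%d/",
            source_gcp_conn_id="google_cloud_default",
            destination_bucket="example-destination-bucket",
            destination_prefix="processed/%Y-%m-%d/",
            destination_gcp_conn_id="google_cloud_default",
            transform_script=["python", "transform.py"],
        )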
    """

    template_fields: Sequence[str] = (
        'source_bucket',
        'source_prefix',
        'destination_bucket',
        'destination_prefix',
        'transform_script',
        'source_impersonation_chain',
        'destination_impersonation_chain',
    )
    operator_extra_links = (StorageLink(),)

    @staticmethod
    def interpolate_prefix(prefix: str, dt: datetime.datetime) -> Optional[str]:
        """Interpolate prefix with datetime.

        :param prefix: The prefix to interpolate
        :param dt: The datetime to interpolate
        """
        return dt.strftime(prefix) if prefix else None

    def __init__(
        self,
        *,
        source_bucket: str,
        source_prefix: str,
        source_gcp_conn_id: str,
        destination_bucket: str,
        destination_prefix: str,
        destination_gcp_conn_id: str,
        transform_script: Union[str, List[str]],
        source_impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
        destination_impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
        chunk_size: Optional[int] = None,
        download_continue_on_fail: Optional[bool] = False,
        download_num_attempts: int = 1,
        upload_continue_on_fail: Optional[bool] = False,
        upload_num_attempts: int = 1,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.source_bucket = source_bucket
        self.source_prefix = source_prefix
        self.source_gcp_conn_id = source_gcp_conn_id
        self.source_impersonation_chain = source_impersonation_chain
        self.destination_bucket = destination_bucket
        self.destination_prefix = destination_prefix
        self.destination_gcp_conn_id = destination_gcp_conn_id
        self.destination_impersonation_chain = destination_impersonation_chain
        self.transform_script = transform_script
        self.output_encoding = sys.getdefaultencoding()
        self.chunk_size = chunk_size
        self.download_continue_on_fail = download_continue_on_fail
        self.download_num_attempts = download_num_attempts
        self.upload_continue_on_fail = upload_continue_on_fail
        self.upload_num_attempts = upload_num_attempts

    def execute(self, context: "Context") -> List[str]:
        # Define intervals and prefixes.
        try:
            timespan_start = context["data_interval_start"]
            timespan_end = context["data_interval_end"]
        except KeyError:
            timespan_start = pendulum.instance(context["execution_date"])
            following_execution_date = context["dag"].following_schedule(context["execution_date"])
            if following_execution_date is None:
                timespan_end = None
            else:
                timespan_end = pendulum.instance(following_execution_date)

        if timespan_end is None:  # Only possible in Airflow before 2.2.
            self.log.warning("No following schedule found, setting timespan end to max %s", timespan_end)
            timespan_end = DateTime.max
        elif timespan_start >= timespan_end:  # Airflow 2.2 sets start == end for non-periodic schedules.
            self.log.warning("DAG schedule not periodic, setting timespan end to max %s", timespan_end)
            timespan_end = DateTime.max

        timespan_start = timespan_start.in_timezone(timezone.utc)
        timespan_end = timespan_end.in_timezone(timezone.utc)

        source_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix(
            self.source_prefix,
            timespan_start,
        )
        destination_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix(
            self.destination_prefix,
            timespan_start,
        )

        source_hook = GCSHook(
            gcp_conn_id=self.source_gcp_conn_id,
            impersonation_chain=self.source_impersonation_chain,
        )
        destination_hook = GCSHook(
            gcp_conn_id=self.destination_gcp_conn_id,
            impersonation_chain=self.destination_impersonation_chain,
        )
        StorageLink.persist(
            context=context,
            task_instance=self,
            uri=self.destination_bucket,
            project_id=destination_hook.project_id,
        )

        # Fetch list of files.
        blobs_to_transform = source_hook.list_by_timespan(
            bucket_name=self.source_bucket,
            prefix=source_prefix_interp,
            timespan_start=timespan_start,
            timespan_end=timespan_end,
        )

        with TemporaryDirectory() as temp_input_dir, TemporaryDirectory() as temp_output_dir:
            temp_input_dir_path = Path(temp_input_dir)
            temp_output_dir_path = Path(temp_output_dir)

            # TODO: download in parallel.
            for blob_to_transform in blobs_to_transform:
                destination_file = temp_input_dir_path / blob_to_transform
                destination_file.parent.mkdir(parents=True, exist_ok=True)
                try:
                    source_hook.download(
                        bucket_name=self.source_bucket,
                        object_name=blob_to_transform,
                        filename=str(destination_file),
                        chunk_size=self.chunk_size,
                        num_max_attempts=self.download_num_attempts,
                    )
                except GoogleCloudError:
                    if self.download_continue_on_fail:
                        continue
                    raise

            self.log.info("Starting the transformation")
            cmd = [self.transform_script] if isinstance(self.transform_script, str) else self.transform_script
            cmd += [
                str(temp_input_dir_path),
                str(temp_output_dir_path),
                timespan_start.replace(microsecond=0).isoformat(),
                timespan_end.replace(microsecond=0).isoformat(),
            ]
            with subprocess.Popen(
                args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True
            ) as process:
                self.log.info("Process output:")
                if process.stdout:
                    for line in iter(process.stdout.readline, b''):
                        self.log.info(line.decode(self.output_encoding).rstrip())

                process.wait()
                if process.returncode:
                    raise AirflowException(f"Transform script failed: {process.returncode}")

            self.log.info("Transformation succeeded. Output temporarily located at %s", temp_output_dir_path)

            files_uploaded = []

            # TODO: upload in parallel.
            for upload_file in temp_output_dir_path.glob("**/*"):
                if upload_file.is_dir():
                    continue

                upload_file_name = str(upload_file.relative_to(temp_output_dir_path))

                if self.destination_prefix is not None:
                    upload_file_name = f"{destination_prefix_interp}/{upload_file_name}"

                self.log.info("Uploading file %s to %s", upload_file, upload_file_name)

                try:
                    destination_hook.upload(
                        bucket_name=self.destination_bucket,
                        object_name=upload_file_name,
                        filename=str(upload_file),
                        chunk_size=self.chunk_size,
                        num_max_attempts=self.upload_num_attempts,
                    )
                    files_uploaded.append(str(upload_file_name))
                except GoogleCloudError:
                    if self.upload_continue_on_fail:
                        continue
                    raise

            return files_uploaded
class GCSCreateBucketOperator(BaseOperator):
    """
    Creates a new bucket. Google Cloud Storage uses a flat namespace, so you can't
    create a bucket with a name that is already in use.

    .. seealso::
        For more information, see Bucket Naming Guidelines:
        https://cloud.google.com/storage/docs/bucketnaming.html#requirements

    :param bucket_name: The name of the bucket. (templated)
    :param resource: An optional dict with parameters for creating the bucket.
        For information on available parameters, see Cloud Storage API doc:
        https://cloud.google.com/storage/docs/json_api/v1/buckets/insert
    :param storage_class: This defines how objects in the bucket are stored
        and determines the SLA and the cost of storage (templated). Values include

        - ``MULTI_REGIONAL``
        - ``REGIONAL``
        - ``STANDARD``
        - ``NEARLINE``
        - ``COLDLINE``.

        If this value is not specified when the bucket is created, it will default to STANDARD.
    :param location: The location of the bucket. (templated)
        Object data for objects in the bucket resides in physical storage
        within this region. Defaults to US.

        .. seealso:: https://developers.google.com/storage/docs/bucket-locations

    :param project_id: The ID of the Google Cloud Project. (templated)
    :param labels: User-provided labels, in key/value pairs.
    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud.
    :param delegate_to: The account to impersonate using domain-wide delegation of authority,
        if any. For this to work, the service account making the request must have
        domain-wide delegation enabled.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).

    The following Operator would create a new bucket ``test-bucket`` with ``MULTI_REGIONAL``
    storage class in ``EU`` region
    .. code-block:: python

        CreateBucket = GCSCreateBucketOperator(
            task_id="CreateNewBucket",
            bucket_name="test-bucket",
            storage_class="MULTI_REGIONAL",
            location="EU",
            labels={"env": "dev", "team": "airflow"},
            gcp_conn_id="airflow-conn-id",
        )
    """

    template_fields: Sequence[str] = (
        'bucket_name',
        'storage_class',
        'location',
        'project_id',
        'impersonation_chain',
    )
    ui_color = '#f0eee4'
    operator_extra_links = (StorageLink(),)

    def __init__(
        self,
        *,
        bucket_name: str,
        resource: Optional[Dict] = None,
        storage_class: str = 'MULTI_REGIONAL',
        location: str = 'US',
        project_id: Optional[str] = None,
        labels: Optional[Dict] = None,
        gcp_conn_id: str = 'google_cloud_default',
        delegate_to: Optional[str] = None,
        impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.bucket_name = bucket_name
        self.resource = resource
        self.storage_class = storage_class
        self.location = location
        self.project_id = project_id
        self.labels = labels
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.impersonation_chain = impersonation_chain

    def execute(self, context: "Context") -> None:
        hook = GCSHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            impersonation_chain=self.impersonation_chain,
        )
        StorageLink.persist(
            context=context,
            task_instance=self,
            uri=self.bucket_name,
            project_id=self.project_id or hook.project_id,
        )
        try:
            hook.create_bucket(
                bucket_name=self.bucket_name,
                resource=self.resource,
                storage_class=self.storage_class,
                location=self.location,
                project_id=self.project_id,
                labels=self.labels,
            )
        except Conflict:  # HTTP 409
            self.log.warning("Bucket %s already exists", self.bucket_name)
class GCSBucketCreateAclEntryOperator(BaseOperator):
    """
    Creates a new ACL entry on the specified bucket.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:GCSBucketCreateAclEntryOperator`

    :param bucket: Name of a bucket.
    :param entity: The entity holding the permission, in one of the following forms:
        user-userId, user-email, group-groupId, group-email, domain-domain,
        project-team-projectId, allUsers, allAuthenticatedUsers
    :param role: The access permission for the entity.
        Acceptable values are: "OWNER", "READER", "WRITER".
    :param user_project: (Optional) The project to be billed for this request.
        Required for Requester Pays buckets.
    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
    """

    # [START gcs_bucket_create_acl_template_fields]
    template_fields: Sequence[str] = (
        'bucket',
        'entity',
        'role',
        'user_project',
        'impersonation_chain',
    )
    # [END gcs_bucket_create_acl_template_fields]
    operator_extra_links = (StorageLink(),)

    def __init__(
        self,
        *,
        bucket: str,
        entity: str,
        role: str,
        user_project: Optional[str] = None,
        gcp_conn_id: str = 'google_cloud_default',
        impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.bucket = bucket
        self.entity = entity
        self.role = role
        self.user_project = user_project
        self.gcp_conn_id = gcp_conn_id
        self.impersonation_chain = impersonation_chain

    def execute(self, context: "Context") -> None:
        hook = GCSHook(
            gcp_conn_id=self.gcp_conn_id,
            impersonation_chain=self.impersonation_chain,
        )
        StorageLink.persist(
            context=context,
            task_instance=self,
            uri=self.bucket,
            project_id=hook.project_id,
        )
        hook.insert_bucket_acl(
            bucket_name=self.bucket, entity=self.entity, role=self.role, user_project=self.user_project
        )
class GCSListObjectsOperator(BaseOperator):
    """
    List all objects from the bucket with the given string prefix and delimiter in name.

    This operator returns a python list with the name of objects which can be used by
    `xcom` in the downstream task.

    :param bucket: The Google Cloud Storage bucket to find the objects. (templated)
    :param prefix: Prefix string which filters objects whose name begin with
        this prefix. (templated)
    :param delimiter: The delimiter by which you want to filter the objects. (templated)
        For example, to list the CSV files in a directory in GCS you would use
        delimiter='.csv'.
    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud.
    :param delegate_to: The account to impersonate using domain-wide delegation of authority,
        if any. For this to work, the service account making the request must have
        domain-wide delegation enabled.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).

    **Example**:
        The following Operator would list all the Avro files from ``sales/sales-2017``
        folder in ``data`` bucket. ::

            GCS_Files = GCSListObjectsOperator(
                task_id='GCS_Files',
                bucket='data',
                prefix='sales/sales-2017/',
                delimiter='.avro',
                gcp_conn_id=google_cloud_conn_id
            )
    """

    template_fields: Sequence[str] = (
        'bucket',
        'prefix',
        'delimiter',
        'impersonation_chain',
    )

    ui_color = '#f0eee4'

    operator_extra_links = (StorageLink(),)

    def __init__(
        self,
        *,
        bucket: str,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
        gcp_conn_id: str = 'google_cloud_default',
        delegate_to: Optional[str] = None,
        impersonation_chain: Optional[Union[str, Sequence[str]]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.bucket = bucket
        self.prefix = prefix
        self.delimiter = delimiter
        self.gcp_conn_id = gcp_conn_id
        self.delegate_to = delegate_to
        self.impersonation_chain = impersonation_chain

    def execute(self, context: "Context") -> list:
        hook = GCSHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            impersonation_chain=self.impersonation_chain,
        )

        self.log.info(
            'Getting list of the files. Bucket: %s; Delimiter: %s; Prefix: %s',
            self.bucket,
            self.delimiter,
            self.prefix,
        )
        StorageLink.persist(
            context=context,
            task_instance=self,
            uri=self.bucket,
            project_id=hook.project_id,
        )
        return hook.list(bucket_name=self.bucket, prefix=self.prefix, delimiter=self.delimiter)
class CloudDatastoreExportEntitiesOperator(BaseOperator):
    """
    Export entities from Google Cloud Datastore to Cloud Storage.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:CloudDatastoreExportEntitiesOperator`

    .. seealso::
        https://cloud.google.com/datastore/docs/export-import-entities

    :param bucket: name of the cloud storage bucket to backup data
    :param namespace: optional namespace path in the specified Cloud Storage bucket
        to backup data. If this namespace does not exist in GCS, it will be created.
    :param datastore_conn_id: the name of the Datastore connection id to use
    :param cloud_storage_conn_id: the name of the cloud storage connection id to
        force-write backup
    :param delegate_to: The account to impersonate using domain-wide delegation of authority,
        if any. For this to work, the service account making the request must have
        domain-wide delegation enabled.
    :param entity_filter: description of what data from the project is included in the
        export, refer to
        https://cloud.google.com/datastore/docs/reference/rest/Shared.Types/EntityFilter
    :param labels: client-assigned labels for cloud storage
    :param polling_interval_in_seconds: number of seconds to wait before polling for
        execution status again
    :param overwrite_existing: if the storage bucket + namespace is not empty, it will be
        emptied prior to exports. This enables overwriting existing backups.
    :param impersonation_chain: Optional service account to impersonate using short-term
        credentials, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request.
        If set as a string, the account must grant the originating account
        the Service Account Token Creator IAM role.
        If set as a sequence, the identities from the list must grant
        Service Account Token Creator IAM role to the directly preceding identity, with first
        account from the list granting this role to the originating account (templated).
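
    Example usage (an illustrative sketch; the bucket name and namespace below are placeholders,
    not values taken from this module):

    .. code-block:: python

        export_entities = CloudDatastoreExportEntitiesOperator(
            task_id="export_entities",
            bucket="example-datastore-backups",
            namespace="nightly-backup",
            overwrite_existing=True,
        )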
""" template_fields: Sequence[str] = ( 'bucket', 'namespace', 'entity_filter', 'labels', 'impersonation_chain', ) operator_extra_links = (StorageLink(), ) def __init__( self, *, bucket: str, namespace: Optional[str] = None, datastore_conn_id: str = 'google_cloud_default', cloud_storage_conn_id: str = 'google_cloud_default', delegate_to: Optional[str] = None, entity_filter: Optional[dict] = None, labels: Optional[dict] = None, polling_interval_in_seconds: int = 10, overwrite_existing: bool = False, project_id: Optional[str] = None, impersonation_chain: Optional[Union[str, Sequence[str]]] = None, **kwargs, ) -> None: super().__init__(**kwargs) self.datastore_conn_id = datastore_conn_id self.cloud_storage_conn_id = cloud_storage_conn_id self.delegate_to = delegate_to self.bucket = bucket self.namespace = namespace self.entity_filter = entity_filter self.labels = labels self.polling_interval_in_seconds = polling_interval_in_seconds self.overwrite_existing = overwrite_existing self.project_id = project_id self.impersonation_chain = impersonation_chain def execute(self, context: 'Context') -> dict: self.log.info('Exporting data to Cloud Storage bucket %s', self.bucket) if self.overwrite_existing and self.namespace: gcs_hook = GCSHook(self.cloud_storage_conn_id, impersonation_chain=self.impersonation_chain) objects = gcs_hook.list(self.bucket, prefix=self.namespace) for obj in objects: gcs_hook.delete(self.bucket, obj) ds_hook = DatastoreHook( gcp_conn_id=self.datastore_conn_id, delegate_to=self.delegate_to, impersonation_chain=self.impersonation_chain, ) result = ds_hook.export_to_storage_bucket( bucket=self.bucket, namespace=self.namespace, entity_filter=self.entity_filter, labels=self.labels, project_id=self.project_id, ) operation_name = result['name'] result = ds_hook.poll_operation_until_done( operation_name, self.polling_interval_in_seconds) state = result['metadata']['common']['state'] if state != 'SUCCESSFUL': raise AirflowException(f'Operation failed: result={result}') StorageLink.persist( context=context, task_instance=self, uri= f"{self.bucket}/{result['response']['outputUrl'].split('/')[3]}", project_id=self.project_id or ds_hook.project_id, ) return result