def poll_sync(
    self,
    connector_id: str,
    initial_last_sync_completion: datetime.datetime,
    poll_interval: float = DEFAULT_POLL_INTERVAL,
    poll_timeout: Optional[float] = None,
) -> Dict[str, Any]:
    """
    Given a Fivetran connector and the timestamp at which the previous sync completed, poll
    until the next sync completes.

    The previous sync completion time is necessary because the only way to tell when a sync
    completes is when this value changes.

    Args:
        connector_id (str): The Fivetran Connector ID. You can retrieve this value from the
            "Setup" tab of a given connector in the Fivetran UI.
        initial_last_sync_completion (datetime.datetime): The timestamp of the last completed
            sync (successful or otherwise) for this connector, prior to running this method.
        poll_interval (float): The time (in seconds) that will be waited between successive
            polls.
        poll_timeout (Optional[float]): The maximum time (in seconds) that will be waited
            before this operation is timed out. By default (or when the value is otherwise
            falsy, e.g. ``0``), this will never time out.

    Returns:
        Dict[str, Any]: Parsed json data representing the API response.

    Raises:
        Failure: If ``poll_timeout`` elapses before a newer sync completion is observed, or
            if the newly completed sync is reported as unsuccessful.
    """
    poll_start = datetime.datetime.now()
    # The deadline never changes while polling, so compute it once. A falsy timeout
    # (None or 0) keeps the original "never time out" behaviour.
    deadline = poll_start + datetime.timedelta(seconds=poll_timeout) if poll_timeout else None

    while True:
        (
            curr_last_sync_completion,
            curr_last_sync_succeeded,
            curr_sync_state,
        ) = self.get_connector_sync_status(connector_id)
        self._log.info(f"Polled '{connector_id}'. Status: [{curr_sync_state}]")

        # A completion timestamp newer than the initial one means a sync has finished.
        if curr_last_sync_completion > initial_last_sync_completion:
            break

        if deadline is not None and datetime.datetime.now() > deadline:
            raise Failure(
                f"Sync for connector '{connector_id}' timed out after {datetime.datetime.now() - poll_start}."
            )

        # Sleep for the configured time interval before polling again.
        time.sleep(poll_interval)

    connector_details = self.get_connector_details(connector_id)
    if not curr_last_sync_succeeded:
        raise Failure(
            f"Sync for connector '{connector_id}' failed!",
            metadata={
                "connector_details": MetadataValue.json(connector_details),
                "log_url": MetadataValue.url(get_fivetran_logs_url(connector_details)),
            },
        )
    return connector_details
def read_s3_key(context):
    """Report the configured S3 object as a materialized asset and emit its path.

    The ``s3_key`` and ``bucket`` entries of ``context.solid_config`` are combined
    into an ``s3://bucket/key`` path. The path is logged, recorded in an
    ``AssetMaterialization`` keyed by ``["log_s3", path]``, and finally yielded
    as the ``Output`` value.
    """
    config = context.solid_config
    s3_key = config["s3_key"]
    bucket = config["bucket"]
    path = f"s3://{bucket}/{s3_key}"

    context.log.info(f"Found file {path}")

    materialization = AssetMaterialization(
        asset_key=AssetKey(["log_s3", path]),
        metadata={"S3 path": MetadataValue.url(path)},
    )
    yield materialization
    yield Output(path)
def the_solid(_context):
    """Yield one ``AssetObservation`` for asset ``"foo"`` with mixed metadata types.

    The metadata mixes plain Python values (str, int, float) with explicit
    ``MetadataValue`` wrappers (url, python artifact).
    """
    observation_metadata = {
        "text": "FOO",
        "int": 22,
        "url": MetadataValue.url("http://fake.com"),
        "float": 0.1,
        "python": MetadataValue.python_artifact(MetadataValue),
    }
    yield AssetObservation(asset_key="foo", metadata=observation_metadata)
def the_solid(_context):
    """Yield one ``AssetMaterialization`` for asset ``"foo"`` with mixed metadata types.

    The metadata mixes plain Python values (str, int, float) with explicit
    ``MetadataValue`` wrappers (url, path, python artifact).
    """
    materialization_metadata = {
        "text": "FOO",
        "int": 22,
        "url": MetadataValue.url("http://fake.com"),
        "float": 0.1,
        "path": MetadataValue.path(Path("/a/b.csv")),
        "python": MetadataValue.python_artifact(MetadataValue),
    }
    yield AssetMaterialization(asset_key="foo", metadata=materialization_metadata)
def my_asset_key_materialization_op(context):
    """Persist a dataframe and report it under a structured asset key.

    Reads a dataframe, persists it, yields an ``AssetMaterialization`` keyed by
    ``["dashboard", "my_cool_site"]`` (with a dashboard URL and the value of
    ``calculate_bytes(df)`` as metadata), then yields the storage path as the
    ``Output``.
    """
    frame = read_df()
    storage_path = persist_to_storage(frame)

    dashboard_key = AssetKey(["dashboard", "my_cool_site"])
    materialization_metadata = {
        "dashboard_url": MetadataValue.url("http://mycoolsite.com/dashboard"),
        "size (bytes)": calculate_bytes(frame),
    }
    yield AssetMaterialization(
        asset_key=dashboard_key,
        description="Persisted result to storage",
        metadata=materialization_metadata,
    )
    yield Output(storage_path)
def my_failure_metadata_op():
    """Process the files found under a fixed path, failing when there are none.

    Returns:
        The result of ``some_calculation`` over the discovered files.

    Raises:
        Failure: When no files are found; the failure carries the searched
            path and a dashboard URL as metadata.
    """
    path = "/path/to/files"
    found_files = get_files(path)

    # Happy path first: anything to process is handed straight to the calculation.
    if len(found_files) > 0:
        return some_calculation(found_files)

    raise Failure(
        description="No files to process",
        metadata={
            "filepath": MetadataValue.path(path),
            "dashboard_url": MetadataValue.url("http://mycoolsite.com/failures"),
        },
    )
def result_to_materialization(
    result: Dict[str, Any],
    asset_key_prefix: Optional[List[str]] = None,
    docs_url: Optional[str] = None,
) -> Optional[AssetMaterialization]:
    """
    Convert a single dbt node result into an AssetMaterialization, if applicable.

    This is a hacky solution that attempts to consolidate parsing many of the potential formats
    that dbt can provide its results in. This is known to work for CLI Outputs for dbt versions
    0.18+, as well as RPC responses for a similar time period, but as the RPC response schema is
    not documented nor enforced, this can become out of date easily.

    Args:
        result (Dict[str, Any]): One dbt result entry. Must contain "execution_time" and
            "timing", plus either a "node" block or a top-level "unique_id".
        asset_key_prefix (Optional[List[str]]): Components prepended to the asset key.
        docs_url (Optional[str]): If set, a "docs_url" metadata entry pointing at the model's
            page is added.

    Returns:
        Optional[AssetMaterialization]: The materialization, or None when the result was not
        successful or the node is not a model.
    """
    asset_key_prefix = check.opt_list_param(asset_key_prefix, "asset_key_prefix", of_type=str)

    # status comes from set of fields rather than "status"
    if "fail" in result:
        unsuccessful = bool(result.get("fail") or result.get("skip") or result.get("error"))
    else:
        unsuccessful = result["status"] != "success"

    if unsuccessful:
        return None

    # all versions represent timing the same way
    metadata = {"Execution Time (seconds)": result["execution_time"]}
    metadata.update(_timing_to_metadata(result["timing"]))

    if "node" in result:
        # working with a response that contains the node block (RPC and CLI 0.18.x)
        node = result["node"]
        unique_id = node["unique_id"]
        metadata.update(_node_result_to_metadata(node))
    else:
        unique_id = result["unique_id"]

    id_parts = unique_id.split(".")

    # only generate materializations for models
    if id_parts[0] != "model":
        return None

    if docs_url:
        metadata["docs_url"] = MetadataValue.url(f"{docs_url}#!/model/{unique_id}")

    return AssetMaterialization(
        description=f"dbt node: {unique_id}",
        metadata=metadata,
        asset_key=asset_key_prefix + id_parts,
    )
def my_metadata_output(context):
    """Fetch some data and yield it as an ``Output`` annotated with metadata.

    The metadata holds a text note, a dashboard URL, ``len(df)`` under
    ``"raw_count"`` and ``calculate_bytes(df)`` under ``"size (bytes)"``.
    """
    data = get_some_data()
    output_metadata = {
        "text_metadata": "Text-based metadata for this event",
        "dashboard_url": MetadataValue.url("http://mycoolsite.com/url_for_my_data"),
        "raw_count": len(data),
        "size (bytes)": calculate_bytes(data),
    }
    yield Output(data, metadata=output_metadata)
def my_metadata_expectation_op(context, df):
    """Transform ``df`` and log an expectation that the result is non-empty.

    An ``ExpectationResult`` (successful when the transformed dataframe has at
    least one row) is logged through ``context.log_event`` together with
    descriptive metadata.

    Returns:
        The transformed dataframe.
    """
    transformed = do_some_transform(df)

    has_rows = len(transformed) > 0
    expectation = ExpectationResult(
        success=has_rows,
        description="ensure dataframe has rows",
        metadata={
            "text_metadata": "Text-based metadata for this event",
            "dashboard_url": MetadataValue.url("http://mycoolsite.com/url_for_my_data"),
            "raw_count": len(transformed),
            "size (bytes)": calculate_bytes(transformed),
        },
    )
    context.log_event(expectation)
    return transformed
def my_metadata_materialization_op(context):
    """Persist a dataframe and log an ``AssetMaterialization`` describing it.

    The materialization for asset ``"my_dataset"`` is logged through
    ``context.log_event`` with a text note, the storage path, a dashboard URL
    and the value of ``calculate_bytes(df)``.

    Returns:
        The path returned by ``persist_to_storage``.
    """
    frame = read_df()
    storage_path = persist_to_storage(frame)

    materialization = AssetMaterialization(
        asset_key="my_dataset",
        description="Persisted result to storage",
        metadata={
            "text_metadata": "Text-based metadata for this event",
            "path": MetadataValue.path(storage_path),
            "dashboard_url": MetadataValue.url("http://mycoolsite.com/url_for_my_data"),
            "size (bytes)": calculate_bytes(frame),
        },
    )
    context.log_event(materialization)
    return storage_path
def _table_data_to_materialization( fivetran_output: FivetranOutput, asset_key_prefix: List[str], schema_name: str, table_data: Dict[str, Any], ) -> AssetMaterialization: table_name = table_data["name_in_destination"] asset_key = asset_key_prefix + [schema_name, table_name] if not table_data["enabled"]: return None metadata = { "connector_url": MetadataValue.url( get_fivetran_connector_url(fivetran_output.connector_details) ) } if table_data.get("columns"): metadata["column_info"] = MetadataValue.json(table_data.get("columns")) return AssetMaterialization( asset_key=asset_key, description=f"Table generated via Fivetran sync: {schema_name}.{table_name}", metadata=metadata, )
def many_table_materializations(_context):
    """Yield one ``AssetMaterialization`` per entry of ``raw_tables``.

    Every materialization uses the asset key ``"table_info"`` and carries the
    table name, a path, a nested dict, a URL, the markdown text read from
    ``MARKDOWN_EXAMPLE``, a very large integer and a NaN float as metadata.
    """
    markdown_path = file_relative_path(__file__, MARKDOWN_EXAMPLE)
    with open(markdown_path, "r") as md_file:
        md_str = md_file.read()
        for table in raw_tables:
            table_metadata = {
                "table_name": table,
                "table_path": MetadataValue.path(f"/path/to/{table}"),
                "table_data": {"name": table},
                "table_name_big": MetadataValue.url(f"https://bigty.pe/{table}"),
                "table_blurb": MetadataValue.md(md_str),
                "big_int": 29119888133298982934829348,
                "float_nan": float("nan"),
            }
            yield AssetMaterialization(asset_key="table_info", metadata=table_metadata)
partitions_def=daily_partitions_def)  # NOTE(review): tail of a call opened before this line (presumably an @asset decorator) — not visible here
def downstream_daily_partitioned_asset(upstream_daily_partitioned_asset):
    # The upstream value is expected to be None when this runs.
    assert upstream_daily_partitioned_asset is None


# Asset with hourly partitions starting at 2022-03-12 00:00; the body is a no-op.
@asset(
    metadata={"owner": "*****@*****.**"},
    partitions_def=HourlyPartitionsDefinition(
        start_date=datetime(2022, 3, 12, 0, 0)),
)
def hourly_partitioned_asset():
    pass


# Asset declared without a partitions_def; it only carries static metadata
# (owner, a text note, a path and a dashboard URL). The body is a no-op.
@asset(
    metadata={
        "owner": "*****@*****.**",
        "text_metadata": "Text-based metadata about this asset",
        "path": MetadataValue.path("/unpartitioned/asset"),
        "dashboard_url": MetadataValue.url("http://mycoolsite.com/url_for_my_asset"),
    },
)
def unpartitioned_asset():
    pass


# Presumably gathers the assets defined in this module into one group — confirm
# against the AssetGroup.from_current_module documentation.
partitioned_asset_group = AssetGroup.from_current_module()
def poll_run(
    self,
    run_id: int,
    poll_interval: float = DEFAULT_POLL_INTERVAL,
    poll_timeout: Optional[float] = None,
    href: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Polls a dbt Cloud job run until it completes. Will raise a `dagster.Failure` exception if
    the run does not complete successfully.

    Args:
        run_id (int): The ID of the relevant dbt Cloud run. You can find this value by going to
            the details page of your run in the dbt Cloud UI. It will be the final number in the
            url, e.g.:
            ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/runs/{run_id}/``
        poll_interval (float): The time (in seconds) that should be waited between successive
            polls of the dbt Cloud API.
        poll_timeout (float): The maximum time (in seconds) that should be waited for this run
            to complete. If this threshold is exceeded, the run will be cancelled and an
            exception will be thrown. By default, this will poll forever.
        href (str): For internal use, generally should not be set manually.

    Returns:
        Dict[str, Any]: A dictionary containing the parsed contents of the dbt Cloud run
        details. See: https://docs.getdbt.com/dbt-cloud/api-v2#operation/getRunById for schema.
    """
    if not href:
        href = self.get_run(run_id).get("href")

    poll_start = datetime.datetime.now()
    while True:
        status = self.get_run(run_id)["status_humanized"]
        self._log.info(f"Polled run {run_id}. Status: [{status}]")

        # completed successfully
        if status == "Success":
            return self.get_run(run_id, include_related=["job", "trigger"])
        # terminal failure states are reported after the loop
        if status in ("Error", "Cancelled"):
            break
        if status not in ("Queued", "Starting", "Running"):
            check.failed(
                f"Received unexpected status '{status}'. This should never happen"
            )

        timed_out = bool(poll_timeout) and (
            datetime.datetime.now() > poll_start + datetime.timedelta(seconds=poll_timeout)
        )
        if timed_out:
            self.cancel_run(run_id)
            raise Failure(
                f"Run {run_id} timed out after "
                f"{datetime.datetime.now() - poll_start}. Attempted to cancel.",
                metadata={"run_page_url": MetadataValue.url(href)},
            )

        # Sleep for the configured time interval before polling again.
        time.sleep(poll_interval)

    # Re-fetch the run so the failure carries its final details.
    run_details = self.get_run(run_id, include_related=["trigger"])
    raise Failure(
        f"Run {run_id} failed. Status Message: {run_details['status_message']}",
        metadata={
            "run_details": MetadataValue.json(run_details),
            "run_page_url": MetadataValue.url(href),
        },
    )