from prefect.engine import signals


def signal_task(message):
    if message == 'go!':
        raise signals.SUCCESS(message='going!')
    elif message == 'stop!':
        raise signals.FAIL(message='stopping!')
    elif message == 'skip!':
        raise signals.SKIP(message='skipping!')
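A minimal sketch (assuming Prefect 1.x) of wiring signal_task into a flow; the flow name and parameter are illustrative, not part of the original:

from prefect import Flow, Parameter, task

# Wrap the plain function above as a Prefect task.
signal = task(signal_task)

with Flow("signal-demo") as flow:
    message = Parameter("message")
    signal(message)

# SKIP finishes the task in a Skipped state, which counts as a success
# for downstream tasks with default triggers.
state = flow.run(message="skip!")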
import time
from random import randrange

import prefect
from prefect.engine import signals


def task_2():
    logger = prefect.context.get("logger")
    interval = randrange(0, 60)
    logger.info(interval)
    time.sleep(interval)
    if interval > 40:
        logger.info("Skipping task...")
        raise signals.SKIP()
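Downstream behaviour matters here: by default a skipped upstream also skips its downstream tasks. A minimal sketch, assuming Prefect 1.x's skip_on_upstream_skip task option; the downstream task names are illustrative:

from prefect import Flow, task

@task
def downstream_default():
    return "runs only if task_2 was not skipped"

@task(skip_on_upstream_skip=False)
def downstream_always():
    return "runs even when task_2 was skipped"

with Flow("skip-propagation") as flow:
    t2 = task(task_2)()
    downstream_default(upstream_tasks=[t2])
    downstream_always(upstream_tasks=[t2])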
def skip_if_already_run(parametrised_workflow: Tuple[Flow, Dict[str, Any]]) -> None:
    """
    Task to raise a SKIP signal if a workflow is already running or has
    previously run successfully with the given parameters.

    Parameters
    ----------
    parametrised_workflow : tuple (prefect.Flow, dict)
        Workflow, and associated parameters, for which previous runs should be checked

    Raises
    ------
    prefect.engine.signals.SKIP
        if this workflow with these parameters has already run successfully
    """
    workflow, parameters = parametrised_workflow
    prefect.context.logger.info(
        f"Checking whether workflow '{workflow.name}' has already run successfully "
        f"with parameters {parameters}."
    )
    with session_scope(prefect.config.db_uri) as session:
        state = WorkflowRuns.get_most_recent_state(
            workflow_name=workflow.name, parameters=parameters, session=session
        )
        if state is None:
            prefect.context.logger.debug(
                f"Workflow '{workflow.name}' has not previously run with parameters {parameters}."
            )
        elif state == RunState.failed:
            prefect.context.logger.debug(
                f"Workflow '{workflow.name}' previously failed with parameters {parameters}."
            )
        elif state == RunState.running:
            raise signals.SKIP(
                f"Workflow '{workflow.name}' is already running with parameters {parameters}."
            )
        elif state == RunState.success:
            raise signals.SKIP(
                f"Workflow '{workflow.name}' previously ran successfully with parameters {parameters}."
            )
        else:
            # This should never happen
            raise ValueError(f"Unrecognised workflow state: '{state}'.")
def run(self, value: Any) -> None:
    """
    Raises a SKIP signal if the passed value does not match the task's match
    value; succeeds silently otherwise.

    Args:
        - value (Any): the value that will be matched against the task's value.
    """
    if value != self.value:
        raise signals.SKIP(
            'Provided value "{}" did not match "{}"'.format(value, self.value)
        )
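This run() appears to match Prefect 1.x's CompareValue task, which backs the case context manager; a hedged sketch of the typical usage (all task names are illustrative):

from prefect import Flow, task
from prefect.tasks.control_flow import case

@task
def get_branch():
    return "b"

@task
def handle_a():
    return "took branch a"

@task
def handle_b():
    return "took branch b"

with Flow("branching") as flow:
    branch = get_branch()
    with case(branch, "a"):  # a value-match task raises SKIP on mismatch
        handle_a()
    with case(branch, "b"):
        handle_b()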
def fetch_paypal_report(
    date: str,
    paypal_credentials: dict,
    paypal_report_prefix: str,
    paypal_report_check_column_name: str,
    s3_bucket: str,
    s3_path: str,
    overwrite: bool,
):
    logger = prefect.context.get("logger")
    logger.info("Pulling Paypal report for {}".format(date))

    if not overwrite:
        # If we're not overwriting and the file already exists, raise a skip
        date_path = get_s3_path_for_date(date)
        s3_key = s3_path + date_path
        logger.info("Checking for existence of: {}".format(s3_key))
        existing_file = list_object_keys_from_s3.run(s3_bucket, s3_key)
        if existing_file:
            raise signals.SKIP(
                'File {} already exists and we are not overwriting. Skipping.'.format(s3_key)
            )
        else:
            logger.info("File not found, continuing download for {}.".format(date))

    transport = Transport(config.paypal.host, config.paypal.port)
    transport.connect(
        username=paypal_credentials.get('username'),
        password=paypal_credentials.get('password'),
    )
    sftp_connection = SFTPClient.from_transport(transport)
    query_date = datetime.datetime.strptime(date, "%Y-%m-%d")
    remote_filename = get_paypal_filename(
        query_date, paypal_report_prefix, sftp_connection, config.paypal.remote_path
    )
    try:
        if remote_filename:
            sftp_connection.chdir(config.paypal.remote_path)
            check_paypal_report(sftp_connection, remote_filename, paypal_report_check_column_name)
            formatted_report = format_paypal_report(sftp_connection, remote_filename, date)
            return date, formatted_report
        else:
            raise Exception("Remote file not found for date: {0}".format(date))
    finally:
        sftp_connection.close()
def resource_cleanup_trigger(upstream_states: Dict[Edge, State]) -> bool:
    """Run the cleanup task, provided the following hold:

    - All upstream tasks have finished
    - The resource init task succeeded and wasn't skipped
    - The resource setup task succeeded and wasn't skipped
    """
    for edge, state in upstream_states.items():
        if not state.is_finished():
            raise signals.TRIGGERFAIL(
                "Trigger was 'resource_cleanup_trigger' but some of the "
                "upstream tasks were not finished."
            )
        if edge.key == "mgr":
            if state.is_skipped():
                raise signals.SKIP("Resource manager init skipped")
            elif not state.is_successful():
                raise signals.SKIP("Resource manager init failed")
        elif edge.key == "resource":
            if state.is_skipped():
                raise signals.SKIP("Resource manager setup skipped")
            elif not state.is_successful():
                raise signals.SKIP("Resource manager setup failed")
    return True
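This trigger guards the cleanup step of Prefect 1.x resource managers. A minimal sketch that would exercise it, assuming the prefect.resource_manager decorator; TempDir and write_file are illustrative:

import shutil
import tempfile

from prefect import Flow, resource_manager, task

@resource_manager
class TempDir:
    def setup(self):
        # The "resource" edge: its result is passed to tasks and to cleanup
        return tempfile.mkdtemp()

    def cleanup(self, path):
        # Guarded by resource_cleanup_trigger: runs once all users are
        # finished, and is skipped if init/setup failed or was skipped
        shutil.rmtree(path)

@task
def write_file(dirname):
    with open(f"{dirname}/out.txt", "w") as f:
        f.write("hello")

with Flow("resource-demo") as flow:
    with TempDir() as tmp:
        write_file(tmp)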
def not_all_skipped(upstream_states: Set["state.State"]) -> bool:
    """
    Runs if all upstream tasks were successful and were not all skipped.

    Args:
        - upstream_states (set[State]): the set of all upstream states
    """
    if all(state.is_skipped() for state in upstream_states):
        raise signals.SKIP("All upstreams were skipped", result=None)
    elif not all(state.is_successful() for state in upstream_states):
        raise signals.TRIGGERFAIL(
            'Trigger was "not_all_skipped" but some of the upstream tasks failed.'
        )
    return True
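not_all_skipped is importable from prefect.triggers and can be attached to a task directly. A small sketch, assuming Prefect 1.x; the task names are illustrative:

from prefect import Flow, task
from prefect.engine import signals
from prefect.triggers import not_all_skipped

@task
def maybe_skip():
    raise signals.SKIP("nothing to do")

@task
def always_runs():
    return 1

@task(trigger=not_all_skipped)
def summarize():
    return "at least one upstream did real work"

with Flow("trigger-demo") as flow:
    summarize(upstream_tasks=[maybe_skip, always_runs])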
def error_notification_handler(obj, old_state, new_state):
    # Handle a failed task by sending an error notification to Slack.
    if new_state.is_failed():
        p = PrefectSecret("system_errors")
        slack_web_hook_url = p.run()
        msg = f"Task '{obj.name}' finished in state {new_state.message}"
        # Replace URL with your Slack webhook URL
        # requests.post(slack_web_hook_url, json={"text": msg})
        raise signals.SKIP(message='See Error Msg')
    return new_state
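State handlers attach per task via state_handlers. A minimal sketch, assuming Prefect 1.x; might_fail is illustrative:

from prefect import Flow, task

@task(state_handlers=[error_notification_handler])
def might_fail():
    raise ValueError("boom")

with Flow("notify-on-failure") as flow:
    might_fail()

# When might_fail fails, the handler fires on the state change and builds
# the Slack message (the actual POST is commented out above).
state = flow.run()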
def check_files(self):
    skip = False
    force = self.p.get('force', False)
    signalskip = self.p.get('signalskip', True)
    # Check that all inputs exist
    for k, f in self.inps.items():
        if not os.path.exists(f):
            raise signals.FAIL(message=f'input file {k}:{f} does not exist')
    done = all(os.path.exists(x) for x in self.outs.values())
    # If forcing, or outputs are incomplete, delete all existing temps and outputs
    if force or (not done):
        deletefiles(list(self.tmps.values()) + list(self.outs.values()))
    # If all outputs exist, then skip
    elif done:
        deletefiles(list(self.tmps.values()))
        if signalskip:
            raise signals.SKIP(message=f'all outputs {self.outs} already exist')
        skip = True
    return skip
def load_s3_data_to_snowflake(
    date: str,
    date_property: str,
    sf_credentials: SFCredentials,
    sf_database: str,
    sf_schema: str,
    sf_table: str,
    sf_role: str,
    sf_warehouse: str,
    sf_storage_integration_name: str,
    s3_url: str,
    sf_file_format: str = "TYPE='JSON', STRIP_OUTER_ARRAY=TRUE",
    file: str = None,
    pattern: str = None,
    overwrite: bool = False,
    truncate: bool = False,
    disable_existence_check: bool = False,
):
    """
    Loads objects from S3 into a generic table in Snowflake; the data is stored in a
    variant column named `PROPERTIES`. Note that either the `file` or the `pattern`
    parameter must be specified.

    Notes:
        To load a single file, use the `file` parameter. You must explicitly include a
        separator (/) either at the end of `s3_url` or at the beginning of the file path
        specified in the `file` parameter. Typically you would include a trailing (/) in
        `s3_url`. To load multiple files, use `pattern`.

    Examples:
        To load a single file `s3://bucket/path/to/filename/filename.ext`:

            load_s3_data_to_snowflake(s3_url='s3://bucket/path/to/filename/', file='filename.ext')
            or
            load_s3_data_to_snowflake(s3_url='s3://bucket/path/', file='to/filename/filename.ext')

        To load all files under `s3://bucket/path/` with a certain name:

            load_s3_data_to_snowflake(s3_url='s3://bucket/path/', pattern='.*filename.ext')

    Args:
        date (str): Date of the data being loaded.
        date_property (str): Date type property name in the variant `PROPERTIES` column.
        sf_credentials (SFCredentials): Snowflake public key credentials in the format
            required by create_snowflake_connection.
        sf_database (str): Name of the destination database.
        sf_schema (str): Name of the destination schema.
        sf_table (str): Name of the destination table.
        sf_role (str): Name of the Snowflake role to assume.
        sf_warehouse (str): Name of the Snowflake warehouse to be used for loading.
        sf_storage_integration_name (str): Name of the Snowflake storage integration to
            use. These are configured in Terraform.
        s3_url (str): Full URL to the S3 path containing the files to load.
        sf_file_format (str, optional): Snowflake file format for the Stage. Defaults to
            JSON with `STRIP_OUTER_ARRAY=TRUE`.
        file (str, optional): File path relative to `s3_url`. Defaults to `None`.
        pattern (str, optional): Path pattern/regex to match S3 objects to copy.
            Defaults to `None`.
        overwrite (bool, optional): Whether to overwrite existing data for the given
            date. Defaults to `False`.
        truncate (bool, optional): Whether to truncate the table. Defaults to `False`.
        disable_existence_check (bool, optional): Whether to disable the check for
            existing data; useful when always appending to the table regardless of any
            existing data for the provided `date`. Defaults to `False`.
    """
    logger = get_logger()

    if not file and not pattern:
        raise signals.FAIL('Either `file` or `pattern` must be specified to run this task.')

    sf_connection = create_snowflake_connection(sf_credentials, sf_role, warehouse=sf_warehouse)

    if truncate:
        query = "TRUNCATE IF EXISTS {}".format(qualified_table_name(sf_database, sf_schema, sf_table))
        logger.info("Truncating table: {}".format(sf_table))
        cursor = sf_connection.cursor()
        cursor.execute(query)

    # Check for data existence for this date
    row = None
    if not disable_existence_check:
        try:
            query = """
            SELECT 1 FROM {table}
            WHERE date(PROPERTIES:{date_property})=date('{date}')
            """.format(
                table=qualified_table_name(sf_database, sf_schema, sf_table),
                date=date,
                date_property=date_property,
            )
            logger.info("Checking existence of data for {}".format(date))
            cursor = sf_connection.cursor()
            cursor.execute(query)
            row = cursor.fetchone()
        except snowflake.connector.ProgrammingError as e:
            if "does not exist" in e.msg:
                # The query failed because the table doesn't exist.
                row = None
            else:
                raise

    if row and not overwrite:
        raise signals.SKIP('Skipping task as data for the date exists and no overwrite was provided.')
    else:
        logger.info("Continuing with S3 load for {}".format(date))

    try:
        # Create the generic loading table
        query = """
        CREATE TABLE IF NOT EXISTS {table} (
            ID NUMBER AUTOINCREMENT START 1 INCREMENT 1,
            LOAD_TIME TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(),
            ORIGIN_FILE_NAME VARCHAR(16777216),
            ORIGIN_FILE_LINE NUMBER(38,0),
            ORIGIN_STR VARCHAR(16777216),
            PROPERTIES VARIANT
        );
        """.format(
            table=qualified_table_name(sf_database, sf_schema, sf_table)
        )
        sf_connection.cursor().execute(query)

        # Delete existing data in case of overwrite.
        if overwrite and row:
            logger.info("Deleting data for overwrite for {}".format(date))
            query = """
            DELETE FROM {table}
            WHERE date(PROPERTIES:{date_property})=date('{date}')
            """.format(
                table=qualified_table_name(sf_database, sf_schema, sf_table),
                date=date,
                date_property=date_property,
            )
            sf_connection.cursor().execute(query)

        # Create stage
        query = """
        CREATE STAGE IF NOT EXISTS {stage_name}
            URL = '{stage_url}'
            STORAGE_INTEGRATION = {storage_integration}
            FILE_FORMAT = ({file_format});
        """.format(
            stage_name=qualified_stage_name(sf_database, sf_schema, sf_table),
            stage_url=s3_url,
            storage_integration=sf_storage_integration_name,
            file_format=sf_file_format,
        )
        sf_connection.cursor().execute(query)

        files_parameter = ""
        pattern_parameter = ""

        if file:
            logger.info("Loading file {}".format(file))
            files_parameter = "FILES = ( '{}' )".format(file)

        if pattern:
            logger.info("Loading pattern {}".format(pattern))
            pattern_parameter = "PATTERN = '{}'".format(pattern)

        query = """
        COPY INTO {table} (origin_file_name, origin_file_line, origin_str, properties)
            FROM (
                SELECT
                    metadata$filename,
                    metadata$file_row_number,
                    t.$1,
                    CASE
                        WHEN CHECK_JSON(t.$1) IS NULL THEN t.$1
                        ELSE NULL
                    END
                FROM @{stage_name} t
            )
        {files_parameter}
        {pattern_parameter}
        FORCE={force}
        """.format(
            table=qualified_table_name(sf_database, sf_schema, sf_table),
            stage_name=qualified_stage_name(sf_database, sf_schema, sf_table),
            files_parameter=files_parameter,
            pattern_parameter=pattern_parameter,
            force=str(overwrite),
        )
        logger.info("Copying data into Snowflake as: \n{}".format(query))
        sf_connection.cursor().execute(query)
        sf_connection.commit()
    except Exception:
        sf_connection.rollback()
        raise
    finally:
        sf_connection.close()
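The existence check above is what makes the load idempotent: rerunning for a date that already has data raises SKIP unless overwrite is set. The same pattern in isolation (a sketch; data_exists is a hypothetical stand-in for the SELECT 1 query):

from prefect import task
from prefect.engine import signals

def data_exists(date: str) -> bool:
    # Hypothetical stand-in for the warehouse existence query above.
    return False

@task
def load_partition(date: str, overwrite: bool = False):
    if data_exists(date) and not overwrite:
        raise signals.SKIP(f"Data for {date} already loaded; not overwriting.")
    # ...perform the actual load here...
    return date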
def load_s3_data_to_mysql(
    aurora_credentials: dict,
    database: str,
    s3_url: str,
    table: str,
    table_columns: list,
    table_indexes: list = [],
    field_delimiter: str = ',',
    enclosed_by: str = '',
    escaped_by: str = '\\\\',
    record_filter: str = '',
    ignore_num_lines: int = 0,
    overwrite: bool = False,
    overwrite_with_temp_table: bool = False,
    use_manifest: bool = False,
):
    """
    Loads an Aurora MySQL database table from text files in S3.

    Args:
        aurora_credentials (dict): Aurora credentials dict containing `user`, `password`
            and `host`.
        database (str): Name of the destination database.
        s3_url (str): Full S3 URL containing the files to load into the destination table.
        table (str): Name of the destination table.
        table_columns (list): List of tuples specifying the table schema.
            Example: `[('id', 'int'), ('course_id', 'varchar(255) NOT NULL')]`
        table_indexes (list): List of tuples specifying table indexes to add. Defaults to `[]`.
            Example: `[('user_id',), ('course_id',), ('user_id', 'course_id')]`
        field_delimiter (str, optional): The character used to indicate how the fields in
            input files are delimited. Defaults to `,`.
        enclosed_by (str, optional): Single character string that specifies the fields
            enclosing character. Defaults to the empty string (no enclosing character).
        escaped_by (str, optional): Single character string which indicates the escaping
            of delimiters and other escape sequences. Defaults to backslash (`\\`).
        record_filter (str, optional): Entire `WHERE` clause which specifies the data to
            overwrite. An empty value with overwrite=True will delete all the rows from
            the table.
        ignore_num_lines (int, optional): Number of lines to ignore at the start of the
            input file. Defaults to 0.
        overwrite (bool, optional): Whether to overwrite existing data in the destination
            table. Defaults to `False`.
        overwrite_with_temp_table (bool, optional): Whether to use a temporary table to
            overwrite data instead of `DELETE`. The data is first loaded into a new table,
            followed by an atomic rename. Use this option if there are any schema changes
            or for expensive `DELETE` operations. IMPORTANT: Do not use this option for
            incrementally updated tables, as any historical data would be lost.
            Defaults to `False`.
        use_manifest (bool, optional): Whether to use a manifest file to load data.
            Defaults to `False`.
    """
    def _drop_temp_tables(table, connection):
        for temp_table in [table + '_old', table + '_temp']:
            query = "DROP TABLE IF EXISTS {table}".format(table=temp_table)
            connection.cursor().execute(query)

    logger = get_logger()
    connection = create_mysql_connection(aurora_credentials, database)

    table_schema = []
    table_schema.extend(table_columns)
    for indexed_cols in table_indexes:
        table_schema.append(("INDEX", "({cols})".format(cols=','.join(indexed_cols))))
    table_schema = ','.join(
        '{name} {definition}'.format(name=name, definition=definition)
        for name, definition in table_schema
    )

    # Create the table if it does not exist
    query = """
    CREATE TABLE IF NOT EXISTS {table} ({table_schema})
    """.format(table=table, table_schema=table_schema)
    logger.debug(query)
    connection.cursor().execute(query)

    # Check for existing data
    query = "SELECT 1 FROM {table} {record_filter} LIMIT 1".format(
        table=table, record_filter=record_filter)
    cursor = connection.cursor()
    cursor.execute(query)
    row = cursor.fetchone()

    if row and not overwrite:
        raise signals.SKIP(
            'Skipping task as data already exists in the destination table and no '
            'overwrite was provided.'
        )

    # Create a temp table for loading data
    if overwrite and overwrite_with_temp_table:
        _drop_temp_tables(table, connection)
        query = "CREATE TABLE {table} ({table_schema})".format(
            table=table + '_temp', table_schema=table_schema)
        connection.cursor().execute(query)

    try:
        if row and overwrite and not overwrite_with_temp_table:
            query = "DELETE FROM {table} {record_filter}".format(
                table=table, record_filter=record_filter)
            logger.debug("Deleting existing data for {table}".format(table=table))
            connection.cursor().execute(query)

        if use_manifest:
            s3_url = os.path.join(s3_url, MANIFEST_FILE_NAME)
            prefix_or_manifest = "MANIFEST"
        else:
            prefix_or_manifest = "PREFIX"

        query = """
        LOAD DATA FROM S3 {prefix_or_manifest} '{s3_url}'
        INTO TABLE {table}
        FIELDS
            TERMINATED BY '{delimiter}'
            OPTIONALLY ENCLOSED BY '{enclosed_by}'
            ESCAPED BY '{escaped_by}'
        IGNORE {ignore_lines} LINES
        """.format(
            prefix_or_manifest=prefix_or_manifest,
            s3_url=s3_url,
            table=table if not overwrite_with_temp_table else table + '_temp',
            delimiter=field_delimiter,
            enclosed_by=enclosed_by,
            escaped_by=escaped_by,
            ignore_lines=ignore_num_lines,
        )
        connection.cursor().execute(query)

        if overwrite and overwrite_with_temp_table:
            # Note that this causes an implicit commit.
            query = "RENAME TABLE {table} to {table_old}, {table_temp} to {table}".format(
                table=table, table_old=table + '_old', table_temp=table + '_temp')
            connection.cursor().execute(query)
        else:
            # Commit if we're not getting an implicit commit from RENAME.
            connection.commit()
    except Exception as e:
        logger.error(str(e))
        connection.rollback()
        raise
    finally:
        _drop_temp_tables(table, connection)
        connection.close()