Example #1
def signal_task(message):
    if message == 'go!':
        raise signals.SUCCESS(message='going!')
    elif message == 'stop!':
        raise signals.FAIL(message='stopping!')
    elif message == 'skip!':
        raise signals.SKIP(message='skipping!')
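
For context, a minimal, hypothetical sketch (assuming Prefect 1.x, with `signal_task` decorated as a `@task`; the flow name and parameter default are illustrative) of how these signals behave inside a flow run. SUCCESS and SKIP both count as successful end states, while FAIL marks the task as failed:

from prefect import Flow, Parameter, task
from prefect.engine import signals

@task
def signal_task(message):
    # same body as the snippet above, shortened to the SKIP branch
    if message == 'skip!':
        raise signals.SKIP(message='skipping!')

with Flow("signal-demo") as flow:
    result = signal_task(Parameter("message", default="skip!"))

state = flow.run()
print(state.result[result].is_skipped())  # True: Skipped is a subclass of Success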
Example #2
def task_2():
    logger = prefect.context.get("logger")
    interval = randrange(0, 60)
    logger.info(interval)
    time.sleep(interval)
    if interval > 40:
        logger.info("Skipping task...")
        raise signals.SKIP()
Example #3
def skip_if_already_run(
        parametrised_workflow: Tuple[Flow, Dict[str, Any]]) -> None:
    """
    Task to raise a SKIP signal if a workflow is already running or has previously run successfully
    with the given parameters.

    Parameters
    ----------
    parametrised_workflow : tuple (prefect.Flow, dict)
        Workflow, and associated parameters, for which previous runs should be checked

    Raises
    ------
    prefect.engine.signals.SKIP
        if this workflow with these parameters is already running or has already run successfully
    """
    workflow, parameters = parametrised_workflow
    prefect.context.logger.info(
        f"Checking whether workflow '{workflow.name}' has already run successfully with parameters {parameters}."
    )
    with session_scope(prefect.config.db_uri) as session:
        state = WorkflowRuns.get_most_recent_state(workflow_name=workflow.name,
                                                   parameters=parameters,
                                                   session=session)

    if state is None:
        prefect.context.logger.debug(
            f"Workflow '{workflow.name}' has not previously run with parameters {parameters}."
        )
    elif state == RunState.failed:
        prefect.context.logger.debug(
            f"Workflow '{workflow.name}' previously failed with parameters {parameters}."
        )
    elif state == RunState.running:
        raise signals.SKIP(
            f"Workflow '{workflow.name}' is already running with parameters {parameters}."
        )
    elif state == RunState.success:
        raise signals.SKIP(
            f"Workflow '{workflow.name}' previously ran successfully with parameters {parameters}."
        )
    else:
        # This should never happen
        raise ValueError(f"Unrecognised workflow state: '{state}'.")
Example #4
    def run(self, value: Any) -> None:
        """
        Raises a SKIP signal if the passed value does not match the task's match value;
        succeeds silently otherwise.

        Args:
            - value (Any): the value that will be matched against the task's value.
        """
        if value != self.value:
            raise signals.SKIP('Provided value "{}" did not match "{}"'.format(
                value, self.value))
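
This `run` method resembles the value-comparison task behind Prefect 1.x's `case` control-flow helper: when the condition value does not match, the SKIP propagates and the branch's tasks are skipped. A hypothetical sketch (task names are illustrative):

from prefect import Flow, case, task

@task
def get_mode():
    return "full"

@task
def full_load():
    return "loaded everything"

with Flow("branching") as flow:
    mode = get_mode()
    with case(mode, "full"):
        # tasks defined here are skipped unless get_mode() returns "full"
        full_load()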
Example #5
def fetch_paypal_report(
    date: str,
    paypal_credentials: dict,
    paypal_report_prefix: str,
    paypal_report_check_column_name: str,
    s3_bucket: str,
    s3_path: str,
    overwrite: bool,
):
    logger = prefect.context.get("logger")
    logger.info("Pulling Paypal report for {}".format(date))

    if not overwrite:
        # If we're not overwriting and the file already exists, raise a skip
        date_path = get_s3_path_for_date(date)
        s3_key = s3_path + date_path

        logger.info("Checking for existence of: {}".format(s3_key))

        existing_file = list_object_keys_from_s3.run(s3_bucket, s3_key)

        if existing_file:
            raise signals.SKIP(
                'File {} already exists and we are not overwriting. Skipping.'.
                format(s3_key))
        else:
            logger.info(
                "File not found, continuing download for {}.".format(date))

    transport = Transport(config.paypal.host, config.paypal.port)
    transport.connect(username=paypal_credentials.get('username'),
                      password=paypal_credentials.get('password'))
    sftp_connection = SFTPClient.from_transport(transport)

    query_date = datetime.datetime.strptime(date, "%Y-%m-%d")
    remote_filename = get_paypal_filename(query_date, paypal_report_prefix,
                                          sftp_connection,
                                          config.paypal.remote_path)

    try:
        if remote_filename:
            sftp_connection.chdir(config.paypal.remote_path)
            check_paypal_report(sftp_connection, remote_filename,
                                paypal_report_check_column_name)
            formatted_report = format_paypal_report(sftp_connection,
                                                    remote_filename, date)
            return date, formatted_report
        else:
            raise Exception("Remote File Not found for date: {0}".format(date))
    finally:
        sftp_connection.close()
Example #6
def resource_cleanup_trigger(upstream_states: Dict[Edge, State]) -> bool:
    """Run the cleanup task, provided the following hold:

    - All upstream tasks have finished
    - The resource init task succeeded and wasn't skipped
    - The resource setup task succeeded and wasn't skipped
    """
    for edge, state in upstream_states.items():
        if not state.is_finished():
            raise signals.TRIGGERFAIL(
                "Trigger was 'resource_cleanup_trigger' but some of the "
                "upstream tasks were not finished.")
        if edge.key == "mgr":
            if state.is_skipped():
                raise signals.SKIP("Resource manager init skipped")
            elif not state.is_successful():
                raise signals.SKIP("Resource manager init failed")
        elif edge.key == "resource":
            if state.is_skipped():
                raise signals.SKIP("Resource manager setup skipped")
            elif not state.is_successful():
                raise signals.SKIP("Resource manager setup failed")
    return True
Example #7
def not_all_skipped(upstream_states: Set["state.State"]) -> bool:
    """
    Runs if all upstream tasks were successful and were not all skipped.

    Args:
        - upstream_states (set[State]): the set of all upstream states
    """

    if all(state.is_skipped() for state in upstream_states):
        raise signals.SKIP("All upstreams were skipped", result=None)
    elif not all(state.is_successful() for state in upstream_states):
        raise signals.TRIGGERFAIL(
            'Trigger was "not_all_skipped" but some of the upstream tasks failed.'
        )
    return True
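
Trigger functions such as the two above are not called by user code; they are passed to a task at definition time, and the task runner invokes them with the upstream states to decide whether the task should run. A minimal sketch, assuming Prefect 1.x, where `not_all_skipped` ships in `prefect.triggers` (the task below is illustrative):

from prefect import task
from prefect.triggers import not_all_skipped

@task(trigger=not_all_skipped)
def summarize(values):
    # runs only if the upstream tasks succeeded and were not all skipped
    return [v for v in values if v is not None]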
Example #8
def error_notification_handler(obj, old_state, new_state):
    # Handle a failure (e.g. an empty upstream dataframe) by building an error
    # message for Slack; the state of a successful run is passed through unchanged.
    if new_state.is_failed():
        p = PrefectSecret("system_errors")
        slack_web_hook_url = p.run()

        msg = f"Task '{obj.name}' finished in state {new_state.message}"

        # Replace URL with your Slack webhook URL
        # requests.post(slack_web_hook_url, json={"text": msg})

        # The order matters: the raise below takes precedence over the return.
        return_state = new_state

        raise signals.SKIP(message='See Error Msg')

    else:
        return_state = new_state

    return return_state
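
State handlers like this one are attached when the task is defined; Prefect 1.x calls them on every state change with `(task, old_state, new_state)` and uses the returned state as the task's new state. A hypothetical attachment (the task below is illustrative, and the handler is assumed to be defined in the same module):

from prefect import task

@task(state_handlers=[error_notification_handler])
def load_dataframe(path: str):
    # any failure here routes through the handler above
    ...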
Example #9
    def check_files(self):
        skip = False
        force = self.p.get('force', False)
        signalskip = self.p.get('signalskip', True)
        # check that all inputs exist
        for k, f in self.inps.items():
            if not os.path.exists(f):
                raise signals.FAIL(
                    message=f'input file {k}:{f} does not exist')
        done = all(os.path.exists(x) for x in self.outs.values())
        if force or (not done):
            # force requested or outputs incomplete: delete all tmps and outs
            deletefiles(list(self.tmps.values()) + list(self.outs.values()))
        elif done:
            # all outputs already exist: clean up tmps and skip
            deletefiles(list(self.tmps.values()))
            if signalskip:
                raise signals.SKIP(
                    message=f'all outputs {self.outs} already exist')
            skip = True
        return skip
Example #10
def load_s3_data_to_snowflake(
    date: str,
    date_property: str,
    sf_credentials: SFCredentials,
    sf_database: str,
    sf_schema: str,
    sf_table: str,
    sf_role: str,
    sf_warehouse: str,
    sf_storage_integration_name: str,
    s3_url: str,
    sf_file_format: str = "TYPE='JSON', STRIP_OUTER_ARRAY=TRUE",
    file: str = None,
    pattern: str = None,
    overwrite: bool = False,
    truncate: bool = False,
    disable_existence_check: bool = False,
):
    """
    Loads objects from S3 into a generic table in Snowflake; the data is stored in a variant column named
    `PROPERTIES`. Note that either the `file` or the `pattern` parameter must be specified.
    Notes:
      To load a single file, use the `file` parameter.
      You must explicitly include a separator (/) either at the end of `s3_url` or at the beginning
      of the file path specified in the `file` parameter. Typically you would include a trailing (/)
      in `s3_url`.
      To load multiple files, use `pattern`.
    Examples:
      To load a single file `s3://bucket/path/to/filename/filename.ext`:
      load_s3_data_to_snowflake(s3_url='s3://bucket/path/to/filename/', file='filename.ext')
      or
      load_s3_data_to_snowflake(s3_url='s3://bucket/path/', file='to/filename/filename.ext')

      Load all files under `s3://bucket/path/` with a certain name:
      load_s3_data_to_snowflake(s3_url='s3://bucket/path/', pattern='.*filename.ext')

    Args:
      date (str): Date of the data being loaded.
      date_property (str): Date type property name in the variant `PROPERTIES` column.
      sf_credentials (SFCredentials): Snowflake public key credentials in the
              format required by create_snowflake_connection.
      sf_database (str): Name of the destination database.
      sf_schema (str): Name of the destination schema.
      sf_table (str): Name of the destination table.
      sf_role (str): Name of the snowflake role to assume.
      sf_warehouse (str): Name of the Snowflake warehouse to be used for loading.
      sf_storage_integration_name (str): Name of the Snowflake storage integration to use. These are
              configured in Terraform.
      s3_url (str): Full URL to the S3 path containing the files to load.
      sf_file_format (str, optional): Snowflake file format for the stage. Defaults to "TYPE='JSON', STRIP_OUTER_ARRAY=TRUE".
      file (str, optional): File path relative to `s3_url`.
      pattern (str, optional): Path pattern/regex to match S3 objects to copy. Defaults to `None`.
      overwrite (bool, optional): Whether to overwrite existing data for the given date. Defaults to `False`.
      truncate (bool, optional): Whether to truncate the table. Defaults to `False`.
      disable_existence_check (bool, optional): Whether to disable the check for existing data; useful when
              always appending to the table regardless of any existing data for the provided `date`. Defaults to `False`.
    """
    logger = get_logger()
    if not file and not pattern:
        raise signals.FAIL('Either `file` or `pattern` must be specified to run this task.')

    sf_connection = create_snowflake_connection(sf_credentials, sf_role, warehouse=sf_warehouse)

    if truncate:
        query = "TRUNCATE IF EXISTS {}".format(qualified_table_name(sf_database, sf_schema, sf_table))
        logger.info("Truncating table: {}".format(sf_table))
        cursor = sf_connection.cursor()
        cursor.execute(query)

    # Check for data existence for this date
    row = None
    if not disable_existence_check:
        try:
            query = """
            SELECT 1 FROM {table}
            WHERE date(PROPERTIES:{date_property})=date('{date}')
            """.format(
                table=qualified_table_name(sf_database, sf_schema, sf_table),
                date=date,
                date_property=date_property,
            )

            logger.info("Checking existence of data for {}".format(date))

            cursor = sf_connection.cursor()
            cursor.execute(query)
            row = cursor.fetchone()
        except snowflake.connector.ProgrammingError as e:
            if "does not exist" in e.msg:
                # If so then the query failed because the table doesn't exist.
                row = None
            else:
                raise

    if row and not overwrite:
        raise signals.SKIP('Skipping task as data for the date exists and no overwrite was provided.')
    else:
        logger.info("Continuing with S3 load for {}".format(date))

    try:
        # Create the generic loading table
        query = """
        CREATE TABLE IF NOT EXISTS {table} (
            ID NUMBER AUTOINCREMENT START 1 INCREMENT 1,
            LOAD_TIME TIMESTAMP_LTZ DEFAULT CURRENT_TIMESTAMP(),
            ORIGIN_FILE_NAME VARCHAR(16777216),
            ORIGIN_FILE_LINE NUMBER(38,0),
            ORIGIN_STR VARCHAR(16777216),
            PROPERTIES VARIANT
        );
        """.format(
            table=qualified_table_name(sf_database, sf_schema, sf_table)
        )

        sf_connection.cursor().execute(query)

        # Delete existing data in case of overwrite.
        if overwrite and row:
            logger.info("Deleting data for overwrite for {}".format(date))

            query = """
            DELETE FROM {table}
            WHERE date(PROPERTIES:{date_property})=date('{date}')
            """.format(
                table=qualified_table_name(sf_database, sf_schema, sf_table),
                date=date,
                date_property=date_property,
            )
            sf_connection.cursor().execute(query)

        # Create stage
        query = """
        CREATE STAGE IF NOT EXISTS {stage_name}
            URL = '{stage_url}'
            STORAGE_INTEGRATION = {storage_integration}
            FILE_FORMAT = ({file_format});
        """.format(
            stage_name=qualified_stage_name(sf_database, sf_schema, sf_table),
            stage_url=s3_url,
            storage_integration=sf_storage_integration_name,
            file_format=sf_file_format,
        )
        sf_connection.cursor().execute(query)

        files_paramater = ""
        pattern_parameter = ""

        if file:
            logger.info("Loading file {}".format(file))
            files_paramater = "FILES = ( '{}' )".format(file)

        if pattern:
            logger.info("Loading pattern {}".format(pattern))
            pattern_parameter = "PATTERN = '{}'".format(pattern)

        query = """
        COPY INTO {table} (origin_file_name, origin_file_line, origin_str, properties)
            FROM (
                SELECT
                    metadata$filename,
                    metadata$file_row_number,
                    t.$1,
                    CASE
                        WHEN CHECK_JSON(t.$1) IS NULL THEN t.$1
                        ELSE NULL
                    END
                FROM @{stage_name} t
            )
        {files_parameter}
        {pattern_parameter}
        FORCE={force}
        """.format(
            table=qualified_table_name(sf_database, sf_schema, sf_table),
            stage_name=qualified_stage_name(sf_database, sf_schema, sf_table),
            files_parameter=files_parameter,
            pattern_parameter=pattern_parameter,
            force=str(overwrite),
        )

        logger.info("Copying data into Snowflake as: \n{}".format(query))

        sf_connection.cursor().execute(query)
        sf_connection.commit()
    except Exception:
        sf_connection.rollback()
        raise
    finally:
        sf_connection.close()
Example #11
def load_s3_data_to_mysql(
    aurora_credentials: dict,
    database: str,
    s3_url: str,
    table: str,
    table_columns: list,
    table_indexes: list = [],
    field_delimiter: str = ',',
    enclosed_by: str = '',
    escaped_by: str = '\\\\',
    record_filter: str = '',
    ignore_num_lines: int = 0,
    overwrite: bool = False,
    overwrite_with_temp_table: bool = False,
    use_manifest: bool = False,
):
    """
    Loads an Aurora MySQL database table from text files in S3.

    Args:
      aurora_credentials (dict): Aurora credentials dict containing `user`, `password` and `host`.
      database (str): Name of the destination database.
      s3_url (str): Full S3 URL containing the files to load into the destination table.
      table (str): Name of the destination table.
      table_columns (list): List of tuples specifying table schema.
              Example: `[('id', 'int'), ('course_id', 'varchar(255) NOT NULL')]`
      table_indexes (list): List of tuples specifying table indexes to add. Defaults to `[]`.
              Example: `[('user_id',), ('course_id',), ('user_id', 'course_id')]`
      field_delimiter (str, optional): The character used to indicate how the fields in input files are delimited.
              Defaults to `,`.
      enclosed_by (str, optional): Single-character string that specifies the field-enclosing character.
              Defaults to an empty string (no enclosing character).
      escaped_by (str, optional): Single-character string used to escape delimiters and other escape
              sequences. Defaults to a backslash (`\\`).
      record_filter (str, optional): Complete `WHERE` clause specifying the data to overwrite. An empty value
              with `overwrite=True` will delete all rows from the table.
      ignore_num_lines (int, optional): Specifies to ignore a certain number of lines at the start of the input file.
              Defaults to 0.
      overwrite (bool, optional): Whether to overwrite existing data in the destination Table. Defaults to `False`.
      overwrite_with_temp_table (bool, optional): Whether to use a temporary table to overwrite data instead of
              `DELETE`. The data would first be loaded into a new table followed by an atomic rename. Use this option
              if there are any schema changes or for expensive `DELETE` operations.
              IMPORTANT: Do not use this option for incrementally updated tables as any historical data would be lost.
                Defaults to `False`.
      use_manifest (bool, optional): Whether to use a manifest file to load data. Defaults to `False`.
    """
    def _drop_temp_tables(table, connection):
        # avoid shadowing the `table` argument while iterating
        for temp_table in (table + '_old', table + '_temp'):
            query = "DROP TABLE IF EXISTS {table}".format(table=temp_table)
            connection.cursor().execute(query)

    logger = get_logger()

    connection = create_mysql_connection(aurora_credentials, database)

    table_schema = []

    table_schema.extend(table_columns)

    for indexed_cols in table_indexes:
        table_schema.append(
            ("INDEX", "({cols})".format(cols=','.join(indexed_cols))))

    table_schema = ','.join(
        '{name} {definition}'.format(name=name, definition=definition)
        for name, definition in table_schema)

    # Create the table if it does not exist
    query = """
        CREATE TABLE IF NOT EXISTS {table} ({table_schema})
    """.format(table=table, table_schema=table_schema)
    logger.debug(query)
    connection.cursor().execute(query)

    # Check for existing data
    query = "SELECT 1 FROM {table} {record_filter} LIMIT 1".format(
        table=table, record_filter=record_filter)
    cursor = connection.cursor()
    cursor.execute(query)
    row = cursor.fetchone()

    if row and not overwrite:
        raise signals.SKIP(
            'Skipping task as data already exists in the dest. table and no overwrite was provided.'
        )

    # Create a temp table for loading data
    if overwrite and overwrite_with_temp_table:
        _drop_temp_tables(table, connection)
        query = "CREATE TABLE {table} ({table_schema})".format(
            table=table + '_temp', table_schema=table_schema)
        connection.cursor().execute(query)

    try:
        if row and overwrite and not overwrite_with_temp_table:
            query = "DELETE FROM {table} {record_filter}".format(
                table=table, record_filter=record_filter)
            logger.debug(
                "Deleting existing data for {table}".format(table=table))
            connection.cursor().execute(query)

        if use_manifest:
            s3_url = os.path.join(s3_url, MANIFEST_FILE_NAME)
            prefix_or_manifest = "MANIFEST"
        else:
            prefix_or_manifest = "PREFIX"

        query = """
            LOAD DATA FROM S3 {prefix_or_manifest} '{s3_url}'
            INTO TABLE {table}
            FIELDS TERMINATED BY '{delimiter}' OPTIONALLY ENCLOSED BY '{enclosed_by}'
            ESCAPED BY '{escaped_by}'
            IGNORE {ignore_lines} LINES
        """.format(
            prefix_or_manifest=prefix_or_manifest,
            s3_url=s3_url,
            table=table if not overwrite_with_temp_table else table + '_temp',
            delimiter=field_delimiter,
            enclosed_by=enclosed_by,
            escaped_by=escaped_by,
            ignore_lines=ignore_num_lines,
        )
        connection.cursor().execute(query)

        if overwrite and overwrite_with_temp_table:
            # Note that this would cause an implicit commit.
            query = "RENAME TABLE {table} to {table_old}, {table_temp} to {table}".format(
                table=table,
                table_old=table + '_old',
                table_temp=table + '_temp')
            connection.cursor().execute(query)
        else:
            # Commit if we're not getting an implicit commit from RENAME.
            connection.commit()
    except Exception as e:
        logger.error(str(e))
        connection.rollback()
        raise
    finally:
        _drop_temp_tables(table, connection)
        connection.close()