Example #1
# Imports and logger needed to run this snippet standalone.
import logging

from flask import jsonify

from airflow.api.common.experimental.get_task_instance import get_task_instance
from airflow.exceptions import AirflowException
from airflow.utils import timezone

log = logging.getLogger(__name__)


def task_instance_info(dag_id, execution_date, task_id):
    """
    Returns a JSON with a task instance's public instance variables.
    The format for execution_date is expected to be
    "YYYY-mm-DDTHH:MM:SS", optionally with a timezone offset, for example
    "2016-11-16T11:34:15+00:00". The value must be URL-encoded in the
    request.
    """

    # Convert string datetime into actual datetime
    try:
        execution_date = timezone.parse(execution_date)
    except ValueError:
        error_message = (
            'Given execution date, {}, could not be identified '
            'as a date. Example date format: 2015-11-16T14:34:15+00:00'
            .format(execution_date))
        log.error(error_message)
        response = jsonify({'error': error_message})
        response.status_code = 400

        return response

    try:
        ti_info = get_task_instance(dag_id, task_id, execution_date)
    except AirflowException as err:
        log.info(err)
        response = jsonify(error="{}".format(err))
        response.status_code = err.status_code
        return response

    # JSONify and return.
    fields = {k: str(v)
              for k, v in vars(ti_info).items()
              if not k.startswith('_')}
    return jsonify(fields)
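
A minimal sketch of exercising this view, assuming it is registered under Airflow's experimental REST routes; the base URL, dag_id and task_id below are placeholders, and note the execution date is URL-encoded as the docstring requires:

from urllib.parse import quote

import requests

# Placeholder base URL; the route shape follows Airflow's experimental API.
BASE = 'http://localhost:8080/api/experimental'
execution_date = quote('2016-11-16T11:34:15+00:00', safe='')

resp = requests.get(
    '{}/dags/example_dag/dag_runs/{}/tasks/print_date'.format(
        BASE, execution_date))
print(resp.status_code, resp.json())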
Example #2
from airflow.api.common.experimental.get_task_instance import get_task_instance
from airflow.exceptions import TaskInstanceNotFound

# Method of an operator class whose `self.subdag` is a nested DAG: collect
# the task instances of the subdag's leaf tasks, skipping leaves that have
# no instance for this execution_date.
def _get_leaves_tis(self, execution_date):
    leaves_tis = []
    for leaf in self.subdag.leaves:
        try:
            ti = get_task_instance(dag_id=self.subdag.dag_id,
                                   task_id=leaf.task_id,
                                   execution_date=execution_date)
            leaves_tis.append(ti)
        except TaskInstanceNotFound:
            continue
    return leaves_tis
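
A sketch of how a helper like this might be consumed; the caller name and the success check are assumptions, not part of the source:

from airflow.utils.state import State

# Hypothetical caller: true only once every leaf task instance of the
# subdag has reached SUCCESS for the given execution_date.
def _all_leaves_succeeded(self, execution_date):
    leaves_tis = self._get_leaves_tis(execution_date)
    return bool(leaves_tis) and all(
        ti.state == State.SUCCESS for ti in leaves_tis)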
Example #3
import datetime
import logging

import sqlalchemy as sa

from airflow.api.common.experimental.get_task_instance import get_task_instance
from airflow.exceptions import TaskNotFound
from airflow.hooks.postgres_hook import PostgresHook

# `config` (grantee defaults) and `get_temp_table` (the per-run temporary
# sa.Table) are project-local helpers; the import paths below are assumptions.
from dataflow import config
from dataflow.utils import get_temp_table

logger = logging.getLogger(__name__)


def swap_dataset_tables(
    target_db: str,
    *tables: sa.Table,
    use_utc_now_as_source_modified: bool = False,
    **kwargs,
):
    """Rename temporary tables to replace current dataset one.

    Given a one or more dataset tables `tables` this finds the temporary table created
    for the current DAG run and replaces existing dataset one with it.

    If a dataset table didn't exist the new table gets renamed, otherwise
    the existing dataset table is renamed to a temporary "swap" name first.

    This requires an exclusive lock for the dataset table (similar to TRUNCATE)
    but doesn't need to copy any data around (reducing the amount of time dataset
    is unavailable) and will update the table schema at the same time (since it
    will apply the new schema temporary table was created with).

    """
    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )
    for table in tables:
        temp_table = get_temp_table(table, kwargs["ts_nodash"])

        logger.info("Moving %s to %s", temp_table.name, table.name)
        with engine.begin() as conn:
            conn.execute("SET statement_timeout = 600000")
            grantees = [
                grantee[0] for grantee in conn.execute("""
                SELECT grantee
                FROM information_schema.role_table_grants
                WHERE table_name='{table_name}'
                AND privilege_type = 'SELECT'
                AND grantor != grantee
                """.format(table_name=engine.dialect.identifier_preparer.quote(
                    table.name))).fetchall()
            ]

            conn.execute("""
                SELECT dataflow.save_and_drop_dependencies('{schema}', '{target_temp_table}');
                ALTER TABLE IF EXISTS {schema}.{target_temp_table} RENAME TO {swap_table_name};
                ALTER TABLE {schema}.{temp_table} RENAME TO {target_temp_table};
                SELECT dataflow.restore_dependencies('{schema}', '{target_temp_table}');
                """.format(
                schema=engine.dialect.identifier_preparer.quote(table.schema),
                target_temp_table=engine.dialect.identifier_preparer.quote(
                    table.name),
                swap_table_name=engine.dialect.identifier_preparer.quote(
                    temp_table.name + "_swap"),
                temp_table=engine.dialect.identifier_preparer.quote(
                    temp_table.name),
            ))
            for grantee in grantees + config.DEFAULT_DATABASE_GRANTEES:
                conn.execute(
                    'GRANT SELECT ON {schema}.{table_name} TO {grantee}'.
                    format(
                        schema=engine.dialect.identifier_preparer.quote(
                            table.schema),
                        table_name=engine.dialect.identifier_preparer.quote(
                            table.name),
                        grantee=grantee,
                    ))

            new_modified_utc = kwargs['task_instance'].xcom_pull(
                key='source-modified-date-utc')
            if new_modified_utc is None and use_utc_now_as_source_modified:
                try:
                    new_modified_utc = get_task_instance(
                        kwargs['dag'].safe_dag_id,
                        'run-fetch',
                        kwargs['execution_date'],
                    ).end_date
                except TaskNotFound:
                    new_modified_utc = datetime.datetime.utcnow()

            conn.execute(
                """
                INSERT INTO dataflow.metadata
                (table_schema, table_name, source_data_modified_utc, dataflow_swapped_tables_utc)
                VALUES (%s, %s, %s, %s)
                """,
                (
                    table.schema,
                    table.name,
                    new_modified_utc,
                    datetime.datetime.utcnow(),
                ),
            )
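
A sketch of wiring this callable into a DAG, assuming an Airflow 1.10-style PythonOperator with provide_context=True so that ts_nodash, task_instance, dag and execution_date arrive in **kwargs; the table, connection id and dag object are placeholders:

import sqlalchemy as sa

from airflow.operators.python_operator import PythonOperator

# Placeholder dataset table; in practice this comes from the pipeline's
# own table definitions.
target_table = sa.Table(
    'my_dataset', sa.MetaData(),
    sa.Column('id', sa.Integer),
    schema='public',
)

swap = PythonOperator(
    task_id='swap-dataset-tables',
    python_callable=swap_dataset_tables,
    op_args=['datasets_db', target_table],  # target_db, then *tables
    provide_context=True,  # injects ts_nodash, task_instance, dag, ...
    dag=dag,  # assumes a DAG object already defined in scope
)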