def get_newest_task_instances(self):
    newest_task_instances_sql = '''
        SELECT dr.dag_id, dr.execution_date, dag_state, task_id, ti.state AS task_state,
               duration, start_date, end_date
        FROM (
            SELECT dag_run.dag_id, execution_date, state AS dag_state,
                   ROW_NUMBER() OVER (PARTITION BY dag_run.dag_id ORDER BY execution_date DESC) AS age
            FROM dag_run
            JOIN dag ON dag.dag_id = dag_run.dag_id AND is_active = 1 AND is_paused = 0) dr
        JOIN task_instance ti ON ti.dag_id = dr.dag_id AND ti.execution_date = dr.execution_date
        WHERE dr.age = 1'''.replace("\n", "")
    data = self.client.query(newest_task_instances_sql)
    result = {}
    for row in data:
        row['dag_name'] = clean_dag_id(row['dag_id'])
        key = row['dag_name'] + row['task_id']
        if key in result and row['end_date'] and result[key].end_date > row['end_date']:
            continue  # duplicate with dag old version
        if row['dag_name'] in self.config.get('TECHNICAL_ETLS', set()):
            continue  # task instance from the technical ETL
        result[key] = TaskInstance(**row)
    return list(result.values())
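# Illustrative only: one way the newest task instances might be consumed, e.g. grouping
# them per DAG for a dashboard view. 'provider' is a placeholder for an instance of the
# class that owns get_newest_task_instances; the fields used (dag_name, task_id,
# task_state, duration) are exactly those selected above.
from collections import defaultdict

tasks_by_dag = defaultdict(list)
for ti in provider.get_newest_task_instances():
    tasks_by_dag[ti.dag_name].append((ti.task_id, ti.task_state, ti.duration))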
def get_tables_graph(self, dag_id, execution_date):
    name_without_version = clean_dag_id(dag_id)
    dag_tables = self.get_tables_by_dag(name_without_version)
    dag_progress = {
        d.task_id: d
        for d in self.airflow.get_dag_tasks(dag_id, execution_date)
    }

    # root node - dag name
    yield GraphVertex(id='main',
                      name=name_without_version,
                      state=self.airflow.get_dag_state(dag_id, execution_date))

    # tables
    for table in dag_tables.values():
        yield GraphVertex(
            id=table.id,
            name=table.name + (' ({})'.format(table.period.name) if table.period else ''),
            state=dag_progress[table.task_id].task_state,
            tooltip='Finished at: {}, duration: {}'.format(
                dag_progress[table.task_id].end_date,
                dag_progress[table.task_id].duration),
            # workaround: table.uses may reference a table managed by another DAG,
            # which is not part of this graph, so fall back to the root node
            # see https://github.com/Wikia/discreETLy/issues/22
            parent='main' if table.uses is None or table.uses not in dag_tables.keys() else table.uses)
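# Illustrative only: consuming the generator above to build a flat vertex list and the
# parent -> child edges of the table tree. 'provider', the dag id and the execution date
# are placeholders for this example.
vertices = list(provider.get_tables_graph('my_dag_v1.0', '2018-06-01'))
edges = [(v.parent, v.id) for v in vertices if v.id != 'main']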
def get_history(self, days):
    # strips the dag version suffix when partitioning, so only the most recent runs
    # per dag are kept regardless of version
    extracted_dag_name_regex = '''SUBSTRING(dag_id FROM 1 FOR (REGEXP_INSTR(dag_id, '_v?[0-9]+.[0-9]+$') - 1))'''
    SQL = f'''
        SELECT dag_id, date, state
        FROM (
            SELECT dag_id, execution_date AS date, state,
                   ROW_NUMBER() OVER (PARTITION BY {extracted_dag_name_regex} ORDER BY execution_date DESC) AS age
            FROM dag_run
        ) t
        WHERE age <= {days}
    '''
    data = self.client.query(SQL)
    dag_names = set(map(lambda row: clean_dag_id(row['dag_id']), data))
    return {dag_name: reversed([DagRun(**row) for row in data
                                if clean_dag_id(row['dag_id']) == dag_name])
            for dag_name in dag_names}
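# Illustrative only: a rough Python-side equivalent of the SUBSTRING/REGEXP_INSTR
# expression above (and presumably of what clean_dag_id does, though its definition is
# not shown here), using the same pattern to drop a trailing version suffix from a dag_id.
import re

def strip_dag_version(dag_id):
    return re.sub(r'_v?[0-9]+.[0-9]+$', '', dag_id)

assert strip_dag_version('pageviews_daily_v1.2') == 'pageviews_daily'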
def get_tables_graph(self, dag_id, execution_date):
    name_without_version = clean_dag_id(dag_id)
    dag_tables = self.get_tables_by_dag(name_without_version)
    dag_progress = {
        d.task_id: d
        for d in self.airflow.get_dag_tasks(dag_id, execution_date)
    }

    # root node - dag name
    yield GraphVertex(id='main',
                      name=name_without_version,
                      state=self.airflow.get_dag_state(dag_id, execution_date))

    # tables
    for table in dag_tables:
        yield GraphVertex(
            id=table.id,
            name=table.name + (' ({})'.format(table.period.name) if table.period else ''),
            state=dag_progress[table.task_id].task_state,
            tooltip='Finished at: {}, duration: {}'.format(
                dag_progress[table.task_id].end_date,
                dag_progress[table.task_id].duration),
            parent=table.get_parent())
def get_dags_status(self):
    '''
    For each active, non-technical DAG, returns its name and the state of its most recent run.
    :return: list of dicts with 'name' and 'state' keys
    '''
    latest_dags_status_sql = '''
        SELECT * FROM (
            SELECT dag.dag_id, state,
                   ROW_NUMBER() OVER (PARTITION BY dag.dag_id ORDER BY execution_date DESC) AS age
            FROM dag_run
            JOIN dag ON dag.dag_id = dag_run.dag_id AND is_active = 1 AND is_paused = 0) dr
        WHERE age = 1'''.replace("\n", "")
    return [{'name': clean_dag_id(dag['dag_id']), 'state': dag['state']}
            for dag in self.client.query(latest_dags_status_sql)
            if clean_dag_id(dag['dag_id']) not in self.config.get('TECHNICAL_ETLS', set())]
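# Illustrative only: get_dags_status() returns a flat list of {'name', 'state'} dicts,
# so a dashboard summary can be as simple as counting states. 'provider' is a placeholder
# for an instance of the owning class.
from collections import Counter

state_counts = Counter(dag['state'] for dag in provider.get_dags_status())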
def get_history(self, days):
    SQL = f'''
        SELECT dag_id, date, state
        FROM (
            SELECT dag_id, execution_date AS date, state,
                   ROW_NUMBER() OVER (PARTITION BY dag_id ORDER BY execution_date DESC) AS age
            FROM dag_run
        ) t
        WHERE age <= {days}
    '''
    data = self.client.query(SQL)
    dag_names = set(map(lambda row: clean_dag_id(row['dag_id']), data))
    return {
        dag_name: reversed([
            DagRun(**row) for row in data
            if clean_dag_id(row['dag_id']) == dag_name
        ])
        for dag_name in dag_names
    }
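# Illustrative only: get_history() maps each cleaned dag name to its last `days` runs in
# chronological order (oldest first, hence the reversed() above). The values are one-shot
# iterators, so materialize them if they need to be read more than once. 'provider' is a
# placeholder for an instance of the owning class.
history = {name: list(runs) for name, runs in provider.get_history(days=7).items()}
latest_states = {name: runs[-1].state for name, runs in history.items() if runs}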
def etl_details(dag_id, execution_date):
    name_without_version = clean_dag_id(dag_id)
    # materialized to a list: get_tables_graph is a generator and the vertices are needed
    # both for the template context and for the tree_diagram payload below
    data = list(app.table_data_provider.get_tables_graph(dag_id, execution_date))
    return render_template(
        'etl_details.html',
        name=name_without_version,
        data=data,
        raw=json.dumps(
            tree_diagram([{'id': v.id,
                           'name': v.name,
                           'success': v.get_graph_color(),
                           'parent': v.parent} for v in data],
                         width=800, height=800, padding=8)))
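# Illustrative only: the list handed to tree_diagram above is flat, with each vertex
# pointing at its parent id ('main' for direct children of the DAG node). The concrete
# values below are made up; the actual return value of get_graph_color() is defined
# elsewhere in the repo and is not shown in this file.
example_payload = [
    {'id': 'main',  'name': 'my_dag',               'success': 'green', 'parent': None},
    {'id': 'tbl_1', 'name': 'fact_events (daily)',  'success': 'green', 'parent': 'main'},
    {'id': 'tbl_2', 'name': 'agg_events (monthly)', 'success': 'red',   'parent': 'tbl_1'},
]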
def get_last_successful_tasks(self):
    last_successful_task_end_date = '''
        SELECT dag_id, task_id, max(end_date) AS end_date
        FROM task_instance
        WHERE state = "success" AND end_date IS NOT NULL
        GROUP BY dag_id, task_id
    '''
    data = self.client.query(last_successful_task_end_date)
    result = {}
    for row in data:
        row['dag_name'] = clean_dag_id(row['dag_id'])
        key = row['dag_name'] + row['task_id']
        if key in result and result[key].end_date > row['end_date']:
            continue  # duplicate with dag old version
        result[key] = TaskInstance(**row)
    return list(result.values())
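# Illustrative only: flagging tasks whose last successful run looks stale, based on the
# end_date returned by get_last_successful_tasks(). The 24-hour threshold and 'provider'
# are assumptions made for this example; end_date is assumed to come back as a datetime.
from datetime import datetime, timedelta

stale_tasks = [ti for ti in provider.get_last_successful_tasks()
               if ti.end_date < datetime.now() - timedelta(hours=24)]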