def get_newest_task_instances(self): newest_task_instances_sql = ''' SELECT dr.dag_id, dr.execution_date, dag_state, task_id, ti.state AS task_state, duration, start_date, end_date FROM ( SELECT dag_run.dag_id, execution_date, state AS dag_state, ROW_NUMBER() OVER (PARTITION BY dag_run.dag_id ORDER BY execution_date DESC) AS age FROM dag_run JOIN dag ON dag.dag_id = dag_run.dag_id AND is_active = 1 AND is_paused = 0) dr JOIN task_instance ti ON ti.dag_id = dr.dag_id AND ti.execution_date = dr.execution_date WHERE dr.age = 1'''.replace("\n", "") data = self.client.query(newest_task_instances_sql) result = {} for row in data: row['dag_name'] = clean_dag_id(row['dag_id']) key = row['dag_name'] + row['task_id'] if key in result and row[ 'end_date'] and result[key].end_date > row['end_date']: continue # duplicate with dag old version if row['dag_name'] in self.config.get('TECHNICAL_ETLS', set()): continue # task instance from the technical ETL result[key] = TaskInstance(**row) return list(result.values())
def get_last_successful_tasks(self): last_successful_task_end_date = ''' SELECT dag_id, task_id, max(end_date) as end_date FROM task_instance WHERE state = "success" AND end_date is not null GROUP BY dag_id, task_id ''' data = self.client.query(last_successful_task_end_date) result = {} for row in data: row['dag_name'] = clean_dag_id(row['dag_id']) key = row['dag_name'] + row['task_id'] if key in result and result[key].end_date > row['end_date']: continue # duplicate with dag old version result[key] = TaskInstance(**row) return list(result.values())
def get_dag_tasks(self, dag_id, execution_date): data = self.client.query( f"""SELECT dag_id, execution_date, task_id, start_date, end_date, duration, state as task_state FROM task_instance WHERE dag_id='{dag_id}' AND execution_date='{execution_date}'""" .replace("\n", "")) return [TaskInstance(**row) for row in data]