def populate_compounds(
    session: Session,
    properties: pd.DataFrame,
    cross_references: pd.DataFrame,
    batch_size: int,
) -> None:
    """
    Populate the compound and identifier tables using information from MetaNetX.

    Compounds are inserted first, ``batch_size`` rows of ``properties`` at a
    time. The stored compounds are then read back in pages of the same size,
    and the cross-references grouped under each compound's ``mnx_id`` are
    inserted as compound identifiers.

    Parameters
    ----------
    session : SQLAlchemy.Session
        Session used for all inserts, queries and commits.
    properties : pd.DataFrame
        Each row is passed to ``create_compound_object`` to build one
        compound mapping.
    cross_references : pd.DataFrame
        Must provide the columns ``prefix``, ``accession`` and ``mnx_id``.
        Rows whose prefix is ``metanetx.chemical`` or whose accession is null
        are ignored.
    batch_size : int
        Number of rows handled per bulk insert and commit.

    Warnings
    --------
    The function uses bulk inserts for performance and thus assumes empty
    tables. Do **not** use it for updating content.

    """
    prefix2registry = get_mnx_mapping(session)
    # Drop self-references and rows without an accession, then group the
    # remaining cross-references by the MetaNetX identifier they belong to.
    grouped_xref = cross_references[
        (cross_references["prefix"] != "metanetx.chemical")
        & cross_references["accession"].notnull()
    ].groupby("mnx_id", sort=False)

    with tqdm(total=len(properties), desc="Compounds") as pbar:
        for index in range(0, len(properties), batch_size):
            compounds = [
                create_compound_object(row)
                for row in properties[index:index + batch_size].itertuples(
                    index=False
                )
            ]
            session.bulk_insert_mappings(Compound, compounds)
            session.commit()
            pbar.update(len(compounds))

    with tqdm(total=len(properties), desc="Cross-References") as pbar:
        for index in range(0, len(properties), batch_size):
            identifiers = []
            counter = 0
            # slice() translates to LIMIT/OFFSET; without an explicit ORDER BY
            # the database may return rows in a different order for every
            # page, which would skip some compounds and repeat others.
            page = (
                session.query(Compound.id, Compound.mnx_id)
                .order_by(Compound.id)
                .slice(index, index + batch_size)
            )
            for row in page:
                try:
                    identifiers.extend(
                        create_compound_identifier_objects(
                            row.id,
                            grouped_xref.get_group(row.mnx_id),
                            prefix2registry,
                        )
                    )
                except KeyError:
                    # get_group() raises KeyError when no cross-reference row
                    # carries this mnx_id.
                    logger.debug(
                        "Compound '%s' has no cross-references.", row.mnx_id
                    )
                # Count every compound looked at, with or without references.
                counter += 1
            session.bulk_insert_mappings(CompoundIdentifier, identifiers)
            session.commit()
            pbar.update(counter)
def _create_task_instances(
    self,
    dag_id: str,
    tasks: Iterable["Operator"],
    created_counts: Dict[str, int],
    hook_is_noop: bool,
    *,
    session: Session,
) -> None:
    """
    Bulk-persist the given task instances and report how many were created.

    :param dag_id: DAG ID associated with the dagrun (only used in log output)
    :param tasks: the items to persist; passed to ``bulk_insert_mappings`` when
        ``hook_is_noop`` is true and to ``bulk_save_objects`` otherwise
    :param created_counts: mapping of task type -> count, emitted as the
        ``task_instance_created-<task type>`` stat
    :param hook_is_noop: whether the task_instance_mutation_hook is noop
    :param session: the session to use
    """
    try:
        if not hook_is_noop:
            session.bulk_save_objects(tasks)
        else:
            session.bulk_insert_mappings(TI, tasks)

        # Stats are emitted after the bulk call and before the flush.
        for kind, created in created_counts.items():
            Stats.incr(f"task_instance_created-{kind}", created)
        session.flush()
    except IntegrityError:
        self.log.info(
            'Hit IntegrityError while creating the TIs for %s- %s',
            dag_id,
            self.run_id,
            exc_info=True,
        )
        self.log.info('Doing session rollback.')
        # TODO[HA]: We probably need to savepoint this so we can keep the transaction alive.
        session.rollback()
def verify_integrity(self, session: Session = NEW_SESSION):
    """
    Verify the DagRun by checking for removed tasks or tasks that are not in the database yet.

    Every existing task instance is passed through ``task_instance_mutation_hook``.
    A task instance whose task can no longer be fetched from the DAG is marked
    ``REMOVED`` (unless this run is still running or the DAG is partial), one
    that is ``REMOVED`` but whose task is present again is reset to ``NONE``,
    and task instances are bulk-created for DAG tasks that have none yet.

    :param session: Sqlalchemy ORM Session
    """
    from airflow.settings import task_instance_mutation_hook

    dag = self.get_dag()
    tis = self.get_task_instances(session=session)

    # check for removed or restored tasks
    task_ids = set()
    for ti in tis:
        task_instance_mutation_hook(ti)
        task_ids.add(ti.task_id)
        task = None
        try:
            task = dag.get_task(ti.task_id)
        except AirflowException:
            if ti.state == State.REMOVED:
                pass  # ti has already been removed, just ignore it
            elif self.state != State.RUNNING and not dag.partial:
                self.log.warning(
                    "Failed to get task '%s' for dag '%s'. Marking it as removed.", ti, dag
                )
                Stats.incr(f"task_removed_from_dag.{dag.dag_id}", 1, 1)
                ti.state = State.REMOVED

        should_restore_task = (task is not None) and ti.state == State.REMOVED
        if should_restore_task:
            self.log.info(
                "Restoring task '%s' which was previously removed from DAG '%s'", ti, dag
            )
            Stats.incr(f"task_restored_to_dag.{dag.dag_id}", 1, 1)
            ti.state = State.NONE
        session.merge(ti)

    def task_filter(task: "BaseOperator"):
        # Only tasks without a task instance yet; outside of backfills the task
        # must also have started by this run's execution date.
        return task.task_id not in task_ids and (
            self.is_backfill or task.start_date <= self.execution_date
        )

    created_counts: Dict[str, int] = defaultdict(int)

    # Set for the empty default in airflow.settings -- if it's not set this means it has been changed
    hook_is_noop = getattr(task_instance_mutation_hook, 'is_noop', False)

    if hook_is_noop:

        def create_ti_mapping(task: "BaseOperator"):
            created_counts[task.task_type] += 1
            return TI.insert_mapping(self.run_id, task)

    else:

        def create_ti(task: "BaseOperator") -> TI:
            ti = TI(task, run_id=self.run_id)
            task_instance_mutation_hook(ti)
            created_counts[ti.operator] += 1
            return ti

    # Create missing tasks
    tasks = list(filter(task_filter, dag.task_dict.values()))
    try:
        # The map objects are lazy; created_counts is filled while the bulk
        # call consumes them, so the stats loop has to come afterwards.
        if hook_is_noop:
            session.bulk_insert_mappings(TI, map(create_ti_mapping, tasks))
        else:
            session.bulk_save_objects(map(create_ti, tasks))

        for task_type, count in created_counts.items():
            Stats.incr(f"task_instance_created-{task_type}", count)
        session.flush()
    except IntegrityError:
        # exc_info=True attaches the exception and its traceback to the record,
        # so the error does not need to be logged separately as a bare string.
        self.log.info(
            'Hit IntegrityError while creating the TIs for %s- %s',
            dag.dag_id,
            self.run_id,
            exc_info=True,
        )
        self.log.info('Doing session rollback.')
        # TODO[HA]: We probably need to savepoint this so we can keep the transaction alive.
        session.rollback()
def verify_integrity(self, session: Session = NEW_SESSION):
    """
    Verify the DagRun by checking for removed tasks or tasks that are not in the database yet.

    Every existing task instance is passed through ``task_instance_mutation_hook``.
    A task instance whose task can no longer be fetched from the DAG is marked
    ``REMOVED`` (unless this run is still running or the DAG is partial), and
    one that is ``REMOVED`` but whose task is present again is reset to
    ``NONE``. For mapped tasks, task instances whose ``map_index`` no longer
    fits the mapping are marked ``REMOVED``. Finally, task instances are
    bulk-created for DAG tasks that have none yet, expanding mapped tasks into
    one task instance per map index when the count is known.

    :param session: Sqlalchemy ORM Session
    """
    from airflow.settings import task_instance_mutation_hook

    dag = self.get_dag()
    tis = self.get_task_instances(session=session)

    # check for removed or restored tasks
    task_ids = set()
    for ti in tis:
        task_instance_mutation_hook(ti)
        task_ids.add(ti.task_id)
        task = None
        try:
            task = dag.get_task(ti.task_id)

            should_restore_task = (task is not None) and ti.state == State.REMOVED
            if should_restore_task:
                self.log.info(
                    "Restoring task '%s' which was previously removed from DAG '%s'", ti, dag
                )
                Stats.incr(f"task_restored_to_dag.{dag.dag_id}", 1, 1)
                ti.state = State.NONE
        except AirflowException:
            if ti.state == State.REMOVED:
                pass  # ti has already been removed, just ignore it
            elif self.state != State.RUNNING and not dag.partial:
                self.log.warning(
                    "Failed to get task '%s' for dag '%s'. Marking it as removed.", ti, dag
                )
                Stats.incr(f"task_removed_from_dag.{dag.dag_id}", 1, 1)
                ti.state = State.REMOVED
            continue

        if not task.is_mapped:
            continue
        task = cast("MappedOperator", task)
        num_mapped_tis = task.parse_time_mapped_ti_count
        # Check if the number of mapped literals has changed and we need to mark this TI as removed
        if num_mapped_tis is not None:
            if ti.map_index >= num_mapped_tis:
                self.log.debug(
                    "Removing task '%s' as the map_index is longer than the literal mapping list (%s)",
                    ti,
                    num_mapped_tis,
                )
                ti.state = State.REMOVED
            elif ti.map_index < 0:
                self.log.debug(
                    "Removing the unmapped TI '%s' as the mapping can now be performed", ti
                )
                ti.state = State.REMOVED
            # A TI with an in-range map_index is deliberately left alone. If it
            # was REMOVED it has already been restored (and counted) above;
            # resetting any other state to NONE here would discard the progress
            # of running or finished task instances.
        else:
            # What if it is _now_ dynamically mapped, but wasn't before?
            total_length = task.run_time_mapped_ti_count(self.run_id, session=session)

            if total_length is None:
                # Not all upstreams finished, so we can't tell what should be here. Remove everything.
                if ti.map_index >= 0:
                    self.log.debug(
                        "Removing the unmapped TI '%s' as the mapping can't be resolved yet", ti
                    )
                    ti.state = State.REMOVED
                continue
            # Upstreams finished, check there aren't any extras
            if ti.map_index >= total_length:
                self.log.debug(
                    "Removing task '%s' as the map_index is longer than the resolved mapping list (%d)",
                    ti,
                    total_length,
                )
                ti.state = State.REMOVED
                ...

    def task_filter(task: "Operator") -> bool:
        # `and` binds tighter than `or`: a backfill bypasses the whole
        # start_date/end_date window check.
        return task.task_id not in task_ids and (
            self.is_backfill
            or task.start_date <= self.execution_date
            and (task.end_date is None or self.execution_date <= task.end_date)
        )

    created_counts: Dict[str, int] = defaultdict(int)

    # Set for the empty default in airflow.settings -- if it's not set this means it has been changed
    hook_is_noop = getattr(task_instance_mutation_hook, 'is_noop', False)

    if hook_is_noop:

        def create_ti_mapping(task: "Operator", indexes: Tuple[int, ...]) -> Generator:
            # NOTE(review): incremented once per task here but once per task
            # instance in create_ti below -- confirm which count is intended.
            created_counts[task.task_type] += 1
            for map_index in indexes:
                yield TI.insert_mapping(self.run_id, task, map_index=map_index)

        creator = create_ti_mapping

    else:

        def create_ti(task: "Operator", indexes: Tuple[int, ...]) -> Generator:
            for map_index in indexes:
                ti = TI(task, run_id=self.run_id, map_index=map_index)
                task_instance_mutation_hook(ti)
                created_counts[ti.operator] += 1
                yield ti

        creator = create_ti

    # Create missing tasks -- and expand any MappedOperator that _only_ have literals as input
    def expand_mapped_literals(task: "Operator") -> Tuple["Operator", Sequence[int]]:
        # Unmapped tasks, and mapped tasks whose count is unknown or zero, get
        # a single task instance with map_index -1.
        if not task.is_mapped:
            return (task, (-1,))
        task = cast("MappedOperator", task)
        count = task.parse_time_mapped_ti_count or task.run_time_mapped_ti_count(
            self.run_id, session=session
        )
        if not count:
            return (task, (-1,))
        return (task, range(count))

    tasks_and_map_idxs = map(expand_mapped_literals, filter(task_filter, dag.task_dict.values()))
    # Everything up to here is lazy; the generators run (and fill
    # created_counts) only while the bulk call below consumes `tasks`.
    tasks = itertools.chain.from_iterable(itertools.starmap(creator, tasks_and_map_idxs))

    try:
        if hook_is_noop:
            session.bulk_insert_mappings(TI, tasks)
        else:
            session.bulk_save_objects(tasks)

        for task_type, count in created_counts.items():
            Stats.incr(f"task_instance_created-{task_type}", count)
        session.flush()
    except IntegrityError:
        self.log.info(
            'Hit IntegrityError while creating the TIs for %s- %s',
            dag.dag_id,
            self.run_id,
            exc_info=True,
        )
        self.log.info('Doing session rollback.')
        # TODO[HA]: We probably need to savepoint this so we can keep the transaction alive.
        session.rollback()