def create_missing_items(
    cls,
    flow_name: str,
    start_time: dt.datetime,
    end_time: dt.datetime,
    interval_timedelta: dt.timedelta,
) -> list["FlowItem"]:
    """Return one item per worktime of the range, creating the absent ones.

    The worktimes are those produced by
    ``iter_range_datetime(start_time, end_time, interval_timedelta)``.

    Args:
        flow_name: Value stored in the ``name`` field of every item.
        start_time: First argument passed to ``iter_range_datetime``.
        end_time: Second argument passed to ``iter_range_datetime``.
        interval_timedelta: Step passed to ``iter_range_datetime``.

    Returns:
        The items, in the order the worktimes were produced. Each one is
        either freshly created or, when creation raised
        ``peewee.IntegrityError``, fetched from the database.
    """

    def _create_or_fetch(moment):
        # EAFP: attempt the insert first and fall back to a lookup.
        # NOTE(review): presumably IntegrityError comes from a uniqueness
        # constraint on (name, worktime) - confirm against the model.
        try:
            created = cls.create(
                **{cls.name.name: flow_name, cls.worktime.name: moment}
            )
        except peewee.IntegrityError:
            return cls.get(cls.name == flow_name, cls.worktime == moment)
        logger.info("Created missing worktime {} for {}", moment, flow_name)
        return created

    return [
        _create_or_fetch(moment)
        for moment in iter_range_datetime(start_time, end_time, interval_timedelta)
    ]
def ordering_etl_flow_tasks(
    *, dry_run: bool = False
) -> Iterator[ExecutorIterationTask]:
    """Prepare flow function to be sent to the queue and executed.

    Iterates over the active notebook filenames, loads each notebook and,
    for every period reported by ``Work(notebook).iter_period_for_execute()``,
    yields the task built by ``ETLOperator(notebook).task(...)``.

    Args:
        dry_run: When true, only notebooks whose ``provider`` equals
            ``"fakedata"`` are ordered; the flag is also forwarded to
            ``flow.task``.

    Yields:
        One task per (notebook, period). Each task is yielded inside the
        ``prepare_items_for_order`` context manager, so that context stays
        open until the consumer asks for the next task.
    """
    # TODO: get rid of this function; rework it so that a single function
    #  is used for ordering.
    # Imported locally, as in the original code.
    # NOTE(review): presumably this avoids a circular import - confirm.
    from flowmaster.operators.etl.core import ETLOperator
    from flowmaster.operators.etl.policy import ETLNotebook

    for name in iter_active_notebook_filenames():
        validate, text, notebook_dict, notebook, error = get_notebook(name)
        notebook: ETLNotebook

        # Check validation BEFORE touching the notebook: the dry-run filter
        # below dereferences ``notebook.provider``, which must not happen for
        # a notebook that failed to load.
        # NOTE(review): assumes get_notebook may return no notebook object
        # when validation fails - confirm against its implementation.
        if not validate:
            logger.error("ValidationError: '{}': {}", name, error)
            continue

        if dry_run and notebook.provider != "fakedata":
            continue

        work = Work(notebook)
        for start_period, end_period in work.iter_period_for_execute():
            flow = ETLOperator(notebook)
            etl_flow_task = flow.task(start_period, end_period, dry_run=dry_run)

            with prepare_items_for_order(flow, start_period, end_period):
                logger.info(
                    "Order ETL flow [{}]: {} {}",
                    notebook.name,
                    start_period,
                    end_period,
                )
                yield etl_flow_task
def fill_queue(self) -> None:
    """Put every task produced by ``self.order_task_func()`` on the task queue.

    The whole ordering pass runs while holding ``threading_lock``; the number
    of queued tasks is logged afterwards.
    """
    queued = 0
    with threading_lock:
        # enumerate(start=1) leaves ``queued`` equal to the number of tasks
        # seen; it stays 0 when nothing was produced.
        for queued, new_task in enumerate(self.order_task_func(), start=1):
            task_queue.put(new_task)

    logger.info("Count ordering task: {}", queued)
def recreate_prev_items(
    cls,
    flow_name: str,
    worktime: dt.datetime,
    offset_periods: Union[pydantic.PositiveInt, list[pydantic.NegativeInt]],
    interval_timedelta: dt.timedelta,
) -> Optional[list["FlowItem"]]:
    """Delete and re-create the items of earlier worktimes of a flow.

    Args:
        flow_name: Name of the flow whose items are recreated.
        worktime: Reference worktime the offsets are applied to.
        offset_periods: Either a positive int ``N``, expanded to the offsets
            ``-1 .. -(N - 1)`` (so ``N`` itself is not included), or an
            explicit iterable of negative offsets.
        interval_timedelta: Length of one period; each offset is multiplied
            by it.

    Returns:
        The recreated items, or ``None`` when ``cls.first_item(flow_name)``
        is falsy.

    Raises:
        ValueError: If ``offset_periods`` is an int that is not positive.
        AssertionError: If an explicit offset is not negative.
    """
    if not isinstance(offset_periods, int):
        assert all(offset < 0 for offset in offset_periods)
    elif offset_periods > 0:
        offset_periods = [-step for step in range(1, offset_periods)]
    else:
        raise ValueError("Only positive Int")

    first_item = cls.first_item(flow_name)
    if not first_item:
        return None

    # Worktimes earlier than the first item of the flow are dropped.
    shifted = (worktime + (interval_timedelta * delta) for delta in offset_periods)
    worktime_list = [
        moment for moment in shifted if moment >= first_item.worktime
    ]

    stale_rows = cls.delete().where(
        cls.name == flow_name, cls.worktime.in_(worktime_list)
    )
    stale_rows.execute()

    items = [
        item
        for date1, date2 in iter_period_from_range(worktime_list, interval_timedelta)
        for item in cls.create_missing_items(
            flow_name,
            start_time=date1,
            end_time=date2,
            interval_timedelta=interval_timedelta,
        )
    ]

    logger.info(
        "Recreated items to restart flows {} for previous worktimes {}",
        flow_name,
        worktime_list,
    )
    return items
def get_items_for_execute(
    cls,
    flow_name: str,
    worktime: dt.datetime,
    start_time: dt.datetime,
    interval_timedelta: dt.timedelta,
    keep_sequence: bool,
    retries: int,
    retry_delay: int,
    notebook_hash: str,
    max_fatal_errors: int,
    update_stale_data: Optional[
        Union[pydantic.PositiveInt, list[pydantic.NegativeInt]]
    ] = None,
) -> Optional[list["FlowItem"]]:
    """Prepare the items of a flow and select the ones waiting for execution.

    When ``allow_execute_flow`` permits it, the following steps run in order:

    1. create the first item if the flow has none yet;
    2. create the next item and, if one was created and
       ``update_stale_data`` is set, recreate items for earlier worktimes;
    3. if ``keep_sequence`` is set, create the missing items between
       ``start_time`` and ``worktime``;
    4. return failed items to the queue via ``retry_error_items``.

    Returns:
        ``None`` when ``allow_execute_flow`` rejects the flow. Otherwise the
        query built by ``cls.select()`` for items of this flow with status
        ``Statuses.add`` whose ``expires_utc`` is null or not earlier than
        the current UTC time, ordered by ``worktime`` descending.
    """
    allowed = cls.allow_execute_flow(
        flow_name, notebook_hash=notebook_hash, max_fatal_errors=max_fatal_errors
    )
    if not allowed:
        return None

    if not cls.exists(flow_name):
        first_fields = {cls.name.name: flow_name, cls.worktime.name: worktime}
        cls.create(**first_fields)
        logger.info("Created first item for {}, worktime {}", flow_name, worktime)

    # The next item is always attempted; items for past dates are refreshed
    # only when a next item was actually created.
    next_item = cls.create_next_execute_item(flow_name, interval_timedelta, worktime)
    if next_item and update_stale_data:
        cls.recreate_prev_items(
            flow_name, worktime, update_stale_data, interval_timedelta
        )

    if keep_sequence:
        cls.create_missing_items(flow_name, start_time, worktime, interval_timedelta)

    cls.retry_error_items(flow_name, retries, retry_delay)

    query = cls.select()
    not_expired = (
        pendulum.now("UTC").timestamp() <= cls.expires_utc.to_timestamp()
    ) | (cls.expires_utc.is_null())
    pending = query.where(
        cls.name == flow_name,
        cls.status == Statuses.add,
        not_expired,
    )
    return pending.order_by(cls.worktime.desc())
def worker(self) -> None:
    """Run tasks taken from ``self.get_task()`` until the stop event is set.

    Each task is driven to completion with ``list(task)``:

    * ``SleepException`` / ``PoolOverflowingException`` move the task to
      ``self.sleeping_task_storage`` instead of failing it;
    * any other ``Exception`` is logged and the worker keeps going;
    * ``task_done()`` is always called on the queue the task came from.

    An unexpected ``Exception`` outside task execution is logged and ends
    the worker loop.
    """
    logger.info("Start worker")
    try:
        while not threading_event.is_set():
            queue_and_task = self.get_task()
            if not queue_and_task:
                continue

            queue_, task = queue_and_task
            try:
                # Exhaust the task iterator; the produced values are unused.
                list(task)
            except (SleepException, PoolOverflowingException):
                self.sleeping_task_storage.append(task)
            except Exception as exc:
                logger.error("Fail task: {}", exc)
            finally:
                queue_.task_done()
    except Exception:
        # Was a bare ``except:``, which also swallowed SystemExit and
        # KeyboardInterrupt; those now propagate after the final log line.
        logger.exception("Fail worker")
    finally:
        logger.info("Stop worker")
def create_next_execute_item(
    cls,
    flow_name: str,
    interval_timedelta: dt.timedelta,
    worktime: dt.datetime,
) -> Optional["FlowItem"]:
    """Create the item that follows the last item of the flow, if it is due.

    Args:
        flow_name: Name of the flow.
        interval_timedelta: Added to the worktime of the last item to get the
            worktime of the new one.
        worktime: Forwarded to ``cls.is_create_next``.

    Returns:
        The created item, or ``None`` when ``cls.is_create_next`` is falsy or
        the insert raised ``peewee.IntegrityError``.
    """
    if not cls.is_create_next(flow_name, interval_timedelta, worktime):
        return None

    previous = cls.last_item(flow_name, for_updated=True)
    next_worktime = previous.worktime + interval_timedelta
    new_fields = {
        cls.name.name: flow_name,
        cls.worktime.name: next_worktime,
    }
    try:
        created = cls.create(**new_fields)
    except peewee.IntegrityError:
        # The insert was rejected; report that nothing new was created.
        return None

    logger.info("Created next worktime {} for {}", next_worktime, flow_name)
    return created
def allow_execute_flow(
    cls, flow_name: str, notebook_hash: str, *, max_fatal_errors: int = 3
) -> bool:
    """Tell whether the flow may be scheduled.

    The fatal-error limit is enforced only when the last item of the flow
    carries the same ``notebook_hash`` as the one passed in; in every other
    case the flow is allowed.

    Args:
        flow_name: Name of the flow.
        notebook_hash: Hash compared with ``notebook_hash`` of the last item.
        max_fatal_errors: Number of items with status
            ``Statuses.fatal_error`` at which scheduling is refused.

    Returns:
        ``False`` when at least ``max_fatal_errors`` fatal-error items exist
        for the flow and the hash is unchanged, ``True`` otherwise.
    """
    latest = cls.last_item(flow_name, for_updated=True)
    if not latest or latest.notebook_hash != notebook_hash:
        return True

    # Check limit fatal errors. At most ``max_fatal_errors`` rows are
    # requested, which is enough to know whether the limit was reached.
    fatal_items = (
        cls.select()
        .where(cls.name == flow_name, cls.status == Statuses.fatal_error)
        .order_by(cls.updated_utc.desc())
        .limit(max_fatal_errors)
    )
    if len(fatal_items) < max_fatal_errors:
        return True

    logger.info("Many fatal errors, {} will not be scheduled", flow_name)
    return False
def scheduler():
    """Order new tasks every ``order_interval`` seconds until told to stop.

    The loop ends when ``threading_event`` is set, when ``work_duration``
    seconds have passed (if it is not ``None``), or - once ``orders``
    ordering passes have been made - at the next ordering slot.
    ``self.wake_sleep_func()`` is called on every pass of the loop.
    """
    logger.info("Start scheduler")
    begin = time.time()
    iter_begin = time.time()
    # Start with the interval already "elapsed" so the first pass orders
    # immediately.
    duration = order_interval
    num_order = 0

    while not threading_event.is_set() and (
        work_duration is None or time.time() - begin < work_duration
    ):
        self.wake_sleep_func()

        if duration >= order_interval:
            # A new ordering slot starts now.
            iter_begin = time.time()
            logger.info("Pool info: {}", pools.info_text())
            logger.info(
                "The number of new tasks in the queue: {}",
                task_queue.qsize() + sleeptask_queue.qsize(),
            )
            logger.info(
                "Number of sleeping tasks in the queue: {}",
                len(self.sleeping_task_storage),
            )

            if orders is None or num_order < orders:
                self.fill_queue()
                num_order += 1
            else:
                break

        time.sleep(0 if self.dry_run else 1)
        # Time elapsed since the last ordering slot. This used to be ``+=``,
        # which re-added the whole elapsed time on every pass, so the value
        # grew quadratically and orders fired far earlier than
        # ``order_interval`` seconds.
        duration = time.time() - iter_begin

    logger.info("Stop scheduler")
def retry_error_items(
    cls, flow_name: str, retries: int, retry_delay: int
) -> peewee.ModelSelect:
    """Return failed items of a flow to the ``Statuses.add`` state.

    An item is restarted when all of the following hold: its status is one
    of ``Statuses.error_statuses``, its ``retries`` counter is below
    ``retries``, ``finished_utc`` is null or at least ``retry_delay`` in the
    past, and ``expires_utc`` is null or not yet reached.

    Restarted items get status ``Statuses.add``, ``retries`` incremented by
    one and ``updated_utc`` set to the current UTC time.

    Returns:
        A query selecting the restarted items of the flow, ordered by
        ``worktime`` descending.
    """
    # http://docs.peewee-orm.com/en/latest/peewee/hacks.html?highlight=time%20now#date-math
    # Expression for the earliest moment at which an item may be restarted.
    restart_not_before = cls.finished_utc.to_timestamp() + retry_delay

    delay_passed = (pendulum.now("UTC").timestamp() >= restart_not_before) | (
        cls.finished_utc.is_null()
    )
    # TODO: write to the info field that the flow will not be restarted
    #  because its execution deadline has expired. Otherwise it is unclear
    #  why such items are not restarted.
    not_expired = (
        pendulum.now("UTC").timestamp() <= cls.expires_utc.to_timestamp()
    ) | (cls.expires_utc.is_null())

    failed_items = cls.select().where(
        cls.name == flow_name,
        cls.status.in_(Statuses.error_statuses),
        cls.retries < retries,
        delay_passed,
        not_expired,
    )
    worktimes = [failed.worktime for failed in failed_items]

    if worktimes:
        # TODO: recreate items
        new_values = {
            cls.status.name: Statuses.add,
            cls.retries.name: cls.retries + 1,
            cls.updated_utc.name: pendulum.now("UTC"),
        }
        restart = cls.update(**new_values).where(
            cls.name == flow_name, cls.worktime.in_(worktimes)
        )
        restart.execute()
        logger.info(
            "Restart error items for {}, worktimes = {}", flow_name, worktimes
        )

    restarted = cls.select().where(
        cls.name == flow_name, cls.worktime.in_(worktimes)
    )
    return restarted.order_by(cls.worktime.desc())
def sync_executor(*, interval: int = 20, orders: int = None, dry_run: bool = False):
    """Order flow tasks and execute them in the calling thread, in a loop.

    Each pass executes every task produced by
    ``ordering_flow_tasks(dry_run=dry_run)``. Passes start ``interval``
    seconds apart; if a pass takes longer than ``interval``, the next one
    starts right away.

    Args:
        interval: Target number of seconds between the starts of two passes.
        orders: Number of passes to make. A falsy value (``None`` or ``0``)
            means the loop never ends.
        dry_run: Forwarded to ``ordering_flow_tasks``.
    """
    count_orders = 0

    while True:
        begin = time.time()

        logger.info("Ordering flow tasks")
        for task in ordering_flow_tasks(dry_run=dry_run):
            task.execute()

        count_orders += 1
        if orders and count_orders >= orders:
            break

        # Sleep only for the rest of the interval. The earlier bookkeeping
        # slept *after* a pass and accumulated elapsed time with ``+=``,
        # which made passes run in back-to-back pairs instead of once per
        # interval.
        remaining = interval - (time.time() - begin)
        if remaining > 0:
            time.sleep(remaining)