示例#1
0
    def create_missing_items(
        cls,
        flow_name: str,
        start_time: dt.datetime,
        end_time: dt.datetime,
        interval_timedelta: dt.timedelta,
    ) -> list["FlowItem"]:
        """Make sure an item exists for every worktime of the given range.

        Walks the datetimes yielded by ``iter_range_datetime(start_time,
        end_time, interval_timedelta)``. For each one an item is created;
        when the insert raises ``peewee.IntegrityError`` the already stored
        item with the same flow name and worktime is fetched instead.

        Returns:
            One item per yielded datetime, in iteration order, whether it
            was just created or already present.
        """
        collected = []
        worktimes = iter_range_datetime(start_time, end_time,
                                        interval_timedelta)
        for moment in worktimes:
            fields = {cls.name.name: flow_name, cls.worktime.name: moment}
            try:
                record = cls.create(**fields)
            except peewee.IntegrityError:
                # The insert was rejected; reuse the stored row for this
                # flow name and worktime.
                record = cls.get(cls.name == flow_name,
                                 cls.worktime == moment)
            else:
                logger.info("Created missing worktime {} for {}", moment,
                            flow_name)
            collected.append(record)

        return collected
示例#2
0
def ordering_etl_flow_tasks(*,
                            dry_run: bool = False
                            ) -> Iterator[ExecutorIterationTask]:
    """Prepare flow function to be sent to the queue and executed.

    Iterates over the active notebook files, skips (and logs) the ones that
    fail validation, and yields one ETL task for every period returned by
    ``Work(notebook).iter_period_for_execute()``.

    Args:
        dry_run: When true, only notebooks whose provider is ``"fakedata"``
            are ordered; the flag is also forwarded to ``flow.task``.

    Yields:
        The task built by ``ETLOperator(notebook).task(...)``. Each task is
        yielded inside the ``prepare_items_for_order`` context opened for
        its period.
    """
    # TODO: get rid of this function; rework it so that a single function
    #  handles ordering.

    from flowmaster.operators.etl.core import ETLOperator
    from flowmaster.operators.etl.policy import ETLNotebook

    for name in iter_active_notebook_filenames():
        validate, text, notebook_dict, notebook, error = get_notebook(name)
        notebook: ETLNotebook

        # Check validation BEFORE touching the notebook: a notebook that
        # failed validation may not be a usable object, and reading
        # ``notebook.provider`` first would raise instead of logging.
        if not validate:
            logger.error("ValidationError: '{}': {}", name, error)
            continue

        if dry_run and notebook.provider != "fakedata":
            continue

        work = Work(notebook)
        for start_period, end_period in work.iter_period_for_execute():
            flow = ETLOperator(notebook)
            etl_flow_task = flow.task(start_period,
                                      end_period,
                                      dry_run=dry_run)

            with prepare_items_for_order(flow, start_period, end_period):
                logger.info("Order ETL flow [{}]: {} {}", notebook.name,
                            start_period, end_period)
                yield etl_flow_task
示例#3
0
    def fill_queue(self) -> None:
        """Put every task produced by ``self.order_task_func()`` on the queue.

        The whole ordering pass runs while ``threading_lock`` is held. The
        number of tasks put on ``task_queue`` is logged afterwards.
        """
        queued = 0
        with threading_lock:
            for queued, task in enumerate(self.order_task_func(), start=1):
                task_queue.put(task)

        logger.info("Count ordering task: {}", queued)
示例#4
0
    def recreate_prev_items(
        cls,
        flow_name: str,
        worktime: dt.datetime,
        offset_periods: Union[pydantic.PositiveInt,
                              list[pydantic.NegativeInt]],
        interval_timedelta: dt.timedelta,
    ) -> Optional[list["FlowItem"]]:
        """Delete and re-create the items of worktimes preceding ``worktime``.

        Args:
            flow_name: Name of the flow whose items are recreated.
            worktime: Reference worktime the offsets are applied to.
            offset_periods: Either a positive int ``n``, expanded to the
                offsets ``-1 .. -(n - 1)``, or an explicit list of negative
                offsets. Each offset is multiplied by ``interval_timedelta``
                and added to ``worktime``.
            interval_timedelta: Length of one period.

        Returns:
            The items returned by ``create_missing_items`` for the affected
            worktimes, or ``None`` when the flow has no first item.

        Raises:
            ValueError: If ``offset_periods`` is a non-positive int, or a
                list containing a non-negative value.
        """
        if isinstance(offset_periods, int):
            if offset_periods <= 0:
                raise ValueError("Only positive Int")
            offset_periods = [-i for i in range(1, offset_periods)]
        elif not all(i < 0 for i in offset_periods):
            # Explicit raise instead of ``assert``: asserts are stripped
            # under ``python -O`` and would let bad offsets through.
            raise ValueError("Only negative Int")

        first_item = cls.first_item(flow_name)
        if not first_item:
            return None

        # Never reach back before the first known worktime of the flow.
        shifted = (worktime + (interval_timedelta * delta)
                   for delta in offset_periods)
        worktime_list = [dt_ for dt_ in shifted if dt_ >= first_item.worktime]

        cls.delete().where(cls.name == flow_name,
                           cls.worktime.in_(worktime_list)).execute()

        items = []
        for date1, date2 in iter_period_from_range(worktime_list,
                                                   interval_timedelta):
            items.extend(
                cls.create_missing_items(
                    flow_name,
                    start_time=date1,
                    end_time=date2,
                    interval_timedelta=interval_timedelta,
                ))

        logger.info(
            "Recreated items to restart flows {} for previous worktimes {}",
            flow_name,
            worktime_list,
        )

        return items
示例#5
0
    def get_items_for_execute(
        cls,
        flow_name: str,
        worktime: dt.datetime,
        start_time: dt.datetime,
        interval_timedelta: dt.timedelta,
        keep_sequence: bool,
        retries: int,
        retry_delay: int,
        notebook_hash: str,
        max_fatal_errors: int,
        update_stale_data: Optional[Union[pydantic.PositiveInt,
                                          list[pydantic.NegativeInt]]] = None,
    ) -> Optional[list["FlowItem"]]:
        """Bring the flow's items up to date and select those ready to run.

        Nothing is done and ``None`` is returned when ``allow_execute_flow``
        refuses the flow. Otherwise, in order:

        1. a first item is created if the flow has none;
        2. the next item is created via ``create_next_execute_item``; when
           that yields an item and ``update_stale_data`` is set, previous
           items are recreated;
        3. with ``keep_sequence``, missing items between ``start_time`` and
           ``worktime`` are created;
        4. error items eligible for a retry are reset.

        Returns:
            The select query for items of this flow with status
            ``Statuses.add`` whose ``expires_utc`` is null or not yet
            passed, ordered by worktime descending.
        """
        allowed = cls.allow_execute_flow(flow_name,
                                         notebook_hash=notebook_hash,
                                         max_fatal_errors=max_fatal_errors)
        if not allowed:
            return None

        if not cls.exists(flow_name):
            first_fields = {
                cls.name.name: flow_name,
                cls.worktime.name: worktime,
            }
            cls.create(**first_fields)
            logger.info("Created first item for {}, worktime {}",
                        flow_name, worktime)

        next_item = cls.create_next_execute_item(flow_name,
                                                 interval_timedelta, worktime)
        if next_item and update_stale_data:
            # When creating the next item, elements are created to update
            # the data for the past dates.
            cls.recreate_prev_items(flow_name, worktime, update_stale_data,
                                    interval_timedelta)

        if keep_sequence:
            cls.create_missing_items(flow_name, start_time, worktime,
                                     interval_timedelta)

        cls.retry_error_items(flow_name, retries, retry_delay)

        not_expired = ((pendulum.now("UTC").timestamp() <=
                        cls.expires_utc.to_timestamp())
                       | (cls.expires_utc.is_null()))
        ready = cls.select().where(
            cls.name == flow_name,
            cls.status == Statuses.add,
            not_expired,
        )
        return ready.order_by(cls.worktime.desc())
示例#6
0
 def worker(self) -> None:
     """Run tasks from the queue until ``threading_event`` is set.

     Each task obtained from ``self.get_task()`` is consumed to exhaustion.
     A task that raises ``SleepException`` or ``PoolOverflowingException``
     is stored in ``self.sleeping_task_storage``; any other ordinary
     exception is logged. ``task_done()`` is always called on the queue
     the task came from.
     """
     logger.info("Start worker")
     try:
         while not threading_event.is_set():
             queue_and_task = self.get_task()
             if not queue_and_task:
                 continue

             queue_, task = queue_and_task
             try:
                 # Exhaust the task iterator; iterating it does the work.
                 list(task)
             except (SleepException, PoolOverflowingException):
                 self.sleeping_task_storage.append(task)
             except Exception as exc:
                 logger.error("Fail task: {}", exc)
             finally:
                 queue_.task_done()
     except Exception:
         # Only ordinary errors are swallowed here; KeyboardInterrupt and
         # SystemExit propagate instead of being silently absorbed.
         logger.exception("Fail worker")
     finally:
         logger.info("Stop worker")
示例#7
0
 def create_next_execute_item(
     cls,
     flow_name: str,
     interval_timedelta: dt.timedelta,
     worktime: dt.datetime,
 ) -> Optional["FlowItem"]:
     """Create the item that follows the flow's last item, if it is due.

     Returns:
         The created item, or ``None`` when ``is_create_next`` says no
         item is due or when the insert raises ``peewee.IntegrityError``.
     """
     if not cls.is_create_next(flow_name, interval_timedelta, worktime):
         return None

     previous = cls.last_item(flow_name, for_updated=True)
     next_worktime = previous.worktime + interval_timedelta
     fields = {
         cls.name.name: flow_name,
         cls.worktime.name: next_worktime,
     }
     try:
         created = cls.create(**fields)
     except peewee.IntegrityError:
         return None

     logger.info("Created next worktime {} for {}", next_worktime,
                 flow_name)
     return created
示例#8
0
    def allow_execute_flow(cls,
                           flow_name: str,
                           notebook_hash: str,
                           *,
                           max_fatal_errors: int = 3) -> bool:
        """Tell whether the flow may be scheduled.

        The flow is always allowed when it has no last item or when the
        last item's ``notebook_hash`` differs from the given one. Otherwise
        it is allowed only while fewer than ``max_fatal_errors`` items with
        status ``Statuses.fatal_error`` exist for it.
        """
        latest = cls.last_item(flow_name, for_updated=True)
        if not (latest and latest.notebook_hash == notebook_hash):
            return True

        # Check limit fatal errors.
        fatal_items = cls.select().where(
            cls.name == flow_name,
            cls.status == Statuses.fatal_error,
        )
        fatal_items = fatal_items.order_by(cls.updated_utc.desc())
        fatal_items = fatal_items.limit(max_fatal_errors)

        if len(fatal_items) < max_fatal_errors:
            return True

        logger.info("Many fatal errors, {} will not be scheduled", flow_name)
        return False
示例#9
0
        def scheduler():
            """Order new tasks periodically until stopped or a limit is hit.

            The loop ends when ``threading_event`` is set, when
            ``work_duration`` (if not None) seconds have passed since the
            start, or when an ordering pass becomes due after ``orders``
            passes have already been made. ``order_interval``,
            ``work_duration`` and ``orders`` come from the enclosing scope.
            """
            logger.info("Start scheduler")
            begin = time.time()
            iter_begin = time.time()
            # Start at the threshold so the very first pass orders tasks.
            duration = order_interval
            num_order = 0

            while not threading_event.is_set() and (
                    work_duration is None
                    or time.time() - begin < work_duration):
                self.wake_sleep_func()

                if duration >= order_interval:
                    # An ordering pass is due: reset the interval tracking.
                    duration = 0
                    iter_begin = time.time()

                    logger.info("Pool info: {}", pools.info_text())
                    logger.info(
                        "The number of new tasks in the queue: {}",
                        task_queue.qsize() + sleeptask_queue.qsize(),
                    )
                    logger.info(
                        "Number of sleeping tasks in the queue: {}",
                        len(self.sleeping_task_storage),
                    )

                    if orders is None or num_order < orders:
                        self.fill_queue()
                        num_order += 1
                    else:
                        # The requested number of ordering passes is done.
                        break

                # No pause between passes in dry-run mode.
                time.sleep(0 if self.dry_run else 1)
                # NOTE(review): this ADDS the full time elapsed since
                # iter_begin on every pass, so `duration` grows faster than
                # wall-clock time and the threshold is reached before
                # `order_interval` real seconds have passed. Confirm whether
                # `duration = time.time() - iter_begin` was intended.
                duration += time.time() - iter_begin

            logger.info("Stop scheduler")
示例#10
0
    def retry_error_items(cls, flow_name: str, retries: int,
                          retry_delay: int) -> peewee.ModelSelect:
        """Reset error items of the flow that are eligible for a retry.

        An item is eligible when its status is one of
        ``Statuses.error_statuses``, its ``retries`` counter is below
        ``retries``, its ``finished_utc`` is null or at least
        ``retry_delay`` behind the current UTC timestamp, and its
        ``expires_utc`` is null or not yet passed. Eligible items get the
        status ``Statuses.add``, an incremented ``retries`` counter and a
        fresh ``updated_utc``.

        Returns:
            A select query over the items with the reset worktimes, ordered
            by worktime descending.
        """
        # http://docs.peewee-orm.com/en/latest/peewee/hacks.html?highlight=time%20now#date-math
        # Timestamp expression of the earliest moment a restart is allowed.
        restart_allowed_from = cls.finished_utc.to_timestamp() + retry_delay

        delay_passed = (
            (pendulum.now("UTC").timestamp() >= restart_allowed_from)
            | (cls.finished_utc.is_null()))
        # TODO: Record in the info field that the flow will not be restarted
        #  because it has expired. Otherwise it is unclear why items are
        #  not restarted.
        not_expired = ((pendulum.now("UTC").timestamp() <=
                        cls.expires_utc.to_timestamp())
                       | (cls.expires_utc.is_null()))

        candidates = cls.select().where(
            cls.name == flow_name,
            cls.status.in_(Statuses.error_statuses),
            cls.retries < retries,
            delay_passed,
            not_expired,
        )
        worktimes = [candidate.worktime for candidate in candidates]

        if worktimes:
            # TODO: recreate items
            changes = {
                cls.status.name: Statuses.add,
                cls.retries.name: cls.retries + 1,
                cls.updated_utc.name: pendulum.now("UTC"),
            }
            reset_query = cls.update(**changes).where(
                cls.name == flow_name, cls.worktime.in_(worktimes))
            reset_query.execute()

            logger.info("Restart error items for {}, worktimes = {}",
                        flow_name, worktimes)

        restarted = cls.select().where(cls.name == flow_name,
                                       cls.worktime.in_(worktimes))
        return restarted.order_by(cls.worktime.desc())
示例#11
0
def sync_executor(*,
                  interval: int = 20,
                  orders: int = None,
                  dry_run: bool = False):
    """Order flow tasks and execute them one by one in the calling thread.

    Every pass executes all tasks yielded by ``ordering_flow_tasks``. The
    loop stops after ``orders`` passes; with a falsy ``orders`` it never
    stops on its own.

    Args:
        interval: Threshold, in seconds, for the tracked duration.
        orders: Number of passes to make before returning.
        dry_run: Forwarded to ``ordering_flow_tasks``.
    """
    window_started = time.time()
    # Start at the threshold so the first pass does not sleep.
    elapsed = interval
    completed = 0

    while True:
        logger.info("Ordering flow tasks")

        for task in ordering_flow_tasks(dry_run=dry_run):
            task.execute()

        # NOTE(review): a pass sleeps only while the tracked duration is
        # below `interval`; otherwise it just resets the tracking. Confirm
        # this cadence is the intended one.
        if elapsed < interval:
            time.sleep(interval - elapsed)
            elapsed += time.time() - window_started
        else:
            elapsed = 0
            window_started = time.time()

        completed += 1
        if orders and completed >= orders:
            break