Example #1
 def kill(self, session=None):
     """Kill the job."""
     job = session.query(BaseJob).filter(BaseJob.id == self.id).first()
     job.end_date = timezone.system_now()
     # Run the kill hook
     try:
         self.on_kill()
     except Exception as e:
         self.log.error('on_kill() method failed: {}'.format(e))
     # Persist the job's end time
     session.merge(job)
     session.commit()
     # Raise an exception so the caller sees the shutdown
     raise XToolException("Job shut down externally.")
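
A minimal caller sketch (hedged: `job` and `session` stand for a BaseJob instance and an open SQLAlchemy session; neither name comes from the snippet). Because kill() always ends by raising, the exception is the normal signal that shutdown completed:

    # kill() stamps end_date, runs on_kill(), commits, then raises.
    try:
        job.kill(session=session)
    except XToolException:
        pass  # expected: "Job shut down externally."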
Example #2
 def start(self):
     """Launch a file-processing child process and put its results
     on the self._result_queue result queue.
     """
     thread_name = "{}_{}-Process".format(self.__class__.__name__,
                                          self._instance_id)
     self._process = multiprocessing.Process(target=self._handler,
                                             args=(self._result_queue,
                                                   self.file_path,
                                                   self.args, self.kwargs),
                                             name=thread_name)
     self._process.start()
     # Record the child process start time
     self._start_time = timezone.system_now()
     return self._process
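
For illustration, a self-contained sketch of the shape the target callable needs: it runs in the child process and pushes its output onto the queue. The `handler` function and its placeholder body are assumptions, not the project's actual `_handler`:

    import multiprocessing

    def handler(result_queue, file_path, args, kwargs):
        """Runs in the child; the parent reads results off the queue."""
        values = [file_path]  # placeholder for the real parsing work
        result_queue.put(values)

    if __name__ == '__main__':
        q = multiprocessing.Queue()
        p = multiprocessing.Process(target=handler, args=(q, "a.py", (), {}))
        p.start()
        print(q.get())  # -> ['a.py']
        p.join()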
Example #3
    def heartbeat(self):
        # Fetch the latest job state on every heartbeat
        with create_session() as session:
            # one() returns the single matching row, or raises:
            # sqlalchemy.orm.exc.NoResultFound if there is no result,
            # sqlalchemy.orm.exc.MultipleResultsFound if there is more than one.
            job = session.query(BaseJob).filter_by(id=self.id).one()
            # Detach the job object: remove the association with any
            # session and remove its "identity key"
            make_transient(job)
            session.commit()

        # If the job is in SHUTDOWN state, kill it
        if job.state == State.SHUTDOWN:
            # kill() raises XToolException
            self.kill()

        # Compute how long to sleep until the next heartbeat, then wait
        sleep_for = 0
        if job.latest_heartbeat:
            sleep_for = max(
                0, self.heartrate -
                (timezone.system_now() - job.latest_heartbeat).total_seconds())
        sleep(sleep_for)

        # Reconnect to the DB after sleeping
        with create_session() as session:
            # Update the latest heartbeat time
            job = session.query(BaseJob).filter(BaseJob.id == self.id).first()
            job.latest_heartbeat = timezone.system_now()
            session.merge(job)
            session.commit()
            # Run the heartbeat callback
            self.heartbeat_callback(session=session)
            self.log.debug('[heartbeat]')
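
The sleep keeps heartbeats roughly `heartrate` seconds apart no matter how long the surrounding work took. Worked arithmetic with illustrative numbers:

    # heartrate = 5 s, and 3.2 s have passed since the last heartbeat:
    sleep_for = max(0, 5 - 3.2)  # sleep 1.8 s
    # If more than 5 s have already passed, max(0, ...) clamps to 0
    # and the next heartbeat runs immediately.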
Example #4
    def run(self):
        """Persist a new job record and run it; on completion the state
        goes from RUNNING to SUCCESS."""
        with create_session() as session:
            # Insert the job
            self.state = State.RUNNING
            session.add(self)
            session.commit()
            id_ = self.id
            make_transient(self)
            self.id = id_

            # Run the job
            self._execute()

            # After the job finishes, record its end time and state
            self.end_date = timezone.system_now()
            self.state = State.SUCCESS
            session.merge(self)
            session.commit()
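
`_execute()` is the hook subclasses override; run() supplies the persistence around it. A minimal subclass sketch (the class and its body are hypothetical):

    class SleepJob(BaseJob):
        """Toy job: heartbeats a few times, then run() records SUCCESS."""

        def _execute(self):
            for _ in range(3):
                self.heartbeat()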
Example #5
 def __init__(self,
              executor,
              heartrate=5,
              max_tis_per_query=512,
              *args,
              **kwargs):
     # Hostname of the current machine
     self.hostname = get_hostname()
     # Executor: pulls tasks off the task queue and runs them
     self.executor = executor
     # Class name of the executor
     self.executor_class = executor.__class__.__name__
     # Set the start time and the initial heartbeat time
     self.start_date = timezone.system_now()
     self.latest_heartbeat = self.start_date
     # Heartbeat interval of the job, used to decide whether it is alive
     self.heartrate = heartrate
     # OS user running the scheduler process on this machine
     self.unixname = getpass.getuser()
     # Task instance states are updated in batches; to keep the generated
     # SQL from growing too long, cap how many task instances go per batch
     self.max_tis_per_query = max_tis_per_query
     super(BaseJob, self).__init__(*args, **kwargs)
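
A construction sketch, assuming the parent __init__ needs no further required arguments (NoopExecutor is a stand-in; any object works, since only the instance and its class name are recorded):

    class NoopExecutor(object):
        """Stand-in executor for illustration."""

    job = BaseJob(executor=NoopExecutor(), heartrate=10, max_tis_per_query=256)
    assert job.executor_class == 'NoopExecutor'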
Example #6
    def heartbeat(self):
        """Heartbeat.

        - Harvest processors that have finished
        - Queue files for processing
        - Start processors from the queue
        """
        # Finished file processors
        # :type : dict[unicode, AbstractFileProcessor]
        finished_processors = {}
        # File processors that are still running
        # :type : dict[unicode, AbstractFileProcessor]
        running_processors = {}

        # Walk over all file processors
        result = []
        for file_path, processor in self._processors.items():
            if processor.done:
                self.log.info("Processor for %s finished", file_path)
                # Collect the finished processor
                finished_processors[file_path] = processor
                now = timezone.system_now()
                # Record how long the processor ran
                self._last_runtime[file_path] = (
                    now - processor.start_time).total_seconds()
                # Record when the processor finished
                self._last_finish_time[file_path] = now
                # Count how many times the file has been processed
                self._run_count[file_path] += 1
                # Collect the finished processor's results
                if processor.result is None:
                    self.log.warning(
                        "Processor for %s exited with return code %s.",
                        processor.file_path, processor.exit_code)
                else:
                    for value in processor.result:
                        result.append(value)
            else:
                # Track processors that are still running
                running_processors[file_path] = processor

        # Drop the finished processors on every heartbeat
        self._processors = running_processors

        self.log.debug("%s/%s scheduler processes running",
                       len(self._processors), self._parallelism)

        self.log.debug("%s file paths queued for processing",
                       len(self._file_path_queue))

        # If the queue is empty, work out which files to enqueue
        if not self._file_path_queue:
            # Some processors may still be running; their files are
            # handled on a later heartbeat
            file_paths_in_progress = self._processors.keys()

            # Files whose next run is not due yet
            file_paths_recently_processed = []
            longest_parse_duration = 0
            now = timezone.system_now()

            # Walk over the files that need processing
            for file_path in self._file_paths:
                # When did this file's processor last finish?
                last_finish_time = self.get_last_finish_time(file_path)
                # The file has been processed before
                if last_finish_time is not None:
                    duration = now - last_finish_time
                    # Not due yet: skip it this round
                    if duration.total_seconds() < self._process_file_interval:
                        file_paths_recently_processed.append(file_path)
                    # Track the longest elapsed time across all files
                    longest_parse_duration = max(duration.total_seconds(),
                                                 longest_parse_duration)

            # How long to sleep this heartbeat
            sleep_length = max(
                self._min_file_parsing_loop_time - longest_parse_duration, 0)
            if sleep_length > 0:
                self.log.debug(
                    "Sleeping for %.2f seconds to prevent excessive "
                    "logging", sleep_length)
                time.sleep(sleep_length)

            # Files whose run count has reached the maximum threshold
            files_paths_at_run_limit = [
                file_path for file_path, num_runs in self._run_count.items()
                if num_runs == self._max_runs
            ]

            # Files to enqueue (newly discovered files enter here), excluding:
            # - files currently being processed
            # - files processed too recently
            # - files whose run count has hit the threshold
            files_paths_to_queue = list(
                set(self._file_paths) - set(file_paths_in_progress) -
                set(file_paths_recently_processed) -
                set(files_paths_at_run_limit))

            # Log the processors that are still running
            for file_path, processor in self._processors.items():
                self.log.debug(
                    "File path %s is still being processed (started: %s)",
                    processor.file_path, processor.start_time.isoformat())
            self.log.debug("Queuing the following files for processing:\n\t%s",
                           "\n\t".join(files_paths_to_queue))

            # Enqueue the files
            self._file_path_queue.extend(files_paths_to_queue)

        # Respect the processor parallelism limit
        while (self._parallelism - len(self._processors) > 0
               and self._file_path_queue):
            # Pop a file off the queue
            file_path = self._file_path_queue.pop(0)
            # Create a file-processor child process
            processor = self._processor_factory(file_path)
            # Start the child process
            processor.start()
            self.log.info(
                "Started a process (PID: %s) to generate tasks for %s",
                processor.pid, file_path)
            # Track the child process
            self._processors[file_path] = processor

        # Count this heartbeat
        self._run_count[self._heart_beat_key] += 1

        # Return the finished processors' results
        return result
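
The queueing step is plain set arithmetic. A self-contained worked example with illustrative paths:

    all_files   = {"a.py", "b.py", "c.py", "d.py"}  # self._file_paths
    in_progress = {"a.py"}  # still has a live processor
    recent      = {"b.py"}  # finished less than _process_file_interval ago
    at_limit    = {"c.py"}  # run count reached _max_runs
    to_queue = list(all_files - in_progress - recent - at_limit)
    print(to_queue)  # -> ['d.py']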
Example #7
 def get_runtime(self, file_path):
     """Return how long the file's processor has been running, in seconds."""
     if file_path in self._processors:
         return (timezone.system_now() - self._processors[file_path].start_time)\
             .total_seconds()
     return None
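
A usage sketch (`manager` and the path are illustrative names, not from the snippet); None means no processor is currently running for that file:

    runtime = manager.get_runtime("/dags/example.py")
    if runtime is not None:
        print("processing for %.1f s so far" % runtime)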
Example #8
def date_range(start_date, end_date=None, num=None, delta=None):
    """
    Get a set of dates as a list based on a start, end and delta; delta
    can be something that can be added to ``datetime.datetime``
    or a cron expression as a ``str``

    :param start_date: anchor date to start the series from
    :type start_date: datetime.datetime
    :param end_date: right boundary for the date range
    :type end_date: datetime.datetime
    :param num: alternatively to end_date, you can specify the number of
        entries you want in the range. This number can be negative,
        output will always be sorted regardless
    :type num: int

    >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta=timedelta(1))
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0)]
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta='0 0 * * *')
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), datetime.datetime(2016, 1, 3, 0, 0)]
    >>> date_range(datetime(2016, 1, 1), datetime(2016, 3, 3), delta="0 0 0 * *")
    [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0), datetime.datetime(2016, 3, 1, 0, 0)]
    """
    if not delta:
        return []
    if end_date and start_date > end_date:
        raise Exception("Wait. start_date needs to be before end_date")
    if end_date and num:
        raise Exception("Wait. Either specify end_date OR num")
    # The end date defaults to the current time
    if not end_date and not num:
        end_date = timezone.system_now()

    delta_iscron = False
    tz = start_date.tzinfo
    if isinstance(delta, six.string_types):
        delta_iscron = True
        # Strip the timezone from the start date
        if tz is not None:
            start_date = timezone.make_naive(start_date, tz)
        cron = croniter(delta, start_date)
    elif isinstance(delta, timedelta):
        delta = abs(delta)

    dates = []
    if end_date:
        while start_date <= end_date:
            if timezone.is_naive(start_date):
                dates.append(timezone.make_aware(start_date, tz))
            else:
                dates.append(start_date)

            if delta_iscron:
                start_date = cron.get_next(datetime)
            else:
                start_date += delta
    else:
        for _ in range(abs(num)):
            if timezone.is_naive(start_date):
                # If the start date has no timezone info, attach the given
                # one; if none was given, the system timezone is used
                dates.append(timezone.make_aware(start_date, tz))
            else:
                dates.append(start_date)

            if delta_iscron:
                if num > 0:
                    start_date = cron.get_next(datetime)
                else:
                    start_date = cron.get_prev(datetime)
            else:
                if num > 0:
                    start_date += delta
                else:
                    start_date -= delta
    return sorted(dates)
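
The doctests above only exercise end_date; with num the series can also run backwards from the anchor and is still returned sorted. A sketch (make_aware attaches a timezone, so only the naive date parts are shown):

    # Three entries counted backwards from the anchor date:
    date_range(datetime(2016, 1, 3), num=-3, delta=timedelta(days=1))
    # -> Jan 1, Jan 2, Jan 3 of 2016, ascending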
Example #9
 def is_alive(self):
     """Return whether the job is still alive."""
     # If the job has not reported a heartbeat for more than 2.1
     # heartbeat intervals, consider it dead.
     return ((timezone.system_now() - self.latest_heartbeat).total_seconds() <
             (self.heartrate * 2.1))
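
The 2.1 multiplier allows one full missed heartbeat plus a 10% grace period. Worked arithmetic with the default heartrate from Example #5:

    heartrate = 5               # default from __init__
    window = heartrate * 2.1    # 10.5 s liveness window
    # silent for 8 s  -> 8 < 10.5  -> alive
    # silent for 12 s -> 12 < 10.5 is False -> dead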