Пример #1
0
 def text(self, regex=None, strip=True, separator=""):
     """Return the root's text, optionally validated against a regex.

     Args:
         regex: Pattern handed to ``re.match``; ``None`` disables the check.
             Because ``re.match`` is used, the pattern only has to match at
             the beginning of the text.
         strip: Forwarded as the second argument of the root's ``get_text``.
         separator: Forwarded as the first argument of the root's ``get_text``.

     Returns:
         The extracted text, or the configured default when there is no root.

     Raises:
         errors.TaskError: If there is no root and no default was set, or if
             the text fails the regex check.
     """
     node = self._root
     if node is None:
         if self._default is not ArgDefault:
             # A default is returned as-is; its format is never validated.
             return self._default
         raise errors.TaskError(f"未找到:{repr(self)}")

     content = node.get_text(separator, strip)
     if regex is not None and not re.match(regex, content):
         raise errors.TaskError(f"未通过正则校验:{regex}")
     return content
Пример #2
0
 def text(self, regex=None, strip=True):
     """Return the root's ``.text``, optionally validated against a regex.

     Args:
         regex: Pattern handed to ``re.match``; ``None`` disables the check.
             Because ``re.match`` is used, the pattern only has to match at
             the beginning of the text.
         strip: When truthy, surrounding whitespace is removed with
             ``str.strip`` before validation.

     Returns:
         The text (``''`` when the root's ``.text`` is ``None``), or the
         configured default when there is no root.

     Raises:
         errors.TaskError: If there is no root and no default was set, or if
             the text fails the regex check.
     """
     node = self._root
     if node is None:
         if self._default is not ArgDefault:
             # A default is returned as-is; its format is never validated.
             return self._default
         raise errors.TaskError(f"未找到{repr(self)}")

     content = node.text
     if content is None:
         # Normalise a missing text value to an empty string.
         content = ''
     if strip:
         content = content.strip()

     if regex is not None and not re.match(regex, content):
         raise errors.TaskError(f"未通过正则校验:{regex}")
     return content
Пример #3
0
 def html(self):
     """Return ``str()`` of the root, or the default when there is no root.

     Raises:
         errors.TaskError: If there is no root and no default was set.
     """
     node = self._root
     if node is not None:
         return str(node)

     if self._default is not ArgDefault:
         # A default is returned as-is; its format is never validated.
         return self._default
     raise errors.TaskError(f"未找到:{repr(self)}")
Пример #4
0
    def __init__(self, root, pattern="/*"):
        """Wrap an lxml element, or parse a string into one.

        Args:
            root: An ``etree._Element``, ``None``, or a string that is
                parsed with ``etree.HTML``.
            pattern: Stored on the instance as ``_pattern``.

        Raises:
            errors.TaskError: If ``root`` is of any other type.
        """
        if root is None or isinstance(root, etree._Element):
            resolved = root
        elif isinstance(root, str):
            resolved = etree.HTML(root)
        else:
            raise errors.TaskError(f"不支持从'{type(root)}'类型构造XPath")

        self._root = resolved
        self._pattern = pattern
        # Sentinel meaning "no default value has been configured".
        self._default = ArgDefault
Пример #5
0
    def __init__(self, root, pattern=":root"):
        """Wrap a BeautifulSoup tag, or parse a string into one.

        Args:
            root: An ``element.Tag``, ``None``, or a string that is parsed
                with ``BeautifulSoup(root, "lxml")``.
            pattern: Stored on the instance as ``_pattern``.

        Raises:
            errors.TaskError: If ``root`` is of any other type.
        """
        if root is None or isinstance(root, element.Tag):
            resolved = root
        elif isinstance(root, str):
            resolved = BeautifulSoup(root, "lxml")
        else:
            raise errors.TaskError(f"不支持从'{type(root)}'类型构造CSS")

        self._root = resolved
        self._pattern = pattern
        # Sentinel meaning "no default value has been configured".
        self._default = ArgDefault
Пример #6
0
def execute(task: Task):
    """
    Run a task through its hooks and turn the errors raised along the way
    into queue/log actions.

    The hooks run in a fixed order: ``on_download`` (bracketed by the
    ``on_download`` / ``on_download_ok`` tracking counters), ``on_parse``,
    ``on_link``, ``on_save`` and ``on_finish``.

    Returns:
        links: {priority: urls}. A list returned by ``on_link`` is filed
        under priority 3 and ``None`` becomes an empty dict. An empty dict
        is also returned whenever one of the handlers below runs.

    Error handling:
        TaskFinish: ``on_finish`` is still called.
        TaskBreak: the URL is re-inserted into the queue with the priority
            carried by the exception.
        TaskError: reported to the queue under the exception's class name.
        Exception: offered to ``task.on_error`` first; if that returns a
            falsy value the error is reported as "unknown" and the
            traceback is printed.

    Exceptions raised by the handlers themselves are not caught here.
    """
    try:
        task.tracking.incr('on_download')
        task.response = task.on_download()
        task.tracking.incr('on_download_ok')
        task.result = task.on_parse()

        links = task.on_link()
        if links is None:
            links = {}
        elif isinstance(links, list):
            links = {3: links}
        elif not isinstance(links, dict):
            # Raised inside the try so the TaskError handler below reports it.
            raise errors.TaskError(f"on_link返回值应是list或dict型,而非{type(links)}")

        task.on_save()
        task.on_finish()
        return links
    except errors.TaskFinish:
        logger.debug("TaskFinish", task.url)
        task.on_finish()
    except errors.TaskBreak as brk:
        logger.debug("TaskBack", brk.priority, task.url)
        task._queue.insert(task.url, brk.priority)
    except errors.TaskError as err:
        task._queue.report_error(type(err).__name__, task.url)
        logger.warning("Task报告的异常", str(err), task.url)
    except Exception as err:
        # Give the task a chance to handle the error before reporting it.
        if not task.on_error(err):
            task._queue.report_error("unknown", task.url)
            logger.error("Task未处理的异常", "unknown", task.url)
            traceback.print_exc()
    # Every handled-error path yields no links.
    return {}
Пример #7
0
 def interval(self, value):
     """Store ``value`` as the spider's "interval" field.

     Raises:
         errors.TaskError: If ``value`` is not an ``int`` or ``float``.
     """
     if isinstance(value, (int, float)):
         self._spider.set_field("interval", value)
         return
     raise errors.TaskError("interval应为int或float型")
Пример #8
0
 def timeout(self, value):
     """Store ``value`` as the spider's "timeout" field.

     Raises:
         errors.TaskError: If ``value`` is not an ``int`` or ``float``.
     """
     if isinstance(value, (int, float)):
         self._spider.set_field("timeout", value)
         return
     raise errors.TaskError("timeout应为int或float型")