def _parse_detail_page(self, item):
    """Download the detail page of ``item`` and copy the parsed fields onto it.

    ``item`` is modified in place:

    * No (or empty) ``'url'``: blank ``'sc'``, ``'issue_time'`` and
      ``'title'`` defaults are written and nothing is downloaded.
    * HTTP 404: the ``'url'`` key is removed from ``item`` and parsing is
      skipped.
    * Otherwise the response body is decoded and every pattern in
      ``item['_patterns']['detail_pattern']`` is parsed against it, the
      result being stored under the pattern's key.

    Args:
        item: Mapping describing one entry. Must contain
            ``item['_patterns']['detail_pattern']`` when a URL is present.

    Returns:
        None.

    Raises:
        Exception: Any error raised while downloading, decoding or parsing
            is logged with its traceback and re-raised unchanged.
    """
    url = item.get('url')
    if not url:
        # NOTE(review): the source formatting of this branch was lost; it is
        # assumed that all three defaults are only written when 'sc' has not
        # been provided already. Confirm against the intended behaviour.
        if 'sc' not in item:
            item['sc'] = ''
            item['issue_time'] = ''
            item['title'] = ''
        return

    detail_pattern = item['_patterns']['detail_pattern']
    logger.info('parse detail: %s', url)
    try:
        res = downloader.get(url, headers=headers)
        if res.status_code == 404:
            logger.debug('skip 404 url: %s', url)
            # Dropping the key signals to the caller that no detail page
            # was obtained for this item.
            del item['url']
            return

        raw = res.content
        encoding = chardet.detect(raw)['encoding']
        if encoding == 'GB2312':
            # gb18030 is a superset of GB2312, so it decodes everything the
            # detected encoding would and more.
            encoding = 'gb18030'
        elif encoding is None:
            # Detection failed; fall back to UTF-8.
            encoding = 'utf-8'
        page = raw.decode(encoding, 'ignore')

        for key, pattern in detail_pattern.items():
            field_parser = build_parser(pattern)
            field_parser.parse(page)
            item[key] = field_parser.result
    except Exception:
        # Log with traceback, then let the caller decide what to do.
        logger.exception("parse detail page error: %s", url)
        raise
def test_multi_xpath(self):
    """The XPath ``//a[@class="sister"]`` yields three results on html_doc."""
    spec = dict(pattern='//a[@class="sister"]', type='xpath', target='html')
    xpath_parser = parser.build_parser(spec)
    xpath_parser.parse(html_doc)
    matches = xpath_parser.result
    self.assertEqual(len(matches), 3)
def test_xpath_text(self):
    """An XPath pattern with a ``text`` target returns the title text."""
    spec = dict(pattern='//p[@class="title"]', type='xpath', target='text')
    xpath_parser = parser.build_parser(spec)
    xpath_parser.parse(html_doc)
    extracted = xpath_parser.result
    self.assertEqual(extracted, 'The Dormouse\'s story')
def test_xpath_html(self):
    """An XPath pattern with an ``html`` target returns the element markup."""
    spec = dict(pattern='//p[@class="title"]', type='xpath', target='html')
    xpath_parser = parser.build_parser(spec)
    xpath_parser.parse(html_doc)
    extracted = xpath_parser.result
    self.assertEqual(
        extracted,
        '<p class="title"><b>The Dormouse\'s story</b></p>',
    )
def process_page(self, item_, base_url, page, index_pattern):
    """Process one list page and every entry found on it.

    The ``'_list'`` pattern is parsed against ``page``. When the parsed
    result hashes to the same value as on the previous call, the page is
    treated as a repeat: ``self._click_next`` is set to ``False`` and the
    method returns without touching any entry. Otherwise each element of the
    list parser's ``source`` becomes an item that is filled from the list
    page, completed from its detail page, passed through the module and
    project hooks, and saved.

    Args:
        item_: Template item; deep-copied for every entry so that entries
            never share state.
        base_url: Forwarded to ``self._get_item_link`` with each element.
        page: List-page content handed to the parsers.
        index_pattern: Mapping of field name to parser pattern. ``'_list'``
            selects the entries; ``'_list'`` and ``'_next_page'`` are not
            treated as item fields.

    Returns:
        None.
    """
    list_parser = build_parser(index_pattern['_list'])
    list_parser.parse(page)

    # hashlib only accepts bytes on Python 3, so encode the JSON text first.
    # json.dumps emits ASCII by default, which keeps this valid on Python 2.
    fingerprint = json.dumps(list_parser.result).encode('utf-8')
    cur_li_hash = hashlib.md5(fingerprint).hexdigest()
    if cur_li_hash == self._last_li_hash:
        # Two consecutive requests returned the same list elements: we are
        # probably on the last page, so stop paging.
        logger.debug('break click loop')
        self._click_next = False
        return
    self._last_li_hash = cur_li_hash

    # Keys of index_pattern that steer paging rather than describe a field.
    control_keys = ('_list', '_next_page')

    # Handle the detail link of every entry on the list page.
    for element in list_parser.source:
        # Start every entry from a fresh copy of the template item.
        item = copy.deepcopy(item_)
        item['url'] = self._get_item_link(base_url, element)

        for key, pattern in index_pattern.items():
            if key in control_keys:
                continue
            # Patterns starting with '/html' are parsed against the whole
            # page; everything else against the current list element.
            if pattern['pattern'].startswith('/html'):
                target = page
            else:
                target = element
            field_parser = build_parser(pattern)
            field_parser.parse(target)
            item[key] = field_parser.result

        # Custom hook: process the list element together with its item.
        self._module.process_list_item(element, item)
        # Parse the detail page.
        self._parse_detail_page(item)
        if item.get('url'):
            # Custom hook: process the item after the detail page was read.
            self._module.process_detail_item(item)
        # Project-wide default hook.
        self._proj_module.process_item(item)
        # Persist the item.
        self._save_item(item)
        # The most recently processed item decides whether paging goes on.
        # Updating here (rather than after the loop) keeps an empty list
        # from referencing an unbound ``item``.
        self._click_next = item.get('_click_next', True)
def test_multi_css(self):
    """The CSS selector ``a.sister`` yields three results on html_doc."""
    spec = dict(pattern='a.sister', type='css', target='html')
    css_parser = parser.build_parser(spec)
    css_parser.parse(html_doc)
    matches = css_parser.result
    self.assertEqual(len(matches), 3)
def test_css_text(self):
    """A CSS pattern with a ``text`` target returns the title text."""
    spec = dict(pattern='p.title', type='css', target='text')
    css_parser = parser.build_parser(spec)
    css_parser.parse(html_doc)
    extracted = css_parser.result
    self.assertEqual(extracted, 'The Dormouse\'s story')
def test_build_parser_css(self):
    """``build_parser`` returns a ``CSSParser`` for a pattern of type ``css``."""
    spec = dict(pattern='a', type='css', target='html')
    built = parser.build_parser(spec)
    self.assertIsInstance(built, parser.CSSParser)
def test_build_parser_xpath(self):
    """``build_parser`` returns an ``XPathParser`` for a pattern of type ``xpath``."""
    spec = dict(pattern='//a', type='xpath', target='html')
    built = parser.build_parser(spec)
    self.assertIsInstance(built, parser.XPathParser)