# Relies on module-level imports: hashlib, logging, random, time, and the JyScrapyUtil helper.
def parse_list_page_common(self, response):
    """
    Generic list-page parser.
    Required meta fields are checked by the assertions below.
    :param response:
    :return:
    """
    assert 'crawl_key' in response.meta
    assert 'page_index' in response.meta
    assert 'param' in response.meta
    assert 'xpath_of_list' in response.meta['param']
    assert 'xpath_of_detail_url' in response.meta['param']
    assert 'item_parse_class' in response.meta['param']

    list_page_content_md5 = hashlib.md5(response.body).hexdigest()

    logging.info('Get page list url, page:[{}], url:[{}], status:[{}], body md5:[{}]'.format(
        response.meta['page_index'],
        response.url,
        response.status,
        list_page_content_md5))

    logging.info('Crawl info: {}'.format(self.crawl_helper.crawl_info))
    crawl_key = response.meta['crawl_key']

    # Update the crawl-status record in the database
    self.crawl_helper.store_crawl_info_2_db(crawl_key, 'active')

    if not self.crawl_helper.should_continue_page_parse(response, crawl_key, list_page_content_md5):
        return

    _item_idx = 0
    for selector in response.xpath(response.meta['param']['xpath_of_list']):
        _detail_url = ''
        try:
            _item_idx += 1
            _detail_url = response.urljoin(
                selector.xpath(response.meta['param']['xpath_of_detail_url']).extract_first())
            _unq_id = JyScrapyUtil.get_unique_id(_detail_url)

            logging.info('Parse item, [{}]-[{}/{}]'.format(crawl_key, _item_idx, response.meta['page_index']))

            # Check whether the record already exists in the database and break/skip accordingly
            loop_break, item_break = self.crawl_helper.should_continue_item_parse(crawl_key, _unq_id)
            if loop_break:
                return
            if item_break:
                continue

            # Build and yield the crawled item
            item_parser = response.meta['param']['item_parse_class'](selector)
            item = item_parser.get_common_raw_item(
                _id=_unq_id,
                detail_url=_detail_url,
                site=self.site,
                ext_param=response.meta['param']
            )

            # Random short sleep (50-100 ms) to throttle requests
            time.sleep(random.randint(50, 100) / 1000.0)

            # Update the crawled-item count in the database
            self.crawl_helper.increase_total_item_num(crawl_key)

            logging_item = item.copy()
            logging_item["content"] = ""
            logging.info('item is: {}'.format(logging_item))
            yield item
        except Exception as e:
            logging.exception('Handle [{}] failed'.format(_detail_url))
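# --- Usage sketch (hypothetical) --------------------------------------------
# A minimal sketch of how a paging request might be issued so that
# parse_list_page_common receives the meta fields it asserts on. The URL, XPath
# expressions, spider name and NoticeItemParser are placeholders, not values
# from the original code; only the meta keys come from the assertions above.
import scrapy


class ExampleNoticeSpider(scrapy.Spider):
    name = 'example_notice'  # placeholder spider name

    # parse_list_page_common is assumed to be provided by the spider
    # (e.g. inherited from a shared base class).
    def start_requests(self):
        yield scrapy.Request(
            url='http://example.com/notice/list?page=1',   # placeholder list-page URL
            callback=self.parse_list_page_common,
            meta={
                'crawl_key': 'example_notice',              # used by crawl_helper bookkeeping
                'page_index': 1,                            # current page number, used in logs
                'param': {
                    'xpath_of_list': '//table[@class="list"]/tr',   # rows of the list page
                    'xpath_of_detail_url': './td[1]/a/@href',       # relative detail link per row
                    'item_parse_class': NoticeItemParser,           # hypothetical parser providing get_common_raw_item
                },
            },
        )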
# Relies on module-level imports: hashlib, json, logging, random, time, and the JyScrapyUtil helper.
def parse_list_page_common(self, response):
    """
    Generic list-page parser.
    Required meta fields are checked by the assertions below.
    :param response:
    :return:
    """
    assert 'crawl_key' in response.meta
    assert 'page_index' in response.meta
    assert 'param' in response.meta
    assert 'xpath_of_list' in response.meta['param']
    assert 'xpath_of_detail_url' in response.meta['param']
    assert 'item_parse_class' in response.meta['param']

    list_page_content_md5 = hashlib.md5(response.body).hexdigest()

    logging.info('Get page list url, page:[{}], url:[{}], status:[{}], body md5:[{}]'.format(
        response.meta['page_index'],
        response.url,
        response.status,
        list_page_content_md5))

    logging.info('Crawl info: {}'.format(self.crawl_helper.crawl_info))
    crawl_key = response.meta['crawl_key']

    # Update the crawl-status record in the database
    self.crawl_helper.store_crawl_info_2_db(crawl_key, 'active')

    if not self.crawl_helper.should_continue_page_parse(response, crawl_key, list_page_content_md5):
        return

    _item_idx = 0
    if response.meta['param']['requests_type'] == "dict":
        # The response body is JSON: walk down the keys listed in xpath_of_list
        # to reach the list of records.
        _response_data = json.loads(response.text)
        for _dictn_num in response.meta['param']['xpath_of_list']:
            _response_data = _response_data[_dictn_num]

        for selector in _response_data:
            _detail_url = ''
            try:
                _item_idx += 1
                # Each record is a dict; xpath_of_detail_url names its detail-link field
                _detail_url = response.urljoin(
                    selector[response.meta['param']['xpath_of_detail_url']])
                _unq_id = JyScrapyUtil.get_unique_id(_detail_url)

                logging.info('Parse item, [{}]-[{}/{}]'.format(crawl_key, _item_idx, response.meta['page_index']))

                # Check whether the record already exists in the database and break/skip accordingly
                loop_break, item_break = self.crawl_helper.should_continue_item_parse(crawl_key, _unq_id)
                if loop_break:
                    return
                if item_break:
                    continue

                # Build and yield the crawled item
                item_parser = response.meta['param']['item_parse_class'](selector)
                item = item_parser.get_common_raw_item(
                    _id=_unq_id,
                    detail_url=_detail_url,
                    site=self.site,
                    ext_param=response.meta['param']
                )

                # Random short sleep (50-100 ms) to throttle requests
                time.sleep(random.randint(50, 100) / 1000.0)

                # Update the crawled-item count in the database
                self.crawl_helper.increase_total_item_num(crawl_key)

                logging.info('item is: {}'.format(item))
                yield item
            except Exception as e:
                logging.exception('Handle [{}] failed'.format(_detail_url))
    else:
        for selector in response.xpath(response.meta['param']['xpath_of_list']):
            _detail_url = ''
            try:
                _item_idx += 1
                # Detail links on this site are relative, so prefix the site root
                _detail_url = "http://ggzyjy.nmg.gov.cn" + selector.xpath(
                    response.meta['param']['xpath_of_detail_url']).extract_first()
                _unq_id = JyScrapyUtil.get_unique_id(_detail_url)

                logging.info('Parse item, [{}]-[{}/{}]'.format(crawl_key, _item_idx, response.meta['page_index']))

                # Check whether the record already exists in the database and break/skip accordingly
                loop_break, item_break = self.crawl_helper.should_continue_item_parse(crawl_key, _unq_id)
                if loop_break:
                    return
                if item_break:
                    continue

                # Build and yield the crawled item
                item_parser = response.meta['param']['item_parse_class'](selector)
                item = item_parser.get_common_raw_item(
                    _id=_unq_id,
                    detail_url=_detail_url,
                    site=self.site,
                    ext_param=response.meta['param']
                )

                # Random short sleep (50-100 ms) to throttle requests
                time.sleep(random.randint(50, 100) / 1000.0)

                # Update the crawled-item count in the database
                self.crawl_helper.increase_total_item_num(crawl_key)

                logging_item = item.copy()
                logging_item['content'] = ""
                logging.info('item is: {}'.format(logging_item))
                yield item
            except Exception as e:
                logging.exception('Handle [{}] failed'.format(_detail_url))
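# --- Param sketch for the JSON ("dict") mode (hypothetical) ------------------
# In the requests_type == "dict" branch, xpath_of_list is treated as a sequence
# of keys walked down the parsed JSON body, and xpath_of_detail_url names the
# field holding the detail link in each record. The key names below ('data',
# 'rows', 'url') are invented for illustration; the real values depend on the
# target API.
param = {
    'requests_type': 'dict',
    'xpath_of_list': ['data', 'rows'],      # walks json.loads(response.text)['data']['rows']
    'xpath_of_detail_url': 'url',           # each record's detail-link field
    'item_parse_class': NoticeItemParser,   # placeholder parser class, as in the earlier sketch
}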
# Relies on module-level imports: hashlib, logging, random, time, and the JyScrapyUtil helper.
def parse_list_page_common(self, response):
    """
    Generic list-page parser.
    Required meta fields are checked by the assertions below.
    :param response:
    :return:
    """
    assert 'crawl_key' in response.meta
    assert 'page_index' in response.meta
    assert 'param' in response.meta
    assert 'xpath_of_list' in response.meta['param']
    assert 'xpath_of_detail_url' in response.meta['param']
    assert 'item_parse_class' in response.meta['param']

    list_page_content_md5 = hashlib.md5(response.body).hexdigest()

    logging.info('Get page list url, page:[{}], url:[{}], status:[{}], body md5:[{}]'.format(
        response.meta['page_index'],
        response.url,
        response.status,
        list_page_content_md5))

    logging.info('Crawl info: {}'.format(self.crawl_helper.crawl_info))
    crawl_key = response.meta['crawl_key']

    # Update the crawl-status record in the database (disabled for this spider)
    # self.crawl_helper.store_crawl_info_2_db(crawl_key, 'active')

    if not self.crawl_helper.should_continue_page_parse(response, crawl_key, list_page_content_md5):
        return

    _item_idx = 0
    for selector in response.xpath(response.meta['param']['xpath_of_list']):
        _detail_url = ''
        try:
            # The detail link is embedded in an onclick-style handler, e.g.
            # showProjectDetail('<id>', '<type>'); strip the JS call down to its arguments.
            onc = selector.xpath(response.meta['param']['xpath_of_detail_url'])
            if len(onc) > 0:
                lis = onc.extract_first().replace('showProjectDetail', '').replace('showNewsDetail', '') \
                    .replace('(', '').replace('\'', '').replace(', ', ',').replace(')', '').replace(';', '').strip()
                detail_param = lis.split(',')
                if response.meta['param']['connect_type'] == 'project':
                    project_default = 'http://ecp.sgcc.com.cn/html/project/{}/{}.html'
                    _detail_url = project_default.format(detail_param[0], detail_param[1])
                else:
                    news_default = 'http://ecp.sgcc.com.cn/html/news/{}/{}.html'
                    _detail_url = news_default.format(detail_param[0], detail_param[1])
            else:
                continue
            _item_idx += 1
            _unq_id = JyScrapyUtil.get_unique_id(_detail_url)

            logging.info('Parse item, [{}]-[{}/{}]'.format(crawl_key, _item_idx, response.meta['page_index']))

            # Check whether the record already exists in the database (disabled for this spider)
            # loop_break, item_break = self.crawl_helper.should_continue_item_parse(crawl_key, _unq_id)
            # if loop_break:
            #     return
            # if item_break:
            #     continue

            # Build and yield the crawled item
            item_parser = response.meta['param']['item_parse_class'](selector)
            item = item_parser.get_common_raw_item(
                _id=_unq_id,
                detail_url=_detail_url,
                site=self.site,
                ext_param=response.meta['param']
            )

            # Random short sleep (50-100 ms) to throttle requests
            time.sleep(random.randint(50, 100) / 1000.0)

            # Update the crawled-item count in the database (disabled for this spider)
            # self.crawl_helper.increase_total_item_num(crawl_key)

            logging.info('item is: {}'.format(item))
            yield item
        except Exception as e:
            logging.exception('Handle [{}] failed'.format(_detail_url))
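# --- Param sketch for the onclick-based variant (hypothetical) ---------------
# In this variant the detail URL is rebuilt from an onclick-style handler such
# as showProjectDetail('123','456'), and connect_type selects the project or
# news URL template on ecp.sgcc.com.cn. The XPath expressions below are
# placeholders; only connect_type and the template URLs come from the code above.
param = {
    'xpath_of_list': '//ul[@class="project-list"]/li',   # placeholder row XPath
    'xpath_of_detail_url': './a/@onclick',                # yields e.g. "showProjectDetail('123','456');"
    'item_parse_class': NoticeItemParser,                 # placeholder parser class, as in the earlier sketch
    'connect_type': 'project',                            # -> http://ecp.sgcc.com.cn/html/project/{}/{}.html
}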