def chufaparse(self, response):
    """Parse one listing page, resolve each item's linked text, and paginate.

    Runs ``self.configParse`` with ``chufaConfigs`` over ``response``. For
    every item, the text between the first pair of parentheses in
    ``result['contents']`` is treated as a link: it is resolved against the
    response URL, fetched through ``S._txtparse`` and cleaned with
    ``S.replace_invalid_char``. The cleaned text replaces
    ``result['contents']`` and the resolved link is stored in
    ``result['url']``.

    Failure handling is best-effort so one bad record cannot abort the page:
      * no parenthesised link found -> ``contents`` and ``url`` are ``None``;
      * fetching/cleaning raises     -> ``contents`` is ``None``, ``url`` kept.

    Yields:
        Every parsed item, followed by a ``scrapy.Request`` for the next page
        (handled by this same method) while ``getTotalPage`` reports that the
        current page is not the last one.
    """
    # Local import: the file's top-level import block is not visible from
    # this method, so the dependency is brought into scope here.
    import logging

    log = logging.getLogger(__name__)
    # Raw string: "\(" in a normal string literal is an invalid escape.
    link_pattern = re.compile(r"\((.*?)\)")

    for item in self.configParse(chufaConfigs, response, response):
        result = item['result']
        match = link_pattern.search(result['contents'] or '')
        if match is None:
            # Previously this raised AttributeError on ``None.group`` and
            # killed the whole callback, dropping the remaining items and
            # the pagination request.
            log.warning("no link found in contents on %s", response.url)
            url = None
            content = None
        else:
            url = response.urljoin(match.group(1).strip())
            try:
                content = S._txtparse(url)
                content = S.replace_invalid_char(content)
            except Exception:
                # Still best-effort, but no longer swallows
                # KeyboardInterrupt/SystemExit and leaves a trace in the log.
                log.warning("could not fetch or clean %s", url, exc_info=True)
                content = None
        result['contents'] = content
        result['url'] = url
        yield item

    # Pagination is scheduled once per page, after all items were emitted.
    page, totalpage, counts = getTotalPage(response)
    if page < totalpage:
        page += 1
        url = 'http://www.szse.cn/szseWeb/FrontController.szse?ACTIONID=7&AJAX=AJAX-TRUE&CATALOGID=1759_cxda&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT={totalpage}&tab1RECORDCOUNT={counts}&REPORT_ACTION=navigate'.format(
            page=page, totalpage=totalpage, counts=counts)
        yield scrapy.Request(
            url,
            headers=hdr(),
            meta={
                'page': page,
                'totalpage': totalpage,
                'counts': counts,
            },
            callback=self.chufaparse,
            priority=1)
def zrdsinfoparse(self, response):
    """Complete the record carried in ``response.meta`` and emit it as an item.

    The text nodes of the ``<span id="ViewResume1_lblContent">`` element are
    concatenated, passed through ``S.replace_invalid_char`` and stored under
    the ``'ins'`` key of ``response.meta['result']``. The record is then
    wrapped in a ``SzseItem`` together with the ``'db'`` and ``'keys'``
    values taken from ``response.meta``.

    Yields:
        A single ``SzseItem`` with ``result``, ``db`` and ``keys`` set.
    """
    meta = response.meta
    record = meta['result']

    fragments = response.xpath(
        '//span[@id="ViewResume1_lblContent"]/text()').extract()
    record['ins'] = S.replace_invalid_char("".join(fragments))

    item = SzseItem()
    item['result'] = record
    item['db'] = meta['db']
    item['keys'] = meta['keys']
    yield item
def configParse(self, configs, _response, response=None):
    """Extract records from ``_response`` as described by ``configs``.

    Args:
        configs: One config dict or a list of them. Keys read here:
            ``'list'`` (with ``'v'``, ``'keys'``, ``'db'``, ``'check'`` and
            optional ``'conn'``), ``'data'`` (field configs, each with an
            ``'En'`` name), optional ``'clear'`` (post-processing field
            configs) and optional ``'flag'``.
        _response: The object records are selected from. It is used as-is
            when it is already a list, otherwise it is wrapped in a
            one-element list, unless the list selector is applied.
        response: Passed through unchanged as the third argument of
            ``S.select_content`` for every field.

    Yields:
        A new ``SzseItem`` per record whose ``'check'`` field is truthy,
        carrying ``result``, ``keys``, ``db`` and ``conn``.

    Note:
        A missing or ``None`` ``'flag'`` is defaulted to ``True`` by writing
        into the passed config dict; this in-place defaulting is kept for
        backward compatibility with callers that may read it afterwards.
    """
    if isinstance(configs, dict):
        configs = [configs]

    for _configs in configs:  # there may be several configs to iterate over
        if _configs.get('flag') is None:
            _configs['flag'] = True

        list_cfg = _configs['list']
        if list_cfg['v'] and _configs['flag']:
            nodes = S.select_content(_response, list_cfg)
        elif isinstance(_response, list):
            nodes = _response
        else:
            # Wrap a single response so it can be iterated like a list.
            nodes = [_response]

        if not nodes:
            continue

        for node in nodes:
            # Start every record from an empty result.
            result = {}

            # Extract and sanitise each configured field.
            for field in _configs['data']:
                key = field['En']
                value = S.select_content(node, field, response)
                value = S.replace_invalid_char(value)
                result[key] = S.replace_invalid_html_char(value)

            # Optional second pass that re-selects from already extracted
            # values.
            for field in _configs.get('clear') or ():
                key = field['En']
                result[key] = S.select_content(result[key], field, response)

            # A fresh item per record: the previous code reused one item
            # object for every yield, so any consumer that kept references
            # (list(...), an asynchronous pipeline) saw all of them mutate to
            # the last record.
            item = SzseItem()
            item['result'] = result
            item['keys'] = list_cfg['keys']
            item['db'] = list_cfg['db']
            # Handed on for the pipeline to use when processing the item.
            item['conn'] = list_cfg.get('conn')

            if result[list_cfg['check']]:
                yield item