예제 #1
0
파일: pedaily.py 프로젝트: xfzhu2003/github
    def infoParse(self, response):
        """Parse a detail page and print every item extracted from it.

        ``checkTimeError`` is consulted first; when it returns a truthy
        value, that value is yielded and parsing of this response stops.
        Otherwise the response body is cleaned with
        ``S.replace_invalid_html_char``, the extraction config is chosen
        from ``contentsConfigs`` based on ``response.url``, and the result
        of ``configParse`` is iterated.

        Args:
            response: The page to parse; must expose ``text``, ``url`` and
                ``replace(body=...)``.

        Yields:
            The value returned by ``checkTimeError`` when it is truthy
            (presumably a follow-up request -- confirm against that helper).
        """
        request = checkTimeError(response)
        if request:
            yield request
            # Bare return: a value returned from a generator only ends up on
            # StopIteration and is never seen by code iterating this method.
            return
        cleaned_body = S.replace_invalid_html_char(response.text)
        response = response.replace(body=cleaned_body)
        InfoConfigs = choice(response.url, contentsConfigs)
        items = self.configParse(InfoConfigs, response, response)
        # NOTE(review): items are printed rather than yielded, so nothing
        # consuming this generator receives them -- confirm whether this is
        # leftover debugging before switching to ``yield item``.
        for item in items:
            print(item)
예제 #2
0
 def configParse(self, configs, _response, response=None):
     """Extract items from a response according to one or more configs.

     Args:
         configs: A single config dict or a list of them. Each config is
             read for ``list`` (with ``v``, ``keys``, ``db``, ``check`` and
             optionally ``conn``), ``data`` (field configs carrying
             ``En``), and optionally ``flag`` and ``clear``.
         _response: The object to extract rows from. It is passed to
             ``S.select_content`` when ``list['v']`` and ``flag`` are both
             truthy, used as-is when it is already a list, and otherwise
             wrapped in a one-element list.
         response: Passed through as the third argument of every
             field-level ``S.select_content`` call.

     Yields:
         A new ``SzseItem`` for each row whose ``check`` field is truthy,
         with ``result``, ``keys``, ``db`` and ``conn`` filled in.
     """
     if isinstance(configs, dict):
         configs = [configs]
     # There may be several configs; handle each in turn.
     for _configs in configs:
         # A missing/None flag defaults to True. The default is written back
         # into the config dict, as it always has been, in case other code
         # reads it later.
         if _configs.get('flag') is None:
             _configs['flag'] = True
         list_config = _configs['list']
         if list_config['v'] and _configs['flag']:
             rows = S.select_content(_response, list_config)
         elif isinstance(_response, list):
             rows = _response
         else:
             # Wrap the single response so it can be iterated like a list.
             rows = [_response]
         if not rows:
             continue
         for row in rows:
             # Extract every configured field for this row.
             result = {}
             for config in _configs['data']:
                 k = config['En']
                 value = S.select_content(row, config, response)
                 value = S.replace_invalid_char(value)
                 result[k] = S.replace_invalid_html_char(value)
             # Optional second pass that re-extracts from already
             # collected values.
             if _configs.get('clear'):
                 for config in _configs['clear']:
                     k = config['En']
                     result[k] = S.select_content(
                         result[k], config, response)
             # Build a fresh item per row: reusing one shared instance
             # would let a later row overwrite the fields of an item that
             # was already yielded and is still held by the consumer.
             item = SzseItem()
             item['result'] = result
             item['keys'] = list_config['keys']
             item['db'] = list_config['db']
             item['conn'] = list_config.get('conn')
             # Only rows with a truthy "check" field are handed on for
             # further item processing.
             if result[list_config['check']]:
                 yield item