def parse_html_item(self, response, loop, fields):
    """Yield one loaded item per node selected from an HTML response.

    ``loop`` is an XPath evaluated against the response; when it is falsy
    the expression ``(//*)[1]`` is used instead. ``fields`` maps a field
    name to a spec mapping. For each spec the first key present among
    ``value``, ``css`` and ``xpath`` (checked in that order) selects the
    matching ``ItemLoader.get_*`` method; the expression is macro-expanded
    with the response meta before use. The optional ``parse`` entry is
    handed to ``parser.make_parser`` and ``regex`` is passed as ``re=``.

    When the extracted value is falsy, ``default`` (if present) is
    macro-expanded and wrapped with ``arg_to_iter``. A field that is still
    falsy and whose spec has neither a truthy ``multi`` nor a truthy
    ``opt`` causes a warning and the whole item for that node is dropped.
    Specs with none of the three source keys are skipped with a warning.
    """
    request_meta = response.meta
    root = Selector(response)
    self.macro.update({
        'URL': response.url,
        'keyword': request_meta.get('keyword', ''),
    })

    for node in root.xpath(loop or '(//*)[1]'):
        loader = ItemLoader(item=Item(), selector=node)
        # Cleared as soon as a mandatory field turns out to be empty.
        complete = True

        for name, spec in fields.iteritems():
            # Precedence between the source kinds: value, then css, then xpath.
            source = next(
                (key for key in ('value', 'css', 'xpath') if key in spec),
                None,
            )
            if source is None:
                log.msg(u'field [{}] should contains "value", "xpath" or "css"'.format(name), level=log.WARNING)
                continue

            extract = getattr(loader, 'get_' + source)
            expression = self.macro.expand(spec.get(source), request_meta)
            processor = parser.make_parser(spec.get('parse', {}))
            val = extract(expression, processor, re=spec.get('regex'))

            if not val and 'default' in spec:
                fallback = self.macro.expand(spec.get('default'), request_meta)
                val = arg_to_iter(fallback)

            if not val and not spec.get('multi') and not spec.get('opt'):
                log.msg(u'field [{}] is empty:\n{}'.format(name, loader.load_item()), level=log.WARNING)
                complete = False
                break

            loader.add_value(name, val)

        if complete:
            yield loader.load_item()
def parse_json_item(self, response, loop, fields):
    """Yield one item per element selected from a JSON response.

    The body is decoded with ``self.json_enc`` (``utf-8`` when the
    attribute is absent), ignoring undecodable bytes. The text between the
    first opening and the last closing bracket is parsed as JSON: square
    brackets when ``self.json_type`` equals ``'list'``, curly braces
    otherwise.

    ``loop`` is a JSONPath applied to the parsed object (``$[]`` when
    falsy); a falsy JSONPath result yields nothing. ``fields`` maps a field
    name to a spec mapping. A spec with ``value`` uses the macro-expanded
    value; otherwise a spec with ``jpath`` uses the JSONPath result on the
    current element, with a result equal to ``False`` replaced by ``None``.
    The raw value is passed through the parser built from the optional
    ``parse`` entry.

    When the parsed value is falsy, ``default`` (if present) is
    macro-expanded and used instead. A field that is still falsy and whose
    spec has neither a truthy ``multi`` nor a truthy ``opt`` causes a
    warning and the whole item is dropped. Specs with neither ``value`` nor
    ``jpath`` are skipped with a warning. Stored values are wrapped with
    ``arg_to_iter``.
    """
    request_meta = response.meta
    encoding = getattr(self, 'json_enc', 'utf-8')
    text = unicode(response.body, encoding=encoding, errors='ignore')

    # Trim anything surrounding the outermost JSON container.
    expects_list = hasattr(self, 'json_type') and self.json_type == 'list'
    opener, closer = ('[', ']') if expects_list else ('{', '}')
    start, end = text.find(opener), text.rfind(closer)
    document = json.loads(text[start:end + 1])

    self.macro.update({
        'URL': response.url,
        'keyword': request_meta.get('keyword', ''),
    })

    for element in jsonpath.jsonpath(document, loop or '$[]') or []:
        item = Item()
        # Cleared as soon as a mandatory field turns out to be empty.
        complete = True

        for name, spec in fields.iteritems():
            if 'value' in spec:
                raw = self.macro.expand(spec.get('value'))
            elif 'jpath' in spec:
                path = self.macro.expand(spec.get('jpath'))
                matches = jsonpath.jsonpath(element, path)
                # Equality (not identity) comparison kept on purpose.
                raw = None if matches == False else matches
            else:
                log.msg(u'field [{}] should contains "value" or "jpath"'.format(name), level=log.WARNING)
                continue

            convert = parser.make_parser(spec.get('parse', {}))
            val = convert(raw)

            if not val and 'default' in spec:
                val = self.macro.expand(spec.get('default'))

            if not val and not spec.get('multi') and not spec.get('opt'):
                log.msg(u'field [{}] is empty:\n{}'.format(name, item), level=log.WARNING)
                complete = False
                break

            item[name] = arg_to_iter(val)

        if complete:
            yield item