def parse_final(self, response):
    """Build a single item from a page using the spec carried in ``response.meta``.

    Two entries are read from ``response.meta``:

    * ``Final_Xpath`` -- mapping of field name to a sequence of XPath
      expressions.  The ``site_name`` entry is special: its sequence holds
      literal values that are added as-is and never evaluated as XPath.
      A ``"/"`` entry is treated as a placeholder and is never evaluated.
    * ``Some_Info`` -- optional mapping of extra field name to value; each
      pair is added to the item unchanged.

    For every non-``site_name`` field, each XPath that matches something is
    handed to the loader.  When none of them match, the field is explicitly
    given the value ``""`` instead of being left out.

    Nothing is yielded when ``Final_Xpath`` is missing, or when it contains
    an ``All_Xpath`` key (that layout is not handled by this code path).

    Yields:
        The item produced by ``ItemLoader.load_item()``.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Original author's note: Final_Xpath is only handed over once through
    # meta, so an in-place modification cannot be undone.  It is therefore
    # only read here, never mutated.
    final_xpath = response.meta.get('Final_Xpath')
    some_info = response.meta.get('Some_Info')

    if final_xpath is None:
        # Previously this crashed with AttributeError on ``None.keys()``.
        logger.warning(
            "parse_final: response.meta has no 'Final_Xpath'; nothing to extract")
        return

    if 'All_Xpath' not in final_xpath:
        item = TvSpiderItem()
        loader = ItemLoader(item=item, response=response)

        for key in final_xpath:
            # Register the field on the item before loading it; presumably
            # needed because these keys are not declared on TvSpiderItem.
            item.fields[key] = Field()
            try:
                xpaths = final_xpath[key]

                if key == "site_name":
                    # Literal values: checked first so they are never passed
                    # to response.xpath(), which can raise on non-XPath text.
                    for value in xpaths:
                        loader.add_value(key, value)
                    continue

                # Evaluate every expression once and keep those that match.
                matched = [
                    xpath for xpath in xpaths
                    if xpath != "/" and response.xpath(xpath).extract()
                ]
                if matched:
                    for xpath in matched:
                        loader.add_xpath(key, xpath)
                else:
                    # The loader's add_* methods skip a field when no value
                    # is found; the field is wanted anyway, so set it to "".
                    loader.add_value(key, "")
            except Exception:
                # Best effort: one bad field must not abort the whole item.
                logger.exception("parse_final: failed to extract field %r", key)

        if some_info:
            for key, value in some_info.items():
                item.fields[key] = Field()
                loader.add_value(key, value)

        yield loader.load_item()
def parse_final(self, response):
    """Build a single item from a page using the spec carried in ``response.meta``.

    Entries read from ``response.meta``:

    * ``Final_Xpath`` -- mapping of field name to a sequence of XPath
      expressions, with these special keys:

      - ``site_name``: the sequence holds literal values that are added
        as-is and never evaluated as XPath.
      - ``json_data``: mapping of field name to a sequence of key names to
        look up in the result of ``Get_Json_Content(pid, self.name)``.

      A ``"/"`` XPath entry is treated as a placeholder and is never
      evaluated.
    * ``pid`` -- passed to ``Get_Json_Content`` when ``json_data`` is
      present.
    * ``Some_Info`` -- optional mapping of extra field name to value; each
      pair is added to the item unchanged.

    For every ordinary field, each XPath that matches something is handed to
    the loader.  When none of them match, the field is explicitly given the
    value ``""`` instead of being left out.

    Nothing is yielded when ``Final_Xpath`` is missing, or when it contains
    an ``All_Xpath`` key (that layout is not handled by this code path).

    Yields:
        The item produced by ``ItemLoader.load_item()``.
    """
    import logging

    logger = logging.getLogger(__name__)

    pid = response.meta.get('pid')
    logger.debug("parse_final: pid=%r", pid)

    final_xpath = response.meta.get('Final_Xpath')
    some_info = response.meta.get('Some_Info')

    if final_xpath is None:
        # Previously this crashed with AttributeError on ``None.keys()``.
        logger.warning(
            "parse_final: response.meta has no 'Final_Xpath'; nothing to extract")
        return

    if 'All_Xpath' not in final_xpath:
        # Original author's note: Final_Xpath is only handed over once
        # through meta, so an in-place modification cannot be undone.  Work
        # on a shallow copy so removing 'json_data' below leaves it intact.
        field_specs = final_xpath.copy()
        item = TvSpiderItem()
        loader = ItemLoader(item=item, response=response)

        if 'json_data' in field_specs:
            # Take the JSON spec out of the copy once it has been used, so
            # the XPath loop below does not treat it as an ordinary field.
            json_spec = field_specs.pop('json_data')
            # Get_Json_Content is defined elsewhere; presumably it returns a
            # dict -- TODO confirm.  A falsy result is treated as "no data".
            json_data = Get_Json_Content(pid, self.name) or {}
            logger.debug("parse_final: json_data=%r", json_data)
            for key, names in json_spec.items():
                item.fields[key] = Field()
                # Look every name up on its own; joining the names into one
                # string only worked for single-element specs.
                for name in names:
                    if name in json_data:
                        loader.add_value(key, json_data[name])

        for key in field_specs:
            # Register the field on the item before loading it; presumably
            # needed because these keys are not declared on TvSpiderItem.
            item.fields[key] = Field()
            try:
                xpaths = field_specs[key]

                if key == "site_name":
                    # Literal values: checked first so they are never passed
                    # to response.xpath(), which can raise on non-XPath text.
                    for value in xpaths:
                        loader.add_value(key, value)
                    continue

                # Evaluate every expression once and keep those that match.
                matched = [
                    xpath for xpath in xpaths
                    if xpath != "/" and response.xpath(xpath).extract()
                ]
                if matched:
                    for xpath in matched:
                        loader.add_xpath(key, xpath)
                else:
                    # The loader's add_* methods skip a field when no value
                    # is found; the field is wanted anyway, so set it to "".
                    loader.add_value(key, "")
            except Exception:
                # Best effort: one bad field must not abort the whole item.
                logger.exception("parse_final: failed to extract field %r", key)

        if some_info:
            for key, value in some_info.items():
                item.fields[key] = Field()
                loader.add_value(key, value)

        yield loader.load_item()
except Exception,e: print Exception,":",e if Some_Info: for key in Some_Info.keys(): item.fields[key] = Field() l.add_value(key , Some_Info[key]) yield l.load_item() else: #感觉这里不能用itemloader的add_xxx方法了,因为要先找到一个页面所有的含有目标item的块,再在每个块里面提取出单个item,itemloader的话是一次性直接全取出,add_xpath不能再细分了;;打算用add_value方法 my_Final_Xpath = Final_Xpath.copy() All_Xpath = my_Final_Xpath['All_Xpath'].copy() del my_Final_Xpath['All_Xpath'] all_xpath = All_Xpath['all_xpath'] del All_Xpath['all_xpath'] for i in response.xpath(all_xpath[0]): item = TvSpiderItem() l = ItemLoader(item=item, response=response) #把All_Xpath中的数据提取出来 for key in All_Xpath.keys(): item.fields[key] = Field() try: #itemloader在add_xxx方法找不到值的时候,会自动忽略这个字段,可是我不想忽略它,这时候需要将其置为空("") if map(lambda x:1 if x else 0, map(lambda x:response.xpath(x).extract() if x != "/" else "",Final_Xpath[key])) in [[0,0],[0]]: map(lambda x:l.add_value(key , ""),["just_one"]) else: map(lambda x:l.add_value(key, i.xpath(x).extract()) if i.xpath(x).extract() != [] else "",Final_Xpath[key]) except Exception,e: print Exception,",",e #将除了All_Xpath中的数据提取出来,像豆瓣就特别需要这种情况,一般下面的数据是(多次取得),All_Xpath中才是真正单条的数据 for key in my_Final_Xpath.keys(): item.fields[key] = Field()