예제 #1
0
	def parse_final(self,response):
		"""Build a single item from the page using the XPath map carried in ``response.meta``.

		Reads two entries from ``response.meta``:

		* ``Final_Xpath`` -- mapping of field name to an iterable of XPath
		  expressions (assumed to be a list of strings -- TODO confirm against
		  the code that builds the request). For the ``site_name`` key the
		  values are literal strings, not XPaths.
		* ``Some_Info`` -- optional mapping of field name to a ready-made value.

		Yields one loaded ``TvSpiderItem`` when ``Final_Xpath`` has no
		``All_Xpath`` key. Yields nothing when ``All_Xpath`` is present or when
		``Final_Xpath`` is missing from the meta.
		"""
		# Original author's note: Final_Xpath is only handed over once, and any
		# change made to it cannot be undone -- so it is only read here.
		Final_Xpath = response.meta.get('Final_Xpath', None)
		Some_Info = response.meta.get('Some_Info', None)

		if Final_Xpath is None:
			# Without the XPath map there is nothing to extract; report it
			# instead of failing with AttributeError on None.
			print("parse_final: Final_Xpath missing from response.meta")
			return
		if 'All_Xpath' in Final_Xpath:
			return

		item = TvSpiderItem()
		loader = ItemLoader(item=item, response=response)
		for key in Final_Xpath:
			item.fields[key] = Field()
			try:
				if key == "site_name":
					# Literal values: load them as-is and never evaluate them
					# as XPath (an invalid expression would raise and the
					# field would be lost in the except below).
					for value in Final_Xpath[key]:
						loader.add_value(key, value)
				else:
					# "/" is treated as a placeholder that never counts as a
					# match. Each XPath is evaluated once.
					matched = [
						xpath for xpath in Final_Xpath[key]
						if xpath != "/" and response.xpath(xpath).extract()
					]
					if matched:
						for xpath in matched:
							loader.add_xpath(key, xpath)
					else:
						# The loader skips a field for which nothing was
						# found; store "" explicitly so the field still
						# appears, however many XPaths were tried.
						loader.add_value(key, "")
			except Exception as e:
				# Best effort: one bad field must not abort the whole item.
				print("%s : %s" % (Exception, e))
		if Some_Info:
			for key in Some_Info:
				item.fields[key] = Field()
				loader.add_value(key, Some_Info[key])
		yield loader.load_item()
예제 #2
0
    def parse_final(self, response):
        """Build a single item from JSON data and page XPaths named in ``response.meta``.

        Reads from ``response.meta``:

        * ``pid`` -- passed to ``Get_Json_Content`` together with ``self.name``.
        * ``Final_Xpath`` -- mapping of field name to an iterable of XPath
          expressions (assumed to be a list of strings -- TODO confirm). Two
          keys are special: ``site_name`` holds literal values, and
          ``json_data`` maps field names to lists of keys to read from the
          JSON content (assumed shape -- TODO confirm).
        * ``Some_Info`` -- optional mapping of field name to a ready-made value.

        Yields one loaded ``TvSpiderItem`` when ``Final_Xpath`` has no
        ``All_Xpath`` key. Yields nothing when ``All_Xpath`` is present or when
        ``Final_Xpath`` is missing from the meta.
        """
        pid = response.meta.get('pid', None)
        print("%s \n" % (pid,))
        # Original author's note: Final_Xpath is only handed over once, and any
        # change made to it cannot be undone -- so it is only read here.
        Final_Xpath = response.meta.get('Final_Xpath', None)
        Some_Info = response.meta.get('Some_Info', None)

        if Final_Xpath is None:
            # Without the XPath map there is nothing to extract; report it
            # instead of failing with AttributeError on None.
            print("parse_final: Final_Xpath missing from response.meta")
            return
        if 'All_Xpath' in Final_Xpath:
            return

        item = TvSpiderItem()
        loader = ItemLoader(item=item, response=response)

        if "json_data" in Final_Xpath:
            json_data = Get_Json_Content(pid, self.name)
            print("%s \n" % (json_data,))
            if not json_data:
                # Treat "no content" as an empty mapping so lookups below
                # simply find nothing.
                json_data = {}
            json_fields = Final_Xpath['json_data']
            for key in json_fields:
                item.fields[key] = Field()
                # Check every listed JSON key on its own, so multi-key lists
                # work and a missing key cannot raise KeyError.
                for json_key in json_fields[key]:
                    if json_key in json_data:
                        loader.add_value(key, json_data[json_key])

        for key in Final_Xpath:
            if key == "json_data":
                # Already handled above; skipped rather than deleted so the
                # shared Final_Xpath dict is never modified.
                continue
            item.fields[key] = Field()
            try:
                if key == "site_name":
                    # Literal values: load them as-is and never evaluate them
                    # as XPath (an invalid expression would raise and the
                    # field would be lost in the except below).
                    for value in Final_Xpath[key]:
                        loader.add_value(key, value)
                else:
                    # "/" is treated as a placeholder that never counts as a
                    # match. Each XPath is evaluated once.
                    matched = [
                        xpath for xpath in Final_Xpath[key]
                        if xpath != "/" and response.xpath(xpath).extract()
                    ]
                    if matched:
                        for xpath in matched:
                            loader.add_xpath(key, xpath)
                    else:
                        # The loader skips a field for which nothing was
                        # found; store "" explicitly so the field still
                        # appears, however many XPaths were tried.
                        loader.add_value(key, "")
            except Exception as e:
                # Best effort: one bad field must not abort the whole item.
                print("%s : %s" % (Exception, e))

        if Some_Info:
            for key in Some_Info:
                item.fields[key] = Field()
                loader.add_value(key, Some_Info[key])
        yield loader.load_item()
예제 #3
0
						except Exception,e:
								print Exception,":",e
				if Some_Info:
						for key in Some_Info.keys():
								item.fields[key] = Field()
								l.add_value(key , Some_Info[key])
				yield l.load_item()
		else:
		#感觉这里不能用itemloader的add_xxx方法了,因为要先找到一个页面所有的含有目标item的块,再在每个块里面提取出单个item,itemloader的话是一次性直接全取出,add_xpath不能再细分了;;打算用add_value方法
				my_Final_Xpath = Final_Xpath.copy()
				All_Xpath = my_Final_Xpath['All_Xpath'].copy()
				del my_Final_Xpath['All_Xpath']
				all_xpath = All_Xpath['all_xpath']
				del All_Xpath['all_xpath']
				for i in response.xpath(all_xpath[0]):
						item = TvSpiderItem()
						l = ItemLoader(item=item, response=response)
						#把All_Xpath中的数据提取出来
						for key in All_Xpath.keys():
								item.fields[key] = Field()
								try:
										#itemloader在add_xxx方法找不到值的时候,会自动忽略这个字段,可是我不想忽略它,这时候需要将其置为空("")
										if map(lambda x:1 if x else 0, map(lambda x:response.xpath(x).extract() if x != "/" else "",Final_Xpath[key])) in [[0,0],[0]]:
												map(lambda x:l.add_value(key , ""),["just_one"])
										else:
												map(lambda x:l.add_value(key, i.xpath(x).extract()) if i.xpath(x).extract() != [] else "",Final_Xpath[key])
								except Exception,e:
										print Exception,",",e
						#将除了All_Xpath中的数据提取出来,像豆瓣就特别需要这种情况,一般下面的数据是(多次取得),All_Xpath中才是真正单条的数据
						for key in my_Final_Xpath.keys():
								item.fields[key] = Field()