def parse(self, response):
    # Get a selector over the response
    hxs = Selector(response)
    for con in hxs.xpath(self.xpathConf.get("parse_xpath")):
        ss = Selector(text=con.extract())
        el = WeiboComItemLoader(selector=ss)
        el.add_value("site_source", "news.sina.com.cn")
        el.add_value("site_type", "news")
        el.add_value("task_id", self.task_id)
        # Record today's date as the catch date (YYYY-MM-DD)
        nowTime = time.localtime()
        nowDate = datetime.datetime(nowTime[0], nowTime[1], nowTime[2])
        el.add_value("catch_date", nowDate.strftime('%Y-%m-%d'))
        baidu_url = ss.xpath(self.xpathConf.get("site_url")).extract()
        if baidu_url:
            # Follow the Baidu redirect to the real article URL
            source_url = urlUtil.getRedirectUrl(baidu_url[0], timeout=10)
            sinaUrl = fo.findSinaNewsUrl(source_url)
            if sinaUrl:
                el.add_value('site_url', sinaUrl)
            else:
                continue
        else:
            continue
        yield el.load_item()
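# Assumed context for the excerpt above: Selector comes from scrapy.selector,
# WeiboComItemLoader is the project's item loader, and time/datetime are the
# standard-library modules. The helpers urlUtil.getRedirectUrl and
# fo.findSinaNewsUrl are not shown in this excerpt; a minimal sketch of what
# they might do, assuming the first simply follows the Baidu redirect and the
# second filters for sina.com.cn article links (both are assumptions, not the
# project's actual code):
import re
import requests

def getRedirectUrl(url, timeout=10):
    # Follow redirects and return the final URL; fall back to the input on error.
    try:
        resp = requests.get(url, timeout=timeout, allow_redirects=True)
        return resp.url
    except requests.RequestException:
        return url

def findSinaNewsUrl(url):
    # Return the URL only if it looks like a Sina news page, otherwise None.
    if url and re.search(r'sina\.com\.cn', url):
        return url
    return None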
def parse(self, response):
    # Get a selector over the response
    hxs = Selector(response)
    print(hxs.extract())
    for con in hxs.xpath(self.xpathConf.get("parse_xpath")):
        s = Selector(text=con.extract())
        if len(s.xpath(self.xpathConf.get("is_parse_xpath"))) > 0:
            r = [str(self.accountNum)]
            imgs = s.xpath(self.xpathConf.get("face_image")).extract()
            if len(imgs) > 0:
                r.append(imgs[0])
            else:
                r.append("")
            num_info = s.xpath(self.xpathConf.get("num_info")).extract()
            if len(num_info) > 0:
                for num in fo.getWeiboCnUserInfo(num_info[0]):
                    r.append(num)
            else:
                r.extend([0, 0, 0])
            # Build the pipe-delimited record that is saved to Redis
            save_info = "%s|%s|%s|%s|%s" % tuple(r)
            print(r)
            print(save_info)
            self.server.lpush(self.out_key, save_info)
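# fo.getWeiboCnUserInfo is assumed to pull the three counters (posts, follows,
# fans) out of a weibo.cn profile summary such as "微博[120] 关注[35] 粉丝[678]",
# since the caller pads with three zeros when the text is missing. A regex-based
# sketch under that assumption (not the project's actual helper):
import re

def getWeiboCnUserInfo(text):
    # Return the first three bracketed numbers, padded with 0 so the caller
    # always gets exactly three values.
    nums = [int(n) for n in re.findall(r'\[(\d+)\]', text)[:3]]
    while len(nums) < 3:
        nums.append(0)
    return nums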
def parse(self, response):
    # Get a selector over the response
    hxs = Selector(response)
    results = hxs.xpath(self.xpathConf.get("search_num"))
    print(results)
    if len(results) > 0:
        search_num = results[0].extract()
        num = int(fo.getBaiduSearchNum(search_num))
        # Baidu shows 10 results per page; crawl at most 80 pages
        if num // 10 > 80:
            self.saveUrlToRedis(80)
        else:
            self.saveUrlToRedis(num // 10)
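# fo.getBaiduSearchNum is assumed to strip the digits out of Baidu's
# "百度为您找到相关结果约1,230,000个" result-count banner, and saveUrlToRedis to
# queue one paged search URL per result page. Both are sketches under those
# assumptions; self.keyword_url and self.start_urls_key are hypothetical names.
import re

def getBaiduSearchNum(text):
    # Keep only the digits, e.g. "约1,230,000个" -> 1230000.
    digits = re.sub(r'[^\d]', '', text)
    return int(digits) if digits else 0

def saveUrlToRedis(self, pages):
    # Spider-method sketch: Baidu pages results with pn=0,10,20,..., so push
    # one start URL per result page onto the Redis request queue.
    for page in range(int(pages)):
        self.server.lpush(self.start_urls_key, "%s&pn=%d" % (self.keyword_url, page * 10))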
def parse(self, response):
    # Get a selector over the response
    hxs = Selector(response)
    results = hxs.xpath(self.xpathConf.get("search_num"))
    if len(results) > 0:
        search_num = results[0].extract()
        num = int(fo.getNum(search_num))
        # 25 results per page for this source; crawl at most 50 pages
        if num // 25 > 50:
            self.saveUrlToRedis(50)
        else:
            self.saveUrlToRedis(num // 25)
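# This variant targets a source that lists 25 results per page and caps the
# crawl at 50 pages. fo.getNum is assumed to be a plain "extract the integer
# from the result-count text" helper (an assumption, not the project's code):
import re

def getNum(text):
    # e.g. "共 1,234 条结果" -> 1234
    digits = re.sub(r'[^\d]', '', text)
    return int(digits) if digits else 0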