def __init__(self, *a, **kw):
    self.rconn = RedisSet().redisSet()
    self.dba = Dba()
    self.keyword = {
        "新浪网": "Sina",
        "环球网": "Huanqiu",
        "搜狐网": "Sohu",
        "网易": "WangYi",
        "凤凰网": "Ifeng",
        "新华网": "Xinhua",
        "篱笆网": "Liba",
        "新民网": "Xinmin",
        "看看新闻网": "KanKan",
        "中国天气网": "Weather",
        "东方网": "Eastday",
        "人民网-上海": "People",
        "上海热线": "Online",
        "上观": "ShangGuan",
        "上海新闻网": "ShangHaiNews",
        "腾讯大申网": "Tencent",
        "宽带山": "KuanDai",
        "中国广播网": "Radio"
    }
    self.current_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    super(QxjSpider, self).__init__(*a, **kw)
class SinaMsgSpider(CrawlSpider):
    name = "SinaMsgSpider"
    host = "https://weibo.cn"

    def __init__(self, *a, **kw):
        super(SinaMsgSpider, self).__init__(*a, **kw)
        self.rconn = RedisSet().redisSet()

    def start_requests(self):
        for lines in self.rconn.smembers("Tweets:sina"):
            sinaId = lines.split("--")[2]
            req_url = self.host + "/" + sinaId
            data = {"key": lines, "page": 1}
            yield Request(url=req_url, callback=self.parse, meta={"data": data})
            fans_url = self.host + "/" + sinaId + "/fans"
            yield Request(url=fans_url, callback=self.parse_fans, meta={"data": data})

    def parse_fans(self, response):
        data = response.meta["data"]
        hxs = Selector(response)
        div = hxs.xpath('body/div[@class="c"]')
        trs = div.xpath("//tr/td[2]/a[2]/@href").extract()
        keys = data["key"].split("--")
        for d in trs:
            try:
                userId = d.split("uid=")[1].split("&rl=")[0]
                info_url = self.host + "/" + userId + "/info"
                detail = {}
                detail["userId"] = userId
                detail["SinaName"] = keys[0]
                detail["SinaId"] = keys[2]
                yield Request(url=info_url, callback=self.parse_info,
                              meta={"data": detail}, dont_filter=True)
            except:
                pass
        next_url = hxs.xpath('//a[text()="下页"]/@href'.decode('utf8')).extract()
        if next_url:
            yield Request(url=self.host + next_url[0], callback=self.parse_fans,
                          meta={"data": data}, dont_filter=True)

    def parse_info(self, response):
        hxs = Selector(response)
        data = response.meta["data"]
        div = ";".join(hxs.xpath('//div[@class="c"][3]/text()').extract()) + ";"
        NickName = re.findall('昵称[::]?(.*?);'.decode('utf8'), div)
        Birthday = re.findall('生日[::]?(.*?);'.decode('utf8'), div)
        Gender = re.findall('性别[::]?(.*?);'.decode('utf8'), div)
        Marriage = re.findall('感情状况[::]?(.*?);'.decode('utf8'), div)
        Province = re.findall('地区[::]?(.*?);'.decode('utf8'), div)
        Signature = re.findall('简介[::]?(.*?);'.decode('utf8'), div)
        data['NickName'] = NickName[0] if NickName and NickName[0] else "Null"
        data['Marriage'] = Marriage[0] if Marriage and Marriage[0] else "Null"
        data['Birthday'] = Birthday[0] if Birthday and Birthday[0] else "Null"
        data['Gender'] = Gender[0] if Gender and Gender[0] else "Null"
        if Province and Province[0]:
            dou = Province[0].split(" ")
            data['Province'] = dou[0]
            data['City'] = dou[1] if len(dou) == 2 else "Null"
        else:
            data['Province'] = "Null"
            data['City'] = "Null"
        data['Signature'] = Signature[0] if Signature and Signature[0] else "Null"
        req_url = "https://weibo.cn/attgroup/opening?uid=" + data["userId"]
        yield Request(url=req_url, callback=self.parse_page, meta={"data": data})

    def parse_page(self, response):
        hxs = Selector(response)
        data = response.meta["data"]
        msgs = ";".join(hxs.xpath('//div[@class="tip2"]/a/text()').extract()) + ";"
        # 微博[..] is the tweet count, 关注[..] the follow count, 粉丝[..] the fan count
        Num_Tweets = re.findall('微博\[(\d+)\]'.decode('utf8'), msgs)
        Num_Follows = re.findall('关注\[(\d+)\]'.decode('utf8'), msgs)
        Num_Fans = re.findall('粉丝\[(\d+)\]'.decode('utf8'), msgs)
        data["Num_Fans"] = Num_Fans[0] if Num_Fans and Num_Fans[0] else "Null"
        data["Num_Follows"] = Num_Follows[0] if Num_Follows and Num_Follows[0] else "Null"
        data["Num_Tweets"] = Num_Tweets[0] if Num_Tweets and Num_Tweets[0] else "Null"
        item = SinaInformationItem()
        for key, val in data.items():
            item[key] = val
        yield item

    def parse(self, response):
        hxs = Selector(response)
        data = response.meta["data"]
        c = hxs.xpath('body/div[@class="c" and @id]')
        for div in c:
            try:
                like = re.findall('赞\[(\d+)\]'.decode('utf8'), div.extract())[0]         # like count
                transfer = re.findall('转发\[(\d+)\]'.decode('utf8'), div.extract())[0]   # repost count
                commentNum = re.findall('评论\[(\d+)\]'.decode('utf8'), div.extract())[0]  # comment count
                contentId = div.xpath("@id").extract()[0].split('M_')[1]
                # push time and posting client (phone or platform)
                others = div.xpath('div/span[@class="ct"]/text()').extract()
                strs = others[0].split(u"来自")
                pushTime, flag = Time_stamp().time_handle(strs[0].strip())
                tool = strs[1]
                detail = {}
                detail["key"] = data["key"]
                comment_url = "https://weibo.cn/comment/%s" % contentId
                detail["contentId"] = contentId
                detail["pushTime"] = pushTime
                detail["commentNum"] = commentNum
                detail["transfer"] = transfer
                detail["like"] = like
                detail["tool"] = tool
                yield Request(url=comment_url, callback=self.parse_comment,
                              meta={"data": detail}, dont_filter=True)
                # break
            except:
                pass
        url_next = hxs.xpath(
            'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'.decode('utf8')
        ).extract()
        if url_next and data['page'] < 5:
            req_url = "https://weibo.cn%s" % url_next[0]
            data["page"] += 1
            yield Request(url=req_url, callback=self.parse, meta={"data": data}, dont_filter=True)

    def parse_comment(self, response):
        hxs = Selector(response)
        data = response.meta["data"]
        c = hxs.xpath('body/div[@class="c" and @id]')
        # tweet text
        contentStr = c[0].xpath('div/span[@class="ctt"]//text()').extract()
        content = ""
        for con in contentStr:
            content += con
        content = content.strip(":")
        data["content"] = content
        keys = data["key"].split("--")
        data["SinaName"] = keys[0]
        data["SinaId"] = keys[2]
        del data["key"]
        item = SinaTweetsItem()
        for key, val in data.items():
            item[key] = val
        yield item
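# --- Hedged sketch (not part of the original source): seeding "Tweets:sina". ---
# SinaMsgSpider.start_requests splits each member of the "Tweets:sina" set on
# "--" and uses index 2 as the weibo.cn uid, while parse_fans uses index 0 as
# the account name, so a member presumably looks like "<name>--<?>--<uid>".
# The middle field and the concrete values below are assumptions for
# illustration only.
import redis

rconn = redis.StrictRedis(host="localhost", port=6379, db=0)  # assumed connection settings
rconn.sadd("Tweets:sina", "example_account--0--1234567890")   # hypothetical member: name--?--uid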
class SinaSpider(CrawlSpider):
    name = "SinaSpider"
    rootUrl = "https://weibo.cn"

    def __init__(self, limitTime):
        self.limitTime = limitTime
        self.rconn = RedisSet().redisSet()
        self.path = "/data1/crawler/andycrawler/NewsSpider/weatheroutput/weatheroutput/"
        # self.path = "weatheroutput/weatheroutput/"
        self.words = self.load_keyword()
        self.wm = WeatherModel(self.path + "LinearSVCl2.model",
                               self.path + "vectorizer.data",
                               self.path + "ch2.data",
                               self.path + "keywords.txt")
        super(SinaSpider, self).__init__(self.name)

    @classmethod
    def from_settings(cls, settings):
        return cls(settings.get('LIMIT_TIME'))

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = SinaSpider.from_settings(crawler.settings)
        spider._set_crawler(crawler)
        return spider

    def start_requests(self):
        url = "https://weibo.cn/%s"
        le = 1
        for key in self.rconn.smembers("News:sina"):
            req_url = url % key.split("--")[-2]
            data = {"key": key, "page": 1}
            yield Request(url=req_url, callback=self.parse, meta={"data": data}, dont_filter=True)
            if le < 1:
                break
            le += 1

    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        data = response.meta["data"]
        flag_list = []
        for i in soup.find_all("div", class_="c")[1:-2]:
            strTime = i.find("span", class_="ct").get_text(strip=True).split(u" 来自")[0]
            pushTime, flag = Time_stamp().time_handle(strTime, self.limitTime)
            flag_list.append(flag)
            if flag == 1:
                # the id attribute looks like "M_<content id>"
                content_id = i["id"].split("M_")[-1]
                # self.rconn.delete("Sina:content_id")
                redis_flag = self.rconn.sadd("Sina:content_id", content_id)
                # redis_flag = 1
                if redis_flag == 1:
                    detail = {}
                    detail["key"] = data["key"]
                    comment_url = "https://weibo.cn/comment/%s" % content_id
                    detail["contentId"] = content_id
                    detail["pushTime"] = pushTime
                    yield Request(url=comment_url, callback=self.parse_comment, meta={"data": detail})
                    # break
        if 2 not in flag_list:
            hxs = Selector(response)
            url_next = hxs.xpath(
                'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'.decode('utf8')
            ).extract()
            if url_next:
                req_url = "https://weibo.cn%s" % url_next[0]
                yield Request(url=req_url, callback=self.parse, meta={"data": data})

    def parse_comment(self, response):
        data = response.meta["data"]
        hxs = Selector(response)
        if "page" not in data:
            detail = {}
            detail["contentId"] = data["contentId"]
            detail["pushTime"] = data["pushTime"]
            keys = data["key"].split("--")
            detail["SinaName"] = keys[0]
            detail["Vermicelli"] = keys[1]
            detail["SinaID"] = keys[2]
            detail["SinaOthID"] = keys[2]
            # tweet text
            contentStr = hxs.xpath('//div/span[@class="ctt"]//text()').extract()
            reprintStr = hxs.xpath('//div/span[@class="pms"]/preceding-sibling::span/a//text()').extract()
            commontStr = hxs.xpath('//div/span[@class="pms"]//text()').extract()
            thumbs_upStr = hxs.xpath('//div/span[@class="pms"]/following-sibling::span/a//text()').extract()
            content = "0"
            reprint = "0"
            commont = "0"
            if '[' in str(reprintStr[0]):
                reprint = str(reprintStr[0])[str(reprintStr[0]).index('[') + 1:str(reprintStr[0]).index(']')]
            if '[' in str(commontStr[0]):
                commont = str(commontStr[0])[str(commontStr[0]).index('[') + 1:str(commontStr[0]).index(']')]
            thumbs_up = str(thumbs_upStr[0])[str(thumbs_upStr[0]).index('[') + 1:str(thumbs_upStr[0]).index(']')]
            for cd in contentStr:
                if len(cd) >= 3:
                    content += cd.replace(" ", "")
            detail["content"] = content
            detail["reprint"] = int(reprint)
            detail["commont"] = int(commont)
            detail["thumbs_up"] = int(thumbs_up)
            # classifier decides whether the tweet is weather-related; fall back to keyword matching
            flag = int(self.wm.predict(detail["content"])[0])
            if flag != 1:
                total = 0
                for word in self.words:
                    if word.strip() in detail["content"]:
                        total += 1
                        if total >= 2:
                            flag = 1
                            break
            if flag == 1:
                detail["flag"] = 1
                contentItem = SinaContentItem()
                for key, val in detail.items():
                    contentItem[key] = val
                yield contentItem
                c = hxs.xpath('body/div[@class="c" and @id]')[1:]
            else:
                c = []
        else:
            c = hxs.xpath('body/div[@class="c" and @id]')
        for div in c:
            comme = {}
            comme["contentId"] = data["contentId"]
            ID = div.xpath("a/@href").extract_first()
            userName = div.xpath("a//text()").extract_first()
            commentId = div.xpath("@id").extract()[0].split('C_')[1]
            try:
                userId = ID.split("u/")[1]
            except:
                userId = ID.split('/')[1]
            # comment text
            commentStr = div.xpath('span[@class="ctt"]//text()').extract()
            comment = ""
            for co in commentStr:
                if len(co) >= 3:
                    comment += co.replace(" ", "")
            strTime = div.xpath('span[@class="ct"]//text()').extract()[0].split(u" 来自")[0]
            pushTime, flag = Time_stamp().time_handle(strTime, self.limitTime)
            comme['pushTime'] = pushTime
            comme["userName"] = userName
            comme["commentId"] = commentId
            comme["userId"] = userId
            comme["comment"] = comment
            commentItem = SinaCommentItem()
            for key, val in comme.items():
                commentItem[key] = val
            yield commentItem
        url_next = hxs.xpath(
            'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'.decode('utf8')
        ).extract()
        if c != [] and url_next:
            data["page"] = True
            next_url = self.rootUrl + url_next[0]
            yield Request(url=next_url, callback=self.parse_comment, meta={"data": data}, dont_filter=True)

    def load_keyword(self):
        fs = open(self.path + "weatherwords.txt", "r")
        words = fs.readlines()
        fs.close()
        return words
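# --- Hedged sketch (not part of the original source): running SinaSpider. ---
# from_crawler/from_settings build the spider from the LIMIT_TIME setting, so a
# standalone run could pass it through a Scrapy settings dict. The time-limit
# format shown here is an assumption; whatever Time_stamp().time_handle()
# expects is what should be configured.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({"LIMIT_TIME": "2018-01-01 00:00:00"})  # hypothetical limit value
process.crawl(SinaSpider)
process.start()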
class SoGouSpider(CrawlSpider):
    name = "SoGouSpiderS"
    host = "http://mp.weixin.qq.com"

    def __init__(self, *a, **kw):
        super(SoGouSpider, self).__init__(*a, **kw)
        self.rconn = RedisSet().redisSet()

    def start_requests(self):
        url = "http://weixin.sogou.com/weixin?type=1&s_from=input&query=%s&ie=utf8&_sug_=n&_sug_type_="
        for key in self.rconn.smembers("News:sogou"):
            req_url = url % key.split("--")[-1]
            yield Request(url=req_url, callback=self.parse, meta={"key": key})

    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        next_url = soup.find("p", class_="tit").find("a")["href"]
        yield Request(url=next_url, callback=self.parse_result, meta=response.meta)

    def parse_result(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        data = response.meta
        try:
            scr = soup.find_all("script")[-2:-1][0].get_text(strip=True)
            data_list = scr.split("var msgList = ")[1].split('seajs.use("sougou/p')[0].strip().strip(";")
            j_data = json.loads(data_list)
            art_list = []
            for li in j_data["list"]:
                dic = {}
                dic["id"] = li["comm_msg_info"]["id"]
                dic["pushtime"] = li["comm_msg_info"]["datetime"]
                dic["title"] = li["app_msg_ext_info"]["title"]
                dic["fileid"] = li["app_msg_ext_info"]["fileid"]
                dic["url"] = self.host + li["app_msg_ext_info"]["content_url"].replace("amp;", "")
                art_list.append(dic)
                for ls in li["app_msg_ext_info"]["multi_app_msg_item_list"]:
                    dict_ls = {}
                    dict_ls["id"] = dic["id"]
                    dict_ls["pushtime"] = dic["pushtime"]
                    dict_ls["url"] = self.host + ls["content_url"].replace("amp;", "")
                    dict_ls["title"] = ls["title"]
                    dict_ls["fileid"] = ls["fileid"]
                    art_list.append(dict_ls)
            for ks in art_list:
                timeFormat, flag = Time_stamp().time_handle(ks["pushtime"])
                if flag == 1:
                    line = data["key"].split("--")[-1] + "--" + str(ks["pushtime"])
                    self.rconn.delete("SoGou:Account")
                    flag = self.rconn.sadd("SoGou:Account", line)
                    if flag == 1:
                        ks["pushtime"] = timeFormat
                        ks["keyword"] = data["key"]
                        data["ks"] = ks
                        yield Request(url=ks["url"], callback=self.parse_article, meta=data)
        except Exception as e:
            logger.error("parse_result error <<%s>>" % e)
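# --- Hedged sketch (not part of the original source): seeding "News:sogou". ---
# SoGouSpider.start_requests only uses the segment after the last "--" of each
# set member as the sogou/weixin search query (the public-account name), so the
# leading fields are free-form. The member layout and account name below are
# assumptions for illustration only.
import redis

rconn = redis.StrictRedis(host="localhost", port=6379, db=0)  # assumed connection settings
rconn.sadd("News:sogou", u"Weather--中国天气网")  # hypothetical member: tag--public-account query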