# -*- coding: utf-8 -*-
# Imports reconstructed from usage in this fragment: jieba, Scrapy and
# BeautifulSoup are third-party packages; RedisSet, WeatherModel, Time_stamp,
# Dba, SinaContentItem and SinaCommentItem are assumed to be project-local
# modules whose import paths are not shown here.
import jieba.analyse
# import pymongo  # only needed if the commented-out Mongo setup below is restored
from bs4 import BeautifulSoup
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
def __init__(self):
    # client = pymongo.MongoClient("192.168.20.216", 27017)
    # db = client["SinaWebchatWeather"]
    # self.sinaComment = db["SinaComment"]
    # self.sinaContent = db["SinaContent"]
    # self.sogou = db["SoGouContent"]
    # self.tweets = db["Tweets"]
    # self.Info = db["Information"]
    self.ora = Dba()  # Oracle access object (project-local)
    self.path = "/data1/crawler/andycrawler/NewsSpider/weatheroutput/weatheroutput/"
    # self.path = "weatheroutput/weatheroutput/"
    self.keys = self.load_key()
    self.wm = WeatherModel(
        self.path + "LinearSVCl2.model",
        self.path + "vectorizer.data",
        self.path + "ch2.data",
        self.path + "weatherwords.txt")
    self.tags = jieba.analyse.extract_tags
    # %-formatted INSERT templates for the Oracle QXJ schema
    self.ConSql = """
        INSERT INTO QXJ.QXJ_YQ_WEIBO_DAY
        (sinaothid, sinaname, contentid, sinaid, vermicelli, content, flag, dta_date)
        VALUES ('%s','%s','%s','%s','%s','%s','%s',
                to_date('%s','yyyy-mm-dd hh24:mi:ss'))"""
    self.old_key_sql = """
        insert into qxj.qxj_yq_weibo_keyword_day
        (weibo_id, contentid, keyword, num, ts, dta_date)
        VALUES ('%s','%s','%s','%d',
                to_date('%s','yyyy-mm-dd hh24:mi:ss'), date'%s')"""
    self.all_key_sql = """
        insert into qxj.qxj_keyword_all_day
        (keyword, type, dta_date)
        VALUES ('%s','%s',date'%s')"""
    self.CommSql = """
        INSERT INTO QXJ.QXJ_YQ_PINGLUN_DAY
        (username, contentid, userid, comments, commentid, dta_date)
        VALUES ('%s','%s','%s','%s','%s',
                to_date('%s','yyyy-mm-dd hh24:mi:ss'))"""
    self.SinComSql = """
class SinaSpider(CrawlSpider):
    name = "SinaSpider"
    rootUrl = "https://weibo.cn"

    def __init__(self, limitTime):
        self.limitTime = limitTime
        self.rconn = RedisSet().redisSet()
        self.path = "/data1/crawler/andycrawler/NewsSpider/weatheroutput/weatheroutput/"
        # self.path = "weatheroutput/weatheroutput/"
        self.words = self.load_keyword()
        self.wm = WeatherModel(self.path + "LinearSVCl2.model",
                               self.path + "vectorizer.data",
                               self.path + "ch2.data",
                               self.path + "keywords.txt")
        super(SinaSpider, self).__init__(self.name)

    @classmethod
    def from_settings(cls, settings):
        # LIMIT_TIME comes from the project's Scrapy settings
        return cls(settings.get('LIMIT_TIME'))

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls.from_settings(crawler.settings)
        spider._set_crawler(crawler)
        return spider

    def start_requests(self):
        url = "https://weibo.cn/%s"
        le = 1
        for key in self.rconn.smembers("News:sina"):
            # the user id sits in the second-to-last "--" field of the Redis entry
            req_url = url % key.split("--")[-2]
            data = {"key": key, "page": 1}
            yield Request(url=req_url, callback=self.parse,
                          meta={"data": data}, dont_filter=True)
            if le < 1:  # debug limiter; never fires as written (le starts at 1)
                break
            le += 1

    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        data = response.meta["data"]
        flag_list = []
        for i in soup.find_all("div", class_="c")[1:-2]:
            strTime = i.find("span", class_="ct").get_text(strip=True).split(u" 来自")[0]
            pushTime, flag = Time_stamp().time_handle(strTime, self.limitTime)
            flag_list.append(flag)
            if flag == 1:
                # post ids look like "M_<mid>"; slice off the prefix
                # (str.strip("M_") would also eat a trailing "M" or "_")
                content_id = i["id"][len("M_"):]
                # self.rconn.delete("Sina:content_id")
                redis_flag = self.rconn.sadd("Sina:content_id", content_id)
                # redis_flag = 1
                if redis_flag == 1:  # sadd returns 1 only for ids not seen before
                    detail = {}
                    detail["key"] = data["key"]
                    comment_url = "https://weibo.cn/comment/%s" % content_id
                    detail["contentId"] = content_id
                    detail["pushTime"] = pushTime
                    yield Request(url=comment_url, callback=self.parse_comment,
                                  meta={"data": detail})
                    # break
        if 2 not in flag_list:  # keep paging only while no post was flagged as too old
            hxs = Selector(response)
            url_next = hxs.xpath(
                u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
            ).extract()  # "下页" = "next page"
            if url_next:
                req_url = "https://weibo.cn%s" % url_next[0]
                yield Request(url=req_url, callback=self.parse, meta={"data": data})

    def parse_comment(self, response):
        data = response.meta["data"]
        hxs = Selector(response)
        if "page" not in data:
            # first comment page: extract the weibo post itself before its comments
            detail = {}
            detail["contentId"] = data["contentId"]
            detail["pushTime"] = data["pushTime"]
            keys = data["key"].split("--")
            detail["SinaName"] = keys[0]
            detail["Vermicelli"] = keys[1]  # follower count
            detail["SinaID"] = keys[2]
            detail["SinaOthID"] = keys[2]
            contentStr = hxs.xpath('//div/span[@class="ctt"]//text()').extract()  # weibo text
            reprintStr = hxs.xpath(
                '//div/span[@class="pms"]/preceding-sibling::span/a//text()').extract()  # reposts
            commontStr = hxs.xpath('//div/span[@class="pms"]//text()').extract()  # comments
            thumbs_upStr = hxs.xpath(
                '//div/span[@class="pms"]/following-sibling::span/a//text()').extract()  # likes
            content = ""
            reprint = "0"
            commont = "0"
            thumbs_up = "0"
            # counters render like "转发[12]"; pull the number out of the brackets
            if '[' in str(reprintStr[0]):
                reprint = str(reprintStr[0])[str(reprintStr[0]).index('[') + 1:
                                             str(reprintStr[0]).index(']')]
            if '[' in str(commontStr[0]):
                commont = str(commontStr[0])[str(commontStr[0]).index('[') + 1:
                                             str(commontStr[0]).index(']')]
            if '[' in str(thumbs_upStr[0]):
                thumbs_up = str(thumbs_upStr[0])[str(thumbs_upStr[0]).index('[') + 1:
                                                 str(thumbs_upStr[0]).index(']')]
            for cd in contentStr:
                if len(cd) >= 3:
                    content += cd.replace(" ", "")
            detail["content"] = content
            detail["reprint"] = int(reprint)
            detail["commont"] = int(commont)
            detail["thumbs_up"] = int(thumbs_up)
            # classifier first; fall back to counting keyword hits
            flag = int(self.wm.predict(detail["content"])[0])
            if flag != 1:
                total = 0
                for word in self.words:
                    if word.strip() in detail["content"]:
                        total += 1
                        if total >= 2:  # two keyword hits mark the post as weather-related
                            flag = 1
                            break
            if flag == 1:
                detail["flag"] = 1
                contentItem = SinaContentItem()
                for key, val in detail.items():
                    contentItem[key] = val
                yield contentItem
                # skip the first div (the post itself), keep the comment divs
                c = hxs.xpath('body/div[@class="c" and @id]')[1:]
            else:
                c = []
        else:
            c = hxs.xpath('body/div[@class="c" and @id]')
        for div in c:
            comme = {}
            comme["contentId"] = data["contentId"]
            ID = div.xpath("a/@href").extract_first()
            userName = div.xpath("a//text()").extract_first()
            commentId = div.xpath("@id").extract()[0].split('C_')[1]
            try:
                userId = ID.split("u/")[1]
            except IndexError:  # profile links are either "/u/<id>" or "/<name>"
                userId = ID.split('/')[1]
            commentStr = div.xpath('span[@class="ctt"]//text()').extract()  # comment text
            comment = ""
            for co in commentStr:
                if len(co) >= 3:
                    comment += co.replace(" ", "")
            strTime = div.xpath('span[@class="ct"]//text()').extract()[0].split(u" 来自")[0]
            pushTime, flag = Time_stamp().time_handle(strTime, self.limitTime)
            comme['pushTime'] = pushTime
            comme["userName"] = userName
            comme["commentId"] = commentId
            comme["userId"] = userId
            comme["comment"] = comment
            commentItem = SinaCommentItem()
            for key, val in comme.items():
                commentItem[key] = val
            yield commentItem
        url_next = hxs.xpath(
            u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
        ).extract()
        if c != [] and url_next:
            data["page"] = True
            next_url = self.rootUrl + url_next[0]
            yield Request(url=next_url, callback=self.parse_comment,
                          meta={"data": data}, dont_filter=True)

    def load_keyword(self):
        # one weather keyword per line, shipped next to the trained model files
        with open(self.path + "weatherwords.txt", "r") as fs:
            words = fs.readlines()
        return words
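

# A minimal launch sketch (not part of the original file): the spider would
# normally run via `scrapy crawl SinaSpider`, but it can also be started
# programmatically. This assumes LIMIT_TIME is defined in the project's
# settings.py, since from_settings() reads it with settings.get('LIMIT_TIME').
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()  # loads settings.py, including LIMIT_TIME
    process = CrawlerProcess(settings)
    process.crawl(SinaSpider)  # instantiated through from_crawler -> from_settings
    process.start()  # blocks until the crawl finishes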