Example #1
    def __init__(self, limitTime):
        self.limitTime = limitTime
        self.rconn = RedisSet().redisSet()
        self.path = "/data1/crawler/andycrawler/NewsSpider/weatheroutput/weatheroutput/"
        self.words = self.load_keyword()
        self.wm = WeatherModel(self.path + "LinearSVCl2.model",
                               self.path + "vectorizer.data",
                               self.path + "ch2.data",
                               self.path + "keywords.txt")
        super(SinaSpider, self).__init__(self.name)
Example #2
    def __init__(self, *a, **kw):

        self.rconn = RedisSet().redisSet()
        self.dba = Dba()
        self.keyword = {
            "新浪网": "Sina",
            "环球网": "Huanqiu",
            "搜狐网": "Sohu",
            "网易": "WangYi",
            "凤凰网": "Ifeng",
            "新华网": "Xinhua",
            "篱笆网": "Liba",
            "新民网": "Xinmin",
            "看看新闻网": "KanKan",
            "中国天气网": "Weather",
            "东方网": "Eastday",
            "人民网-上海": "People",
            "上海热线": "Online",
            "上观": "ShangGuan",
            "上海新闻网": "ShangHaiNews",
            "腾讯大申网": "Tencent",
            "宽带山": "KuanDai",
            "中国广播网": "Radio"
        }
        self.current_date = time.strftime('%Y-%m-%d',
                                          time.localtime(time.time()))
        super(QxjSpider, self).__init__(*a, **kw)
Example #3
    def __init__(self, *a, **kw):
        super(SinaMsgSpider, self).__init__(*a, **kw)
        self.rconn = RedisSet().redisSet()
Example #4
class SinaMsgSpider(CrawlSpider):
    name = "SinaMsgSpider"
    host = "https://weibo.cn"

    def __init__(self, *a, **kw):
        super(SinaMsgSpider, self).__init__(*a, **kw)
        self.rconn = RedisSet().redisSet()

    def start_requests(self):
        for lines in self.rconn.smembers("Tweets:sina"):
            sinaId = lines.split("--")[2]
            req_url = self.host + "/" + sinaId
            data = {"key": lines, "page": 1}
            yield Request(url=req_url,
                          callback=self.parse,
                          meta={"data": data})
            fans_url = self.host + "/" + sinaId + "/fans"
            yield Request(url=fans_url,
                          callback=self.parse_fans,
                          meta={"data": data})

    def parse_fans(self, response):
        data = response.meta["data"]
        hxs = Selector(response)
        div = hxs.xpath('body/div[@class="c"]')
        trs = div.xpath("//tr/td[2]/a[2]/@href").extract()
        keys = data["key"].split("--")
        for d in trs:
            try:
                userId = d.split("uid=")[1].split("&rl=")[0]
                info_url = self.host + "/" + userId + "/info"
                detail = {}
                detail["userId"] = userId
                detail["SinaName"] = keys[0]
                detail["SinaId"] = keys[2]
                yield Request(url=info_url,
                              callback=self.parse_info,
                              meta={"data": detail},
                              dont_filter=True)
            except:
                pass
        next_url = hxs.xpath('//a[text()="下页"]/@href'.decode('utf8')).extract()
        if next_url:
            yield Request(url=self.host + next_url[0],
                          callback=self.parse_fans,
                          meta={"data": data},
                          dont_filter=True)

    def parse_info(self, response):
        hxs = Selector(response)
        data = response.meta["data"]
        div = ";".join(
            hxs.xpath('//div[@class="c"][3]/text()').extract()) + ";"
        NickName = re.findall('昵称[::]?(.*?);'.decode('utf8'), div)
        Birthday = re.findall('生日[::]?(.*?);'.decode('utf8'), div)
        Gender = re.findall('性别[::]?(.*?);'.decode('utf8'), div)
        Marriage = re.findall('感情状况[::]?(.*?);'.decode('utf8'), div)
        Province = re.findall('地区[::]?(.*?);'.decode('utf8'), div)
        Signature = re.findall('简介[::]?(.*?);'.decode('utf8'), div)
        if NickName and NickName[0]:
            data['NickName'] = NickName[0]
        else:
            data['NickName'] = "Null"
        if Marriage and Marriage[0]:
            data['Marriage'] = Marriage[0]
        else:
            data['Marriage'] = "Null"
        if Birthday and Birthday[0]:
            data['Birthday'] = Birthday[0]
        else:
            data['Birthday'] = "Null"
        if Gender and Gender[0]:
            data['Gender'] = Gender[0]
        else:
            data['Gender'] = "Null"
        if Province and Province[0]:
            dou = Province[0].split(" ")
            if len(dou) == 2:
                data['Province'] = dou[0]
                data['City'] = dou[1]
            else:
                data['Province'] = dou[0]
                data['City'] = "Null"
        else:
            data['Province'] = "Null"
            data['City'] = "Null"
        if Signature and Signature[0]:
            data['Signature'] = Signature[0]
        else:
            data['Signature'] = "Null"
        req_url = "https://weibo.cn/attgroup/opening?uid=" + data["userId"]
        yield Request(url=req_url,
                      callback=self.parse_page,
                      meta={"data": data})

    def parse_page(self, response):
        hxs = Selector(response)
        data = response.meta["data"]
        msgs = ";".join(
            hxs.xpath('//div[@class="tip2"]/a/text()').extract()) + ";"
        Num_Fans = re.findall('粉丝\[(\d+)\]'.decode('utf8'), msgs)  # follower count
        Num_Follows = re.findall('关注\[(\d+)\]'.decode('utf8'), msgs)  # following count
        Num_Tweets = re.findall('微博\[(\d+)\]'.decode('utf8'), msgs)  # post count
        if Num_Fans and Num_Fans[0]:
            data["Num_Fans"] = Num_Fans[0]
        else:
            data["Num_Fans"] = "Null"
        if Num_Follows and Num_Follows[0]:
            data["Num_Follows"] = Num_Follows[0]
        else:
            data["Num_Follows"] = "Null"
        if Num_Tweets and Num_Tweets[0]:
            data["Num_Tweets"] = Num_Tweets[0]
        else:
            data["Num_Tweets"] = "Null"
        item = SinaInformationItem()
        for key, val in data.items():
            item[key] = val
        yield item

    def parse(self, response):
        hxs = Selector(response)
        data = response.meta["data"]
        c = hxs.xpath('body/div[@class="c" and @id]')
        for div in c:
            try:
                like = re.findall('赞\[(\d+)\]'.decode('utf8'),
                                  div.extract())[0]  # like count
                transfer = re.findall('转发\[(\d+)\]'.decode('utf8'),
                                      div.extract())[0]  # repost count
                commentNum = re.findall('评论\[(\d+)\]'.decode('utf8'),
                                        div.extract())[0]  # comment count
                contentId = div.xpath("@id").extract()[0].split('M_')[1]
                others = div.xpath('div/span[@class="ct"]/text()').extract()  # post time and client (phone or platform)
                strs = others[0].split(u"来自")
                pushTime, flag = Time_stamp().time_handle(strs[0].strip())
                tool = strs[1]
                detail = {}
                detail["key"] = data["key"]
                comment_url = "https://weibo.cn/comment/%s" % contentId
                detail["contentId"] = contentId
                detail["pushTime"] = pushTime
                detail["commentNum"] = commentNum
                detail["transfer"] = transfer
                detail["like"] = like
                detail["tool"] = tool
                yield Request(url=comment_url,
                              callback=self.parse_comment,
                              meta={"data": detail},
                              dont_filter=True)
                # break
            except:
                pass
        url_next = hxs.xpath(
            'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
            .decode('utf8')).extract()
        if url_next and data['page'] < 5:
            req_url = "https://weibo.cn%s" % url_next[0]
            data["page"] += 1
            yield Request(url=req_url,
                          callback=self.parse,
                          meta={"data": data},
                          dont_filter=True)

    def parse_comment(self, response):
        hxs = Selector(response)
        data = response.meta["data"]
        c = hxs.xpath('body/div[@class="c" and @id]')
        contentStr = c[0].xpath(
            'div/span[@class="ctt"]//text()').extract()  # post body text
        content = ""
        for con in contentStr:
            content += con
        content = content.strip(":")
        data["content"] = content
        keys = data["key"].split("--")
        data["SinaName"] = keys[0]
        data["SinaId"] = keys[2]
        del data["key"]
        item = SinaTweetsItem()
        for key, val in data.items():
            item[key] = val
        yield item
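
Note: the spider above reads its seed accounts from the Redis set "Tweets:sina"; each member is a "--"-delimited string in which index 0 is used as the display name (SinaName) and index 2 as the account id (sinaId/SinaId). A minimal seeding sketch under those assumptions (a plain redis-py client stands in for whatever RedisSet().redisSet() returns; the member value is made up):

import redis

# Hypothetical stand-in for RedisSet().redisSet(); the member format
# "<display name>--<extra>--<weibo id>" is inferred from keys[0]/keys[2] above.
rconn = redis.StrictRedis(host="localhost", port=6379, db=0)
rconn.sadd("Tweets:sina", u"新浪网--0--2028810631".encode("utf-8"))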
Example #5
class SinaSpider(CrawlSpider):
    name = "SinaSpider"
    rootUrl = "https://weibo.cn"

    def __init__(self, limitTime):
        self.limitTime = limitTime
        self.rconn = RedisSet().redisSet()
        self.path = "/data1/crawler/andycrawler/NewsSpider/weatheroutput/weatheroutput/"
        # self.path = "weatheroutput/weatheroutput/"
        self.words = self.load_keyword()
        self.wm = WeatherModel(self.path + "LinearSVCl2.model",
                               self.path + "vectorizer.data",
                               self.path + "ch2.data",
                               self.path + "keywords.txt")
        super(SinaSpider, self).__init__(self.name)

    @classmethod
    def from_settings(cls, settings):
        return cls(settings.get('LIMIT_TIME'))

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = SinaSpider.from_settings(crawler.settings)
        spider._set_crawler(crawler)
        return spider

    def start_requests(self):
        url = "https://weibo.cn/%s"
        le = 1  # leftover debug counter; with this starting value the "if le < 1" break below never fires
        for key in self.rconn.smembers("News:sina"):

            req_url = url % key.split("--")[-2]
            data = {"key": key, "page": 1}
            yield Request(url=req_url,
                          callback=self.parse,
                          meta={"data": data},
                          dont_filter=True)
            if le < 1:
                break
            le += 1

    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        data = response.meta["data"]
        flag_list = []
        for i in soup.find_all("div", class_="c")[1:-2]:
            strTime = i.find("span",
                             class_="ct").get_text(strip=True).split(u" 来自")[0]
            pushTime, flag = Time_stamp().time_handle(strTime, self.limitTime)
            flag_list.append(flag)
            if flag == 1:
                content_id = i["id"].strip("M_")
                # self.rconn.delete("Sina:content_id")
                redis_flag = self.rconn.sadd("Sina:content_id", content_id)
                # redis_flag = 1
                if redis_flag == 1:
                    detail = {}
                    detail["key"] = data["key"]
                    comment_url = "https://weibo.cn/comment/%s" % content_id
                    detail["contentId"] = content_id
                    detail["pushTime"] = pushTime
                    yield Request(url=comment_url,
                                  callback=self.parse_comment,
                                  meta={"data": detail})
                    # break
        if 2 not in flag_list:
            hxs = Selector(response)
            url_next = hxs.xpath(
                'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
                .decode('utf8')).extract()  # keep the list; index it only after checking it is non-empty
            if url_next:
                req_url = "https://weibo.cn%s" % url_next[0]
                yield Request(url=req_url,
                              callback=self.parse,
                              meta={"data": data})

    def parse_comment(self, response):
        data = response.meta["data"]
        hxs = Selector(response)
        if "page" not in data:
            detail = {}
            detail["contentId"] = data["contentId"]
            detail["pushTime"] = data["pushTime"]
            keys = data["key"].split("--")
            detail["SinaName"] = keys[0]
            detail["Vermicelli"] = keys[1]
            detail["SinaID"] = keys[2]
            detail["SinaOthID"] = keys[2]
            contentStr = hxs.xpath(
                '//div/span[@class="ctt"]//text()').extract()  # post body text
            reprintStr = hxs.xpath(
                '//div/span[@class="pms"]/preceding-sibling::span/a//text()'
            ).extract()
            commontStr = hxs.xpath(
                '//div/span[@class="pms"]//text()').extract()
            thumbs_upStr = hxs.xpath(
                '//div/span[@class="pms"]/following-sibling::span/a//text()'
            ).extract()
            content = "0"
            reprint = "0"
            commont = "0"
            if '[' in str(reprintStr[0]):
                reprint = str(reprintStr[0])[str(reprintStr[0]).index('[') +
                                             1:str(reprintStr[0]).index(']')]
            if '[' in str(commontStr[0]):
                commont = str(commontStr[0])[str(commontStr[0]).index('[') +
                                             1:str(commontStr[0]).index(']')]
            thumbs_up = str(thumbs_upStr[0])[str(thumbs_upStr[0]).index('[') +
                                             1:str(thumbs_upStr[0]).index(']')]
            for cd in contentStr:
                if len(cd) >= 3:
                    content += cd.replace(" ", "")
            detail["content"] = content
            detail["reprint"] = int(reprint)
            detail["commont"] = int(commont)
            detail["thumbs_up"] = int(thumbs_up)
            flag = int(self.wm.predict(detail["content"])[0])
            if flag != 1:
                total = 0
                for word in self.words:
                    if word.strip() in detail["content"]:
                        total += 1
                        if total >= 2:
                            flag = 1
                            break
            if flag == 1:
                detail["flag"] = 1
                contentItem = SinaContentItem()
                for key, val in detail.items():
                    contentItem[key] = val
                yield contentItem
                c = hxs.xpath('body/div[@class="c" and @id]')[1:]
            else:
                c = []
        else:
            c = hxs.xpath('body/div[@class="c" and @id]')
        for div in c:
            comme = {}
            comme["contentId"] = data["contentId"]
            ID = div.xpath("a/@href").extract_first()
            userName = div.xpath("a//text()").extract_first()
            commentId = div.xpath("@id").extract()[0].split('C_')[1]
            try:
                userId = ID.split("u/")[1]
            except:
                userId = ID.split('/')[1]
            commentStr = div.xpath(
                'span[@class="ctt"]//text()').extract()  # comment text
            comment = ""
            for co in commentStr:
                if len(co) >= 3:
                    comment += co.replace(" ", "")
            strTime = div.xpath(
                'span[@class="ct"]//text()').extract()[0].split(u" 来自")[0]
            pushTime, flag = Time_stamp().time_handle(strTime, self.limitTime)
            comme['pushTime'] = pushTime
            comme["userName"] = userName
            comme["commentId"] = commentId
            comme["userId"] = userId
            comme["comment"] = comment
            commentItem = SinaCommentItem()
            for key, val in comme.items():
                commentItem[key] = val
            yield commentItem
        url_next = hxs.xpath(
            'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
            .decode('utf8')).extract()
        if c != [] and url_next:
            data["page"] = True
            next_url = self.rootUrl + url_next[0]
            yield Request(url=next_url,
                          callback=self.parse_comment,
                          meta={"data": data},
                          dont_filter=True)

    def load_keyword(self):
        with open(self.path + "weatherwords.txt", "r") as fs:  # close the file after reading
            words = fs.readlines()
        return words
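
A note on how limitTime reaches SinaSpider: from_crawler delegates to from_settings, which pulls LIMIT_TIME out of the Scrapy settings and passes it to __init__. A minimal sketch, assuming LIMIT_TIME is defined in the project's settings.py (the value format is a guess; what Time_stamp().time_handle accepts is not shown here, and constructing the spider also needs the Redis connection and model files to be reachable):

# settings.py (hypothetical value; only the setting name comes from the code above)
LIMIT_TIME = "2018-01-01 00:00:00"

# Scrapy then builds the spider roughly as:
#   spider = SinaSpider.from_settings(crawler.settings)
# which is equivalent to SinaSpider(crawler.settings.get('LIMIT_TIME')).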
Example #6
class SoGouSpider(CrawlSpider):
    name = "SoGouSpiderS"
    host = "http://mp.weixin.qq.com"

    def __init__(self, *a, **kw):
        super(SoGouSpider, self).__init__(*a, **kw)
        self.rconn = RedisSet().redisSet()

    def start_requests(self):
        url = "http://weixin.sogou.com/weixin?type=1&s_from=input&query=%s&ie=utf8&_sug_=n&_sug_type_="
        for key in self.rconn.smembers("News:sogou"):
            req_url = url % key.split("--")[-1]
            yield Request(url=req_url, callback=self.parse, meta={"key": key})

    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        next_url = soup.find("p", class_="tit").find("a")["href"]
        yield Request(url=next_url,
                      callback=self.parse_result,
                      meta=response.meta)

    def parse_result(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        data = response.meta
        try:
            scr = soup.find_all("script")[-2:-1][0].get_text(strip=True)
            data_list = scr.split("var msgList = ")[1].split(
                'seajs.use("sougou/p')[0].strip().strip(";")
            j_data = json.loads(data_list)
            art_list = []
            for li in j_data["list"]:
                dic = {}
                dic["id"] = li["comm_msg_info"]["id"]
                dic["pushtime"] = li["comm_msg_info"]["datetime"]
                dic["title"] = li["app_msg_ext_info"]["title"]
                dic["fileid"] = li["app_msg_ext_info"]["fileid"]
                dic["url"] = self.host + li["app_msg_ext_info"][
                    "content_url"].replace("amp;", "")
                art_list.append(dic)
                for ls in li["app_msg_ext_info"]["multi_app_msg_item_list"]:
                    dict_ls = {}
                    dict_ls["id"] = dic["id"]
                    dict_ls["pushtime"] = dic["pushtime"]
                    dict_ls["url"] = self.host + ls["content_url"].replace(
                        "amp;", "")
                    dict_ls["title"] = ls["title"]
                    dict_ls["fileid"] = ls["fileid"]
                    art_list.append(dict_ls)
            for ks in art_list:
                timeFormat, flag = Time_stamp().time_handle(ks["pushtime"])
                if flag == 1:
                    line = data["key"].split("--")[-1] + "--" + str(
                        ks["pushtime"])
                    self.rconn.delete("SoGou:Account")
                    flag = self.rconn.sadd("SoGou:Account", line)
                    if flag == 1:
                        ks["pushtime"] = timeFormat
                        ks["keyword"] = data["key"]
                        data["ks"] = ks
                        yield Request(url=ks["url"],
                                      callback=self.parse_article,
                                      meta=data)
        except Exception, e:
            logger.error("parse_result error <<%s>>" % e)
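
For reference, parse_result above expects the "var msgList" JSON embedded in the account page to contain at least the fields it reads; an illustrative shape (field names are taken from the keys accessed in the code, values are made up):

# Illustrative msgList structure consumed by parse_result (values are made up).
msgList = {
    "list": [
        {
            "comm_msg_info": {"id": 1000000001, "datetime": 1514736000},
            "app_msg_ext_info": {
                "title": "Sample headline",
                "fileid": 123,
                "content_url": "/s?__biz=XXXX&amp;mid=XXXX&amp;idx=1",
                "multi_app_msg_item_list": [
                    {"title": "Second article", "fileid": 124,
                     "content_url": "/s?__biz=XXXX&amp;mid=XXXX&amp;idx=2"}
                ],
            },
        }
    ]
}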