Пример #1
0
 def __init__(self, limitTime):
     """Initialise the spider (variant of SinaSpider.__init__).

     Args:
         limitTime: oldest publish time the crawl goes back to; it is
             stored and later compared against post timestamps during
             parsing.
     """
     self.limitTime = limitTime
     # Redis connection used for seed accounts / de-duplication sets.
     self.rconn = RedisSet().redisSet()
     # Directory holding the serialized model artefacts loaded below.
     self.path = "/data1/crawler/andycrawler/NewsSpider/weatheroutput/weatheroutput/"
     self.words = self.load_keyword()
     # Pre-trained weather-topic classifier: LinearSVC model, vectorizer,
     # chi2 feature selector and a keyword list.
     self.wm = WeatherModel(self.path + "LinearSVCl2.model",
                            self.path + "vectorizer.data",
                            self.path + "ch2.data",
                            self.path + "keywords.txt")
     super(SinaSpider, self).__init__(self.name)
Пример #2
0
 def __init__(self):
     """Set up DB access, the weather model and SQL insert templates.

     NOTE(review): this excerpt is truncated -- the trailing ``SinComSql``
     triple-quoted string continues beyond the visible lines.
     """
     # Disabled earlier MongoDB wiring, kept for reference:
     # client = pymongo.MongoClient("192.168.20.216", 27017)
     # db = client["SinaWebchatWeather"]
     # self.sinaComment = db["SinaComment"]
     # self.sinaContent = db["SinaContent"]
     # self.sogou = db["SoGouContent"]
     # self.tweets = db["Tweets"]
     # self.Info = db["Information"]
     # Oracle access object; the INSERT templates below target QXJ.* tables.
     self.ora = Dba()
     self.path = "/data1/crawler/andycrawler/NewsSpider/weatheroutput/weatheroutput/"
     # self.path = "weatheroutput/weatheroutput/"
     self.keys = self.load_key()
     # Pre-trained weather-topic classifier (LinearSVC + vectorizer + chi2
     # feature selector + keyword list).
     self.wm = WeatherModel(
         self.path + "LinearSVCl2.model",  self.path + "vectorizer.data",
         self.path + "ch2.data", self.path + "weatherwords.txt")
     # jieba keyword extractor, used to tag post contents.
     self.tags = jieba.analyse.extract_tags
     # %-style SQL templates, filled in by callers before execution.
     self.ConSql = """
         INSERT INTO  QXJ.QXJ_YQ_WEIBO_DAY (sinaothid, sinaname, contentid,sinaid,vermicelli,content,flag,dta_date)
         VALUES('%s','%s','%s','%s','%s','%s','%s', to_date('%s','yyyy-mm-dd hh24:mi:ss')) """
     self.old_key_sql = """
         insert into qxj.qxj_yq_weibo_keyword_day (weibo_id, contentid, keyword, num, ts, dta_date)
         VALUES('%s','%s','%s','%d', to_date('%s','yyyy-mm-dd hh24:mi:ss'), date'%s')"""
     self.all_key_sql = """
         insert into qxj.qxj_keyword_all_day (keyword, type, dta_date) VALUES ('%s','%s',date'%s')"""
     self.CommSql = """
         INSERT INTO  QXJ.QXJ_YQ_PINGLUN_DAY (username, contentid, userid,comments,commentid,dta_date)
         VALUES('%s','%s','%s','%s','%s', to_date('%s','yyyy-mm-dd hh24:mi:ss'))"""
     self.SinComSql = """
Пример #3
0
class SinaSpider(CrawlSpider):
    """Crawl weibo.cn posts and comments for the followed accounts.

    Seed accounts are read from the Redis set ``News:sina``; each member is
    a ``--``-separated string whose fields include the account name,
    follower count and uid (see ``parse_comment``).  Posts newer than
    ``limitTime`` are classified with a pre-trained weather model;
    weather-related posts yield a ``SinaContentItem`` and their comments
    yield ``SinaCommentItem`` objects.
    """

    name = "SinaSpider"
    rootUrl = "https://weibo.cn"

    def __init__(self, limitTime):
        # Oldest publish time the spider crawls back to; interpreted by
        # Time_stamp().time_handle() during parsing.
        self.limitTime = limitTime
        # Redis connection: seed account set + post-id de-duplication set.
        self.rconn = RedisSet().redisSet()
        self.path = "/data1/crawler/andycrawler/NewsSpider/weatheroutput/weatheroutput/"
        self.words = self.load_keyword()
        # Pre-trained weather-topic classifier (LinearSVC model, vectorizer,
        # chi2 feature selector, keyword list).
        self.wm = WeatherModel(self.path + "LinearSVCl2.model",
                               self.path + "vectorizer.data",
                               self.path + "ch2.data",
                               self.path + "keywords.txt")
        super(SinaSpider, self).__init__(self.name)

    @classmethod
    def from_settings(cls, settings):
        """Build the spider from Scrapy settings (reads LIMIT_TIME)."""
        return cls(settings.get('LIMIT_TIME'))

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Scrapy entry point: construct the spider and attach the crawler."""
        spider = SinaSpider.from_settings(crawler.settings)
        spider._set_crawler(crawler)
        return spider

    def start_requests(self):
        """Yield one profile-page request per account in Redis set News:sina.

        The second-to-last ``--`` field of each set member is the account id
        used in the profile URL.  (A dead debug counter that could never
        trigger its break was removed.)
        """
        url = "https://weibo.cn/%s"
        for key in self.rconn.smembers("News:sina"):
            req_url = url % key.split("--")[-2]
            data = {"key": key, "page": 1}
            yield Request(url=req_url,
                          callback=self.parse,
                          meta={"data": data},
                          dont_filter=True)

    def parse(self, response):
        """Parse a profile page and schedule a comment request per fresh post.

        A post is followed when its publish time is within ``limitTime``
        (flag == 1) and its id was not seen before (Redis SADD returned 1).
        Pagination stops as soon as any post on the page is too old
        (flag == 2).
        """
        soup = BeautifulSoup(response.body, "lxml")
        data = response.meta["data"]
        flag_list = []
        # The first div.c is the profile header and the last two are page
        # chrome; only the middle entries are posts.
        for i in soup.find_all("div", class_="c")[1:-2]:
            strTime = i.find("span",
                             class_="ct").get_text(strip=True).split(u" 来自")[0]
            pushTime, flag = Time_stamp().time_handle(strTime, self.limitTime)
            flag_list.append(flag)
            if flag == 1:
                content_id = i["id"].strip("M_")
                # SADD returns 1 only for ids not seen before.
                redis_flag = self.rconn.sadd("Sina:content_id", content_id)
                if redis_flag == 1:
                    detail = {}
                    detail["key"] = data["key"]
                    comment_url = "https://weibo.cn/comment/%s" % content_id
                    detail["contentId"] = content_id
                    detail["pushTime"] = pushTime
                    yield Request(url=comment_url,
                                  callback=self.parse_comment,
                                  meta={"data": detail})
        if 2 not in flag_list:
            hxs = Selector(response)
            # u-literal instead of str.decode('utf8') so the xpath works on
            # both Python 2 and 3.
            url_next = hxs.xpath(
                u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
            ).extract()
            # extract() is empty on the last page; the original extract()[0]
            # raised IndexError there.
            if url_next:
                req_url = "https://weibo.cn%s" % url_next[0]
                yield Request(url=req_url,
                              callback=self.parse,
                              meta={"data": data})

    def parse_comment(self, response):
        """Parse a post's comment page.

        On the first page ("page" not yet in meta) the post itself is
        extracted, classified and emitted as a SinaContentItem when it is
        weather-related; its comments are emitted as SinaCommentItems.
        Follow-up pages carry "page" in meta and only extract comments.
        Comments of non-weather posts are skipped entirely.
        """
        data = response.meta["data"]
        hxs = Selector(response)
        if "page" not in data:  # dict.has_key() was removed in Python 3
            detail = {}
            detail["contentId"] = data["contentId"]
            detail["pushTime"] = data["pushTime"]
            keys = data["key"].split("--")
            detail["SinaName"] = keys[0]
            detail["Vermicelli"] = keys[1]
            detail["SinaID"] = keys[2]
            detail["SinaOthID"] = keys[2]
            contentStr = hxs.xpath(
                '//div/span[@class="ctt"]//text()').extract()  # post text
            reprintStr = hxs.xpath(
                '//div/span[@class="pms"]/preceding-sibling::span/a//text()'
            ).extract()
            commontStr = hxs.xpath(
                '//div/span[@class="pms"]//text()').extract()
            thumbs_upStr = hxs.xpath(
                '//div/span[@class="pms"]/following-sibling::span/a//text()'
            ).extract()
            # Counters look like u"转发[12]"; default to "0" when no bracket.
            content = ""  # was "0", which prefixed every stored post text
            reprint = "0"
            commont = "0"
            thumbs_up = "0"
            if '[' in str(reprintStr[0]):
                reprint = str(reprintStr[0])[str(reprintStr[0]).index('[') +
                                             1:str(reprintStr[0]).index(']')]
            if '[' in str(commontStr[0]):
                commont = str(commontStr[0])[str(commontStr[0]).index('[') +
                                             1:str(commontStr[0]).index(']')]
            # Guard added for consistency with the two counters above; the
            # original indexed unconditionally and could raise ValueError.
            if '[' in str(thumbs_upStr[0]):
                thumbs_up = str(thumbs_upStr[0])[str(thumbs_upStr[0]).index('[')
                                                 + 1:
                                                 str(thumbs_upStr[0]).index(']')]
            for cd in contentStr:
                # Skip very short fragments (punctuation / separators).
                if len(cd) >= 3:
                    content += cd.replace(" ", "")
            detail["content"] = content
            detail["reprint"] = int(reprint)
            detail["commont"] = int(commont)
            detail["thumbs_up"] = int(thumbs_up)
            # Model verdict, with a keyword fallback: two or more matched
            # weather keywords also mark the post as weather-related.
            flag = int(self.wm.predict(detail["content"])[0])
            if flag != 1:
                total = 0
                for word in self.words:
                    if word.strip() in detail["content"]:
                        total += 1
                        if total >= 2:
                            flag = 1
                            break
            if flag == 1:
                detail["flag"] = 1
                contentItem = SinaContentItem()
                for key, val in detail.items():
                    contentItem[key] = val
                yield contentItem
                # First row is the post itself; the rest are comments.
                c = hxs.xpath('body/div[@class="c" and @id]')[1:]
            else:
                c = []
        else:
            c = hxs.xpath('body/div[@class="c" and @id]')
        for div in c:
            comme = {}
            comme["contentId"] = data["contentId"]
            ID = div.xpath("a/@href").extract_first()
            userName = div.xpath("a//text()").extract_first()
            commentId = div.xpath("@id").extract()[0].split('C_')[1]
            try:
                userId = ID.split("u/")[1]
            except Exception:  # profile links are either /u/<id> or /<id>
                userId = ID.split('/')[1]
            commentStr = div.xpath(
                'span[@class="ctt"]//text()').extract()  # comment text
            comment = ""
            for co in commentStr:
                if len(co) >= 3:
                    comment += co.replace(" ", "")
            strTime = div.xpath(
                'span[@class="ct"]//text()').extract()[0].split(u" 来自")[0]
            pushTime, flag = Time_stamp().time_handle(strTime, self.limitTime)
            comme['pushTime'] = pushTime
            comme["userName"] = userName
            comme["commentId"] = commentId
            comme["userId"] = userId
            comme["comment"] = comment
            commentItem = SinaCommentItem()
            for key, val in comme.items():
                commentItem[key] = val
            yield commentItem
        url_next = hxs.xpath(
            u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="下页"]/@href'
        ).extract()
        if c != [] and url_next:
            data["page"] = True
            next_url = self.rootUrl + url_next[0]
            yield Request(url=next_url,
                          callback=self.parse_comment,
                          meta={"data": data},
                          dont_filter=True)

    def load_keyword(self):
        """Return the weather keyword list, one keyword per line.

        Uses a context manager so the file handle is always closed (the
        original opened the file and never closed it).
        """
        with open(self.path + "weatherwords.txt", "r") as fs:
            return fs.readlines()