Exemplo n.º 1
0
 def __init__(self, streamDir, userDir, userBuilder, urlBuilder):
     """Wire up the work queue, URL store, user manager and embedded server.

     :param streamDir: directory that is joined with ``"tweets"`` (kept as
         the tweets directory path) and with ``"finalUrl.db"`` (handed to
         ``SqlModel``).
     :param userDir: passed straight to ``UserMgr``.
     :param userBuilder: stored on the instance for later use.
     :param urlBuilder: stored on the instance for later use.
     """
     threadName = self.__class__.__name__
     StoppableThread.__init__(self, threadName)

     # Bounded queue: at most 50 pending items.
     self.__queue = PriorityQueue(maxsize=50)

     # Collaborators supplied by the caller.
     self.__userBuilder = userBuilder
     self.__urlBuilder = urlBuilder

     # Crawlers that have been created and not yet removed.
     self.__crawlers = []

     self.__dir = os.path.join(streamDir, "tweets")
     dbPath = os.path.join(streamDir, "finalUrl.db")
     self.__model = SqlModel(dbPath, drop=False)

     # The server needs the user manager, so the manager is built first.
     userMgr = UserMgr(userDir)
     self.__userMgr = userMgr
     self.__server = EmbeddedHttpServer(self, userMgr)
Exemplo n.º 2
0
class ResolvedTweetQueue(StoppableThread):
    """Thread that drains one priority queue of tweets, GET requests and crawler results.

    Every queue entry is a ``(priority, payload)`` tuple; ``runPart`` takes
    the payload and dispatches on its type:

    * ``GetMsg``        -> answer it with URLs selected from the SQL model,
    * ``CrawlerResult`` -> finish the bookkeeping for that crawler,
    * anything else     -> treat it as a resolved tweet and process its URLs.
    """

    # PriorityQueue hands out the smallest entry first, so a lower number
    # means "served earlier".
    _PRIORITY_GET = 1              # a caller is waiting on the response
    _PRIORITY_CRAWLER_RESULT = 5   # chosen between GET requests and tweets
    _PRIORITY_TWEET = 10

    def __init__(self, streamDir, userDir, userBuilder, urlBuilder):
        """Create the queue, the URL store, the user manager and the HTTP server.

        :param streamDir: directory joined with ``"tweets"`` and with
            ``"finalUrl.db"`` (the latter path is given to ``SqlModel``).
        :param userDir: passed to ``UserMgr``.
        :param userBuilder: stored on the instance.
        :param urlBuilder: stored on the instance; used to delete rejected URLs.
        """
        StoppableThread.__init__(self, self.__class__.__name__)
        self.__queue = PriorityQueue(maxsize=50)
        self.__urlBuilder = urlBuilder
        self.__userBuilder = userBuilder
        self.__dir = os.path.join(streamDir, "tweets")
        self.__model = SqlModel(os.path.join(streamDir, "finalUrl.db"), drop=False)
        self.__userMgr = UserMgr(userDir)
        self.__server = EmbeddedHttpServer(self, self.__userMgr)
        self.__crawlers = []

    def tweetResolved(self, tweet):
        """Enqueue *tweet* for processing (blocks while the queue is full)."""
        self.__queue.put((self._PRIORITY_TWEET, tweet))

    def atBegin(self):
        """Start the embedded HTTP server."""
        self.__server.start()

    def getServerUrl(self):
        """Return the href reported by the embedded HTTP server."""
        return self.__server.getHref()

    def runPart(self):
        """Process at most one queue entry, waiting up to 3 seconds for it."""
        # Only the blocking get is guarded: an Empty raised by a handler
        # would be a different problem and must not be swallowed here.
        try:
            entry = self.__queue.get(block=True, timeout=3)
        except Empty:
            return

        obj = entry[1]  # entry is (priority, payload)
        if isinstance(obj, GetMsg):
            self.__parseGet(obj)
        elif isinstance(obj, CrawlerResult):
            self.__afterCallback(obj.crawler(), obj.results())
        else:
            self.__parseTweet(obj)

    def __parseGet(self, getMsg):
        """Answer *getMsg* with copies of the URLs selected for its category."""
        selected = self.__model.selectUrls(getMsg.cat())
        getMsg.setResponse([u.copy() for u in selected])

    def __parseTweet(self, tweet):
        """Classify the tweet's URLs and store the accepted ones.

        A URL is rejected when it is in error, or when (after
        classification) it is a root URL, its language is not ``"en"``, or
        its document classes contain ``"short"``. A rejected URL is deleted
        through the URL builder and the remaining URLs of the tweet are
        skipped.
        """
        for url in tweet.urls():
            if url.isError():
                logger.info(u"Tweet bad: wrong url: " + unicode(tweet) + u" " + unicode(url))
                self.__urlBuilder.delete(url)
                break
            url.setDocumentClasses(TxtClassificatorWrapper.instance().classify(url.getText()))
            if url.isRoot() or url.lang() != "en" or "short" in url.documentClasses():
                logger.info(u"Tweet bad: " + unicode(tweet) + u" " + unicode(url))
                self.__urlBuilder.delete(url)
                break
            logger.info(u"Tweet good: " + unicode(tweet) + u" " + unicode(url))
            logger.info(u"URL: " + unicode(url))
            # NOTE(review): URLs accepted before a later rejection stay
            # stored -- confirm that this is intended.
            self.__model.updateUrl(url)

    def __callback(self, crawler, userFeatures):
        """Hand a finished crawler's result over to the queue.

        The result is wrapped in a ``(priority, payload)`` tuple like every
        other entry, because ``runPart`` always reads index 1 of what it
        dequeues.
        """
        result = CrawlerResult(crawler, userFeatures)
        self.__queue.put((self._PRIORITY_CRAWLER_RESULT, result))

    def __createUserProfile(self):
        """Create a ``UserCrawler`` for each user returned by ``selectUserWithoutCat(1)``.

        :return: ``False`` when no such user was returned, ``True`` otherwise.
        """
        # NOTE(review): nothing in this class calls this method.
        users = self.__model.selectUserWithoutCat(1)
        if not users:
            return False
        token = self.__server.getToken()
        for u in users:
            crawler = UserCrawler(token, self.__userMgr, self.__callback, userId=u.id)
            self.__crawlers.append(crawler)
        return True

    def __afterCallback(self, crawler, userFeatures):
        """Forget *crawler* and touch the categories and languages of its result."""
        self.__crawlers.remove(crawler)
        # NOTE(review): both return values are discarded -- presumably
        # called for a side effect; confirm.
        userFeatures.cats()
        userFeatures.langs()

    def atEnd(self):
        """Run the base-class shutdown, then stop the server and close the user manager."""
        StoppableThread.atEnd(self)
        self.__server.stop()
        self.__userMgr.close()

    def finalUrls(self, cat=None):
        """Request the stored URLs for *cat* through the queue.

        :param cat: category forwarded to ``GetMsg``; defaults to ``None``.
        :return: whatever ``GetMsg.getResponse()`` yields -- presumably it
            waits until ``runPart`` has set the response; confirm.
        """
        req = GetMsg(cat)
        self.__queue.put((self._PRIORITY_GET, req))
        return req.getResponse()

    def __tweetWithUrlToRoot(self, tweet):
        """Return ``True`` when at least one URL of *tweet* is a root URL."""
        # NOTE(review): nothing in this class calls this method.
        return any(u.isRoot() for u in tweet.urls())