import logging
import os
from Queue import Empty, PriorityQueue

# Project-internal dependencies (StoppableThread, SqlModel, UserMgr,
# EmbeddedHttpServer, GetMsg, CrawlerResult, UserCrawler,
# TxtClassificatorWrapper) are assumed to be imported from elsewhere
# in this package.

logger = logging.getLogger(__name__)


class ResolvedTweetQueue(StoppableThread):
    """Consumes resolved tweets from a priority queue, classifies their
    URLs and persists the accepted ones in the finalUrl.db SQL model."""

    def __init__(self, streamDir, userDir, userBuilder, urlBuilder):
        StoppableThread.__init__(self, self.__class__.__name__)
        self.__queue = PriorityQueue(maxsize=50)
        self.__urlBuilder = urlBuilder
        self.__userBuilder = userBuilder
        self.__dir = os.path.join(streamDir, "tweets")
        self.__model = SqlModel(os.path.join(streamDir, "finalUrl.db"), drop=False)
        self.__userMgr = UserMgr(userDir)
        self.__server = EmbeddedHttpServer(self, self.__userMgr)
        self.__crawlers = []

    def tweetResolved(self, tweet):
        # Tweets carry the lowest priority (10); GetMsg requests (1) jump the queue.
        self.__queue.put((10, tweet))

    def atBegin(self):
        self.__server.start()

    def getServerUrl(self):
        return self.__server.getHref()

    def runPart(self):
        # Single iteration of the worker loop: block up to 3 s for the next
        # queue item and dispatch it by type.
        try:
            obj = self.__queue.get(block=True, timeout=3)[1]
            if isinstance(obj, GetMsg):
                self.__parseGet(obj)
            elif isinstance(obj, CrawlerResult):
                self.__afterCallback(obj.crawler(), obj.results())
            else:
                self.__parseTweet(obj)
        except Empty:
            pass
        return

    def __parseGet(self, getMsg):
        # Answer a finalUrls() request with copies of the stored URLs.
        urls = self.__model.selectUrls(getMsg.cat())
        urls = [u.copy() for u in urls]
        getMsg.setResponse(urls)

    def __parseTweet(self, tweet):
        for url in tweet.urls():
            if url.isError():
                logger.info(u"Tweet bad: wrong url: " + unicode(tweet) + u" " + unicode(url))
                self.__urlBuilder.delete(url)
                break

            url.setDocumentClasses(TxtClassificatorWrapper.instance().classify(url.getText()))
            # Reject root URLs, non-English documents and texts classified as "short".
            if url.isRoot() or url.lang() != "en" or "short" in url.documentClasses():
                logger.info(u"Tweet bad: " + unicode(tweet) + u" " + unicode(url))
                self.__urlBuilder.delete(url)
                break

            logger.info(u"Tweet good: " + unicode(tweet) + u" " + unicode(url))
            logger.info(u"URL: " + unicode(url))
            self.__model.updateUrl(url)

    def __callback(self, crawler, userFeatures):
        # Called from the crawler thread; hand the result back to this thread
        # through the queue. The item must be a (priority, payload) tuple so
        # that runPart() can unpack it; priority 5 (between GetMsg requests
        # and tweets) is an assumption.
        self.__queue.put((5, CrawlerResult(crawler, userFeatures)))

    def __createUserProfile(self):
        users = self.__model.selectUserWithoutCat(1)
        if not users:
            return False
        token = self.__server.getToken()
        for u in users:
            crawler = UserCrawler(token, self.__userMgr, self.__callback, userId=u.id)
            self.__crawlers.append(crawler)
        return True

    def __afterCallback(self, crawler, userFeatures):
        self.__crawlers.remove(crawler)
        # The extracted categories and languages are currently not stored anywhere.
        userFeatures.cats()
        userFeatures.langs()

    def atEnd(self):
        StoppableThread.atEnd(self)
        self.__server.stop()
        self.__userMgr.close()

    def finalUrls(self, cat=None):
        # Synchronous request: enqueue a GetMsg with top priority (1) and
        # return once the worker thread has filled in the response.
        req = GetMsg(cat)
        self.__queue.put((1, req))
        return req.getResponse()

    def __tweetWithUrlToRoot(self, tweet):
        for u in tweet.urls():
            if u.isRoot():
                return True
        return False
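

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the production code).
# It assumes StoppableThread provides Thread-like start() and stop() methods
# that drive the atBegin()/runPart()/atEnd() lifecycle, and it passes
# placeholder builders (None) because finalUrls() does not touch them.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import tempfile

    logging.basicConfig(level=logging.INFO)
    streamDir = tempfile.mkdtemp()
    userDir = tempfile.mkdtemp()

    queue = ResolvedTweetQueue(streamDir, userDir, userBuilder=None, urlBuilder=None)
    queue.start()        # assumed: inherited from StoppableThread
    try:
        # Enqueues a GetMsg and returns the URLs the worker thread selected.
        print queue.finalUrls(cat=None)
    finally:
        queue.stop()     # assumed: lets the worker run atEnd() and exit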