def __init__(self, filename="/run/media/eea1ee1d-e5c4-4534-9e0b-24308315e271/pynews/stream/clusteringData.db"):
    """Build a 3-way K-means clustering over document feature vectors and
    label the clusters "short"/"medium"/"long" by ascending mean word count.

    filename -- read-only shelve of crawled items ({"text": ..., ...}).
    """
    logger.info("Start building " + self.__class__.__name__)
    # Serializes access to the shared clusterer in classify().
    self.__mutex = threading.Semaphore()
    data = shelve.open(filename, protocol=-1, flag="r")
    langDetect = LangDetect.instance()
    # BUG FIX: the original used `is "en"`, which compares object identity,
    # not string equality -- whether it ever matched depended on interning.
    vectors = [features(item["text"])
               for digest, item in data.items()
               if item["text"] and item["text"] != "ERROR"
               and langDetect.detect(item["text"]) == "en"]
    self.__maxV = calcDiv(vectors)
    #vectors = normalize(vectors, self.__maxV)
    means = [array([10, 40, 0, 1]),
             array([30, 340, 2, 30]),
             array([120, 1500, 15, 50])]
    self.__clusterer = cluster.KMeansClusterer(3, euclidean_distance,
                                               initial_means=means,
                                               avoid_empty_clusters=True)
    self.__clusterer.cluster(vectors)
    # Count documents and total words per cluster id.
    klassIdToSize = {"0": 0, "1": 0, "2": 0}
    klassIdToWordsCount = {"0": 0, "1": 0, "2": 0}
    for item in data.itervalues():
        text = item["text"]
        if text and text != "ERROR":
            feat = features(text)
            #feat = normalize(feat, self.__maxV)
            klass = str(self.__clusterer.classify(feat))
            klassIdToSize[klass] += 1
            klassIdToWordsCount[klass] += len(text.split())
    data.close()
    results = []
    for klassId in ["0", "1", "2"]:
        meanWordsInKlass = klassIdToWordsCount[klassId] / klassIdToSize[klassId] if klassIdToSize[klassId] != 0 else 0
        results.append({"klass": klassId, "mean": meanWordsInKlass})
    logger.info("Clustering results: " + str(results))
    # BUG FIX: the original passed `lambda x, y: x["mean"] < y["mean"]` as a
    # Python 2 cmp function.  A cmp function must return -1/0/1; a boolean
    # (0 or 1, never negative) yields the wrong order, so the labels below
    # were mis-assigned.  Sort ascending by mean with key= instead, so the
    # cluster with the fewest words gets "short" and the most gets "long".
    sortedKlass = sorted(results, key=lambda x: x["mean"])
    self.__klassIdToLabel = {klassId: label
                             for klassId, label in zip([item["klass"] for item in sortedKlass],
                                                       ["short", "medium", "long"])}
def __init__(self, mainDir, input, inlinedWebpageDir):
    """Load crawled pages from the input shelve, wrap the usable entries
    in RowModel objects, and start the background URL downloader."""
    self.__mainDir = mainDir
    self.__input = input
    self.__langId = LangDetect.instance()
    self.__inlinedWebpageDir = inlinedWebpageDir
    # Make sure the output directory tree exists before anything writes to it.
    for directory in (self.__inlinedWebpageDir,
                      os.path.join(self.__inlinedWebpageDir, "htmls")):
        if not os.path.exists(directory):
            os.makedirs(directory)
    store = shelve.open(self.__input)
    self.__data = []
    self.__classes = set([self.defaultClass()])
    url2klass = self.__readKlassFile()
    logger.info("Read shelve...")
    for entry in store.itervalues():
        text = entry["text"]
        url = entry["url"]
        klass = self.__getKlass(url2klass, url)
        # NOTE(review): the original indentation was lost; assuming the
        # class is recorded only for rows that are actually kept -- confirm.
        if not self.__ignorable(text, url):
            self.__data.append(RowModel(url, text, klass, self))
            if klass:
                self.__classes.add(klass)
    logger.info("Done " + str(len(self.__data)))
    Publisher.subscribe(self._onSave, "model.save")
    self.__downloader = UrlDownloaderController(self)
    self.__downloader.start()
def setTextAndHtmlAndUrl(self, text, html, url):
    """Store the resolved text/html for this row, record a redirect if the
    final URL differs, and detect the text's language.

    Raises ValueError when any argument is None; re-raises any language
    detection failure after logging it.
    """
    if text is None:
        raise ValueError("Text is None!")
    if html is None:
        raise ValueError("HTML is None!")
    if url is None:
        raise ValueError("URL is None!")
    logger.info(u"Url " + self.__realUrl + u" resolved")
    self.__text = text
    self.__html = html
    if self.__realUrl != url:
        logger.info(u"Redirected from \"" + self.__realUrl + u"\" to \"" + url + u"\"")
        self.__realUrl = url
    try:
        self.__lang = LangDetect.instance().detect(text) if text else None
    except BaseException:
        logger.exception(u"lang detect error: " + unicode(text))
        # BUG FIX: the original did `raise e`, which re-raises the exception
        # but discards the original traceback in Python 2; a bare raise
        # re-raises with the traceback intact.
        raise
sortedKlass = sorted(results, lambda x,y: x["mean"] < y["mean"]) self.__klassIdToLabel = {klassIdWithLabel[0]: klassIdWithLabel[1] for klassIdWithLabel in zip([item["klass"] for item in sortedKlass], ["short", "medium", "long"])} def classify(self, document): try: self.__mutex.acquire() feat = features(document) #feat = normalize(feat, self.__maxV) docClass = self.__clusterer.classify(feat) return self.__klassIdToLabel[str(docClass)] finally: self.__mutex.release() if __name__ == "__main__": c = DocumentSizeClustering() langDetect = LangDetect.instance() data = shelve.open("/run/media/eea1ee1d-e5c4-4534-9e0b-24308315e271/pynews/stream/clusteringData.db", protocol=-1, flag="r") print "Documents: " + str(len(data)) position = 0 labels = {"short": 0, "medium": 0, "long": 0} input = [] for digest, item in data.items(): if item["text"] and item["text"] != "ERROR" and langDetect.detect(item["text"]) is "en": input.append(item) testItems = input #testItems = [] #for i in range(0, 150): # e = choice(input) # input.remove(e) # testItems.append(e)