class ClassificationWorker:
  """Worker that classifies phrases as good/bad with a Naive Bayes classifier.

  Receives "classify" requests over a DurableChannel, loads training data from
  Elasticsearch into Orange tables, lazily trains an NLTK Naive Bayes
  classifier, and writes the predicted class and probability back to the
  phrase document in Elasticsearch.

  NOTE: written for Python 2 (list-returning map(), dict.values()[0] indexing,
  builtin reduce()).
  """

  def __init__(self, config):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainD = None
    self.classifier = None
    self.phraseId = None
    self.phraseData = None
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    # features come from the generator config plus every processor module
    self.features = self.config["generator"]["features"]
    for module in self.config["processor"]["modules"]:
      self.features = self.features + module["features"]
    self.workerName = "bayzee.classification.worker"
    self.timeout = 600000
    self.dispatchers = {}
    #creating worker
    self.worker = DurableChannel(self.workerName, config)

  def classify(self):
    """Message loop: classify one phrase per "classify" request until killed."""
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          # other dispatchers still registered; requeue the kill for later
          self.worker.send(content="kill", to=self.workerName)
        continue
      elif message["content"]["type"] == "classify":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        self.phraseId = message["content"]["phraseId"]
        # lazily train the classifier on the first request
        if self.classifier == None:
          self.trainD = self.__loadDataFromES("train", None)
          self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
          self.__train()
        # reload training data so the test row can share its discretized domain
        self.trainD = self.__loadDataFromES("train", None)
        testD = self.__loadDataFromES("test", self.trainD.domain)
        self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
        testD = orange.ExampleTable(self.trainD.domain, testD)
        for row in testD:
          phrase = row.getmetas().values()[0].value
          featureSet = {}
          for i, feature in enumerate(self.features):
            featureSet[feature["name"]] = row[i].value
          prob = self.classifier.prob_classify(featureSet).prob("1")
          classType = self.classifier.classify(featureSet)
          self.phraseData["_source"]["prob"] = prob
          self.phraseData["_source"]["class_type"] = classType
          self.logger.info("Classified '" + phrase + "' as " + classType + " with probability " + str(prob))
          self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=self.phraseId, body=self.phraseData["_source"])
        self.worker.reply(message, {"phraseId": self.phraseId, "status" : "classified", "type" : "reply"}, 120000000)
    self.logger.info("Terminating classification worker")

  def __getOrangeVariableForFeature(self, feature):
    """Map a feature config entry to the matching Orange variable type."""
    if feature["isNumerical"]:
      return orange.FloatVariable(feature["name"])
    else:
      return orange.EnumVariable(feature["name"])

  def __train(self):
    """Train an NLTK Naive Bayes classifier from the discretized training table."""
    for a in self.trainD.domain.attributes:
      self.logger.info("%s: %s" % (a.name, reduce(lambda x, y: x + ', ' + y, [i for i in a.values])))
    trainSet = []
    for row in self.trainD:
      classType = row[-1].value
      featureSet = {}
      for i, feature in enumerate(self.features):
        featureSet[feature["name"]] = row[i].value
      trainSet.append((featureSet, classType))
    self.logger.info("\nTraining Naive Bayes Classifier with " + str(len(trainSet)) + " phrases...")
    self.classifier = nltk.NaiveBayesClassifier.train(trainSet)
    self.classifier.show_most_informative_features(50)

  def __loadDataFromES(self, dataType, domain):
    """Build an Orange ExampleTable from ES phrase documents.

    dataType: "train" loads labeled training phrases (and builds the domain),
    "holdout" loads the hold-out set, anything else loads the single phrase
    identified by self.phraseId into the supplied domain.
    """
    table = None
    if dataType != "train":
      table = orange.ExampleTable(domain)
    else:
      attributes = map(self.__getOrangeVariableForFeature, self.features)
      classAttribute = orange.EnumVariable("is_good", values = ["0", "1"])
      domain = orange.Domain(attributes, classAttribute)
      domain.addmeta(orange.newmetaid(), orange.StringVariable("phrase"))
      table = orange.ExampleTable(domain)
    phrases = []
    if dataType == "train":
      phrasesCount = self.esClient.count(index=self.processorIndex, doc_type=self.processorPhraseType, body={"query":{"terms":{"is_training":["1","0"]}}})
      size = phrasesCount["count"]
      phrases = self.esClient.search(index=self.processorIndex, doc_type=self.processorPhraseType, body={"query":{"terms":{"is_training":["1","0"]}}}, size=size)
      phrases = phrases["hits"]["hits"]
    elif dataType == "holdout":
      # BUG FIX: original assigned `phraseCount` but read `phrasesCount`,
      # raising NameError whenever the hold-out set was loaded
      phrasesCount = self.esClient.count(index=self.processorIndex, doc_type=self.processorPhraseType, body={"query":{"terms":{"is_holdout":["1","0"]}}})
      size = phrasesCount["count"]
      phrases = self.esClient.search(index=self.processorIndex, doc_type=self.processorPhraseType, body={"query":{"terms":{"is_holdout":["1","0"]}}}, size=size)
      phrases = phrases["hits"]["hits"]
    else:
      self.phraseData = self.esClient.get(index=self.processorIndex, doc_type=self.processorPhraseType, id=self.phraseId)
      phrases = [self.phraseData]
    for row in phrases:
      try:
        row = row["_source"]
        featureValues = []
        classType = "?"
        for feature in self.features:
          featureValues.append(row["features"][feature["name"]].encode("ascii"))
        if dataType == "train":
          classType = row["is_training"].encode("ascii", "ignore")
        elif dataType == "holdout":
          classType = row["is_holdout"].encode("ascii")
        example = None
        # enum attributes must know every observed value before Example creation
        for i, featureValue in enumerate(featureValues):
          attr = domain.attributes[i]
          if type(attr) is orange.EnumVariable:
            attr.addValue(featureValue)
        example = orange.Example(domain, (featureValues + [classType]))
        example[domain.getmetas().items()[0][0]] = row["phrase"].encode("ascii")
        table.append(example)
      except Exception:
        # narrowed from a bare except: (which also swallowed KeyboardInterrupt);
        # best-effort: skip the malformed phrase document and keep loading
        self.logger.error("Error classifying phrase '" + row["phrase"] + "'")
    return table

  def __calculateMeasures(self):
    """Compute precision/recall/F-measure on the hold-out set and log them."""
    truePositives = 0
    trueNegatives = 0
    totalPositives = 0
    totalNegatives = 0
    totalHoldOutGoodPhrases = 0
    totalHoldOutBadPhrases = 0
    self.trainD = self.__loadDataFromES("train", None)
    # BUG FIX: was "hold", which __loadDataFromES does not recognize and so
    # fell into the single-phrase else-branch; "holdout" loads the hold-out set
    self.holdOutD = self.__loadDataFromES("holdout", self.trainD.domain)
    self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
    self.holdOutD = orange.ExampleTable(self.trainD.domain, self.holdOutD)
    for row in self.holdOutD:
      actualClassType = row[-1].value
      phrase = row.getmetas().values()[0].value
      featureSet = {}
      for i, feature in enumerate(self.features):
        featureSet[feature["name"]] = row[i].value
      if self.classifier == None:
        # NOTE(review): self.classifierFilePath is never assigned in this
        # class — presumably set externally; confirm before relying on it
        classifierFile = open(self.classifierFilePath)
        self.classifier = pickle.load(classifierFile)
        classifierFile.close()
      prob = self.classifier.prob_classify(featureSet).prob("1")
      classType = self.classifier.classify(featureSet)
      if classType == "1":
        totalPositives += 1
        if classType == actualClassType:
          truePositives += 1
      else:
        totalNegatives += 1
        if classType == actualClassType:
          trueNegatives += 1
      if actualClassType == "1":
        totalHoldOutGoodPhrases += 1
      else:
        totalHoldOutBadPhrases += 1
    # NOTE(review): these divisions raise ZeroDivisionError if a bucket is
    # empty (e.g. nothing classified positive) — preserved as-is
    precisionOfGood = 100.0 * truePositives / totalPositives
    recallOfGood = 100.0 * truePositives / totalHoldOutGoodPhrases
    fMeasureOfGood = 2.0 * precisionOfGood * recallOfGood / (precisionOfGood + recallOfGood)
    precisionOfBad = 100.0 * trueNegatives / totalNegatives
    recallOfBad = 100.0 * trueNegatives / totalHoldOutBadPhrases
    fMeasureOfBad = 2.0 * precisionOfBad * recallOfBad / (precisionOfBad + recallOfBad)
    self.logger.info("\nPrecision of Good: " + str(round(precisionOfGood, 2)) + "%")
    self.logger.info("Recall of Good: " + str(round(recallOfGood, 2)) + "%")
    self.logger.info("Balanced F-measure of Good: " + str(round(fMeasureOfGood, 2)) + "%")
    self.logger.info("Precision of Bad: " + str(round(precisionOfBad, 2)) + "%")
    self.logger.info("Recall of Bad: " + str(round(recallOfBad, 2)) + "%")
    self.logger.info("Balanced F-measure of Bad: " + str(round(fMeasureOfBad, 2)) + "%")

  def unregisterDispatcher(self, dispatcher, message):
    """Drop a dying dispatcher; when none remain, ask this worker to die."""
    if message == "dying":
      self.dispatchers.pop(dispatcher, None)
      if len(self.dispatchers) == 0:
        self.worker.send(content="kill", to=self.workerName)
class GenerationWorker:
  """Worker that extracts feature values for phrases from the corpus.

  Listens on a DurableChannel for "generate" requests, computes corpus-level
  statistics (doc count, scores, term frequencies scraped from the ES
  explanation) plus processor-supplied features for the phrase, and indexes
  the result back into Elasticsearch.

  NOTE: written for Python 2 (list-returning map()).
  """

  def __init__(self, config, trainingDataset, holdOutDataset):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainingDataset = trainingDataset
    self.holdOutDataset = holdOutDataset
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.timeout = 6000000
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"]+"__phrase"
    count = self.esClient.count(index=self.corpusIndex, doc_type=self.corpusType, body={"query":{"match_all":{}}})
    self.corpusSize = count["count"]
    # feature names from the generator config plus every processor module
    self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
    for module in config["processor"]["modules"]:
      self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])
    self.workerName = "bayzee.generation.worker"
    self.dispatchers = {}
    #creating worker
    self.worker = DurableChannel(self.workerName, config)

  def generate(self):
    """Entry point: run the feature-extraction message loop."""
    self.__extractFeatures()

  def __extractFeatures(self):
    """Message loop: compute and store features for one phrase per request."""
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          # other dispatchers still registered; requeue the kill for later
          self.worker.send(content="kill", to=self.workerName)
        continue
      elif message["content"]["type"] == "generate":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        phraseId = message["content"]["phraseId"]
        phraseData = self.esClient.get(index=self.processorIndex, doc_type=self.processorPhraseType, id = phraseId)
        # format string used to round float features, e.g. "{0:.2f}"
        floatPrecision = "{0:." + str(self.config["generator"]["floatPrecision"]) + "f}"
        token = phraseData["_source"]["phrase"]
        documentId = phraseData["_source"]["document_id"]
        self.logger.info("Extracted common features for phrase '" + token + "'")
        entry = {}
        shouldMatch = map(lambda x: {"match_phrase":{x:token}}, self.corpusFields)
        query = {"query":{"bool":{"should":shouldMatch}}}
        data = self.esClient.search(index=self.corpusIndex, doc_type=self.corpusType, body=query, explain=True, size=self.corpusSize)
        entry["max_score"] = 0
        maxScore = 0
        avgScore = 0
        maxTermFrequency = 0
        avgTermFrequency = 0
        for hit in data["hits"]["hits"]:
          avgScore += float(hit["_score"])
          numOfScores = 0
          hitTermFrequency = 0
          # scrape "tf(freq=N)" fragments out of the serialized ES explanation
          explanation = json.dumps(hit["_explanation"])
          while len(explanation) > len(token):
            indexOfToken = explanation.find("tf(") + len("tf(")
            if indexOfToken < len("tf("):
              break
            explanation = explanation[indexOfToken:]
            freqToken = explanation.split(")")[0]
            explanation = explanation.split(")")[1]
            if freqToken.find("freq=") >= 0:
              numOfScores += 1
              hitTermFrequency += float(freqToken.split("=")[1])
          if numOfScores > 0:
            hitTermFrequency = hitTermFrequency / numOfScores
          if maxTermFrequency < hitTermFrequency:
            maxTermFrequency = hitTermFrequency
          avgTermFrequency += hitTermFrequency
        if len(data["hits"]["hits"]) > 0:
          avgTermFrequency = avgTermFrequency * 1.0 / len(data["hits"]["hits"])
        if int(data["hits"]["total"]) > 0:
          avgScore = (avgScore * 1.0) / int(data["hits"]["total"])
        if data["hits"]["max_score"] != None:
          maxScore = data["hits"]["max_score"]
        # only emit the features the configuration asks for
        if "max_score" in self.featureNames:
          entry["max_score"] = floatPrecision.format(float(maxScore))
        if "doc_count" in self.featureNames:
          entry["doc_count"] = floatPrecision.format(float(data["hits"]["total"]))
        if "avg_score" in self.featureNames:
          entry["avg_score"] = floatPrecision.format(float(avgScore))
        if "max_term_frequency" in self.featureNames:
          entry["max_term_frequency"] = floatPrecision.format(float(maxTermFrequency))
        if "avg_term_frequency" in self.featureNames:
          entry["avg_term_frequency"] = floatPrecision.format(float(avgTermFrequency))
        # get additional features
        for processorInstance in self.config["processor_instances"]:
          processorInstance.extractFeatures(self.config, token, entry)
        phraseData["_source"]["features"] = entry
        if token in self.trainingDataset:
          phraseData["_source"]["is_training"] = self.trainingDataset[token].strip()
        if token in self.holdOutDataset:
          phraseData["_source"]["is_holdout"] = self.holdOutDataset[token].strip()
        self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=phraseId, body=phraseData["_source"])
        self.worker.reply(message, {"phraseId": phraseId, "status" : "generated", "type" : "reply"}, 120000000)
      # NOTE(review): indentation reconstructed — placed at loop level so this
      # branch is reachable (nested in the "generate" branch it never matches);
      # confirm against the original formatting
      if message["content"]["type"] == "stop_dispatcher":
        self.worker.reply(message, {"phraseId": -1, "status" : "stop_dispatcher", "type" : "stop_dispatcher"}, self.timeout)
    self.logger.info("Terminating generation worker")

  def unregisterDispatcher(self, dispatcher, message):
    """Drop a dying dispatcher; when none remain, ask this worker to die."""
    if message == "dying":
      self.dispatchers.pop(dispatcher, None)
      if len(self.dispatchers) == 0:
        self.worker.send(content="kill", to=self.workerName)
class AnnotationWorker:
  """Worker that annotates corpus documents by extracting shingle phrases.

  Receives "annotate" requests over a DurableChannel, runs each configured
  text field of the document through the Elasticsearch shingle analyzer,
  filters/normalizes the resulting phrases, and indexes each new phrase as a
  document keyed by a normalized form of the phrase text.

  NOTE: written for Python 2 (list-returning map()/filter()).
  """

  def __init__(self, config):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.workerName = "bayzee.annotation.worker"
    self.timeout = 6000
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.analyzerIndex = self.corpusIndex + "__analysis__"
    self.worker = DurableChannel(self.workerName, config)
    self.dispatchers = {}

  def annotate(self):
    """Message loop: extract and index phrases for one document per request."""
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          # other dispatchers still registered; requeue the kill for later
          self.worker.send(content="kill", to=self.workerName)
        continue
      elif message["content"]["type"] == "annotate":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        documentId = message["content"]["documentId"]
        document = self.esClient.get(index=self.corpusIndex, doc_type=self.corpusType, id=documentId, fields=self.corpusFields)
        if "fields" in document:
          for field in self.corpusFields:
            shingles = []
            if field in document["fields"]:
              if type(document["fields"][field]) is list:
                # multi-valued field: analyze each non-empty element
                for element in document["fields"][field]:
                  if len(element) > 0:
                    shingleTokens = self.esClient.indices.analyze(index=self.analyzerIndex, body=element, analyzer="analyzer_shingle")
                    shingles += shingleTokens["tokens"]
              else:
                if len(document["fields"][field]) > 0:
                  shingles = self.esClient.indices.analyze(index=self.analyzerIndex, body=document["fields"][field], analyzer="analyzer_shingle")["tokens"]
            shingles = map(self.__replaceUnderscore, shingles)
            shingles = filter(self.__filterTokens, shingles)
            if shingles != None and len(shingles) > 0:
              for shingle in shingles:
                phrase = shingle["token"]
                key = self.__keyify(phrase)
                if len(key) > 0:
                  data = {"phrase": phrase,"phrase__not_analyzed": phrase,"document_id": document["_id"]}
                  if not self.esClient.exists(index=self.processorIndex, doc_type=self.processorPhraseType, id=key):
                    self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=key, body=data)
        # NOTE(review): placement reconstructed from collapsed source —
        # throttle before running the per-document processor annotators
        sleep(1)
        for processorInstance in self.config["processor_instances"]:
          processorInstance.annotate(self.config, documentId)
        self.worker.reply(message, {"documentId": documentId, "status" : "processed", "type" : "reply"}, self.timeout)
    self.logger.info("Terminating annotation worker")

  def unregisterDispatcher(self, dispatcher, message):
    """Drop a dying dispatcher; when none remain, ask this worker to die."""
    if message == "dying":
      self.dispatchers.pop(dispatcher, None)
      if len(self.dispatchers) == 0:
        self.worker.send(content="kill", to=self.workerName)

  def __keyify(self, phrase):
    """Normalize a phrase into a deterministic document key.

    Strips the phrase, replaces non-alphanumerics with spaces, collapses
    whitespace, lowercases, and joins words with hyphens. Returns "" for an
    all-whitespace phrase.
    """
    phrase = phrase.strip()
    if len(phrase) == 0:
      return ""
    # BUG FIX: the original re-derived each step from `phrase`, silently
    # discarding the re.sub() and lower() results; chain off `key` instead
    key = re.sub("[^A-Za-z0-9]", " ", phrase)
    key = " ".join(key.split())
    key = key.lower()
    key = "-".join(key.split())
    return key

  def __replaceUnderscore(self, shingle):
    """Remove underscores from a shingle token and collapse whitespace."""
    token = shingle["token"]
    token = token.replace("_", "")
    token = re.sub('\s+', ' ', token).strip()
    shingle["token"] = token
    return shingle

  def __filterTokens(self, shingle):
    """Reject shingles whose first/last words are stop words or numbers."""
    global esStopWords
    tokens = shingle["token"].split(" ")
    firstToken = tokens[0]
    lastToken = tokens[-1]
    isValid = True
    isValid = (isValid and lastToken != None)
    isValid = (isValid and len(lastToken) > 1)
    isValid = (isValid and not firstToken.replace(".", "", 1).isdigit())
    isValid = (isValid and not lastToken.replace(".", "", 1).isdigit())
    isValid = (isValid and firstToken not in esStopWords)
    isValid = (isValid and lastToken not in esStopWords)
    return isValid
class GenerationWorker:
  """Worker that extracts feature values for phrases from the corpus.

  NOTE(review): this is a second, token-for-token identical definition of
  GenerationWorker (the first appears earlier in this file, differently
  formatted). If both live in the same module, this later definition rebinds
  the name and silently shadows the first — likely an accidental duplicate;
  confirm and remove one copy.

  NOTE: written for Python 2 (list-returning map()).
  """

  def __init__(self, config, trainingDataset, holdOutDataset):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainingDataset = trainingDataset
    self.holdOutDataset = holdOutDataset
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.timeout = 6000000
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    count = self.esClient.count(index=self.corpusIndex, doc_type=self.corpusType, body={"query": { "match_all": {} }})
    self.corpusSize = count["count"]
    # feature names from the generator config plus every processor module
    self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
    for module in config["processor"]["modules"]:
      self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])
    self.workerName = "bayzee.generation.worker"
    self.dispatchers = {}
    #creating worker
    self.worker = DurableChannel(self.workerName, config)

  def generate(self):
    """Entry point: run the feature-extraction message loop."""
    self.__extractFeatures()

  def __extractFeatures(self):
    """Message loop: compute and store features for one phrase per request."""
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          # other dispatchers still registered; requeue the kill for later
          self.worker.send(content="kill", to=self.workerName)
        continue
      elif message["content"]["type"] == "generate":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        phraseId = message["content"]["phraseId"]
        phraseData = self.esClient.get(index=self.processorIndex, doc_type=self.processorPhraseType, id=phraseId)
        # format string used to round float features, e.g. "{0:.2f}"
        floatPrecision = "{0:." + str(self.config["generator"]["floatPrecision"]) + "f}"
        token = phraseData["_source"]["phrase"]
        documentId = phraseData["_source"]["document_id"]
        self.logger.info("Extracted common features for phrase '" + token + "'")
        entry = {}
        shouldMatch = map(lambda x: {"match_phrase": { x: token }}, self.corpusFields)
        query = {"query": {"bool": {"should": shouldMatch}}}
        data = self.esClient.search(index=self.corpusIndex, doc_type=self.corpusType, body=query, explain=True, size=self.corpusSize)
        entry["max_score"] = 0
        maxScore = 0
        avgScore = 0
        maxTermFrequency = 0
        avgTermFrequency = 0
        for hit in data["hits"]["hits"]:
          avgScore += float(hit["_score"])
          numOfScores = 0
          hitTermFrequency = 0
          # scrape "tf(freq=N)" fragments out of the serialized ES explanation
          explanation = json.dumps(hit["_explanation"])
          while len(explanation) > len(token):
            indexOfToken = explanation.find("tf(") + len("tf(")
            if indexOfToken < len("tf("):
              break
            explanation = explanation[indexOfToken:]
            freqToken = explanation.split(")")[0]
            explanation = explanation.split(")")[1]
            if freqToken.find("freq=") >= 0:
              numOfScores += 1
              hitTermFrequency += float(freqToken.split("=")[1])
          if numOfScores > 0:
            hitTermFrequency = hitTermFrequency / numOfScores
          if maxTermFrequency < hitTermFrequency:
            maxTermFrequency = hitTermFrequency
          avgTermFrequency += hitTermFrequency
        if len(data["hits"]["hits"]) > 0:
          avgTermFrequency = avgTermFrequency * 1.0 / len(data["hits"]["hits"])
        if int(data["hits"]["total"]) > 0:
          avgScore = (avgScore * 1.0) / int(data["hits"]["total"])
        if data["hits"]["max_score"] != None:
          maxScore = data["hits"]["max_score"]
        # only emit the features the configuration asks for
        if "max_score" in self.featureNames:
          entry["max_score"] = floatPrecision.format(float(maxScore))
        if "doc_count" in self.featureNames:
          entry["doc_count"] = floatPrecision.format(float(data["hits"]["total"]))
        if "avg_score" in self.featureNames:
          entry["avg_score"] = floatPrecision.format(float(avgScore))
        if "max_term_frequency" in self.featureNames:
          entry["max_term_frequency"] = floatPrecision.format(float(maxTermFrequency))
        if "avg_term_frequency" in self.featureNames:
          entry["avg_term_frequency"] = floatPrecision.format(float(avgTermFrequency))
        # get additional features
        for processorInstance in self.config["processor_instances"]:
          processorInstance.extractFeatures(self.config, token, entry)
        phraseData["_source"]["features"] = entry
        if token in self.trainingDataset:
          phraseData["_source"]["is_training"] = self.trainingDataset[token].strip()
        if token in self.holdOutDataset:
          phraseData["_source"]["is_holdout"] = self.holdOutDataset[token].strip()
        self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=phraseId, body=phraseData["_source"])
        self.worker.reply(message, { "phraseId": phraseId, "status": "generated", "type": "reply" }, 120000000)
      # NOTE(review): indentation reconstructed — placed at loop level so this
      # branch is reachable (nested in the "generate" branch it never matches);
      # confirm against the original formatting
      if message["content"]["type"] == "stop_dispatcher":
        self.worker.reply(message, { "phraseId": -1, "status": "stop_dispatcher", "type": "stop_dispatcher" }, self.timeout)
    self.logger.info("Terminating generation worker")

  def unregisterDispatcher(self, dispatcher, message):
    """Drop a dying dispatcher; when none remain, ask this worker to die."""
    if message == "dying":
      self.dispatchers.pop(dispatcher, None)
      if len(self.dispatchers) == 0:
        self.worker.send(content="kill", to=self.workerName)