def __init__(self):
    """Attach the database connection, token filter, scoring settings and stop words to ``self``."""
    # External collaborators (constructed in the same order as before).
    self.dbConnection = HelpConnection()
    self.filt = NormalTokFilter()

    # Score threshold.
    self.MIN_SCORE = 0.1

    # Per-component weights.
    self.title_weight = 1
    self.body_weight = 1
    self.tag_weight = 0.5
    self.rating_weight = 0.3
    self.view_weight = 0.1

    # Normalisation bounds; both start at 1.
    self.max_views = 1
    self.max_rating = 1

    # Common words; kept as a list, in the original order.
    self.stop_words = [
        'i', 'the', 'of', 'to', 'and', 'a', 'in', 'is', 'it', 'you',
        'that', 'he', 'was', 'for', 'on', 'are', 'with', 'as', 'I', 'his',
        'they', 'be', 'at', 'one', 'have', 'this', 'from', 'or', 'had', 'by',
        'hot', 'but', 'some', 'we', 'can', 'out', 'other', 'were', 'all',
        'there', 'when', 'up', 'use', 'your', 'said', 'an', 'each', 'she',
        'which', 'do', 'their', 'will', 'way', 'about', 'many', 'then',
        'them', 'would', 'like', 'so', 'these', 'her', 'thing', 'see', 'him',
        'has', 'look', 'more', 'day', 'could', 'go', 'come', 'did', 'no',
        'most', 'people', 'my', 'over', 'know', 'water', 'who', 'may', 'down',
        'object', 'side', 'been', 'now', 'any', 'work', 'part', 'take',
        'place', 'made', 'live', 'back', 'little', 'only', 'round', 'man',
    ]
class HInferenceEngine():
    """Pick the stored answer whose question best matches a list of keywords.

    Candidate questions are fetched by title keyword, each one is ranked by
    a weighted sum of title, body, tag, rating and view scores, and the
    answers of the best-ranked question are then looked up.
    """

    # Returned by findAnswer whenever no answer row can be retrieved.
    NO_ANSWER = "I could not find an answer"

    def __init__(self):
        # Questions ranked below this value are never returned.
        self.MIN_SCORE = .1
        self.dbConnection = HelpConnection()
        self.filt = NormalTokFilter()
        # Weight of each component in the rank computed by rankQuestion.
        self.title_weight = 1
        self.tag_weight = .5
        self.rating_weight = .3
        self.view_weight = .1
        self.body_weight = 1
        # Normalisers for view/rating scores; find_best_answer overwrites
        # them with the maxima of the current result set.
        self.max_views = 1
        self.max_rating = 1
        # Words dropped from keywords and bodies before they are compared.
        self.stop_words = [
            'i', 'the', 'of', 'to', 'and', 'a', 'in', 'is', 'it', 'you',
            'that', 'he', 'was', 'for', 'on', 'are', 'with', 'as', 'I',
            'his', 'they', 'be', 'at', 'one', 'have', 'this', 'from', 'or',
            'had', 'by', 'hot', 'but', 'some', 'we', 'can', 'out', 'other',
            'were', 'all', 'there', 'when', 'up', 'use', 'your', 'said',
            'an', 'each', 'she', 'which', 'do', 'their', 'will', 'way',
            'about', 'many', 'then', 'them', 'would', 'like', 'so', 'these',
            'her', 'thing', 'see', 'him', 'has', 'look', 'more', 'day',
            'could', 'go', 'come', 'did', 'no', 'most', 'people', 'my',
            'over', 'know', 'water', 'who', 'may', 'down', 'object', 'side',
            'been', 'now', 'any', 'work', 'part', 'take', 'place', 'made',
            'live', 'back', 'little', 'only', 'round', 'man']

    def infer(self, msg):
        """Return the answer for the question that best matches *msg*.

        *msg* is a list of keyword strings. The result is whatever
        findAnswer returns for the id chosen by find_best_answer.
        """
        return self.findAnswer(self.find_best_answer(msg))

    def _without_stop_words(self, words):
        """Return *words* as a set minus stop words, or the full set if nothing else is left."""
        unique = set(words)
        return unique.difference(self.stop_words) or unique

    def find_best_answer(self, msg):
        """Return the qid of the best-ranked question for *msg*, or None.

        Questions whose title contains any keyword of *msg* (stop words
        removed when possible) are fetched ordered by rating, ranked with
        rankQuestion, and the highest rank wins provided it reaches
        MIN_SCORE. None is returned when there are no keywords, no matching
        rows, or no rank that is high enough.

        Side effect: max_views and max_rating are reset from the rows
        fetched here, because the view/rating scores are normalised
        against them.
        """
        keywords = list(self._without_stop_words(msg))
        if not keywords:
            # An empty WHERE clause would be invalid SQL.
            return None

        # One placeholder per keyword; the values travel separately so that
        # user text is never spliced into the statement itself.
        clause = ' or '.join(['q.title ilike %s'] * len(keywords))
        sql = ('select q.qid, q.creator, q.editor, q.title, q.text, '
               'q.rating, q.num_views, q.favorited, q.created, q.edited '
               'from question q where ' + clause + ' order by q.rating desc')
        params = ['%%%s%%' % (keyword,) for keyword in keywords]
        rows = self.dbConnection.query((sql, params))
        if not rows:
            return None

        self.max_views = float(max(row[6] for row in rows))
        # Rows arrive ordered by rating, so the first one holds the maximum.
        self.max_rating = float(rows[0][5])

        # A list keeps the database order, so ties resolve to the
        # higher-rated question (max returns the first maximum).
        ranked = [(self.rankQuestion(row, msg), row[0]) for row in rows]
        best_rank, best_qid = max(ranked, key=lambda pair: pair[0])
        return best_qid if best_rank >= self.MIN_SCORE else None

    def get_body_score(self, body, keywords):
        """Return body_weight times the share of body tokens found in *keywords*.

        *body* is tokenised with self.filt.filter. Stop words are ignored on
        both sides unless that would leave a side empty. An empty body
        scores 0.0.
        """
        body_set = self._without_stop_words(self.filt.filter(body))
        if not body_set:
            return 0.0
        keyword_set = self._without_stop_words(keywords)
        shared = len(body_set.intersection(keyword_set))
        return (shared / float(len(body_set))) * self.body_weight

    def get_tag_score(self, tags, keywords):
        """Return tag_weight times the share of *tags* found in *keywords*.

        A question without tags scores 0.0.
        """
        tag_set = set(tags)
        if not tag_set:
            return 0.0
        keyword_set = self._without_stop_words(keywords)
        shared = len(tag_set.intersection(keyword_set))
        return (shared / float(len(tag_set))) * self.tag_weight

    def rankQuestion(self, question, msg):
        """Return the combined rank of one question row for keywords *msg*.

        *question* is a 10-field row in the column order selected by
        find_best_answer. The rank is the sum of the title, view, rating,
        body and tag scores.
        """
        (qid, _creator, _editor, title, body, rating, num_views,
         _favorited, _created, _edited) = question
        tag_rows = self.dbConnection.query(
            ('select distinct tag from tag_question where qid = %s', [qid]))
        tags = [row[0] for row in (tag_rows or [])]
        title = self.filt.filter(title)

        tag_score = self.get_tag_score(tags, msg)
        body_score = self.get_body_score(body, msg)
        title_score = self.getTitleScore(title, msg)
        view_score = self.getViewScore(num_views)
        rating_score = self.getRatingScore(rating)

        return title_score + view_score + rating_score + body_score + tag_score

    def getRatingScore(self, rating):
        """Return rating_weight times *rating* relative to max_rating (0.0 if max_rating is 0)."""
        if not self.max_rating:
            return 0.0
        return self.rating_weight * (rating / self.max_rating)

    def getViewScore(self, num_views):
        """Return view_weight times *num_views* relative to max_views (0.0 if max_views is 0)."""
        if not self.max_views:
            return 0.0
        return self.view_weight * (num_views / self.max_views)

    def getTitleScore(self, title, msg):
        """Return title_weight times the similarity of *title* and *msg*."""
        return self.title_weight * self.calcDist(title, msg)

    def calcDist(self, title, message):
        """Return the fraction of distinct words that *title* and *message* share.

        Every distinct word of either input is a feature; the result is the
        number of features present in both inputs divided by the number of
        features. Two empty inputs give 0.0.
        """
        features = set(title).union(message)
        if not features:
            return 0.0
        # Each feature occurs in at least one input, so equal membership on
        # both sides means the word is present in both.
        agreeing = sum(1 for word in features
                       if (word in title) == (word in message))
        return agreeing / float(len(features))

    def findAnswer(self, qid):
        """Return the highest-rated answer row for *qid*, or a fallback string.

        The fallback is returned when *qid* is None, when the query raises,
        or when it yields no rows.
        """
        # NOTE(review): on success this returns the first result row exactly
        # as the query delivers it, while the fallback is a plain string --
        # confirm which shape callers expect before unifying.
        if qid is None:
            return self.NO_ANSWER
        try:
            answers = self.dbConnection.query(
                ("SELECT answer.text "
                 "FROM answer "
                 "WHERE answer.qid = %s "
                 "ORDER BY answer.rating DESC", [str(qid)]))
        except Exception:
            # Kept from the original error path; presumably this clears the
            # failed transaction so the connection stays usable.
            self.dbConnection.connection.commit()
            return self.NO_ANSWER
        print(qid)
        if not answers:
            return self.NO_ANSWER
        return answers[0]