from collections import OrderedDict
from datetime import datetime

import numpy as np
from pytz import UTC  # assumed tz source; the project may define UTC elsewhere
from sklearn.feature_extraction.text import TfidfVectorizer

# Project-internal dependencies (PreProcessor, TensorServer, ShuttleBus,
# Singleton, config, Query, Question, the questions/contexts stores and the
# manhattan/euclidean/cosine helpers) are assumed importable from the
# surrounding package.


class QueryMaker(object):

    def __init__(self):
        self.preprocessor = PreProcessor()
        self.modelWrapper = TensorServer()
        self._question_maker = QuestionMaker()
        self._service_shuttle = ShuttleBus()
        self._service_search = Search()
        self.CONFIG = config.QUERY

    def by_category(self, chat, category, matched_question=None):
        if category == 'shuttle_bus':
            return self._service_shuttle.response()
        elif category == 'talk' or category == 'prepared':
            return {'mode': category, 'answer': matched_question.answer}
        elif category == 'food':
            return {'mode': 'food', 'answer': '학식 보여주기'}
        elif category == 'book':
            return {'mode': 'book', 'answer': '도서관 모드 진입'}
        elif category == 'search':
            answer, output = self._service_search.response(chat)
            if not answer:  # the search service returned no answer
                return {'mode': 'unknown',
                        'answer': '무슨 말인지 모르겠다냥~ 다시 해달라냥'}
            return {'mode': 'search', 'answer': answer, 'output': output}

    def make_query(self, chat, added_time=None, analysis=False):
        chat, removed = self.preprocessor.clean(chat)

        if chat is None or chat == '':  # 'is' on literals was a bug; compare by value
            return None

        if not added_time:
            added_time = datetime.utcnow().astimezone(UTC)
        added_time = added_time.astimezone(UTC)  # the original discarded this result

        def get_top(distances, measure='jaccard'):
            if not distances:
                return None
            assert type(distances) is OrderedDict

            output = {}
            for n, (item, distance) in enumerate(distances.items()):
                if measure == 'jaccard' and distance >= self.CONFIG['jaccard_threshold']:
                    output[n] = (questions.find_by_text(item), distance)
                elif measure == 'cosine' and distance >= self.CONFIG['cosine_threshold']:
                    output[n] = (questions.find_by_text(item), distance)
            if len(output) == 0:
                return None
            return output

        feature_vector = self.modelWrapper.similarity(chat)
        jaccard_similarity = None
        top_feature_distance = None
        category = None
        keywords = self.preprocessor.get_keywords(chat)
        morphs = self.preprocessor.get_morphs(chat)

        # First look for Jaccard-similarity matches above the threshold.
        jaccard_top_distances = get_top(self.get_jaccard(chat), measure='jaccard')

        if jaccard_top_distances and not analysis:
            measurement = '자카드 유사도'
            matched_question, jaccard_similarity = jaccard_top_distances[0]
            category = matched_question.category
        else:
            # No Jaccard match above threshold: fall back to the configured
            # feature-vector distance (Euclidean, Manhattan or cosine).
            feature_top_distances = get_top(
                self.get_similarity(chat, keywords, analysis), measure='cosine')
            if analysis:
                return feature_top_distances
            measurement = self.CONFIG['distance']

            if feature_top_distances is None:
                category = 'search'
                matched_question = None
                top_feature_distance = None
            else:
                matched_question = feature_top_distances[0][0]
                top_feature_distance = feature_top_distances[0][1]
                category = matched_question.category

        answer = self.by_category(chat, category, matched_question)

        query = Query(chat=chat,
                      feature_vector=feature_vector,
                      keywords=keywords,
                      matched_question=matched_question,
                      manhattan_similarity=top_feature_distance,
                      jaccard_similarity=jaccard_similarity,
                      added_time=added_time,
                      answer=answer,
                      morphs=morphs,
                      measurement=measurement,
                      category=category)
        return query

    def get_jaccard(self, chat):
        assert chat is not None
        question_list = questions.find_all()
        assert question_list is not None
        distance_dict = {}

        def _calc_jaccard(A, B):
            # Jaccard similarity over morph items; the 'text' key holds the
            # raw sentence and is excluded, hence the "- 2" below.
            visited = []
            num_union = len(A) + len(B) - 2
            num_joint = 0
            for key_a, tag_a in A.items():
                for key_b, tag_b in B.items():
                    if key_a == 'text' or key_b == 'text':
                        continue
                    if key_a == key_b and tag_a == tag_b and key_a not in visited:
                        num_joint += 1
                        visited.append(key_a)
            return num_joint / (num_union - num_joint)

        chat_morphs = self.preprocessor.get_morphs(chat)
        for each in question_list:
            question_morphs = self.preprocessor.get_morphs(each.text)
            distance_dict[each.text] = _calc_jaccard(chat_morphs, question_morphs)

        return OrderedDict(
            sorted(distance_dict.items(), key=lambda t: t[1], reverse=True))

    def get_similarity(self, chat, keywords, analysis=False):
        assert chat is not None

        feature_vector = self.modelWrapper.similarity(chat)
        question_list = questions.find_by_keywords(keywords=keywords)
        if not question_list:
            # No stored question shares a keyword; falling through to search
            # mode is cheaper than comparing against every question.
            return None

        distances = {}
        a_vector = self.get_weighted_average_vector(chat, feature_vector)
        if type(a_vector) != np.ndarray:
            return None

        for question in question_list:
            b_vector = self.get_weighted_average_vector(
                question.text, question.feature_vector)
            if self.CONFIG['distance'] == 'manhattan':
                distance = manhattan_distance(a_vector, b_vector)
            elif self.CONFIG['distance'] == 'euclidean':
                distance = euclidean_distance(a_vector, b_vector)
            elif self.CONFIG['distance'] == 'cosine':
                distance = cosine_similarity(a_vector, b_vector)
            else:
                raise Exception('CONFIG distance measurement Error!')
            distances[question.text] = distance

        # reverse=True ranks highest scores first, which suits cosine
        # similarity; switch to ascending order when using a true distance
        # such as Euclidean or Manhattan.
        return OrderedDict(
            sorted(distances.items(), key=lambda t: t[1], reverse=True))

    def get_weighted_average_vector(self, text, vector):
        if len(vector.shape) == 1:
            return vector
        assert len(vector.shape) == 2

        text, _ = self.preprocessor.clean(text)
        tokens = self.preprocessor.str_to_tokens(text)
        idf_ = self._question_maker.idf_
        vocabulary_ = self._question_maker.vocabulary_

        output_vector = []
        for i, token in enumerate(tokens):
            idx = vocabulary_[token]
            idf = idf_[idx]
            # Boost each token vector by its IDF weight before summing.
            vector[i] += vector[i] * idf * self.CONFIG['idf_weight']
            output_vector.append(vector[i])

        if output_vector:
            return np.sum(output_vector, axis=0)
        # No usable tokens: return a zero vector (hidden size 768).
        return np.array([0.0] * 768)
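
# --- Sketch: distance helpers assumed by QueryMaker.get_similarity ---
# manhattan_distance, euclidean_distance and cosine_similarity are imported
# from elsewhere in the project; the numpy versions below are a minimal
# sketch of what they are assumed to compute (the _sketch_ prefix avoids
# shadowing the real implementations).

def _sketch_manhattan_distance(a, b):
    # L1 distance between two 1-D vectors.
    return float(np.sum(np.abs(a - b)))


def _sketch_euclidean_distance(a, b):
    # L2 distance between two 1-D vectors.
    return float(np.linalg.norm(a - b))


def _sketch_cosine_similarity(a, b):
    # Cosine of the angle between two 1-D vectors (higher = more similar).
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))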
class Search(metaclass=Singleton):

    def __init__(self):
        self.tfidf_matrix = None
        self.contexts_list = None
        self.CONFIG = config.SEARCH
        self.tfidf_vectorizer = TfidfVectorizer(
            stop_words=None, sublinear_tf=self.CONFIG['sublinear_tf'])
        self.preprocessor = PreProcessor()
        self.set_context()
        self.set_tfidf_matrix()
        self.tensor_server = TensorServer()

    def response(self, chat):
        # Find the most relevant context by TF-IDF score.
        output = self.find_context(chat)
        context = output['context-1']
        score = output['score-1']
        if score == 0:
            return None, None
        answer = self.tensor_server.search(chat, context)
        return answer, output

    def response_with_subject(self, _chat, _subject):
        context = contexts.find_by_subject(_subject=_subject)
        context = context['text']
        answer = self.tensor_server.search(chat=_chat, context=context)
        return answer, context

    def response_with_context(self, _chat, _context):
        return self.tensor_server.search(chat=_chat, context=_context)

    def response_with_id(self, _chat, _id):
        context = contexts.find_by_id(_id=_id)['text']
        return self.tensor_server.search(chat=_chat, context=context), context

    def set_tfidf_matrix(self):
        text_list = [' '.join(self.preprocessor.get_keywords(c['text']))
                     for c in self.contexts_list]
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(
            text_list).todense().tolist()

    def set_context(self):
        self.contexts_list = list(contexts.find_all())

    def find_context(self, chat):
        chat = ' '.join(self.preprocessor.get_keywords(chat))
        chat_tfidf = self.tfidf_vectorizer.transform([chat]).todense().tolist()[0]
        num_context = len(self.tfidf_matrix)

        ordered_list = []
        output = {'context_subject-1': None, 'context_subject-2': None,
                  'context_subject-3': None,
                  'context-1': None, 'context-2': None, 'context-3': None,
                  'score-1': None, 'score-2': None, 'score-3': None}

        # Score each context as the dot product of its TF-IDF row with the
        # chat's TF-IDF row.
        for i in range(num_context):
            context_tfidf = self.tfidf_matrix[i]
            score = sum(c * q for c, q in zip(chat_tfidf, context_tfidf))
            ordered_list.append((i, score))

        ordered_list = sorted(ordered_list, key=lambda x: x[1], reverse=True)

        for i in range(self.CONFIG['max_context_num']):
            idx = ordered_list[i][0]
            output['context_subject-{}'.format(i + 1)] = self.get_context(idx)['subject']
            output['score-{}'.format(i + 1)] = ordered_list[i][1]
            output['context-{}'.format(i + 1)] = self.get_context(idx)['text']
        return output

    def get_context(self, idx):
        return self.contexts_list[idx]
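
# --- Sketch: vectorized form of Search.find_context's scoring loop ---
# find_context scores each context as the dot product of its TF-IDF row with
# the chat's TF-IDF row. If the matrix from fit_transform were kept sparse
# instead of converted via .todense().tolist(), scipy could compute all the
# scores at once; _sketch_tfidf_scores below is illustrative, not part of
# the project.

def _sketch_tfidf_scores(vectorizer, sparse_tfidf_matrix, chat_keywords):
    chat_tfidf = vectorizer.transform([' '.join(chat_keywords)])
    scores = sparse_tfidf_matrix @ chat_tfidf.T  # (num_context, 1) sparse
    return scores.toarray().ravel()              # one score per context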
class QuestionMaker(object):

    def __init__(self):
        self.CONFIG = config.QUESTION
        self.model_wrapper = TensorServer()
        self.preprocessor = PreProcessor()

        vocab = self.preprocessor.vocab[:-1]
        self.tfidf_vectorizer = TfidfVectorizer(
            smooth_idf=True,
            token_pattern=self.CONFIG['tfidf_token_pattern'],
            stop_words=None,
            vocabulary=vocab)
        self.idf_, self.vocabulary_ = self.set_idf()

    def create_question(self, text, answer=None, category=None):
        text, removed = self.preprocessor.clean(text)
        if category not in self.CONFIG['categories']:
            raise Exception(
                'category must be one of {}'.format(self.CONFIG['categories']))

        keywords = self.preprocessor.get_keywords(text=text)
        morphs = self.preprocessor.get_morphs(text=text)
        vector = self.model_wrapper.similarity(text)  # ELMo-like contextual vector

        return Question(text, category, answer, vector, morphs, keywords=keywords)

    def set_idf(self):
        question_list = _questions.find_all()
        raw_documents = []
        for question in question_list:
            text = ' '.join(self.preprocessor.str_to_tokens(question.text))
            raw_documents.append(text)
        self.tfidf_vectorizer.fit_transform(raw_documents=raw_documents)
        idf_ = self.tfidf_vectorizer.idf_
        # idf_ /= max(self.tfidf_vectorizer.idf_)  # normalize by the maximum
        return idf_, self.tfidf_vectorizer.vocabulary_

    def insert_text(self, text, answer=None, category=None):
        question = self.create_question(text, answer, category)
        return _questions.insert(question)

    def rebase(self):
        # Re-create every stored question (fresh vectors, morphs, keywords);
        # on failure, restore the backed-up original.
        questions = _questions.find_all()
        for question in questions:
            backup = None
            orig_text = question.text
            try:
                backup = question
                question = self.create_question(text=question.text,
                                                category=question.category,
                                                answer=question.answer)
                _questions.delete_by_text(orig_text)
                _questions.insert(question)
                print('rebase: {}'.format(question.text))
            except Exception as err:
                print('rebase error:', str(err))
                if backup:
                    _questions.insert(backup)
        return

    def check_idf(self, word):
        return self.idf_[self.vocabulary_[word]]
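
# --- Sketch: typical QuestionMaker usage ---
# A minimal usage example, assuming the question store behind _questions is
# reachable and that 'talk' is one of config.QUESTION['categories']; the
# strings below are illustrative only.

if __name__ == '__main__':
    maker = QuestionMaker()
    # Clean the text, extract keywords/morphs, fetch a feature vector from
    # the TensorServer, and persist the resulting Question.
    maker.insert_text('도서관 몇 시에 열어?',
                      answer='도서관은 오전 9시에 엽니다.',
                      category='talk')
    # Per-token IDF lookup against the fitted vectorizer vocabulary.
    print(maker.check_idf('도서관'))
    # Re-vectorize every stored question after a model or preprocessor change.
    maker.rebase()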