class Dataset:
    """Load the chat corpus and expose its questions, answers, page-rank
    scores and the fitted tokenizer."""

    def __init__(self):
        # Shared pre-processing helper used by get_page_compute().
        self.pp = PreProcessing()

    def import_dataset(self):
        """Read the tab-separated corpus and normalize its column names.

        Returns a DataFrame with columns: msg_line, user_id, movie_id,
        msg, msg_pre_processed, msg_2, target.
        """
        frame = pd.read_csv(DATA_FILE,
                            delimiter="\t",
                            quoting=3,
                            encoding="ISO-8859-2")
        frame.columns = [
            'msg_line', 'user_id', 'movie_id', 'msg', 'msg_pre_processed',
            'msg_2', 'target'
        ]
        return frame

    def get_questions(self, messages):
        """Unique pre-processed messages labelled as questions (target == 1)."""
        questions = messages[messages["target"] == 1]
        return set(questions["msg_pre_processed"].astype(str))

    def get_answers(self, messages):
        """Unique pre-processed messages labelled as answers (target == 0)."""
        answers = messages[messages["target"] == 0]
        return set(answers["msg_pre_processed"].astype(str))

    def get_page_compute(self, qea=0):
        """Load page-rank scores and return them as a normalized dict.

        qea == 0 selects the answers file, anything else the questions file.
        """
        source = PAGE_RANK_ANSWERS if qea == 0 else PAGE_RANK_QUESTIONS
        ranks = self.pp.pre_processing_page_rank_file(source)
        return self.pp.normalize_dictionary(ranks)

    def load_tokenizer(self):
        """Unpickle and return the previously fitted tokenizer."""
        with open(TOKENIZER_FILE, "rb") as handle:
            return pickle.load(handle)
class Similarity:
    """Rank candidate replies for an incoming message using page-rank
    scores and bag-of-words cosine similarity."""

    def __init__(self, questions: set, answers: set, word_vectors=None):
        self.bow = CountVectorizer()
        self.questions = questions
        self.answers = answers
        self.word_vectors = word_vectors
        self.pp = PreProcessing()

    def get_the_next_conversation(self, conversations, df):
        """Return the reply (`msg_2`) paired with the first key of
        *conversations*; fall back to a canned message when it is empty.

        `conversations` is expected to be ordered best-first (see the
        return_conversation_by_* methods), so its first key is the best
        matching pre-processed message.
        """
        try:
            conversation = next(iter(conversations))
        except StopIteration as e:
            # Empty candidate dict: log it and answer with a generic reply
            # instead of crashing the chat loop.
            save_content_to_log(e)
            return naive_massage()
        return list(df[df['msg_pre_processed'] == conversation]['msg_2'])[0]

    def return_conversation_by_page_rank(self,
                                         msg,
                                         conversations,
                                         page_compute,
                                         reverse=True):
        """Combine normalized similarity scores with page-rank scores and
        return them sorted by highest combined score first (by default).

        NOTE(review): `page_compute[k]` raises KeyError for any
        conversation absent from the page-rank table — presumably callers
        guarantee full coverage; verify. `msg` is unused but kept for
        interface compatibility.
        """
        conversations = self.pp.normalize_dictionary(conversations)
        combined = {
            k: page_compute[k] + v
            for k, v in conversations.items()
        }
        return dict(
            sorted(combined.items(),
                   key=lambda item: item[1],
                   reverse=reverse))

    def return_conversation_by_cossine(self, msg, res):
        """Score *msg* against every question (res >= 0.5) or answer
        (res < 0.5) using bag-of-words cosine similarity.

        Returns a dict mapping each candidate message to its similarity;
        undefined similarities (NaN, e.g. zero vectors) become 0.0.
        """
        msg_list = self.questions if res >= 0.5 else self.answers
        similarity = []
        for candidate in msg_list:
            candidate = str(candidate)
            # Re-fit on just this pair so both vectors share one vocabulary.
            vector_bow = self.bow.fit_transform([msg, candidate])
            dense = vector_bow.todense()  # hoisted: was materialized twice
            msg_bow = dense[0]
            candidate_bow = dense[1]
            # A single shared token collapses the vectors to shape (1, 1);
            # cosine distance is degenerate there, so use euclidean instead.
            if msg_bow.shape == (1, 1) and candidate_bow.shape == (1, 1):
                d = 1 - distance.euclidean(msg_bow, candidate_bow)
            else:
                d = 1 - distance.cosine(msg_bow, candidate_bow)
            similarity.append(0.0 if math.isnan(float(d)) else d)
        return dict(zip(msg_list, similarity))