def load_data(dirFolder, testRatio, featureKeepRatio=1.0):
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        splitPoint = int(testRatio * len(documents))
        trainDocs, testDocs = documents[splitPoint:], documents[:splitPoint]
        allDocs.append([trainDocs, testDocs])
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), trainDocs, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to the vocabulary built above
    vocabulary = list(vocabulary)
    X_train, Y_train = [], []
    X_test, Y_test = [], []
    for i, dclass in enumerate(classes):
        for j in range(len(allDocs[i])):  # j == 0: train docs, j == 1: test docs
            for doc in allDocs[i][j]:
                processedFile = preprocess.readFile(
                    os.path.join(dirFolder, dclass, doc))
                words = Counter(processedFile)
                features = [words.get(w, 0) for w in vocabulary]
                if j == 0:
                    X_train.append(features)
                    Y_train.append(i)
                else:
                    X_test.append(features)
                    Y_test.append(i)
    return (np.stack(X_train), Y_train), (np.stack(X_test), Y_test)
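# A minimal usage sketch (not part of the original project): feed the splits
# returned by load_data into a simple classifier. The dataset path and the
# scikit-learn model below are illustrative assumptions only.
from sklearn.naive_bayes import MultinomialNB

(X_train, Y_train), (X_test, Y_test) = load_data("data/20news", testRatio=0.2)
clf = MultinomialNB().fit(X_train, Y_train)
print("test accuracy:", clf.score(X_test, Y_test))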
def init_all_states(self):
    self.retrieval = Retrieval(num_ir=NUM_OF_IR, config=self.config)
    self.tf_idf = TfIdf(self.config)
    # TODO: wait for models
    self.cluster_model = joblib.load(self.cluster_md)
    self.vec_model = Doc2Vec.load(self.vec_md)
    jieba.initialize()
def load_data_word2vec(dirFolder, featureKeepRatio=1.0):
    model = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(
        './GoogleNews-vectors-negative300.bin', binary=True)
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to the vocabulary built above
    vocabulary = list(vocabulary)
    X = []

    def getIt(word):
        # Fall back to a zero vector for words missing from the embedding model.
        try:
            return model[word]
        except KeyError:
            return np.zeros((300,))

    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(os.path.join(dirFolder, dclass, doc))
            words = list(set(processedFile))
            features = [getIt(w) for w in vocabulary]
            X.append(features)
    return np.stack(X)
def init_all_states(self):
    self.retrieval = Retrieval(num_ir=NUM_OF_IR, config=self.config)
    self.tf_idf = TfIdf(self.config)
    self.cluster_model = joblib.load(self.cluster_md)
    # self.vec_model = Doc2Vec.load(self.vec_md)
    # self.vec_model = BertClient()
    self.load_stop_words(self.config)
    jieba.initialize()
def load_data(dirFolder, featureKeepRatio=1.0):
    classes = sorted(os.listdir(dirFolder))
    vocabulary = set()
    cMap = {i: classes[i] for i in range(len(classes))}
    allDocs = []
    for i, dclass in enumerate(classes):
        documents = os.listdir(os.path.join(dirFolder, dclass))
        np.random.shuffle(documents)
        allDocs.append(documents)
        # Process documents for vocabulary selection
        tfidf = TfIdf(os.path.join(dirFolder, dclass), documents, featureKeepRatio)
        selectedWords = tfidf.selectWords()
        vocabulary = vocabulary | selectedWords
    # Featurize data according to the vocabulary built above
    vocabulary = list(vocabulary)
    X = []
    for i, dclass in enumerate(classes):
        for doc in allDocs[i]:
            processedFile = preprocess.readFile(os.path.join(dirFolder, dclass, doc))
            words = Counter(processedFile)
            features = [words.get(w, 0) for w in vocabulary]
            X.append(features)
    return np.stack(X)
import datetime

from inverted_index import InvertedIndex
from tf_idf import TfIdf
import pandas as pd

custom_separator = "###"

if __name__ == '__main__':
    df = pd.read_csv('../data/stories.csv', sep=',', header=None)
    df_head = pd.read_csv('../data/db_books.csv', sep=',')
    array_content = df.values
    # array_segment = array_content[0:3]

    # INVERTED INDEX
    print("Init INVERTED INDEX")
    begin_time = datetime.datetime.now()
    invindex = InvertedIndex(array_content)
    invindex.process()
    print(datetime.datetime.now() - begin_time)
    invindex.saveIndexInDisc()

    # TF IDF
    print("Init TF IDF")
    begin_time = datetime.datetime.now()
    tf_idf = TfIdf(array_content)
    tf_idf.process()
    print(datetime.datetime.now() - begin_time)
    tf_idf.saveIndexInDisc()
parser = PreProcess()
parsed_trainning_documents = {}
print('processing...')
for k, v in reader.train.items():
    parsed_trainning_documents[k] = parser.process(v)

# Input for tf-idf: the documents must be annotated with their classes.
# It receives as input an array of tuples: ([tokens], class)
parsed_trainning_documents_with_classes = []
for k in parsed_trainning_documents.keys():
    parsed_trainning_documents_with_classes += [(v, k) for v in parsed_trainning_documents[k]]

# Run tf-idf
print('generating tf.idf...')
tf_idf_calculator = TfIdf(parsed_trainning_documents_with_classes)
tf_idf_calculator.run()

# Test the KNN parameters: distance metric and value of K
for metric in ['cosine', 'euclid']:
    for k in range(5, 11, 2):
        knn = KNN(tf_idf_calculator.results, k, metric)
        # confusion_matrix[A][B] = how many times a document of class A was assigned to class B
        topics = ['baseball', 'christian', 'guns']
        confusion_matrix = {topic: {t: 0 for t in topics} for topic in topics}
        print_log = False
        i = 0
        ytrue = []
        ypred = []
class Agent:
    good_qualified_corpus = set()

    def __init__(self):
        self.config = config
        self.stop_words = ''
        self.punctuation_str = ''.join(self.config.punctuation_ls)
        self.frequency_domain_dict = frequency_domain.frequency_dict
        self.cluster_md = self.config.cluster_model
        # self.vec_md = self.config.doc_vector_model
        self.init_all_states()
        self.fuzzy_weight = 0.2
        self.tf_idf_weight = 0.8
        self.good_corpus_threshold = 200
        self.good_corpus_score = 0.95

    def init_all_states(self):
        self.retrieval = Retrieval(num_ir=NUM_OF_IR, config=self.config)
        self.tf_idf = TfIdf(self.config)
        self.cluster_model = joblib.load(self.cluster_md)
        # self.vec_model = Doc2Vec.load(self.vec_md)
        # self.vec_model = BertClient()
        self.load_stop_words(self.config)
        jieba.initialize()

    def get_utterance_type(self, utterance):
        # tmp_vector = self.vec_model.infer_vector(utterance)
        # tmp_vector = self.vec_model.encode([utterance])
        # label = self.cluster_model.predict(tmp_vector)
        # print(label)
        # return self.config.cluster_file[label[0]]
        return self.config.cluster_file[0]

    def record_good_conversations(self, utterance, score_ls, context_ls):
        def write_conversations():
            localtime = (time.asctime(time.localtime(time.time()))).replace(
                ' ', '_').replace(':', '-')
            with open(self.config.path_of_good_conversation + localtime, 'wb') as wfp:
                pickle.dump(Agent.good_qualified_corpus, wfp)
            Agent.good_qualified_corpus.clear()
            # print(Agent.good_qualified_corpus)

        for index in range(len(score_ls)):
            if score_ls[index] > self.good_corpus_score:
                if context_ls[index][0] and context_ls[index][1]:
                    # print((utterance, context_ls[index][1]))
                    Agent.good_qualified_corpus.add(
                        (utterance, context_ls[index][1]))
        # print(len(Agent.good_qualified_corpus))
        if len(Agent.good_qualified_corpus) > self.good_corpus_threshold:
            record_thread = threading.Thread(target=write_conversations)
            record_thread.start()

    def random_chose_index(self, score_ls, max_score):
        max_score_indexes = []
        for i in range(len(score_ls)):
            if score_ls[i] == max_score:
                max_score_indexes.append(i)
        return choice(max_score_indexes)

    def load_stop_words(self, config):
        with open(config.stop_words, 'rb') as fpr:
            self.stop_words = pickle.load(fpr)

    def remove_special_words(self, stop_words_ls, input_sentence):
        sentence = input_sentence
        for special_word in self.config.special_modal_words:
            if special_word in sentence:
                sentence = sentence.replace(special_word, '')
        return sentence

    def response_answer(self, reply_msg, max_score):
        if type(max_score) is np.ndarray:
            final_max_score = max_score[0][0]
        else:
            final_max_score = max_score
        return reply_msg, final_max_score

    def get_answer(self, utterance, file_name=None):
        try:
            utterance = utterance.rstrip(self.punctuation_str)
            file_name = self.get_utterance_type(utterance)
            self.retrieval.read_indexes(file_name)
            context_ls = self.retrieval.search_sentences(
                utterance, self.stop_words)
            if not context_ls:
                return "", 0
            utterance_no_stop = self.remove_special_words(
                self.stop_words, utterance)
            new_context_ls = []
            for each_context in context_ls:
                ques = self.remove_special_words(self.stop_words, each_context[0])
                ans = self.remove_special_words(self.stop_words, each_context[1])
                if not ques or not ans:
                    new_context_ls.append((0, 0))
                    continue
                new_context_ls.append((ques, ans))
            # print("control!!!!!!!!!!!!!!!!!: {},{}".format(utterance, new_context_ls))
            # print(len(new_context_ls))
            fuzzy_ratio_ls = fuzzy_matching(utterance_no_stop, new_context_ls)
            self.tf_idf.select_model(file_name)
            self.tf_idf.predict_tfidf(utterance_no_stop, new_context_ls)
            tf_idf_score_ls = self.tf_idf.calculate_distances()
            if fuzzy_ratio_ls.count(max(fuzzy_ratio_ls)) > 1:
                fuzzy_best_index = self.random_chose_index(
                    fuzzy_ratio_ls, max(fuzzy_ratio_ls))
            else:
                fuzzy_best_index = fuzzy_ratio_ls.index(max(fuzzy_ratio_ls))
            if tf_idf_score_ls.count(max(tf_idf_score_ls)) > 1:
                tftdf_best_index = self.random_chose_index(
                    tf_idf_score_ls, max(tf_idf_score_ls))
            else:
                tftdf_best_index = tf_idf_score_ls.index(max(tf_idf_score_ls))
            fuzzy_best_content = context_ls[fuzzy_best_index][0].rstrip(
                self.punctuation_str)
            tfidf_best_content = context_ls[tftdf_best_index][0].rstrip(
                self.punctuation_str)
            if fuzzy_best_content == utterance or utterance.strip(''.join(
                    config.special_modal_words)) in fuzzy_best_content:
                best_index = fuzzy_best_index
                # return context_ls[best_index][1], max(fuzzy_ratio_ls)
                return self.response_answer(context_ls[best_index][1],
                                            max(fuzzy_ratio_ls))
            if tfidf_best_content == utterance or utterance.strip(''.join(
                    config.special_modal_words)) in tfidf_best_content:
                best_index = tftdf_best_index
                # return context_ls[best_index][1], max(tf_idf_score_ls)
                return self.response_answer(context_ls[best_index][1],
                                            max(tf_idf_score_ls))
            final_score_ls = [(fuzzy_ratio * self.fuzzy_weight +
                               tf_tdf_score * self.tf_idf_weight)
                              for fuzzy_ratio, tf_tdf_score in zip(
                                  fuzzy_ratio_ls, tf_idf_score_ls)]
            # TODO: find a suitable weight
            self.record_good_conversations(utterance, final_score_ls, context_ls)
            max_score = max(final_score_ls)
            if final_score_ls.count(max_score) > 1:
                best_index = self.random_chose_index(final_score_ls, max_score)
            else:
                best_index = final_score_ls.index(max_score)
            # print("final result:{}".format(context_ls[best_index]))
            # print(type(max_score))
            return self.response_answer(context_ls[best_index][1], max_score)
        except Exception as e:
            return "", 0

    def test(self, utterance):
        answer = self.get_answer(utterance)
        return answer

    def start_cmd(self):
        while True:
            utterance = input(">>>")
            if utterance.strip() == "exit1":
                break
            answer, score = self.get_answer(utterance)
            print("<<<{}:{}".format(answer, score))

    def api(self, utterance):
        answer, score = self.get_answer(utterance)
        return [answer, score]

    def socket_get(self, utterance):
        answer, score = self.get_answer(utterance)
        # print(answer + '---' + str(score[0][0]))
        return answer + '---' + str(score)
import pandas as pd
from preprocessing import InputFrame
from tf_idf import TfIdf
from NMF import LatentFeatures
import cPickle as pk

filename = '../../StateNames.csv'
dataframe = pd.read_csv(filename)

# Run the male preprocessing and model
male_instance = InputFrame(dataframe, 'M')
male_instance.clean_data_from_frame()
male_frame = male_instance.gender_frame
male_tfidf = TfIdf(male_frame)
male_vectors = male_tfidf.tfidf_matrix()
male_nmf = LatentFeatures(male_vectors)
male_matrices = male_nmf.fit_model()
# male_nmf.print_W_H_features(male_tfidf.index, male_tfidf.features, 20)
male_features_dict = male_nmf.latent_features_dict(male_tfidf.index, male_tfidf.features, 20)

# Store the male model and latent features dictionary (pickle files are written in binary mode)
with open('../data/male_nmf.pkl', 'wb') as f:
    pk.dump(male_nmf, f)
with open('../data/male_latent_features.pkl', 'wb') as f:
    pk.dump(male_features_dict, f)
def calc_doc_len(doc: Document, tf_idf: TfIdf) -> float:
    # Euclidean (L2) norm of the document's tf-idf vector.
    result = 0
    for word in doc.words:
        result += math.pow(tf_idf.get(word, {}).get(doc, 0), 2)
    return math.sqrt(result)
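# A hypothetical companion helper (not from the source) showing how the document
# length is typically used: cosine similarity between two documents, assuming
# tf_idf behaves like the nested mapping {word: {doc: weight}} implied by the
# lookups in calc_doc_len above.
def cosine_similarity(doc_a, doc_b, tf_idf) -> float:
    shared_words = set(doc_a.words) & set(doc_b.words)
    dot = sum(tf_idf.get(word, {}).get(doc_a, 0) * tf_idf.get(word, {}).get(doc_b, 0)
              for word in shared_words)
    denom = calc_doc_len(doc_a, tf_idf) * calc_doc_len(doc_b, tf_idf)
    return dot / denom if denom else 0.0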
class Agent:
    def __init__(self):
        self.config = config
        self.punctuation_str = ''.join(self.config.punctuation_ls)
        self.frequency_domain_dict = frequency_domain.frequency_dict
        self.cluster_md = self.config.cluster_model
        self.vec_md = self.config.doc_vector_model
        self.init_all_states()
        self.fuzzy_weight = 0.7
        self.tf_idf_weight = 0.3
        # self.record_chat_ls = []

    def init_all_states(self):
        self.retrieval = Retrieval(num_ir=NUM_OF_IR, config=self.config)
        self.tf_idf = TfIdf(self.config)
        # TODO: wait for models
        self.cluster_model = joblib.load(self.cluster_md)
        self.vec_model = Doc2Vec.load(self.vec_md)
        jieba.initialize()

    def select_domain(self, utterance):
        utterance_words = [each_word for each_word in jieba.cut(utterance, cut_all=False)]
        # print(utterance_words)
        for each_word in utterance_words:
            if each_word in self.frequency_domain_dict.keys() and len(each_word) > 1:
                # print(each_word)
                return "domains"
        return "xiaohuangji"

    def get_utterance_type(self, utterance):
        # TODO: wait for models
        tmp_vector = self.vec_model.infer_vector(utterance)
        label = self.cluster_model.predict(tmp_vector.reshape(1, -1))
        # print(label)
        return self.config.cluster_file[label[0]]

    def record_good_chat(self):
        pass
        # TODO: build a new thread to record conversations whose score is above 0.95 within an interval;
        # TODO: this way we can collect a lot of good conversations

    def random_chose_index(self, score_ls, max_score):
        max_score_indexes = []
        for i in range(len(score_ls)):
            if score_ls[i] == max_score:
                max_score_indexes.append(i)
        return choice(max_score_indexes)

    def get_answer(self, utterance, file_name=None):
        try:
            # file_name = self.get_utterance_type(utterance)
            utterance = utterance.rstrip(self.punctuation_str)
            if not file_name:
                file_name = self.select_domain(utterance)
                # print(file_name)
                # file_name = self.get_utterance_type(utterance)
            self.retrieval.read_indexes(file_name)
            context_ls = self.retrieval.search_sentences(utterance)
            # print(context_ls)
            if not context_ls and file_name != "domains":
                return "对不起亲,没听懂你说啥,你再重新组织一下语言吧。"  # "Sorry dear, I didn't catch that; please rephrase."
            if not context_ls and file_name == "domains":
                answer = self.get_answer(utterance, "weibo")
                return answer
            if file_name == "domains":
                fuzzy_ratio_ls = fuzzy_for_domains(utterance, context_ls)
            else:
                fuzzy_ratio_ls = fuzzy_matching(utterance, context_ls)
            # print(fuzzy_ratio_ls)
            self.tf_idf.select_model(file_name)
            self.tf_idf.predict_tfidf(utterance, context_ls)
            tf_idf_score_ls = self.tf_idf.calculate_distances()
            if fuzzy_ratio_ls.count(max(fuzzy_ratio_ls)) > 1:
                fuzzy_best_index = self.random_chose_index(fuzzy_ratio_ls, max(fuzzy_ratio_ls))
            else:
                fuzzy_best_index = fuzzy_ratio_ls.index(max(fuzzy_ratio_ls))
            if tf_idf_score_ls.count(max(tf_idf_score_ls)) > 1:
                tftdf_best_index = self.random_chose_index(tf_idf_score_ls, max(tf_idf_score_ls))
            else:
                tftdf_best_index = tf_idf_score_ls.index(max(tf_idf_score_ls))
            fuzzy_best_content = context_ls[fuzzy_best_index][0].rstrip(self.punctuation_str)
            tfidf_best_content = context_ls[tftdf_best_index][0].rstrip(self.punctuation_str)
            if fuzzy_best_content == utterance or utterance in fuzzy_best_content:
                best_index = fuzzy_best_index
                # print(context_ls[best_index][0])
                return context_ls[best_index][1]
            if tfidf_best_content == utterance or utterance in tfidf_best_content:
                best_index = tftdf_best_index
                # print(context_ls[best_index][0])
                return context_ls[best_index][1]
            final_score_ls = [(fuzzy_ratio * self.fuzzy_weight + tf_tdf_score * self.tf_idf_weight)
                              for fuzzy_ratio, tf_tdf_score in zip(fuzzy_ratio_ls, tf_idf_score_ls)]
            # TODO: find a suitable weight
            # print(final_score_ls)
            if max(final_score_ls) < 0.85 and file_name != "weibo" and file_name != "domains":
                # TODO: ugly code
                # print(max(final_score_ls))
                answer = self.get_answer(utterance, "weibo")
                return answer
            else:
                # print(max(final_score_ls))
                max_score = max(final_score_ls)
                if final_score_ls.count(max_score) > 1:
                    best_index = self.random_chose_index(final_score_ls, max_score)
                else:
                    best_index = final_score_ls.index(max_score)
                # print(context_ls[best_index][0])
                return context_ls[best_index][1]
        except Exception as e:
            return "对不起亲,这个问题实在不晓得呀!"  # "Sorry dear, I really don't know this one!"

    def get_answer2(self, utterance, file_name=None):
        try:
            utterance = utterance.rstrip(self.punctuation_str)
            file_name = self.get_utterance_type(utterance)
            # print(file_name)
            self.retrieval.read_indexes(file_name)
            context_ls = self.retrieval.search_sentences(utterance)
            # print(context_ls)
            if not context_ls:
                return "", 0
            fuzzy_ratio_ls = fuzzy_matching(utterance, context_ls)
            self.tf_idf.select_model(file_name)
            self.tf_idf.predict_tfidf(utterance, context_ls)
            tf_idf_score_ls = self.tf_idf.calculate_distances()
            if fuzzy_ratio_ls.count(max(fuzzy_ratio_ls)) > 1:
                fuzzy_best_index = self.random_chose_index(fuzzy_ratio_ls, max(fuzzy_ratio_ls))
            else:
                fuzzy_best_index = fuzzy_ratio_ls.index(max(fuzzy_ratio_ls))
            if tf_idf_score_ls.count(max(tf_idf_score_ls)) > 1:
                tftdf_best_index = self.random_chose_index(tf_idf_score_ls, max(tf_idf_score_ls))
            else:
                tftdf_best_index = tf_idf_score_ls.index(max(tf_idf_score_ls))
            fuzzy_best_content = context_ls[fuzzy_best_index][0].rstrip(self.punctuation_str)
            tfidf_best_content = context_ls[tftdf_best_index][0].rstrip(self.punctuation_str)
            if fuzzy_best_content == utterance or utterance in fuzzy_best_content:
                best_index = fuzzy_best_index
                # print(context_ls[best_index][0])
                return context_ls[best_index][1], max(fuzzy_ratio_ls)
            if tfidf_best_content == utterance or utterance in tfidf_best_content:
                best_index = tftdf_best_index
                # print(context_ls[best_index][0])
                return context_ls[best_index][1], max(tf_idf_score_ls)
            final_score_ls = [(fuzzy_ratio * self.fuzzy_weight + tf_tdf_score * self.tf_idf_weight)
                              for fuzzy_ratio, tf_tdf_score in zip(fuzzy_ratio_ls, tf_idf_score_ls)]
            # TODO: find a suitable weight
            max_score = max(final_score_ls)
            if final_score_ls.count(max_score) > 1:
                best_index = self.random_chose_index(final_score_ls, max_score)
            else:
                best_index = final_score_ls.index(max_score)
            # print(context_ls[best_index][0])
            return context_ls[best_index][1], max_score
        except Exception as e:
            return "", 0

    def test(self, utterance):
        answer = self.get_answer2(utterance)
        return answer

    def start(self):
        while True:
            utterance = input(">>>")
            if utterance.strip() == "exit1":
                break
            answer = self.get_answer2(utterance)
            print("<<<{}".format(answer))

    def api(self, utterance):
        answer, score = self.get_answer2(utterance)
        return [answer, score]