def init():
    # Bring up all the models and related resources here.
    models_rout = os.path.dirname(__file__)

    # Load the models for each pubid and build a dictionary keyed by pubid,
    # whose values are the classifying models and the links between tags and
    # answer ids (and modules).
    models_dict = {"simple_rules_model": None}

    # Load the models and turn each of them into a loader object right away.
    for model_name in models_dict:
        with open(
                os.path.join(models_rout, 'models/tax_tags',
                             str(model_name) + ".pickle"), "br") as f:
            model = pickle.load(f)
        models_dict[model_name] = model

    # Load the lemmatizer used for the patterns:
    tknz = TokenizerApply(Loader(models_dict["simple_rules_model"]))

    global pattern1, pattern2
    # Lemmatize the patterns used to select text fragments.
    pattern1 = tknz.texts_processing(["в ходе проведения"])[0]
    pattern2 = tknz.texts_processing(["В течение 5 <4> рабочих дней"])[0]

    """Define the models that are later used for the different pubids."""
    model_1 = ModelsChain([(SimpleRules, models_dict["simple_rules_model"])])

    global pub_models
    pub_models = {
        1: {
            "model": model_1,
            "tag_answ_link": None,
            "tokenizer": tknz
        }
    }
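# A minimal, hedged sketch of how pub_models could be consulted after init();
# the pubid key (1) and the sample text are purely illustrative:
if __name__ == "__main__":
    init()
    entry = pub_models[1]
    sample = ["в ходе проведения проверки"]
    # rules_apply() is the entry point used for ModelsChain objects elsewhere in this repo
    print(entry["model"].rules_apply(sample))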
def lsi_model_maker(**kwargs):
    key_words = ['%%']
    questions = []
    for word in key_words:
        res = questions_from_clickhouse(
            clickhose_host="srv02.ml.dev.msk3.sl.amedia.tech",
            user='******',
            password='******',
            date_in='2020-01-01',
            date_out='2020-05-31',
            limit=100000,
            pubids_tuple=kwargs["pubids_tuple"],
            key_word=word)
        qs, dts = zip(*res)
        questions = questions + list(qs)

    shuffle(questions)
    etalons_df = pd.read_csv(kwargs["lingv_rules_csv_path"])
    data_for_models = list(etalons_df["words"]) + questions[:100000]
    print(data_for_models[:10])
    print(len(data_for_models))

    # Model for the tokenizer:
    model_parameters = {
        "model_type": "simple_rules",
        "stopwords_csv_path": os.path.join(data_path, "04_stopwords.csv"),
        "ngrams_csv_path": os.path.join(data_path, "kss_ngrams.csv"),
        "synonyms_files_csv_path": [
            os.path.join(data_path, "01_synonyms.csv"),
            os.path.join(data_path, "02_synonyms.csv"),
            os.path.join(data_path, "03_synonyms.csv")
        ],
        "tokenizer": "SimpleTokenizer",
        "is_lingvo_lemmatize": True,
        "is_etalons_lemmatize": True
    }

    model_for_tokenizer = model_make(**model_parameters)
    tokenizer = SimpleTokenizer(Loader(model_for_tokenizer))
    tz_txs = tokenizer.texts_processing(data_for_models)

    # Build an LSI model from the collection of 100 thousand questions:
    lsi_model_dict = lsi_model_create(tz_txs, topics=1500)
    with open(kwargs["lsi_model_path"], "bw") as f:
        pickle.dump(lsi_model_dict, f)
    return 0
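# A hedged usage sketch for lsi_model_maker(); the pubid tuple and file paths
# below are placeholders, not values taken from the project:
if __name__ == "__main__":
    lsi_model_maker(
        pubids_tuple=(6, 9),
        lingv_rules_csv_path=r'./data/lingv_rules.csv',
        lsi_model_path=r'./models/bss_lsi_model.pickle')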
data_rout = r'./data'
models_rout = r'./models'

"""
with open(os.path.join(models_rout, "fast_answrs", "include_and_model.pickle"), "br") as f:
    model = pickle.load(f)
smpltk = SimpleTokenizer(Loader(model))
"""

txts = [
    "упрощенная бухгалтерская отчетность кто сдает Фи ТАм котОРый али бы",
    "кто должен сдавать аудиторское заключение",
    "кто должен подписывать справки",
    "парит летит воздушный судно"
]

with open(os.path.join(models_rout, "fast_answrs", "bss_lsi_model.pickle"),
          "br") as f:
    model = pickle.load(f)

'''
lsi_tkz = LsiTokenizer(Loader(model))
t1 = time.time()
tk_m = lsi_tkz.model_tokenize()
print(time.time() - t1)
tk_txt = lsi_tkz.texts_processing(txts)
print(tk_txt)
print(len(tk_txt))
'''

tk_appl = TokenizerApply(Loader(model))
print(tk_appl.texts_processing(txts))
data_path = r'./data'
models_path = r'./models'

model_parameters = {
    "model_type": "simple_rules",
    "stopwords_csv_path": os.path.join(data_path, "04_stopwords.csv"),
    "synonyms_files_csv_path": [
        os.path.join(data_path, "01_synonyms.csv"),
        os.path.join(data_path, "02_synonyms.csv"),
        os.path.join(data_path, "03_synonyms.csv")
    ],
    "tokenizer": "SimpleTokenizer",
    "is_lingvo_lemmatize": True,
    "is_etalons_lemmatize": True
}

model_for_tokenizer = model_make(**model_parameters)
print(model_for_tokenizer)
tokenizer = SimpleTokenizer(Loader(model_for_tokenizer))

# `questions` is expected to be the list of raw question strings loaded earlier in this script.
tknz_texts = tokenizer.texts_processing(questions)
bigrams_df = bigrams_dictionary_create(tknz_texts)
print(bigrams_df)
bigrams_df.to_csv(os.path.join(data_path, "kss_ngrams_candidates.csv"))
print(quests50th_df[:100])

etalons_df = pd.read_csv(
    os.path.join(data_rout, "kosgu_data", "lingv_rules.csv"))
print(etalons_df["words"][:100])
print(etalons_df.shape)

train_df = pd.DataFrame(
    pd.concat([quests50th_df["words"], etalons_df["words"]], axis=0))
print('\n', train_df)
print(train_df.shape)

with open(os.path.join(models_rout, "simplest_model.pickle"), "br") as f:
    model = pickle.load(f)

tknz_txts = TokenizerApply(Loader(model))
# tx = "вчера нам пожелали доброго вечера 345 раз"
tz_txs = tknz_txts.texts_processing(list(train_df["words"]))
print(tz_txs[:10])
print(len(tz_txs))

# Prepare the list of synonyms:
stopwords_df = pd.read_csv(
    os.path.join(data_rout, 'kosgu_data', 'stopwords.csv'))
lingv_rules_df = pd.read_csv(
    os.path.join(data_rout, 'kosgu_data', 'lingv_rules.csv'))
ngrams_df = pd.read_csv(os.path.join(data_rout, 'kosgu_data', 'ngrams.csv'))
sinonims_files = ['01_sinonims.csv', '02_sinonims.csv']
synonyms = []
if __name__ == "__main__":
    data_rout = r"./data"
    txt_df = pd.read_csv(os.path.join(data_rout, "bss_data", "texts_collection.tsv"),
                         sep="\t")
    print(txt_df)

    models_rout = r"./models"
    with open(
            os.path.join(models_rout, "fast_answrs",
                         "bss_include_and_model.pickle"), "br") as f:
        model = pickle.load(f)

    smp_tkz = SimpleTokenizer(Loader(model))
    tknz_txts = smp_tkz.texts_processing(list(txt_df["texts"][:1000]))
    print(tknz_txts[:10])
    print(len(tknz_txts))

    dct1 = tf_idf_model_create(tknz_txts)
    print(dct1)

    dct2 = tf_model_create(tknz_txts)
    print(dct2)

    dct3 = lsi_model_create(tknz_txts, topics=10)
    print(dct3)

    # Check how the LSI model vectorizes a document:
    txt_corp = dct3["dictionary"].doc2bow(tknz_txts[5])
# Testing for the "fast answers" task: the original questions (the ones the
# network was trained on) are used as etalons, and we check how well the
# network picks out similar questions from the incoming stream.
import os, pickle, time
from utility import Loader
from texts_processors import TokenizerApply
import pandas as pd

# Load the data files:
tokenize_path = r'./tokenize_model'
test_path = r'./test'

with open(os.path.join(tokenize_path, "tokenizator_model.pickle"), "rb") as f:
    tokenize_model = pickle.load(f)

tokenize_loader = Loader(tokenize_model)
tknz = TokenizerApply(tokenize_loader)

# Load the questions:
df_data = pd.read_csv(os.path.join(test_path, "ндс_прибыль_5000.csv"))
df_data.rename(columns={"0": "text"}, inplace=True)

# Load the dictionary that the neural network "knows":
work_dict_df = pd.read_csv(os.path.join(test_path, "dictionary_work.csv"))
work_dict_list = list(work_dict_df["token"])
print(work_dict_list)

# Load the etalons (the original queries the neural network was trained on):
df_etalons = pd.read_csv(os.path.join(test_path, "etalons.csv"))
df_etalons = df_data

tktxs = tknz.texts_processing(df_data["text"])
import os, pickle
from utility import Loader

models_rout = r'./models'
with open(os.path.join(models_rout, "fast_answrs", "kosgu_lsi_model.pickle"),
          "br") as f:
    model = pickle.load(f)

loader_obj = Loader(model)
print(loader_obj.application_field["coeff"])
import os, pickle
import pandas as pd
import random
from texts_processors import TokenizerApply
from utility import Loader

data_rout = r'./data'
models_rout = r'./models'

with open(os.path.join(models_rout, "tokenizator_model.pickle"), "br") as f:
    lingv_model = pickle.load(f)

tk_appl = TokenizerApply(Loader(lingv_model))

data_df = pd.read_csv(os.path.join(data_rout, "data_group_01.csv"))
lemm_txts_l = tk_appl.texts_processing(list(data_df['text']))
lemm_txts_df = pd.DataFrame(list(zip([" ".join(x) for x in lemm_txts_l],
                                     data_df['group'])))
lemm_txts_df.rename(columns={0: 'text', 1: 'group'}, inplace=True)
print(lemm_txts_df)
lemm_txts_df.to_csv(os.path.join(data_rout, "lemm_data_group_01.csv"),
                    index=False, columns=['text', 'group'])

df = pd.read_csv(os.path.join(data_rout, "lemm_data_group_01.csv"))
print(df)

# Generate pairs of semantically identical questions:
lbs = set(df['group'])
results_tuples = []
for lb in lbs:
    work_list = list(df['text'][df['group'] == lb])
    for tx1 in work_list:
        for tx2 in work_list:
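# A hedged, self-contained sketch of the pair-generation step above; treating
# every ordered pair of distinct texts within one group as semantically
# identical, and labeling such pairs with 1, is an assumption:
def make_same_group_pairs(df):
    pairs = []
    for lb in set(df['group']):
        work_list = list(df['text'][df['group'] == lb])
        for tx1 in work_list:
            for tx2 in work_list:
                if tx1 != tx2:
                    pairs.append((tx1, tx2, 1))
    return pairs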
if __name__ == "__main__":
    import time

    data_rout = r'./data'
    models_rout = r'./models'

    with open(
            os.path.join(models_rout, "fast_answrs",
                         "kosgu_lsi_model.pickle"), "br") as f:
        model = pickle.load(f)

    print(model["model_type"])
    for i in model:
        print(i)

    cl = LsiClassifier(Loader(model))
    mc = ModelsChain([Loader(model)])
    tx = "командировки статья косгу"
    t1 = time.time()
    print(mc.rules_apply([tx]), time.time() - t1)

    """
    with open(os.path.join(models_rout, "fast_answrs", "kosgu_incl_and_test_model.pickle"), "br") as f:
        kosgu_incl_and = pickle.load(f)

    with open(os.path.join(models_rout, "fast_answrs", "bss_lsi_model.pickle"), "br") as f:
        bss_lsi = pickle.load(f)

    with open(os.path.join(models_rout, "fast_answrs", "bss_intersec_share_model.pickle"), "br") as f:
        bss_intersec = pickle.load(f)

    with open(os.path.join(models_rout, "fast_answrs", "bss_include_and_model.pickle"), "br") as f:
            vcs_arr = vcs_arr.reshape(vcs_arr.shape[0], vcs_arr.shape[1], 1)
            with graph.as_default():
                scores = self.model.classificator_algorithms[
                    "siamese_lstm_model"].predict([tx_tensor, vcs_arr])

            trues = [(tg, True) for scr, cf, tg in zip(scores, coeffs, tags)
                     if scr < cf]
            falses = [(tg, False) for scr, cf, tg in zip(scores, coeffs, tags)
                      if scr > cf]
            decisions.append((num, trues + falses))
        return decisions


if __name__ == "__main__":
    import time
    from utility import Loader

    data_rout = r'./data'
    models_rout = r'./models'

    with open(
            os.path.join(models_rout, "fast_answrs",
                         "bss_siamese_lstm_d2v.pickle"), "br") as f:
        bss_siamese = pickle.load(f)

    tx = ["кто может применять упрощенный баланс"]
    mdschain = ModelsChain([Loader(bss_siamese)],
                           classes=[SiameseNnDoc2VecClassifier])
    t1 = time.time()
    rt_t = mdschain.rules_apply(tx)
    print(tx[0], "bss_siamese:", rt_t, time.time() - t1)
                                 if scr > cf]
            # Sort so that the best-matching results (highest score) come first.
            trues = [
                tg for tg, scr, cf in sorted(
                    trues_list_scores, key=lambda x: x[1], reverse=True)
            ]
            texts_tags_similarity.append((num, trues))
        return texts_tags_similarity


if __name__ == "__main__":
    import time

    models_path = r'./models'
    with open(os.path.join(models_path, "bss_model_lsi.pickle"), "br") as f:
        model_lsi = pickle.load(f)

    loader_obj = Loader(model_lsi)
    print(loader_obj.dictionaries)

    t1 = time.time()
    cl = LsiClassifier(Loader(model_lsi))
    print(time.time() - t1)

    tx = "упрощенная бухгалтерская отчетность кто сдает"
    t1 = time.time()
    rls = cl.rules_apply([tx])
    print(time.time() - t1)
    print(rls)
if __name__ == '__main__':
    init()
    models_rout = os.path.dirname(__file__)

    with open(
            os.path.join(models_rout, 'models/tax_tags',
                         "simple_rules_model" + ".pickle"), "br") as f:
        model_tax = pickle.load(f)

    for i in model_tax:
        print(i)

    print("model_type:", model_tax["model_type"])
    print("model_type:", Loader(model_tax).model_type)

    clss = SimpleRules(Loader(model_tax))
    # print(clss.tknz_model.application_field)
    print(clss.tknz_model.application_field)
    print("pattern1:", pattern1, "pattern2:", pattern2)

    data_rout = r"./tax_demands"
    with open("example.txt", "r") as f:
        tx = f.read()
    # print(tx)
def model_make(**kwargs):
    if "lingv_rules_csv_path" in kwargs:
        lingv_rules_df = pd.read_csv(kwargs["lingv_rules_csv_path"])
        rules_dict = {
            'rules': list(lingv_rules_df["rules"]),
            'words': list(lingv_rules_df["words"]),
            'tags': list(lingv_rules_df["tag"]),
            'coeff': list(lingv_rules_df["coeff"])
        }
    else:
        rules_dict = {'rules': [], 'words': [], 'tags': [], 'coeff': []}

    if "lingvo" in kwargs:
        lingvo_list = []
        for lingv_dict in kwargs["lingvo"]:
            if "stopwords_csv_path" in lingv_dict:
                stopwords = []
                for file_name in lingv_dict["stopwords_csv_path"]:
                    stopwords_df = pd.read_csv(file_name)
                    stopwords.append(list(stopwords_df['words']))
                lingvo_list.append({"stopwords": stopwords, "tokenize": True})
            if "synonyms_files_csv_path" in lingv_dict:
                synonyms = []
                for file_name in lingv_dict["synonyms_files_csv_path"]:
                    synonyms_df = pd.read_csv(file_name)
                    synonyms.append(
                        list(
                            zip(synonyms_df["words"],
                                synonyms_df["initial_forms"])))
                lingvo_list.append({"synonyms": synonyms, "tokenize": True})
            if "ngrams_csv_path" in lingv_dict:
                ngrams = []
                for file_name in lingv_dict["ngrams_csv_path"]:
                    ngrams_df = pd.read_csv(file_name)
                    ngrams.append([
                        (" ".join([w1, w2]), tk) for w1, w2, tk in zip(
                            list(ngrams_df["w1"]), list(ngrams_df["w2"]),
                            list(ngrams_df["bigrams"]))
                    ])
                lingvo_list.append({"ngrams": ngrams, "tokenize": False})
            if "workwords_csv_path" in lingv_dict:
                workwords = []
                for file_name in lingv_dict["workwords_csv_path"]:
                    workwords_df = pd.read_csv(file_name)
                    workwords.append(list(workwords_df['words']))
                lingvo_list.append({"workwords": workwords, "tokenize": True})
    else:
        lingvo_list = []

    if kwargs["model_type"] == 'simple_rules':
        # Build a model to run the tokenizer:
        model_dict_simple = models_create(tokenizer="SimpleTokenizer",
                                          model_type="simple_rules",
                                          lingv_rules=rules_dict,
                                          lingvo=lingvo_list)
        tokenizer = SimpleTokenizer(Loader(model_dict_simple))

        if "is_lingvo_lemmatize" in kwargs:
            is_lingvo_lemmatize = kwargs["is_lingvo_lemmatize"]
            if is_lingvo_lemmatize:
                # print("tokenizer.dictionaries:", "\n", tokenizer.dictionaries)
                lingvo_list = tokenizer.dictionaries
        else:
            is_lingvo_lemmatize = False

        if "is_etalons_lemmatize" in kwargs:
            is_etalons_lemmatize = kwargs["is_etalons_lemmatize"]
            if is_etalons_lemmatize:
                rules_dict["words"] = tokenizer.texts_processing(
                    rules_dict["words"])
        else:
            is_etalons_lemmatize = False

        result_model_dict = models_create(
            tokenizer=kwargs["tokenizer"],
            model_type=kwargs["model_type"],
            lingv_rules=rules_dict,
            lingvo=lingvo_list,
            is_lingvo_lemmatize=is_lingvo_lemmatize,
            is_etalons_lemmatize=is_etalons_lemmatize)
        return result_model_dict

    if kwargs["model_type"] == 'lsi':
        # Load the LSI model:
        with open(kwargs["lsi_model_path"], "rb") as f:
            lsi_dict = pickle.load(f)

        # Build a model to run the tokenizer:
        model_dict_lsi = models_create(tokenizer="LsiTokenizer",
                                       model_type="lsi",
                                       lingv_rules=rules_dict,
                                       lingvo=lingvo_list,
                                       texts_algorithms=lsi_dict)
        tokenizer = LsiTokenizer(Loader(model_dict_lsi))

        if 'index' not in lsi_dict:
            et_vectors = tokenizer.texts_processing(rules_dict['words'])
            index = MatrixSimilarity(et_vectors,
                                     num_features=lsi_dict["num_topics"])
            lsi_dict["index"] = index

        if "is_lingvo_lemmatize" in kwargs:
            is_lingvo_lemmatize = kwargs["is_lingvo_lemmatize"]
            if is_lingvo_lemmatize:
                lingvo_list = tokenizer.dictionaries
        else:
            is_lingvo_lemmatize = False

        result_model_dict = models_create(
            tokenizer="LsiTokenizer",
            model_type=kwargs["model_type"],
            lingv_rules=rules_dict,
            lingvo=lingvo_list,
            texts_algorithms=lsi_dict,
            is_lingvo_lemmatize=is_lingvo_lemmatize,
            is_etalons_lemmatize=True)
        return result_model_dict
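# A hedged sketch of calling model_make() down the 'lsi' branch above; the file
# paths and the single-entry "lingvo" list are placeholders chosen to match the
# keys the function reads, not configuration taken from the project:
if __name__ == "__main__":
    lsi_parameters = {
        "model_type": "lsi",
        "tokenizer": "LsiTokenizer",
        "lingv_rules_csv_path": r'./data/lingv_rules.csv',
        "lsi_model_path": r'./models/bss_lsi_model.pickle',
        "lingvo": [{"stopwords_csv_path": [r'./data/04_stopwords.csv']}],
        "is_lingvo_lemmatize": True
    }
    lsi_model_dict = model_make(**lsi_parameters)
    print(lsi_model_dict["model_type"])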
    def texts_processing(self, incoming_text):
        return self.tnzr.texts_processing(incoming_text)


if __name__ == "__main__":
    data_rout = r'./data'
    models_rout = r'./models'

    """
    with open(os.path.join(models_rout, "fast_answrs", "include_and_model.pickle"), "br") as f:
        model = pickle.load(f)
    smpltk = SimpleTokenizer(Loader(model))
    """

    txts = ["упрощенная бухгалтерская отчетность кто сдает Фи ТАм котОРый али бы",
            "кто должен сдавать аудиторское заключение",
            "кто должен подписывать справки",
            "парит летит воздушный судно"]

    with open(os.path.join(models_rout, "bss_model_lsi.pickle"), "br") as f:
        model = pickle.load(f)
    print("model is loaded")

    for cl in model:
        print(cl)
    print(model["lingvo"])
    print(model["is_lingvo_lemmatize"])

    # tk_appl = TokenizerApply(Loader(model))
    tk_appl = LsiTokenizer(Loader(model))
    print(tk_appl.dictionaries)
    print(tk_appl.texts_processing(txts))
def doc2vec_model_maker(**kwargs):
    key_words = ['%%']
    questions = []
    for word in key_words:
        res = questions_from_clickhouse(
            clickhose_host="srv02.ml.dev.msk3.sl.amedia.tech",
            user='******',
            password='******',
            date_in='2020-04-01',
            date_out='2020-08-31',
            limit=1000000,
            pubids_tuple=kwargs["pubids_tuple"],
            key_word=word)
        qs, dts = zip(*res)
        questions = questions + list(qs)

    print(len(questions))
    shuffle(questions)
    questions = questions  # [:1000]
    # data_for_models = list(questions[:1000000])

    # Model for the tokenizer (we use the simple model that every system is assumed to have):
    if "simple_model_path" in kwargs:
        with open(kwargs["simple_model_path"], "rb") as f:
            model_for_tokenizer = pickle.load(f)
        tokenizer = SimpleTokenizer(Loader(model_for_tokenizer))
        tz_txs = tokenizer.texts_processing(questions)

    # TODO: move lemmatization into a separate function.
    if "lingvo_data" in kwargs:
        asc_dsc_syn = []
        asc_dsc_ngrm = []
        if "synonyms" in kwargs["lingvo_data"]:
            for fn in kwargs["lingvo_data"]["synonyms"]:
                temp_syn_df = pd.read_csv(fn)
                syn_asc_temp = [
                    " " + tx + " "
                    for tx in texts_lemmatize(temp_syn_df['words'])
                ]
                syn_dsc_temp = [
                    " " + tx + " "
                    for tx in texts_lemmatize(temp_syn_df['initial_forms'])
                ]
                asc_dsc_syn += list(zip(syn_asc_temp, syn_dsc_temp))

        if "ngrams" in kwargs["lingvo_data"]:
            asc_dsc_ngrm = []
            for fn in kwargs["lingvo_data"]["ngrams"]:
                temp_ngrms_df = pd.read_csv(fn)
                temp_ngrms = [(' '.join([w1, w2]), bgr)
                              for w1, w2, bgr in zip(temp_ngrms_df['w1'],
                                                     temp_ngrms_df['w2'],
                                                     temp_ngrms_df['bigrams'])]
                asc_dsc_ngrm += temp_ngrms

        asc_dsc_list = asc_dsc_syn + asc_dsc_ngrm
        tz_txs = ngram_apply(asc_dsc_list, texts_lemmatize(questions))

    tz_txs_split = [tx.split() for tx in tz_txs if tx.split() != []]

    # Build a doc2vec model from the collected questions:
    model_parameters = {
        "split_txt": tz_txs_split,
        "model_rout": kwargs["doc2vec_model_path"]
    }
    create_doc2vec_model(**model_parameters)
    return 0
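# A hedged usage sketch for doc2vec_model_maker(); the pubids, CSV names and
# output path are placeholders, not values taken from the project:
if __name__ == "__main__":
    doc2vec_model_maker(
        pubids_tuple=(6, 9),
        lingvo_data={
            "synonyms": [r'./data/01_synonyms.csv'],
            "ngrams": [r'./data/kss_ngrams.csv']
        },
        doc2vec_model_path=r'./models/doc2vec_model')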