Example #1
def init():
    # load the models and other resources here
    models_rout = os.path.dirname(__file__)

    # load the models for every pubid and build a dictionary in which the pubid values are the keys
    # and the values are the classifying models and the links between tags and answer ids (and modules)
    models_dict = {"simple_rules_model": None}

    # load the models and immediately turn them into loader_obj
    for model_name in models_dict:
        with open(
                os.path.join(models_rout, 'models/tax_tags',
                             str(model_name) + ".pickle"), "br") as f:
            model = pickle.load(f)
            models_dict[model_name] = model

    # load the lemmatizer for the patterns:
    tknz = TokenizerApply(Loader(models_dict["simple_rules_model"]))

    global pattern1, pattern2
    # lemmatize the patterns used to select fragments
    pattern1 = tknz.texts_processing(["в ходе проведения"])[0]
    pattern2 = tknz.texts_processing(["В течение 5 <4> рабочих дней"])[0]
    """определение моделей, которые потом используются для разных pubid"""
    model_1 = ModelsChain([(SimpleRules, models_dict["simple_rules_model"])])

    global pub_models
    pub_models = {
        1: {
            "model": model_1,
            "tag_answ_link": None,
            "tokenizer": tknz
        }
    }
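
A minimal usage sketch, not part of the original snippet: assuming init() has filled the globals above, a request for a given pubid could be routed through the matching entry of pub_models. The classify() helper and its arguments below are hypothetical; rules_apply() is called the same way as in the ModelsChain examples further down.

def classify(pubid, texts):
    # hypothetical helper: look up the entry that init() prepared for this pubid
    entry = pub_models[pubid]
    # apply the chained rules, as ModelsChain.rules_apply is used in Examples #10 and #11
    return entry["model"].rules_apply(texts)
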
Example #2
def lsi_model_maker(**kwargs):
    key_words = ['%%']
    questions = []
    for word in key_words:
        res = questions_from_clickhouse(
            clickhose_host="srv02.ml.dev.msk3.sl.amedia.tech",
            user='******',
            password='******',
            date_in='2020-01-01',
            date_out='2020-05-31',
            limit=100000,
            pubids_tuple=kwargs["pubids_tuple"],
            key_word=word)

        qs, dts = zip(*res)
        questions = questions + list(qs)

    shuffle(questions)
    etalons_df = pd.read_csv(kwargs["lingv_rules_csv_path"])
    data_for_models = list(etalons_df["words"]) + questions[:100000]
    print(data_for_models[:10])
    print(len(data_for_models))

    # model for the tokenizer:
    model_parameters = {
        "model_type": "simple_rules",
        "stopwords_csv_path": os.path.join(data_path, "04_stopwords.csv"),
        "ngrams_csv_path": os.path.join(data_path, "kss_ngrams.csv"),
        "synonyms_files_csv_path": [
            os.path.join(data_path, "01_synonyms.csv"),
            os.path.join(data_path, "02_synonyms.csv"),
            os.path.join(data_path, "03_synonyms.csv")
        ],
        "tokenizer": "SimpleTokenizer",
        "is_lingvo_lemmatize": True,
        "is_etalons_lemmatize": True
    }

    model_for_tokenizer = model_make(**model_parameters)
    tokenizer = SimpleTokenizer(Loader(model_for_tokenizer))

    tz_txs = tokenizer.texts_processing(data_for_models)

    # build the LSI model from the collection of 100 thousand questions:
    lsi_model_dict = lsi_model_create(tz_txs, topics=1500)

    with open(kwargs["lsi_model_path"], "bw") as f:
        pickle.dump(lsi_model_dict, f)

    return 0
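
A hedged call sketch, not part of the original: the pubid values and file paths below are placeholders; only the keyword names are taken from the body of lsi_model_maker above.

if __name__ == "__main__":
    # placeholder arguments: pubid values and paths are assumptions, not from the source
    lsi_model_maker(
        pubids_tuple=(6, 9),
        lingv_rules_csv_path="./data/lingv_rules.csv",
        lsi_model_path="./models/bss_lsi_model.pickle")
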
Example #3
    data_rout = r'./data'
    models_rout = r'./models'
    """
    with open(os.path.join(models_rout, "fast_answrs", "include_and_model.pickle"), "br") as f:
        model = pickle.load(f)    
    smpltk = SimpleTokenizer(Loader(model)) 
    """

    txts = [
        "упрощенная бухгалтерская отчетность кто сдает Фи ТАм котОРый али бы",
        "кто должен сдавать аудиторское заключение",
        "кто должен подписывать справки", "парит летит воздушный судно"
    ]

    with open(os.path.join(models_rout, "fast_answrs", "bss_lsi_model.pickle"),
              "br") as f:
        model = pickle.load(f)
    '''
    lsi_tkz = LsiTokenizer(Loader(model))
    t1 = time.time()
    tk_m = lsi_tkz.model_tokenize()
    print(time.time() - t1)
    
    tk_txt = lsi_tkz.texts_processing(txts)
    print(tk_txt)
    print(len(tk_txt))
    '''

    tk_appl = TokenizerApply(Loader(model))
    print(tk_appl.texts_processing(txts))
Example #4
data_path = r'./data'
models_path = r'./models'

model_parameters = {
    "model_type": "simple_rules",
    "stopwords_csv_path": os.path.join(data_path, "04_stopwords.csv"),
    "synonyms_files_csv_path": [
        os.path.join(data_path, "01_synonyms.csv"),
        os.path.join(data_path, "02_synonyms.csv"),
        os.path.join(data_path, "03_synonyms.csv")
    ],
    "tokenizer": "SimpleTokenizer",
    "is_lingvo_lemmatize": True,
    "is_etalons_lemmatize": True
}

model_for_tokenizer = model_make(**model_parameters)
print(model_for_tokenizer)
tokenizer = SimpleTokenizer(Loader(model_for_tokenizer))
tknz_texts = tokenizer.texts_processing(questions)

bigrams_df = bigrams_dictionary_create(tknz_texts)
print(bigrams_df)
bigrams_df.to_csv(os.path.join(data_path, "kss_ngrams_candidates.csv"))
print(quests50th_df[:100])

etalons_df = pd.read_csv(
    os.path.join(data_rout, "kosgu_data", "lingv_rules.csv"))
print(etalons_df["words"][:100])
print(etalons_df.shape)

train_df = pd.DataFrame(
    pd.concat([quests50th_df["words"], etalons_df["words"]], axis=0))
print('\n', train_df)
print(train_df.shape)

with open(os.path.join(models_rout, "simplest_model.pickle"), "br") as f:
    model = pickle.load(f)

tknz_txts = TokenizerApply(Loader(model))
# tx = "вчера нам пожелали доброго вечера 345 раз"

tz_txs = tknz_txts.texts_processing(list(train_df["words"]))
print(tz_txs[:10])
print(len(tz_txs))

# prepare the list of synonyms:
stopwords_df = pd.read_csv(
    os.path.join(data_rout, 'kosgu_data', 'stopwords.csv'))
lingv_rules_df = pd.read_csv(
    os.path.join(data_rout, 'kosgu_data', 'lingv_rules.csv'))
ngrams_df = pd.read_csv(os.path.join(data_rout, 'kosgu_data', 'ngrams.csv'))

sinonims_files = ['01_sinonims.csv', '02_sinonims.csv']
synonyms = []
Example #6

if __name__ == "__main__":
    data_rout = r"./data"
    txt_df = pd.read_csv(os.path.join(data_rout, "bss_data",
                                      "texts_collection.tsv"),
                         sep="\t")
    print(txt_df)

    models_rout = r"./models"
    with open(
            os.path.join(models_rout, "fast_answrs",
                         "bss_include_and_model.pickle"), "br") as f:
        model = pickle.load(f)

    smp_tkz = SimpleTokenizer(Loader(model))
    tknz_txts = smp_tkz.texts_processing(list(txt_df["texts"][:1000]))
    print(tknz_txts[:10])
    print(len(tknz_txts))

    dct1 = tf_idf_model_create(tknz_txts)
    print(dct1)

    dct2 = tf_model_create(tknz_txts)
    print(dct2)

    dct3 = lsi_model_create(tknz_txts, topics=10)
    print(dct3)

    # check the vectorization of the LSI model:
    txt_corp = dct3["dictionary"].doc2bow(tknz_txts[5])
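    # The check is truncated in the source; a hedged continuation could look like the
    # two lines below. The "lsi_model" key name is an assumption: only "dictionary"
    # is visible in this snippet, so adjust the key to whatever lsi_model_create returns.
    lsi_vector = dct3["lsi_model"][txt_corp]
    print(lsi_vector)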
Example #7
# Testing for the "fast answers" task: the original questions (the ones the network was trained on) are used
# as etalons; we check how well it picks out similar questions from the incoming stream
import os, pickle, time
from utility import Loader
from texts_processors import TokenizerApply
import pandas as pd

# load the data files:
tokenize_path = r'./tokenize_model'
test_path = r'./test'

with open(os.path.join(tokenize_path, "tokenizator_model.pickle"), "rb") as f:
    tokenize_model = pickle.load(f)
    tokenize_loader = Loader(tokenize_model)

tknz = TokenizerApply(tokenize_loader)

# load the questions
df_data = pd.read_csv(os.path.join(test_path, "ндс_прибыль_5000.csv"))
df_data.rename(columns={"0": "text"}, inplace=True)

# load the dictionary that the neural network "knows"
work_dict_df = pd.read_csv(os.path.join(test_path, "dictionary_work.csv"))
work_dict_list = list(work_dict_df["token"])
print(work_dict_list)

# load the etalons (the original queries the neural network was trained on)
df_etalons = pd.read_csv(os.path.join(test_path, "etalons.csv"))

df_etalons = df_data
tktxs = tknz.texts_processing(df_data["text"])
Example #8
import os, pickle
from utility import Loader

models_rout = r'./models'
with open(os.path.join(models_rout, "fast_answrs", "kosgu_lsi_model.pickle"),
          "br") as f:
    model = pickle.load(f)

loader_obj = Loader(model)
print(loader_obj.application_field["coeff"])
Example #9
import os, pickle
import pandas as pd
import random
from texts_processors import TokenizerApply
from utility import Loader

data_rout = r'./data'
models_rout = r'./models'

with open(os.path.join(models_rout, "tokenizator_model.pickle"), "br") as f:
    lingv_model = pickle.load(f)

tk_appl = TokenizerApply(Loader(lingv_model))
data_df = pd.read_csv(os.path.join(data_rout, "data_group_01.csv"))
lemm_txts_l = tk_appl.texts_processing(list(data_df['text']))
lemm_txts_df = pd.DataFrame(list(zip([" ".join(x) for x in lemm_txts_l], data_df['group'])))
lemm_txts_df.rename(columns={0: 'text', 1: 'group'}, inplace=True)
print(lemm_txts_df)

lemm_txts_df.to_csv(os.path.join(data_rout, "lemm_data_group_01.csv"), index=False, columns=['text', 'group'])
df = pd.read_csv(os.path.join(data_rout, "lemm_data_group_01.csv"))
print(df)


# generate pairs of semantically identical questions
lbs = set(df['group'])
results_tuples = []
for lb in lbs:
    work_list = list(df['text'][df['group'] == lb])
    for tx1 in work_list:
        for tx2 in work_list:
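            # Hedged completion, since the source is cut off here: collect every pair of
            # distinct texts that share the same group label as a semantically equal pair
            if tx1 != tx2:
                results_tuples.append((tx1, tx2))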
Example #10
if __name__ == "__main__":
    import time

    data_rout = r'./data'
    models_rout = r'./models'

    with open(
            os.path.join(models_rout, "fast_answrs", "kosgu_lsi_model.pickle"),
            "br") as f:
        model = pickle.load(f)

    print(model["model_type"])
    for i in model:
        print(i)
    cl = LsiClassifier(Loader(model))
    mc = ModelsChain([Loader(model)])
    tx = "командировки статья косгу"
    t1 = time.time()
    print(mc.rules_apply([tx]), time.time() - t1)
"""
    with open(os.path.join(models_rout, "fast_answrs", "kosgu_incl_and_test_model.pickle"), "br") as f:
        kosgu_incl_and = pickle.load(f)

    with open(os.path.join(models_rout, "fast_answrs", "bss_lsi_model.pickle"), "br") as f:
        bss_lsi = pickle.load(f)

    with open(os.path.join(models_rout, "fast_answrs", "bss_intersec_share_model.pickle"), "br") as f:
        bss_intersec = pickle.load(f)

    with open(os.path.join(models_rout, "fast_answrs", "bss_include_and_model.pickle"), "br") as f:
Example #11
            vcs_arr = vcs_arr.reshape(vcs_arr.shape[0], vcs_arr.shape[1], 1)
            with graph.as_default():
                scores = self.model.classificator_algorithms[
                    "siamese_lstm_model"].predict([tx_tensor, vcs_arr])
            trues = [(tg, True) for scr, cf, tg in zip(scores, coeffs, tags)
                     if scr < cf]
            falses = [(tg, False) for scr, cf, tg in zip(scores, coeffs, tags)
                      if scr > cf]
            decisions.append((num, trues + falses))

        return decisions


if __name__ == "__main__":
    import time
    from utility import Loader

    data_rout = r'./data'
    models_rout = r'./models'

    with open(
            os.path.join(models_rout, "fast_answrs",
                         "bss_siamese_lstm_d2v.pickle"), "br") as f:
        bss_siamese = pickle.load(f)

    tx = ["кто может применять упрощенный баланс"]
    mdschain = ModelsChain([Loader(bss_siamese)],
                           classes=[SiameseNnDoc2VecClassifier])
    t1 = time.time()
    rt_t = mdschain.rules_apply(tx)
    print(tx[0], "bss_siamese:", rt_t, time.time() - t1)
Example #12
                                 if scr > cf]
            # sort so that the most relevant results (with the highest score) come first
            trues = [
                tg for tg, scr, cf in sorted(
                    trues_list_scores, key=lambda x: x[1], reverse=True)
            ]
            texts_tags_similarity.append((num, trues))
        return texts_tags_similarity


if __name__ == "__main__":
    import time

    models_path = r'./models'

    with open(os.path.join(models_path, "bss_model_lsi.pickle"), "br") as f:
        model_lsi = pickle.load(f)

    loader_obj = Loader(model_lsi)
    print(loader_obj.dictionaries)

    t1 = time.time()
    cl = LsiClassifier(Loader(model_lsi))
    print(time.time() - t1)

    tx = "упрощенная бухгалтерская отчетность кто сдает"
    t1 = time.time()
    rls = cl.rules_apply([tx])
    print(time.time() - t1)
    print(rls)
Example #13

if __name__ == '__main__':
    init()
    models_rout = os.path.dirname(__file__)
    with open(
            os.path.join(models_rout, 'models/tax_tags',
                         "simple_rules_model" + ".pickle"), "br") as f:
        model_tax = pickle.load(f)

    for i in model_tax:
        print(i)

    print("model_type:", model_tax["model_type"])

    print("model_type:", Loader(model_tax).model_type)

    clss = SimpleRules(Loader(model_tax))
    # print(clss.tknz_model.application_field)

    print(clss.tknz_model.application_field)

    print("pattern1:", pattern1, "pattern2:", pattern2)

    data_rout = r"./tax_demands"

    with open("example.txt", "r") as f:
        tx = f.read()

    # print(tx)
Example #14
def model_make(**kwargs):
    if "lingv_rules_csv_path" in kwargs:
        lingv_rules_df = pd.read_csv(kwargs["lingv_rules_csv_path"])
        rules_dict = {
            'rules': list(lingv_rules_df["rules"]),
            'words': list(lingv_rules_df["words"]),
            'tags': list(lingv_rules_df["tag"]),
            'coeff': list(lingv_rules_df["coeff"])
        }
    else:
        rules_dict = {'rules': [], 'words': [], 'tags': [], 'coeff': []}

    if "lingvo" in kwargs:
        lingvo_list = []
        for lingv_dict in kwargs["lingvo"]:
            if "stopwords_csv_path" in lingv_dict:
                stopwords = []
                for file_name in lingv_dict["stopwords_csv_path"]:
                    stopwords_df = pd.read_csv(file_name)
                    stopwords.append(list(stopwords_df['words']))
                lingvo_list.append({"stopwords": stopwords, "tokenize": True})

            if "synonyms_files_csv_path" in lingv_dict:
                synonyms = []
                for file_name in lingv_dict["synonyms_files_csv_path"]:
                    synonyms_df = pd.read_csv(file_name)
                    synonyms.append(
                        list(
                            zip(synonyms_df["words"],
                                synonyms_df["initial_forms"])))
                lingvo_list.append({"synonyms": synonyms, "tokenize": True})

            if "ngrams_csv_path" in lingv_dict:
                ngrams = []
                for file_name in lingv_dict["ngrams_csv_path"]:
                    ngrams_df = pd.read_csv(file_name)
                    ngrams.append([
                        (" ".join([w1, w2]), tk) for w1, w2, tk in zip(
                            list(ngrams_df["w1"]), list(ngrams_df["w2"]),
                            list(ngrams_df["bigrams"]))
                    ])
                lingvo_list.append({"ngrams": ngrams, "tokenize": False})

            if "workwords_csv_path" in lingv_dict:
                workwords = []
                for file_name in lingv_dict["workwords_csv_path"]:
                    workwords_df = pd.read_csv(file_name)
                    workwords.append(list(workwords_df['words']))
                lingvo_list.append({"workwords": workwords, "tokenize": True})

    else:
        lingvo_list = []

    if kwargs["model_type"] == 'simple_rules':
        # assemble the model used to run the tokenizer:
        model_dict_simple = models_create(tokenizer="SimpleTokenizer",
                                          model_type="simple_rules",
                                          lingv_rules=rules_dict,
                                          lingvo=lingvo_list)
        tokenizer = SimpleTokenizer(Loader(model_dict_simple))

        if "is_lingvo_lemmatize" in kwargs:
            is_lingvo_lemmatize = kwargs["is_lingvo_lemmatize"]
            if is_lingvo_lemmatize:
                # print("tokenizer.dictionaries:", "\n", tokenizer.dictionaries)
                lingvo_list = tokenizer.dictionaries
        else:
            is_lingvo_lemmatize = False

        if "is_etalons_lemmatize" in kwargs:
            is_etalons_lemmatize = kwargs["is_etalons_lemmatize"]
            if is_etalons_lemmatize:
                rules_dict["words"] = tokenizer.texts_processing(
                    rules_dict["words"])
        else:
            is_etalons_lemmatize = False

        result_model_dict = models_create(
            tokenizer=kwargs["tokenizer"],
            model_type=kwargs["model_type"],
            lingv_rules=rules_dict,
            lingvo=lingvo_list,
            is_lingvo_lemmatize=is_lingvo_lemmatize,
            is_etalons_lemmatize=is_etalons_lemmatize)
        return result_model_dict

    if kwargs["model_type"] == 'lsi':
        # load the LSI model:
        with open(kwargs["lsi_model_path"], "rb") as f:
            lsi_dict = pickle.load(f)

        # assemble the model used to run the tokenizer:
        model_dict_lsi = models_create(tokenizer="LsiTokenizer",
                                       model_type="lsi",
                                       lingv_rules=rules_dict,
                                       lingvo=lingvo_list,
                                       texts_algorithms=lsi_dict)

        tokenizer = LsiTokenizer(Loader(model_dict_lsi))
        if 'index' not in lsi_dict:
            et_vectors = tokenizer.texts_processing(rules_dict['words'])
            index = MatrixSimilarity(et_vectors,
                                     num_features=lsi_dict["num_topics"])
            lsi_dict["index"] = index

        if "is_lingvo_lemmatize" in kwargs:
            is_lingvo_lemmatize = kwargs["is_lingvo_lemmatize"]
            if is_lingvo_lemmatize:
                lingvo_list = tokenizer.dictionaries
        else:
            is_lingvo_lemmatize = False

        result_model_dict = models_create(
            tokenizer="LsiTokenizer",
            model_type=kwargs["model_type"],
            lingv_rules=rules_dict,
            lingvo=lingvo_list,
            texts_algorithms=lsi_dict,
            is_lingvo_lemmatize=is_lingvo_lemmatize,
            is_etalons_lemmatize=True)
        return result_model_dict
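
A hedged call sketch for the "lsi" branch, not part of the original: the paths are placeholders, the keyword names mirror the ones model_make reads above, and "lingvo" follows the list-of-dicts shape expected by the loop at the top of the function. The assumption that the returned dict exposes a "model_type" key is based on the pickled models printed in the other examples.

if __name__ == "__main__":
    # placeholder paths; keyword names are taken from the branches of model_make above
    lsi_model = model_make(
        model_type="lsi",
        lsi_model_path="./models/bss_lsi_model.pickle",
        lingv_rules_csv_path="./data/lingv_rules.csv",
        lingvo=[{"stopwords_csv_path": ["./data/04_stopwords.csv"]},
                {"synonyms_files_csv_path": ["./data/01_synonyms.csv"]}],
        is_lingvo_lemmatize=True)
    print(lsi_model["model_type"])
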
Example #15
    def texts_processing(self, incoming_text):
        return self.tnzr.texts_processing(incoming_text)


if __name__ == "__main__":
    data_rout = r'./data'
    models_rout = r'./models'

    """
    with open(os.path.join(models_rout, "fast_answrs", "include_and_model.pickle"), "br") as f:
        model = pickle.load(f)    
    smpltk = SimpleTokenizer(Loader(model)) 
    """

    txts = ["упрощенная бухгалтерская отчетность кто сдает Фи ТАм котОРый али бы",
            "кто должен сдавать аудиторское заключение", "кто должен подписывать справки",
            "парит летит воздушный судно"]

    with open(os.path.join(models_rout, "bss_model_lsi.pickle"), "br") as f:
        model = pickle.load(f)
        print("model is loaded")

    for cl in model:
        print(cl)
    print(model["lingvo"])
    print(model["is_lingvo_lemmatize"])
    # tk_appl = TokenizerApply(Loader(model))
    tk_appl = LsiTokenizer(Loader(model))
    print(tk_appl.dictionaries)
    print(tk_appl.texts_processing(txts))
Example #16
def doc2vec_model_maker(**kwargs):
    key_words = ['%%']
    questions = []
    for word in key_words:
        res = questions_from_clickhouse(
            clickhose_host="srv02.ml.dev.msk3.sl.amedia.tech",
            user='******',
            password='******',
            date_in='2020-04-01',
            date_out='2020-08-31',
            limit=1000000,
            pubids_tuple=kwargs["pubids_tuple"],
            key_word=word)

        qs, dts = zip(*res)
        questions = questions + list(qs)

    print(len(questions))
    shuffle(questions)
    questions = questions  # [:1000]
    # data_for_models = list(questions[:1000000])

    # model for the tokenizer (we use the simple model that every system is assumed to have):
    if "simple_model_path" in kwargs:
        with open(kwargs["simple_model_path"], "rb") as f:
            model_for_tokenizer = pickle.load(f)

        tokenizer = SimpleTokenizer(Loader(model_for_tokenizer))
        tz_txs = tokenizer.texts_processing(questions)

    # a separate function should be written for lemmatization
    if "lingvo_data" in kwargs:
        asc_dsc_syn = []
        asc_dsc_ngrm = []
        if "synonyms" in kwargs["lingvo_data"]:
            for fn in kwargs["lingvo_data"]["synonyms"]:
                temp_syn_df = pd.read_csv(fn)
                syn_asc_temp = [
                    " " + tx + " "
                    for tx in texts_lemmatize(temp_syn_df['words'])
                ]
                syn_dsc_temp = [
                    " " + tx + " "
                    for tx in texts_lemmatize(temp_syn_df['initial_forms'])
                ]
                asc_dsc_syn += list(zip(syn_asc_temp, syn_dsc_temp))

        if "ngrams" in kwargs["lingvo_data"]:
            asc_dsc_ngrm = []
            for fn in kwargs["lingvo_data"]["ngrams"]:
                temp_ngrms_df = pd.read_csv(fn)
                temp_ngrms = [(' '.join([w1, w2]), bgr) for w1, w2, bgr in zip(
                    temp_ngrms_df['w1'], temp_ngrms_df['w2'],
                    temp_ngrms_df['bigrams'])]
                asc_dsc_ngrm += temp_ngrms

        asc_dsc_list = asc_dsc_syn + asc_dsc_ngrm
        tz_txs = ngram_apply(asc_dsc_list, texts_lemmatize(questions))
        tz_txs_split = [tx.split() for tx in tz_txs if tx.split() != []]

    # assemble the Doc2Vec model from the collected questions:
    model_parameters = {
        "split_txt": tz_txs_split,
        "model_rout": kwargs["doc2vec_model_path"]
    }
    create_doc2vec_model(**model_parameters)

    return 0
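
A hedged call sketch, not part of the original: the pubid values and paths are placeholders; the keyword names come from the function body above, using the "lingvo_data" branch for lemmatization.

if __name__ == "__main__":
    # placeholder arguments: pubid values and file paths are assumptions, not from the source
    doc2vec_model_maker(
        pubids_tuple=(6, 9),
        lingvo_data={"synonyms": ["./data/01_synonyms.csv"],
                     "ngrams": ["./data/kss_ngrams.csv"]},
        doc2vec_model_path="./models/bss_doc2vec_model")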