def get_tm_index(**kwargs):
    from util.service_es import search
    from nlpmonitor.settings import ES_CLIENT

    name = kwargs['name']
    index_tm = kwargs['index_tm']

    # Check if already exists
    if ES_CLIENT.indices.exists(index_tm):
        query = {
            "name": name,
        }
        if 'perform_actualize' in kwargs:
            query['is_ready'] = True
        s = search(ES_CLIENT, index_tm, query, source=[], get_search_obj=True)
        s = s.filter('exists', field="number_of_topics")
        s = s.execute()
        if s:
            return s[-1]
        query = {
            "name.keyword": name,
        }
        if 'perform_actualize' in kwargs:
            query['is_ready'] = True
        s = search(ES_CLIENT, index_tm, query, source=[], get_search_obj=True)
        s = s.filter('exists', field="number_of_topics")
        s = s.execute()
        if s:
            return s[-1]
    raise TMNotFoundException("Topic Modelling index not found!")

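
# NOTE: `TMNotFoundException` is raised above but is not defined or imported in this excerpt;
# it presumably lives elsewhere in the project. A minimal sketch, assuming it is a plain
# Exception subclass (name and location are assumptions):
class TMNotFoundException(Exception):
    """Raised when no matching (ready) topic-modelling index document is found."""
    pass
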
def validator(mappings_dict, client, index_theta_one, index_theta_two, datetime_from_tm_2,
              datetime_to_tm_1, number_of_topics):
    """
    Score each candidate threshold mapping: for every parent topic and each of its mapped
    child topics, compute the Jaccard similarity of the document sets whose topic weight
    exceeds 0.05 inside the overlapping time window, then average per threshold.
    Returns, per threshold: [share of mapped topics, average Jaccard score, min-max normalized score].
    """
    import numpy as np
    from sklearn.preprocessing import MinMaxScaler
    from nltk.metrics import jaccard_distance
    from util.service_es import search

    scaler = MinMaxScaler()
    scores = dict(zip(mappings_dict.keys(), [0] * len(mappings_dict)))
    scores_for_normalization = []
    for threshhold, map_dict in mappings_dict.items():
        cnt_matches_for_threshhold = 0
        for topic_parent, topic_childs_list in map_dict.items():
            theta_1 = search(client=client, index=index_theta_one,
                             query={'datetime__gte': datetime_from_tm_2,
                                    'datetime__lte': datetime_to_tm_1,
                                    'topic_id': topic_parent,
                                    'topic_weight__gte': 0.05},
                             source=['document_es_id'],
                             start=0, end=1000000, get_scan_obj=True)
            scanned_parent = set([elem.document_es_id for elem in theta_1])
            for topic_child in topic_childs_list:
                theta_2 = search(client=client, index=index_theta_two,
                                 query={'datetime__gte': datetime_from_tm_2,
                                        'datetime__lte': datetime_to_tm_1,
                                        'topic_id': topic_child,
                                        'topic_weight__gte': 0.05},
                                 source=['document_es_id'],
                                 start=0, end=1000000, get_scan_obj=True)
                jaccard_score = 1 - jaccard_distance(scanned_parent,
                                                     set([elem.document_es_id for elem in theta_2]))
                scores[threshhold] += jaccard_score
                cnt_matches_for_threshhold += 1
        try:
            avg_score = scores[threshhold] / cnt_matches_for_threshhold
            scores_for_normalization.append(avg_score)
            scores[threshhold] = [len(map_dict) / number_of_topics, avg_score]
        except ZeroDivisionError:
            scores[threshhold] = [len(map_dict) / number_of_topics, 0]
    scores_normalized = [score[0] for score in
                         scaler.fit_transform(np.array(scores_for_normalization).reshape(-1, 1))]
    for i, items in enumerate(scores.items()):
        scores[items[0]] += [scores_normalized[i]]
    return scores

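
# A toy illustration (not part of the pipeline) of the scoring step used in validator():
# nltk.metrics.jaccard_distance works on sets, so `1 - jaccard_distance(a, b)` is the
# Jaccard similarity |a & b| / |a | b| of two document-id sets.
def _jaccard_similarity_example():
    from nltk.metrics import jaccard_distance
    parent_docs = {"doc_1", "doc_2", "doc_3"}
    child_docs = {"doc_2", "doc_3", "doc_4"}
    similarity = 1 - jaccard_distance(parent_docs, child_docs)
    assert abs(similarity - 0.5) < 1e-9  # |{doc_2, doc_3}| / |{doc_1..doc_4}| = 2 / 4
    return similarity
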
def pool_embeddings(**kwargs):
    from util.service_es import search
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_EMBEDDING

    start = kwargs['start']
    end = kwargs['end']
    corpus = kwargs['corpus']
    from_embedding_name = kwargs['from_embedding_name']
    from_embedding_by_unit = kwargs['from_embedding_by_unit']
    to_embedding_name = kwargs['to_embedding_name']
    to_embedding_by_unit = kwargs['to_embedding_by_unit']
    pooling = kwargs['pooling']

    # Get embedding object
    query = {
        "corpus": corpus.lower(),
        # "is_ready": False,  # TODO Uncomment
        "name": to_embedding_name.lower(),
    }
    embedding = search(ES_CLIENT, ES_INDEX_EMBEDDING, query)[-1]
    number_of_documents = embedding['number_of_documents']

    # Get documents
    documents = search(ES_CLIENT, ES_INDEX_DOCUMENT, {"corpus": corpus.lower()},
                       start=int(start / 100 * number_of_documents),
                       end=int(end / 100 * number_of_documents),
                       source=['id', from_embedding_name],
                       sort=['id'])

    embeddings_to_write = []
    documents_to_write = []
    batch_size = 10000
    # Pooling
    for document in documents:
        embeddings_to_write.append([])
        pool_document(document, embeddings_to_write, documents_to_write,
                      from_embedding_name, to_embedding_by_unit,
                      from_embedding_by_unit, pooling)
        # Update to ES
        if len(embeddings_to_write) >= batch_size:
            persist_embeddings_to_es(ES_CLIENT, ES_INDEX_DOCUMENT, documents_to_write,
                                     embeddings_to_write, to_embedding_name)
            embeddings_to_write = []
            documents_to_write = []
    persist_embeddings_to_es(ES_CLIENT, ES_INDEX_DOCUMENT, documents_to_write,
                             embeddings_to_write, to_embedding_name)

def generate_meta_dtm(**kwargs):
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_META_DTM
    from mainapp.documents import META_DTM
    from util.service_es import search
    from elasticsearch_dsl import Index

    meta_name = kwargs['meta_dtm_name']
    volume_days = kwargs['tm_volume_days']
    delta_days = kwargs['delta_days']
    reset_index = kwargs['reset_index']
    from_date = kwargs['from_date']
    to_date = kwargs['to_date']

    if reset_index:
        Index(ES_INDEX_META_DTM).delete(using=ES_CLIENT, ignore=404)
    if not ES_CLIENT.indices.exists(ES_INDEX_META_DTM):
        ES_CLIENT.indices.create(index=ES_INDEX_META_DTM, body={
            "settings": META_DTM.Index.settings,
            "mappings": META_DTM.Index.mappings
        })

    s = search(client=ES_CLIENT, index=ES_INDEX_META_DTM, query={
        'meta_name': meta_name,
        'volume_days': volume_days,
        'delta_days': delta_days,
        'from_date': from_date,
        'to_date': to_date
    })
    if s:
        ES_CLIENT.update(index=ES_INDEX_META_DTM, id=s[-1].meta.id, body={
            "doc": {
                "meta_name": meta_name,
                "volume_days": volume_days,
                "delta_days": delta_days,
                'from_date': from_date,
                'to_date': to_date,
                'reset_index': reset_index
            }
        })
    else:
        index = META_DTM(**{
            "meta_name": meta_name,
            "volume_days": volume_days,
            "delta_days": delta_days,
            'from_date': from_date,
            'to_date': to_date,
            'reset_index': reset_index
        })
        index.save()
    return 'META DTM GENERATED'

def init_dictionary_index(**kwargs):
    from elasticsearch_dsl import Search, Index
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_DICTIONARY_INDEX, ES_INDEX_DICTIONARY_WORD
    from mainapp.documents import Dictionary, DictionaryWord
    from util.service_es import search

    name = kwargs['name']

    es_index = Index(f"{ES_INDEX_DICTIONARY_WORD}_{name}", using=ES_CLIENT)
    es_index.delete(ignore=404)
    settings = DictionaryWord.Index.settings
    ES_CLIENT.indices.create(
        index=f"{ES_INDEX_DICTIONARY_WORD}_{name}",
        body={
            "settings": settings,
            "mappings": DictionaryWord.Index.mappings
        }
    )

    es_index = Index(f"{ES_INDEX_DICTIONARY_WORD}_{name}_temp", using=ES_CLIENT)
    es_index.delete(ignore=404)
    settings = DictionaryWord.Index.settings
    ES_CLIENT.indices.create(
        index=f"{ES_INDEX_DICTIONARY_WORD}_{name}_temp",
        body={
            "settings": settings,
            "mappings": DictionaryWord.Index.mappings
        }
    )

    s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT).filter("terms", corpus=kwargs['corpuses'])
    number_of_documents = s.count()
    kwargs['corpuses'] = ",".join(kwargs['corpuses'])

    # Check if already exists
    if ES_CLIENT.indices.exists(ES_INDEX_DICTIONARY_INDEX):
        query = {
            "corpus": kwargs['corpuses'],
            "name": kwargs['name']
        }
        if search(ES_CLIENT, ES_INDEX_DICTIONARY_INDEX, query):
            return "Already exists"

    kwargs["number_of_documents"] = number_of_documents
    kwargs["is_ready"] = False
    dictionary = Dictionary(**kwargs)
    dictionary.save()
    return "Created"

def es_etl(**kwargs):
    from util.service_es import search, update_generator
    from util.constants import BASE_DAG_DIR
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_EMBEDDING

    stuff = kwargs['stuff']

    # Extract
    query = {
        "corpus": "main",
    }
    documents = search(ES_CLIENT, ES_INDEX_DOCUMENT, query)
    print("!!!", len(documents))

    # Transform
    for document in documents:
        # if 'num_views' in document:
        #     document.num_views += 1
        document.any_stuff = stuff
        document.literally_any_stuff = {
            "literally": [{
                "any_stuff": [1, 2, 3, 4, 5, 6]
            }]
        }
    print("!!!", list(documents[0].to_dict().keys()))
    print("!!!", documents[0].any_stuff)
    print("!!!", documents[0].literally_any_stuff)

    # Load
    from elasticsearch.helpers import streaming_bulk
    for ok, result in streaming_bulk(ES_CLIENT, update_generator(ES_INDEX_DOCUMENT, documents),
                                     index=ES_INDEX_DOCUMENT, chunk_size=1000,
                                     raise_on_error=True, max_retries=10):
        print(ok, result)

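
# NOTE: `update_generator` is imported from util.service_es throughout this module but is not
# shown in this excerpt. A minimal sketch of its assumed behaviour (yield one `update` bulk
# action per elasticsearch_dsl hit; the exact field handling is an assumption):
def _update_generator_sketch(index, documents):
    for document in documents:
        yield {
            "_op_type": "update",
            "_index": index,
            "_id": document.meta.id,
            "doc": document.to_dict(),
        }
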
def init_embedding_index(**kwargs):
    from elasticsearch_dsl import Search
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_EMBEDDING
    from mainapp.documents import EmbeddingIndex
    from util.service_es import search

    s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT)
    number_of_documents = s.count()

    # Check if already exists
    if ES_CLIENT.indices.exists(ES_INDEX_EMBEDDING):
        query = {
            "corpus": kwargs['corpus'],
            "name": kwargs['name'],
            "number_of_documents": number_of_documents,
        }
        if search(ES_CLIENT, ES_INDEX_EMBEDDING, query):
            return ("!!!", "Already exists")

    kwargs["number_of_documents"] = number_of_documents
    index = EmbeddingIndex(**kwargs)
    index.save()

def preprocessing_raw_data(**kwargs):
    import re
    import requests
    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from elasticsearch_dsl import Search, Q
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT
    from util.service_es import search, update_generator
    from util.util import is_word, is_kazakh

    start = kwargs['start']
    end = kwargs['end']
    number_of_documents = Variable.get("lemmatize_number_of_documents_kz", default_var=None)
    if number_of_documents is None:
        raise Exception("No variable!")
    number_of_documents = int(number_of_documents)

    s = search(ES_CLIENT, ES_INDEX_DOCUMENT, query={}, source=['text'], sort=['id'], get_search_obj=True)
    s = s.exclude('exists', field="is_kazakh")
    s = s[int(start / 100 * number_of_documents):int(end / 100 * number_of_documents) + 1]
    documents = s.execute()
    print('!!! len docs', len(documents))

    for doc in documents:
        if not is_kazakh(doc.text):
            doc['is_kazakh'] = False
            continue
        cleaned_doc = [
            x.lower() for x in
            ' '.join(re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ', doc.text).split()).split()
        ]
        result = ""
        for i in range(len(cleaned_doc) // 10000 + 1):
            req_text = ' '.join(cleaned_doc[i * 10000:(i + 1) * 10000])
            r = requests.get(f"http://apertium-flask:8005?text={req_text}")
            result += r.json()['result']
        doc['text_lemmatized_kz_apertium'] = result
        doc['is_kazakh'] = True

    documents_processed = 0
    failed = 0
    for ok, result in streaming_bulk(ES_CLIENT, update_generator(ES_INDEX_DOCUMENT, documents),
                                     index=ES_INDEX_DOCUMENT, chunk_size=5000,
                                     raise_on_error=True, max_retries=10):
        if not ok:
            failed += 1
        if failed > 5:
            raise Exception("Too many failed ES!!!")
        documents_processed += 1
    return f"{documents_processed} Processed"

def topic_modelling(**kwargs):
    import os
    import datetime
    import numpy as np
    import numba as nb
    from util.util import save_obj, load_obj
    from util.service_es import search
    from util.constants import BASE_DAG_DIR
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DICTIONARY_WORD
    from .clustering_util import unique_clots, clots_binding, object_weighting1, cluster_weighting1, full_weighting1, \
        n_extra_objects, n_key_objects, save_clustering

    '''
    CLOT IN THE NEIGHBORHOOD
    Finds a clot in the neighborhood of an object; the neighborhood is given by dm - the square
    matrix of pairwise distances from the analysed object to all objects in its neighborhood.
    The maximal intra-cluster distance is specified by the parameter d1:
    none of the pairwise distances inside a formed clot exceeds d1.
    start_ind - index of the object used as the starting (growing) point of the clot.
    Returns the indices of the objects included in the built clot.
    '''
    @nb.jit(nopython=True)
    def single_clot(dm, d1, start_ind):
        n = dm.shape[0]
        if start_ind < 0 or start_ind > n - 1:
            raise ValueError('start_ind is out of bounds')
        R = np.array([start_ind])
        C = np.delete(np.arange(n), start_ind)
        while len(C) > 0:
            C = C[np.sum(dm[R][:, C] <= d1, axis=0) == len(R)]
            if len(C) > 0:
                dist_sum = np.sum(dm[R][:, C], axis=0)
                best_ind = np.argsort(dist_sum)[0]
                R = np.append(R, C[best_ind])
                C = np.delete(C, best_ind)
        return R

    '''
    CLOTS FOR ALL OBJECTS
    Starts the clot search for every object inside its neighbourhood, defined as the circle
    with its center at the considered object and radius d2.
    The maximal pairwise intra-clot distance is specified by the parameter d1.
    D - symmetric square matrix of pairwise distances between objects.
    Each clot is defined by the indices of the included objects.
    use_medoid - if True, medoids are used as initial growing points, otherwise the circle centers.
    Returns an array where the i-th element is the clot in the neighbourhood of the i-th object.
    '''
    @nb.njit(parallel=True)
    def all_clots(D, d1, d2, use_medoid=True):
        n = D.shape[0]
        global_inds = np.arange(n)
        clots = [np.array([0])] * n
        for i in nb.prange(n):
            local_inds = global_inds[D[i] <= d2]
            if len(local_inds) > 0:
                dm = D[local_inds][:, local_inds]
                if use_medoid:
                    start_ind = np.argmin(np.sum(dm, axis=0))
                else:
                    start_ind = np.where(local_inds == i)[0][0]
                clot = single_clot(dm, d1, start_ind)
                if len(clot) > 0:
                    clots[i] = local_inds[clot]
                else:
                    clots[i] = np.empty(0, dtype=nb.int64)
            else:
                clots[i] = np.empty(0, dtype=nb.int64)
        return clots

    print("!!!", "Initial stuff", datetime.datetime.now())
    max_dict_size = 10000000
    if 'max_dict_size' in kwargs:
        max_dict_size = kwargs['max_dict_size']
    name = kwargs['name']
    d1 = kwargs['d1']
    d2 = kwargs['d2']
    d3 = kwargs['d3']
    min_clot_size = kwargs['min_clot_size']
    use_medoid = kwargs['use_medoid']

    dictionary_words = search(ES_CLIENT, ES_INDEX_DICTIONARY_WORD,
                              query=kwargs['dictionary_filters'],
                              source=("word_normal", ),
                              sort=('_id', ),
                              get_search_obj=True,
                              end=max_dict_size)
    dictionary_words.aggs.bucket('unique_word_normals', 'terms', field='word_normal.keyword')
    vocab = [dw.key for dw in dictionary_words.execute().aggregations.unique_word_normals.buckets]

    data_folder = os.path.join(BASE_DAG_DIR, "mussabayev_tm_temp", name)
    distance_matrix = np.array(load_obj(os.path.join(data_folder, 'distance_matrix.pkl')), dtype=np.float32)
    cooccurrence_matrix = load_obj(os.path.join(data_folder, 'cooc_sparse_matrix.pkl'))
    matrix_dimensions = distance_matrix.shape[0]

    print("!!!", "Start all_clots", datetime.datetime.now())
    # Start the clot search around every object.
    # Increasing d1 enlarges the search neighbourhood, the number and size of the resulting clots,
    # the computation time and the memory consumption.
    a_clots = all_clots(distance_matrix, d1, d2, use_medoid)
    print('!!!', 'All clot count: ' + str(len(a_clots)))

    print("!!!", "Start unique_clots", datetime.datetime.now())
    # Keep only the unique clots
    clots = unique_clots(a_clots, min_clot_size)
    save_obj(clots, os.path.join(data_folder, 'clots.pkl'))
    print('!!!', 'Count of unique clots: ' + str(len(clots)))

    print("!!!", "Start clots_binding", datetime.datetime.now())
    # Bind overlapping clots into single clusters
    clusters = clots_binding(clots, d3, -1)
    print('!!!', 'Cluster count: ' + str(len(clusters)))

    print("!!!", "Start object_weighting1", datetime.datetime.now())
    # Weight the objects inside each cluster
    # Method 1 - based on mutual distances
    object_weights1 = object_weighting1(clusters, distance_matrix)
    # For each cluster pick the nk key objects with the highest weight coefficients
    nk = 3
    key_objects1 = n_key_objects(nk, clusters, object_weights1)

    # Compute cluster-membership coefficients of the full object set over all clusters,
    # i.e. for every object of the full set we get a coefficient of how well it matches
    # each of the obtained clusters.
    # For each cluster pick ne extra objects (candidates for inclusion into the cluster).
    ne = 3
    # Method 1 - based on mutual distances
    full_weights1 = full_weighting1(clusters, distance_matrix)
    # Based on these coefficients, pick the ne objects with the highest cluster-membership
    # coefficients that are not yet included in the considered cluster.
    extra_objects1 = n_extra_objects(ne, clusters, full_weights1)

    print("!!!", "Start cluster_weighting1", datetime.datetime.now())
    # Weight each obtained cluster by how well it matches the full original object set:
    # each cluster gets a weight reflecting how well its members agree with the complete set of objects.
    cluster_weights1 = cluster_weighting1(clusters, distance_matrix)

    # Method 1
    save_clustering(clusters, cluster_weights1, object_weights1, key_objects1, extra_objects1, vocab,
                    os.path.join(data_folder, "result_example.txt"))
    return f"Dictionary len={matrix_dimensions}, documents_len={'???TODO'}"

def generate_cooccurrence_codistance(**kwargs):
    import os
    import datetime
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import pairwise_distances
    from util.util import save_obj
    from util.service_es import search
    from util.constants import BASE_DAG_DIR
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_DICTIONARY_WORD

    max_dict_size = 30000
    if 'max_dict_size' in kwargs:
        max_dict_size = kwargs['max_dict_size']

    dictionary_words = search(ES_CLIENT, ES_INDEX_DICTIONARY_WORD,
                              query=kwargs['dictionary_filters'],
                              source=("word_normal", ),
                              sort=('_id', ),
                              get_search_obj=True,
                              start=0, end=10)
    dictionary_words.aggs.bucket('unique_word_normals', 'terms', field='word_normal.keyword', size=max_dict_size)
    dictionary_words = dictionary_words.execute()

    documents_scan = search(ES_CLIENT, ES_INDEX_DOCUMENT,
                            query=kwargs['document_filters'],
                            source=("text_lemmatized", ),
                            get_scan_obj=True,
                            end=5000000)

    print("!!!", "Start count_vectorizing", datetime.datetime.now())
    vectorizer = CountVectorizer(vocabulary=(dw.key for dw in dictionary_words.aggregations.unique_word_normals.buckets))
    documents_vectorized = vectorizer.fit_transform((d.text_lemmatized for d in documents_scan))

    print("!!!", "Start dot product for co-occurrence matrix", datetime.datetime.now())
    coocurance_matrix = documents_vectorized.T.dot(documents_vectorized).astype(np.uint32)

    print("!!!", "Saving co-occurrence matrix", datetime.datetime.now())
    data_folder = os.path.join(BASE_DAG_DIR, "mussabayev_tm_temp")
    if not os.path.exists(data_folder):
        os.mkdir(data_folder)
    data_folder = os.path.join(data_folder, kwargs['name'])
    if not os.path.exists(data_folder):
        os.mkdir(data_folder)
    save_obj(coocurance_matrix, os.path.join(data_folder, 'cooc_sparse_matrix.pkl'))

    print("!!!", "Start distance matrix calc", datetime.datetime.now())
    distance_matrix = pairwise_distances(coocurance_matrix, metric='cosine', n_jobs=4)
    print("!!!", "Save distance matrix", datetime.datetime.now())
    save_obj(distance_matrix, os.path.join(data_folder, 'distance_matrix.pkl'))
    return f"Dictionary len={len(vectorizer.vocabulary_.keys())}, documents_len={documents_vectorized.shape[0]}"

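
# A toy illustration (not part of the DAG) of the distance computation above: each row of the
# word-word co-occurrence matrix is treated as that word's context vector, and sklearn's
# pairwise_distances with metric='cosine' turns it into a symmetric word-distance matrix
# (0 = identical contexts, 1 = orthogonal contexts).
def _codistance_example():
    import numpy as np
    from sklearn.metrics.pairwise import pairwise_distances
    cooccurrence = np.array([
        [10, 4, 0],   # word A
        [4, 8, 1],    # word B
        [0, 1, 6],    # word C
    ], dtype=np.float64)
    distances = pairwise_distances(cooccurrence, metric='cosine')
    return distances  # A and B are close, both are far from C
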
def preprocessing_raw_data(**kwargs):
    import re
    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from lemminflect import getAllLemmas, getAllLemmasOOV
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT
    from nltk.corpus import stopwords
    from stop_words import get_stop_words
    from util.service_es import search, update_generator
    from util.util import is_latin

    process_num = kwargs['process_num']
    total_proc = kwargs['total_proc']
    number_of_documents = Variable.get("lemmatize_number_of_documents_eng", default_var=None)
    if number_of_documents is None:
        raise Exception("No variable!")
    number_of_documents = int(number_of_documents)

    s = search(ES_CLIENT, ES_INDEX_DOCUMENT, query={}, source=['id', 'text'], sort=['id'], get_search_obj=True)
    s = s.exclude('exists', field="is_english")

    stopwords = set(get_stop_words('ru') + get_stop_words('en') + stopwords.words('english'))
    success = 0
    documents = []
    for doc in s.params(raise_on_error=False).scan():
        if int(doc.id) % total_proc != process_num:
            continue
        success += 1
        if success > 50_000:
            break
        if success % 10_000 == 0:
            print(f"{success}/{50_000}")
        if not is_latin(doc.text):
            doc['is_english'] = False
            documents.append(doc)
            continue
        cleaned_doc = [
            x.lower() for x in
            ' '.join(re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ', doc.text).split()).split()
            if not x in stopwords and len(x) > 2
        ]
        result = ""
        for word in cleaned_doc:
            try:
                result += list(getAllLemmas(word).values())[0][0] + " "
            except IndexError:
                result += list(getAllLemmasOOV(word, upos="NOUN").values())[0][0] + " "
        doc['text_lemmatized_eng_lemminflect'] = result
        doc['is_english'] = True
        documents.append(doc)
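    # NOTE: this excerpt ends before the documents are written back; a minimal, assumed
    # completion that mirrors the sibling lemmatization tasks in this module
    # (streaming_bulk over update_generator) is sketched below.
    documents_processed = 0
    failed = 0
    for ok, result in streaming_bulk(ES_CLIENT, update_generator(ES_INDEX_DOCUMENT, documents),
                                     index=ES_INDEX_DOCUMENT, chunk_size=5000,
                                     raise_on_error=True, max_retries=10):
        if not ok:
            failed += 1
        if failed > 5:
            raise Exception("Too many failed ES!!!")
        documents_processed += 1
    return f"{documents_processed} Processed"
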
def persist_embeddings(**kwargs):
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_EMBEDDING
    from mainapp.models import Corpus, Document
    from preprocessing.models import ProcessedCorpus, ProcessedDocument, AnalysisUnit
    from elasticsearch_dsl import Search
    from util.service_es import search

    corpus = kwargs['corpus']
    embedding_name = kwargs['embedding_name']
    by_unit = kwargs['by_unit']
    type_unit_int = kwargs['type_unit_int']
    algorithm = kwargs['algorithm']
    pooling = kwargs['pooling']
    description = kwargs['description']

    # Update embedding object to is_ready
    query = {
        "corpus": corpus.lower(),
        "name": embedding_name.lower(),
        # "is_ready": False,  # TODO uncomment
    }
    embedding = search(ES_CLIENT, ES_INDEX_EMBEDDING, query)[-1]
    ES_CLIENT.update(index=ES_INDEX_EMBEDDING, id=embedding.meta.id, body={"doc": {"is_ready": True}})

    # Init processedCorpus
    pcs = ProcessedCorpus.objects.filter(corpus=Corpus.objects.get(name=corpus), name=embedding_name)
    if pcs.exists():
        for pc in pcs:
            pc.delete()
    pc = ProcessedCorpus.objects.create(corpus=Corpus.objects.get(name=corpus),
                                        name=embedding_name, description=description)

    s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT).source(['id', embedding_name]).filter("term", corpus=corpus)

    def persist(batch_docs, batch_units, type):
        batch_docs = ProcessedDocument.objects.bulk_create(batch_docs)
        batch_units_objs = []
        batch_size = 10000
        for doc, embs in zip(batch_docs, batch_units):
            ind = 0
            for emb in embs:
                batch_units_objs.append(
                    AnalysisUnit(type=type,
                                 processed_document=doc,
                                 value=emb[by_unit],
                                 index=ind,
                                 embedding=emb['values']))
                ind += 1
                if len(batch_units_objs) >= batch_size:
                    AnalysisUnit.objects.bulk_create(batch_units_objs)
                    batch_units_objs = []
        AnalysisUnit.objects.bulk_create(batch_units_objs)

    batch_size = 10000
    batch_docs = []
    batch_units = []
    for document in s.scan():
        batch_docs.append(ProcessedDocument(processed_corpus=pc, original_document_id=document.id))
        embeddings = document[embedding_name]
        document_embeddings = []
        if type_unit_int in [0, 1, 2]:
            for sent in embeddings:
                for token in sent:
                    document_embeddings.append({
                        by_unit: token[by_unit],
                        "values": token.layers[0].values
                    })
        elif type_unit_int in [3, 4]:
            for elem in embeddings:
                document_embeddings.append({
                    by_unit: elem[by_unit],
                    "values": elem.layers[0].values
                })
        elif type_unit_int in [5]:
            document_embeddings.append({
                by_unit: embeddings[by_unit],
                "values": embeddings.layers[0].values
            })
        else:
            raise Exception("Unknown Unit_by type")
        batch_units.append(document_embeddings)
        if len(batch_docs) >= batch_size:
            persist(batch_docs, batch_units, type_unit_int)
            batch_docs = []
            batch_units = []
    persist(batch_docs, batch_units, type=type_unit_int)

def generate_dictionary_batch(**kwargs):
    import datetime
    import re
    from elasticsearch.helpers import streaming_bulk
    from stop_words import get_stop_words
    from nltk.corpus import stopwords
    from util.util import is_kazakh, is_latin
    from util.service_es import search
    from nlpmonitor.settings import ES_INDEX_DOCUMENT, ES_INDEX_DICTIONARY_INDEX, ES_INDEX_DICTIONARY_WORD, ES_CLIENT

    import logging
    es_logger = logging.getLogger('elasticsearch')
    es_logger.setLevel(logging.ERROR)

    name = kwargs['name']
    process_num = kwargs['process_num']
    total_proc = kwargs['total_proc']
    corpuses = kwargs['corpuses']
    max_n_gram_len = kwargs['max_n_gram_len']
    min_relative_document_frequency = kwargs['min_relative_document_frequency']
    field_to_parse = kwargs['field_to_parse']

    query = {
        "name": name,
        "is_ready": False,
    }
    dictionary = search(ES_CLIENT, ES_INDEX_DICTIONARY_INDEX, query)[-1]
    number_of_documents = dictionary.number_of_documents
    if not number_of_documents:
        raise Exception("No variable!")

    print("!!!", "Getting documents from ES", datetime.datetime.now())
    documents = search(ES_CLIENT, ES_INDEX_DOCUMENT,
                       query={"corpus": corpuses},
                       source=[field_to_parse, 'id'],
                       sort=['id'],
                       get_search_obj=True)
    documents = documents.filter("exists", field=field_to_parse)
    number_of_documents = documents.count()

    # stopwords = set(get_stop_words('ru') + get_stop_words('en') + stopwords.words('english'))
    dictionary_words = {}
    print("!!!", "Iterating through documents", datetime.datetime.now())
    for i, doc in enumerate(documents.params(raise_on_error=False).scan()):
        if i % 100_000 == 0:
            print(f"Processed {i} documents")
            print(f"Dictionary length is {len(dictionary_words)}")
        if int(doc.id) % total_proc != process_num:
            continue
        if len(doc[field_to_parse]) == 0:
            print("!!! WTF", doc.meta.id)
            continue
        if is_kazakh(doc[field_to_parse]):
            continue
        word_in_doc = set()
        cleaned_words = [x for x in ' '.join(re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ', doc[field_to_parse]).split()).split()]
        if is_latin(doc[field_to_parse]):
            lang = "eng"
        elif is_kazakh(doc[field_to_parse]):
            lang = "kaz"
        else:
            lang = "rus"
        for n_gram_len in range(1, max_n_gram_len + 1):
            for n_gram in (cleaned_words[i:i + n_gram_len] for i in range(len(cleaned_words) - n_gram_len + 1)):
                word = "_".join(n_gram)
                is_first_upper = word[0].isupper()
                word = word.lower()
                # TEMP - DISABLED lemmatization
                # if lang == "eng":
                #     parse = lemmatize_eng(word)
                # elif lang == "kaz":
                #     continue  # raise NotImplemented()
                # elif lang == "rus":
                #     parse = lemmatize_ru(word)
                # else:
                #     raise NotImplemented()
                if word not in dictionary_words:
                    dictionary_words[word] = {
                        "dictionary": name,
                        "word": word,
                        # "word_normal": parse["normal_form"],
                        "word_normal": word,
                        # "is_in_pymorphy2_dict": parse["is_known"],
                        "is_in_pymorphy2_dict": True,
                        # "is_multiple_normals_in_pymorphy2": parse["is_multiple_forms"],
                        "is_multiple_normals_in_pymorphy2": False,
                        # "is_stop_word": word in stopwords or parse["normal_form"] in stopwords,
                        "is_stop_word": False,
                        "is_latin": any([c in "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM" for c in word]),
                        "is_kazakh": any([c in "ӘәҒғҚқҢңӨөҰұҮүІі" for c in word]) or lang == "kaz",
                        "n_gram_len": n_gram_len,
                        # "pos_tag": parse["pos_tag"],
                        "pos_tag": "NA",
                        "word_len": len(word),
                        "word_frequency": 1,
                        "document_frequency": 1,
                        "word_first_capital_ratio": 1 if is_first_upper else 0,
                    }
                    word_in_doc.add(word)
                else:
                    dictionary_words[word]['word_frequency'] += 1
                    dictionary_words[word]['word_first_capital_ratio'] += 1 if word[0].isupper() else 0
                    if word not in word_in_doc:
                        dictionary_words[word]['document_frequency'] += 1
                        word_in_doc.add(word)
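    # NOTE: the excerpt ends before the per-process dictionary is persisted; a minimal,
    # assumed completion that bulk-indexes the collected counts into the temporary
    # per-dictionary index read by aggregate_dicts() is sketched below.
    print("!!!", "Writing partial dictionary to ES", datetime.datetime.now())
    for ok, result in streaming_bulk(ES_CLIENT, dictionary_words.values(),
                                     index=f"{ES_INDEX_DICTIONARY_WORD}_{name}_temp",
                                     chunk_size=10000, raise_on_error=True, max_retries=10):
        if not ok:
            raise Exception("Failed to write dictionary word!")
    return len(dictionary_words)
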
def aggregate_dicts(**kwargs):
    import datetime
    from util.service_es import search
    from elasticsearch.helpers import streaming_bulk
    from elasticsearch_dsl import Search, Index
    from nlpmonitor.settings import ES_INDEX_DICTIONARY_INDEX, ES_INDEX_DICTIONARY_WORD, ES_CLIENT, ES_INDEX_DOCUMENT

    import logging
    es_logger = logging.getLogger('elasticsearch')
    es_logger.setLevel(logging.ERROR)

    name = kwargs['name']
    corpuses = kwargs['corpuses']
    min_relative_document_frequency = kwargs['min_relative_document_frequency']

    query = {
        "dictionary": name,
    }
    dictionary_scan = search(ES_CLIENT, f"{ES_INDEX_DICTIONARY_WORD}_{name}_temp", query, get_scan_obj=True)
    dictionary_index = search(ES_CLIENT, ES_INDEX_DICTIONARY_INDEX, {"name": name})[-1]

    dictionary_words_final = {}
    dictionary_normal_words = {}
    print("!!!", "Iteration through scan", datetime.datetime.now())
    for word in dictionary_scan:
        key = word['word']
        key_normal = word['word_normal']
        if not key in dictionary_words_final:
            dictionary_words_final[key] = word.to_dict()
        else:
            dictionary_words_final[key]['word_frequency'] += word['word_frequency']
            dictionary_words_final[key]['word_first_capital_ratio'] += word['word_first_capital_ratio']
            dictionary_words_final[key]['document_frequency'] += word['document_frequency']
        if not key_normal in dictionary_normal_words:
            dictionary_normal_words[key_normal] = {
                "word_normal_frequency": word['word_frequency'],
                "word_normal_first_capital_ratio": word['word_first_capital_ratio'],
                "document_normal_frequency": word['document_frequency']
            }
        else:
            dictionary_normal_words[key_normal]['word_normal_frequency'] += word['word_frequency']
            dictionary_normal_words[key_normal]['word_normal_first_capital_ratio'] += word['word_first_capital_ratio']
            dictionary_normal_words[key_normal]['document_normal_frequency'] += word['document_frequency']

    print("!!!", "Forming final words dict", datetime.datetime.now())
    for key in dictionary_words_final.keys():
        dictionary_words_final[key]['word_normal_frequency'] = \
            dictionary_normal_words[dictionary_words_final[key]['word_normal']]['word_normal_frequency']
        dictionary_words_final[key]['word_normal_first_capital_ratio'] = \
            dictionary_normal_words[dictionary_words_final[key]['word_normal']]['word_normal_first_capital_ratio']
        dictionary_words_final[key]['document_normal_frequency'] = \
            dictionary_normal_words[dictionary_words_final[key]['word_normal']]['document_normal_frequency']
        dictionary_words_final[key]['word_first_capital_ratio'] /= \
            dictionary_words_final[key]['word_frequency']
        dictionary_words_final[key]['word_normal_first_capital_ratio'] /= \
            dictionary_words_final[key]['word_normal_frequency']
        dictionary_words_final[key]['word_frequency_relative'] = \
            dictionary_words_final[key]['word_frequency'] / dictionary_index.number_of_documents
        dictionary_words_final[key]['word_normal_frequency_relative'] = \
            dictionary_words_final[key]['word_normal_frequency'] / dictionary_index.number_of_documents
        dictionary_words_final[key]['document_frequency_relative'] = \
            dictionary_words_final[key]['document_frequency'] / dictionary_index.number_of_documents
        dictionary_words_final[key]['document_normal_frequency_relative'] = \
            dictionary_words_final[key]['document_normal_frequency'] / dictionary_index.number_of_documents

    success = 0
    failed = 0
    print("!!!", "Writing to ES", datetime.datetime.now())
    len_dictionary = len(dictionary_words_final)
    s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT).filter("terms", corpus=corpuses).source([])[:0]
    number_of_documents = s.count()
    print("!!!", "Number of documents", number_of_documents)
    print("!!! Min documents threshold", number_of_documents * min_relative_document_frequency)
    dictionary_words_final = filter(
        lambda x: x['document_frequency'] > number_of_documents * min_relative_document_frequency,
        dictionary_words_final.values())
    for ok, result in streaming_bulk(ES_CLIENT, dictionary_words_final,
                                     index=f"{ES_INDEX_DICTIONARY_WORD}_{name}",
                                     chunk_size=1000, raise_on_error=True, max_retries=10):
        if not ok:
            failed += 1
        else:
            success += 1
            if success % 1000 == 0:
                print(f"{success}/{len_dictionary} processed, {datetime.datetime.now()}")
        if failed > 3:
            raise Exception("Too many failed!!")
    ES_CLIENT.update(index=ES_INDEX_DICTIONARY_INDEX, id=dictionary_index.meta.id, body={"doc": {"is_ready": True}})

    es_index = Index(f"{ES_INDEX_DICTIONARY_WORD}_{name}_temp", using=ES_CLIENT)
    es_index.delete(ignore=404)
    return success

def generate_word_embeddings(**kwargs):
    import os
    import json
    import subprocess
    import tempfile
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_EMBEDDING
    from util.constants import BASE_DAG_DIR
    from util.service_es import search

    start = kwargs['start']
    end = kwargs['end']

    # Get embedding object
    query = {
        "corpus": "main",
        # "is_ready": False,  # TODO Uncomment
        "name": WORD_EMBEDDING_NAME.lower(),
        "by_unit": "word",
        "algorithm": "BERT".lower(),
        "pooling": "None".lower(),
    }
    embedding = search(ES_CLIENT, ES_INDEX_EMBEDDING, query)[-1]
    number_of_documents = embedding['number_of_documents']

    # Get documents
    documents = search(ES_CLIENT, ES_INDEX_DOCUMENT, {"corpus": "main"},
                       start=int(start / 100 * number_of_documents),
                       end=int(end / 100 * number_of_documents),
                       source=['id', 'text'],
                       sort=['id'])

    # Embeddings themselves
    from textblob import TextBlob

    embeddings = []
    documents_to_write = []
    input_file_name = f"input-{start}-{end}.txt"
    output_file_name = f"output-{start}-{end}.json"
    batch_size = 10000
    with tempfile.TemporaryDirectory() as tmpdir:
        for document in documents:
            # Write to input.txt
            with open(os.path.join(tmpdir, input_file_name), "w", encoding='utf-8') as f:
                text = TextBlob(document.text)
                for sentence in text.sentences:
                    f.write(sentence.string.replace("\n", " ") + "\n")
            # Run bert
            subprocess.run([
                "python",
                f"{os.path.join(BASE_DAG_DIR, 'dags', 'bert_embeddings', 'bert', 'extract_features.py')}",
                f"--input_file={os.path.join(tmpdir, input_file_name)}",
                f"--output_file={os.path.join(tmpdir, output_file_name)}",
                f"--vocab_file={os.path.join(BASE_DAG_DIR, 'dags', 'bert_embeddings', 'bert', 'models', 'rubert_cased_L-12_H-768_A-12_v1', 'vocab.txt')}",
                f"--bert_config_file={os.path.join(BASE_DAG_DIR, 'dags', 'bert_embeddings', 'bert', 'models', 'rubert_cased_L-12_H-768_A-12_v1', 'bert_config.json')}",
                f"--init_checkpoint={os.path.join(BASE_DAG_DIR, 'dags', 'bert_embeddings', 'bert', 'models', 'rubert_cased_L-12_H-768_A-12_v1', 'bert_model.ckpt')}",
                "--layers=-2",
                "--max_seq_length=128",
                "--batch_size=1000"
            ])
            # Read from output.json
            document_embeddings = []
            with open(os.path.join(tmpdir, output_file_name), "r", encoding='utf-8') as f:
                for line in f.readlines():
                    embedding = json.loads(line)
                    tokens = embedding['features']
                    words = []
                    # Pool tokens into words
                    cur_token = ""
                    cur_embed = []
                    for token in tokens[1:-1]:
                        token_str = token['token']
                        token_emb = token['layers'][0]['values']
                        if not cur_token and not cur_embed:
                            cur_token = token_str
                            cur_embed.append(token_emb)
                        elif "##" in token_str:
                            cur_token += token_str.replace("##", "")
                            cur_embed.append(token_emb)
                        else:
                            cur_embed = pool_vectors(cur_embed, "Average")
                            words.append({
                                "layers": [{
                                    "values": cur_embed,
                                    "index": -2
                                }],
                                "word": cur_token
                            })
                            cur_token = token_str
                            cur_embed = [token_emb]
                    if cur_token and cur_embed:
                        cur_embed = pool_vectors(cur_embed, "Average")
                        words.append({
                            "layers": [{
                                "values": cur_embed,
                                "index": -2
                            }],
                            "word": cur_token
                        })
                    document_embeddings.append(words)
            embeddings.append(document_embeddings)
            documents_to_write.append(document)
            if len(embeddings) >= batch_size:
                persist_embeddings_to_es(ES_CLIENT, ES_INDEX_DOCUMENT, documents_to_write, embeddings, WORD_EMBEDDING_NAME)
                embeddings = []
                documents_to_write = []
    persist_embeddings_to_es(ES_CLIENT, ES_INDEX_DOCUMENT, documents_to_write, embeddings, WORD_EMBEDDING_NAME)

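
# NOTE: `pool_vectors` and `persist_embeddings_to_es` are used above but are defined elsewhere
# in this package (not shown in the excerpt). A minimal sketch of what pool_vectors is assumed
# to do (element-wise pooling of equal-length vectors; the name of this sketch and the exact set
# of supported modes are assumptions):
def _pool_vectors_sketch(vectors, pooling):
    import numpy as np
    arr = np.asarray(vectors, dtype=float)
    if pooling.lower() == "average":
        return arr.mean(axis=0).tolist()
    if pooling.lower() == "max":
        return arr.max(axis=0).tolist()
    raise ValueError(f"Unknown pooling: {pooling}")
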
def preprocess_data(**kwargs):
    """
    Prepare a bag-of-words dataset for ETM: build the vocabulary with CountVectorizer,
    tokenize the documents, split them into train/test sets and save the BOW matrices to disk.
    """
    import os
    import pickle
    import numpy as np
    from scipy.io import savemat
    from sklearn.feature_extraction.text import CountVectorizer
    from util.service_es import search
    from util.constants import BASE_DAG_DIR
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT
    from .utils import split_bow, remove_empty, create_bow, create_doc_indices, create_list_words

    corpus = kwargs.get('corpus', 'main')
    test_size = kwargs.get('test_size', 0.1)
    max_df = 0.7
    min_df = 100  # choose desired value for min_df

    # Read data
    print('reading text file...')
    docs = search(client=ES_CLIENT, index=ES_INDEX_DOCUMENT, start=0, end=1_000_000,
                  query={'corpus': corpus}, source=['text_lemmatized'], get_scan_obj=True)
    docs = [doc.text_lemmatized for doc in docs]

    # Create count vectorizer
    print('counting document frequency of words...')
    cvectorizer = CountVectorizer(min_df=min_df, max_df=max_df, stop_words=None)
    cvz = cvectorizer.fit_transform(docs).sign()

    # Get vocabulary
    print('building the vocabulary...')
    sum_counts = cvz.sum(axis=0)
    v_size = sum_counts.shape[1]
    sum_counts_np = np.zeros(v_size, dtype=int)
    for v in range(v_size):
        sum_counts_np[v] = sum_counts[0, v]
    word2id = dict([(w, cvectorizer.vocabulary_.get(w)) for w in cvectorizer.vocabulary_])
    del cvectorizer
    print('  initial vocabulary size: {}'.format(v_size))

    # Split in train/test/valid
    print('tokenizing documents and splitting into train/test/valid...')
    num_docs = cvz.shape[0]

    # Remove words not in train_data
    vocab = [word for word in word2id.keys()]
    print('  vocabulary after removing words not in train: {}'.format(len(vocab)))

    docs_tr = [[word2id[w] for w in docs[idx_d].split() if w in word2id] for idx_d in range(num_docs)]
    docs_ts = docs_tr[:int(num_docs * test_size)]
    del docs
    print('  number of documents (train): {} [this should be equal to {}]'.format(len(docs_tr), num_docs))

    # Getting lists of words and doc_indices
    print('creating lists of words...')
    words_tr = create_list_words(docs_tr)
    words_ts = create_list_words(docs_ts)

    # Get doc indices
    print('getting doc indices...')
    doc_indices_tr = create_doc_indices(docs_tr)
    doc_indices_ts = create_doc_indices(docs_ts)

    # Remove empty documents
    print('removing empty documents...')
    docs_tr = remove_empty(docs_tr)
    docs_ts = remove_empty(docs_ts)

    # Number of documents in each set
    n_docs_tr = len(docs_tr)
    n_docs_ts = len(docs_ts)

    # Create bow representation
    print('creating bow representation...')
    bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, len(vocab))
    bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, len(vocab))

    # Save vocabulary to file
    path_save = os.path.join(BASE_DAG_DIR, 'etm_temp')
    if not os.path.isdir(path_save):
        os.system('mkdir -p ' + path_save)
    with open(os.path.join(path_save, 'vocab.pkl'), 'wb') as f:
        pickle.dump(vocab, f)

    # Split bow into token/value pairs
    print('splitting bow into token/value pairs and saving to disk...')
    bow_tr_tokens, bow_tr_counts = split_bow(bow_tr, n_docs_tr)
    savemat(os.path.join(path_save, 'bow_tr_tokens.mat'), {'tokens': bow_tr_tokens}, do_compression=True)
    savemat(os.path.join(path_save, 'bow_tr_counts.mat'), {'counts': bow_tr_counts}, do_compression=True)
    bow_ts_tokens, bow_ts_counts = split_bow(bow_ts, n_docs_ts)
    savemat(os.path.join(path_save, 'bow_ts_tokens.mat'), {'tokens': bow_ts_tokens}, do_compression=True)
    savemat(os.path.join(path_save, 'bow_ts_counts.mat'), {'counts': bow_ts_counts}, do_compression=True)
    print('Data ready !!')

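
# NOTE: the helpers imported above from `.utils` are not shown in this excerpt. A rough sketch
# of their assumed behaviour (modelled on the ETM reference preprocessing script; names match
# the imports, but signatures and details are assumptions):
#
#   create_list_words(docs)                         -> flat list of all token ids across documents
#   create_doc_indices(docs)                        -> document index repeated once per token
#   create_bow(doc_indices, words, n_docs, v_size)  -> sparse (n_docs x v_size) count matrix
#   remove_empty(docs)                              -> docs with zero-length documents dropped
#   split_bow(bow, n_docs)                          -> per-document (token ids, counts) pairs
def _create_bow_sketch(doc_indices, words, n_docs, vocab_size):
    from scipy.sparse import coo_matrix
    # One count per (document, token) occurrence, summed automatically by coo_matrix.
    return coo_matrix(([1] * len(doc_indices), (doc_indices, words)),
                      shape=(n_docs, vocab_size)).tocsr()
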
def ngramize(**kwargs):
    import datetime
    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from elasticsearch_dsl import Search
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_CUSTOM_DICTIONARY_WORD, \
        ES_INDEX_DICTIONARY_WORD
    from util.service_es import search, update_generator

    process_num = kwargs['process_num']
    total_proc = kwargs['total_proc']
    corpus = kwargs['corpus']
    dict_name = kwargs['dict_name']
    source_field = kwargs['source_field']
    min_document_frequency_relative = kwargs['min_document_frequency_relative']
    max_n_gram_len = kwargs['max_n_gram_len']

    print("!!!", "Getting documents", datetime.datetime.now())
    documents = search(ES_CLIENT, ES_INDEX_DOCUMENT, query={}, source=(source_field, 'id'),
                       sort=('id', ), get_search_obj=True)
    documents = documents.exclude('exists', field=f'text_ngramized_{dict_name}')
    documents = documents.filter('exists', field=source_field)
    documents = documents.filter('terms', corpus=corpus)

    print("!!!", "Getting dictionary", datetime.datetime.now())
    s = Search(using=ES_CLIENT, index=f"{ES_INDEX_DICTIONARY_WORD}_{dict_name}")
    s = s.filter("range", document_frequency_relative={"gt": min_document_frequency_relative})
    s = s.filter("range", n_gram_len={"gte": 2})
    s = s.source(("word", ))
    dict_words = set(w.word for w in s.scan())
    print('!!! len dict', len(dict_words))

    print("!!!", "Processing documents", datetime.datetime.now())
    success = 0
    documents_to_process = []
    for doc in documents.params(raise_on_error=False).scan():
        if int(doc.id) % total_proc != process_num:
            continue
        success += 1
        if success > 50_000:
            break
        if success % 10_000 == 0:
            print(f"{success}/{50_000}")
        text_ngramized = doc[source_field]
        text_ngramized_split = text_ngramized.split()
        n_grams_to_append = []
        for n_gram_len in range(2, max_n_gram_len + 1):
            n_grams = [
                text_ngramized_split[i:i + n_gram_len]
                for i in range(len(text_ngramized_split) - n_gram_len + 1)
            ]
            for n_gram in n_grams:
                word = "_".join(n_gram)
                if word in dict_words:
                    n_grams_to_append.append(word)
        doc[f'text_ngramized_{dict_name}'] = text_ngramized + " " + " ".join(n_grams_to_append)
        documents_to_process.append(doc)
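    # NOTE: the excerpt ends before the n-gramized documents are written back; a minimal,
    # assumed completion that mirrors the other update tasks in this module is sketched below.
    for ok, result in streaming_bulk(ES_CLIENT, update_generator(ES_INDEX_DOCUMENT, documents_to_process),
                                     index=ES_INDEX_DOCUMENT, chunk_size=5000,
                                     raise_on_error=True, max_retries=10):
        if not ok:
            raise Exception("Failed to update document!")
    return f"{success} Processed"
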
def preprocessing_raw_data(**kwargs):
    import re
    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from elasticsearch_dsl import Search, Q
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_CUSTOM_DICTIONARY_WORD
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from pymorphy2 import MorphAnalyzer
    from pymystem3 import Mystem
    from stop_words import get_stop_words
    from util.service_es import search, update_generator
    from util.util import is_latin, is_word

    start = kwargs['start']
    end = kwargs['end']
    number_of_documents = Variable.get("lemmatize_number_of_documents", default_var=None)
    if number_of_documents is None:
        raise Exception("No variable!")
    number_of_documents = int(number_of_documents)

    s = search(ES_CLIENT, ES_INDEX_DOCUMENT, query={}, source=['text'], sort=['id'], get_search_obj=True)
    s = s.query(~Q('exists', field="text_lemmatized_yandex") | ~Q('exists', field="text_lemmatized"))
    s = s[int(start / 100 * number_of_documents):int(end / 100 * number_of_documents) + 1]
    documents = s.execute()
    print('!!! len docs', len(documents))

    stopwords_ru = set(get_stop_words('ru'))
    stopwords_eng = set(get_stop_words('en') + stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    morph = MorphAnalyzer()
    m = Mystem()

    s = Search(using=ES_CLIENT, index=ES_INDEX_CUSTOM_DICTIONARY_WORD)
    r = s[:1000000].scan()
    custom_dict = dict((w.word, w.word_normal) for w in r)

    for doc in documents:
        cleaned_doc = " ".join(
            x.lower() for x in
            ' '.join(re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ', doc.text).split()).split()
        )
        if is_latin(cleaned_doc):
            cleaned_words_list = [
                lemmatizer.lemmatize(word) for word in cleaned_doc.split()
                if len(word) > 3 and word not in stopwords_eng
            ]
            doc['text_lemmatized_yandex'] = ""
        else:
            # morph_with_dictionary and the known_counter / custom_dict_counter / not_in_dict_counter
            # used in the return message below are assumed to be defined at module level elsewhere
            # in this file (they are not part of this excerpt).
            cleaned_words_list = [
                morph_with_dictionary(morph, word, custom_dict) for word in cleaned_doc.split()
                if len(word) > 2 and word not in stopwords_ru
            ]
            cwl_yandex = filter(
                lambda word: is_word(word) and len(word) > 2 and word not in stopwords_ru,
                m.lemmatize(cleaned_doc))
            cleaned_doc_yandex = " ".join(cwl_yandex)
            doc['text_lemmatized_yandex'] = cleaned_doc_yandex
        cleaned_doc = " ".join(cleaned_words_list)
        doc['text_lemmatized'] = cleaned_doc

    documents_processed = 0
    failed = 0
    for ok, result in streaming_bulk(ES_CLIENT, update_generator(ES_INDEX_DOCUMENT, documents),
                                     index=ES_INDEX_DOCUMENT, chunk_size=5000,
                                     raise_on_error=True, max_retries=10):
        if not ok:
            failed += 1
        if failed > 5:
            raise Exception("Too many failed ES!!!")
        documents_processed += 1
    return f"{documents_processed} Processed, {known_counter} in pymorphy2 dict, " \
           f"{custom_dict_counter} in custom dict, {not_in_dict_counter} not found"

def mapper(**kwargs):
    """
    Fetch from the meta-DTM index the DTM for which we want the mappings, then, by this
    meta_dtm_name, pull the two topic modellings, build the two lists of topic word sets,
    map them and score the mappings.
    """
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DYNAMIC_TOPIC_MODELLING, ES_INDEX_DYNAMIC_TOPIC_DOCUMENT
    from mainapp.documents import Mappings
    from util.service_es import search
    from util.util import parse_topics_field, mapper, validator
    import json

    meta_dtm_name = kwargs['meta_dtm_name']
    datetime_from_tm_1 = kwargs['datetime_from_tm_1']
    datetime_to_tm_1 = kwargs['datetime_to_tm_1']
    datetime_from_tm_2 = kwargs['datetime_from_tm_2']
    datetime_to_tm_2 = kwargs['datetime_to_tm_2']
    number_of_topics = kwargs['number_of_topics']

    theta_name_1 = ES_INDEX_DYNAMIC_TOPIC_DOCUMENT + "_" + kwargs['name_immutable'] + "_" + \
        str(datetime_from_tm_1) + "_" + str(datetime_to_tm_1)
    theta_name_2 = ES_INDEX_DYNAMIC_TOPIC_DOCUMENT + "_" + kwargs['name_immutable'] + "_" + \
        str(datetime_from_tm_2) + "_" + str(datetime_to_tm_2)

    # TODO fix meta_dtm_name issue
    tm_1 = search(client=ES_CLIENT, index=ES_INDEX_DYNAMIC_TOPIC_MODELLING,
                  query={
                      'meta_dtm_name.keyword': meta_dtm_name,
                      'datetime_from__gte': datetime_from_tm_1,
                      'datetime_to__lte': datetime_to_tm_1
                  },
                  source=['name', 'meta_dtm_name', 'datetime_from', 'datetime_to', 'topics', 'topic_doc'])
    tm_2 = search(client=ES_CLIENT, index=ES_INDEX_DYNAMIC_TOPIC_MODELLING,
                  query={
                      'meta_dtm_name.keyword': meta_dtm_name,
                      'datetime_from__gte': datetime_from_tm_2,
                      'datetime_to__lte': datetime_to_tm_2
                  },
                  source=['name', 'meta_dtm_name', 'datetime_from', 'datetime_to', 'topics', 'topic_doc'])

    tm_1_dict, tm_1_name = parse_topics_field(tm_1[0])
    tm_2_dict, tm_2_name = parse_topics_field(tm_2[0])

    topic_modelling_first_from = tm_1_name.split('_')[-2]
    topic_modelling_second_to = tm_2_name.split('_')[-1]

    thresholds = list(map(str, [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]))
    mappings_dict, delta_words_dict, delta_count_dict = mapper(topic_seq_1=tm_1_dict,
                                                               topic_seq_2=tm_2_dict,
                                                               threshold_list=thresholds)
    scores = validator(mappings_dict=mappings_dict,
                       client=ES_CLIENT,
                       index_theta_one=theta_name_1,
                       index_theta_two=theta_name_2,
                       datetime_from_tm_2=datetime_from_tm_2,
                       datetime_to_tm_1=datetime_to_tm_1,
                       number_of_topics=number_of_topics)

    for threshold in thresholds:
        index = Mappings(
            threshold=threshold,
            meta_dtm_name=meta_dtm_name,
            topic_modelling_first=tm_1_name,
            topic_modelling_second=tm_2_name,
            topic_modelling_first_from=topic_modelling_first_from,
            topic_modelling_second_to=topic_modelling_second_to,
            mappings_dict=json.dumps(mappings_dict[threshold]),
            scores_list=scores[threshold],
            delta_words_dict=json.dumps(delta_words_dict[threshold]),
            delta_count_dict=json.dumps(delta_count_dict[threshold]),
        )
        index.save()
    return 'Mapping created'