def lsi_model(self, num_topics: int = 10, stochastic: bool = False):
    """
    Construct LSI topic models for each year in a corpus, given a set of parameters.
    """
    if self.word_to_id is None or self.corpora is None:
        self.build_dictionaries_and_corpora()

    if self.tf_idf_models is None:
        self.build_tf_idf_models()

    results = num_dict(self.year_list)

    if not stochastic:
        for year in self.year_list[:-1]:
            results[year] = LsiModel(
                corpus=self.tf_idf_models[year][self.corpora[year]],
                id2word=self.word_to_id[year],
                num_topics=num_topics)
    else:
        for year in self.year_list[:-1]:
            results[year] = LsiModel(
                corpus=self.tf_idf_models[year][self.corpora[year]],
                id2word=self.word_to_id[year],
                num_topics=num_topics,
                onepass=False)

    return TopicResults(results, self.num_docs)
def lsi(self):
    self.tf_idf()
    if self.corpus_tf_idf and self.dictionary:
        self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
        self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
        # Topic indices are zero-based, so with num_topics=2 the last valid index is 1.
        print(self.lsi_model.print_topic(1))
    elif self.corpus_tf_idf:
        self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
        self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
def lsi_transform(self, corpus_tf_idf):
    logger.info('Training lsi model with n_dims of %d...' % self.nDims)
    if self.dictionary is None and os.path.exists(self.dictPath):
        self.dictionary = corpora.Dictionary.load(self.dictPath)

    self.lsiModel = LsiModel(corpus=corpus_tf_idf, num_topics=self.nDims, id2word=self.dictionary)
    # print(self.lsiModel[corpus])
    conf.mk_dir(self.lsiPath)
    self.lsiModel.save(self.lsiPath)
    logger.info('Lsi model has been saved in %s.' % self.lsiPath)

    lsi_corpus = self.lsiModel[corpus_tf_idf]
    lsi_corpus_path = conf.get_filename_via_tpl('lsi', n_users=self.nUsers,
                                                n_samples=self.nSamples,
                                                n_dims=self.nDims, postfix='mm')
    conf.mk_dir(lsi_corpus_path)
    corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
    logger.info('Lsi corpus with a shape of %s has been saved in %s.'
                % (np.array(lsi_corpus).shape, lsi_corpus_path))
    return lsi_corpus
def getLsiFeature(documents, topicNum):
    '''
    Function: generate LSI features by training an LSI model
    Input:
        documents: list of preprocessed sentences
        topicNum: output vector dimension
    Output: LSI features (DataFrame format)
    '''
    # Build the corpus
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpusD = [dictionary.doc2bow(text) for text in texts]

    # Train the LSI model
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
    # Note: the model is trained on the raw bag-of-words corpus, not on corpus_tfidf.
    model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000, extra_samples=100)

    # Generate LSI features
    LogInfo(' Generate LSI features...')
    lsiFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpusD:
        topic = model[doc]
        for t in topic:
            lsiFeature[i, t[0]] = round(t[1], 5)
        i = i + 1

    colName = getColName(topicNum, "qlsi")
    lsiFeature = pd.DataFrame(lsiFeature, columns=colName)
    return lsiFeature
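# Hypothetical usage sketch for getLsiFeature (not part of the original source): assumes the
# LogInfo and getColName helpers called inside the function are available in this scope.
docs = [
    'machine learning for text',
    'text mining with topic models',
    'deep learning for images',
]
features = getLsiFeature(docs, topicNum=2)
print(features.shape)  # (3, 2) DataFrame of per-document LSI weights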
def generate_docs_lsi(self, dictionary_file_path, tfidf_file_path, lsi_file_path, num_topics=100):
    """
    Generate the LSI (dimensionality-reduced) model file for the document library
    :param dictionary_file_path:
    :param tfidf_file_path:
    :return:
    """
    try:
        dictionary = corpora.Dictionary.load(dictionary_file_path)
        tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
        print(tfidf_corpus)
        # Use the num_topics parameter instead of a hard-coded value
        lsi = LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=num_topics)
        # lsi.print_topics(10)
        with open(lsi_file_path, 'wb') as f:
            pickle.dump(lsi, f)
        logger.info('lsi model file building finished')
        # doc_lsi = lsi[doc_bow]
    except Exception as e:
        logger.error('generate documents library lsi model file failed for %s' % str(e))
def lsi(documents, topicNum):
    texts = [[word for word in document.split(' ')] for document in documents]
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + str(len(texts)))

    dictionary = corpora.Dictionary(texts)
    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' get corpus..')
    corpusD = [dictionary.doc2bow(text) for text in texts]

    print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + ' tfidf Model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
    # Note: the LSI model is trained on the raw bag-of-words corpus, not on corpus_tfidf.
    model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000, extra_samples=100)

    lsiFeature = np.zeros((len(texts), topicNum))
    print('translate...')
    i = 0
    for doc in corpusD:
        topic = model[doc]
        for t in topic:
            lsiFeature[i, t[0]] = round(t[1], 5)
        i = i + 1
        if i % 1000 == 1:
            print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) + str(i))

    return lsiFeature
def fit_lda(X, vocab, num_topics=50, passes=1):
    """ Fit a topic model from a scipy CSR matrix (X).
    Note: despite its name, this function returns a gensim LsiModel, not an LDA model. """
    print('fitting lda...')
    return LsiModel(gensim.matutils.Sparse2Corpus(X, documents_columns=False),
                    num_topics=num_topics, chunksize=10000, id2word=vocab)
def compute_lda():
    # from gensim.models.ldamulticore import LdaMulticore
    from gensim.models.lsimodel import LsiModel

    keys, unstem_map, paragraph_lengths, int2word, word2int = compute_all_words()
    try:
        len(corpus)
    except:
        for doc in iter(corpus):
            pass

    host = os.environ.get('pyro_ns_host', None)
    port = int(os.environ.get('pyro_ns_port', 0)) or None

    tfidf = compute_tfidf()
    with time_code('compute_lda'):
        corpus_tfidf = tfidf[corpus]
        lda = LsiModel(corpus_tfidf, num_topics=500, id2word=int2word,
                       distributed=True,
                       ns_conf=dict(
                           host=host,
                           port=port,
                           broadcast=port and host,
                       ))
        # lda = LdaMulticore(corpus_tfidf, num_topics=500, id2word=int2word, workers=None)
    return lda
def train_model(filename, output_name, data={}):
    output = data
    output['dataset'] = filename
    output['output_name'] = output_name

    df = pd.read_csv('./data/dataset/%s' % filename)

    lemmas_list = []
    for lemmas in df['lemmas']:
        lemmas = str(lemmas)
        lemmas = lemmas.replace('[', '').replace(']', '').replace(',', '').replace('\'', '')
        lemmas_list.append(lemmas.split())

    dictionary = corpora.Dictionary(lemmas_list)
    make_dir('./data/dicts/')
    dictionary.save('./data/dicts/%s_corpus.dict' % output_name)
    output['dict'] = '%s_corpus.dict' % output_name

    clean_doc = [dictionary.doc2bow(text) for text in lemmas_list]
    tfidf = models.TfidfModel(clean_doc, normalize=True)
    lsi = LsiModel(corpus=tfidf[clean_doc], id2word=dictionary, num_topics=200)

    make_dir('./data/models')
    lsi.save('./data/models/%s_model.txt' % output_name)
    output['model'] = '%s_model.txt' % output_name

    return output
def lsi_similarity(cps, cps1, cps2, dic):
    # Compute the LSI similarity between the s1 and s2 term-frequency corpora
    print("starting lsi similarity....")
    lsi = LsiModel(corpus=cps, num_topics=100, id2word=dic)
    s1_lsi = lsi[cps1]
    s2_lsi = lsi[cps2]
    sm = similarities.MatrixSimilarity(corpus=s1_lsi, num_features=lsi.num_topics)
    lsi_sm = np.diag(sm[s2_lsi])
    return lsi_sm
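# Hypothetical usage sketch for lsi_similarity (not part of the original source): builds the
# shared dictionary and the three bag-of-words corpora the function expects.
from gensim import corpora

s1 = [['cat', 'sits', 'on', 'mat'], ['dogs', 'bark', 'loudly']]
s2 = [['a', 'cat', 'on', 'a', 'mat'], ['the', 'dog', 'barks']]
dic = corpora.Dictionary(s1 + s2)
cps1 = [dic.doc2bow(t) for t in s1]
cps2 = [dic.doc2bow(t) for t in s2]
cps = cps1 + cps2  # corpus used to train the LSI model
scores = lsi_similarity(cps, cps1, cps2, dic)  # one similarity value per aligned sentence pair
print(scores)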
def from_text_files_in_path(self, path, extension=".txt"):
    doc_id = 0
    for tokens in self.training_documents_from_path(path, extension):
        document = {'id': "doc_" + str(doc_id), 'tokens': tokens}
        doc_id = doc_id + 1
        if self.model:
            self.model.add_documents(document)
        else:
            self.model = LsiModel(document)
    return self.model
def encoder_lsi(self, num_components=100, chunksize=500, is_tfidf=False):
    """ """
    self.num_components = num_components
    # Train LSI on the training dataset (initialize an LSI transformation)
    self.lsi = LsiModel(corpus=self.training_corpus, id2word=self.dictionary,
                        num_topics=num_components, chunksize=chunksize)
    # Convert the bag-of-words corpus into LSI projections
    self.corpus_lsi = self.lsi[self.training_corpus]
def cluster(sentences):
    my_stop_words = {'okay', 'don', 've', 'didn', 'know', 'think', 'really'}
    corpus = [c['text'].replace("%hesitation", "").lower() for c in sentences]
    corpus = np.array(corpus)

    tf_vectorizer = TfidfVectorizer(decode_error='ignore', max_df=0.7,
                                    stop_words=my_stop_words.union(stop_words),
                                    ngram_range=(1, 1))
    tf_mat = tf_vectorizer.fit_transform(corpus)
    id2word = {i: s for i, s in enumerate(tf_vectorizer.get_feature_names())}

    n_topics = 5
    lsi = LsiModel(matutils.Sparse2Corpus(tf_mat.T), num_topics=n_topics,
                   id2word=id2word, onepass=False)
    gs_lsi_mat = lsi[matutils.Sparse2Corpus(tf_mat.T)]
    lsi_mat = matutils.corpus2dense(gs_lsi_mat, n_topics).T

    norm = Normalizer(copy=False)
    lsi_mat = norm.fit_transform(lsi_mat)

    valid_indices = np.where(lsi_mat.any(axis=1))[0]
    valid_sent = lsi_mat[valid_indices]

    n_clusters = 7
    cluster = KMeans(n_clusters, n_init=100)
    cluster.fit(valid_sent)

    clusters = {}
    for i in range(n_clusters):
        clusters[i] = np.where(cluster.labels_ == i)[0]

    # Iterate over a copy of the keys so entries can be deleted while iterating
    for i in list(clusters.keys()):
        if np.sum(np.square(valid_sent[clusters[i]] - cluster.cluster_centers_[i])) > cluster.inertia_ / n_clusters:
            del clusters[i]

    last_cluster = [
        valid_indices[clusters[i][np.where(
            np.sum(np.square(valid_sent[clusters[i]] - cluster.cluster_centers_[i]), axis=1)
            < cluster.inertia_ / len(corpus))]].tolist()
        for i in clusters
    ]
    return last_cluster
def train(self, tokens):
    """
    Trains the LSI model

    Parameters
    ----------
    tokens: list of list of str
        e.g. [['hi', 'ho'], ['my', 'name', ...], ...]
    """
    self.fill_dictionary(tokens)
    corpus = self.to_corpus(tokens)
    self.tfidf = TfidfModel(corpus)
    corpus = self.tfidf[corpus]
    self.lsi = LsiModel(corpus, num_topics=self.num_topics)
def __create_model(self, algo, topic_qtt):
    model = None

    if algo == TopicModelingAlgorithm.LDA:
        model = LdaModel(corpus=self.__corpus, num_topics=topic_qtt,
                         id2word=self.__id2_words, random_state=1)
    elif algo == TopicModelingAlgorithm.LSA:
        model = LsiModel(corpus=self.__corpus, num_topics=topic_qtt,
                         id2word=self.__id2_words)
    elif algo == TopicModelingAlgorithm.NMF:
        model = Nmf(corpus=self.__corpus, num_topics=topic_qtt, random_state=1)

    return model
def train(self, path, num_topics=20, iterations=1000, n_gram=True,
          lemmatization=True, stop_words=True, tfidf=True, model='lda'):
    """
    Train the topic cluster model.

    Input value:
        data: pd.DataFrame format ['id','title','content','summary']
        num_topics: (int) the number of topics
        iterations: (int) total number of iteration times

    example:
    >>> lda = LDA_Model
    >>> lda.train(text)
    """
    data = load_data(str(path + '/output/data.csv'))
    self.original_data = data
    self.text = list(data['content'])
    self.num_topics = num_topics
    self.iterations = iterations
    self.model_name = model

    print('preprocessing...')
    self.token = self._preprocess(self.text, lemma=lemmatization, stop_words=stop_words)
    self.id2word = Dictionary(self.token)
    self.corpus = [self.id2word.doc2bow(text) for text in self.token]

    if tfidf:
        print('calculate tfidf...')
        tfidf_model = TfidfModel(self.corpus)
        self.corpus = tfidf_model[self.corpus]

    if model == 'lda':
        self.model = LdaModel(corpus=self.corpus, id2word=self.id2word,
                              num_topics=self.num_topics, iterations=self.iterations)
    if model == 'lsi':
        self.model = LsiModel(corpus=self.corpus, id2word=self.id2word,
                              num_topics=self.num_topics)
    if model == 'hdp':
        self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
        self.num_topics = self.model.get_topics().shape[0]

    self.topic_key = pd.DataFrame(self._topic_key(), columns=['topic_id', 'key_words'])
    self.doc_topic = self._doc_topic()
    self.topic_doc = pd.DataFrame(self._topic_doc(), columns=['topic_id', 'document_id'])
    self.topic_sent = pd.DataFrame(self._readable_topic(), columns=['topic_id', 'most relative sentence'])
def main():
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except:
        vector_model = LsiModel(corpus=RCV1BowCorpus(), num_topics=100, id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """ Must return either numpy array or dictionary """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train, train_targets=rcv1_train_target,
                           get_features=get_lsi_features, classifier="sgd")

    evaluate_classifier(clf, rcv1_test, rcv1_test_target, get_features=get_lsi_features)
def get_lsa_model(self, n_topics=50, recalculate=False, from_scratch=True):
    filepath = self.paths.get_lsa_filepath(n_topics)

    if not os.path.isfile(filepath) or recalculate:
        if not from_scratch:
            raise ValueError('No LSA file exists but from_scratch is False')

        trigram_dictionary = self.lda_builder.get_corpus_dict()
        trigram_bow_corpus = self.lda_builder.get_trigram_bow_corpus(trigram_dictionary)

        print('Building LSA model...')
        lsi = LsiModel(trigram_bow_corpus, id2word=trigram_dictionary, num_topics=n_topics)

        lsi.save(filepath)
        print('LSA model (n_topics={}) written to {}'.format(n_topics, filepath))
    else:
        print('Loading LSA model (n_topics={})...'.format(n_topics))
        lsi = LsiModel.load(filepath)

    return lsi
def build_similarity(self, corpus: List[tuple], model='tfidf') -> None:
    """
    Builds a similarity model for a bag-of-words corpus

    :param corpus: corpus to build the similarity model from
    :param model: strategy
    """
    from gensim.models.tfidfmodel import TfidfModel
    from gensim.models.lsimodel import LsiModel
    from gensim import similarities

    self.dictionary.compactify()
    if model == 'tfidf':
        self.model = TfidfModel(corpus, id2word=self.dictionary)
    elif model == 'lsi':
        # todo: remove magic number
        self.model = LsiModel(corpus, id2word=self.dictionary, num_topics=2)
    feature_cnt = len(self.dictionary.token2id)
    self.index = similarities.SparseMatrixSimilarity(self.model[corpus], num_features=feature_cnt)
punc_free = "".join(ch for ch in stop_free if ch not in exclude) normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split()) return normalized # Creating a list of documents from the complaints column list_of_docs = df["message"].tolist() # Implementing the function for all the complaints of list_of_docs doc_clean = [clean(doc).split() for doc in list_of_docs] # Code starts here # Creating the dictionary from our cleaned word list doc_clean dictionary = corpora.Dictionary(doc_clean) # Creating the corpus doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean] # Creating the LSi model lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary) pprint(lsimodel.print_topics()) # -------------- from gensim.models import LdaModel from gensim.models import CoherenceModel # doc_term_matrix - Word matrix created in the last task # dictionary - Dictionary created in the last task # Function to calculate coherence values def compute_coherence_values(dictionary, corpus, texts, limit,
print(lda[test_doc_bow2])

!pip install pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, journals_corpus, journals_dictionary)

from gensim.models import CoherenceModel
lda_cm = CoherenceModel(model=lda, corpus=journals_corpus, dictionary=journals_dictionary,
                        texts=journals['Full title'], coherence='c_v')
LDA_cm = lda_cm.get_coherence()
LDA_cm

from gensim.models.lsimodel import LsiModel
lsi = LsiModel(corpus=journals_corpus, id2word=journals_dictionary, num_topics=20)
lsi_topics = lsi.print_topics()
for topic in lsi_topics:
    print(topic)

test_doc = 'Journal of medicines and herbs'
test_doc = custom_preprocess(test_doc)
test_doc_bow = journals_dictionary.doc2bow(test_doc)
print(test_doc_bow)
print(lsi[test_doc_bow])

test_doc2 = 'Material and physics'
test_doc2 = custom_preprocess(test_doc2)
test_doc_bow2 = journals_dictionary.doc2bow(test_doc2)
def lsi_transform(text, n_topics):
    dictionary = corpora.Dictionary(text)
    corpus = [dictionary.doc2bow(essay) for essay in text]
    lsi = LsiModel(corpus=corpus, num_topics=n_topics)
    return lsi, dictionary
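# Hypothetical usage sketch for lsi_transform (not part of the original source): the returned
# model and dictionary are used together to project an unseen, already-tokenized essay.
tokenized_essays = [['good', 'essay', 'structure'], ['weak', 'essay', 'grammar']]
lsi, dictionary = lsi_transform(tokenized_essays, n_topics=2)
new_essay = ['good', 'grammar']
print(lsi[dictionary.doc2bow(new_essay)])  # list of (topic_id, weight) pairs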
def testar_num_topics(self, modelo,
                      num_topicos=[20, 50, 100, 200, 300, 400, 500, 1000, 1500],
                      perc_fichas=0.2, vetor_testes=None, tipo_teste='similaridade'):
    '''
    Tests the coherence of the topic models generated for a list of topic counts, in order to
    find the best number of topics for the model with respect to the corpus.

    Parameters:
        modelo (str) --> Model to be tested: "lda", "lsi" or "doc2vec".
        num_topicos (list of int) --> List of topic counts to be tested
            (default: [20, 50, 100, 200, 300, 400, 500, 1000, 1500])
        perc_fichas (float) --> Percentage of corpus records to be considered for the test (default: 0.2)
        vetor_testes (list of tuple) --> List of record pairs for similarity tests.
            Ignored if the test is "u_mass" (default: None)
        tipo_teste (str) --> Test type: "u_mass" or "similaridade" (default: "similaridade")

    Returns: a dictionary of dictionaries. The key of the main dictionary is the number of topics
        and, for each number of topics, there is another dictionary with the following keys:
        "medida" --> Coherence value computed for the model with that number of topics.
        "modelo" --> The model generated for that number of topics.
    '''
    # Check whether the test for the model has been implemented
    if modelo not in ['lda', 'lsi', 'doc2vec']:
        print(f'The model {modelo} is either not a topic model or has not been implemented.')
        return
    if tipo_teste not in ['u_mass', 'similaridade']:
        print(f'The test type {tipo_teste} has not been implemented.')
        return
    if modelo == 'doc2vec' and tipo_teste == 'u_mass':
        print('The u_mass coherence test cannot be used with the doc2vec model.')
        return

    # Initialize the test variables
    resultado = {}
    arq_index = os.path.join(self.corpus._pastas['indices'],
                             f'{self.corpus._link_nome}_testes.idx')
    if vetor_testes:
        flat = list(zip(*vetor_testes))
        fichas_incluir = set(flat[0])
        fichas_incluir.update(flat[1])
    else:
        fichas_incluir = None

    # Define the training corpus and the partial corpus
    if modelo == 'lsi':
        bow = self.corpus.corpus(tipo='bow')
        corpus_parcial = bow.fatiar(perc_fichas=perc_fichas, incluir=fichas_incluir)
        model_tfidf = self['tfidf'] or TfidfModel(corpus=corpus_parcial,
                                                  id2word=self.corpus.dicionario())
        corpus_train = model_tfidf[corpus_parcial]
    elif modelo == 'lda':
        bow = self.corpus.corpus(tipo='bow')
        corpus_parcial = corpus_train = bow.fatiar(perc_fichas=perc_fichas,
                                                   incluir=fichas_incluir)
    elif modelo == 'doc2vec':
        corpus_tagged = self.corpus.corpus(tipo='tagged')
        corpus_parcial = corpus_train = corpus_tagged.fatiar(perc_fichas=perc_fichas,
                                                             incluir=fichas_incluir)

    # Get the record ids of the partial corpus
    if fichas_incluir:
        ids_fichas = corpus_parcial.fichas()
    else:
        ids_fichas = list(range(len(corpus_parcial)))

    # Run the test for each number of topics
    for num in tqdm(num_topicos):
        print(f'Creating "{modelo}" model for num_topics={num}')

        # Train the requested model
        if modelo == 'lda':
            model = LdaModel(corpus=corpus_train, id2word=self.corpus.dicionario(),
                             num_topics=num)
        elif modelo == 'lsi':
            model = LsiModel(corpus=corpus_train, id2word=self.corpus.dicionario(),
                             num_topics=num)
        elif modelo == 'doc2vec':
            model = Doc2Vec(vector_size=num, workers=mp.cpu_count() / 2,
                            alpha=self._modelos[modelo]['alpha'],
                            min_alpha=self._modelos[modelo]['min_alpha'])
            # Build the corpus vocabulary to train the Doc2Vec model
            model.build_vocab(corpus_train)
            # Train the Doc2Vec model
            model.train(corpus_train, total_examples=model.corpus_count, epochs=model.epochs)

        # Store the model built for this iteration's number of topics
        resultado[num] = {'modelo': model}

        # Run the coherence test
        if tipo_teste == 'u_mass':
            # Compute the model's coherence for the current number of topics
            print(f'Computing the coherence score of the "{modelo}" model for num_topics={num}')
            cm = CoherenceModel(model=model, corpus=corpus_train, coherence='u_mass')
            resultado[num]['medida'] = cm.get_coherence()
            print(f'Score u_mass = {resultado[num]["medida"]}')

        # Run the similarity test
        elif tipo_teste == 'similaridade':
            # Define the corpus for the similarity matrix
            if modelo == 'doc2vec':
                corpus = Doc2VecCorpus(model)
            else:
                corpus = model[corpus_train]

            # Compute the model's similarity score for the current number of topics
            print(f'Computing the similarity score of the "{modelo}" model for num_topics={num}')
            index = Similarity(output_prefix=arq_index, corpus=corpus, num_features=num)
            medidas = []
            for ficha_query, ficha_target in vetor_testes:
                id_query = self.corpus.ficha2id(ficha_query)
                query = ids_fichas.index(id_query)
                id_target = self.corpus.ficha2id(ficha_target)
                target = ids_fichas.index(id_target)
                posicao, _ = self._obter_posicao_target(index, query, target)
                medidas.append(1 / posicao)
            valores = pd.Series(medidas)
            resultado[num]['medida'] = valores.median()
            print(f'Score similaridade = {resultado[num]["medida"]}')

    return resultado
for chunksize in np.arange(10000, 10001, 10000):
    lsi_models[num_topics][chunksize] = {}
    lsi_similarity_indices[num_topics][chunksize] = {}
    for power_iters in np.arange(1, 2):
        lsi_models[num_topics][chunksize][power_iters] = {}
        lsi_similarity_indices[num_topics][chunksize][power_iters] = {}
        for onepass in np.arange(1):
            print('Number of topics: {}. Chunksize: {}. Number of power iterations: {}. One-pass: {}'
                  .format(num_topics, chunksize, power_iters, bool(onepass)))

            lsi = LsiModel(corpus, id2word=id2token, num_topics=num_topics,
                           chunksize=chunksize, onepass=bool(onepass), power_iters=power_iters)
            lsi_models[num_topics][chunksize][power_iters][onepass] = lsi
            lsi_similarity_indices[num_topics][chunksize][power_iters][onepass] = similarities.MatrixSimilarity(
                lsi[corpus], num_features=num_topics)

run_time = int((time.time() - start_time) / 60)
print('Grid search took {} minutes.'.format(run_time))

with open('lsi_models.pickle', 'wb') as f:
    pickle.dump(lsi_models, f)
print('Models saved.')
def gerar_modelo(self, modelo):
    '''
    Trains the selected model and saves it. Afterwards, creates the similarity matrix
    for the transformed corpus.

    Parameters:
        modelo (str) --> model name: "tfidf", "tfidf_pivot", "lsi", "lda" or "doc2vec"

    Returns: None
    '''
    # Check whether the model has been implemented
    if modelo not in self._modelos:
        print(f'The model "{modelo}" has not been implemented.')
        return

    # Define the file names
    arq_model = os.path.join(self.corpus._pastas['modelos'],
                             f'{self.corpus._link_nome}.{self._exts[modelo]}')
    arq_index = os.path.join(self.corpus._pastas['indices'],
                             f'{self.corpus._link_nome}_{modelo}.idx')

    # Build the requested model
    if modelo == 'tfidf':
        # Initialize the model
        corpus_train = self.corpus.corpus(tipo='bow')
        num_features = self.corpus.num_tokens
        model = TfidfModel(corpus=corpus_train, id2word=self.corpus.dicionario())
    elif modelo == 'tfidf_pivot':
        # Initialize the model
        corpus_train = self.corpus.corpus(tipo='bow')
        num_features = self.corpus.num_tokens
        model = TfidfModel(corpus=corpus_train, id2word=self.corpus.dicionario(),
                           smartirs='nfu',
                           pivot=self.corpus.num_tokens / self.corpus.num_docs)
    elif modelo == 'lda':
        # Initialize the model
        corpus_train = self.corpus.corpus(tipo='bow')
        num_features = self._modelos[modelo]['num_topics']
        model = LdaModel(corpus=corpus_train, id2word=self.corpus.dicionario(),
                         num_topics=num_features)
    elif modelo == 'lsi':
        # Initialize the model
        corpus_train = self.corpus.corpus(tipo='tfidf')
        num_features = self._modelos[modelo]['num_topics']
        model = LsiModel(corpus=corpus_train, id2word=self.corpus.dicionario(),
                         num_topics=num_features)
    elif modelo == 'doc2vec':
        # Instantiate the Doc2Vec model
        corpus_train = self.corpus.corpus(tipo='tagged')
        num_features = self._modelos[modelo]['vector_size']
        model = Doc2Vec(vector_size=num_features, workers=mp.cpu_count() / 2,
                        alpha=self._modelos[modelo]['alpha'],
                        min_alpha=self._modelos[modelo]['min_alpha'])
        # Build the corpus vocabulary to train the Doc2Vec model
        model.build_vocab(corpus_train)
        # Train the Doc2Vec model
        model.train(corpus_train, total_examples=model.corpus_count, epochs=model.epochs)
    else:
        print(f'The model "{modelo}" has not been implemented.')
        return

    # Save the trained model
    model.save(self._arqs['modelos'][modelo])

    # Define the corpus for the similarity matrix
    if modelo == 'doc2vec':
        corpus = Doc2VecCorpus(model)
    else:
        corpus = model[corpus_train]

    # Build the index from the serialized model
    index = Similarity(output_prefix=self._arqs['indices'][modelo],
                       corpus=corpus, num_features=num_features)

    # Save the index
    index.save(self._arqs['indices'][modelo])
def main():
    parser = ArgumentParser(
        description='wrapper script for churning datasets of wiki or elasticsearch kind '
                    'through gensim to produce topic models; please see gensim documentation '
                    'for more information')
    parser.add_argument('-ds', '--dataset', default='wiki',
                        help='What kind of dataset to use. (wiki,es,file)')
    parser.add_argument('-d', '--dump-file', help='Wiki: bz2 dump file with wiki in it')
    parser.add_argument('-l', '--limit', help='Wiki: How many documents to extract from wiki')
    parser.add_argument('--model-id', default='model', help='Filename for created model.')
    parser.add_argument('--model-type', default='lsi',
                        help='Model type (lsi, lda, word2vec, hdp, vocabulary).')
    parser.add_argument('--n-topics', default=10, help='Number of topics to model.')
    parser.add_argument('--n-passes', default=1, help='Number of passes for LDA model.')
    parser.add_argument('--w2v-size', default=100, help='size of Word2Vec context.')
    parser.add_argument('--w2v-window', default=5, help='window for Word2Vec.')
    parser.add_argument('-q', '--query', default=None,
                        help='Elasticsearch: Query to use to fetch documents')
    parser.add_argument('--index', help='Elasticsearch: index to read from.')
    parser.add_argument('--doc_type', default='doc', help='Elasticsearch: data type in index.')
    parser.add_argument('--data-dir',
                        help='Directory to save the generated models and vocabularies into.')
    parser.add_argument('--vocab',
                        help='Prebuilt Vocabulary file. Use this to avoid having to generate one.')

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ['lsi', 'lda', 'word2vec', 'hdp', 'vocabulary']:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ['es', 'wiki', 'file']:
        logging.error("Invalid dataset type %s" % data_type)
        parser.print_usage()
        exit(-1)

    if not dump_fn and data_type in ['wiki']:
        logging.error('--dump-file required for wiki dataset')
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == 'es' and index is None:
        logging.error("Please be kind to at least specify the index you want to fetch from "
                      "elasticsearch using the --index parameter")
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)

    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = '%s_%s_%d' % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = '%s/%s' % (data_dir, model_fn)
    if model_type == 'word2vec':
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = '%s_w_%s_s_%s' % (model_fn, w2v_window, w2v_size)

    logging.info("Writing models to %s." % model_fn)

    if data_type == 'es':
        logging.info("Using data type %s with index %s, doc_type %s query %s"
                     % (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(read_index=index, read_doc_type=doc_type,
                                       query=query, normalize_func=normalize_es)
    elif data_type == 'wiki':
        logging.info("Using data type %s with dump_file %s and limit %s"
                     % (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit,
                                   normalize_func=normalize_wiki)
    elif data_type == 'file':
        logging.info("Using data type %s with dump_file %s and limit %s"
                     % (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit,
                              normalize_func=normalize_file)

    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words('norwegian'))
    if not vocab_file or model_type == 'vocabulary':
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + '.vocab')
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == 'vocabulary':
        return

    tfidf = TfidfModel(dictionary=vocab)

    if model_type == 'lsi':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab)
    elif model_type == 'lda':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics,
                         passes=n_passes, id2word=vocab)
    elif model_type == 'word2vec':
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == 'hdp':
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
def train_lsi_model(corpus, dictionary):
    lsi = LsiModel(corpus=corpus, id2word=id2word(dictionary), num_topics=3,
                   chunksize=10000, onepass=True)
    return lsi
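# Hypothetical usage sketch for train_lsi_model (not part of the original source): assumes the
# id2word helper it calls simply returns a gensim Dictionary for the given corpus.
from gensim import corpora

texts = [['solar', 'energy', 'panel'], ['wind', 'turbine', 'energy'], ['stock', 'market', 'prices']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
lsi = train_lsi_model(corpus, dictionary)
print(lsi.print_topics())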
dicto = corpora.Dictionary(texts)
corpus = [dicto.doc2bow(text) for text in texts]

lsi_models = {}
lsi_similarity_indices = {}

start_time = time.time()
for chunksize in np.arange(5000, 30001, 5000):
    print('Chunksize: {}'.format(chunksize))
    iter_start_time = time.time()

    lsi = LsiModel(corpus, id2word=id2token, num_topics=50,
                   chunksize=chunksize, onepass=False, power_iters=2)
    lsi_models[chunksize] = lsi
    lsi_similarity_indices[chunksize] = similarities.MatrixSimilarity(
        lsi[corpus], num_features=100)

    print('{} seconds'.format(int(time.time() - iter_start_time)))

run_time = int((time.time() - start_time) / 60)
print('Parameter search took {} minutes.'.format(run_time))

with open('lsi_models_num_topics_chunksize.pickle', 'wb') as f:
    pickle.dump(lsi_models, f)
print('Models saved.')
def run():
    try:
        print("starting to build LSI Model")
        start = datetime.now()

        documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
        number_of_documents = len(documents)
        print("number_of_documents:", number_of_documents)

        texts = [tokenize(document) for document in documents]

        counter = Counter()
        for text in texts:
            counter.update(text)

        texts = [[token for token in text if counter[token] > 1] for text in texts]
        print("texts:", len(texts), texts[:5])

        dictionary = Dictionary(texts)
        # print("dictionary:", dictionary)
        dictionary.save(path_to_directory_of_this_file + "/dictionary")

        corpus = [dictionary.doc2bow(text) for text in texts]
        print("corpus:", type(corpus))

        print("generating lsi model")
        lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
        print("saving LSI model")
        lsi.save(path_to_directory_of_this_file + "/model")

        # nullify all topics on features and places
        Feature.objects.exclude(topic=None).update(topic=None)
        Place.objects.exclude(topic=None).update(topic=None)

        Topic.objects.all().delete()
        print("deleted all topics")

        topics = []
        for topic in lsi.show_topics():
            topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))
        Topic.objects.bulk_create(topics)
        print("bulk created all topics")

        """
        # re-create topics for all features in database
        for feature in Feature.objects.exclude(text=None).exclude(text=""):
            words = tokenize(feature.text)
            if words:
                probabilities = lsi[dictionary.doc2bow(words)]
                if probabilities:
                    topic_id = sorted(probabilities, key=lambda tup: -1*tup[1])[0][0]
                    if topic_id:
                        feature.topic_id = topic_id
                        feature.save()

        # assign a topic to each place based on the most popular topic found in its features
        for place_id in Place.objects.exclude(featureplace=None).values_list("id", flat=True):
            counter = Counter(Feature.objects.filter(featureplace__place_id=place_id).values_list("topic_id"))
            print("counter:", counter)
        """
    except Exception as e:
        print(e)
# corpora.MmCorpus.serialize('../topic_model/corpus_dev_word_seg_tfidf', corpus_dev_word_seg_tfidf)
# corpus_test_word_seg_tfidf = model.__getitem__(corpus_test_word_seg)
# corpora.MmCorpus.serialize('../topic_model/corpus_test_word_seg_tfidf', corpus_test_word_seg_tfidf)

corpus_train_word_seg_tfidf = corpora.MmCorpus('../topic_model/corpus_train_word_seg_tfidf')
corpus_dev_word_seg_tfidf = corpora.MmCorpus('../topic_model/corpus_dev_word_seg_tfidf')
corpus_test_word_seg_tfidf = corpora.MmCorpus('../topic_model/corpus_test_word_seg_tfidf')

corpus_word_seg_tfidf = []
corpus_word_seg_tfidf.extend(corpus_train_word_seg_tfidf)
corpus_word_seg_tfidf.extend(corpus_dev_word_seg_tfidf)
corpus_word_seg_tfidf.extend(corpus_test_word_seg_tfidf)

# LSI
print('Start train lsi...')
lsi_model = LsiModel(corpus=corpus_word_seg_tfidf, id2word=dictionary_word_seg, num_topics=400)
lsi_model.save('../topic_model/word_seg_lsi_model')

corpus_train_word_seg_lsi = lsi_model[corpus_train_word_seg_tfidf]
corpus_dev_word_seg_lsi = lsi_model[corpus_dev_word_seg_tfidf]
corpus_test_word_seg_lsi = lsi_model[corpus_test_word_seg_tfidf]

corpora.MmCorpus.serialize('../topic_model/corpus_train_word_seg_lsi', corpus_train_word_seg_lsi)
corpora.MmCorpus.serialize('../topic_model/corpus_dev_word_seg_lsi', corpus_dev_word_seg_lsi)
corpora.MmCorpus.serialize('../topic_model/corpus_test_word_seg_lsi', corpus_test_word_seg_lsi)

# LDA
print('Start train lda...')
lda_model = LdaModel(corpus=corpus_word_seg_tfidf, id2word=dictionary_word_seg, num_topics=100,
                     update_every=1, chunksize=1000, passes=1)
lda_model.save('../topic_model/word_seg_lda_model')

corpus_train_word_seg_lda = lda_model[corpus_train_word_seg_tfidf]
corpus_dev_word_seg_lda = lda_model[corpus_dev_word_seg_tfidf]