Example #1
    def lsi_model(self, num_topics: int = 10, stochastic: bool = False):
        """
        Construct LSI topic models for each year in a
        corpus, given a set of parameters.
        """

        if self.word_to_id is None or self.corpora is None:
            self.build_dictionaries_and_corpora()

        if self.tf_idf_models is None:
            self.build_tf_idf_models()

        results = num_dict(self.year_list)

        if not stochastic:

            for year in self.year_list[:-1]:
                results[year] = \
                    LsiModel(corpus=self.tf_idf_models[year][self.corpora[year]],
                             id2word=self.word_to_id[year],
                             num_topics=num_topics
                             )

        else:
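            # onepass=False makes gensim's LsiModel use its multi-pass stochastic (randomized) algorithm instead of the default one-pass SVD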

            for year in self.year_list[:-1]:
                results[year] = \
                    LsiModel(corpus=self.tf_idf_models[year][self.corpora[year]],
                             id2word=self.word_to_id[year],
                             num_topics=num_topics,
                             onepass=False
                             )

        return TopicResults(results, self.num_docs)
Example #2
 def lsi(self):
     self.tf_idf()
     if self.corpus_tf_idf and self.dictionary:
         self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
         self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
         print(self.lsi_model.print_topic(1))  # topics are 0-indexed; only 0 and 1 exist with num_topics=2
     elif self.corpus_tf_idf:
         self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
         self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
    def lsi_transform(self, corpus_tf_idf):
        logger.info('Training LSI model with n_dims=%d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        self.lsiModel = LsiModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)
        # print self.lsiModel[corpus]

        conf.mk_dir(self.lsiPath)

        self.lsiModel.save(self.lsiPath)
        logger.info('Lsi model has been saved in %s.' % self.lsiPath)

        lsi_corpus = self.lsiModel[corpus_tf_idf]
        lsi_corpus_path = conf.get_filename_via_tpl('lsi',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims,
                                                    postfix='mm')
        conf.mk_dir(lsi_corpus_path)
        corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
        logger.info('Lsi corpus with a shape of %s has been saved in %s.' %
                    (np.array(lsi_corpus).shape, lsi_corpus_path))

        return lsi_corpus
def getLsiFeature(documents, topicNum):
    '''
    Function:
        generate LSI features by training an LSI model
    Input:
        documents: list of preprocessed sentences
        topicNum: output vector dimension
    Output:
        LSI features (DataFrame format)
    '''
    # get corpus
#     LogInfo(' Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpusD = [dictionary.doc2bow(text) for text in texts]
    
    # train lsi model
#     LogInfo(' Train LSI model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
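    # Note: corpus_tfidf is computed above but the LsiModel below is trained on the raw bag-of-words corpus (corpusD)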
    model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000, extra_samples = 100)#, distributed=True)#, sample = 1e-5, iter = 10,seed = 1)

    # generate lsi features
    LogInfo(' Generate LSI features...')
    lsiFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpusD:
        topic = model[doc]
        for t in topic:
             lsiFeature[i, t[0]] = round(t[1],5)
        i = i + 1
    colName = getColName(topicNum, "qlsi")
    lsiFeature = pd.DataFrame(lsiFeature, columns = colName)
    return lsiFeature
Example #5
 def generate_docs_lsi(self,
                       dictionary_file_path,
                       tfidf_file_path,
                       lsi_file_path,
                       num_topics=100):
     """
     Generate the LSI dimensionality-reduction file for the document library
     :param dictionary_file_path:
     :param tfidf_file_path:
     :return:
     """
     try:
         dictionary = corpora.Dictionary.load(dictionary_file_path)
         tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
         print(tfidf_corpus)
         lsi = LsiModel(corpus=tfidf_corpus,
                        id2word=dictionary,
                        num_topics=num_topics)
         # lsi.print_topics(10)
         with open(lsi_file_path, 'wb') as f:
             pickle.dump(lsi, f)
         logger.info('lsi model file building finished')
         # doc_lsi = lsi[doc_bow]
     except Exception as e:
         logger.error(
             'generating the document-library LSI model file failed: %s' %
             str(e))
def lsi(documents, topicNum):
	texts = [[word for word in document.split(' ')] for document in documents]
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+str(len(texts)))
	dictionary = corpora.Dictionary(texts)
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' get corpus..')
	corpusD = [dictionary.doc2bow(text) for text in texts]
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' tfidf Model...')
	tfidf = TfidfModel(corpusD)
	corpus_tfidf = tfidf[corpusD]
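	# Note: corpus_tfidf is computed but not used; the LSI model below is trained on the raw bag-of-words corpus (corpusD)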

	model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000, extra_samples = 100)#, distributed=True)#, sample = 1e-5, iter = 10,seed = 1)

	lsiFeature = np.zeros((len(texts), topicNum))
	print('translate...')
	i = 0

	for doc in corpusD:
		topic = model[doc]
		
		for t in topic:
			 lsiFeature[i, t[0]] = round(t[1],5)
		i = i + 1
		if i%1000 == 1:
			print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+str(i))

	return lsiFeature
Example #7
def fit_lda(X, vocab, num_topics=50, passes=1):
    """ Fit LDA from a scipy CSR matrix (X). """
    print('fitting lda...')
    return LsiModel(gensim.matutils.Sparse2Corpus(X, documents_columns=False),
                    num_topics=num_topics,
                    chunksize=10000,
                    id2word=vocab)
def compute_lda():
    # from gensim.models.ldamulticore import LdaMulticore
    from gensim.models.lsimodel import LsiModel

    keys, unstem_map, paragraph_lengths, int2word, word2int = compute_all_words(
    )

    try:
        len(corpus)
    except:
        for doc in iter(corpus):
            pass

    host = os.environ.get('pyro_ns_host', None)
    port = int(os.environ.get('pyro_ns_port', 0)) or None

    tfidf = compute_tfidf()
    with time_code('compute_lda'):
        corpus_tfidf = tfidf[corpus]
        lda = LsiModel(corpus_tfidf,
                       num_topics=500,
                       id2word=int2word,
                       distributed=True,
                       ns_conf=dict(
                           host=host,
                           port=port,
                           broadcast=port and host,
                       ))
        # lda = LdaMulticore(corpus_tfidf, num_topics=500, id2word=int2word, workers=None)

    return lda
Example #9
def train_model(filename, output_name, data=None):
    output = data if data is not None else {}

    output['dataset'] = filename
    output['output_name'] = output_name

    df = pd.read_csv('./data/dataset/%s' % filename)
    lemmas_list = []

    for lemmas in df['lemmas']:
        lemmas = str(lemmas)
        lemmas = lemmas.replace('[', '').replace(']', '').replace(',', '').replace('\'', '')
        lemmas_list.append(lemmas.split())

    dictionary = corpora.Dictionary(lemmas_list)
    make_dir('./data/dicts/')
    dictionary.save('./data/dicts/%s_corpus.dict' % output_name)

    output['dict'] = '%s_corpus.dict' % output_name

    clean_doc = [dictionary.doc2bow(text) for text in lemmas_list]

    tfidf = models.TfidfModel(clean_doc, normalize=True)

    lsi = LsiModel(corpus=tfidf[clean_doc], id2word=dictionary, num_topics=200)
    make_dir('./data/models')
    lsi.save('./data/models/%s_model.txt' % output_name)
    output['model'] = '%s_model.txt' % output_name

    return output
Example #10
def lsi_similarity(cps, cps1, cps2, dic):
    # Compute the LSI similarity between the term-frequency representations of s1 and s2
    print("starting lsi similarity....")
    lsi = LsiModel(corpus=cps, num_topics=100, id2word=dic)
    s1_lsi = lsi[cps1]
    s2_lsi = lsi[cps2]
    sm = similarities.MatrixSimilarity(corpus=s1_lsi, num_features=lsi.num_topics)
    lsi_sm = np.diag(sm[s2_lsi])
    return lsi_sm
Example #11
 def from_text_files_in_path(self, path, extension=".txt"):
     doc_id = 0
     for tokens in self.training_documents_from_path(path, extension):
         document = {'id': "doc_" + str(doc_id), 'tokens': tokens}
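         # Note: gensim's LsiModel and LsiModel.add_documents expect a corpus of bag-of-words vectors; this snippet appears to rely on a wrapper that accepts {'id', 'tokens'} dicts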
         doc_id = doc_id + 1
         if self.model:
             self.model.add_documents(document)
         else:
             self.model = LsiModel(document)
     return self.model
Example #12
	def encoder_lsi(self, num_components=100, chunksize=500, is_tfidf=False):
		"""
		
		"""

		self.num_components = num_components
		# Train LSI based on training dataset
		self.lsi = LsiModel(corpus=self.training_corpus, id2word=self.dictionary, \
		                           num_topics=num_components, chunksize=chunksize) # initialize an LSI transformation
		# Convert bow into LSI projections
		self.corpus_lsi = self.lsi[self.training_corpus]
Example #13
def cluster(sentences):

    my_stop_words = {'okay', 'don', 've', 'didn', 'know', 'think', 'really'}

    corpus = [c['text'].replace("%hesitation", "").lower() for c in sentences]

    corpus = np.array(corpus)
    tf_vectorizer = TfidfVectorizer(decode_error='ignore',
                                    max_df=0.7,
                                    stop_words=my_stop_words.union(stop_words),
                                    ngram_range=(1, 1))

    tf_mat = tf_vectorizer.fit_transform(corpus)
    id2word = {i: s for i, s in enumerate(tf_vectorizer.get_feature_names())}
    n_topics = 5

    lsi = LsiModel(matutils.Sparse2Corpus(tf_mat.T),
                   num_topics=n_topics,
                   id2word=id2word,
                   onepass=False)
    gs_lsi_mat = lsi[matutils.Sparse2Corpus(tf_mat.T)]
    lsi_mat = matutils.corpus2dense(gs_lsi_mat, n_topics).T
    norm = Normalizer(copy=False)
    lsi_mat = norm.fit_transform(lsi_mat)

    valid_indices = np.where(lsi_mat.any(axis=1))[0]
    valid_sent = lsi_mat[valid_indices]

    n_clusters = 7

    cluster = KMeans(n_clusters, n_init=100)
    cluster.fit(valid_sent)

    clusters = {}
    for i in range(n_clusters):
        clusters[i] = np.where(cluster.labels_ == i)[0]

    for i in list(clusters.keys()):  # iterate over a copy since entries may be deleted below
        if np.sum(
                np.square(valid_sent[clusters[i]] - cluster.cluster_centers_[i]
                          )) > cluster.inertia_ / n_clusters:
            del clusters[i]

    last_cluster = [
        valid_indices[clusters[i][np.where(
            np.sum(np.square(valid_sent[clusters[i]] -
                             cluster.cluster_centers_[i]),
                   axis=1) < cluster.inertia_ / len(corpus))]].tolist()
        for i in clusters
    ]
    return last_cluster
Example #14
    def train(self, tokens):
        """ Trains the LSI model

        Parameters
        ----------
        tokens: list of list of str
            e.g. [['hi', 'ho'], ['my', 'name', ...], ...]

        """
        self.fill_dictionary(tokens)
        corpus = self.to_corpus(tokens)
        self.tfidf = TfidfModel(corpus)
        corpus = self.tfidf[corpus]
        self.lsi = LsiModel(corpus, num_topics=self.num_topics)
Example #15
    def __create_model(self, algo, topic_qtt):
        model = None

        if (algo == TopicModelingAlgorithm.LDA):
            model = LdaModel(corpus=self.__corpus,
                             num_topics=topic_qtt,
                             id2word=self.__id2_words,
                             random_state=1)
        elif (algo == TopicModelingAlgorithm.LSA):
            model = LsiModel(corpus=self.__corpus,
                             num_topics=topic_qtt,
                             id2word=self.__id2_words)
        elif (algo == TopicModelingAlgorithm.NMF):
            model = Nmf(corpus=self.__corpus,
                        num_topics=topic_qtt,
                        random_state=1)

        return model
Example #16
    def train(self, path, num_topics=20, iterations=1000, n_gram=True, lemmatization=True, stop_words=True, tfidf=True,
              model='lda'):
        """
        Train the topic cluster model.
        Input value: data: pd.DataFrame format ['id','title','content','summary']
                     num_topics: (int) the number of topics
                     iterations: (int) total number of iteration times
        example:
        >>> lda = LDA_Model
        >>> lda.train(text)
        """
        data = load_data(str(path + '/output/data.csv'))
        self.original_data = data
        self.text = list(data['content'])
        self.num_topics = num_topics
        self.iterations = iterations
        self.model_name = model

        print('preprocessing...')
        self.token = self._preprocess(self.text,lemma = lemmatization, stop_words = stop_words)

        self.id2word = Dictionary(self.token)
        self.corpus = [self.id2word.doc2bow(text) for text in self.token]
        if tfidf == True:
            print('calculate tfidf...')
            tfidf_model = TfidfModel(self.corpus)
            self.corpus = tfidf_model[self.corpus]

        if model == 'lda':
            self.model = LdaModel(corpus=self.corpus, id2word=self.id2word, num_topics=self.num_topics,
                                  iterations=self.iterations)
        if model == 'lsi':
            self.model = LsiModel(corpus=self.corpus, id2word=self.id2word, num_topics=self.num_topics)
        if model == 'hdp':
            self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
            self.num_topics = self.model.get_topics().shape[0]

        self.topic_key = pd.DataFrame(self._topic_key(), columns=['topic_id', 'key_words'])
        self.doc_topic = self._doc_topic()
        self.topic_doc = pd.DataFrame(self._topic_doc(), columns=['topic_id', 'document_id'])
        self.topic_sent = pd.DataFrame(self._readable_topic(), columns=['topic_id', 'most relative sentence'])
Example #17
def main():
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except:
        vector_model = LsiModel(corpus=RCV1BowCorpus(),
                                num_topics=100,
                                id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """
        Must return either numpy array or dictionary
        """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train,
                           train_targets=rcv1_train_target,
                           get_features=get_lsi_features,
                           classifier="sgd")

    evaluate_classifier(clf,
                        rcv1_test,
                        rcv1_test_target,
                        get_features=get_lsi_features)
Example #18
    def get_lsa_model(self, n_topics=50, recalculate=False, from_scratch=True):

        filepath = self.paths.get_lsa_filepath(n_topics)

        if not os.path.isfile(filepath) or recalculate:

            if not from_scratch:
                raise ValueError('No LSA file exists but from_scratch is False')

            trigram_dictionary = self.lda_builder.get_corpus_dict()
            trigram_bow_corpus = self.lda_builder.get_trigram_bow_corpus(trigram_dictionary)

            print('Building LSA model...')
            lsi = LsiModel(trigram_bow_corpus, id2word=trigram_dictionary, num_topics=n_topics)

            lsi.save(filepath)
            print('LSA model (n_topics={}) written to {}'.format(n_topics, filepath))
        else:
            print('Loading LSA model (n_topics={})...'.format(n_topics))
            lsi = LsiModel.load(filepath)

        return lsi
Example #19
    def build_similarity(self, corpus: List[tuple], model='tfidf') -> None:
        """
        Builds a similarity model for a bag of words corpus
        :param corpus: to build the similarity model
        :param model: strategy
        """

        from gensim.models.tfidfmodel import TfidfModel
        from gensim.models.lsimodel import LsiModel
        from gensim import similarities

        self.dictionary.compactify()

        if model == 'tfidf':
            self.model = TfidfModel(corpus, id2word=self.dictionary)
        elif model == 'lsi':
            # todo: remove magic number
            self.model = LsiModel(corpus,
                                  id2word=self.dictionary,
                                  num_topics=2)

        feature_cnt = len(self.dictionary.token2id)
        self.index = similarities.SparseMatrixSimilarity(
            self.model[corpus], num_features=feature_cnt)
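        # Hedged usage sketch (illustrative names): querying the index with a new document
        #   query_bow = self.dictionary.doc2bow(tokens)
        #   scores = self.index[self.model[query_bow]]  # similarity scores against the corpus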
Example #20
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized


# Creating a list of documents from the complaints column
list_of_docs = df["message"].tolist()
# Implementing the function for all the complaints of list_of_docs
doc_clean = [clean(doc).split() for doc in list_of_docs]
# Code starts here
# Creating the dictionary from our cleaned word list doc_clean
dictionary = corpora.Dictionary(doc_clean)
# Creating the corpus
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
# Creating the LSI model
lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary)
pprint(lsimodel.print_topics())

# --------------
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# doc_term_matrix - Word matrix created in the last task
# dictionary - Dictionary created in the last task


# Function to calculate coherence values
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
print(lda[test_doc_bow2])

!pip install pyLDAvis

import pyLDAvis.gensim                             
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, journals_corpus, journals_dictionary)

from gensim.models import CoherenceModel
lda_cm = CoherenceModel(model=lda, corpus=journals_corpus, dictionary=journals_dictionary, texts=journals['Full title'], coherence='c_v')
LDA_cm=lda_cm.get_coherence()
LDA_cm

from gensim.models.lsimodel import LsiModel

lsi = LsiModel(corpus=journals_corpus, id2word=journals_dictionary, num_topics=20)

lsi_topics = lsi.print_topics()
for topic in lsi_topics:
  print(topic)

test_doc = 'Journal of medicines and herbs'
test_doc = custom_preprocess(test_doc)
test_doc_bow = journals_dictionary.doc2bow(test_doc)
print(test_doc_bow)

print(lsi[test_doc_bow])

test_doc2 = 'Material and physics'
test_doc2 = custom_preprocess(test_doc2)
test_doc_bow2 = journals_dictionary.doc2bow(test_doc2)
def lsi_transform(text, n_topics):
    dictionary = corpora.Dictionary(text)
    corpus = [dictionary.doc2bow(essay) for essay in text]

    lsi = LsiModel(corpus=corpus, num_topics=n_topics)
    return lsi, dictionary
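A brief usage sketch for the lsi_transform helper above (hedged; essays is assumed to be a list of token lists and new_essay_tokens a single token list):

lsi, dictionary = lsi_transform(essays, n_topics=10)
vec = lsi[dictionary.doc2bow(new_essay_tokens)]  # list of (topic_id, weight) pairs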
Example #23
 def testar_num_topics(
         self,
         modelo,
         num_topicos=[20, 50, 100, 200, 300, 400, 500, 1000, 1500],
         perc_fichas=0.2,
         vetor_testes=None,
         tipo_teste='similaridade'):
     '''
     Tests the coherence of the topic models generated for a list of topic counts, in order to find
     the best number of topics for the model with respect to the corpus.
     Parameters:
         modelo (str) --> Model to be tested: "lda", "lsi" or "doc2vec".
         num_topicos (list of int) --> List of topic counts to be tested
                 (default: [20, 50, 100, 200, 300, 400, 500, 1000, 1500])
         perc_fichas (float) --> Fraction of the corpus records to be considered for the test (default: 0.2)
         vetor_testes (list of tuple) --> List of record pairs for similarity tests. Ignored when the test
             is "u_mass" (default: None)
         tipo_teste (str) --> Test type: "u_mass" or "similaridade" (default: "similaridade")
     Returns: a dictionary of dictionaries. The key of the outer dictionary is the number of topics and, for each
         number of topics, there is another dictionary with the following keys:
             "medida" --> Coherence value computed for the model with that number of topics.
             "modelo" --> The model generated for that number of topics
     '''
     # Check whether the test for this model has been implemented
     if modelo not in ['lda', 'lsi', 'doc2vec']:
         print(
             f'The model {modelo} is either not a topic model or has not been implemented.')
         return
     if tipo_teste not in ['u_mass', 'similaridade']:
         print(f'The test type {tipo_teste} has not been implemented.')
         return
     if modelo == 'doc2vec' and tipo_teste == 'u_mass':
         print(
             'The u_mass coherence test cannot be used with the doc2vec model.'
         )
         return
     # Initialise the variables for the tests
     resultado = {}
     arq_index = os.path.join(self.corpus._pastas['indices'],
                              f'{self.corpus._link_nome}_testes.idx')
     if vetor_testes:
         flat = list(zip(*vetor_testes))
         fichas_incluir = set(flat[0])
         fichas_incluir.update(flat[1])
     else:
         fichas_incluir = None
     # Define the training corpus and the partial corpus
     if modelo == 'lsi':
         bow = self.corpus.corpus(tipo='bow')
         corpus_parcial = bow.fatiar(perc_fichas=perc_fichas,
                                     incluir=fichas_incluir)
         model_tfidf = self['tfidf'] or TfidfModel(
             corpus=corpus_parcial, id2word=self.corpus.dicionario())
         corpus_train = model_tfidf[corpus_parcial]
     elif modelo == 'lda':
         bow = self.corpus.corpus(tipo='bow')
         corpus_parcial = corpus_train = bow.fatiar(perc_fichas=perc_fichas,
                                                    incluir=fichas_incluir)
     elif modelo == 'doc2vec':
         corpus_tagged = self.corpus.corpus(tipo='tagged')
         corpus_parcial = corpus_train = corpus_tagged.fatiar(
             perc_fichas=perc_fichas, incluir=fichas_incluir)
     # Get the list of ids_fichas of the partial corpus
     if fichas_incluir: ids_fichas = corpus_parcial.fichas()
     else: ids_fichas = list(range(len(corpus_parcial)))
     # Run the test for each number of topics
     for num in tqdm(num_topicos):
         print(f'Creating "{modelo}" model for num_topics={num}')
         # Train the requested model
         if modelo == 'lda':
             model = LdaModel(corpus=corpus_train,
                              id2word=self.corpus.dicionario(),
                              num_topics=num)
         elif modelo == 'lsi':
             model = LsiModel(corpus=corpus_train,
                              id2word=self.corpus.dicionario(),
                              num_topics=num)
         elif modelo == 'doc2vec':
             model = Doc2Vec(vector_size=num,
                             workers=mp.cpu_count() / 2,
                             alpha=self._modelos[modelo]['alpha'],
                             min_alpha=self._modelos[modelo]['min_alpha'])
             # Build the corpus vocabulary to train the Doc2Vec model
             model.build_vocab(corpus_train)
             # Train the Doc2Vec model
             model.train(corpus_train,
                         total_examples=model.corpus_count,
                         epochs=model.epochs)
         # Store the model built for this number of topics
         resultado[num] = {'modelo': model}
         # Run the coherence test
         if tipo_teste == 'u_mass':
             # Compute the model's coherence for the chosen number of topics
             print(
                 f'Computing the coherence score of the "{modelo}" model for num_topics={num}'
             )
             cm = CoherenceModel(model=model,
                                 corpus=corpus_train,
                                 coherence='u_mass')
             resultado[num]['medida'] = cm.get_coherence()
             print(f'Score u_mass = {resultado[num]["medida"]}')
         # Run the similarity test
         elif tipo_teste == 'similaridade':
             # Define the corpus for the similarity matrix
             if modelo == 'doc2vec': corpus = Doc2VecCorpus(model)
             else: corpus = model[corpus_train]
             # Compute the model's similarity for the chosen number of topics
             print(
                 f'Computing the similarity score of the "{modelo}" model for num_topics={num}'
             )
             index = Similarity(output_prefix=arq_index,
                                corpus=corpus,
                                num_features=num)
             medidas = []
             for ficha_query, ficha_target in vetor_testes:
                 id_query = self.corpus.ficha2id(ficha_query)
                 query = ids_fichas.index(id_query)
                 id_target = self.corpus.ficha2id(ficha_target)
                 target = ids_fichas.index(id_target)
                 posicao, _ = self._obter_posicao_target(
                     index, query, target)
                 medidas.append(1 / posicao)
             valores = pd.Series(medidas)
             resultado[num]['medida'] = valores.median()
             print(f'Similarity score = {resultado[num]["medida"]}')
     return resultado
    for chunksize in np.arange(10000, 10001, 10000):
        lsi_models[num_topics][chunksize] = {}
        lsi_similarity_indices[num_topics][chunksize] = {}

        for power_iters in np.arange(1, 2):
            lsi_models[num_topics][chunksize][power_iters] = {}
            lsi_similarity_indices[num_topics][chunksize][power_iters] = {}

            for onepass in np.arange(1):
                print('Number of topics: {}. Chunksize: {}. Number of power iterations: {}. One-pass: {}'
                      .format(num_topics, chunksize, power_iters, bool(onepass)))

                lsi = LsiModel(corpus,
                               id2word=id2token,
                               num_topics=num_topics,
                               chunksize=chunksize,
                               onepass=bool(onepass),
                               power_iters=power_iters)

                lsi_models[num_topics][chunksize][power_iters][onepass] = lsi
                lsi_similarity_indices[num_topics][chunksize][power_iters][onepass] = similarities.MatrixSimilarity(
                                                                                        lsi[corpus],
                                                                                        num_features=num_topics
                                                                                      )
run_time = int((time.time() - start_time) / 60)
print('Grid search took {} minutes.'.format(run_time))

with open('lsi_models.pickle', 'wb') as f:
    pickle.dump(lsi_models, f)
print('Models saved.')
Example #25
 def gerar_modelo(self, modelo):
     '''
     Trains the selected model and saves it. Then builds the similarity matrix for the transformed corpus.
     Parameters:
         modelo (str) --> model name: "tfidf", "tfidf_pivot", "lsi", "lda" or "doc2vec"
     Returns: None
     '''
     # Check whether the model has been implemented
     if modelo not in self._modelos:
         print(f'The model "{modelo}" has not been implemented.')
         return
     # Define the file names
     arq_model = os.path.join(
         self.corpus._pastas['modelos'],
         f'{self.corpus._link_nome}.{self._exts[modelo]}')
     arq_index = os.path.join(self.corpus._pastas['indices'],
                              f'{self.corpus._link_nome}_{modelo}.idx')
     # Build the requested model
     if modelo == 'tfidf':
         # Initialise the model
         corpus_train = self.corpus.corpus(tipo='bow')
         num_features = self.corpus.num_tokens
         model = TfidfModel(corpus=corpus_train,
                            id2word=self.corpus.dicionario())
     elif modelo == 'tfidf_pivot':
         # Initialise the model
         corpus_train = self.corpus.corpus(tipo='bow')
         num_features = self.corpus.num_tokens
         model = TfidfModel(corpus=corpus_train,
                            id2word=self.corpus.dicionario(),
                            smartirs='nfu',
                            pivot=self.corpus.num_tokens /
                            self.corpus.num_docs)
     elif modelo == 'lda':
         # Initialise the model
         corpus_train = self.corpus.corpus(tipo='bow')
         num_features = self._modelos[modelo]['num_topics']
         model = LdaModel(corpus=corpus_train,
                          id2word=self.corpus.dicionario(),
                          num_topics=num_features)
     elif modelo == 'lsi':
         # Initialise the model
         corpus_train = self.corpus.corpus(tipo='tfidf')
         num_features = self._modelos[modelo]['num_topics']
         model = LsiModel(corpus=corpus_train,
                          id2word=self.corpus.dicionario(),
                          num_topics=num_features)
     elif modelo == 'doc2vec':
         # Instantiate the Doc2Vec model
         corpus_train = self.corpus.corpus(tipo='tagged')
         num_features = self._modelos[modelo]['vector_size']
         model = Doc2Vec(vector_size=num_features,
                         workers=mp.cpu_count() / 2,
                         alpha=self._modelos[modelo]['alpha'],
                         min_alpha=self._modelos[modelo]['min_alpha'])
         # Build the corpus vocabulary to train the Doc2Vec model
         model.build_vocab(corpus_train)
         # Train the Doc2Vec model
         model.train(corpus_train,
                     total_examples=model.corpus_count,
                     epochs=model.epochs)
     else:
         print(f'The model "{modelo}" has not been implemented.')
         return
     # Save the trained model
     model.save(self._arqs['modelos'][modelo])
     # Define the corpus for the similarity matrix
     if modelo == 'doc2vec': corpus = Doc2VecCorpus(model)
     else: corpus = model[corpus_train]
     # Build the index from the serialised model
     index = Similarity(output_prefix=self._arqs['indices'][modelo],
                        corpus=corpus,
                        num_features=num_features)
     # Save the index
     index.save(self._arqs['indices'][modelo])
def main():
    parser = ArgumentParser(
        description=
        'wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models. Please see the gensim documentation for more information.'
    )
    parser.add_argument('-ds',
                        '--dataset',
                        default='wiki',
                        help='What kind of dataset to use. (wiki,es,file)')
    parser.add_argument('-d',
                        '--dump-file',
                        help='Wiki: bz2 dump file with wiki in it')
    parser.add_argument('-l',
                        '--limit',
                        help='Wiki: How many documents to extract from wiki')
    parser.add_argument('--model-id',
                        default='model',
                        help='Filename for created model.')
    parser.add_argument(
        '--model-type',
        default='lsi',
        help='Model type (lsi, lda, word2vec, hdp, vocabulary).')
    parser.add_argument('--n-topics',
                        default=10,
                        help='Number of topics to model.')
    parser.add_argument('--n-passes',
                        default=1,
                        help='Number of passes for LDA  model.')
    parser.add_argument('--w2v-size',
                        default=100,
                        help='size of Word2Vec context.')
    parser.add_argument('--w2v-window', default=5, help='window for Word2Vec.')
    parser.add_argument('-q',
                        '--query',
                        default=None,
                        help='Elasticsearch: Query to use to fetch documents')
    parser.add_argument('--index', help='Elasticsearch: index to read from.')
    parser.add_argument('--doc_type',
                        default='doc',
                        help='Elasticsearch: data type in index.')
    parser.add_argument(
        '--data-dir',
        help='Directory to save the generated models and vocabularies into.')
    parser.add_argument(
        '--vocab',
        help=
        'Prebuilt Vocabulary file. Use this to avoid having to generate one.')

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ['lsi', 'lda', 'word2vec', 'hdp', 'vocabulary']:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ['es', 'wiki', 'file']:
        logging.error("Invalid dataset  type %s" % data_type)
        parser.print_usage()
        exit(-1)
    limit = None
    if opts.limit:
        limit = int(opts.limit)
    if not dump_fn and data_type in ['wiki']:
        logging.error('--dump-file required for wiki dataset')
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == 'es' and index is None:
        logging.error(
            "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)
    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = '%s_%s_%d' % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = '%s/%s' % (data_dir, model_fn)
    if model_type == 'word2vec':
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = '%s_w_%s_s_%s' % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == 'es':
        logging.info("Using data type %s with index %s, doc_type %s query %s" %
                     (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(read_index=index,
                                       read_doc_type=doc_type,
                                       query=query,
                                       normalize_func=normalize_es)
    elif data_type == 'wiki':
        logging.info("Using data type %s with dump_file %s and limit %s" %
                     (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn,
                                   num_articles=limit,
                                   normalize_func=normalize_wiki)
    elif data_type == 'file':
        logging.info("Using data type %s with dump_file %s and limit %s" %
                     (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn,
                              num_articles=limit,
                              normalize_func=normalize_file)
    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words('norwegian'))
    if not vocab_file or model_type == 'vocabulary':
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + '.vocab')
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == 'vocabulary':
        return
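    # A TfidfModel built from the dictionary alone derives its IDF weights from the dictionary's document frequencies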
    tfidf = TfidfModel(dictionary=vocab)
    if model_type == 'lsi':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus],
                         num_topics=n_topics,
                         id2word=vocab)
    elif model_type == 'lda':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus],
                         num_topics=n_topics,
                         passes=n_passes,
                         id2word=vocab)

    elif model_type == 'word2vec':
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == 'hdp':
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
Example #27
def train_lsi_model(corpus, dictionary):
    lsi = LsiModel(corpus=corpus, id2word=id2word(dictionary), num_topics=3, chunksize=10000,\
                   onepass=True)
    return lsi
Example #28
dicto = corpora.Dictionary(texts)
corpus = [dicto.doc2bow(text) for text in texts]

lsi_models = {}
lsi_similarity_indices = {}

start_time = time.time()

for chunksize in np.arange(5000, 30001, 5000):
    print('Chunksize: {}'.format(chunksize))
    iter_start_time = time.time()

    lsi = LsiModel(corpus,
                   id2word=id2token,
                   num_topics=50,
                   chunksize=chunksize,
                   onepass=False,
                   power_iters=2)

    lsi_models[chunksize] = lsi
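    # Note: num_features below is 100 although the LSI vectors from num_topics=50 have 50 dimensions; matching num_features to num_topics is the usual choice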
    lsi_similarity_indices[chunksize] = similarities.MatrixSimilarity(
        lsi[corpus], num_features=100)
    print('{} seconds'.format(int(time.time() - iter_start_time)))

run_time = int((time.time() - start_time) / 60)
print('Parameter search took {} minutes.'.format(run_time))

with open('lsi_models_num_topics_chunksize.pickle', 'wb') as f:
    pickle.dump(lsi_models, f)
print('Models saved.')
Example #29
def run():
  try:
    print("starting to build LSI Model")

    start = datetime.now()
    documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
    number_of_documents = len(documents)
    print("number_of_documents:", number_of_documents)

    texts = [tokenize(document) for document in documents]

    counter = Counter()
    for text in texts:
        counter.update(text)

    texts = [[token for token in text if counter[token] > 1] for text in texts]

    print("texts:", len(texts), texts[:5])

    dictionary = Dictionary(texts)
    #print "dictionary:", dictionary
    dictionary.save(path_to_directory_of_this_file + "/dictionary")

    corpus = [dictionary.doc2bow(text) for text in texts]
    print("corpus:", type(corpus))

    print("generating lsi model")
    
    lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
    print("saving LSI model")
    lsi.save(path_to_directory_of_this_file + "/model")


    # nullifying all topics on features and places
    Feature.objects.exclude(topic=None).update(topic=None)
    Place.objects.exclude(topic=None).update(topic=None)

    Topic.objects.all().delete()
    print("deleted all topics")
    topics = []
    for topic in lsi.show_topics():
        topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))

    Topic.objects.bulk_create(topics)
    print("bulk created all topics")


    """
    # re-create topics for all features in database
    for feature in Feature.objects.exclude(text=None).exclude(text=""):
        words = tokenize(feature.text)
        if words:
            probabilities = lsi[dictionary.doc2bow(words)]
            if probabilities:
                topic_id = sorted(probabilities, key=lambda tup: -1*tup[1])[0][0]
                if topic_id:
                    feature.topic_id = topic_id
                    feature.save()

    # assign as topic to each place based on most popular topic found in features
    for place_id in Place.objects.exclude(featureplace=None).values_list("id", flat=True):
        counter = Counter(Feature.objects.filter(featureplace__place_id=place_id).values_list("topic_id"))
        print "counter:", counter
    """


  except Exception as e:
    print(e)
Example #30
    # corpora.MmCorpus.serialize('../topic_model/corpus_dev_word_seg_tfidf', corpus_dev_word_seg_tfidf)
    # corpus_test_word_seg_tfidf = model.__getitem__(corpus_test_word_seg)
    # corpora.MmCorpus.serialize('../topic_model/corpus_test_word_seg_tfidf', corpus_test_word_seg_tfidf)

    corpus_train_word_seg_tfidf = corpora.MmCorpus('../topic_model/corpus_train_word_seg_tfidf')
    corpus_dev_word_seg_tfidf = corpora.MmCorpus('../topic_model/corpus_dev_word_seg_tfidf')
    corpus_test_word_seg_tfidf = corpora.MmCorpus('../topic_model/corpus_test_word_seg_tfidf')
    corpus_word_seg_tfidf = []
    corpus_word_seg_tfidf.extend(corpus_train_word_seg_tfidf)
    corpus_word_seg_tfidf.extend(corpus_dev_word_seg_tfidf)
    corpus_word_seg_tfidf.extend(corpus_test_word_seg_tfidf)
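    # The LSI model below is trained on the concatenated train/dev/test TF-IDF corpora and then applied to each split separately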


    # lsi
    print('Start training lsi...')
    lsi_model = LsiModel(corpus=corpus_word_seg_tfidf, id2word=dictionary_word_seg, num_topics=400)
    lsi_model.save('../topic_model/word_seg_lsi_model')
    corpus_train_word_seg_lsi = lsi_model[corpus_train_word_seg_tfidf]
    corpus_dev_word_seg_lsi = lsi_model[corpus_dev_word_seg_tfidf]
    corpus_test_word_seg_lsi = lsi_model[corpus_test_word_seg_tfidf]
    corpora.MmCorpus.serialize('../topic_model/corpus_train_word_seg_lsi', corpus_train_word_seg_lsi)
    corpora.MmCorpus.serialize('../topic_model/corpus_dev_word_seg_lsi', corpus_dev_word_seg_lsi)
    corpora.MmCorpus.serialize('../topic_model/corpus_test_word_seg_lsi', corpus_test_word_seg_lsi)

    #lda
    print('Start training lda...')
    lda_model = LdaModel(corpus=corpus_word_seg_tfidf, id2word=dictionary_word_seg, num_topics=100, update_every=1,
                         chunksize=1000, passes=1)
    lda_model.save('../topic_model/word_seg_lda_model')
    corpus_train_word_seg_lda = lda_model[corpus_train_word_seg_tfidf]
    corpus_dev_word_seg_lda = lda_model[corpus_dev_word_seg_tfidf]