def lsi_transform(self, corpus_tf_idf):
        logger.info('Training LSI model with n_dims=%d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        self.lsiModel = LsiModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)
        # print self.lsiModel[corpus]

        conf.mk_dir(self.lsiPath)

        self.lsiModel.save(self.lsiPath)
        logger.info('Lsi model has been saved in %s.' % self.lsiPath)

        lsi_corpus = self.lsiModel[corpus_tf_idf]
        lsi_corpus_path = conf.get_filename_via_tpl('lsi',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims,
                                                    postfix='mm')
        conf.mk_dir(lsi_corpus_path)
        corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
        logger.info('Lsi corpus with a shape of %s has been saved in %s.' %
                    (np.array(lsi_corpus).shape, lsi_corpus_path))

        return lsi_corpus
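
For reference, here is a minimal, self-contained sketch (not from the project above) of the same tf-idf to LSI pipeline on a toy corpus, using plain gensim calls:

from gensim import corpora
from gensim.models import TfidfModel, LsiModel

toy_texts = [["human", "computer", "interaction"],
             ["graph", "trees", "computer"],
             ["graph", "minors", "trees"]]
toy_dict = corpora.Dictionary(toy_texts)
toy_bow = [toy_dict.doc2bow(t) for t in toy_texts]
toy_tfidf = TfidfModel(toy_bow)
toy_lsi = LsiModel(toy_tfidf[toy_bow], id2word=toy_dict, num_topics=2)
for vec in toy_lsi[toy_tfidf[toy_bow]]:
    print(vec)  # [(topic_id, weight), ...]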
Example #2
File: lsi.py Project: ilanfri/kpmg1
def createLsiModelforCorpus(corpusfile, dictfile, numtop):
    print("\nLoading dictionary...")
    dictionary = corpora.Dictionary.load_from_text(dictfile)
    print(dictionary)
    print("\nLoading corpus...")
    corpus = corpora.MmCorpus(corpusfile)
    print(corpus)
    print("\nPerforming Latent Semantic Indexing...")
    lsi = LsiModel(corpus=corpus, num_topics=numtop, id2word=dictionary, distributed=False)
    ## This is the fancy stochastic (aka truncated) SVD; however, it throws runtime memory errors for me (e.g. segmentation fault)
    #lsi = stochastic_svd(corpus, rank=100, num_terms=args.ntopics)
    corpustopics = lsi.show_topics(num_words=10, log=True, formatted=False)

    rootdir = os.getcwd()
    foldername = 'lsi_output'
    folderpath = os.path.join(rootdir, foldername)
    if os.path.exists(folderpath):
        shutil.rmtree(folderpath)
    os.makedirs(folderpath)
    os.chdir(folderpath)
    lsimodelfile = str(corpusfile).replace('.mm', '') + '_lsi.model'
    lsi.save(lsimodelfile)
    filename1 = str(corpusfile).replace('.mm', '') + '_lsi_topics.pkl'
    with open(filename1, 'wb') as output:
        pickle.dump(corpustopics, output)
    os.chdir(rootdir)

    return corpustopics, lsi
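
A hedged usage sketch for the function above; the corpus and dictionary file names here are hypothetical:

# topics, lsi = createLsiModelforCorpus('docs.mm', 'docs_wordids.txt', 10)
# reloaded = LsiModel.load('lsi_output/docs_lsi.model')
# print(reloaded.show_topics(num_topics=5, num_words=10))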
Example #3
    def lsi_model(self, num_topics: int = 10, stochastic: bool = False):
        """
        Construct LSI topic models for each year in a
        corpus, given a set of parameters.
        """

        if self.word_to_id is None or self.corpora is None:
            self.build_dictionaries_and_corpora()

        if self.tf_idf_models is None:
            self.build_tf_idf_models()

        results = num_dict(self.year_list)

        # onepass=False (stochastic=True) selects gensim's multi-pass stochastic SVD
        for year in self.year_list[:-1]:
            results[year] = \
                LsiModel(corpus=self.tf_idf_models[year][self.corpora[year]],
                         id2word=self.word_to_id[year],
                         num_topics=num_topics,
                         onepass=not stochastic
                         )

        return TopicResults(results, self.num_docs)
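
For context, gensim's `onepass` flag (True by default) selects the single-pass algorithm, while `onepass=False`, as in the stochastic branch above, switches to the multi-pass stochastic SVD whose accuracy can be tuned with `power_iters`. A toy sketch:

from gensim import corpora
from gensim.models.lsimodel import LsiModel

docs = [["alpha", "beta"], ["beta", "gamma"], ["gamma", "alpha"]]
d = corpora.Dictionary(docs)
bow = [d.doc2bow(t) for t in docs]
# multi-pass stochastic SVD, matching the stochastic=True branch above
lsi_stochastic = LsiModel(bow, id2word=d, num_topics=2, onepass=False, power_iters=2)
print(lsi_stochastic.show_topics(formatted=True))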
Example #4
def train_model(filename, output_name, data=None):
    output = data if data is not None else {}  # avoid a mutable default argument

    output['dataset'] = filename
    output['output_name'] = output_name

    df = pd.read_csv('./data/dataset/%s' % filename)
    lemmas_list = []

    for lemmas in df['lemmas']:
        lemmas = str(lemmas)
        lemmas = lemmas.replace('[', '').replace(']', '').replace(',', '').replace('\'', '')
        lemmas_list.append(lemmas.split())

    dictionary = corpora.Dictionary(lemmas_list)
    make_dir('./data/dicts/')
    dictionary.save('./data/dicts/%s_corpus.dict' % output_name)

    output['dict'] = '%s_corpus.dict' % output_name

    clean_doc = [dictionary.doc2bow(text) for text in lemmas_list]

    tfidf = models.TfidfModel(clean_doc, normalize=True)

    lsi = LsiModel(corpus=tfidf[clean_doc], id2word=dictionary, num_topics=200)
    make_dir('./data/models')
    lsi.save('./data/models/%s_model.txt' % output_name)
    output['model'] = '%s_model.txt' % output_name

    return output
Example #5
 def lsi(self):
     self.tf_idf()
     if self.corpus_tf_idf and self.dictionary:
         self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
         self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
         print(self.lsi_model.print_topic(1))  # topics are 0-indexed; with num_topics=2 the last topic is 1
     elif self.corpus_tf_idf:
         self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
         self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
Example #6
	def encoder_lsi(self, num_components=100, chunksize=500, is_tfidf=False):
		"""
		
		"""

		self.num_components = num_components
		# Train LSI based on training dataset
		self.lsi = LsiModel(corpus=self.training_corpus, id2word=self.dictionary, \
		                           num_topics=num_components, chunksize=chunksize) # initialize an LSI transformation
		# Convert bow into LSI projections
		self.corpus_lsi = self.lsi[self.training_corpus]
Example #7
    def train(self, tokens):
        """ Trains the LSI model

        Parameters
        ----------
        tokens: list of list of str
            e.g. [['hi', 'ho'], ['my', 'name', ...], ...]

        """
        self.fill_dictionary(tokens)
        corpus = self.to_corpus(tokens)
        self.tfidf = TfidfModel(corpus)
        corpus = self.tfidf[corpus]
        self.lsi = LsiModel(corpus, num_topics=self.num_topics)
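
Once trained, new documents are typically folded into the LSI space by chaining the stored models; a hedged sketch against the wrapper above (construction details omitted, method names as defined there):

# trainer.train([["hi", "ho"], ["my", "name", "is"]])
# new_bow = trainer.to_corpus([["hi", "name"]])
# new_lsi_vecs = trainer.lsi[trainer.tfidf[new_bow]]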
Example #8
    def load_model(self, model_type):
        model = None
        try:
            if model_type == 'tfidf':
                model = TfidfModel.load(self.tfIdfPath, mmap='r')
                self.tfIdfModel = model
            elif model_type == 'lsi':
                model = LsiModel.load(self.lsiPath, mmap='r')
                self.lsiModel = model
            elif model_type == 'lda':
                model = LdaModel.load(self.ldaPath, mmap='r')
                self.ldaModel = model
            elif model_type == 'w2v':
                model = Word2Vec.load(self.w2vPath, mmap='r')
                self.w2vModel = model
            else:
                logger.error('Model type error. Unexpected %s' % model_type)
                return None

            if self.dictionary is None and os.path.exists(self.dictPath):
                self.dictionary = corpora.Dictionary.load(self.dictPath)

            logger.info('%s model loaded successfully.' % model_type)
        except IOError:
            logger.error(
                'The %s model doesn\'t exist. Please train the model before loading it.'
                % model_type)
        finally:
            return model
Example #9
def compute_lda():
    # from gensim.models.ldamulticore import LdaMulticore
    from gensim.models.lsimodel import LsiModel

    keys, unstem_map, paragraph_lengths, int2word, word2int = compute_all_words()

    try:
        len(corpus)  # `corpus` is assumed to be defined at module scope
    except TypeError:
        # streaming corpus without __len__: run through it once
        for doc in iter(corpus):
            pass

    host = os.environ.get('pyro_ns_host', None)
    port = int(os.environ.get('pyro_ns_port', 0)) or None

    tfidf = compute_tfidf()
    with time_code('compute_lda'):
        corpus_tfidf = tfidf[corpus]
        # NB: despite the function name, this trains an LSI model (see the import above)
        lda = LsiModel(corpus_tfidf,
                       num_topics=500,
                       id2word=int2word,
                       distributed=True,
                       ns_conf=dict(
                           host=host,
                           port=port,
                           broadcast=port and host,
                       ))
        # lda = LdaMulticore(corpus_tfidf, num_topics=500, id2word=int2word, workers=None)

    return lda
Example #10
def getLsiFeature(documents, topicNum):
    '''
     Function:
         generate lsi features by training lsi model
     Input:
         documents: list of preprocessed sentences
         topicNum: output vector dimension
     Output:
         lsi features(DataFrame format)
    '''
    # get corpus
#     LogInfo(' Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpusD = [dictionary.doc2bow(text) for text in texts]
    
    # train lsi model
#     LogInfo(' Train LSI model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]  # computed but unused: the model below trains on raw counts
    model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000, extra_samples=100)  # distributed=True is also possible

    # generate lsi features
    LogInfo(' Generate LSI features...')
    lsiFeature = np.zeros((len(texts), topicNum))
    for i, doc in enumerate(corpusD):
        topic = model[doc]
        for t in topic:
            lsiFeature[i, t[0]] = round(t[1], 5)
    colName = getColName(topicNum, "qlsi")
    lsiFeature = pd.DataFrame(lsiFeature, columns = colName)
    return lsiFeature
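
The manual feature loop above can be expressed more compactly with gensim's corpus2dense; a self-contained toy sketch:

from gensim import corpora
from gensim.matutils import corpus2dense
from gensim.models.lsimodel import LsiModel

texts = [["a", "b"], ["b", "c"], ["c", "a"]]
dct = corpora.Dictionary(texts)
bows = [dct.doc2bow(t) for t in texts]
lsi_toy = LsiModel(bows, num_topics=2)
dense = corpus2dense(lsi_toy[bows], num_terms=2).T  # shape (num_docs, num_topics)
print(dense.shape)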
Example #11
def lsi(documents, topicNum):
	texts = [[word for word in document.split(' ')] for document in documents]
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+str(len(texts)))
	dictionary = corpora.Dictionary(texts)
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' get corpus..')
	corpusD = [dictionary.doc2bow(text) for text in texts]
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' tfidf Model...')
	tfidf = TfidfModel(corpusD)
	corpus_tfidf = tfidf[corpusD]  # computed but unused below

	model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000, extra_samples=100)  # distributed=True is also possible

	lsiFeature = np.zeros((len(texts), topicNum))
	print('translate...')
	i = 0

	for doc in corpusD:
		topic = model[doc]
		
		for t in topic:
			lsiFeature[i, t[0]] = round(t[1], 5)
		i = i + 1
		if i%1000 == 1:
			print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+str(i))

	return lsiFeature
Example #12
def fit_lda(X, vocab, num_topics=50, passes=1):
    """ Fit a topic model from a scipy CSR matrix (X).
    Despite the name, this builds a gensim LsiModel, not LDA. """
    print('fitting lsi...')
    return LsiModel(gensim.matutils.Sparse2Corpus(X, documents_columns=False),
                    num_topics=num_topics,
                    chunksize=10000,
                    id2word=vocab)
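
A self-contained sketch of the CSR-to-gensim-corpus conversion used above (toy data, hypothetical vocabulary):

import numpy as np
from scipy.sparse import csr_matrix
from gensim.matutils import Sparse2Corpus
from gensim.models.lsimodel import LsiModel

X = csr_matrix(np.array([[1, 0, 2], [0, 1, 1]]))  # 2 documents x 3 terms
vocab = {0: "apple", 1: "banana", 2: "cherry"}
corpus = Sparse2Corpus(X, documents_columns=False)  # rows are documents
model = LsiModel(corpus, num_topics=2, id2word=vocab)
print(model.show_topics(formatted=True))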
Example #13
 def generate_docs_lsi(self,
                       dictionary_file_path,
                       tfidf_file_path,
                       lsi_file_path,
                       num_topics=100):
     """
     生成文档库lsi降维文件
     :param dictionary_file_path:
     :param tfidf_file_path:
     :return:
     """
     try:
         dictionary = corpora.Dictionary.load(dictionary_file_path)
         tfidf_corpus = corpora.MmCorpus(tfidf_file_path)
         print tfidf_corpus
         lsi = LsiModel(corpus=tfidf_corpus,
                        id2word=dictionary,
                        num_topics=100)
         # lsi.print_topics(10)
         with open(lsi_file_path, 'wb') as f:
             pickle.dump(lsi, f)
         logger.info('lsi model file building finished')
         # doc_lsi = lsi[doc_bow]
     except Exception as e:
         logger.error(
             'generate documents library lsi model file failed for %s' %
             str(e))
Example #14
    def load(self, path='default'):
        """
        :param path: the path of trained model.
        :return:
        """
        if path == 'default':
            path = 'model'
        file_list = os.listdir(path)
        for file in file_list:
            if file.endswith('.model'):
                self.model_name = file.split('.')[0]
        if self.model_name == 'lda':
            self.model = LdaModel.load(str(path + '/lda.model'))
        if self.model_name == 'lsi':
            self.model = LsiModel.load(str(path + '/lsi.model'))
        if self.model_name == 'hdp':
            self.model = HdpModel.load(str(path + '/hdp.model'))

        self.id2word = self.model.id2word

        def _load_pickle(filename):
            # helper: all artifacts are plain pickle files next to the model
            with open(str(path + '/' + filename), 'rb') as f:
                return pickle.load(f)

        self.original_data = _load_pickle('original_data.pickle')
        self.text = _load_pickle('text.pickle')
        self.token = _load_pickle('token.pickle')
        self.corpus = _load_pickle('corpus.pickle')

        path = path + '/result'
        self.topic_key = _load_pickle('topic_key.pickle')
        self.doc_topic = _load_pickle('doc_topic.pickle')
        self.topic_doc = _load_pickle('topic_doc.pickle')
        self.topic_sent = _load_pickle('topic_sent.pickle')

        if self.model_name == 'hdp':
            self.num_topics = self.topic_doc.shape[0]
        else:
            self.num_topics = self.model.num_topics
        #self.iterations = self.model.iterations
Example #15
def run():
  try:
    print("starting to build LSI Model")

    start = datetime.now()
    documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
    number_of_documents = len(documents)
    print("number_of_documents:", number_of_documents)

    stopwords = []
    stopwords += [month.lower() for month in month_to_number.keys()]
    stopwords += nltk_stopwords.words('english')
    print("stopwords:", len(stopwords))
    with open(path_to_directory_of_this_file + "/stopwords.txt", encoding="utf-8") as f:
        stopwords.extend([word for word in f.read().split("\n") if word and not word.startswith("#")])
    stopwords = set(stopwords)

    texts = [[word for word in document.lower().replace("#"," ").replace("_"," ").replace("("," ").replace(")"," ").replace("/"," ").replace(":"," ").replace("."," ").split() if word not in stopwords and len(word) > 3] for document in documents]

    counter = Counter()
    for text in texts:
        counter.update(text)

    texts = [[token for token in text if counter[token] > 1] for text in texts]

    dictionary = Dictionary(texts)
    print("dictionary:", dictionary)
    dictionary.save(path_to_directory_of_this_file + "/dictionary")

    corpus = [dictionary.doc2bow(text) for text in texts]
    print("corpus:", type(corpus))

    print("generating lsi model")

    lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
    print("saving LSI model")
    lsi.save(path_to_directory_of_this_file + "/model")

    Topic.objects.all().delete()
    topics = []
    for topic in lsi.show_topics():
        topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))

    Topic.objects.bulk_create(topics)

  except Exception as e:
    print(e)
Example #16
def lsi_similarity(cps, cps1, cps2, dic):
    # compute the LSI similarity between the s1 and s2 term-frequency corpora
    print("starting lsi similarity....")
    lsi = LsiModel(corpus=cps, num_topics=100, id2word=dic)
    s1_lsi = lsi[cps1]
    s2_lsi = lsi[cps2]
    sm = similarities.MatrixSimilarity(corpus=s1_lsi, num_features=lsi.num_topics)
    lsi_sm = np.diag(sm[s2_lsi])
    return lsi_sm
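
A self-contained sketch of the same similarity pattern on a toy corpus:

from gensim import corpora, similarities
from gensim.models.lsimodel import LsiModel

texts = [["cat", "dog"], ["dog", "fish"], ["fish", "cat"]]
dic = corpora.Dictionary(texts)
cps = [dic.doc2bow(t) for t in texts]
lsi_demo = LsiModel(corpus=cps, num_topics=2, id2word=dic)
index = similarities.MatrixSimilarity(lsi_demo[cps], num_features=lsi_demo.num_topics)
print(index[lsi_demo[dic.doc2bow(["cat", "fish"])]])  # cosine similarity to each document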
Example #17
 def from_text_files_in_path(self, path, extension=".txt"):
     doc_id = 0
     for tokens in self.training_documents_from_path(path, extension):
         document = {'id': "doc_" + str(doc_id), 'tokens': tokens}
         doc_id = doc_id + 1
         if self.model:
             self.model.add_documents(document)
         else:
             self.model = LsiModel(document)
     return self.model
Example #18
 def representation(self):
     if not self.model:
         print("LOAD MODEL...")
         self.model = LsiModel.load(
             os.path.join(self.preprocessor.source.path,
                          self.preprocessor.source.info + '.model'))
         self.dictionary = Dictionary.load(
             os.path.join(self.preprocessor.source.path,
                          self.preprocessor.source.info + '.dic'))
     pass
Example #19
def main():
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except Exception:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except Exception:
        vector_model = LsiModel(corpus=RCV1BowCorpus(),
                                num_topics=100,
                                id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """
        Must return either numpy array or dictionary
        """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train,
                           train_targets=rcv1_train_target,
                           get_features=get_lsi_features,
                           classifier="sgd")

    evaluate_classifier(clf,
                        rcv1_test,
                        rcv1_test_target,
                        get_features=get_lsi_features)
Example #20
 def __init__(self):
     self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
     self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
     self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
     self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
     self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
     self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
     self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
     self.job_labels = {
         int(k): v
         for k, v in (line.split("=") for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
     }
Example #21
    def get_lsa_model(self, n_topics=50, recalculate=False, from_scratch=True):

        filepath = self.paths.get_lsa_filepath(n_topics)

        if not os.path.isfile(filepath) or recalculate:

            if not from_scratch:
                raise ValueError('No LSA file exists but from_scratch is False')

            trigram_dictionary = self.lda_builder.get_corpus_dict()
            trigram_bow_corpus = self.lda_builder.get_trigram_bow_corpus(trigram_dictionary)

            print('Building LSA model...')
            lsi = LsiModel(trigram_bow_corpus, id2word=trigram_dictionary, num_topics=n_topics)

            lsi.save(filepath)
            print('LSA model (n_topics={}) written to {}'.format(n_topics, filepath))
        else:
            print('Loading LSA model (n_topics={})...'.format(n_topics))
            lsi = LsiModel.load(filepath)

        return lsi
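
Saved models can also be extended instead of rebuilt; a small sketch of gensim's LsiModel.add_documents on toy data:

from gensim import corpora
from gensim.models.lsimodel import LsiModel

texts = [["spam", "eggs"], ["eggs", "ham"]]
dct = corpora.Dictionary(texts)
bow = [dct.doc2bow(t) for t in texts]
model = LsiModel(bow, id2word=dct, num_topics=2)
model.add_documents([dct.doc2bow(["ham", "spam"])])  # updates the decomposition in place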
Example #22
def cluster(sentences):

    my_stop_words = {'okay', 'don', 've', 'didn', 'know', 'think', 'really'}

    corpus = [c['text'].replace("%hesitation", "").lower() for c in sentences]

    corpus = np.array(corpus)
    tf_vectorizer = TfidfVectorizer(decode_error='ignore',
                                    max_df=0.7,
                                    stop_words=my_stop_words.union(stop_words),
                                    ngram_range=(1, 1))

    tf_mat = tf_vectorizer.fit_transform(corpus)
    id2word = {i: s for i, s in enumerate(tf_vectorizer.get_feature_names())}
    n_topics = 5

    lsi = LsiModel(matutils.Sparse2Corpus(tf_mat.T),
                   num_topics=n_topics,
                   id2word=id2word,
                   onepass=False)
    gs_lsi_mat = lsi[matutils.Sparse2Corpus(tf_mat.T)]
    lsi_mat = matutils.corpus2dense(gs_lsi_mat, n_topics).T
    norm = Normalizer(copy=False)
    lsi_mat = norm.fit_transform(lsi_mat)

    valid_indices = np.where(lsi_mat.any(axis=1))[0]
    valid_sent = lsi_mat[valid_indices]

    n_clusters = 7

    cluster = KMeans(n_clusters, n_init=100)
    cluster.fit(valid_sent)

    clusters = {}
    for i in range(n_clusters):
        clusters[i] = np.where(cluster.labels_ == i)[0]

    for i in list(clusters.keys()):  # copy the keys so we can delete while iterating
        if np.sum(
                np.square(valid_sent[clusters[i]] - cluster.cluster_centers_[i]
                          )) > cluster.inertia_ / n_clusters:
            del clusters[i]

    last_cluster = [
        valid_indices[clusters[i][np.where(
            np.sum(np.square(valid_sent[clusters[i]] -
                             cluster.cluster_centers_[i]),
                   axis=1) < cluster.inertia_ / len(corpus))]].tolist()
        for i in clusters
    ]
    return last_cluster
Example #23
class MyModel:
    def __init__(self, dict_file=None, corpus_model=None, corpus_file=None):
        self.dict_file = dict_file
        self.dictionary = None
        self.corpus = None

        if dict_file is not None:
            self.dictionary = corpora.Dictionary.load(dict_file)
        if corpus_model:
            self.corpus = corpus_model
        elif corpus_file:
            self.corpus = corpora.MmCorpus(corpus_file)

        self.tf_idf_model = None
        self.corpus_tf_idf = None
        self.lsi_model = None
        self.corpus_lsi = None

        self.lda_model = None
        self.corpus_lda = None

    def tf_idf(self):
        self.tf_idf_model = models.TfidfModel(corpus=self.corpus, normalize=True)
        # corpus_vector = [vector for vector in self.corpus]
        self.corpus_tf_idf = self.tf_idf_model[self.corpus]

    def lsi(self):
        self.tf_idf()
        if self.corpus_tf_idf and self.dictionary:
            self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
            self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]
            print(self.lsi_model.print_topic(1))  # topics are 0-indexed; with num_topics=2 the last topic is 1
        elif self.corpus_tf_idf:
            self.lsi_model = LsiModel(self.corpus_tf_idf, num_topics=2)
            self.corpus_lsi = self.lsi_model[self.corpus_tf_idf]

    def lda(self):
        self.lda_model = models.LdaModel(corpus=self.corpus)
        self.corpus_lda = self.lda_model[self.corpus]

    def add_document_lsi(self, addition_corpus_tf_idf, addition_vector_tf_idf):
        self.lsi_model.add_documents(addition_corpus_tf_idf)
        lsi_vector = self.lsi_model[addition_vector_tf_idf]
        return lsi_vector

    def save_lsi(self, name='/serialise/model.lsi'):
        self.lsi_model.save(name)

    def save_lda(self, name='/serialise/model.lda'):
        self.lda_model.save(name)

    @staticmethod
    def load_lsi(name='/tmp/model.lsi'):
        my_model = MyModel()
        my_model.lsi_model = models.LsiModel.load(name)
        return my_model
Example #24
    def __create_model(self, algo, topic_qtt):
        model = None

        if (algo == TopicModelingAlgorithm.LDA):
            model = LdaModel(corpus=self.__corpus,
                             num_topics=topic_qtt,
                             id2word=self.__id2_words,
                             random_state=1)
        elif (algo == TopicModelingAlgorithm.LSA):
            model = LsiModel(corpus=self.__corpus,
                             num_topics=topic_qtt,
                             id2word=self.__id2_words)
        elif (algo == TopicModelingAlgorithm.NMF):
            model = Nmf(corpus=self.__corpus,
                        num_topics=topic_qtt,
                        random_state=1)

        return model
Example #25
 def __getitem__(self, modelo):
     '''
     Returns the corresponding model.
     Parameters:
         modelo (str) --> Model selector; one of "tfidf", "tfidf_pivot", "lsi", "lda" or "doc2vec"
     Returns: the requested model, if it exists
     '''
     if not os.path.isfile(self._arqs['modelos'][modelo]):
         print(f'The model "{modelo}" has not been implemented or built.')
         return None
     if modelo in ['tfidf', 'tfidf_pivot']:
         model = TfidfModel.load(self._arqs['modelos'][modelo])
     elif modelo == 'lsi':
         model = LsiModel.load(self._arqs['modelos'][modelo])
     elif modelo == 'lda':
         model = LdaModel.load(self._arqs['modelos'][modelo])
     elif modelo == 'doc2vec':
         model = Doc2Vec.load(self._arqs['modelos'][modelo])
     return model
Example #26
    def train(self, path, num_topics=20, iterations=1000, n_gram=True, lemmatization=True, stop_words=True, tfidf=True,
              model='lda'):
        """
        Train the topic cluster model.
        Input value: data: pd.DataFrame format ['id','title','content','summary']
                     num_topics: (int) the number of topics
                     iterations: (int) total number of iteration times
        example:
        >>> lda = LDA_Model
        >>> lda.train(text)
        """
        data = load_data(str(path + '/output/data.csv'))
        self.original_data = data
        self.text = list(data['content'])
        self.num_topics = num_topics
        self.iterations = iterations
        self.model_name = model

        print('preprocessing...')
        self.token = self._preprocess(self.text, lemma=lemmatization, stop_words=stop_words)

        self.id2word = Dictionary(self.token)
        self.corpus = [self.id2word.doc2bow(text) for text in self.token]
        if tfidf == True:
            print('calculate tfidf...')
            tfidf_model = TfidfModel(self.corpus)
            self.corpus = tfidf_model[self.corpus]

        if model == 'lda':
            self.model = LdaModel(corpus=self.corpus, id2word=self.id2word, num_topics=self.num_topics,
                                  iterations=self.iterations)
        if model == 'lsi':
            self.model = LsiModel(corpus=self.corpus, id2word=self.id2word, num_topics=self.num_topics)
        if model == 'hdp':
            self.model = HdpModel(corpus=self.corpus, id2word=self.id2word)
            self.num_topics = self.model.get_topics().shape[0]

        self.topic_key = pd.DataFrame(self._topic_key(), columns=['topic_id', 'key_words'])
        self.doc_topic = self._doc_topic()
        self.topic_doc = pd.DataFrame(self._topic_doc(), columns=['topic_id', 'document_id'])
        self.topic_sent = pd.DataFrame(self._readable_topic(), columns=['topic_id', 'most relative sentence'])
Example #27
    def build_similarity(self, corpus: List[tuple], model='tfidf') -> None:
        """
        Builds a similarity model for a bag of words corpus
        :param corpus: to build the similarity model
        :param model: strategy
        """

        from gensim.models.tfidfmodel import TfidfModel
        from gensim.models.lsimodel import LsiModel
        from gensim import similarities

        self.dictionary.compactify()

        if model == 'tfidf':
            self.model = TfidfModel(corpus, id2word=self.dictionary)
        elif model == 'lsi':
            # todo: remove magic number
            self.model = LsiModel(corpus,
                                  id2word=self.dictionary,
                                  num_topics=2)

        feature_cnt = len(self.dictionary.token2id)
        self.index = similarities.SparseMatrixSimilarity(
            self.model[corpus], num_features=feature_cnt)
Example #28
parser.add_argument("--nwords","-nw",help="Input desired number of words to show per topic",default=10, required=False, type=int)


args = parser.parse_args()

start_time=time.time()

print "\nLoading dictionary..."
dict = corpora.Dictionary.load_from_text(args.dict)
print(dict)
print "\nLoading corpus..."
corpus = corpora.MmCorpus(args.corpus)
print(corpus)

print "\nPerforming Latent Semantic Indexing..."
lsi = LsiModel(corpus=corpus, num_topics=args.ntopics, id2word=dict, distributed=False)

## This is the fancy stochastic (aka truncated) SVD, however it throws runtime memory errors for me (e.g. segmentation fault)
#lsi = stochastic_svd(corpus,rank=100,num_terms=args.ntopics)

#if len(args.query)!=1:
#print corpus[args.query]
queryresult = lsi[corpus[args.query]]
sortedqueryresult = sorted(list(queryresult), key=lambda query: abs(query[1]), reverse=True)
#screenqueryresult = sorted(list(queryresult), key=itemgetter(1))

#screenoutput = lsi.print_topics(num_topics=10, num_words=1)
#output = lsi.print_topics(num_topics=10, num_words=10)
#print "\nResult:"
#pp.pprint(screenoutput)
#lsi.save('lsi_result.txt')
Example #29
 def modelSelectionLSI(self): 
     """
     Lets find the optimal parameters for LSI for all fields. We see the optimal 
     number of parameters for the training set of experts. 
     """
    
     coverages = numpy.zeros((len(self.ks), len(self.minDfs), len(self.gammas), len(self.fields)))
     logging.getLogger('gensim').setLevel(logging.INFO) 
     maxK = numpy.max(self.ks)
     
     logging.debug("Starting model selection for LSI")       
    
     for t, minDf in enumerate(self.minDfs): 
         logging.debug("Using minDf=" + str(minDf))
         self.minDf = minDf
         
         self.vectoriseDocuments()
         self.loadVectoriser()
         corpus = gensim.corpora.mmcorpus.MmCorpus(self.docTermMatrixFilename + ".mtx")
         id2WordDict = dict(zip(range(len(self.vectoriser.get_feature_names())), self.vectoriser.get_feature_names()))
         
         logging.debug("Running LSI with " + str(maxK) + " dimensions")
         lsi = LsiModel(corpus, num_topics=maxK, id2word=id2WordDict, chunksize=self.chunksize, distributed=False, onepass=False)    
         
         for i, k in enumerate(self.ks): 
             lsi.num_topics = k
             logging.debug("Creating index")
             index = gensim.similarities.docsim.Similarity(self.indexFilename, lsi[corpus], num_features=k)
             
             for j, field in enumerate(self.fields): 
                 logging.debug("k="+str(k) + " and field=" + str(field))                
                 newX = self.vectoriser.transform([field])
                 newX = [(s, newX[0, s]) for s in newX.nonzero()[1]]
                 result = lsi[newX]             
                 similarities = index[result]
                 
                 for u, gamma in enumerate(self.gammas): 
                     self.gamma = gamma 
                     expertsByDocSimilarity, expertsByCitations = self.expertsFromDocSimilarities(similarities, len(self.trainExpertDict[field]), field)
                     
                     expertMatches = self.matchExperts(expertsByDocSimilarity, set(self.trainExpertDict[field]))
                     coverages[i, t, u, j] = float(len(expertMatches))/len(self.trainExpertDict[field])
             
             for u, gamma in enumerate(self.gammas):
                 logging.debug("Mean coverage for gamma=" + str(gamma) + " " + str(numpy.mean(coverages[i, t, u, :])))
         
     meanCoverages = numpy.mean(coverages, 3)
     logging.debug(meanCoverages)

     bestInds = numpy.unravel_index(numpy.argmax(meanCoverages), meanCoverages.shape)

     self.k = self.ks[bestInds[0]]
     logging.debug("Chosen k=" + str(self.k))

     self.minDf = self.minDfs[bestInds[1]]
     logging.debug("Chosen minDf=" + str(self.minDf))

     self.gamma = self.gammas[bestInds[2]]
     logging.debug("Chosen gamma=" + str(self.gamma))

     logging.debug("Coverage = " + str(numpy.max(meanCoverages)))

     return meanCoverages
     
     
Example #30
File: lsi.py Project: ilanfri/kpmg1
def main():

    start_time=time.time()

    rootdir=os.getcwd()
    foldername='lsi_output'
    folderpath=os.path.join(rootdir,foldername)
    if not os.path.exists(folderpath) or args.force:
        topics, lsi = createLsiModelforCorpus(args.corpus, args.dict, args.ntopics)
    else:
        os.chdir(folderpath)
        lsimodelfile=(str(args.corpus).replace('.mm',''))+'_lsi.model'
        topicsfile=(str(args.corpus).replace('.mm',''))+'_lsi_topics.pkl'
        modelpath=os.path.join(folderpath,lsimodelfile)
        topicspath=os.path.join(folderpath,topicsfile)
        lsi = LsiModel.load(modelpath)
        topics = pickle.load(open(topicspath, 'rb'))
        with open('lsi_corpus_topics.txt', 'w') as f:
            f.write(str(topics))
        os.chdir(rootdir)
        
    pp.pprint(lsi.show_topics(num_topics=args.ntopics, num_words=10, log=False, formatted=True))

    corpus = corpora.MmCorpus(args.corpus)

    if args.query!=-1:
        queryresult = lsi[corpus[args.query]]
        sortedqueryresult = sorted(list(queryresult), key=lambda query: abs(query[1]), reverse=True)
        print "\nSimilarity of document number {0} in corpus with corpus topics:".format(args.query)
        pp.pprint(sortedqueryresult)

    
    # Generate topic probability-document matrix, along with vector containing most probable topic (assumed to be the label) for each document
    #os.chdir(folderpath)
    outlabel_name = 'lsi_document_labels_{0}.txt'.format((args.corpus).replace('.mm',''))

    outtopic_name = 'lsi_topic_vectors_{0}.txt'.format((args.corpus).replace('.mm',''))

    outlabelpath=os.path.join(folderpath,outlabel_name)
    outtopicpath=os.path.join(folderpath,outtopic_name)
    if not os.path.exists(outlabelpath) or not os.path.exists(outtopicpath):

        outtopic = open(outtopic_name, 'w')
        outlabel = open(outlabel_name, 'w')

        for idx,doc in enumerate(corpus):
    
            tops = lsi[doc]
            doc_tops=[]
            for j in range(args.ntopics):
                search = [v[1] for v in tops if v[0] == j]

                if len(search)>0:
                    doc_tops.append(search[0])
                else:
                    doc_tops.append(0.)

            most_important = doc_tops.index(max(doc_tops))
            outlabel.write('{0}\n'.format(most_important))
            outtopic.write('\t'.join([str(d) for d in doc_tops])+'\n')

        outlabel.close()
        outtopic.close()

    shutil.move(outlabel_name,folderpath)
    shutil.move(outtopic_name,folderpath)


    #os.chdir(rootdir)
 
    end_time=time.time()
    runtime=end_time-start_time
    print "\nRuntime: {0} seconds\n".format(runtime)
    for chunksize in np.arange(10000, 10001, 10000):
        lsi_models[num_topics][chunksize] = {}
        lsi_similarity_indices[num_topics][chunksize] = {}

        for power_iters in np.arange(1, 2):
            lsi_models[num_topics][chunksize][power_iters] = {}
            lsi_similarity_indices[num_topics][chunksize][power_iters] = {}

            for onepass in np.arange(1):
                print('Number of topics: {}. Chunksize: {}. Number of power iterations: {}. One-pass: {}'
                      .format(num_topics, chunksize, power_iters, bool(onepass)))

                lsi = LsiModel(corpus,
                               id2word=id2token,
                               num_topics=num_topics,
                               chunksize=chunksize,
                               onepass=bool(onepass),
                               power_iters=power_iters)

                lsi_models[num_topics][chunksize][power_iters][onepass] = lsi
                lsi_similarity_indices[num_topics][chunksize][power_iters][onepass] = similarities.MatrixSimilarity(
                                                                                        lsi[corpus],
                                                                                        num_features=num_topics
                                                                                      )
run_time = int((time.time() - start_time) / 60)
print('Grid search took {} minutes.'.format(run_time))

with open('lsi_models.pickle', 'wb') as f:
    pickle.dump(lsi_models, f)
print('Models saved.')
Example #32
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    # load model and corpus
    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(result_path,
                                           p['run'], p['dict_extension']))

    model_path = path.join(result_path, p['run'], p['lsi_ext'])
    logger.info('load model from: %s' % model_path)
    lsi = LsiModel.load(model_path)
    pre = SaveLoad.load(path.join(result_path, p['run'], p['pre_model_ext']))

    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text,
                                        allow_update=False,
                                        return_missing=False)
                    for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (only pre model)')
    corpus_pre = pre[bow_lee_texts]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(path.join(base_path, p['human_data_file']))
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    max_topics = lsi.num_topics

    logger.info("iterate from %d to %d dimensions (stepsize: %d)" %
                (p['min_dim'], max_topics, p['dim_step']))

    iter_range = range(p['min_dim'], max_topics, p['dim_step'])
    res = np.zeros(len(iter_range))
    for k, l in enumerate(iter_range):

        # do the lower dimensionality transformation
        lsi.num_topics = l
        corpus_lsi = lsi[corpus_pre]

        # compute pairwise similarity matrix of transformed corpus
        sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                sim_matrix[i, j] = matutils.cossim(par1, par2)
        sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

        # compute correlations
        cor = np.corrcoef(sim_vector, human_sim_vector)
        logger.info("step %d: correlation with lee data: %f" % (k, cor[0, 1]))
        res[k] = cor[0, 1]

    plt.figure()
    plt.plot(iter_range, res)
    plt.savefig(os.path.join(output_dir, 'cor_plot.' + p['plot_extension']))
    plt.close()
    np.save(path.join(output_dir, 'model_dim_res.npy'), res)

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
Example #33
k = 40 # wanted number of topics


### SVD DECOMPOSITION (LSA) ##
### USING GENSIM #############
ans = input("Start Latent Semantic Analysis with Gensim ? ")
if ans != "y":
    exit()

from gensim.models.lsimodel import LsiModel
from gensim.matutils import Sparse2Corpus, corpus2dense

co = Sparse2Corpus(X, documents_columns = False)

lsi = LsiModel(corpus=co, num_topics=k)
list_topics = lsi.show_topics(formatted=False)
topics = [[(value, feature_names[int(key)]) for (value, key) in li] for li in list_topics]
print(topics)

genreMat = []

for genre in Genre.objects.all():
    index = filmsbygenre[genre.name]
    if index != []:
        obj = lsi[Sparse2Corpus(X[index, :], documents_columns = False)]
        E = corpus2dense(obj, k).transpose()
        genreMat.append( np.hstack([ [genre.name] , np.mean(E, axis = 0)]) )
    else:
        genreMat.append( np.hstack([ [genre.name] , np.zeros(k) ] ))
genreMat = np.vstack(genreMat)
Example #34
matrices = {}

logging.info('load the articles pickle')
with open(results_path + "sparql_wiki.pickle", 'rb') as f:
    articles = pickle.load(f)

logging.info('load the dictionary')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('load the log_ent model')
log_ent = LogEntropyModel.load(results_path + norm_model)

logging.info('load the LSI model')
lsi = LsiModel.load(results_path + trans_model)

for key in articles.keys():

    logging.info('current term: %s' % key)

    term_list = articles[key].keys()
    text_list = [dictionary.doc2bow(article['text'], allow_update=False, return_missing=False)
            for article in articles[key].values()]
    sim_matrix = np.zeros((len(text_list), len(text_list)))

    logging.info('transform the textlist')
    text_list = lsi[log_ent[text_list]]

    logging.info('compute similarity matrix')
    for i, par1 in enumerate(text_list):
Example #35
		if formula.find('=') == -1:
			print("invalid formula")
		else:
			query = ("SELECT sentence,sentence_id from sentences where sentence_id between %s and %s")
			cursor.execute(query, (sent_id - 1, sent_id))
			sent_list = cursor.fetchall()
			sentence = ''
			for sent in sent_list:
				sentence += ' '
				sentence += sent[0]
		yield dictionary.doc2bow(cleanSent(sentence).lower().split())

corpus = MyCorpus()
print(dictionary)

lsi = LsiModel(corpus, num_topics=50, id2word=dictionary)
print(lsi[doc_tfidf])  # project some document into LSI space
lsi.add_documents(corpus2)  # update LSI on additional documents
print(lsi[doc_tfidf])

lsi.show_topics(num_topics=-1, num_words=10, log=False, formatted=True)
print(lsi.projection.u)  # left singular vectors: one row per term, one column per topic


# finding embeddings of valid formulae


# rows of V (documents in topic space) are recovered by rescaling the
# projected corpus with the singular values lsi.projection.s:
V = gensim.matutils.corpus2dense(lsi[corpus], len(lsi.projection.s)).T / lsi.projection.s

import numpy as np
np.asarray
Example #36
from datetime import datetime
from date_extractor import month_to_number
from gensim.corpora import Dictionary
from gensim.models.lsimodel import LsiModel
from nltk.corpus import stopwords as nltk_stopwords
from os.path import dirname, realpath

try:
    path_to_directory_of_this_file = dirname(realpath(__file__))

    stopwords = []
    with open(path_to_directory_of_this_file + "/stopwords.txt", encoding="utf-8") as f:
        stopwords.extend([word for word in f.read().split("\n") if word and not word.startswith("#")])
    stopwords = set(stopwords)

    lsi = LsiModel.load(path_to_directory_of_this_file + "/model")
   
    dictionary = Dictionary.load(path_to_directory_of_this_file + "/dictionary")
except Exception as e:
    print(e)

def run(text):

    try:

        words = text.lower().replace("#"," ").replace("_"," ").replace("("," ").replace(")"," ").replace("/"," ").replace(":"," ").replace("."," ").split()
        words = [word for word in words if len(word) > 3 and word not in stopwords]

        if words:
            probabilities = lsi[dictionary.doc2bow(words)]
            if probabilities:
Example #37
new_vec = dictionary.doc2bow(new_doc.lower().split())
#print(new_vec)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
print(corpus)
# tfidf = models.TfidfModel(corpus)
# vec = [(0, 1), (4, 1)]
# print(tfidf[vec])
# index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
# sims = index[tfidf[vec]]
# print(list(enumerate(sims)))   
corpora.MmCorpus.save_corpus('file.mm', corpus)
#id2word= corpora.Dictionary.load('deerwester.dict')
mmCorpus = corpora.MmCorpus("file.mm")
print mmCorpus
lsi = LsiModel(mmCorpus, id2word=dictionary, num_topics=10)
print("lsi:")
#print(lsi[new_vec])
lsi.print_debug(4, 4)
lsi.print_topics(4, 2)
lsi.show_topic(9, 10)  # topics are 0-indexed: 9 is the last of the 10 topics

lda = LdaModel(mmCorpus,id2word=dictionary,num_topics=10)
lda.print_topics(4,4)
doc_lda = lda[new_vec]

print "lda:"
#print doc_lda
         
# corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
#            [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
print(lda[test_doc_bow2])

!pip install pyLDAvis

import pyLDAvis.gensim                             
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, journals_corpus, journals_dictionary)

from gensim.models import CoherenceModel
lda_cm = CoherenceModel(model=lda, corpus=journals_corpus, dictionary=journals_dictionary,
                        texts=journals['Full title'], coherence='c_v')
LDA_cm = lda_cm.get_coherence()
LDA_cm

from gensim.models.lsimodel import LsiModel

lsi = LsiModel(corpus=journals_corpus,id2word=journals_dictionary,num_topics=20)

lsi_topics = lsi.print_topics()
for topic in lsi_topics:
  print(topic)

test_doc = 'Journal of medicines and herbs'
test_doc = custom_preprocess(test_doc)
test_doc_bow = journals_dictionary.doc2bow(test_doc)
print(test_doc_bow)

print(lsi[test_doc_bow])

test_doc2 = 'Material and physics'
test_doc2 = custom_preprocess(test_doc2)
test_doc_bow2 = journals_dictionary.doc2bow(test_doc2)
def main():
    parser = ArgumentParser(
        description='Wrapper script for churning wiki or elasticsearch datasets through gensim '
        'to produce topic models. See the gensim documentation for more information.')
    parser.add_argument('-ds',
                        '--dataset',
                        default='wiki',
                        help='What kind of dataset to use. (wiki,es,file)')
    parser.add_argument('-d',
                        '--dump-file',
                        help='Wiki: bz2 dump file with wiki in it')
    parser.add_argument('-l',
                        '--limit',
                        help='Wiki: How many documents to extract from wiki')
    parser.add_argument('--model-id',
                        default='model',
                        help='Filename for created model.')
    parser.add_argument(
        '--model-type',
        default='lsi',
        help='Model type (lsi, lda, word2vec, hdp, vocabulary).')
    parser.add_argument('--n-topics',
                        default=10,
                        help='Number of topics to model.')
    parser.add_argument('--n-passes',
                        default=1,
                        help='Number of passes for LDA  model.')
    parser.add_argument('--w2v-size',
                        default=100,
                        help='size of Word2Vec context.')
    parser.add_argument('--w2v-window', default=5, help='window for Word2Vec.')
    parser.add_argument('-q',
                        '--query',
                        default=None,
                        help='Elasticsearch: Query to use to fetch documents')
    parser.add_argument('--index', help='Elasticsearch: index to read from.')
    parser.add_argument('--doc_type',
                        default='doc',
                        help='Elasticsearch: data type in index.')
    parser.add_argument(
        '--data-dir',
        help='Directory to save the generated models and vocabularies into.')
    parser.add_argument(
        '--vocab',
        help=
        'Prebuilt Vocabulary file. Use this to avoid having to generate one.')

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ['lsi', 'lda', 'word2vec', 'hdp', 'vocabulary']:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ['es', 'wiki', 'file']:
        logging.error("Invalid dataset  type %s" % data_type)
        parser.print_usage()
        exit(-1)
    if not dump_fn and data_type in ['wiki']:
        logging.error('--dump-file required for wiki dataset')
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == 'es' and index is None:
        logging.error(
            "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)
    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = '%s_%s_%d' % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = '%s/%s' % (data_dir, model_fn)
    if model_type == 'word2vec':
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = '%s_w_%s_s_%s' % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == 'es':
        logging.info("Using data type %s with index %s, doc_type %s query %s" %
                     (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(read_index=index,
                                       read_doc_type=doc_type,
                                       query=query,
                                       normalize_func=normalize_es)
    elif data_type == 'wiki':
        logging.info("Using data type %s with dump_file %s and limit %s" %
                     (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn,
                                   num_articles=limit,
                                   normalize_func=normalize_wiki)
    elif data_type == 'file':
        logging.info("Using data type %s with dump_file %s and limit %s" %
                     (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn,
                              num_articles=limit,
                              normalize_func=normalize_file)
    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words('norwegian'))
    if not vocab_file or model_type == 'vocabulary':
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + '.vocab')
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == 'vocabulary':
        return
    tfidf = TfidfModel(dictionary=vocab)
    if model_type == 'lsi':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus],
                         num_topics=n_topics,
                         id2word=vocab)
    elif model_type == 'lda':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus],
                         num_topics=n_topics,
                         passes=n_passes,
                         id2word=vocab)

    elif model_type == 'word2vec':
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == 'hdp':
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
Example #40
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    model_path = path.join(base_path,
                           p['result_path'],
                           p['model_label'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # train the model on the small marketing corpus
    preprocess = []

    if 'stoplist' in p.as_dict():
        stoplist = open(path.join(base_path, p['stoplist'])).readlines()
        stoplist = [s.strip().lower() for s in stoplist]
        def remove_stopwords(sentence):
            return [word for word in sentence if word not in stoplist]
        preprocess.append(remove_stopwords)

    if 'stemmer' in p.as_dict():
        stemmer = Stemmer.Stemmer(p['stemmer'])
        preprocess.append(stemmer.stemWords)

    if not p['model_label']:
        cor = TextFilesCorpus(path.join(base_path, p['corpus_path']),
                              no_below=p['no_below'],
                              no_above=p['no_above'],
                              preprocess=preprocess)
        dictionary = cor.dictionary

        pre = LogEntropyModel(cor, id2word=dictionary, normalize=True)
        lsi = LsiModel(pre[cor], id2word=dictionary, num_topics=p['num_topics'])
    else:
        dictionary = Dictionary.load(path.join(model_path, p['dict_name']))
        pre = SaveLoad.load(path.join(model_path, 'pre.model'))
        lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
        lsi.num_topics = p['num_topics']

    test_cor_path = path.join(base_path, p['test_cor_path'])
    test_answers, gold_answers, ratings = [], [], []


    flist = glob.glob(path.join(test_cor_path, 'corpus_3', '*.txt'))
    for file in flist:
        match = re.search('data3_(\d)_\d+.txt', file)
        ratings.append(int(match.group(1)))
        with open(file) as f:
            doc = ' '.join(s.strip() for s in f.readlines())
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            test_answers.append(corpus)
    flist = glob.glob(path.join(test_cor_path, 'corpus_3_golden', '*.txt'))
    for file in flist:
        with open(file) as f:
            doc = ' '.join(s.strip() for s in f.readlines())
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            gold_answers.append(corpus)


    sim = MatrixSimilarity(test_answers)[gold_answers]
    mean_sim = np.mean(sim, axis=0)
    print('pearsons corrcoef: %f' % np.corrcoef(ratings, mean_sim)[0, 1])
    print('spearmans r: %f with p: %f' % stats.spearmanr(ratings, mean_sim))
Example #41
dicto = corpora.Dictionary(texts)
corpus = [dicto.doc2bow(text) for text in texts]

lsi_models = {}
lsi_similarity_indices = {}

start_time = time.time()

for chunksize in np.arange(5000, 30001, 5000):
    print('Chunksize: {}'.format(chunksize))
    iter_start_time = time.time()

    lsi = LsiModel(corpus,
                   id2word=id2token,
                   num_topics=50,
                   chunksize=chunksize,
                   onepass=False,
                   power_iters=2)

    lsi_models[chunksize] = lsi
    lsi_similarity_indices[chunksize] = similarities.MatrixSimilarity(
        lsi[corpus], num_features=50)  # num_features should match num_topics above
    print('{} seconds'.format(int(time.time() - iter_start_time)))

run_time = int((time.time() - start_time) / 60)
print('Parameter search took {} minutes.'.format(run_time))

with open('lsi_models_num_topics_chunksize.pickle', 'wb') as f:
    pickle.dump(lsi_models, f)
print('Models saved.')
Example #42
class TextProcessor:
    def __init__(self, n_users, n_samples, n_dims):
        self.nUsers, self.nSamples, self.nDims = n_users, n_samples, n_dims
        self.tfIdfModel = self.lsiModel = self.ldaModel = self.w2vModel = self.dictionary = None

        self.dictPath, self.tfIdfPath, self.lsiPath, self.ldaPath, self.w2vPath, self.w2vVecPath =\
            conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='dict'), \
            conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='tfidf'),\
            conf.get_filename_via_tpl('model', model_type='lsi', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lsi_model'), \
            conf.get_filename_via_tpl('model', model_type='lda', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lda_model'),\
            conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='w2vmodel'), \
            conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='vec.txt')

    def load_model(self, model_type):
        model = None
        try:
            if model_type == 'tfidf':
                model = TfidfModel.load(self.tfIdfPath, mmap='r')
                self.tfIdfModel = model
            elif model_type == 'lsi':
                model = LsiModel.load(self.lsiPath, mmap='r')
                self.lsiModel = model
            elif model_type == 'lda':
                model = LdaModel.load(self.ldaPath, mmap='r')
                self.ldaModel = model
            elif model_type == 'w2v':
                model = Word2Vec.load(self.w2vPath, mmap='r')
                self.w2vModel = model
            else:
                logger.error('Model type error. Unexpected %s' % model_type)
                return None

            if self.dictionary is None and os.path.exists(self.dictPath):
                self.dictionary = corpora.Dictionary.load(self.dictPath)

            logger.info('%s model loaded successfully.' % model_type)
        except IOError:
            logger.error(
                'The %s model doesn\'t exist. Please train it before loading.'
                % model_type)
        return model

    def tf_idf_transform(self, doc):
        """
        Build a dictionary and a TF-IDF model from doc (a list of tokenized texts), then transform it into a tf-idf corpus.
        """
        self.dictionary = corpora.Dictionary(doc)
        corpus = [self.dictionary.doc2bow(text) for text in doc]
        self.tfIdfModel = TfidfModel(corpus)

        conf.mk_dir(self.tfIdfPath)

        self.dictionary.save(self.dictPath)
        logger.info('Dictionary has been saved in %s.' % self.dictPath)

        self.tfIdfModel.save(self.tfIdfPath)
        logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)

        tfidf_corpus = self.tfIdfModel[corpus]
        tfidf_corpus_path = conf.get_filename_via_tpl('tfidf',
                                                      n_users=self.nUsers,
                                                      postfix='mm',
                                                      n_samples=self.nSamples)
        conf.mk_dir(tfidf_corpus_path)
        corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
        logger.info('TF-IDF corpus with a shape of %s has been saved in %s.' %
                    (np.array(tfidf_corpus).shape, tfidf_corpus_path))

        return tfidf_corpus

    def lsi_transform(self, corpus_tf_idf):
        logger.info('Training LSI model with n_dims=%d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        self.lsiModel = LsiModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)

        conf.mk_dir(self.lsiPath)

        self.lsiModel.save(self.lsiPath)
        logger.info('LSI model has been saved in %s.' % self.lsiPath)

        lsi_corpus = self.lsiModel[corpus_tf_idf]
        lsi_corpus_path = conf.get_filename_via_tpl('lsi',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims,
                                                    postfix='mm')
        conf.mk_dir(lsi_corpus_path)
        corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
        logger.info('LSI corpus with a shape of %s has been saved in %s.' %
                    (np.array(lsi_corpus).shape, lsi_corpus_path))

        return lsi_corpus

    def lda_transform(self,
                      corpus_tf_idf,
                      train_separated=False,
                      is_update=False):
        """
        Init a lda model with a n_topics whose default is 500, then fit it with corpus_tf_idf and transform it.
        :param corpus_tf_idf: Corpus which has been transformed into tf-idf matrix.
        :param train_separated: The model is going to be train with all corpus one time or some of them separately one time.
        :param is_update: Whether the training to be perform is to construct a new model or update one existed.
        :return: lda corpus.
        """
        logger.info('Training lda model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        if is_update:
            # A ldaModel had been trained before and now update the model with other corpus.
            if self.ldaModel is None:
                self.load_model('lda')
            self.ldaModel.update(corpus_tf_idf)
            logger.info('Lda model has been updated successfully.')
            return self.ldaModel[corpus_tf_idf]

        if train_separated:
            # Training the model on corpus chunks separately is not implemented yet.
            pass

        self.ldaModel = LdaModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)

        conf.mk_dir(self.ldaPath)
        self.ldaModel.save(self.ldaPath)
        logger.info('LDA model has been saved in %s.' % self.ldaPath)

        lda_corpus = self.ldaModel[corpus_tf_idf]
        lda_corpus_path = conf.get_filename_via_tpl('lda',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims,
                                                    postfix='mm')
        conf.mk_dir(lda_corpus_path)
        corpora.MmCorpus.serialize(lda_corpus_path, lda_corpus)
        logger.info('LDA corpus with a shape of %s has been saved in %s.' %
                    (np.array(lda_corpus).shape, lda_corpus_path))

        return lda_corpus

    def w2v_transform(self, sentences):
        """
        Train a word2vec model on texts.
        :param sentences: Iterable of texts, each one a list of that text's words.
        :return: W2v corpus (one vector per text, the sum of its word vectors).
        """
        logger.info('Training w2v model with n_dims=%d...' % self.nDims)
        self.w2vModel = Word2Vec(sentences, size=self.nDims, min_count=0)

        conf.mk_dir(self.w2vPath)
        self.w2vModel.save(self.w2vPath)
        self.w2vModel.wv.save_word2vec_format(self.w2vVecPath, binary=False)

        # Build the w2v corpus: each text becomes the element-wise sum of its word vectors.
        w2v_corpus = []
        for sen in sentences:
            vec = [0] * self.nDims
            for word in sen:
                vec = list(
                    map(lambda m, n: m + n, vec, self.w2vModel[word]))
            w2v_corpus.append(vec)

        w2v_corpus_path = conf.get_filename_via_tpl('w2v',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims)
        conf.mk_dir(w2v_corpus_path)

        with open(w2v_corpus_path, 'w') as fp:
            csv_writer = csv.writer(fp)
            for line in w2v_corpus:
                csv_writer.writerow(line)
        logger.info('W2v corpus has been saved in %s. ' % w2v_corpus_path)

        return w2v_corpus

    def load_corpus(self, model_type, dense=False):
        corpus = None
        try:
            if model_type == 'tfidf':
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl('tfidf',
                                              n_users=self.nUsers,
                                              postfix='mm',
                                              n_samples=self.nSamples))
            elif model_type in ['lsi', 'lda']:
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl(model_type,
                                              n_users=self.nUsers,
                                              n_samples=self.nSamples,
                                              n_dims=self.nDims,
                                              postfix='mm'))
            elif model_type == 'w2v':
                corpus = np.loadtxt(conf.get_filename_via_tpl(
                    model_type,
                    n_users=self.nUsers,
                    n_samples=self.nSamples,
                    n_dims=self.nDims),
                                    dtype=np.float,
                                    delimiter=',')

            logger.info('%s corpus with a shape of %s has been loaded. ' %
                        (model_type, np.array(corpus).shape))

            if dense and model_type in ['tfidf', 'lsi', 'lda']:
                corpus = matutils.corpus2dense(corpus,
                                               self.nDims,
                                               self.nSamples * self.nUsers,
                                               dtype=np.float).T
            else:
                corpus = np.array(corpus)
        except Exception as e:
            raise e
        return corpus

    @staticmethod
    def corpus2dense(corpus, n_terms, n_docs=conf.N_SAMPLES, dtype=np.float):
        return matutils.corpus2dense(corpus, n_terms, n_docs, dtype).T

    def load_vec(self, vec_type):
        logger.info('Loading %s vectors...' % vec_type)
        try:
            corpus_vec = self.load_corpus(vec_type, True)
        except Exception as e:
            raise e
        data = []
        for i in range(self.nUsers):
            data.append(corpus_vec[i * self.nSamples:(i + 1) * self.nSamples])
        data = np.array(data, dtype=np.float)
        return data
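A minimal usage sketch for this class, assuming `conf` resolves the path templates and `texts` is a list of tokenized documents (the sizes below are placeholders):

tp = TextProcessor(n_users=10, n_samples=100, n_dims=50)
tfidf_corpus = tp.tf_idf_transform(texts)    # builds and saves dictionary + TF-IDF model
lsi_corpus = tp.lsi_transform(tfidf_corpus)  # 50-dimensional LSI vectors
lda_corpus = tp.lda_transform(tfidf_corpus)
w2v_corpus = tp.w2v_transform(texts)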
Exemplo n.º 43
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized


# Creating a list of documents from the complaints column
list_of_docs = df["message"].tolist()
# Implementing the function for all the complaints of list_of_docs
doc_clean = [clean(doc).split() for doc in list_of_docs]
# Code starts here
# Creating the dictionary from our cleaned word list doc_clean
dictionary = corpora.Dictionary(doc_clean)
# Creating the corpus
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
# Creating the LSI model
lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary)
pprint(lsimodel.print_topics())

# --------------
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# doc_term_matrix - Word matrix created in the last task
# dictionary - Dictionary created in the last task


# Function to calculate coherence values
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    # Truncated in the source; body reconstructed along the standard gensim
    # coherence-search pattern implied by the imports above.
    model_list, coherence_values = [], []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        cm = CoherenceModel(model=model, texts=texts,
                            dictionary=dictionary, coherence='c_v')
        coherence_values.append(cm.get_coherence())
    return model_list, coherence_values
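A typical call over the matrix and cleaned texts built above (the `limit` value here is an arbitrary choice), keeping the model with the highest coherence:

model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary, corpus=doc_term_matrix, texts=doc_clean, limit=41)
best_model = model_list[coherence_values.index(max(coherence_values))]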
Exemplo n.º 44
def main():
    parser = ArgumentParser(
        description="Wrapper script for churning wiki or Elasticsearch datasets through gensim to produce topic models. See the gensim documentation for more information."
    )
    parser.add_argument("-ds", "--dataset", default="wiki", help="What kind of dataset to use. (wiki,es,file)")
    parser.add_argument("-d", "--dump-file", help="Wiki: bz2 dump file with wiki in it")
    parser.add_argument("-l", "--limit", help="Wiki: How many documents to extract from wiki")
    parser.add_argument("--model-id", default="model", help="Filename for created model.")
    parser.add_argument("--model-type", default="lsi", help="Model type (lsi, lda, word2vec, hdp, vocabulary).")
    parser.add_argument("--n-topics", default=10, help="Number of topics to model.")
    parser.add_argument("--n-passes", default=1, help="Number of passes for LDA  model.")
    parser.add_argument("--w2v-size", default=100, help="size of Word2Vec context.")
    parser.add_argument("--w2v-window", default=5, help="window for Word2Vec.")
    parser.add_argument("-q", "--query", default=None, help="Elasticsearch: Query to use to fetch documents")
    parser.add_argument("--index", help="Elasticsearch: index to read from.")
    parser.add_argument("--doc_type", default="doc", help="Elasticsearch: data type in index.")
    parser.add_argument("--data-dir", help="Directory to save the generated models and vocabularies into.")
    parser.add_argument("--vocab", help="Prebuilt Vocabulary file. Use this to avoid having to generate one.")

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ["lsi", "lda", "word2vec", "hdp", "vocabulary"]:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ["es", "wiki", "file"]:
        logging.error("Invalid dataset type %s" % data_type)
        parser.print_usage()
        exit(-1)
    if not dump_fn and data_type in ["wiki"]:
        logging.error("--dump-file required for wiki dataset")
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == "es" and index is None:
        logging.error(
            "Please specify the Elasticsearch index to fetch from using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)
    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = "%s_%s_%d" % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = "%s/%s" % (data_dir, model_fn)
    if model_type == "word2vec":
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = "%s_w_%s_s_%s" % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == "es":
        logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(
            read_index=index, read_doc_type=doc_type, query=query, normalize_func=normalize_es
        )
    elif data_type == "wiki":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki)
    elif data_type == "file":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file)
    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words("norwegian"))
    if not vocab_file or model_type == "vocabulary":
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + ".vocab")
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == "vocabulary":
        return
    tfidf = TfidfModel(dictionary=vocab)
    if model_type == "lsi":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab)
    elif model_type == "lda":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, id2word=vocab)

    elif model_type == "word2vec":
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == "hdp":
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
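Once saved, a model produced by this script can be reloaded by type for later querying; a sketch, with `model_fn` as built by the naming scheme above:

from gensim.models import LsiModel

# Use LdaModel.load / HdpModel.load / Word2Vec.load for the other model types.
reloaded = LsiModel.load(model_fn)
print(reloaded.show_topics(num_topics=5))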