def run(self):
		if self.clean_level in ('raw','clean','stopwords'):
			kind = self.clean_level
		else:
			kind = 'stopwords'

		for idioma in self.output()['langs'].iterkeys():
			dicc_path = self.input()['dict']['langs'][idioma].path
			corp_path = self.input()['corp']['langs'][idioma].path
			print '=============================='
			print 'Running LDA for %s with cleaning level %s' % (idioma, kind)
			print '=============================='

			# Load the dictionary and corpus
			dicc = corpora.Dictionary.load(dicc_path)
			corpus = corpora.MmCorpus(corp_path)

			# Run LDA for this language for each number of topics
			for n_topics in self.output()['langs'][idioma].iterkeys():
				print 'Number of topics: ' + str(n_topics)
				if self.by_chunks:
					lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics, update_every=self.update_e, chunksize=self.chunk_size, passes=self.n_passes)
				else:
					lda = LdaModel(corpus, id2word=dicc, num_topics=n_topics, passes=1)
				lda.save(self.output()['langs'][idioma][n_topics].path)
Example #2
class LDA(object):
    def __init__(self, model, vocab, corpus=None, topics=200, passes=1):
        self._model_file = model
        self._dict_file = vocab
        self._corpus_file = corpus
        self._topics = topics
        self._passes = passes

    def train(self):
        self._corpus = SentenceDocCorpus(self._corpus_file)
        self._lda = LdaModel(self._corpus,
                             num_topics=self._topics,
                             id2word=self._corpus.dictionary,
                             passes=self._passes)
        self._dictionary = self._corpus.dictionary

        self._lda.save(self._model_file)
        self._dictionary.save(self._dict_file)

    def load(self):
        self._lda = LdaModel.load(self._model_file)
        self._dictionary = Dictionary.load(self._dict_file)

    def topics(self, words):
        return self._lda[self._dictionary.doc2bow(common.filter(words))]

    def topic_vector(self, words):
        return np.array([
            v for k, v in self._lda.__getitem__(
                self._dictionary.doc2bow(common.filter(words)), eps=0)
        ])
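# Usage sketch (not part of the original project; assumes the LDA wrapper above
# is defined in this module and that SentenceDocCorpus / common.filter exist as
# in that project):
wrapper = LDA('lda.model', 'vocab.dict', corpus='sentences.txt', topics=50, passes=2)
wrapper.train()                                   # trains, then saves model and dictionary
wrapper.load()                                    # or reload a previously trained pair
print(wrapper.topics(['some', 'tokenized', 'words']))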
Example #3
def train_lda(recipe_file,num_topics,output_file):
    corpus = RecipeCorpus(recipe_file)
    
    corpora.MmCorpus.serialize(output_file+'.corpus.mm', corpus)
    lda = LdaModel(corpus, id2word=corpus.dictionary, num_topics=int(num_topics), distributed=False)
    lda.save(output_file)
    return lda
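# Usage sketch (not part of the original project; assumes RecipeCorpus and a
# recipe file exist as in the snippet above):
lda = train_lda('recipes.json', num_topics=25, output_file='recipe_lda.model')
print(lda.show_topics(num_topics=5, num_words=10))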
Example #4
def create_models(df):
    ''' creates/saves two LDA models (one genre, one subgenre) in a folder called lda_models '''
    df = get_all_genres()
    id2word = corpora.Dictionary(df.genres)
    word2id = {v: k for k, v in id2word.items()}
    corpus = [id2word.doc2bow(genres) for genres in df.genres]
    # captures subgenres with 50 categories
    subgenre_model = LdaModel(corpus=corpus,
                              id2word=id2word,
                              num_topics=50,
                              random_state=100,
                              update_every=1,
                              passes=5,
                              alpha='auto',
                              per_word_topics=True)
    # capture main genres with 10 categories
    genre_model = LdaModel(corpus=corpus,
                           id2word=id2word,
                           num_topics=10,
                           random_state=100,
                           update_every=1,
                           passes=5,
                           alpha='auto',
                           per_word_topics=True)
    subgenre_model.save('lda_models/subgenre.model')
    genre_model.save('lda_models/genre.model')
Example #5
class LDA(object):
    def __init__(self, model, vocab, corpus=None, topics=200, passes=1):
        self._model_file = model
        self._dict_file = vocab
        self._corpus_file = corpus
        self._topics = topics
        self._passes = passes

    def train(self):
        self._corpus = SentenceDocCorpus(self._corpus_file)
        self._lda = LdaModel(self._corpus, num_topics = self._topics, id2word = self._corpus.dictionary, passes = self._passes)
        self._dictionary = self._corpus.dictionary
        
        self._lda.save(self._model_file)
        self._dictionary.save(self._dict_file)

    def load(self):
        self._lda = LdaModel.load(self._model_file)
        self._dictionary = Dictionary.load(self._dict_file)

    def topics(self, words):
        return self._lda[self._dictionary.doc2bow(common.filter(words))]

    def topic_vector(self, words):
        return np.array([v for k, v in self._lda.__getitem__(self._dictionary.doc2bow(common.filter(words)), eps=0)])
Example #6
def trainModel():
    """ Train a model
    """
    if args.mode == 'Random':
        return args.topics, 0
    # need to train on dump
    files = [
        f"{args.input}/{f}" for f in os.listdir(args.input)
        if os.path.isfile(os.path.join(args.input, f))
    ]
    if args.mode == 'LDA':
        # create dictionary
        with open(files[0], "r", encoding='utf-8') as f:
            dct = Dictionary([' '.join(f.readlines()).split()])
        for filename in files[1:]:
            with open(filename, "r", encoding='utf-8') as f:
                dct.add_documents([' '.join(f.readlines()).split()])
        # create corpus
        corpus = []
        for filename in files:
            with open(filename, "r", encoding='utf-8') as f:
                corpus.append(dct.doc2bow(' '.join(f.readlines()).split()))
        lda = LdaModel(corpus, num_topics=args.topics)
        lda.save("./models/LDAdump.model")
        dct.save("./models/LDAdump.dct")
        return lda, dct
    if args.mode == 'loadLDA':
        return LdaModel.load("./models/LDAdump.model"), Dictionary.load(
            "./models/LDAdump.dct")
class Model:
    def __init__(self, num_categories=20):
        self.num_categories = num_categories
        self.ldamodel = None

    def create_model(self,
                     doc_matrix,
                     term_dictionary,
                     model_path,
                     save_model=True,
                     language='language_na'):
        """
        Creates an LDA model based on a set of documents
        :param model_path:
        :param doc_matrix:
        :param term_dictionary:
        :param save_model:
        :param language:
        :return LDA model:
        """
        self.language = language
        start = time()
        self.ldamodel = LdaModel(doc_matrix,
                                 num_topics=self.num_categories,
                                 id2word=term_dictionary,
                                 passes=50)

        if save_model:
            self.save_model(model_path=os.path.join(
                model_path, 'models', self.language,
                '%s_%s_category_lda.model' %
                (language, str(self.num_categories))))

        logging.info('Training lasted: {:.2f}s'.format(time() - start))
        return self.ldamodel

    def load_model(self, model_path='lda.model'):
        """
        Loads a pretrained LDA model
        :param model_path:
        :return LDA model:
        """
        return LdaModel.load(model_path)

    def save_model(self, model_path):
        """
        Saves trained LDA model
        :param model_path:
        :return:
        """
        if not os.path.isdir('models'):
            os.mkdir('models')
        if not os.path.isdir(os.path.join('models', self.language)):
            os.mkdir(os.path.join('models', self.language))
        self.ldamodel.save(model_path)
        logging.info("Model Saved")
Example #8
 def fetch_model(dictionary):
     corpus = my_corpus
     lda = LdaModel(corpus,
                    num_topics=100,
                    update_every=1,
                    chunksize=1000,
                    passes=15)
     #lda = LdaModel(corpus,num_topics=100,id2word=dictionary,update_every=1,chunksize=1000,passes=50)
     lda.save('Topic/lda.loc')
     return lda
def main(vocab_file, inv_vocab_file, infiles):
    vocab = load_pickled(vocab_file)
    inv_vocab = load_pickled(inv_vocab_file)

    lda = LdaModel(id2word=inv_vocab, num_topics=200)

    for f in infiles:
        tc = TweetCorpus(f, vocab)
        lda.update(tc)

    lda.save('topics.lda')
Example #10
def cross_val_topics_50p(no_of_topics, corpus, dictionary, validation_50p):
    model = LdaModel(corpus,
                     num_topics=no_of_topics,
                     id2word=dictionary,
                     passes=50,
                     alpha='auto',
                     eval_every=2000)
    globals()['ldamodel_%dt_50p_autoalpha_val' % no_of_topics] = model
    model.save('lda_{}t_50p_autoalpha_val.model'.format(no_of_topics))
    validation_50p[no_of_topics] = model.print_topics(num_topics=-1,
                                                      num_words=50)
Example #11
def perform_lda(x):
    texts = x.values.tolist()
    words = []
    for text in texts:
        words.append(text[0].split())
    dictionary = corpora.Dictionary(words)
    corpus = [dictionary.doc2bow(word) for word in words]
    ldamodel = LdaModel(corpus, num_topics=2, id2word=dictionary,
                           passes=50)
    ldamodel.save('lda.model')
    print(ldamodel.print_topics(num_topics=2, num_words=15))
    return corpus
def generate_lda_model():
    corpus = pre_process_data()

    dictionary = corpora.Dictionary(corpus)

    bow_corpus = [dictionary.doc2bow(text) for text in corpus]

    model = LdaModel(bow_corpus, num_topics=10, id2word=dictionary, passes=3)

    model.save(lda_model_file)

    return model
Example #13
def getLdaModel(bow_corpus, dictionary, useSavedTill):
    if useSavedTill >= USESAVED.lda_model:
        common_logger.info("loading LDA model from file")
        return LdaModel.load(file_lda_model)
    else:
        common_logger.info("Training LDA model")
        num_topics = int(math.log(len(bow_corpus)) + 1)  # assumption:
        lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=numPasses)
        common_logger.info("Saving LDA model")
        lda_model.save(file_lda_model)
        common_logger.info("Done creating LDA model")
        return lda_model
	def fetch_model(dictionary):
		print "Fetching LDA Model... ",
		try:
			lda = LdaModel.load('Topic/lda.tm')
			print "LDA Model loaded!"
		except IOError:
			print "Model not found, building LDA..."
			corpus=MyCorpus()
			#lda = LdaModel(corpus,num_topics=50,update_every=1,chunksize=1000,passes=15)
			lda = LdaModel(corpus,num_topics=50,id2word=dictionary,update_every=1,chunksize=1000,passes=50)
			print "LDA Built!"
			lda.save('Topic/lda.tm')
		return lda
def train_LDA_model(ntopics, dictionary, doc_term_matrix, output_path=None):
    '''Receives the number of topics and trains an LDA model.'''
    print(f"Training LDA model with {ntopics} topics")
    model = LdaModel(doc_term_matrix,
                     num_topics=ntopics,
                     id2word=dictionary,
                     passes=20,
                     eval_every=1,
                     iterations=50)
    if output_path:
        print(f"Saving in {output_path}")
        model.save(output_path)
    return model
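# Usage sketch (not part of the original code): build a dictionary and
# document-term matrix with gensim, then train and save a small model.
from gensim import corpora

texts = [['cats', 'purr', 'often'], ['dogs', 'bark', 'loudly']]
dictionary = corpora.Dictionary(texts)
doc_term_matrix = [dictionary.doc2bow(text) for text in texts]
model = train_LDA_model(2, dictionary, doc_term_matrix, output_path='lda_2topics.model')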
Example #16
def gensim_lda(d):
    from gensim import corpora, models
    from gensim.models.ldamodel import LdaModel
    list_doc = []
    for i in range(0,len(d)):
        list_doc = list_doc + d[i]

    dictionary = corpora.Dictionary(list_doc)
    model = LdaModel(num_topics = 20, id2word = dictionary)
    for i in range(0, len(d)):
        print 'Generating corpus and updating model ', i
        corpus = [dictionary.doc2bow(doc) for doc in d[i]]
        model.update(corpus)

    model.save('model_20')
    print model.show_topics(num_topics = 20, num_words = 10)
Example #17
def lda_model(tokenized_corpus, vec_size):
    
    """create LDA model 
    Args:
        tokenized_corpus : tokenized documents
        vec_size : vector size
    """
    
    dictionary = corpora.HashDictionary(tokenized_corpus)
    corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]
    temp_file = os.path.join(NLP_PATH,"lda_model")
    if os.path.isfile(temp_file):
        LDA = LdaModel.load(temp_file)#LdaMulticore.load(temp_file)
    else:
        LDA = LdaModel(corpus,id2word=dictionary,num_topics=vec_size)
        LDA.save(temp_file)
    return LDA,corpus
 def fetch_model(dictionary):
     print "Fetching LDA Model... ",
     try:
         lda = LdaModel.load('Topic/lda.tm')
         print "LDA Model loaded!"
     except IOError:
         print "Model not found, building LDA..."
         corpus = MyCorpus()
         #lda = LdaModel(corpus,num_topics=50,update_every=1,chunksize=1000,passes=15)
         lda = LdaModel(corpus,
                        num_topics=50,
                        id2word=dictionary,
                        update_every=1,
                        chunksize=1000,
                        passes=50)
         print "LDA Built!"
         lda.save('Topic/lda.tm')
     return lda
Example #19
def build_all():
    """
	build and save a bunch of models to evaluate
	"""

    with sqlite3.connect('../database/chat.db') as conn:

        # get vocabulary
        MIN_OCCURANCE = 100
        vocab = Dictionary([
            pd.read_sql(
                'select word from words where freq >= {}'.format(
                    MIN_OCCURANCE), conn)['word'].tolist()
        ])

        # models for different number of topics
        N_EPOCHS = 10
        for n_topics in range(5, 26, 5):

            # one model per each aggregation style
            for style, sql in zip(['basic', 'user', 'user_day_room'],
                                  get_model_sql()):

                # init model
                lda_model = LdaModel(id2word=vocab,
                                     num_topics=n_topics,
                                     alpha='auto',
                                     per_word_topics=True)

                # do training
                print('training model_{0}_{1}'.format(style, n_topics))
                for epoch in range(N_EPOCHS):
                    print('\tepoch', epoch, '...', end='\r')
                    for chunk in pd.read_sql(sql, conn, chunksize=10000):
                        chunk_corpa = [
                            vocab.doc2bow(text)
                            for text in chunk['lemma'].str.split(' ').tolist()
                        ]
                        lda_model.update(chunk_corpa)
                    print('\tepoch', epoch, '... done!')

                # Save model to disk.
                lda_model.save("saved_models/model_{0}_{1}".format(
                    style, n_topics))
Example #20
def ldamodel(doc_clean,n_topics,n_words,description,tfidfmodel=False,unseen_docs=None):
    doc_clean = [min_char(doc).split() for doc in doc_clean]

    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]
    compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=doc_clean, start=2, limit=40, step=6)
    if tfidfmodel:
       tfidf = TfidfModel(corpus,id2word=dictionary,smartirs='ntc')
       corpus = tfidf[corpus]

    ldamodel = LdaModel(corpus, num_topics=n_topics, id2word=dictionary, random_state=1, passes=50, per_word_topics=True)
    print("# LDA topics")
    for i in range(0, n_topics):
        temp = ldamodel.show_topic(i, n_words)
        terms = []
        for term in temp:
            terms.append(term)
        print("Topic #" + str(i) + ": ", ", ".join([t + '*' + str(i) for t, i in terms]))
    print('Bound: ',ldamodel.bound(corpus))
    # Compute Perplexity
    print('Perplexity: ',ldamodel.log_perplexity(corpus))
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=ldamodel, texts=doc_clean, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    if unseen_docs:
        corpus_new = [dictionary.doc2bow(doc) for doc in unseen_docs]
        for i, unseen_doc in enumerate(corpus_new):
            topic = None
            score = 0
            inference_doc = ldamodel[unseen_doc]
            print(unseen_docs[i])
            for index,tmpScore in inference_doc[0]:
                if tmpScore > score:
                    score = tmpScore
                    topic = ldamodel.print_topic(index, 5)
            print ("Score: {}\t Topic: {}".format(score, topic))
        print("Log perplexity for new corpus is", ldamodel.log_perplexity(corpus_new))

    print_result(ldamodel, doc_clean, corpus, n_topics, description)
    pickle.dump(corpus, open(description+'.pkl', 'wb'))
    dictionary.save(description+'dictionary.gensim')
    ldamodel.save(description+'_ldamodel.gensim')
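# Usage sketch (hypothetical call; min_char, compute_coherence_values and
# print_result are helpers from the original project and are assumed to exist):
raw_docs = ['first raw document about cooking', 'second raw document about sports']
ldamodel(raw_docs, n_topics=5, n_words=10, description='demo_run')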
Example #21
def main(argv):
    if len(argv) < 4:
        print 'python train_lda.py group_id num_topics passes'
        sys.exit(1)
        
    group_id = argv[1]
    num_topics = int(argv[2])
    passes = int(argv[3])
    log.info('Prepare corpus for group: %s' % group_id)

    base_path = 'tables/' + group_id + '/'
    model_base_path = 'ldamodels/' + group_id + '/'
    
    # build dict and corpus
    #now = datetime.now()
    indicator = 'title-comment'
    source_path = base_path + 'corpus-topic-comment'
    
    corpus_path = model_base_path + 'corpus-'+ indicator + '-' + group_id + '.mm'
    dict_path = model_base_path + 'dict-' + indicator + '-' + group_id + '.dict'
    
    log.info('Building the dict...')
    build_dict_corpus(source_path, corpus_path, dict_path)
    
    log.info('Loading dict from pre-saved file...')
    dictionary = corpora.Dictionary.load(dict_path)
    log.info('Done')
    
    #dictionary.save_as_text(base_path + 'text-dict.txt')
    
    log.info('Build a lda model...')
    log.info('Loading corpus from pre-saved .mm file...')
    mmcorpus = corpora.MmCorpus(corpus_path)
    log.info('Done')
    
    log.info('Training lda model...')
    model = LdaModel(mmcorpus, num_topics=num_topics, id2word = dictionary, passes = passes)
    model_path = model_base_path + indicator + '-' + group_id + '.ldamodel'
    model.save(model_path)
    log.info('Done.')
    
    model = LdaModel.load(model_path)
    model.show_topics(topics=num_topics, topn=10, log=True)
Example #22
def generate_model():
    np.set_printoptions(precision=2)
    corpus = []
    corpus += load_expo_cdc()
    corpus += load_lago()
    corpus += load_news()
    corpus += load_news_ic()
    corpus += load_palestras()
    corpus = preprocessing(corpus)
    dictionary = corpora.Dictionary(corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in corpus]

    dictionary.save(DICT)
    corpora.MmCorpus.serialize(BOW_CORPUS, bow_corpus)

    bow2 = np.concatenate((bow_corpus, bow_corpus), axis=0)
    bow2 = np.concatenate((bow2, bow2), axis=0)
    bow2 = np.concatenate((bow2, bow2), axis=0)
    TOPICS = 20
    model = LdaModel(bow2, id2word=dictionary, num_topics=TOPICS, iterations=100, passes=15)
    model.save(MODEL)

    lda_corpus = [model[vector] for vector in bow2]
    lda_dense = gensim.matutils.corpus2dense(lda_corpus, num_terms=TOPICS).transpose()
    """
    tfidf = models.TfidfModel(bow_corpus)
    tfidf_corpus = [tfidf[vector] for vector in bow_corpus]
    tfidf_dense = gensim.matutils.corpus2dense(tfidf_corpus, num_terms=len(dictionary)).transpose()
    """
    classifier = LogisticRegression()
    labels = load_labels()
    labels2 = labels
    labels2 += labels2
    labels2 += labels2
    labels2 += labels2
    classifier.fit(lda_dense, labels2)
    joblib.dump(classifier, CLASSIFIER, compress=9)
    #print "LDA results"
    probs = classifier.predict_proba(lda_dense)
Example #23
 def SNAP_generateLDAForTopic(self, topic, numTopics = 5):
   if (topic == 'all'):
     topics = ['syria', 'ufo', 'movie', 'celebrity', 'russia'] # bieber, cyrus
     for t in topics:
       for nt in [5, 10]:
         self.SNAP_generateLDAForTopic(t, nt)
     return
   id2word = self.SNAP_id2word()
   mmPath = os.path.join(
     os.path.dirname(os.path.abspath(__file__)),
     'snap_data',
     "gensim_snap_mmcorpus_%s.mm" % topic
   )
   outPath = os.path.join(
     os.path.dirname(os.path.abspath(__file__)),
     'snap_data',
     "gensim_snap_lda_%s_%d" % (topic, numTopics)
   )
   mm = MmCorpus(mmPath)
   lda = LdaModel(corpus=mm, id2word=id2word, num_topics=numTopics, update_every=1, chunksize=10000, passes=1)
   lda.save(outPath)
   return
Example #24
def build_lda_model(corpus, dictionary, num_topics=10):
    file_name = None

    if corpus == None:
        corpus = get_corpus()
    if dictionary == None:
        dictionary = get_dictionary()

    if num_topics == 10:
        file_name = LDA_FILE_10
    elif num_topics == 30:
        file_name = LDA_FILE_30
    elif num_topics == 60:
        file_name = LDA_FILE_60
    elif num_topics == 120:
        file_name = LDA_FILE_120
    else:
        raise ValueError("bad number of topics")
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, update_every=1, chunksize=100, passes=1)
    lda.save(file_name)
    for topic in range(10):
        print "Topic {0}: {1}".format(topic, lda.print_topic(topic))
    return lda
Example #25
File: LDA.py  Project: pbamotra/cgrnnlm
def train(refresh=True):
    if refresh:
        ptb = BracketParseCorpusReader(Corpus.DATA_DIR, Corpus.FILE_PATTERN)
        train_folders = [str(i) + str(j) for i in range(2) for j in range(10)]
        train_folders += [str(i) + str(j) for i in range(2, 3) for j in range(5)]

        dictionary = corpora.dictionary.Dictionary()
        train_documents = list()

        logger.debug('Starting to parse training documents')
        for folder in train_folders:
            for ptb_file in os.listdir(os.path.join(Corpus.DATA_DIR, folder)):
                document_sentences = ptb.sents(fileids=[os.path.join(folder, ptb_file)])
                if len(document_sentences) > DOC_LEN_THRESHOLD:
                    doc2sentence = list(chain.from_iterable(document_sentences))
                    doc2sentence = clean_text(doc2sentence)
                    dictionary.add_documents([doc2sentence])
                    train_documents.append(doc2sentence)
        logger.debug('Parsed all training documents')

        dictionary.filter_extremes(no_below=1, no_above=0.5)
        dictionary.save(DICTIONARY_FILE)

        logger.debug('Creating corpus for training data')
        corpus = [dictionary.doc2bow(text) for text in train_documents]
        logger.debug('Finished creating corpus')

        logger.debug('Training LDA model on corpus')
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=N_TOPICS, passes=20)
        logger.debug('Completed LDA training')

        lda.save(LDA_MODEL_FILE)
    else:
        dictionary = corpora.dictionary.Dictionary.load(DICTIONARY_FILE)
        lda = LdaModel.load(LDA_MODEL_FILE)

    return lda, dictionary
import numpy as np
import time

corpusType = "sraa2_"
subDirectory = 'run_sraa'
t1 = time.time()

corpus = corpora.MmCorpus(subDirectory + '/' + corpusType + 'corpus.mm')
dictionary = corpora.dictionary.Dictionary.load(subDirectory + '/' +
                                                corpusType + 'dictionary.dict')
classes = np.loadtxt(subDirectory + '/' + corpusType + 'classes.dat',
                     dtype=int)

t2 = time.time()
print 'data loaded ... seconds: ',
print(t2 - t1)

ldaModel = LdaModel(corpus, num_topics=30, id2word=dictionary, passes=20)
ldaModel.save(subDirectory + '/' + corpusType + 'sraa.lda_model')

t3 = time.time()
print 'ldaModel is finished... seconds:',
print(t3 - t2)

tfidfModel = models.TfidfModel(corpus)
tfidfModel.save(subDirectory + '/' + corpusType + 'sraa.tfidf_model')

t4 = time.time()
print 'tfidfModel is finished... seconds:',
print(t4 - t3)
Example #27
for idx, doc in enumerate(allmydocs):
    # if idx > num_docs:
    #     break
    doc = doc.lower()
    doc = re.split(r' |, |\n|: |\(|\)', doc)  # escape the parentheses so they are treated as literal delimiters
    doc = [elt for elt in doc if elt is not None]
    tokens = []
    for words in doc:
        cleaned = ''.join([i for i in words if i.isalpha()])
        if cleaned not in stop_words and 2 < len(cleaned):
            tokens.append(cleaned)
    cleaned_docs.append(tokens[:])

# Create a corpus from a list of texts
common_dictionary = Dictionary(cleaned_docs)
common_corpus = [common_dictionary.doc2bow(text) for text in cleaned_docs]
random.shuffle(common_corpus)
train = common_corpus[:int(len(common_corpus)*0.8)]
test = common_corpus[int(len(common_corpus)*0.8):]

lda = LdaModel(common_corpus, num_topics=25, iterations=10000, eval_every=2, chunksize=10000, passes=10)


perplex = lda.log_perplexity(common_corpus)
print('perplex', perplex)

# Save model to disk.
temp_file = datapath("model")
lda.save(temp_file)

Example #28
class CustomLda(object):
    def __init__(self, data=None, dictionary=None):
        """ initialize, data should be provided, only when unpickling class object it is not needed!"""
        self.data = data
        self.model = None
        self.num_topics = None
        self.iterations = None
        self.random_state = None
        self.dictionary = dictionary
        if self.data is not None:
            if self.dictionary is None:
                self.dictionary = Dictionary(self.data)
            self.corpus = [self.dictionary.doc2bow(text) for text in self.data]
        else:
            self.dictionary = None
            self.corpus = None
        self.distributed = None
        self.chuncksize = None
        self.passes = None
        self.update_every = None
        self.alpha = None
        self.eta = None
        self.decay = None
        self.offset = None
        self.eval_every = None
        self.gamma_threshold = None
        self.minimum_probability = None
        self.ns_conf = None
        self.minimum_phi_value = None
        self.per_word_topics = None
        self.num_topics = None
        self.iterations = None
        self.random_state = None
        self.model = None
        self.coherence_model = None
        self.coherence = None
        self.coherence_type = None

    def train(self,
              num_topics,
              iterations=1500,
              random_state=1,
              distributed=False,
              chunksize=2000,
              passes=1,
              update_every=1,
              alpha='symmetric',
              eta=None,
              decay=0.5,
              offset=1.0,
              eval_every=10,
              gamma_threshold=0.001,
              minimum_probability=0.01,
              ns_conf=None,
              minimum_phi_value=0.01,
              per_word_topics=False,
              workers=1):
        """train lda model. If workers >1, goes multicore"""

        self.distributed = distributed
        self.chuncksize = chunksize
        self.passes = passes
        self.update_every = update_every
        self.alpha = alpha
        self.eta = eta
        self.decay = decay
        self.offset = offset
        self.eval_every = eval_every
        self.gamma_threshold = gamma_threshold
        self.minimum_probability = minimum_probability
        self.ns_conf = ns_conf
        self.minimum_phi_value = minimum_phi_value
        self.per_word_topics = per_word_topics
        self.num_topics = num_topics
        self.iterations = iterations
        self.random_state = random_state
        self.workers = workers

        if self.workers > 1:
            self.model = LdaMulticore(
                workers=self.workers,
                corpus=self.corpus,
                id2word=self.dictionary,
                iterations=self.iterations,
                num_topics=self.num_topics,
                random_state=self.
                random_state,  # distributed=self.distributed,
                chunksize=self.chuncksize,
                passes=self.passes,  # update_every= self.update_every,
                alpha=self.alpha,
                eta=self.eta,
                decay=self.decay,
                offset=self.offset,
                eval_every=self.eval_every,
                gamma_threshold=self.gamma_threshold,
                minimum_probability=self.
                minimum_probability,  # ns_conf=self.ns_conf,
                minimum_phi_value=self.minimum_phi_value,
                per_word_topics=self.per_word_topics)
        else:
            self.model = LdaModel(corpus=self.corpus,
                                  id2word=self.dictionary,
                                  iterations=self.iterations,
                                  num_topics=self.num_topics,
                                  random_state=self.random_state,
                                  distributed=self.distributed,
                                  chunksize=self.chuncksize,
                                  passes=self.passes,
                                  update_every=self.update_every,
                                  alpha=self.alpha,
                                  eta=self.eta,
                                  decay=self.decay,
                                  offset=self.offset,
                                  eval_every=self.eval_every,
                                  gamma_threshold=self.gamma_threshold,
                                  minimum_probability=self.minimum_probability,
                                  ns_conf=self.ns_conf,
                                  minimum_phi_value=self.minimum_phi_value,
                                  per_word_topics=self.per_word_topics)
        print('Trained!')

    def _train_coherence_model(self, coherence_type='u_mass'):
        """could be made on top of model to get coherence, type could be 'u_mass' or 'c_v'"""
        self.coherence_model = CoherenceModel(model=self.model,
                                              texts=self.data,
                                              dictionary=self.dictionary,
                                              coherence=coherence_type)

    def _calculate_coherence(self, coherence_type='u_mass'):
        self._train_coherence_model(coherence_type=coherence_type)
        self.coherence = self.coherence_model.get_coherence()

    def get_coherence(self, coherence_type='u_mass'):
        if coherence_type != self.coherence_type:
            self._calculate_coherence(coherence_type=coherence_type)
        return self.coherence

    def get_topic_terms(self, num, topn=10):
        return self.model.get_topic_terms(num, topn=topn)

    def get_preplexity(self):
        return self.model.log_perplexity(self.corpus)

    def get_topics(self, num):
        return self.model.show_topics(num)

    def _make_visualization(self):
        """prepare visualisation for display/saving"""
        return pyLDAvis.gensim.prepare(self.model,
                                       self.corpus,
                                       self.dictionary,
                                       sort_topics=False)

    def display(self):
        """display LDAvis in notebook"""
        visualisation = self._make_visualization()
        return pyLDAvis.display(visualisation)

    def save_ldavis(self, filename='topic.html'):
        """save LDAvis to .html"""
        ldavis = self._make_visualization()
        pyLDAvis.save_html(ldavis, filename)

    def save_lda(self, filename):
        """save lda model only"""
        self.model.save(filename)

    def pickle(self, filename):
        """save class instance to file"""
        f = open(filename, 'wb')
        pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
        f.close()

    @staticmethod
    def unpickle(filename):
        """read class instance from file"""
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def predict_topic(self, doc_list):
        """predict topic of document list (consists of strings"""
        topic_list = []
        for doc in doc_list:
            bow = self.dictionary.doc2bow(str(doc).split())
            topics_probs = self.model.get_document_topics(bow)
            topics_probs.sort(key=lambda tup: tup[1], reverse=True)
            topic_list.append(topics_probs)
        return topic_list
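# Usage sketch (not part of the original code): train a small single-core model
# with the wrapper above, inspect coherence, and persist both the gensim model
# and the wrapper instance itself.
docs = [['alpha', 'beta', 'gamma'], ['beta', 'delta'], ['alpha', 'delta', 'epsilon']]
custom = CustomLda(data=docs)
custom.train(num_topics=2, iterations=50, passes=2, workers=1)
print(custom.get_coherence('u_mass'))
print(custom.get_topic_terms(0, topn=3))
custom.save_lda('custom_lda.model')
custom.pickle('custom_lda.pkl')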
Example #29
File: lda.py  Project: pmantica1/lda
from gensim.models.ldaseqmodel import LdaSeqModel
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from nltk import word_tokenize
from tqdm import tqdm
from csv import DictReader
from collections import defaultdict
import pprint

pp = pprint.PrettyPrinter(indent=4)

id2word = corpora.Dictionary.load('tokens.dict')
mm = corpora.MmCorpus('messages.mm')
ldaseq = LdaModel(corpus=mm, id2word=id2word, num_topics=15)
pp.pprint(ldaseq.print_topics())
ldaseq.save("lda_model")
import logging
import gensim
from gensim.test.utils import datapath
from gensim.models.ldamodel import LdaModel

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

id2word = gensim.corpora.Dictionary.load_from_text(
    'WikiCorpus/wiki_wordids.txt')
mm = gensim.corpora.MmCorpus('WikiCorpus/wiki_tfidf.mm')
print(mm)

lda = LdaModel(corpus=mm,
               id2word=id2word,
               num_topics=130,
               update_every=1,
               passes=1)

model_location = datapath("D:/HazMat/Projects/ML/Models/model_130")
lda.save(model_location)
Example #31
class Topics:
    def __init__(self, data, num_topics, itertations):
        '''
		initializes a Topics instance and tokenizes the input document
		'''
        # self.text = open(filename, 'r').read()
        # self.filename = filename[0:-4]

        self.wholeDocument = data

        # if not os.path.exists(self.output_directory):
        # 	os.makedirs(self.output_directory)

        # self.filename = filename
        if isinstance(data, dict):
            self.Document = data
            self.text = ''
            for key in data:
                self.text += data[key] + ' '

        else:
            self.text = data

        self.languageModel = spacy.load('en')
        self.lemmatizer = WordNetLemmatizer()
        self.en_stop = set(nltk.corpus.stopwords.words('english')).union(
            set(['the', 'The']))
        self.num_topics = num_topics
        self.itertations = itertations

        self.tokenize()
        self.vocab_size = len(self.document)

    def tokenize(self):
        '''
		function that returns all the sentences in the document after:
			1- lemmatizing every word
			2- removing stop words
			3- removing short words
		'''
        self.document = self.languageModel(self.text)
        self.sentences = []
        # counter = 0
        for sent in self.document.sents:
            sentence = [
                self.lemmatizer.lemmatize(word.lower())
                for word in sent.text.split()
                if (word.lower() not in self.en_stop and len(word) >= 3)
            ]
            self.sentences.append(sentence)
            # counter+=1
        # print(counter)
        # print(len(self.sentences))

    def tokenize_testDoc(self, testDoc):
        '''
		function that returns all the sentences in a test document in the same way as tokenize.
		'''
        # text = open(testDoc, 'r').read()
        text = testDoc
        document = self.languageModel(text)
        sentences = []
        for sent in document.sents:
            sentence = [
                self.lemmatizer.lemmatize(word.lower())
                for word in sent.text.split()
                if word.lower() not in self.en_stop and len(word) > 3
            ]
            sentences.append(sentence)
        return sentences

    def get_model(self):
        '''
		function that builds an LDA model for the whole document being summarized
		the function also saves the model based on the document's name
		'''
        dictionary = corpora.Dictionary(self.sentences)
        corpus = [dictionary.doc2bow(sentence) for sentence in self.sentences]
        self.ldamodel = LdaModel(corpus,
                                 num_topics=self.num_topics,
                                 id2word=dictionary,
                                 passes=self.itertations,
                                 random_state=0)

        self.print_model()

    def print_model(self):
        '''
		function that prints topic distributions over all words in the original document
		and saves them into a json file
		'''
        topics = self.ldamodel.print_topics(num_words=self.vocab_size)

        self.topics_dict = {}
        for topic in topics:
            topic_nb = topic[0]
            distribution = topic[1].split('+')
            topic_dict = {}
            for word_prob in distribution:
                word_prob_ = word_prob.split('*')
                if len(word_prob_) > 1:
                    topic_dict[word_prob_[1][1:-2]] = float(word_prob_[0])

            self.topics_dict[topic_nb] = topic_dict

        # with open(self.output_directory + self.filename + '_topics_dict.json', 'w') as output:
        # 	json.dump(self.topics_dict, output)

        # for key in self.topics_dict:
        # 	print("Topic ", key, " Distribution: ", self.topics_dict[key], '\n')

    def save_model(self):
        '''
		function that saves an LDA model
		'''
        self.ldamodel.save(self.output_directory + self.filename +
                           '_ldaModel.gensim')

    def load_model(self):
        '''
		function that loads an already existing LDA model
		'''
        self.ldamodel = LdaModel.load(self.output_directory + self.filename +
                                      '_ldaModel.gensim')

    def get_topic_dist(self, doc):
        '''
		function that can be used to get the topic distribution for every paragraph
	 	of the original document
		'''
        paragraph = self.tokenize_testDoc(doc)
        # print(len(paragraph))
        '''
		reuse the dictionary built from the original document;
		for our case this dict should cover all the paragraphs
		'''
        dictionary = corpora.Dictionary(self.sentences)
        corpus = [dictionary.doc2bow(sentence) for sentence in paragraph]
        dists = []
        for i, cor in enumerate(corpus):
            dist = self.ldamodel[cor]
            dists.append(dist)

        # print(dists)
        return dists

    def get_coverage(self):
        '''
		this function finds how much each topic is covered in every paragraph of the document
		'''
        self.coverage = {}
        for paragraph in list(self.Document.keys()):
            if len(self.Document[paragraph]) != 0:
                self.coverage[paragraph] = self.get_paragraph_coverage(
                    paragraph)
            else:
                self.coverage[paragraph] = np.zeros([self.num_topics]).tolist()

        # self.print_coverage()

        # with open(self.output_directory + self.filename + '_paragraph_coverage_dict.json', 'w') as output:
        # 	json.dump(self.coverage, output)

        return self.coverage

    def get_paragraph_coverage(self, paragraph_key):
        '''
		this function finds topic coverage per paragraph 
		by averaging over topic coverage per sentence

		'''
        paragraph = self.Document[paragraph_key]
        dists = self.get_topic_dist(paragraph)
        np_dist = []
        for dist in dists:
            np_sent_dist = []
            if len(dist) == self.num_topics:
                for tup in dist:
                    np_sent_dist.append(tup[1])
                np_dist.append(np_sent_dist)
            else:
                np_sent_dist = np.zeros([self.num_topics])
                for tup in dist:
                    np_sent_dist[int(tup[0])] = tup[1]
                np_dist.append(np_sent_dist.tolist())

        paragraph_coverage = np.average(np.array(np_dist), axis=0)
        return paragraph_coverage.tolist()
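# Usage sketch (not part of the original code; assumes spaCy's 'en' model and the
# NLTK stopword/WordNet data required by the class above are installed):
paragraphs = {'p1': 'Cats are small domesticated animals that often purr loudly.',
              'p2': 'Football is a popular team sport played around the world.'}
doc_topics = Topics(paragraphs, num_topics=2, itertations=10)
doc_topics.get_model()                    # builds the LDA model and its topic dict
print(doc_topics.get_coverage())          # per-paragraph topic coverage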
Example #32
			run_id = "ldaU_K{K}_a{alpha_frac}-K_b{beta}_iter{iter}.gensim".format(K=num_topics, alpha_frac=alpha_frac, beta=beta, iter=num_iterations)
			print run_id

			output_file = output_file_template.format(run_id=run_id)

			# Train and save
			print 'Training...'
			model = LdaModel(corpus, 
				alpha=alpha, eta=beta,
				id2word=dictionary, num_topics=num_topics, iterations=num_iterations
			)
			# model = LdaMulticore(corpus, 
			# 	alpha=alpha, eta=beta,
			# 	id2word=dictionary, num_topics=num_topics, iterations=num_iterations, workers=2
			# )
			print 'Done training.'
			model.save(output_file)

			# Print top 10 words in topics, if desired
			if print_topics:
				topics = model.show_topics(num_topics=100, formatted=False)
				for topic in topics:
					for tup in topic[1]:
						print tup[0] + ": " + str(tup[1])
					print '\n'

			# Evaluate perplexity
			ll = model.log_perplexity(test_corpus)
			print "LL:   "+str(ll)
			print "Perp: "+str(np.exp2(-ll))
		def fetch_model(dictionary):
			corpus=my_corpus
			lda = LdaModel(corpus,num_topics=100,update_every=1,chunksize=1000,passes=15)
			#lda = LdaModel(corpus,num_topics=100,id2word=dictionary,update_every=1,chunksize=1000,passes=50)
			lda.save('Topic/lda.loc')
			return lda
Example #34
def train_lda(corpus, vocab_dict, n_topics, n_iter, save_model):
    lda = LdaModel(corpus, num_topics=n_topics, id2word=vocab_dict, \
        passes=n_iter, minimum_probability=1e-3)
    lda.save(save_model)

    return lda
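# Usage sketch (not part of the original code): train and persist a small model.
from gensim import corpora

texts = [['data', 'science', 'topics'], ['machine', 'learning', 'models']]
vocab_dict = corpora.Dictionary(texts)
corpus = [vocab_dict.doc2bow(text) for text in texts]
lda = train_lda(corpus, vocab_dict, n_topics=5, n_iter=10, save_model='lda_5.model')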
Example #35
data = pd.read_csv('nyt.csv')

text_clean = []

for text in data['News_content']:
    text_clean.append(pptext(text).split())

print(text_clean[:3])

dictionary = Dictionary(text_clean)
corpus = [dictionary.doc2bow(text) for text in text_clean]
pickle.dump(corpus, open('topicModels//corpus2.pkl', 'wb'))
dictionary.save('topicModels//dictionary2.gensim')

ldamodel = LdaModel(corpus, num_topics=15, id2word=dictionary, passes=15)
ldamodel.save('topicModels//model15.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

# Visaulization works only on Jupyter Notebook
# type jupyter notebook or jupyter console
dictionary = Dictionary.load('topicModels//dictionary2.gensim')
corpus = pickle.load(open('topicModels//corpus2.pkl', 'rb'))
ldamd = LdaModel.load('topicModels//model15.gensim')

lda_display = pyLDAvis.gensim.prepare(ldamd,
                                      corpus,
                                      dictionary,
                                      sort_topics=False)
pyLDAvis.display(lda_display)
Example #36
dictionary = gensim.corpora.Dictionary.load_from_text(
    bz2file.BZ2File('wiki_wordids.txt.bz2'))
corpus = gensim.corpora.MmCorpus('wiki_tfidf.mm')

with open('mcdi_word.csv', 'rb') as f:
    reader = csv.reader(f)
    wlist = []
    for row in reader:
        wlist.append(row)

idlist = []

for row in wlist:
    idrow = []
    for key in dictionary.iteritems():
        if key[1].encode('utf-8') in row:
            idrow.append(key[0])
    idlist.append(idrow)

a = 0.05
ntopic = 75
eta_arr = ones((ntopic, len(dictionary))) * 0.5
for x in range(0, len(idlist)):
    for id in idlist[x]:
        eta_arr[x, id] *= 100000

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=ntopic, eta=eta_arr)  # use the seeded topic-word prior built above
topiclist = lda.print_topics(num_topics=75, num_words=50)
lda.save('wiki_file_75.model')
Example #37
        #result_stem = [stemmer.stem(item) for item in result_lower]
        result_clean = [
            item for item in result_lower
            if '\'' not in item and '_' not in item and len(item) > 1
        ]
        result = [item for item in result_clean if item not in sw]
        filew.extend(result)
    resultlist.append(filew)
print(resultlist[0])

dictionary = corpora.Dictionary(resultlist)
corpus = [dictionary.doc2bow(text) for text in resultlist]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=50)
topiclist = lda.print_topics(num_topics=50, num_words=50)
lda.save('childs_file_50.model')
'''
aniP = lda.get_document_topics(animal)
print aniP

rlist = []
for topic in topiclist:
    rlist.append(topic[1].split("+"))

nlist = []
for n in rlist:
    xlist = []
    for k in n:
        k = k.split("*")
        k = k[1].encode('utf-8')
        k = k.split('\"')[1]
class TextProcessor:
    def __init__(self, n_users, n_samples, n_dims):
        self.nUsers, self.nSamples, self.nDims = n_users, n_samples, n_dims
        self.tfIdfModel = self.lsiModel = self.ldaModel = self.w2vModel = self.dictionary = None

        self.dictPath, self.tfIdfPath, self.lsiPath, self.ldaPath, self.w2vPath, self.w2vVecPath =\
            conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='dict'), \
            conf.get_filename_via_tpl('model', model_type='tfidf', n_users=n_users, n_samples=n_samples, model_filename='tfidf'),\
            conf.get_filename_via_tpl('model', model_type='lsi', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lsi_model'), \
            conf.get_filename_via_tpl('model', model_type='lda', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='lda_model'),\
            conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='w2vmodel'), \
            conf.get_filename_via_tpl('model', model_type='w2v', n_users=n_users, n_samples=n_samples, n_dims=n_dims, model_filename='vec.txt')

    def load_model(self, model_type):
        model = None
        try:
            if model_type == 'tfidf':
                model = TfidfModel.load(self.tfIdfPath, mmap='r')
                self.tfIdfModel = model
            elif model_type == 'lsi':
                model = LsiModel.load(self.lsiPath, mmap='r')
                self.lsiModel = model
            elif model_type == 'lda':
                model = LdaModel.load(self.ldaPath, mmap='r')
                self.ldaModel = model
            elif model_type == 'w2v':
                model = Word2Vec.load(self.w2vPath, mmap='r')
                self.w2vModel = model
            else:
                logger.error('Model type error. Unexpected %s' % model_type)
                return None

            if self.dictionary is None and os.path.exists(self.dictPath):
                self.dictionary = corpora.Dictionary.load(self.dictPath)

            logger.info('%s model loaded completely.' % model_type)
        except IOError:
            logger.error(
                'The %s model doesn\'t exist. Please train the model before loading it.'
                % model_type)
        finally:
            return model

    def tf_idf_transform(self, doc):
        """
        Perform tf-idf transformation on doc.
        """
        self.dictionary = corpora.Dictionary(doc)
        corpus = [self.dictionary.doc2bow(text) for text in doc]
        self.tfIdfModel = TfidfModel(corpus)

        conf.mk_dir(self.tfIdfPath)

        self.dictionary.save(self.dictPath)
        logger.info('Dictionary has been saved in %s.' % self.dictPath)

        self.tfIdfModel.save(self.tfIdfPath)
        logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)

        tfidf_corpus = self.tfIdfModel[corpus]
        tfidf_corpus_path = conf.get_filename_via_tpl('tfidf',
                                                      n_users=self.nUsers,
                                                      postfix='mm',
                                                      n_samples=self.nSamples)
        corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
        logger.info('TF-IDF corpus with a shape of %s has been saved in %s.' %
                    (np.array(tfidf_corpus).shape, tfidf_corpus_path))

        return tfidf_corpus

    def lsi_transform(self, corpus_tf_idf):
        logger.info('Training lsi model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        self.lsiModel = LsiModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)
        # print self.lsiModel[corpus]

        conf.mk_dir(self.lsiPath)

        self.lsiModel.save(self.lsiPath)
        logger.info('Lsi model has been saved in %s.' % self.lsiPath)

        lsi_corpus = self.lsiModel[corpus_tf_idf]
        lsi_corpus_path = conf.get_filename_via_tpl('lsi',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims,
                                                    postfix='mm')
        conf.mk_dir(lsi_corpus_path)
        corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
        logger.info('Lsi corpus with a shape of %s has been saved in %s.' %
                    (np.array(lsi_corpus).shape, lsi_corpus_path))

        return lsi_corpus

    def lda_transform(self,
                      corpus_tf_idf,
                      train_separated=False,
                      is_update=False):
        """
        Init an LDA model with n_topics (default 500), fit it with corpus_tf_idf, and transform the corpus.
        :param corpus_tf_idf: Corpus that has already been transformed into a tf-idf matrix.
        :param train_separated: Whether the model is trained on the whole corpus at once or on parts of it separately.
        :param is_update: Whether training constructs a new model or updates an existing one.
        :return: lda corpus.
        """
        logger.info('Training lda model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        if is_update:
            # A ldaModel had been trained before and now update the model with other corpus.
            if self.ldaModel is None:
                self.load_model('lda')
            self.ldaModel.update(corpus_tf_idf)
            logger.info('Lda model has been updated successfully.')
            return self.ldaModel[corpus_tf_idf]

        if train_separated:
            # corpus = []
            # spacing = 10000
            # for i in range(int(len(corpus_tf_idf)/spacing)):
            #     corpus.append(corpus_tf_idf[i*spacing: i])
            # self.ldaModel = LdaModel()
            pass

        self.ldaModel = LdaModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)

        conf.mk_dir(self.ldaPath)
        self.ldaModel.save(self.ldaPath)
        logger.info('lda model has been saved in %s' % self.ldaPath)

        lda_corpus = self.ldaModel[corpus_tf_idf]
        lda_corpus_path = conf.get_filename_via_tpl('lda',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims,
                                                    postfix='mm')
        conf.mk_dir(lda_corpus_path)
        corpora.MmCorpus.serialize(lda_corpus_path, lda_corpus)
        logger.info('Lda corpus with a shape of %s has been saved in %s.' %
                    (np.array(lda_corpus).shape, lda_corpus_path))

        return lda_corpus

    def w2v_transform(self, sentences):
        """
        Perform word2vec on texts and obtain a w2v model.
        :param sentences: Sentences that each one of it contains a list of words of a text.
        :return: W2v model.
        """
        logger.info('Training w2v model with a dim of %d...' % self.nDims)
        # file = open(infile_path, 'r', encoding='utf-8') if infile_path.find('\n') < 0 else StringIO(infile_path)
        # sentences = []
        # for sen in file.readlines():
        #     sentences.append(sen.strip().split(' '))
        # print(sentences)
        self.w2vModel = Word2Vec(sentences, size=self.nDims, min_count=0)

        conf.mk_dir(self.w2vPath)
        self.w2vModel.save(self.w2vPath)
        self.w2vModel.wv.save_word2vec_format(self.w2vVecPath, binary=False)
        # print(model['['])

        # Construct w2v corpus
        w2v_corpus = []
        for sen in sentences:
            vec = [0] * self.nDims
            if len(sen) > 0:
                for word in sen:
                    vec = list(
                        map(lambda m, n: m + n, vec, self.w2vModel[word]))
                    # vec += self.w2vModel[word]
            w2v_corpus.append(vec)

        w2v_corpus_path = conf.get_filename_via_tpl('w2v',
                                                    n_users=self.nUsers,
                                                    n_samples=self.nSamples,
                                                    n_dims=self.nDims)
        conf.mk_dir(w2v_corpus_path)

        with open(w2v_corpus_path, 'w') as fp:
            csv_writer = csv.writer(fp)
            for line in w2v_corpus:
                csv_writer.writerow(line)
        logger.info('W2v corpus has been saved in %s. ' % w2v_corpus_path)

        return w2v_corpus

    def load_corpus(self, model_type, dense=False):
        corpus = None
        try:
            if model_type == 'tfidf':
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl('tfidf',
                                              n_users=self.nUsers,
                                              postfix='mm',
                                              n_samples=self.nSamples))
            elif model_type in ['lsi', 'lda']:
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl(model_type,
                                              n_users=self.nUsers,
                                              n_samples=self.nSamples,
                                              n_dims=self.nDims,
                                              postfix='mm'))
            elif model_type == 'w2v':
                corpus = np.loadtxt(conf.get_filename_via_tpl(
                    model_type,
                    n_users=self.nUsers,
                    n_samples=self.nSamples,
                    n_dims=self.nDims),
                                    dtype=np.float,
                                    delimiter=',')

            logger.info('%s corpus with a shape of %s has been loaded. ' %
                        (model_type, np.array(corpus).shape))

            if dense and model_type in ['tfidf', 'lsi', 'lda']:
                corpus = matutils.corpus2dense(corpus,
                                               self.nDims,
                                               self.nSamples * self.nUsers,
                                               dtype=np.float).T
            else:
                corpus = np.array(corpus)
        except Exception as e:
            raise e
        return corpus

    @staticmethod
    def corpus2dense(corpus, n_terms, n_docs=conf.N_SAMPLES, dtype=np.float):
        return matutils.corpus2dense(corpus, n_terms, n_docs, dtype).T

    def load_vec(self, vec_type):
        logger.info('Loading %s vectors...' % vec_type)
        try:
            corpus_vec = self.load_corpus(vec_type, True)
        except Exception as e:
            raise e
        data = []
        for i in range(self.nUsers):
            data.append(corpus_vec[i * self.nSamples:(i + 1) * self.nSamples])
        data = np.array(data, dtype=np.float)
        return data
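# Usage sketch (hypothetical; the project-specific conf module, with
# get_filename_via_tpl, mk_dir and N_SAMPLES, is assumed to be configured):
docs = [['user', 'posts', 'about', 'music'], ['another', 'user', 'posting', 'text']]
tp = TextProcessor(n_users=2, n_samples=1, n_dims=10)
tfidf_corpus = tp.tf_idf_transform(docs)     # also saves the dictionary and tf-idf model
lda_corpus = tp.lda_transform(tfidf_corpus)  # trains/saves the LDA model
w2v_corpus = tp.w2v_transform(docs)          # trains/saves the word2vec model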
Example #39
File: train.py  Project: biddyweb/news-1
if len(sys.argv) != 2:
    print 'Usage: {0} rcv1_data_dir'.format(sys.argv[0])
    raise SystemExit(1)

data_dir = sys.argv[1]
mapping_file = data_dir+'/token_id_idf'
dictionary_file = data_dir+'/id_token_df'
token_file = data_dir+'/tokens'
lda_file = data_dir+'/lda_model'

print 'creating dictionary...'
N = 23307  # supplied idfs from rcv1/lyrl2004 were based on 23307 training docs
create_dictionary_file(mapping_file,dictionary_file,23307)
dictionary = Dictionary.load_from_text(dictionary_file)

print 'creating corpus...'
corpus = SimpleLowCorpus(token_file,dictionary)

print 'training model...'
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
lda = LdaModel(corpus,id2word=dictionary,num_topics=200)
print 'done!'
print '\n'*3
print '======final topics======'
topics = lda.show_topics(topics=-1,topn=4)
for i,topic in enumerate(topics):
    print i,topic

print 'saving model...'
lda.save(lda_file)
from gensim import corpora,models, similarities
import numpy as np
import time

corpusType = "sraa2_";
subDirectory = 'run_sraa'
t1 = time.time()

corpus = corpora.MmCorpus(subDirectory+'/'+corpusType+'corpus.mm')
dictionary = corpora.dictionary.Dictionary.load(subDirectory+'/'+corpusType+'dictionary.dict')
classes = np.loadtxt(subDirectory+'/'+corpusType+'classes.dat',dtype=int)

t2 = time.time()
print 'data loaded ... seconds: ',
print (t2-t1)

ldaModel = LdaModel(corpus, num_topics=30, id2word = dictionary, passes=20)
ldaModel.save(subDirectory+'/'+corpusType+'sraa.lda_model')

t3 = time.time()
print 'ldaModel is finished... seconds:',
print (t3-t2)

tfidfModel = models.TfidfModel(corpus)
tfidfModel.save(subDirectory+'/'+corpusType+'sraa.tfidf_model')

t4 = time.time()
print 'tfidfModel is finished... seconds:',
print (t4-t3)
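The class labels loaded above are not used in this excerpt; the following is a hedged sketch of how the 30-dimensional LDA features might feed a simple classifier. scikit-learn is an assumption here, not something the original script imports.

# Sketch only: turn the LDA corpus into a dense (n_docs, 30) matrix and fit a
# classifier on the class labels loaded above. sklearn is assumed, not original.
from gensim import matutils
from sklearn.linear_model import LogisticRegression

lda_features = matutils.corpus2dense(ldaModel[corpus], num_terms=30).T
clf = LogisticRegression().fit(lda_features, classes)
print(clf.score(lda_features, classes))  # training accuracy, as a rough sanity check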

Example #41
def create_models(db, lsi_num_topics=10, lda_num_topics=5, num_bars=None):
    """
    Create and save a lsi object
    using data in the database.
    Save this object, along with the
    dictionary and the corpus, to disk
    """

    bars = db['bars']

    # Note: the original query repeated the 'foursquare.tips' key, so the
    # '$exists' condition was silently overwritten; combine both conditions.
    query = { 'nymag.review' : {'$ne': None},
              'foursquare.tips' : {'$exists': True, '$ne': None} }
    if num_bars is None:
        locations = bars.find(query)
    else:
        locations = bars.find(query).limit(num_bars)

    ignorechars = '''!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'''
    stopwords = get_stopwords()

    texts = []
    bar_idx_map = {}
    idx_bar_map = {}

    save_directory = "assets/"

    print "Fetching texts from database and tokenizing"
    for idx, location in enumerate(locations):
        bar_name = location['nymag']['name']
        bar_idx_map[bar_name] = idx
        idx_bar_map[int(idx)] = bar_name
        text = create_string_from_database(location)
        tokens = tokenize_document(text, stopwords, ignorechars)
        texts.append(tokens)

    # Do some cleaning
    print "Cleaning texts"
    texts = remove_words_appearing_once(texts)

    # Create the counter
    word_counts = Counter()
    for text in texts: 
        word_counts.update(text)       

    # Create and save the dictionary
    print "Creating dictionary"
    dictionary = corpora.Dictionary(texts)
    dictionary.save(save_directory + 'keywords.dict')

    # Create and save the corpus
    print "Creating Corpus matrix"
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(save_directory + 'corpus.mm', corpus) 

    # Term Frequency, Inverse Document Frequency
    print "Applying TFIDF"
    tfidf = models.TfidfModel(corpus) 
    tfidf.save(save_directory + "tfidf.model")

    # Map TFIDF on the corpus
    print "Mapping TFIDF on corpus"
    corpus_tfidf = tfidf[corpus]
    corpora.MmCorpus.serialize(save_directory + 'corpus_tfidf.mm', corpus_tfidf) 

    # Create the LSI
    print "Creating LSI with %s topics" % lsi_num_topics
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=lsi_num_topics) 
    lsi.save(save_directory + 'lsi.model')

    # Map LSI on the corpus
    corpus_lsi_tfidf = lsi[corpus_tfidf]
    corpora.MmCorpus.serialize(save_directory + 'corpus_lsi_tfidf.mm', corpus_lsi_tfidf)

    # Create the index
    #index = similarities.MatrixSimilarity(lsi[corpus_tfidf])
    index = similarities.MatrixSimilarity(corpus_lsi_tfidf)
    index.save(save_directory + 'lsi_tfidf.index')

    # Create the LDA (on the raw corpus)
    print "Creating LDA with %s topics" % lda_num_topics
    lda = LdaModel(corpus, num_topics=lda_num_topics, id2word=dictionary, 
                   update_every=0, passes=30)
    #lda.show_topics(10, 20, formatted=False)
    lda.save(save_directory + 'lda.model')

    # Create the lda corpus
    corpus_lda = lda[corpus]
    corpora.MmCorpus.serialize(save_directory + 'corpus_lda.mm', corpus_lda)

    # Save some additional info
    with open(save_directory + 'bar_idx_map.json', 'wb') as fp:
        json.dump(bar_idx_map, fp)

    with open(save_directory + 'idx_bar_map.json', 'wb') as fp:
        json.dump(idx_bar_map, fp)

    with open(save_directory + 'model_info.txt', 'wb') as fp:
        ts = time.time()
        st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
        fp.write("LSI Model %s\n" % st)
        info = "Number of Docs: %s\n" % len(corpus)
        info += "Number of key words: %s\n" % len(dictionary)
        info += "Number of LSI Topics: %s\n" % lsi_num_topics
        info += "Number of LDA Topics: %s\n" % lda_num_topics
        fp.write(info)

        # Print Stop Words
        words_per_row = 10
        fp.write("\n\nStop Words:\n")
        for i, word in enumerate(stopwords):
            if i % words_per_row == 0:
                fp.write('\n')
            fp.write("%s " % word)

        # Print Corpus
        fp.write("\n\nWords Encountered:\n")
        for word, count in word_counts.most_common():
            idx = dictionary.token2id[word]
            line = "%s (index=%s) freq: %s \n" % (word, idx, count)
            fp.write(line)
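A minimal query sketch against the assets written by create_models; this is not part of the original function, and the query string is made up.

# Sketch: load the saved assets and find bars similar to a free-text query.
dictionary = corpora.Dictionary.load('assets/keywords.dict')
tfidf = models.TfidfModel.load('assets/tfidf.model')
lsi = models.LsiModel.load('assets/lsi.model')
index = similarities.MatrixSimilarity.load('assets/lsi_tfidf.index')

query_bow = dictionary.doc2bow("quiet bar with a good whiskey list".lower().split())
sims = index[lsi[tfidf[query_bow]]]
print sorted(enumerate(sims), key=lambda x: -x[1])[:5]  # five most similar bars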
Example #42
#filename = 'data/lkmlLinusAll.txt'


with open(filename, encoding='utf-8') as f:
    documents = f.readlines()

texts = [[word for word in document.lower().split()
        if word not in STOPWORDS and word.isalnum()]
        for document in documents]

dictionary = corpora.Dictionary(texts)
dictionary.save('lkml.dict')
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('lkml.mm', corpus)

lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics, passes=passes)
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(lda.print_topics(num_words=num_words))
lda.save('lkml.gensim')
#newlda = LdaModel.load('lkml.gensim', mmap='r')
#pp.pprint(lda.print_topics(num_words=num_words))


unseenText = 'data/lkmlSingleNewEmail.txt'
with open(unseenText, encoding='utf-8') as fnew:
    newdoc = fnew.read()

newcorpus = dictionary.doc2bow(newword for newword in newdoc.lower().split()
                                if newword not in STOPWORDS and newword.isalnum())
pp.pprint(lda[newcorpus])
Example #43
# How much does each bill fall under each topic?
def get_bill_topics(model):
    topics = pd.DataFrame()

    for row in model[corpus]:
        # Proportion that each document falls into each topic; use the row
        # directly instead of re-indexing the transformed corpus on every pass
        topics = pd.concat(
            [topics, pd.DataFrame(row).set_index(0)], axis=1)

    topics = topics.transpose().reset_index(drop=True)

    # Integer index of dominant topic
    dominant_topic = topics.idxmax(axis=1).rename('dominant_topic')

    # Percentage that document represents dominant topic
    max_perc = topics.max(axis=1, skipna=True).rename('max_perc')

    return pd.concat([
        bills[['session', 'bill_id', 'title', 'text']], dominant_topic,
        max_perc, topics
    ],
                     axis=1)


bill_topics = get_bill_topics(lda_model)

print('Exporting topic model...')
lda_model.save('models/topic_model_' + subject.lower())

print('Done!')
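A hedged sketch of reloading the exported model; subject and the models/ path come from the lines above, while the rest is illustrative only.

# Sketch: reload the exported topic model and recompute dominant topics.
from gensim.models.ldamodel import LdaModel

reloaded = LdaModel.load('models/topic_model_' + subject.lower())
bill_topics_again = get_bill_topics(reloaded)
print(bill_topics_again[['bill_id', 'dominant_topic', 'max_perc']].head())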
def make_lda_result():
    lda = LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=2, iterations=1000)

    # save LDA result
    lda.save(LDA_FILE)
Example #45
def get_params(files):
    print "Converting data to features..."
    tweets = imap(lambda f: open(f).read(), files)
    features = [to_features(tweet) for tweet in tweets]
    # features = json.load(open("models/lda_features.json"))

    print "Converting features to bag of words..."
    dictionary = corpora.Dictionary(features)
    corpus = [dictionary.doc2bow(text) for text in features]
    # corpus = json.load(open("models/lda_corpus.json"))

    return corpus, features, dictionary


if __name__ == "__main__":
    print "Loading file names..."
    files = glob.glob("../tweets/*")
    corpus, features, dictionary = get_params(files)

    print "Creating LDA Model..."
    lda = LdaModel(corpus, id2word=dictionary, num_topics=30, iterations=1000, alpha='auto', chunksize=50)
    lda_topic_distribution = [l for l in lda[corpus]]

    print "Saving model..."
    lda.save("lda_model_unigrams.dat")

    print "Saving distribution..."
    f = open("lda_topic_distribution.json", 'w')
    json.dump(lda_topic_distribution, f)
    f.close()
Example #46
for text in newTexts:
    for token in text:
        newfrequency[token] += 1

# In[87]:

logger.info("Generating topics from LDA")

num_topics = 100
model = LdaModel(num_topics=num_topics, corpus=corpus, id2word=dictionary, iterations=1500)
#model=LdaMulticore(num_topics=100,workers=3,corpus=corpus,id2word=dictionary,iterations=3000)
#model=HdpModel(corpus=corpus, id2word=dictionary)

# In[94]:

model.save('cache/model.pkl')


# In[96]:

cursor = db.movies.find({},{"movieId":1})
movieId=[]
for doc in cursor:
    movieId.append(doc['movieId'])


# In[97]:

movieDict={}
for i,val in enumerate(movieId):
    movieDict[val]=newTexts[i]
Example #47
def train_lda(path):
    # from gensim.test.utils import common_texts
    # from gensim.corpora.dictionary import Dictionary
    # from gensim.models.ldamodel import LdaModel
    # from gensim.test.utils import datapath

    # common_dictionary = Dictionary(common_texts)
    # common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

    # print(common_dictionary.get(80))

    # lda = LdaModel(common_corpus, num_topics=5)
    # temp_file = datapath(path)
    # lda.save(temp_file)
    # lda = LdaModel.load(temp_file)

    documents = [
        "Amazon sells many things ", "Apple is releasing a new product ",
        "Microsoft announces Nokia acquisition ",
        'Julie loves me more than Linda loves me ',
        'Jane likes me more than Julie loves me'
    ]

    documents = [rm_special_chars(s) for s in documents]
    stoplist = [
        'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there',
        'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they',
        'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into',
        'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who',
        'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below',
        'are', 'we', 'these', 'your', 'his', 'through', "don't", 'nor', 'me',
        'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our',
        'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she',
        'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and',
        'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then',
        'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not',
        'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too',
        'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom',
        'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it',
        'how', 'further', 'was', 'here', 'than'
    ]

    texts = [[
        word for word in document.lower().split() if word not in stoplist
    ] for document in documents]
    # print(texts)
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # print(dictionary.get(0))

    num_topics = 3
    lda = LdaModel(corpus=corpus,
                   id2word=dictionary,
                   num_topics=num_topics,
                   update_every=1,
                   random_state=100,
                   chunksize=100,
                   passes=20,
                   alpha='auto')

    temp_file = datapath(path + 'lda_model')
    lda.save(temp_file)
    lda = LdaModel.load(temp_file)

    dictionary.save(path + 'dict')

    return lda, dictionary
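A hypothetical call to the trainer above; the path argument and the sample sentence are assumptions.

# Sketch: train on the toy documents and infer topics for a new sentence.
lda, dictionary = train_lda('/tmp/lda_demo_')   # path prefix is an assumption
bow = dictionary.doc2bow('apple announces a new product'.lower().split())
print(lda.get_document_topics(bow))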
Example #48
File: lda.py  Project: tc-qaq/NLP_lda-w2v
        word = term[1].strip().split()
        data_list.append(word)

dic = corpora.Dictionary(data_list)                # construct the dictionary
corpus = [dic.doc2bow(text) for text in data_list] # sparse bag-of-words vector per text
tfidf = models.TfidfModel(corpus)                  # fit the TF-IDF model

corpus_tfidf = tfidf[corpus]                       # TF-IDF sparse vectors for each text

# 500 topics failed to produce a model, so smaller values are used instead;
# in practice the maximum that works here is about 300.
topic_nums = [10, 50, 80, 100, 150]
corpus_ldas = []
for t_num in topic_nums:
    lda = LdaModel(corpus_tfidf, id2word=dic, num_topics=t_num)  # train LDA on the TF-IDF corpus
    lda.save('weibo_lda' + str(t_num) + '.model')
    corpus_ldas.append(lda[corpus_tfidf])

print("LDA has finished!")

# Write each document's topic distribution as "topic,weight/" pairs, one line per id
for corpus_lda in corpus_ldas:
    num = 0
    for doc in corpus_lda:
        wstr = ""
        for i in range(len(doc)):
            item = doc[i]
            wstr += str(item[0]) + "," + str(item[1])[0:7] + "/"
        fo.write(id_list[num] + "\t" + wstr[0:-1] + "\n")
        num += 1
fr.close()
fo.close()