Пример #1
0
def train_model(texts, **kwargs):
  """Build (or load) a corpus, fit an LDA model on it and optionally report.

  Keyword options (all read from ``kwargs``):
    filter_stopwords (default True): forwarded to ``process_texts``.
    normalizer (default 'porter'): forwarded to ``process_texts``.
    tfidf (default True): wrap the corpus in a ``TfidfModel`` before fitting.
    num_topics (default 20): number of topics given to ``LdaModel`` and
      to ``show_topics`` in the report.
    min_freq (default 2): forwarded to ``process_texts``.
    use_pickle (default True): load ``corpus`` and ``id2word`` from the
      ``pickles/`` directory; ``texts`` is not used in that case.
    update_pickle (default True): after processing ``texts``, write the
      results back to ``pickles/``. Only consulted when ``use_pickle`` is
      false.
    report (default True): print the model bound and the topics.
    distributed (default False): forwarded to ``LdaModel``.

  Returns:
    The tuple ``(lda, corpus, id2word)``. ``corpus`` is the tf-idf
    transformed corpus when the ``tfidf`` option is enabled.
  """
  # parse args
  filter_stopwords = kwargs.get('filter_stopwords', True)
  normalizer = kwargs.get('normalizer', 'porter')
  tfidf = kwargs.get('tfidf', True)
  num_topics = kwargs.get('num_topics', 20)
  min_freq = kwargs.get('min_freq', 2)
  use_pickle = kwargs.get('use_pickle', True)
  update_pickle = kwargs.get('update_pickle', True)
  report = kwargs.get('report', True)
  distributed = kwargs.get('distributed', False)

  # Every print below takes one parenthesised argument so the output is the
  # same whether print is a statement (Python 2) or a function (Python 3).

  # build corpus or read it in from pickle
  if use_pickle:
    print("INFO: loading pickled corpus and word hash")
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project wrote itself.
    with open("pickles/corpus.p", "rb") as corpus_file:
      corpus = pickle.load(corpus_file)
    with open("pickles/id2word.p", "rb") as id2word_file:
      id2word = pickle.load(id2word_file)
  else:
    print("INFO: processing text and building corpus...")
    corpus, id2word = process_texts(
      texts = texts,
      filter_stopwords = filter_stopwords,
      normalizer = normalizer,
      min_freq = min_freq
    )

    if update_pickle:
      # pickle files; the with-blocks make sure the data is flushed and the
      # handles are closed before the function moves on
      print("INFO: updating pickled corpus and word hash")
      with open("pickles/corpus.p", "wb") as corpus_file:
        pickle.dump(corpus, corpus_file)
      with open("pickles/id2word.p", "wb") as id2word_file:
        pickle.dump(id2word, id2word_file)

  # optional tfidf transformation
  if tfidf:
    print("INFO: applying tfidf transformation...")
    # separate name so the boolean option is not overwritten by the model
    tfidf_model = TfidfModel(corpus)
    corpus = tfidf_model[corpus]

  # fit model
  print("INFO: fitting model...")
  lda = LdaModel(
    corpus = corpus,
    id2word = id2word,
    num_topics = num_topics,
    distributed = distributed
  )

  # report
  if report:
    # NOTE(review): the value labelled "perplexity" is whatever lda.bound()
    # returns for the corpus -- confirm that is the quantity wanted here.
    perplexity = lda.bound(corpus)
    print("RESULTS:")
    print("\nperplexity:  %s \n" % (perplexity,))
    topics = lda.show_topics(num_topics)
    for i, t in enumerate(topics):
      print("topic %d:" % i)
      print(t)

  return lda, corpus, id2word
Пример #2
0
def train_model(texts, **kwargs):
    """Build (or load) a corpus, fit an LDA model on it and optionally report.

    Keyword options (all read from ``kwargs``):
        filter_stopwords (default True): forwarded to ``process_texts``.
        normalizer (default 'porter'): forwarded to ``process_texts``.
        tfidf (default True): wrap the corpus in a ``TfidfModel`` before
            fitting.
        num_topics (default 20): number of topics given to ``LdaModel`` and
            to ``show_topics`` in the report.
        min_freq (default 2): forwarded to ``process_texts``.
        use_pickle (default True): load ``corpus`` and ``id2word`` from the
            ``pickles/`` directory; ``texts`` is not used in that case.
        update_pickle (default True): after processing ``texts``, write the
            results back to ``pickles/``. Only consulted when ``use_pickle``
            is false.
        report (default True): print the model bound and the topics.
        distributed (default False): forwarded to ``LdaModel``.

    Returns:
        The tuple ``(lda, corpus, id2word)``. ``corpus`` is the tf-idf
        transformed corpus when the ``tfidf`` option is enabled.
    """
    # parse args
    filter_stopwords = kwargs.get('filter_stopwords', True)
    normalizer = kwargs.get('normalizer', 'porter')
    tfidf = kwargs.get('tfidf', True)
    num_topics = kwargs.get('num_topics', 20)
    min_freq = kwargs.get('min_freq', 2)
    use_pickle = kwargs.get('use_pickle', True)
    update_pickle = kwargs.get('update_pickle', True)
    report = kwargs.get('report', True)
    distributed = kwargs.get('distributed', False)

    # Every print below takes one parenthesised argument so the output is the
    # same whether print is a statement (Python 2) or a function (Python 3).

    # build corpus or read it in from pickle
    if use_pickle:
        print("INFO: loading pickled corpus and word hash")
        # NOTE: unpickling can execute arbitrary code; only load files that
        # this project wrote itself.
        with open("pickles/corpus.p", "rb") as corpus_file:
            corpus = pickle.load(corpus_file)
        with open("pickles/id2word.p", "rb") as id2word_file:
            id2word = pickle.load(id2word_file)
    else:
        print("INFO: processing text and building corpus...")
        corpus, id2word = process_texts(texts=texts,
                                        filter_stopwords=filter_stopwords,
                                        normalizer=normalizer,
                                        min_freq=min_freq)

        if update_pickle:
            # pickle files; the with-blocks make sure the data is flushed and
            # the handles are closed before the function moves on
            print("INFO: updating pickled corpus and word hash")
            with open("pickles/corpus.p", "wb") as corpus_file:
                pickle.dump(corpus, corpus_file)
            with open("pickles/id2word.p", "wb") as id2word_file:
                pickle.dump(id2word, id2word_file)

    # optional tfidf transformation
    if tfidf:
        print("INFO: applying tfidf transformation...")
        # separate name so the boolean option is not overwritten by the model
        tfidf_model = TfidfModel(corpus)
        corpus = tfidf_model[corpus]

    # fit model
    print("INFO: fitting model...")
    lda = LdaModel(corpus=corpus,
                   id2word=id2word,
                   num_topics=num_topics,
                   distributed=distributed)

    # report
    if report:
        # NOTE(review): the value labelled "perplexity" is whatever
        # lda.bound() returns for the corpus -- confirm that is the quantity
        # wanted here.
        perplexity = lda.bound(corpus)
        print("RESULTS:")
        print("\nperplexity:  %s \n" % (perplexity,))
        topics = lda.show_topics(num_topics)
        for i, t in enumerate(topics):
            print("topic %d:" % i)
            print(t)

    return lda, corpus, id2word
Пример #3
0
def ldamodel(doc_clean, n_topics, n_words, description, tfidfmodel=False, unseen_docs=None):
    """Fit an LDA model on ``doc_clean``, print diagnostics and save artifacts.

    Args:
        doc_clean: iterable of documents; each one is passed through
            ``min_char`` and split on whitespace to obtain its tokens.
        n_topics: number of topics to fit, print and hand to ``print_result``.
        n_words: number of terms printed for each topic.
        description: prefix of the three output files; also forwarded to
            ``print_result``.
        tfidfmodel: when true, the bag-of-words corpus is transformed with a
            ``TfidfModel`` (``smartirs='ntc'``) before the model is fitted.
        unseen_docs: optional sequence of extra documents. For each one the
            highest scoring topic is printed, followed by the log perplexity
            of the whole set.

    Side effects:
        Prints to stdout and writes ``<description>.pkl``,
        ``<description>dictionary.gensim`` and
        ``<description>_ldamodel.gensim``.
    """
    doc_clean = [min_char(doc).split() for doc in doc_clean]

    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]
    # The return value is not used here; the helper is called only for
    # whatever output it produces itself.
    compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=doc_clean, start=2, limit=40, step=6)
    if tfidfmodel:
        tfidf = TfidfModel(corpus, id2word=dictionary, smartirs='ntc')
        corpus = tfidf[corpus]

    # Fit with the requested number of topics so that the topic listing below
    # and print_result() agree with the model (this used to be a fixed 16).
    lda_model = LdaModel(corpus, num_topics=n_topics, id2word=dictionary, random_state=1, passes=50,
                         per_word_topics=True)
    print("#Tópicos LDA")
    for topic_id in range(n_topics):
        weighted_terms = lda_model.show_topic(topic_id, n_words)
        print("Topic #" + str(topic_id) + ": ",
              ", ".join(term + '*' + str(weight) for term, weight in weighted_terms))
    print('Bound: ', lda_model.bound(corpus))
    # Compute Perplexity
    print('Perplexity: ', lda_model.log_perplexity(corpus))
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    if unseen_docs:
        # NOTE(review): each entry goes to doc2bow() unchanged, unlike
        # doc_clean which is tokenised above -- presumably the entries are
        # already token lists; confirm against the caller.
        corpus_new = [dictionary.doc2bow(doc) for doc in unseen_docs]
        for position, unseen_bow in enumerate(corpus_new):
            best_index = None
            score = 0
            inference_doc = lda_model[unseen_bow]
            print(unseen_docs[position])
            # Element 0 of the inference result holds the (topic, score) pairs.
            for index, tmp_score in inference_doc[0]:
                if tmp_score > score:
                    score = tmp_score
                    best_index = index
            # Render the topic once, for the winner only.
            topic = None if best_index is None else lda_model.print_topic(best_index, 5)
            print("Score: {}\t Topic: {}".format(score, topic))
        print("Log perplexity for new corpus is", lda_model.log_perplexity(corpus_new))

    print_result(lda_model, doc_clean, corpus, n_topics, description)
    # Context manager so the pickle is flushed and the handle closed.
    with open(description + '.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    dictionary.save(description + 'dictionary.gensim')
    lda_model.save(description + '_ldamodel.gensim')
# Load the tokenised sentences, keeping only those with at least two tokens.
sentences = [tokens for tokens in word2vec.LineSentence(data_file) if len(tokens) >= 2]

# Token dictionary and bag-of-words corpus built from the kept sentences.
dic = Dictionary(sentences)
corpus = list(map(dic.doc2bow, sentences))

# CSV header; one data row per topic count follows.
print('topic_num,avg,bound,perplexity,coherence')

for i in range(1, max_topic_num + 1):
    # Fit one model per topic count with a fixed seed.
    lda = LdaModel(corpus=corpus, id2word=dic, num_topics=i, alpha=alpha, random_state=1)

    # Mean length of the per-document result of lda[...].
    avg_topics = mean([len(lda[bow]) for bow in corpus])

    bound = lda.bound(corpus)

    # 2 ** (-x), where x is the value returned by log_perplexity().
    perplexity = np.exp2(-lda.log_perplexity(corpus))

    coherence = CoherenceModel(
        model=lda,
        corpus=corpus,
        coherence='u_mass',
        processes=1,
    ).get_coherence()

    print(f"{i},{avg_topics},{bound},{perplexity},{coherence}")