Example No. 1
    # Fragment: the constructor of an HDP wrapper class (shown in full in Example No. 4)
    def __init__(self, corpus, dct, df):
        self.dct = dct
        self.corpus = corpus
        self.model = HdpModel(corpus, dct)
        self.df = df
        self.lda = None
        self.topic_dist = None
Example No. 2
import os

from gensim import corpora
from gensim.models import HdpModel


def doHDP(parent, directory):
  # requires that the dictionary and corpus files already exist;
  # showHDPTopics is a helper defined elsewhere in the original script
  files = os.listdir(parent + directory)
  dictionary = corpora.Dictionary.load("../processed/" + parent[9:-1] + "_lda/" + directory + ".dict")
  review_corpus = corpora.MmCorpus("../processed/" + parent[9:-1] + "_lda/" + directory + ".mm")

  numTopics = [3, 5, 10]
  for numTopic in numTopics:
    print "Running HDP for", directory, "for", numTopic, "topics\n"
    hdp = HdpModel(corpus=review_corpus, id2word=dictionary, T=numTopic, K=10, gamma=0.8, alpha=1)
    hdp.save("../processed/" + parent[9:-1] + "_hdp/" + directory + "_" + str(numTopic) + "_topicModel.hdp")
    showHDPTopics(hdp)  
Example No. 3
import warnings

import gensim
import numpy
from gensim.corpora import Dictionary
from gensim.models import HdpModel

# `Chrono` is a small timing helper from the original project (not a gensim class)


def hldaperplexity(train, test):
    corpus = gensim.matutils.Dense2Corpus(train.astype(int),
                                          documents_columns=False)
    corpusTest = gensim.matutils.Dense2Corpus(test.astype(int),
                                              documents_columns=False)
    dictionary = Dictionary.from_corpus(corpus)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        c = Chrono().start()
        hlda = HdpModel(corpus, dictionary)
        c.end()

    corpus_words = sum(cnt for document in corpusTest for _, cnt in document)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ll = hlda.evaluate_test_corpus(corpusTest)
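    # per-word bound = total log-likelihood / token count; perplexity is estimated as 2 ** (-per-word bound)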
    perwordbound = ll / corpus_words
    print(
        "LDA %.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words"
        % (perwordbound, numpy.exp2(-perwordbound), len(corpusTest),
           corpus_words))
    return numpy.exp2(-perwordbound), c.elapsed()
Example No. 4
import numpy as np
from gensim.models import HdpModel
from scipy.stats import entropy


class HDP(object):
    def __init__(self, corpus, dct, df):
        self.dct = dct
        self.corpus = corpus
        self.model = HdpModel(corpus, dct)
        self.df = df
        self.lda = None
        self.topic_dist = None

    def build_lda(self):
        self.lda = self.model.suggested_lda_model()

    def build_topic_dist(self):
        self.topic_dist = []
        for lst in self.lda[self.corpus]:
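            # length 150 matches gensim's default HDP topic truncation (T=150)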
            distr = np.array([0.0] * 150)
            for tup in lst:
                distr[tup[0]] = tup[1]
            self.topic_dist.append(distr)

    def jensen_shannon(self, query, matrix):
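        # Jensen-Shannon distance: sqrt(0.5 * (KL(p || m) + KL(q || m))) with m = (p + q) / 2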
        p = query
        q = matrix
        m = 0.5 * (p + q)
        E1 = entropy(p, m)
        E2 = entropy(q, m)
        E = E1 + E2
        return np.sqrt(0.5 * E)

    def similarity(self, query, matrix, k=10):
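        # return the indices of the k documents closest to the query (smallest JS distance)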
        sims = []
        for index, item in enumerate(matrix):
            sims.append(self.jensen_shannon(query, matrix[index]))
        sims = np.array(sims)
        return sims.argsort()[:k]

    def similarity_query(self, index, k=10, n=2):
        bow = self.dct.doc2bow(self.df.iloc[index, n])
        doc_distribution = np.array([0.0] * 150)
        for tup in self.lda.get_document_topics(bow=bow):
            doc_distribution[tup[0]] = tup[1]
        return self.similarity(doc_distribution, self.topic_dist, k)
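
# Minimal usage sketch (not from the original source): a toy corpus and a
# DataFrame whose column index 2 holds the token lists, matching the default
# n=2 used by similarity_query above.
if __name__ == "__main__":
    import pandas as pd
    from gensim.corpora import Dictionary

    docs = [["human", "interface", "computer"], ["graph", "trees", "minors"]]
    dct = Dictionary(docs)
    corpus = [dct.doc2bow(doc) for doc in docs]
    df = pd.DataFrame({"id": [0, 1], "title": ["doc a", "doc b"], "tokens": docs})

    hdp = HDP(corpus, dct, df)
    hdp.build_lda()
    hdp.build_topic_dist()
    print(hdp.similarity_query(0, k=2))  # indices of the two most similar documents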
Example No. 5
#
# Induce a hierarchy from the probabilistic generative model fit to the corpus
#
import numpy as np
from gensim.corpora import Dictionary
from gensim.models.hdpmodel import HdpModel
from collections import OrderedDict
from scipy.stats import dirichlet
import edmonds
import pickle

out = '/home/mjg/data/descriptions'

# Load model
dictionary = Dictionary.load_from_text(out + '_wordids.txt.bz2')
hda = HdpModel.load(out + ".hdm")
topics = hda.show_topics(-1, -1, formatted=False)    

def mkarray(l):
  return np.array(list(l))


# Calculate corpus probabilities
alpha = 0.1 * len(dictionary)
phi = []
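# phi[k]: ordered dict of topic k's term weights, as returned by show_topics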
for _, terms in topics:
  phic = dict(terms)
  phi.append(OrderedDict(sorted(phic.items())))
Example No. 6
# Fragment: `f`, `f2`, `count1`, `corpora_documents` and `label_level` are set up
# earlier in the original script (see the full classify() function in Example No. 9).
kfolds = 10
kf = cross_validation.KFold(count1, n_folds=kfolds)
for li in f:
    li=li.split()
    corpora_documents.append(li)
for la in f2:
    la=la.split()
    label_level.append(la)
corpora_documents=array(corpora_documents)
label_level=array(label_level)

# build the dictionary and the vectorized corpus
dictionary = corpora.Dictionary(corpora_documents)
#dictionary.save('dictionary.dict')
corpus = [dictionary.doc2bow(text) for text in corpora_documents]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

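# NOTE: HDP, like LDA, is usually trained on raw bag-of-words counts; gensim
# accepts the tf-idf weights used here, but it is an unconventional choice.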
hdp = HdpModel(corpus_tfidf, id2word=dictionary)
corpus_hdp = hdp[corpus_tfidf]
index = similarities.MatrixSimilarity(corpus_hdp)

print(hdp.print_topics(num_topics=20, num_words=10))
Example No. 7
# Fragment: assumes `custom_preprocess`, `journals_dictionary`, `journals_corpus`,
# `journals`, the `lsi` model, `test_doc_bow` and the CoherenceModel import are
# defined earlier in the original script.
print(lsi[test_doc_bow])

test_doc2 = 'Material and physics'
test_doc2 = custom_preprocess(test_doc2)
test_doc_bow2 = journals_dictionary.doc2bow(test_doc2)
print(test_doc_bow2)

print(lsi[test_doc_bow2])

lsi_cm = CoherenceModel(model=lsi, corpus=journals_corpus, dictionary=journals_dictionary,
                        texts=journals['Full title'], coherence='c_v')
LSI_cm = lsi_cm.get_coherence()
print(LSI_cm)

from gensim.models.hdpmodel import HdpModel

hdp = HdpModel(corpus=journals_corpus, id2word=journals_dictionary)

hdp_topics = hdp.print_topics()
for topic in hdp_topics:
  print(topic)

test_doc = 'Journal of medicines and herbs'
test_doc = custom_preprocess(test_doc)
test_doc_bow = journals_dictionary.doc2bow(test_doc)
print(test_doc_bow)

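# applying the trained model to a bag-of-words vector yields the document's
# (topic id, weight) pairs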
print(hdp[test_doc_bow])

test_doc2 = 'Material and physics'
test_doc2 = custom_preprocess(test_doc2)
test_doc_bow2 = journals_dictionary.doc2bow(test_doc2)
Example No. 8
def main():
    parser = ArgumentParser(
        description='Wrapper script for churning wiki or elasticsearch datasets '
        'through gensim to produce topic models. See the gensim documentation '
        'for more information.')
    parser.add_argument('-ds',
                        '--dataset',
                        default='wiki',
                        help='What kind of dataset to use. (wiki,es,file)')
    parser.add_argument('-d',
                        '--dump-file',
                        help='Wiki: bz2 dump file with wiki in it')
    parser.add_argument('-l',
                        '--limit',
                        help='Wiki: How many documents to extract from wiki')
    parser.add_argument('--model-id',
                        default='model',
                        help='Filename for created model.')
    parser.add_argument(
        '--model-type',
        default='lsi',
        help='Model type (lsi, lda, word2vec, hdp, vocabulary).')
    parser.add_argument('--n-topics',
                        default=10,
                        help='Number of topics to model.')
    parser.add_argument('--n-passes',
                        default=1,
                        help='Number of passes for LDA  model.')
    parser.add_argument('--w2v-size',
                        default=100,
                        help='size of Word2Vec context.')
    parser.add_argument('--w2v-window', default=5, help='window for Word2Vec.')
    parser.add_argument('-q',
                        '--query',
                        default=None,
                        help='Elasticsearch: Query to use to fetch documents')
    parser.add_argument('--index', help='Elasticsearch: index to read from.')
    parser.add_argument('--doc_type',
                        default='doc',
                        help='Elasticsearch: data type in index.')
    parser.add_argument(
        '--data-dir',
        help='Directory to save the generated models and vocabularies into.')
    parser.add_argument(
        '--vocab',
        help=
        'Prebuilt Vocabulary file. Use this to avoid having to generate one.')

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ['lsi', 'lda', 'word2vec', 'hdp', 'vocabulary']:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ['es', 'wiki', 'file']:
        logging.error("Invalid dataset  type %s" % data_type)
        parser.print_usage()
        exit(-1)
    if not dump_fn and data_type in ['wiki']:
        logging.error('--dump-file required for wiki dataset')
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == 'es' and index is None:
        logging.error(
            "Please specify the Elasticsearch index to read from using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)
    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = '%s_%s_%d' % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = '%s/%s' % (data_dir, model_fn)
    if model_type == 'word2vec':
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = '%s_w_%s_s_%s' % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == 'es':
        logging.info("Using data type %s with index %s, doc_type %s query %s" %
                     (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(read_index=index,
                                       read_doc_type=doc_type,
                                       query=query,
                                       normalize_func=normalize_es)
    elif data_type == 'wiki':
        logging.info("Using data type %s with dump_file %s and limit %s" %
                     (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn,
                                   num_articles=limit,
                                   normalize_func=normalize_wiki)
    elif data_type == 'file':
        logging.info("Using data type %s with dump_file %s and limit %s" %
                     (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn,
                              num_articles=limit,
                              normalize_func=normalize_file)
    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words('norwegian'))
    if not vocab_file or model_type == 'vocabulary':
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + '.vocab')
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == 'vocabulary':
        return
    tfidf = TfidfModel(dictionary=vocab)
    if model_type == 'lsi':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus],
                         num_topics=n_topics,
                         id2word=vocab)
    elif model_type == 'lda':
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus],
                         num_topics=n_topics,
                         passes=n_passes,
                         id2word=vocab)

    elif model_type == 'word2vec':
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == 'hdp':
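        # HDP is nonparametric: the number of topics is inferred from the data,
        # so --n-topics is not passed to HdpModel here.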
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
Example No. 9
def classify(k4=None):
    corpora_documents = []
    label_level=[]
    filename="label_level01.txt"
    f=codecs.open("finalresult1.txt",'r',encoding="utf-8").readlines()
    f2=codecs.open(filename,'r',encoding="utf-8").readlines()
    y2,label_dict,n_x=y_label(filename)
    count1 = 12901  # number of documents (lines) in finalresult1.txt
    kfolds=10
    kf = cross_validation.KFold(count1, n_folds=kfolds)
    for li in f:
        li=li.split()
        corpora_documents.append(li)
    for la in f2:
        la=la.split()
        label_level.append(la)
    corpora_documents=array(corpora_documents)
    label_level=array(label_level)
    sum_count = []
    # per-fold metrics: k = hamming loss, k1 = jaccard, k2 = zero-one loss, k3 = f1
    k = []; k1 = []
    k2 = []; k3 = []
    for train_index, test_index in kf:
        #print(train_index, test_index)
        X_train, X_test = corpora_documents[train_index], corpora_documents[test_index]
        y_train, y_test = label_level[train_index], label_level[test_index]
        y2_train, y2_test = y2[train_index],y2[test_index]

        # build the dictionary and the vectorized corpus
        dictionary = corpora.Dictionary(X_train)
        #dictionary.save('dictionary.dict')
        corpus = [dictionary.doc2bow(text) for text in X_train]
        tfidf=models.TfidfModel(corpus)
        corpus_tfidf=tfidf[corpus]

        # Lsi=models.LsiModel(corpus_tfidf,id2word=dictionary,num_topics=500)
        # corpus_lsi=Lsi[corpus_tfidf]
        # index=similarities.MatrixSimilarity(corpus_lsi)

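        # same pipeline as the commented-out LSI version above, but with an HDP
        # topic model (the `Lsi` variable name is kept from that version)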
        Lsi = HdpModel(corpus_tfidf, id2word=dictionary)
        corpus_lsi = Lsi[corpus_tfidf]
        index = similarities.MatrixSimilarity(corpus_lsi)

        count=0
        y_prediction=zeros((len(test_index),n_x))
        for i,li in enumerate(X_test):
            #li=li.split()
            test_corpus_1 = dictionary.doc2bow(li)
            test_corpus_tfidf=tfidf[test_corpus_1]
            test_corpus_Lsi=Lsi[test_corpus_tfidf]
            sims=index[test_corpus_Lsi]
            sort_sims=sorted(enumerate(sims),key=lambda x:-x[1])
            print(sort_sims[:k4])
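            # k-nearest-neighbour style voting: count how often each label occurs
            # among the k4 most similar training documents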
            predictions={}
            for m,n in sort_sims[:k4]:
                for key in y_train[m]:
                    if key not in predictions.keys():
                        predictions[key]=1
                    else:
                        predictions[key]+=1
            true_label=y_test[i]
            prediction=[]
            for key in predictions.keys():
                if predictions[key]>=k4/2:
                    prediction.append(key)
            true_label.sort()
            prediction.sort()
            # print("true label:",true_label)
            if len(prediction)==0:
                dict_sorted=sorted(predictions.items(),key=lambda x:x[1],reverse=True)
                prediction.append(dict_sorted[0][0])
                if true_label==prediction:
                    count+=1
                    #print(1)
            elif true_label==prediction:
                count+=1
                #print(1)
            else:
                false=0
                #print(0)
            # print("predict",prediction)
            # print(predictions)
            for label in prediction:
                # print('label_dict[label]:',label_dict[label])
                # print(i,len(y_prediction[i]))
                y_prediction[i,label_dict[label]]=1
            #print("#####################################")
        #print("count:",count)
        sum_count.append(count)
        hammingloss= sklearn.metrics.hamming_loss(y2_test, y_prediction)
        jaccard= sklearn.metrics.jaccard_similarity_score(y2_test, y_prediction)
        f1score= sklearn.metrics.f1_score(y2_test, y_prediction,average='micro')
        zerooneloss= sklearn.metrics.zero_one_loss(y2_test, y_prediction)
        print("hammingloss,jaccard,f1score,zerooneloss:", hammingloss, jaccard, f1score, zerooneloss)
        k.append(hammingloss)
        k1.append(jaccard)
        k2.append(zerooneloss)
        k3.append(f1score)


    print("hamming_loss mean:", array(k).mean())
    print("hamming_loss var:", array(k).var())

    print("jaccard mean:",array(k1).mean())
    print("jaccard var:", array(k1).var())

    print("f1_score mean:", array(k3).mean())
    print("f1_score var:", array(k3).var())

    print("zero_one_loss mean:", array(k2).mean())
    print("zero_one_loss var:", array(k2).var())
Example No. 10
# Fragment: assumes `corpus` is a gensim corpus exposing a .dictionary (e.g. a
# WikiCorpus/TextCorpus) and that `out` is an output path prefix defined earlier.
# remove common words
stoplist = set(
    'a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your'
    .split(','))
stop_ids = [
    corpus.dictionary.token2id[stopword] for stopword in stoplist
    if stopword in corpus.dictionary.token2id
]
corpus.dictionary.filter_tokens(stop_ids)

# only keep the most frequent words
corpus.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)
# save stuff
MmCorpus.serialize(out + '_bow.mm', corpus, progress_cnt=10000)
corpus.dictionary.save_as_text(out + '_wordids.txt.bz2')
# save memory
dictionary = Dictionary.load_from_text(out + '_wordids.txt.bz2')
del corpus

# initialize corpus reader and word->id mapping
mm = MmCorpus(out + '_bow.mm')

# build tfidf
tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
tfidf.save(out + '.tfidf.model')
MmCorpus.serialize(out + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

# Run hierarchical Dirichlet process over corpus
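# HDP infers the number of topics itself; note that it is trained here on the
# raw bag-of-words corpus (mm), not on the tf-idf corpus saved above.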
hda = HdpModel(corpus=mm, id2word=dictionary)
hda.save(out + ".hdm")
Example No. 11
# Load the data
print("Loading data...")
from loadjson import *
from nltk.corpus import stopwords

# Prepare the data
stoplist = stopwords.words('french') + stopwords.words('english') + list('\'"`():,;.!?')
docs = [[w for s in m['sents'] for w in s
         if w not in stoplist and w.isalnum() and len(w) >= 2]
        for m in mails.values()]
id2word = Dictionary(docs)
docs = [id2word.doc2bow(doc) for doc in docs] # Indexe les mots

print("Training HDP model...")
hdp = HdpModel(docs, id2word=id2word, max_time=10*60)
hdp.save('hdp.gensim')
id2word = hdp.id2word


print("Exporting HDP model...")
# Build the table of word-topic associations
out_topicid = []
out_word = []
out_p = []
for topicid, words in hdp.show_topics(topics=-1, topn=500, formatted=False):
    for word, p in words:
        if p < 5e-3:
            break
        out_topicid.append(int(topicid))
        out_word.append(word)
Example No. 12
from gensim.models import HdpModel


def fit_model(corpus, id2word, num_topics=20):
    # train the model
    hdp = HdpModel(corpus=corpus, id2word=id2word)
    hdp.print_topics(num_topics)
    return hdp
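
# Minimal usage sketch (not from the original source) with a toy corpus:
if __name__ == "__main__":
    from gensim.corpora import Dictionary

    texts = [["human", "interface", "computer"], ["graph", "trees", "minors"]]
    id2word = Dictionary(texts)
    corpus = [id2word.doc2bow(t) for t in texts]
    hdp = fit_model(corpus, id2word, num_topics=5)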