if result is not None: vec = np.fromstring(result,dtype=np.float32) embeddings_dic[i] = vec else: count += 1 embeddings_dic[i] = getRandom_vec() # print(count) return embeddings_dic corpus,dic,labels = load_data.load_corpus() # TF-IDF tfidf = gensim.models.TfidfModel(corpus=corpus,dictionary=dic) # corpus_tfidf = [tfidf[doc] for doc in corpus] corpus_tfidf = tfidf[corpus] corpus_top = get_tfidf_top(corpus_tfidf,top_number=10) dic_embeddings = get_dic_embeddings(dic) doc_embeddings = model_util.get_doc_embeddings(corpus_top,dic_embeddings) train_data,train_label,test_data,test_label = load_data.get_train_test(doc_embeddings,labels) print("train size: "+str(train_data.shape[0])) print("test size: "+str(test_data.shape[0])) clf = LinearSVC() clf.fit(train_data,train_label) score = clf.score(test_data,test_label) print(score)
return matrix corpus,dic,labels = load_data.load_corpus() """ # TF-IDF tfidf = gensim.models.TfidfModel(corpus=corpus,dictionary=dic) corpus_tfidf = tfidf[corpus] # LDA lda_model = gensim.models.LdaModel(corpus_tfidf,num_topics=4,id2word=dic) """ lda_model = gensim.models.wrappers.LdaMallet('F:/mallet-2.0.8/bin/mallet.bat',corpus=corpus,num_topics=4,id2word=dic) doc_topics = [] for doc in corpus: doc_topics.append(lda_model.get_document_topics(doc,minimum_probability=0)) doc_topics_matrix = get_corpus_topic_distribution(doc_topics,num_topic=4) train_data,train_label,test_data,test_label = load_data.get_train_test(doc_topics_matrix,labels) print("train size: "+str(train_data.shape[0])) print("test size: "+str(test_data.shape[0])) # SVM classification # clf = SVC() clf = LinearSVC() clf.fit(train_data,train_label) score = clf.score(test_data,test_label) print(score)
# -*- coding: utf-8 -*-
"""Classify documents with linear discriminant analysis on TF-IDF features.

The script loads the corpus through the project's ``load_data`` helper,
weights it with a gensim TF-IDF model, converts the result to a matrix,
splits it into train and test parts and prints the classifier's test score.
"""
__author__ = 'PC-LiNing'

import gensim
import numpy as np

# ``sklearn.lda`` was deprecated and later removed from scikit-learn; the
# estimator now lives in ``sklearn.discriminant_analysis``. Fall back to the
# legacy module so old installations keep working.
try:
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
except ImportError:
    from sklearn.lda import LDA

from lda import load_data

# Corpus, dictionary and labels all come from the project loader.
corpus, dic, labels = load_data.load_corpus()

# TF-IDF weights, materialised per document before the matrix conversion.
tfidf = gensim.models.TfidfModel(corpus=corpus, dictionary=dic)
corpus_tfidf = [tfidf[doc] for doc in corpus]
matrix = load_data.convert_to_matrix(corpus_tfidf)

# Train/test split is delegated to the project helper.
train_data, train_label, test_data, test_label = load_data.get_train_test(
    matrix, labels
)

# Supervised LDA classifier (not the topic model): fit on the training part,
# then report the score on the held-out part.
lda = LDA(solver='svd', store_covariance=True)
lda.fit(train_data, train_label)
score = lda.score(test_data, test_label)
print(score)