def three():
    """Run ``LDA.LDA`` over 10 documents and return the result as JSON text.

    A Flask application object is created and configured for pretty-printed
    JSON first; the LDA result is then serialized with ``ensure_ascii=False``
    so non-ASCII characters are emitted verbatim instead of ``\\u`` escapes.
    """
    app = Flask(__name__)
    app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True

    # Process 10 documents.
    lda_output = LDA.LDA(10)
    return json.dumps(lda_output, ensure_ascii=False)
def lda_terms_analysis(lda_model_filename, word2vec_model_filename):
    """Split each LDA topic's terms into clusters and keep the tightest one.

    For every topic the terms (last element of the topic record) are embedded
    with the word2vec model and clustered; each cluster is scored with
    ``utilities.cohesion`` against its centre and the clusters are ranked by
    ascending score.

    Returns:
        ``(new_topics, useless)``. ``new_topics`` holds, per topic, the
        original leading fields followed by the first-ranked cluster's score
        and words. ``useless`` holds one record per remaining cluster: the
        leading fields followed by cluster label, score and words.
    """
    word2vec = models.Word2Vec.load(word2vec_model_filename)
    new_topics = []
    useless = []

    for topic in LDA.get_topics_terms(lda_model_filename):
        head = list(topic[:-1])
        dictionary, matrix = get_words_matrix(topic[-1], word2vec)
        clusters, centers = cluster(matrix, dictionary, 2, 10)

        scored = []
        for label, sub_words in clusters.items():
            _, sub_matrix = get_words_matrix(sub_words, word2vec)
            scored.append((label, utilities.cohesion(sub_matrix, centers[label])))
        # Stable ascending sort on the cohesion score.
        ranked = sorted(scored, key=lambda pair: pair[-1])

        best_label, best_score = ranked[0]
        new_topics.append(head + [best_score, clusters[best_label]])

        for label, score in ranked[1:]:
            useless.append(head + [label, score, clusters[label]])

    return new_topics, useless
def __get_topic(self):
    """
    Return a dictionary of (hashtag: topic) built with Latent Dirichlet
    Allocation.

    An LDA model is fitted on every tweet's text, each tweet is assigned the
    topic predicted from its bag of words, and each hashtag receives the
    topic that ``self.most_common`` picks among its tweets' topics.
    """
    corpus = [(self.get_tweet_text(tweet), tweet["id_str"])
              for tweet in self.tweets]
    lda = LDA.LDA(corpus)

    # Bag-of-words prediction (the tf-idf predictor is the alternative).
    tweet_topic = {
        tweet["id_str"]: lda.predict_with_bag(self.get_tweet_text(tweet))
        for tweet in self.tweets
    }

    topics_per_hashtag = {hashtag["text"]: [] for hashtag in self.hashtags}
    for tweet_id, tweet_hashtags in self.tweet_hashtag_map.items():
        for hashtag in tweet_hashtags:
            topics_per_hashtag[hashtag["text"]].append(tweet_topic[tweet_id])

    # A hashtag's topic is the topic of the majority of its tweets.
    return {
        name: self.most_common(topic_list)
        for name, topic_list in topics_per_hashtag.items()
    }
def main():
    """Train the weighted model on LDA-compressed features and print accuracy.

    The last 10% of the rows read from ``../resource/train.csv`` are held out
    for testing. Categorical attributes are expanded with ``MultiDimension``,
    joined to the numeric attributes, compressed with the LDA coefficients
    learned on the training part, and fed to ``WeightedModel``.

    A leftover debugging dump followed by ``sys.exit(1)`` used to abort the
    run right after the compression step, so the evaluation below was never
    reached; it has been removed.
    """
    filename = '../resource/train.csv'
    itemid, numattr, cateattr, label = readfile(filename)

    totalnum = len(numattr)
    testnum = int(totalnum * 0.1)
    trainnum = totalnum - testnum

    trainnumattr = numattr[0: trainnum]
    traincateattr = cateattr[0: trainnum]
    trainlabel = label[0: trainnum]
    testnumattr = numattr[trainnum:]
    testcateattr = cateattr[trainnum:]
    testlabel = label[trainnum:]

    # The categorical encoding is fitted on the training rows only and then
    # applied to the held-out rows.
    multidim = MultiDimension(traincateattr)
    trainextattr = multidim.gettrainextattr()
    testextattr = multidim.gettestextattr(testcateattr)

    trainattr = append(trainnumattr, trainextattr, axis=1)
    testattr = append(testnumattr, testextattr, axis=1)

    LDAcoe = LDA(trainattr, trainlabel)
    LDAtrainattr = conpress(trainattr, LDAcoe)
    LDAtestattr = conpress(testattr, LDAcoe)

    model = WeightedModel(LDAtrainattr, trainlabel)

    if testnum == 0:
        # Fewer than 10 rows: nothing is held out, so accuracy is undefined.
        print('accuracy: n/a (no held-out samples)')
        return

    right = 0
    for i in range(testnum):
        if model.predict(LDAtestattr[i]) == testlabel[i]:
            right += 1
    accuracy = float(right) / testnum
    # Single-argument form prints the same text under Python 2 and 3.
    print('accuracy: %s' % accuracy)
def testSelectedFeatures1():
    """Compare LR and LDA accuracy on three variants of the dataset.

    The variants are: the full normalized data, the four selected feature
    columns (10, 1, 9, 6) plus the label column, and the full data with
    squared versions of those four features added. Each variant is shuffled
    and scored with 5-fold validation three times; the mean accuracy is
    printed for every (model, variant) pair.

    The LDA score of the first variant used to be computed on the second
    variant's data, so the first two LDA figures were always the same
    experiment; it now uses the first variant.
    """
    print("start testAdditionlSquaredFeatures()")
    LRModel = lr.LogisticRegression(0.001, 500)
    LDAModel = LDA.LDA()

    data1 = genRWNormalized()
    data2 = np.append(data1[:, [10, 1, 9, 6]],
                      np.array([data1[:, -1]]).T,
                      axis=1)
    data3 = addSquareFeature(data1, [10, 1, 9, 6])
    variants = (data1, data2, data3)

    repeats = 3
    lr_totals = [0, 0, 0]
    lda_totals = [0, 0, 0]
    for _ in range(repeats):
        # Shuffle every variant first, then validate, so the random stream is
        # consumed in the same order as before.
        for data in variants:
            np.random.shuffle(data)
        for idx, data in enumerate(variants):
            lr_totals[idx] += LRKFoldValidation(LRModel, data, 5)
            lda_totals[idx] += LDAKFoldValidation(LDAModel, data, 5)

    for lr_total, lda_total in zip(lr_totals, lda_totals):
        print("Accuracy for lr in rw is {}".format(lr_total / repeats))
        print("Accuracy for LDA in rw is {}".format(lda_total / repeats))
def testLDAWithWine():
    """Return the 5-fold LDA validation result on the wine data.

    The header-less file at ``file_path1`` is loaded, its quality column is
    converted to categories in place, the rows are shuffled, and the second
    part returned by ``seperateTestSet`` is used for validation.
    """
    samples = genDataWOHeader(file_path1)
    qualityToCategory(samples)
    np.random.shuffle(samples)

    # The first part is split off and deliberately not used here.
    _held_out, training = seperateTestSet(samples)
    return LDAKFoldValidation(LDA.LDA(), training, 5)
def getResult(dataMat, label, testNum, classNum, PCA_dim=85, classInNum=11):
    """
    Load the test set of every class and print the overall result.

    The PCA dimension (85), the class count (16), the images per class (11)
    and their product (176) used to be hard-coded, and the final total was
    always divided by 16 even though the loop runs over ``classNum`` classes.
    They are now derived from the arguments; the defaults reproduce the old
    behaviour for ``classNum == 16``.

    :param dataMat: training set
    :param label: label matrix of the training set
    :param testNum: number of test samples per class
    :param classNum: number of classes
    :param PCA_dim: number of PCA components kept before LDA
    :param classInNum: number of training images per class
    :return: None; the value is printed
    """
    Count = 0
    disc_set, disc_value = LDA.pca(dataMat, PCA_dim)
    # LDA projection space and the final (projected) training set.
    redVects, Train_LDA = LDA.lda(dataMat, label, PCA_dim, classNum,
                                  classInNum, classNum * classInNum)
    for classnum in range(1, classNum + 1):
        print('第', classnum, '类')
        Count += compare(disc_set, Train_LDA, redVects, label, testNum,
                         classnum, 5)
    print('Final correctCount:', Count / classNum)
def testLDAWithCancer():
    """Return the 5-fold LDA validation result on the cancer data.

    The file at ``file_path2`` is loaded, its class column is converted to
    categories and the data is preprocessed (both in place), the rows are
    shuffled, and the second part returned by ``seperateTestSet`` is used for
    validation.
    """
    samples = genData(file_path2)
    classToCategory(samples)
    preprocessData(samples)
    np.random.shuffle(samples)

    # The first part is split off and deliberately not used here.
    _held_out, training = seperateTestSet(samples)
    return LDAKFoldValidation(LDA.LDA(), training, 5)
def getResult(dataMat, label, PCA_dim, testNum, classNum, classInNum ):
    '''
    Load the test set of every class and print the overall result.

    :param dataMat: training set
    :param label: label matrix of the training set
    :param PCA_dim: number of PCA components kept before LDA
    :param testNum: number of test samples per class
    :param classNum: number of classes
    :param classInNum: number of images per class
    :return: None; the value is printed
    '''
    # Only the first of the three values returned by the PCA step is needed.
    disc_set, _, _ = LDA.pca(dataMat, PCA_dim)

    # LDA projection space and the final (projected) training set.
    redVects, Train_LDA = LDA.lda(dataMat, label, PCA_dim, classNum,
                                  classInNum, classNum * classInNum)

    total = 0
    for class_id in range(1, classNum + 1):
        print('第', class_id, '类')
        total += compare(disc_set, Train_LDA, redVects, label, testNum,
                         class_id, 7)
    print('Final correctCount:', total / classNum)
def small_data_test():
    """Smoke-test ``LDA.lda_function`` on a 3x5 toy matrix.

    Returns "All OK" when the call yields a truthy value and
    "something went wrong" otherwise.
    """
    samples = [[1, 0, 1, 2, 3],
               [2, 1, 1, 2, 3],
               [2, 2, 3, 4, 5]]
    labels = [[0], [1], [1]]
    outcome = LDA.lda_function(samples, labels, 3)

    # NOTE(review): reference values are kept for the reader only; the check
    # below never compares the outcome against them.
    expected = [[-0.559, -0.281, -0.78],
                [-0.3336, -0.7849, 0.5219],
                [-0.759, 0.552, 0.345]]

    return "All OK" if outcome else "something went wrong"
def featureSelection(train_x, train_y, test_x):
    """Greedily pick two feature columns by K-fold k-NN accuracy.

    In each of two rounds every not-yet-selected column is tried together
    with the columns already chosen; the candidate set with the highest
    summed fold accuracy wins (ties go to the later column).

    Two defects are fixed here:
      * the k-NN model was trained with the full ``train_y`` instead of the
        fold's ``training_y``, so labels did not line up with the rows;
      * the round's winner (``best_feature``) was computed but the last
        candidate tried was stored as the selection.

    Args:
        train_x: 2-D array of training samples (rows) by features (columns).
        train_y: 1-D array of training labels aligned with ``train_x`` rows.
        test_x: 2-D array of test samples with the same columns.

    Returns:
        ``(train_x[:, selected], test_x[:, selected])``.
    """
    feature_pool = np.arange(train_x.shape[1])
    selected_feature = []
    divided = int(len(train_x) / K)

    for _ in range(2):
        best_acc = 0
        best_feature = None

        for feature in feature_pool:
            # Cannot select the same feature twice.
            if feature in selected_feature:
                continue
            candidate_feature = selected_feature + [feature]
            print()
            print("Candidate feature = {}".format(candidate_feature))

            overall_acc = 0
            for fold in range(K):
                start = divided * fold
                end = divided * (fold + 1)

                # Rows outside [start, end) train the model; rows inside
                # validate it. Only the candidate columns are used.
                training_x = np.concatenate(
                    (train_x[:start, candidate_feature],
                     train_x[end:, candidate_feature]))
                training_y = np.concatenate((train_y[:start], train_y[end:]))
                validation_x = train_x[start:end, candidate_feature]
                validation_y = train_y[start:end]

                y_pred = LDA.knn(training_x, training_y, validation_x)
                overall_acc += LDA.compute_accuracy(y_pred, validation_y)

            print("Overall accuracy: {}".format(overall_acc / K))
            if overall_acc >= best_acc:
                best_acc = overall_acc
                best_feature = candidate_feature

        if best_feature is None:
            # No unselected column was left to try.
            break
        selected_feature = best_feature
        print("Feature selected: {}".format(selected_feature))

    print(train_x[:, selected_feature])
    return train_x[:, selected_feature], test_x[:, selected_feature]
def RunTrainLDA(infile, pcaFile, ldaFile):
    """Train PCA and LDA projections from a pickled dataset and save both.

    ``infile`` must hold two consecutive pickles: the dataset and the subject
    IDs. The PCA projection is saved to ``pcaFile`` and the LDA projection,
    trained on the PCA-projected data, to ``ldaFile`` (both via ``np.save``).

    The input file is now closed even if unpickling fails.
    """
    import cPickle

    # NOTE(review): unpickling executes arbitrary code; only use trusted files.
    with open(infile, "r") as fp:
        dataset = cPickle.load(fp)
        subjID = cPickle.load(fp)

    pca = PCA(dataset)
    pca_proj = pca.compute()
    np.save(pcaFile, pca_proj)

    lda = LDA(dataset, subjID, pca_proj)
    projData = lda.projectData()
    lda_proj = lda.train(projData)
    np.save(ldaFile, lda_proj)
def identify_clusters_in_project(project_name, project_path):
    """Parse a project, weight its class graph with LDA and cluster it.

    The parser output is read from ``<Settings.DIRECTORY>/data/output.json``.
    When ``Settings.RESOLUTION`` is set, a single Louvain run at that
    resolution is performed and written out; otherwise resolutions from 0.3
    to 1.1 in steps of 0.1 are computed.

    Returns:
        For a fixed resolution, a one-element list of
        ``(clusters, modularity, resolution)``; otherwise whatever
        ``Clustering.compute_multiple_resolutions`` returns.
    """
    create_logging_folders(project_name)
    temp_json_location = f'{Settings.DIRECTORY}/data/output.json'
    utils.execute_parser(project_path)

    # Read the parsed document.
    with open(temp_json_location) as json_file:
        parsed_raw_json = json.load(json_file)
    classes = extract_classes_information_from_parsed_json(parsed_raw_json)

    graph = Graph.create_dependencies(classes, nx.DiGraph())
    lda.apply_lda_to_classes(graph, classes)
    calculate_absolute_weights(graph, classes, weight_type=WeightType.LDA)

    # TODO : think about if the pre_processing should be done or not
    graph = Clustering.pre_process(graph,
                                   remove_weak_edges=False,
                                   remove_disconnected_sections=True)

    if not Settings.RESOLUTION:
        return Clustering.compute_multiple_resolutions(graph,
                                                       start=0.3,
                                                       end=1.1,
                                                       step=0.1)

    clusters, modularity = Clustering.community_detection_louvain(
        graph, resolution=Settings.RESOLUTION)
    clusters_results = [(clusters, modularity, Settings.RESOLUTION)]
    Clustering.write_modularity_and_services(clusters_results)

    # TODO: Reconsider techniques of post-processing
    return clusters_results
def hierarchical_topic_analyse_with_silhouette(corpus_filename,
                                               word2vec_model_filename,
                                               lda_filter=False,
                                               k=1):
    """Split every topic's terms into sub-topics via silhouette clustering.

    Args:
        corpus_filename: with ``lda_filter`` a pickle of topic records,
            otherwise a file understood by ``LDA.get_topics_terms``.
        word2vec_model_filename: path of the word2vec model used to embed
            the terms.
        lda_filter: load pre-filtered topics from a pickle instead of the
            LDA model.
        k: forwarded to ``cluster_analyse_with_silhouette``; ``0`` skips
            clustering and returns the topics unchanged (as lists).

    Returns:
        A list of topic records. Each record is the parent's leading fields
        followed by the items of one cluster.
    """
    if lda_filter:
        # Close the handle deterministically instead of leaking it.
        # NOTE(review): unpickling executes arbitrary code; trusted files only.
        with open(corpus_filename) as corpus_file:
            topic2terms = pickle.load(corpus_file)
    else:
        topic2terms = LDA.get_topics_terms(corpus_filename)

    topics = [list(t) for t in topic2terms]
    if k == 0:
        return topics

    word2vec_model = models.Word2Vec.load(word2vec_model_filename)
    new_topics = []
    for topic in topics:
        # The last field of a topic record holds its words.
        clusters = cluster_analyse_with_silhouette(topic[-1], word2vec_model, k)
        for c in clusters:
            new_topics.append(topic[:-1] + list(c))
    return new_topics
def hierarchical_topic_analyse(lda_model_file, word2vec_model_file, k=1):
    """Refine LDA topics by splitting their terms ``k`` times.

    In every round the last field of each topic record (its words) is
    embedded with the word2vec model and clustered; each resulting item
    becomes a new record made of the parent's leading fields followed by the
    item's elements.

    Returns:
        The list of topic records after ``k`` rounds.
    """
    topics = [list(entry) for entry in LDA.get_topics_terms(lda_model_file)]
    word2vec_model = models.Word2Vec.load(word2vec_model_file)

    for _ in range(k):
        refined = []
        for parent in topics:
            dictionary, matrix = get_words_matrix(parent[-1], word2vec_model)
            clusters = cluster(matrix, dictionary, 2, 10)
            prefix = parent[:-1]
            refined.extend(prefix + list(item) for item in clusters)
            # Drop the per-topic intermediates before the next topic.
            del dictionary, matrix, clusters
        topics = refined

    return topics
ax.plot_trisurf([0, eigVects[:2][0, 0], eigVects[:2][1, 0]], [0, eigVects[:2][0, 1], eigVects[:2][1, 1]], [0, eigVects[:2][0, 2], eigVects[:2][1, 2]]) plt.subplot(224) plt.title('投影之后') plt.scatter(array(down_Mat)[:len(mansls), 0], array(down_Mat)[:len(mansls), 1], c='red') plt.scatter(array(down_Mat)[len(mansls):, 0], array(down_Mat)[len(mansls):, 1], c='green') plt.show() #--------------降维后再使用LDA---------------------- #用降维的数据测试LDA w, mean1, mean2, group1, group2 = LDA.train(down_Mat[:len(mansls), :], down_Mat[len(mansls):, :]) #将测试集降维 testboy, labels = LDA.getdata('boy82.txt', 0) testboy = mat(testboy) #训练集降维 test_Mat, test_eigvals, test_eigVects = pca_new(testboy, 2) group3 = test_Mat.T count = 0 for i in range(shape(group3)[1]): if (LDA.predict(group3[:, i].T, w, mean1, mean2))[0, 0] >= 0: #这里要注意和测试数据对应 count = count + 1 print('降维后的准确率:', count / shape(group3)[1]) #降维后有0.96准确率,而没降维只有0.84(用身高,体重)
sample_image = test_images[random.sample(range(len(test_label)), 10)] if input_.mode == 0: ## Doing PCA and get the eigenface and W(dimension reduction) PCA_mean, PCA_EigenFace, PCA_W = PCA(images=images, Size=Size, FacePath="./PCA/EigenFace/") Reconstruct(EigenFace=PCA_EigenFace, sample_image=sample_image, Size=Size, Path="./PCA/") ## Doing LDA and get the fisherface and W(dimension reduction) LDA_mean, LDA_EigenFace, LDA_W = LDA(images=images, Size=Size, label=label, FacePath="./LDA/EigenFace/") Reconstruct(EigenFace=LDA_EigenFace, sample_image=sample_image, Size=Size, Path="./LDA/") elif input_.mode == 1: ## Doing PCA and get the eigenface and W(dimension reduction) print("PCA:") PCA_mean, PCA_EigenFace, PCA_W = PCA(images=images, Size=Size, FacePath=None) ## Using PCA Knn on test image sets, I try to label the test images. KNN("PCA", k = 3, images = images, EigenFace = PCA_EigenFace.T, proj_train_image = PCA_W, label = label, \ test_images = test_images, test_label = test_label)
import LDA as C

# Clear the console, load the training table and plot it twice: once with
# plotLDA and once with plotNormalSurface.
C.clearScreen()

dataTraining = C.loadData("dataTraining.txt")
# Columns 0-1 are passed as X; column 2 is kept 2-D and passed as y.
X, y = dataTraining[:, :2], dataTraining[:, 2:3]

C.plotLDA(X, y)
C.plotNormalSurface(X, y)
# Collect the keyword itself and each of its split parts.
key_set.add(keyword_rec)
key_set.update(keyword_rec_split)

# Keep only keys that the word2vec vocabulary contains.
w2vModel = w2v.w2vModel(data)
key_list = []
for w2vkey in key_set:
    if w2vkey in w2vModel.model.wv:
        key_list.append(w2vkey)
    else:
        print('false:', w2vkey, flush=True)

# Append the words similar to the keys, space separated.
words = w2vModel.get_similar_words(pos_words=key_list)
for word in words:
    rec_words += word[0] + ' '

# Append the quoted words of two LDA topics, skipping the keyword itself.
lmodel = LDA.LDAModel(data, n_topics=100)
topicwords = lmodel.model.print_topics(num_topics=2, num_words=5)
wordsset = set()
for i in topicwords:
    allwords = re.findall(r'".*?"', i[1])
    for j in allwords:
        jj = j.strip('"')
        if keyword_rec != jj:
            wordsset.add(jj)
for i in wordsset:
    rec_words += i + ' '

print(rec_words)
print('key', len(words), flush=True)

# Bind the values as parameters instead of concatenating them into the SQL
# text: a quote inside a keyword or recommended word used to break the
# statement and allowed SQL injection.
# NOTE(review): assumes a MySQL DB-API driver ("format" paramstyle, %s).
sqlcur.execute(
    "REPLACE INTO keyword_recommend_t VALUES(%s, %s, %s)",
    (str(keyword_rec), timestr_2, rec_words))
mysqldb.commit()
import time
import BCWDataset, WQDataset
import LDA, LogisticRegression
import KFoldCrossValidator

# Load both datasets up front, then report 5-fold results for LDA and for
# logistic regression on each of them.
bcwd = BCWDataset.BCWDataset()
bcwd.load()
wqd = WQDataset.WQDataset()
wqd.load()

for tag, dataset in (("BCW", bcwd), ("WQ", wqd)):
    print("LDA, " + tag)
    print(KFoldCrossValidator.validate(LDA.LDA(), 5, dataset.X, dataset.y))

    print("LogReg, " + tag)
    # A fresh model per dataset, with the same hyper-parameters.
    logreg = LogisticRegression.LogisticRegression(flr=0.6,
                                                   slr=0.1,
                                                   num_it=100)
    print(KFoldCrossValidator.validate(logreg, 5, dataset.X, dataset.y))
def _init_trans_mat(self):
    """Build the initial transformation matrix ``L`` per ``self._init_method``.

    Supported methods:
      * ``"PCA"``      - transposed PCA components plus a 1e-6 offset.
      * ``"LDA"``      - LDA transform, each row scaled by its L1 norm.
      * ``"randbeng"`` - normal draws with std ``sqrt(2) / sqrt(D + d)``.
      * ``"randbest"`` - the one of 10 random draws with the fewest active
        constraints after evaluating ``self.loss_fun``.
      * ``"rand"``     - a single random draw.

    Returns:
        The initial matrix ``L``.

    Raises:
        ValueError: if ``X``, ``labels`` or ``d`` is unset, if the
            dimensions are inconsistent for PCA/LDA, or if
            ``self._init_method`` is not one of the methods above
            (previously this surfaced as an ``UnboundLocalError`` at the
            final ``return``).
    """
    # Check input
    if any(x is None for x in (self.X, self.labels, self.d)):
        raise ValueError('X, labels and subdim not set!')

    num_pts = self.X.shape[0]
    D = self.X.shape[1]
    subdim = self.d

    # Setup random state
    if self._SEED is not None:
        prng = RandomState(self._SEED)
        if self._verbose:
            print("Setting random seed to", self._SEED)
    else:
        prng = RandomState()

    if self._init_method == "PCA":
        if num_pts < self.d:
            raise ValueError('num_pts < subdim')
        if self.d > D:
            raise ValueError('subdim > inputdim')
        pca = PCA(n_components=subdim, whiten=False)
        pca.fit(self.X)
        L = pca.components_.T + 1E-6

    elif self._init_method == "LDA":
        if self.d > D:
            raise ValueError('subdim > inputdim')
        lda_obj = LDA.LDA(self.X, self.labels)
        lda_obj.compute(dim=self.d)
        L = lda_obj.getTransform()
        L = L * (1. / LA.norm(L, ord=1, axis=1)).reshape(-1, 1)

    elif self._init_method == "randbeng":
        # NOTE(review): this draws from the global numpy RNG (so self._SEED
        # has no effect here) and reads self.D rather than the local D;
        # confirm both are intended.
        # L = 1. * bound * prng.rand(D, self.d) - bound
        L = np.random.normal(0, np.sqrt(2) / np.sqrt(self.D + self.d),
                             (self.D, self.d))

    elif self._init_method == "randbest":
        # Generate several random matrices and keep the one with the lowest
        # number of active constraints.
        if self._verbose:
            print('Doing random pre-gen L')
        t0 = timeit.default_timer()
        best_L = prng.rand(D, self.d)
        L = best_L
        self.loss_fun(best_L)
        # nconsts = self._count_active_constraints()
        bound = np.sqrt(6. / (D + self.d))
        best_N_consts = 1E10
        for i in range(0, 10):
            L = 1. * bound * prng.rand(D, self.d) - bound
            # loss_fun is evaluated before the active constraints are counted.
            self.loss_fun(L)
            consts = self._count_active_constraints()
            if consts < best_N_consts:
                best_N_consts = consts
                best_L = copy.copy(L)
        L = copy.copy(best_L)
        if self._verbose:
            print("Pre-gen of L done. Took:",
                  "%3.3f" % (timeit.default_timer() - t0),
                  end=", ")
            print("# active const", best_N_consts, end=", ")

    elif self._init_method == "rand":
        # method_str = print('Doing random pre-gen Lapa')
        bound = np.sqrt(6. / (D + self.d))
        L = 1. * bound * prng.rand(D, self.d) - bound

    else:
        raise ValueError(
            'Unknown init method: %r' % (self._init_method,))

    return L
import create_data
import LDA
import argparse

# Command-line entry: build the dataset from the news articles file and run
# the topic modeller with the requested number of topics.
parser = argparse.ArgumentParser(description='Enter number of topics')
# Parse the value as an integer; argparse would otherwise hand the modeller
# the raw string (e.g. "10"). Omitting the option still yields None.
parser.add_argument('--n_topics', type=int, help='Number of Topics')
args = parser.parse_args()

path = "news-articles.txt"
create_data.create(path)
LDA.modeller(args.n_topics)
#!/usr/bin/env python
# encoding: utf-8
import LDA

if __name__ == '__main__':
    # Dump every topic of the model as "<topic>\n<tab-separated terms>\n\n".
    ts = LDA.get_topics_terms('../data/models/sougou_lda_50_model.md')
    # The context manager closes (and flushes) the file even if a write fails.
    with open('topics', 'wb') as out:
        for t, terms in ts.items():
            out.write(str(t) + '\n')
            out.write('\t'.join(terms).encode('utf8') + '\n')
            out.write('\n')
#for userId, user in dic_user.iteritems(): # print(str(userId) + " " + str(len(user.tweet_set))) k_topics = num_topics LDA_iterations = num_iterations sentimentPoints = getSentimentPoints() #print(sentimentPoints) dictionary, corpus, out_set = preprocessing(doc_set) for i in range(0,len(out_set)): tweet_set[i].wordSet = out_set[i] sentimentsOfTweets = getSentimentScoreOfTweets(out_set) model = LDA(dictionary, corpus, k_topics, LDA_iterations) for i in range(0,len(sentimentsOfTweets)): tweet_set[i].russell_tuple = sentimentsOfTweets[i] sentDic = loadDict() dictByTopic = [] tempDic = {} topics = model.get_topics() for topic in topics: tempDic = {} for i in range(0,len(topic)): tempDic[dictionary[i]] = topic[i] dictByTopic.append(tempDic)
def featureSelection(data, isLR):
    """Greedy forward feature selection scored by 5-fold validation.

    The last column of ``data`` is the label. Round 0 picks the single best
    feature; every later round tries adding each remaining feature to the
    ones already selected and keeps the best addition, stopping as soon as
    no addition beats the best accuracy so far.

    Fixes over the previous version: in the later rounds the logistic model
    was built through the non-existent ``lr.lr`` path, and the LDA "model"
    was the class object itself (missing call parentheses) rather than an
    instance.

    Args:
        data: 2-D array, feature columns followed by one label column.
        isLR: score with logistic regression when true, otherwise with LDA.

    Returns:
        ``(selectedFeatureNum, selectedFeatureArray)`` - the chosen column
        indices in selection order and those columns as a 2-D array.
    """
    feature_count = data.shape[1] - 1
    selectedFeatureNum = []
    selectedFeatureArray = -1
    bestAccuracyAll = 0
    y_2d = np.array([data[:, -1]]).T

    def _score(feature_block):
        # Validate a fresh model on the given feature columns plus the label.
        candidate = np.concatenate((feature_block, y_2d), axis=1)
        if isLR:
            return LRKFoldValidation(lr.LogisticRegression(0.001, 500),
                                     candidate, 5)
        return LDAKFoldValidation(LDA.LDA(), candidate, 5)

    for i in range(feature_count):
        featureToAdd = -1
        bestAccuracy = 0
        bestColumn = None
        print("select feature{}".format(i))

        # Try every feature that has not been selected yet.
        for j in range(feature_count):
            if j in selectedFeatureNum:
                continue
            column_2d = np.array([data[:, j]]).T
            if i == 0:
                feature_block = column_2d
            else:
                feature_block = np.concatenate(
                    (selectedFeatureArray, column_2d), axis=1)
            accuracy = _score(feature_block)
            print("Using feature(s){} accuracy is{}".format(
                selectedFeatureNum + [j], accuracy))
            # ">=" means ties go to the later column.
            if accuracy >= bestAccuracy:
                bestAccuracy = accuracy
                featureToAdd = j
                bestColumn = column_2d

        if i == 0:
            # The first feature is always accepted.
            if bestColumn is not None:
                selectedFeatureArray = bestColumn
                bestAccuracyAll = bestAccuracy
            selectedFeatureNum.append(featureToAdd)
            continue

        # Stop when the best addition does not improve on the current best.
        if bestAccuracyAll >= bestAccuracy:
            print("maxima reached")
            break

        # Add the additional feature.
        bestAccuracyAll = bestAccuracy
        selectedFeatureNum.append(featureToAdd)
        selectedFeatureArray = np.concatenate(
            (selectedFeatureArray, np.array([data[:, featureToAdd]]).T),
            axis=1)

    print(
        "feature selection ended, best performing features are {}, the accuracy is {}"
        .format(selectedFeatureNum, bestAccuracyAll))
    return selectedFeatureNum, selectedFeatureArray
x_n = x_n.reshape(-1, 1) p_ks = np.empty(len(self.unique_y)) for j, k in enumerate(self.unique_y): p_x_given_y = self._mvn_density(x_n, self.mu_ks[j], self.Sigma) p_y_given_x = self.pi_ks[j]*p_x_given_y p_ks[j] = p_y_given_x y_n[i] = self.unique_y[np.argmax(p_ks)] return y_n We fit the LDA model below and classify the training observations. As the output shows, we have 100% training accuracy. lda = LDA() lda.fit(X, y) yhat = lda.classify(X) np.mean(yhat == y) The function below visualizes class predictions based on the input values for a model with $\bx_n \in \mathbb{R}^2$. To apply this function, we build a model with only two columns from the `wine` dataset. We see that the decision boundaries are linear, as we expect from LDA. def graph_boundaries(X, model, model_title, n0 = 100, n1 = 100, figsize = (7, 5), label_every = 4): # Generate X for plotting d0_range = np.linspace(X[:,0].min(), X[:,0].max(), n0) d1_range = np.linspace(X[:,1].min(), X[:,1].max(), n1) X_plot = np.array(np.meshgrid(d0_range, d1_range)).T.reshape(-1, 2) # Get class predictions y_plot = model.classify(X_plot).astype(int)
# y_pred = LDA.knn(breast_lower_dimension_train, breast_train_y.values.ravel(), breast_lower_dimension_test) # acc = LDA.compute_accuracy(y_pred, breast_test_y.values.ravel()) # print("=================== IONOSPHERE ==============") # ionosphere_train_x_selection, ionosphere_test_x_selection = featureSelection(ionosphere_train_x.values, ionosphere_train_y.values.ravel(), ionosphere_test_x.values) # ionosphere_lower_dimension_train, ionosphere_lower_dimension_test = LDA.LDA(ionosphere_train_x_selection, ionosphere_train_y.values.ravel(), ionosphere_test_x_selection, ionosphere_test_y.values.ravel(), 'ionosphere') # prior, train_mean, train_cov = NBC.train(ionosphere_train_x.values, ionosphere_train_y.values.ravel(), CLASS_NUM) # acc = NBC.test(ionosphere_test_x.values, ionosphere_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'ionosphere', 'NBC', True) # # # Project to lower dimension # # crossValidation(iris_lower_dimension_train, iris_train_y, CLASS_NUM, 'iris', 'NBC', K) # BUG # prior, train_mean, train_cov = NBC.train(ionosphere_lower_dimension_train, ionosphere_train_y.values.ravel(), CLASS_NUM) # acc = NBC.test(ionosphere_lower_dimension_test, ionosphere_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'ionosphere_lower', 'NBC', True) print("=================== WINE ==============") wine_train_x_selection, wine_test_x_selection = featureSelection(wine_train_x.values, wine_train_y.values.ravel(), wine_test_x.values) wine_lower_dimension_train, wine_lower_dimension_test = LDA.LDA(wine_train_x_selection, wine_train_y.values.ravel(), wine_test_x_selection, wine_test_y.values.ravel(), 'wine') # prior, train_mean, train_cov = NBC.train(wine_train_x.values, wine_train_y.values.ravel(), CLASS_NUM) # acc = NBC.test(wine_test_x.values, wine_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'wine', 'NBC', True) # Pocket classifier # crossValidation(ionosphere_train_x, ionosphere_train_y, class_num, 'ionosphere', 'PC', K) 
train_weight = PC.train(wine_train_x.values, wine_train_y.values.ravel(), CLASS_NUM) acc = PC.test(wine_test_x.values, wine_test_y.values.ravel(), train_weight, CLASS_NUM, 'wine', 'PC', True) # # Project to lower dimension # # crossValidation(iris_lower_dimension_train, iris_train_y, CLASS_NUM, 'iris', 'NBC', K) # BUG # prior, train_mean, train_cov = NBC.train(wine_lower_dimension_train, wine_train_y.values.ravel(), CLASS_NUM) # acc = NBC.test(wine_lower_dimension_test, wine_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'wine_lower', 'NBC', True) # Pocket classifier # crossValidation(ionosphere_train_x, ionosphere_train_y, class_num, 'ionosphere', 'PC', K) train_weight = PC.train(wine_lower_dimension_train, wine_train_y.values.ravel(), CLASS_NUM)
reload(sys) sys.setdefaultencoding('utf8') # inputSrc = './sampleTest.txt' inputSrc = './appleLegalDocDir/apple-osx-test.txt' minTokenNum = 4 tokenList = list() sentenceList = list() with open(inputSrc) as f: text = f.read() # Task 0: use LDA to filter out common topics. This way the main popular topic words won't affect # calculation of similarities. ldaModel = LDA.loadModel() topics, topicCoherencePair = LDA.getTopicsWithNormalizedWeight(ldaModel) sentences = vocabBuilder.tokenize(text) for sentence in sentences: tokens = list() tmpTokens = sentence.split() for token in tmpTokens: if token not in topics: tokens.append(token) if len(tokens) >= minTokenNum: tokenList.append(tokens) sentenceList.append(sentence) print 'There are ', len(sentenceList), ' sentences in the input sample.'
def _recent_entries(entries, limit, cutoff):
    """Return all entries of a short log, else only those not older than cutoff."""
    if len(entries) <= limit:
        return entries
    return [entry for entry in entries if not entry['date'] < cutoff]


def _cap_tag_counts(tags, max_tags=50):
    """Count tags and cap each count at 1.5x the previous (rarer) tag's count.

    Tags are ranked by ascending frequency and only the first ``max_tags``
    are kept. A count of 1 on the rarest tag is raised to 2. The ranked
    pairs are converted to lists up front; the rarest one used to stay a
    tuple, so raising its count failed with a TypeError.
    """
    ranked = [list(pair)
              for pair in sorted(Counter(tags).items(), key=lambda x: x[1])]
    ranked = ranked[:max_tags]
    if ranked and ranked[0][1] == 1:
        ranked[0][1] = 2
    # Each entry is capped against its already-capped predecessor.
    for previous, current in zip(ranked, ranked[1:]):
        if (previous[1] * 1.5) < current[1]:
            current[1] = int(previous[1] * 1.5)
    return {tag: count for tag, count in ranked}


def SJ_interest_measurement_run():
    """Recompute and store the interest profile of every user due for renewal.

    For each user returned by ``find_user_renewal`` that has at least one
    favourite or view, the function backs up the user's log, accumulates
    topic vectors, tags and tokens from the favourite, view, search and
    newsfeed logs, weights and combines them, and writes the result with
    ``update_user_measurement``. Finally the ``renewal`` variable is set to
    the current time.

    A log longer than its limit (``SJ_USER_LOG_LIMIT[...]`` times
    ``SJ_USER_ACTION_NUM_CHECK_PERCENT``) only contributes entries dated at
    or after the cutoff from ``get_default_day``.

    The Mongo client is now closed even when processing raises.
    """
    db_client = MongoClient('mongodb://%s:%s@%s' %
                            (MONGODB_ID, MONGODB_PW, MONGODB_HOST))
    try:
        db = db_client["soojle"]

        renewal_time = find_variable(db, 'renewal')
        USER_list = list(find_user_renewal(db, renewal_time))
        ACTION_DAY_CHECK = get_default_day(SJ_USER_ACTION_DAY_CHECK)

        # Converted to a dict for fast lookup when building newsfeed tags.
        CATEGORY = {}
        for cate in find_all_category_of_topic(db):
            CATEGORY[cate['category_name']] = cate['tag']

        # Newsfeed names counted individually; anything else is counted
        # under the '커뮤니티' category.
        named_feeds = ('대학교', '동아리&모임', '공모전&행사', '진로&구인')
        other_feed = '커뮤니티'

        for USER in USER_list:
            # Users without any favourite or view are not measured.
            if (len(USER['fav_list']) == 0) and (len(USER['view_list']) == 0):
                continue

            user_log_backup(db, USER)

            # ---- posts the user marked as favourite ----
            fav_tag = []
            fav_token = []
            fav_topic = np.zeros(LDA.NUM_TOPICS)
            fav_limit = SJ_USER_LOG_LIMIT['fav'] * SJ_USER_ACTION_NUM_CHECK_PERCENT
            for fav in _recent_entries(USER['fav_list'], fav_limit,
                                       ACTION_DAY_CHECK):
                fav_topic += fav['topic']
                fav_tag += fav['tag']
                fav_token += fav['token']
            # Favourites count double in the FastText document.
            fav_doc = (fav_tag + fav_token) * 2

            # ---- posts the user viewed ----
            view_tag = []
            view_token = []
            view_topic = np.zeros(LDA.NUM_TOPICS)
            view_limit = SJ_USER_LOG_LIMIT['view'] * SJ_USER_ACTION_NUM_CHECK_PERCENT
            for view in _recent_entries(USER['view_list'], view_limit,
                                        ACTION_DAY_CHECK):
                view_topic += view['topic']
                view_tag += view['tag']
                view_token += view['token']
            view_doc = view_tag + view_token

            # ---- keywords the user searched for ----
            search_list = []
            search_limit = SJ_USER_LOG_LIMIT['search'] * SJ_USER_ACTION_NUM_CHECK_PERCENT
            for search in _recent_entries(USER['search_list'], search_limit,
                                          ACTION_DAY_CHECK):
                search_list += search['tokenizer_split']
            search_topic = LDA.get_topics(search_list)
            search_doc = search_list

            # ---- newsfeeds the user opened ----
            newsfeed_tag = []
            feed_counts = Counter()
            newsfeed_limit = SJ_USER_LOG_LIMIT['newsfeed'] * SJ_USER_ACTION_NUM_CHECK_PERCENT
            # NOTE(review): per-entry tags are only collected when the log is
            # long enough to be date-filtered; the short-log path counts
            # categories only. Confirm this asymmetry is intended.
            collect_entry_tags = len(USER['newsfeed_list']) > newsfeed_limit
            for newsfeed in _recent_entries(USER['newsfeed_list'],
                                            newsfeed_limit, ACTION_DAY_CHECK):
                feed_name = newsfeed['newsfeed_name']
                if feed_name in named_feeds:
                    feed_counts[feed_name] += 1
                else:
                    feed_counts[other_feed] += 1
                if collect_entry_tags:
                    newsfeed_tag += newsfeed['tag']
            # Every category is looked up, even when its count is zero.
            for category_name in named_feeds + (other_feed,):
                newsfeed_tag += CATEGORY[category_name] * feed_counts[category_name]
            newsfeed_topic = LDA.get_topics(newsfeed_tag)

            # ---- weighting ----
            fav_tag *= SJ_FAV_TAG_WEIGHT
            view_tag *= SJ_VIEW_TAG_WEIGHT
            fav_topic *= SJ_FAV_TOPIC_WEIGHT
            view_topic *= SJ_VIEW_TOPIC_WEIGHT
            search_topic *= SJ_SEARCH_TOPIC_WEIGHT
            newsfeed_topic *= SJ_NEWSFEED_TOPIC_WEIGHT

            # Averaged over the full log length, not only the entries used.
            if len(USER['fav_list']) != 0:
                fav_topic /= len(USER['fav_list'])
            if len(USER['view_list']) != 0:
                view_topic /= len(USER['view_list'])

            # LDA topic
            TOPIC_RESULT = (fav_topic + view_topic + search_topic +
                            newsfeed_topic) / SJ_TOPIC_RESULT_DIV

            # FastText document vector
            FastText_doc = fav_doc + view_doc + search_doc
            if FastText_doc:
                USER_VERCTOR = FastText.get_doc_vector(FastText_doc).tolist()
            else:
                USER_VERCTOR = (np.zeros(FastText.VEC_SIZE)).tolist()

            # Final tag object
            TAG_RESULT = _cap_tag_counts(fav_tag + view_tag)

            USER_TAG_SUM = sum(TAG_RESULT.values())
            USER_TAG_SUM *= SJ_TAG_SUM_WEIGHT
            if USER_TAG_SUM == 0:
                USER_TAG_SUM = 1

            # Build the user's tag vector from the capped tag counts.
            USER_TAGS = []
            for key, value in TAG_RESULT.items():
                USER_TAGS += [key] * value
            TAG_VECTOR = FastText.get_doc_vector(USER_TAGS).tolist()

            # Store the refreshed interest profile of this user.
            update_user_measurement(
                db, USER['_id'], list(TOPIC_RESULT), TAG_RESULT, USER_TAG_SUM,
                TAG_VECTOR, USER_VERCTOR,
                len(USER['fav_list']) + len(USER['view_list']) +
                len(USER['search_list']))

        update_variable(db, 'renewal', datetime.now())
    finally:
        db_client.close()
k_topics = 3 LDA_iterations = 500 sentimentPoints = getSentimentPoints() dictionary, corpus, out_set = preprocessing(doc_set) fileOut = open("out_dic", 'w') print(dictionary, file=fileOut) fileOut.close() for i in range(0, len(out_set)): tweet_set[i].wordSet = out_set[i] sentimentsOfTweets = getSentimentScoreOfTweets(out_set) model = LDA(dictionary, corpus, k_topics, LDA_iterations) #sentimentsOfTopics = getSentimentsScoreOfTopics(out_set, model.get_topics(), dictionary) print(sentimentsOfTweets) #print(sentimentsOfTopics) for i in range(0, len(sentimentsOfTweets)): tweet_set[i].russell_tuple = sentimentsOfTweets[i] fileOut = open("out_model", 'w') print(model.print_topics(num_topics=k_topics, num_words=10), file=fileOut) fileOut.close() #print(getStrOfSentiment(getPolaritySent(sentimentsOfTweets[0]))) #print(getStrOfSentiment(getPrimarySent(sentimentsOfTweets[0], sentimentPoints)))
def perform_lda(train_dataset, train_labelset, test_dataset):
    """Fit LDA on the training data and project both data sets.

    Args:
        train_dataset: training samples.
        train_labelset: labels aligned with ``train_dataset``.
        test_dataset: samples to project with the fitted projection matrix.

    Returns:
        ``(projected_train_data, projected_test_data)``.
    """
    lda = LDA.LDA(train_dataset, train_labelset)
    projection_matrix, projected_train_data = lda.fit()

    # Diagnostic: the projection and test-set shapes. The test-set part used
    # to print the shape of the shape tuple, i.e. just ``(ndim,)``.
    print(np.shape(projection_matrix), np.shape(test_dataset))

    projected_test_data = lda.test_fit(projection_matrix, test_dataset)
    return projected_train_data, projected_test_data
# In[52]: (wtp, lwtp, zd, totz) = model.GibbsSampler(widf, docs, rdf.word, T, ITER, lidf,lwd) # In[55]: fpath = path+model_name+'_T'+str(T)+'s'+str(min_durantion)+'/' if not os.path.exists(fpath): os.makedirs(fpath) if model_name == 'LDA': model.visualTopic(T, fpath, dataset, wtp) else: util.visualTopic(T, fpath, dataset, wtp) np.save(fpath+'wtp.npy',wtp) if(len(lwtp)>0): np.save(fpath+'lwtp.npy',lwtp) # In[56]: if LW: (nwtp,nlwtp) = util.nomalise(wtp, lwtp) elif model_name != 'LDA': (nwtp,nlwtp) = util.nomalise(wtp) else: nlwtp=[]
print('K', lda.K) print('_uniqTermSet', lda._uniqTermSet) print('docsSize', lda._docNum) print('termSize', lda._termNum) print('Z ini:', lda.Z) print('docTopic ini', lda._docTopic) ##4 doc,2topic print('lda.termTopic', lda._termTopic) print('lda.Phi', lda.Phi) print('lda.Theta', lda.Theta) if __name__ == "__main__": corpus = [ "With all of the critical success Downey had experienced throughout his career, he had not appeared in a blockbuster film. That changed in 2008 when Downey starred in two critically and commercially successful films, Iron Man and Tropic Thunder. In the article Ben Stiller wrote for Downey's entry in the 2008 edition of The Time 100, he offered an observation on Downey's commercially successful summer at the box office.", "On June 14, 2010, Downey and his wife Susan opened their own production company called Team Downey. Their first project was The Judge.", "Robert John Downey Jr. is an American actor, producer, and singer. His career has been characterized by critical and popular success in his youth, followed by a period of substance abuse and legal troubles, before a resurgence of commercial success in middle age.", "In 2008, Downey was named by Time magazine among the 100 most influential people in the world, and from 2013 to 2015, he was listed by Forbes as Hollywood's highest-paid actor. His films have grossed over $14.4 billion worldwide, making him the second highest-grossing box-office star of all time." ] X = [i.split(' ') for i in corpus] lda = LDA.LDA() lda.fit(X) printAttr(lda) #fig,ax= lda.plotDocTopicDist(2) #fig,ax = lda.plotTermTopicDist(2) #fig,ax = lda.plotTopicTermDist(1) plt.show()
# tokens_ko = t.nouns(document1)
# print(type(tokens_ko))
# print(tokens_ko)

# NOTE(review): the statements down to `tokens_ko = []` look like the tail of a
# per-document loop whose header is not visible here -- confirm their nesting.
split = []

# Strip Korean stop words from the current token list and keep the result as
# one document of the corpus.
docRemovingStopWord = [token for token in tokens_ko if token not in koreanStopWord]
texts.append(docRemovingStopWord)
tokens_ko = []

# Variable declared for lookups (the script runs fine without it).
# ko = nltk.Text(tokens_ko, name='document')

# Convert the tokenised data back into a list.
# texts.append(tokens_ko)

# Build a dictionary from the token lists (an id is generated for each token).
dictionary = LDA.Dictionary(texts)
# dictionary_path = "/home/ice-kms/LDAModel/iter_1000_Real_articleDic_10000_compound_topicNum_20.dict"
# corpora.Dictionary.save(dictionary, dictionary_path)

# Turn every tokenised document into its bag-of-words form; this is the
# document-term matrix the model is trained on.
corpus = list(map(dictionary.doc2bow, texts))

# Build the LDA model.
# ldamodel = gensim.models.LdaMallet(corpus,num_topics=20,id2word=dictionary, passes=20)
ldamodel = LDA.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=100,
    iterations=1000,
)
# print(len(ldamodel))
def testPic(dataMat, label):
    """Poll for a captured face image and classify it against the training set.

    Waits for ``./pic/s0.bmp`` to appear, projects it through the PCA and LDA
    bases built from ``dataMat``/``label``, and takes the label of the nearest
    training sample (Euclidean distance) as the prediction. The file is
    deleted after each evaluation.

    Every five evaluated images the tallies are checked:

    * label 16 or label 17 predicted at least four times: print the matching
      welcome message, release the camera, close the OpenCV windows, return;
    * otherwise: reset the tallies, print both failure messages (the failure
      counter advances once per message) and release the camera once that
      counter has reached 5. The loop keeps running in that case, as in the
      original, where the ``break`` statements were commented out.

    Relies on the module-level names ``LDA``, ``ImageSet``, ``camera``,
    ``cv2``, ``np`` and ``os``.

    Args:
        dataMat: training data passed to ``LDA.pca`` and ``LDA.lda``.
        label: training labels; indexed by nearest-sample position and
            converted with ``int()``.
    """
    # def testPic(dataMat, label, disc_set, disc_value, redVects, Train_LDA):
    import time  # local import: only needed for the polling back-off below

    print("thread")

    img_path = './pic/s0.bmp'
    rounds_per_batch = 5
    hits_to_accept = 4
    poll_interval = 0.01  # seconds; keeps the wait loop from pegging a CPU core

    # dataMat and label never change inside the loop, so the projections are
    # built once instead of on every captured image (the commented-out
    # signature above already expected them precomputed).
    # NOTE(review): assumes LDA.pca / LDA.lda depend only on their arguments.
    disc_set, _disc_value, _mean_face = LDA.pca(dataMat, 40)
    redVects, Train_LDA = LDA.lda(dataMat, label, 40, 17, 11, 11 * 17)  # LDA projection space, final training set

    j = 0          # images evaluated in the current batch
    isRight = 0    # predictions of label 16 in the current batch
    isRight2 = 0   # predictions of label 17 in the current batch
    testTimes = 0  # failure counter, never reset

    while True:
        if not os.path.isfile(img_path):
            # Sleep briefly instead of spinning while waiting for the image.
            time.sleep(poll_interval)
            continue

        # testImgSet = createImageSet.createTestMat('Yale', testInClass, testNum, testInClass, 100 * 100)
        probe = ImageSet.HistogramEqualization(img_path)
        # print("shape", probe.shape)
        probe = np.reshape(probe, (-1, 1))
        probe = disc_set.T.dot(probe)
        probe = redVects.T.dot(probe)

        # Distance from the projected probe to every projected training sample.
        testVec = np.reshape(probe, (1, -1))
        disList = [np.linalg.norm(testVec - sample) for sample in Train_LDA.T]
        # print('disList', disList)
        sortIndex = np.argsort(disList)

        nearest = label[sortIndex[0]]
        print(nearest)
        predicted = int(nearest)
        if predicted == 16:
            isRight += 1
        if predicted == 17:
            isRight2 += 1

        os.remove(img_path)
        j += 1

        # Detect eyes on the face; (40, 40) sets the minimum size, smaller
        # regions are not detected.
        # eyes = eye_cascade.detectMultiScale(roi_gray, 1.03, 5, 0, (40, 40))
        # Draw the eyes.
        # for(ex, ey, ew, eh) in eyes:
        #     cv2.rectangle(img, (x+ex, y+ey), (x+ex+ew, y+ey+eh), (0, 255, 0), 2)

        if j != rounds_per_batch:
            continue

        if isRight >= hits_to_accept:
            print("欢迎你,史长顺!")
            camera.release()
            cv2.destroyAllWindows()
            break
        if isRight2 >= hits_to_accept:
            print("欢迎你,饶丝雨!")
            camera.release()
            cv2.destroyAllWindows()
            break

        # Neither identity reached the threshold: report both failures and
        # start a new batch.
        # NOTE(review): the failure counter advances twice per failed batch and
        # the loop continues after camera.release(); both match the original.
        for failure_message in ("测试失败", "测试失败2"):
            testTimes += 1
            print(failure_message)
            if testTimes >= 5:
                camera.release()
        isRight = 0
        isRight2 = 0
        j = 0
def measurement_run():
    """Recompute and store the interest profile of every user due for renewal.

    For each user returned by ``find_user_renewal`` the function derives:

    * a topic vector: weighted sum of the favourite, view, search and newsfeed
      topic vectors, divided by ``SJ_TOPIC_RESULT_DIV``;
    * a FastText document vector built from favourite, view and search tokens
      (a zero vector when there are none);
    * the ten most frequent tags among the weighted favourite and view tags,
      plus their weighted count sum (forced to 1 when it would be 0);

    and writes them with ``update_user_measurement``, incrementing the user's
    ``measurement_num``. Afterwards the ``'renewal'`` variable is set to the
    current time.

    The MongoDB client is closed even when a step raises; the original leaked
    the connection on any error.
    """
    db_client = MongoClient('mongodb://%s:%s@%s' % (MONGODB_ID, MONGODB_PW, MONGODB_HOST))
    try:
        db = db_client["soojle"]
        renewal_time = find_variable(db, 'renewal')

        # Only users at or past the stored renewal time are measured, i.e.
        # those whose interest indicators have changed.
        # Materialised up front, as in the original, before any update is issued.
        users = list(find_user_renewal(db, renewal_time))

        for user in users:
            # --- Posts the user marked as favourite -------------------------
            fav_topic = np.zeros(LDA.NUM_TOPICS)
            fav_tag = []
            fav_token = []
            for fav in user['fav_list']:
                fav_topic += fav['topic']
                fav_tag += fav['tag']
                fav_token += fav['token']
            # Favourite tokens count twice in the FastText document.
            fav_doc = (fav_tag + fav_token) * 2

            # --- Posts the user viewed --------------------------------------
            view_topic = np.zeros(LDA.NUM_TOPICS)
            view_tag = []
            view_token = []
            for view in user['view_list']:
                view_topic += view['topic']
                view_tag += view['tag']
                view_token += view['token']
            view_doc = view_tag + view_token

            # --- Keywords the user searched for -----------------------------
            search_list = []
            for search_obj in user['search_list'][:SJ_SEARCH_MEASURE_NUM]:
                search_list += search_obj['tokenizer_split']
            search_topic = LDA.get_topics(search_list)

            # --- Newsfeeds the user opened ----------------------------------
            newsfeed_tag = []
            for newsfeed in user['newsfeed_list']:
                newsfeed_tag += newsfeed['tag']
            newsfeed_topic = LDA.get_topics(newsfeed_tag)

            # --- Weighting --------------------------------------------------
            # Tag weights repeat the tag lists (list *= n), so they only affect
            # the tag counts below; the documents above are already built.
            # NOTE(review): presumably the tag weights are integers -- confirm.
            fav_tag *= SJ_FAV_TAG_WEIGHT
            view_tag *= SJ_VIEW_TAG_WEIGHT
            fav_topic *= SJ_FAV_TOPIC_WEIGHT
            view_topic *= SJ_VIEW_TOPIC_WEIGHT
            search_topic *= SJ_SEARCH_TOPIC_WEIGHT
            newsfeed_topic *= SJ_NEWSFEED_TOPIC_WEIGHT

            # Average the accumulated vectors; skipped for empty lists to avoid
            # dividing by zero.
            fav_count = len(user['fav_list'])
            if fav_count != 0:
                fav_topic /= fav_count
            view_count = len(user['view_list'])
            if view_count != 0:
                view_topic /= view_count

            # --- LDA topic result -------------------------------------------
            topic_result = (
                fav_topic + view_topic + search_topic + newsfeed_topic
            ) / SJ_TOPIC_RESULT_DIV

            # --- FastText user vector ---------------------------------------
            fasttext_doc = fav_doc + view_doc + search_list
            if fasttext_doc:
                user_vector = FastText.get_doc_vector(fasttext_doc).tolist()
            else:
                user_vector = np.zeros(FastText.VEC_SIZE).tolist()

            # --- Tags: keep the ten most frequent ---------------------------
            tag_counts = Counter(fav_tag + view_tag)
            ranked_tags = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)
            tag_result = dict(ranked_tags[:10])

            # Scale the tag total (the original comment says 1.5x; the actual
            # factor is SJ_TAG_SUM_WEIGHT) and never store 0.
            user_tag_sum = sum(tag_result.values()) * SJ_TAG_SUM_WEIGHT
            if user_tag_sum == 0:
                user_tag_sum = 1

            # Store the refreshed profile and bump the measurement counter.
            update_user_measurement(
                db,
                user['_id'],
                list(topic_result),
                tag_result,
                user_tag_sum,
                user_vector,
                user['measurement_num'] + 1,
            )

        # NOTE(review): the timestamp is taken after processing, as in the
        # original; users whose data changes while this run is in progress may
        # be skipped by the next run -- confirm against find_user_renewal.
        update_variable(db, 'renewal', datetime.now())
    finally:
        db_client.close()