import csv

import config
import plsa

n = config.list_alltf_result  # list of tf values for every word in each document; change the output file name after each run
fw = open("/home/matsui-pc/matsui/experiment/plsa_n.csv", "a")  # created fresh on the first run, appended to afterwards
csvWriter = csv.writer(fw)  # writer for the CSV output
for i in range(len(n)):
    csvWriter.writerow(n[i])  # write the file names and feature values to the CSV file
fw.close()

# print len(n[0])  ## for checking
# for i in range(len(n)):
#     print n[i]
# print len(n[0])

p = plsa.plsa(n)  # hand the matrix to the plsa class in plsa.py
print "computing PLSA"
p.train()  # iterate the EM steps
print "\n"

# print "*********** final output ************"
print "P(z) = ",
print p.pz  # P(z)
print "P(d|z) = ",
print p.pz_d  # P(d|z)
print "P(w|z) = ",
print p.pz_w  # P(w|z)
print "P(z|d,w)",
print p.pdw_z  # P(z|d,w)
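# The plsa module used above is not included here. Below is a minimal sketch
# of the EM iteration such a class might run, for reference only: it assumes
# n is a document-by-word tf matrix and reuses the attribute names printed
# above (pz, pz_d, pz_w, pdw_z); the constructor signature and the fixed
# iteration count are assumptions, not the actual implementation.
import numpy as np

class plsa(object):
    def __init__(self, n, nZ=3, iters=100):
        self.n = np.asarray(n, dtype=float)    # n[d][w]: tf value of word w in document d
        nD, nW = self.n.shape
        self.iters = iters
        self.pz = np.random.rand(nZ)           # P(z)
        self.pz_d = np.random.rand(nZ, nD)     # P(d|z)
        self.pz_w = np.random.rand(nZ, nW)     # P(w|z)
        self.pdw_z = np.zeros((nZ, nD, nW))    # P(z|d,w)
        self._normalize()

    def _normalize(self):
        self.pz /= self.pz.sum()
        self.pz_d /= self.pz_d.sum(axis=1, keepdims=True)
        self.pz_w /= self.pz_w.sum(axis=1, keepdims=True)

    def train(self):
        for _ in range(self.iters):
            # E-step: P(z|d,w) proportional to P(z) P(d|z) P(w|z)
            joint = self.pz[:, None, None] * self.pz_d[:, :, None] * self.pz_w[:, None, :]
            self.pdw_z = joint / joint.sum(axis=0, keepdims=True)
            # M-step: re-estimate every distribution from the expected counts n(d,w) P(z|d,w)
            weighted = self.n[None, :, :] * self.pdw_z
            self.pz_d = weighted.sum(axis=2)     # sum over w
            self.pz_w = weighted.sum(axis=1)     # sum over d
            self.pz = weighted.sum(axis=(1, 2))  # sum over d and w
            self._normalize()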
import glob
import os

import pandas as pd

import common_utils
import plsa

# iterate over the files in the directory.
document_paths = ['./Files/']
documents = []
for document_path in document_paths:
    for document_file in glob.glob(os.path.join(document_path, '*.txt')):
        words, lines = common_utils.split(stop_words_set, document_file)  # tokenize; stop_words_set is assumed to be defined earlier
        documents.append(words)

vocabulary = common_utils.build_vocabulary(documents)
number_of_topics = 3
max_iterations = 1
topic_word_prob, document_topic_prob = plsa.plsa(number_of_topics, max_iterations, documents)
common_utils.print_topic_word_distribution(topic_word_prob, vocabulary, number_of_topics, 3, "./topic-word.txt")
common_utils.print_document_topic_distribution(document_topic_prob, documents, number_of_topics, 3, "./document-topic.txt")

path_wordsim = './wordsim353_sim_rel/wordsim_similarity_goldstandard.txt'
data_cos = []
data_scalar = []
plsa_matrix = pd.DataFrame(data=topic_word_prob, columns=vocabulary)  # topics x words: P(w|z)
consistent_wordsim = common_utils.read_consistent_wordsim(path_wordsim)  # call truncated in the source; path_wordsim is an assumed argument
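# The evaluation that follows the truncated call is not shown. Below is a
# minimal sketch of what the empty data_cos / data_scalar lists suggest it
# computes, assuming read_consistent_wordsim yields (word1, word2, gold_score)
# triples whose words all appear in vocabulary; the helper name
# evaluate_pairs is hypothetical.
import numpy as np

def evaluate_pairs(plsa_matrix, consistent_wordsim):
    data_cos, data_scalar, gold = [], [], []
    for word1, word2, score in consistent_wordsim:
        v1 = plsa_matrix[word1].values  # topic vector P(word1|z)
        v2 = plsa_matrix[word2].values  # topic vector P(word2|z)
        dot = np.dot(v1, v2)
        data_scalar.append(dot)                                           # raw scalar product
        data_cos.append(dot / (np.linalg.norm(v1) * np.linalg.norm(v2)))  # cosine similarity
        gold.append(score)
    return data_cos, data_scalar, gold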
import numpy as np

import plsa

# fill the document-word count matrix from the per-document frequencies
for i in range(nD):
    for word in word_freq[i]:
        Ndw[i][wordID[word]] = word_freq[i][word]
print Ndw
# pprint(total_freq)
# print Ndw

nZ = 3  # number of topics
noise = np.random.rand(nD, nW)  # random jitter added to the counts to break ties and avoid all-zero entries
Pd_z, Pw_z, Pz_d, Pz_w = plsa.plsa(Ndw + noise, nZ, 100)

# stack the document and word topic distributions for visualization
Y = np.concatenate((Pz_d.T, Pz_w.T))
Y = Y[:, :-1]  # with nZ = 3, keep the first two topic coordinates for a 2-D plot

# for i in range(len(Y)):
#     Y[i] = Y[i][:2]
# Y = Y[:2]
# print np.shape(Y)
# Y = tsne.tsne(Ndw.T, 2, nD)
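# The snippet above assumes nD, nW, word_freq, wordID and Ndw already exist.
# A minimal sketch of how they could be built from tokenized documents; the
# docs variable and its contents are toy placeholders, not the actual data.
from collections import Counter
import numpy as np

docs = [["apple", "banana", "apple"], ["banana", "cherry"]]  # toy documents
word_freq = [Counter(doc) for doc in docs]          # per-document term counts
vocab = sorted(set(w for doc in docs for w in doc))
wordID = dict((w, i) for i, w in enumerate(vocab))  # word -> column index
nD, nW = len(docs), len(vocab)
Ndw = np.zeros((nD, nW))                            # document-word matrix, filled by the loop above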