# ---- PLSA training pipeline (pickle-based persistence variant) ----
num_of_topic = 4  # number of latent topics shared by clustering and PLSA
iterations = 20   # EM training iterations

# Read the raw corpus and preprocess it into {doc_ID: {word: count}}.
doc = ProcDoc.readFile(doc_path)
doc_dict = ProcDoc.docPreproc(doc)

# General model: aggregate per-document counts into collection-wide counts.
collection = {}
for doc_ID, word_count in doc_dict.items():
    for word, count in word_count.items():
        collection[word] = collection.get(word, 0) + count

# Materialize the vocabulary once: a dict_keys view cannot be pickled in
# Python 3, and a concrete list avoids re-deriving the view on every use.
vocab_words = list(collection.keys())

# Train the cluster model (which produces p(w|z)) only if no cached
# p(w|z) exists in cluster_dir yet.
if not os.path.isfile(cluster_dir + "/pwz_list.pkl"):
    with open("exp/w_IDs.pkl", "wb") as wIDs_file:
        pickle.dump(vocab_words, wIDs_file, True)
    cluster_mdl = ClusterModel(doc_dict, vocab_words, num_of_topic)
    cluster_mdl.save(cluster_dir)

with open(cluster_dir + "/pwz_list.pkl", "rb") as pwz_file:
    pwz = pickle.load(pwz_file)

# Dense document-term matrix; transposed before PLSA so rows become terms.
doc_np, doc_IDs = ProcDoc.dict2npDense(doc_dict, vocab_words)
pwd = np.ones((doc_np.shape[0], num_of_topic))  # uniform init, docs x topics
doc_np = np.transpose(doc_np)

# PLSA EM training ("EM_Trainging" is the project API's spelling).
model = pLSA(doc_np, num_of_topic, pwz, pwd)
[pzd, pwz, pzdw] = model.EM_Trainging(iterations)

# Persist the learned distributions.
with open("exp/pzd.pkl", "wb") as pzd_file:
    pickle.dump(pzd, pzd_file, True)
with open("exp/pwz.pkl", "wb") as pwz_file:
    pickle.dump(pwz, pwz_file, True)
with open("exp/pzdw.pkl", "wb") as pzdw_file:
    pickle.dump(pzdw, pzdw_file, True)
# ---- PLSA training pipeline (NumPy .npy persistence variant) ----
num_topics = 4   # number of latent topics shared by clustering and PLSA
iterations = 20  # EM training iterations

# Read the raw corpus and preprocess it into {doc_ID: {word: count}}.
doc_file = ProcDoc.readFile(doc_path)
doc_mdl_dict = ProcDoc.docPreproc(doc_file)

# General model: assign each distinct word a dense integer ID in
# first-seen order. len(vocab) is O(1); the original
# len(list(vocab.keys())) rebuilt the full key list per word (O(n^2)).
vocab = {}
for doc_ID, word_count in doc_mdl_dict.items():
    for word in word_count:  # counts are unused here; iterate keys only
        if word not in vocab:
            vocab[word] = len(vocab)

# Train the cluster model (which produces p(w|z)) only if no cached
# p(w|z) exists in cluster_dir yet.
if not os.path.isfile(cluster_dir + "/pwz_list.npy"):
    # NOTE(review): np.save on a dict stores a pickled object array;
    # loading it back needs allow_pickle=True — confirm the consumer.
    np.save("exp/w_IDs", vocab)
    cluster_mdl = ClusterModel(doc_mdl_dict, vocab, num_topics)
    cluster_mdl.save(cluster_dir)

pwz = np.load(cluster_dir + "/pwz_list.npy")

# Dense document-term matrix; transposed before PLSA so rows become terms.
doc_mdl_np, _, doc_IDs = ProcDoc.dict2npDense(doc_mdl_dict, list(vocab.keys()))
pzd = np.ones((doc_mdl_np.shape[0], num_topics))  # uniform init, docs x topics
doc_mdl_np = np.transpose(doc_mdl_np)

# PLSA EM training ("EM_Trainging" is the project API's spelling).
model = pLSA(doc_mdl_np, num_topics, pwz, pzd)
[pzd, pwz, pzdw] = model.EM_Trainging(iterations)

# Persist the learned distributions.
np.save("exp/pzd", pzd)
np.save("exp/pwz", pwz)
np.save("exp/pzdw", pzdw)