@classmethod
def create_my_summarizer(cls, cae_model_path, word_vector_model_path="vector/100", mode=0):
    word_vectors = WordVectors.load(word_vector_model_path)
    convae = ConvolutionAutoEncoder.rebuild_for_testing(mini_batch_size=1, filemodel=cae_model_path)
    return CAESummarizer(convae, word_vectors, mode)
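# Usage sketch (hedged): given the `cls` parameter this is a classmethod of
# CAESummarizer, so a plausible call looks like the lines below.
# "model/CAE.model" is an assumed checkpoint path, not a file shipped here.
#
#     summarizer = CAESummarizer.create_my_summarizer("model/CAE.model", mode=0)
#     summary = CAESummarizer.summary(cluster, 100)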
import os
import xml.etree.ElementTree as ET
import nltk
from nltk.corpus import treebank, brown
import matplotlib.pyplot as plt
from vector.wordvectors import WordVectors


def statistic_freq():
    wordvectors = WordVectors.load("model/wordvector.txt")
    freq_array = [0] * 500

    # Penn Treebank
    for sent in treebank.sents():
        words = nltk.word_tokenize(" ".join(sent))
        freq_array[len(words)] += 1

    # Brown
    for sent in brown.sents():
        words = nltk.word_tokenize(" ".join(sent))
        freq_array[len(words)] += 1

    # DUC data
    folder_path = "/Users/HyNguyen/Documents/Research/Data/DUC20042005/duc2004/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs"
    for cluster_name in os.listdir(folder_path):
        if cluster_name[0] == ".":  # skip hidden files such as .DS_Store on macOS
            continue
        for file_name in os.listdir(folder_path + "/" + cluster_name):
            if file_name[0] == ".":
                continue
            file_path = folder_path + "/" + cluster_name + "/" + file_name
            try:
                root = ET.parse(file_path).getroot()
                # find("TEXT") replaces the old root._children[3], which relied
                # on a private attribute and a fixed child position
                text_tag = root.find("TEXT")
                if text_tag is not None:
                    text = text_tag.text.replace("\n", "")
                    for sentence in nltk.tokenize.sent_tokenize(text):
                        words = nltk.word_tokenize(sentence)
                        freq_array[len(words)] += 1
            except Exception:
                print("exception parse XML:", file_name)
                continue
        print("Finish cluster name:", cluster_name, ", Wordvector size:", str(wordvectors.embed_matrix.shape[0]))
    plt.plot(range(200), freq_array[:200], color='red', marker='.')
    plt.show()
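# Hedged refactor sketch: the <TEXT> extraction above recurs when the CAE
# training data is built further down, so a shared helper keeps the parsing in
# one place. duc_sentences is a name introduced here for illustration only;
# it is not part of the project's API.
def duc_sentences(file_path):
    """Yield tokenized sentences from the <TEXT> tag of one DUC document."""
    root = ET.parse(file_path).getroot()
    text_tag = root.find("TEXT")
    if text_tag is None or text_tag.text is None:
        return
    for sentence in nltk.tokenize.sent_tokenize(text_tag.text.replace("\n", " ")):
        yield nltk.word_tokenize(sentence)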
def create_summary_format_duc2004(ducpath, wordvectors_path, summary_path):
    wordvectors = WordVectors.load(wordvectors_path)
    clusters = []
    for cluster_id in os.listdir(ducpath):
        if cluster_id[0] == ".":  # skip hidden files such as .DS_Store
            continue
        cluster = Cluster.load_from_folder_duc(cluster_id, ducpath + "/" + cluster_id, wordvectors)
        summary = CAESummarizer.summary(cluster, 100)
        # DUC 2004 peer files follow the naming convention D30001.M.100.T.1
        file_summary = summary_path + "/" + cluster_id[:-1].upper() + ".M.100.T.1"
        with open(file_summary, mode="w") as f:
            for line in summary:
                f.write(line + "\n")
        clusters.append(cluster)
        print("Finish loading cluster_id:", cluster_id)
    return clusters
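# Usage sketch (hedged): the three paths below are assumptions for illustration.
#
#     clusters = create_summary_format_duc2004(
#         "data/duc2004/docs", "model/wordvector.txt", "summaries/duc2004")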
def create_summary_format_opinosis(opinosis_path, wordvectors_path, summary_path):
    wordvectors = WordVectors.load(wordvectors_path)
    clusters = []
    for cluster_id in os.listdir(opinosis_path):
        if cluster_id[0] == ".":  # skip hidden files such as .DS_Store
            continue
        cluster = Cluster.load_from_opinosis(cluster_id, opinosis_path + "/" + cluster_id, wordvectors)
        summary = CAESummarizer.summary(cluster, 25, "kmean_simple")
        if len(summary) == 0:
            print("warning: empty summary for cluster:", cluster_id)
        cluster_id, _, _ = cluster_id.split(".")
        folder_summary = summary_path + "/" + cluster_id
        if not os.path.isdir(folder_summary):
            os.makedirs(folder_summary)
        file_summary = folder_summary + "/" + cluster_id + ".1.txt"
        with open(file_summary, mode="w") as f:
            for line in summary:
                f.write(line + "\n")
        clusters.append(cluster)
        print("Finish writing summary to:", folder_summary)
    return clusters
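# Usage sketch (hedged): Opinosis topic files appear to be named like
# "battery-life_amazon_kindle.txt.data", which is why cluster_id.split(".")
# above yields three parts. The paths below are assumptions for illustration.
#
#     clusters = create_summary_format_opinosis(
#         "data/opinosis/topics", "model/wordvector.txt", "summaries/opinosis")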
        if len(elements) < 1:
            print(elements)
        idx = sentence.count(" ")
        try:
            histo[idx] += 1
            if 70 < idx < 100:
                print(sentence)
        except IndexError:
            pass
    for khung in histo:  # "khung" is Vietnamese for "bin": print each bin count
        print(khung)


if __name__ == "__main__":
    wordvector_w2v = WordVectors.load_from_text_format("model/word2vec.txt", name="word2vec")
    prepare_indexs_score_of_file("data/sentence.score.dailymail.txt", wordvector_w2v, "dailymail")
    prepare_indexs_score_of_file("data/sentence.score.duc04.txt", wordvector_w2v, "duc04")
    prepare_indexs_score_of_file("data/sentence.score.duc05.txt", wordvector_w2v, "duc05")

    wordvector_glove = WordVectors.load_from_text_format("model/glove.filter.txt", name="glove")
    prepare_indexs_score_of_file("data/sentence.score.dailymail.txt", wordvector_glove, "dailymail")
    prepare_indexs_score_of_file("data/sentence.score.duc04.txt", wordvector_glove, "duc04")
    prepare_indexs_score_of_file("data/sentence.score.duc05.txt", wordvector_glove, "duc05")

    wordvector_cw = WordVectors.load_from_text_format("model/cwvector.txt", name="cw")
    prepare_indexs_score_of_file("data/sentence.score.dailymail.txt", wordvector_cw, "dailymail")
    prepare_indexs_score_of_file("data/sentence.score.duc04.txt", wordvector_cw, "duc04")
    prepare_indexs_score_of_file("data/sentence.score.duc05.txt", wordvector_cw, "duc05")
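# Refactor sketch (same behavior as the nine calls above, shown only as an
# illustration): the calls are the cross product of three embedding models
# and three score files, so a loop keeps them in sync.
#
#     for path, name in [("model/word2vec.txt", "word2vec"),
#                        ("model/glove.filter.txt", "glove"),
#                        ("model/cwvector.txt", "cw")]:
#         wv = WordVectors.load_from_text_format(path, name=name)
#         for corpus in ("dailymail", "duc04", "duc05"):
#             prepare_indexs_score_of_file(
#                 "data/sentence.score.%s.txt" % corpus, wv, corpus)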
print("Grad check failed for dWh: sum of error = %.9f" % (err / count))

import numpy as np
from nltk.parse.stanford import StanfordParser
from nltk.treetransforms import chomsky_normal_form
from nltk.tree import Tree
from vector.wordvectors import WordVectors

parser = StanfordParser(
    path_to_jar="/Users/HyNguyen/Downloads/stanford-parser-full-2015-12-09/stanford-parser.jar",
    path_to_models_jar="/Users/HyNguyen/Downloads/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar",
    model_path="/Users/HyNguyen/Downloads/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

if __name__ == "__main__":
    rng = np.random.RandomState(4488)
    wordvector = WordVectors.load_from_text_format("model/word2vec.txt", "word2vec")
    pos_sent = []
    neg_sent = []
    with open("data/rt-polarity.neg.txt", mode="r") as f:
        for _ in range(3):  # take the first three sentences as a toy sample
            neg_sent.append(f.readline())
    with open("data/rt-polarity.pos.txt", mode="r") as f:
        for _ in range(3):
            pos_sent.append(f.readline())
    trees = []
    labels = [0] * 3 + [1] * 3
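    # Hedged sketch of one plausible way to fill `trees`: parse each sentence
    # with the Stanford parser and binarize it, since a recursive network
    # expects binary trees. raw_parse returns an iterator of candidate parses;
    # chomsky_normal_form binarizes the tree in place.
    for sentence in neg_sent + pos_sent:
        tree = next(parser.raw_parse(sentence))
        chomsky_normal_form(tree)
        trees.append(tree)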
import sys
import time
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

if __name__ == "__main__":
    data_scatters = []
    start_total = 0
    max_word = 70
    min_word = 10
    if rank == 0:
        start_total = time.time()
        wordvectors = WordVectors.load_from_text_format("model/cwvector.txt", name="word2vec")
        print("Finished reading wordvectors ...")
        with open("data/sentence.score.duc04.txt", mode="r") as f:
            traindata = f.readlines()
        size_sample = int(len(traindata) / size)
        for i in range(size):
            if i == size - 1:
                # last rank takes the remainder so no trailing lines are lost
                data_scatters.append(traindata[i * size_sample:])
            else:
                data_scatters.append(traindata[i * size_sample:i * size_sample + size_sample])
    else:
        wordvectors = None
        data_scatter = None
    wordvectors = comm.bcast(wordvectors, root=0)
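# Minimal sketch of the scatter/gather round trip this script relies on; run
# with e.g. `mpiexec -n 4 python thisfile.py`. scatter expects a list of
# exactly `size` chunks on the root rank; every other rank may pass None.
#
#     data_scatter = comm.scatter(data_scatters, root=0)  # one chunk per rank
#     local_result = do_work(data_scatter)                # do_work: stand-in name
#     results = comm.gather(local_result, root=0)         # list of results on rank 0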
    Returns:
        total_cost: cost of the forward pass
        total_grad: gradient with respect to theta
    """
    # Placeholder implementation: the forward pass and backprop for the batch
    # are not written yet, so cost and grad are returned as zeros.
    cost = 0
    grad = 0
    forward = instances
    return cost, grad


if __name__ == "__main__":
    rng = np.random.RandomState(4488)
    wordvector = WordVectors.load_from_text_format("../model/cwvector.txt", "cwvector")
    rnn = RecursiveNeuralNetworl(embsize=wordvector.embsize, mnb_size=30, lr=0.1,
                                 wordvector=wordvector, act_func=act.tanh)
    X_train, X_valid = load_sentiment_data()
    rnn.train(X_train, X_valid)
    # with open("../data/rt-polarity.neg.out.txt", mode="r") as f:
    #     neg_trees_str = f.readlines()
    #
    # X_neg = []
    # for neg_tree_str in neg_trees_str[:5]:
    #     t = Tree(neg_tree_str)
    #     t = merge_bin_tree(t)
    #     t.label = 0
    #     X_neg.append(t)
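# Hedged sketch of the numerical gradient check that the "Grad check failed
# for dWh" message elsewhere in this repo refers to, written against a generic
# cost-and-gradient callable. `cost_and_grad` is a hypothetical stand-in for
# the model's real function; theta is assumed to be a flat 1-D numpy vector.
def grad_check(cost_and_grad, theta, eps=1e-6, tol=1e-7):
    _, grad = cost_and_grad(theta)
    err = 0.0
    for i in range(theta.size):
        theta[i] += eps
        cost_plus, _ = cost_and_grad(theta)
        theta[i] -= 2 * eps
        cost_minus, _ = cost_and_grad(theta)
        theta[i] += eps  # restore the original parameter value
        numeric = (cost_plus - cost_minus) / (2 * eps)
        err += abs(numeric - grad[i])
    return err / theta.size < tol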
                # find("TEXT") replaces the old root._children[3] private access
                text_tag = root.find("TEXT")
                if text_tag is not None:
                    text = text_tag.text.replace("\n", "")
                    sentences = nltk.tokenize.sent_tokenize(text)
                    for sentence in sentences:
                        words = nltk.word_tokenize(sentence)
                        samples.append(words)
            except Exception:
                print("exception parse XML:", file_name)
                continue
    print("Finish collecting training data from DUC2004")
    print("length of samples:", len(samples))
    end_collect = time.time()
    print("Total time for collecting training data: " + str(end_collect - start_collect))
    return samples


if __name__ == "__main__":
    wordvectors = WordVectors.load("model/wordvector.txt")
    train_data = collect_data_from_ptb_brow_duc2004()
    final_array = []
    for i, words in enumerate(train_data):
        words_array = wordvectors.cae_prepare_data_from_words(words, 10, 100)
        if words_array is None:  # skip sentences rejected by the length filter
            continue
        final_array.append(words_array)
        if i == 69:  # smoke test on the first 70 sentences only
            break
    final_array = np.array(final_array)
    print(final_array.shape)
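# Hedged sketch of what cae_prepare_data_from_words is assumed to do: look up
# each word's embedding and zero-pad (or truncate) to a fixed length so every
# sample has the same (max_len, embsize) shape the CAE expects. This is an
# illustration under that assumption, not the project's actual implementation;
# pad_words_to_matrix and embed_lookup are hypothetical names.
def pad_words_to_matrix(words, embed_lookup, max_len=100, embsize=100):
    mat = np.zeros((max_len, embsize), dtype=np.float32)
    for j, word in enumerate(words[:max_len]):
        vec = embed_lookup(word)  # assumed to return None for OOV words
        if vec is not None:
            mat[j] = vec
    return mat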
    sys.stdout.write("Finish collecting training data from DUC2004\n")
    sys.stdout.flush()
    sys.stdout.write("length of samples: " + str(len(samples)) + "\n")
    sys.stdout.flush()
    end_collect = time.time()
    sys.stdout.write("Total time for collecting training data: " + str(end_collect - start_collect) + "\n")
    sys.stdout.flush()
    return samples


if __name__ == "__main__":
    data_scatters = []
    start_total = 0
    if rank == 0:
        start_total = time.time()
        wordvectors = WordVectors.load("model/wordvector.txt")
        print("Finished reading wordvectors ...")
        traindata = collect_data_from_ptb_brow_duc2004()
        size_sample = int(len(traindata) / size)
        for i in range(size):
            if i == size - 1:
                # last rank takes the remainder so no trailing samples are lost
                data_scatters.append(traindata[i * size_sample:])
            else:
                data_scatters.append(traindata[i * size_sample:i * size_sample + size_sample])
    else:
        wordvectors = None
        data_scatter = None
    wordvectors = comm.bcast(wordvectors, root=0)
    print("Process:", rank, "broadcasted wordvectors ...")
    data_scatter = comm.scatter(data_scatters, root=0)
        self.vector = vector
        self.length = content.count(" ")
        self.sentece_id = -1


import os
import time
import numpy as np
import cPickle

if __name__ == "__main__":
    clusterpath = "data/vietnamesemds/cluster_1/"
    vectormodel = "model/word2vec/100"
    vietnamesemds_path = "data/vietnamesemds/"
    start = time.time()
    w2v = WordVectors.load("vector/100")
    end = time.time()
    convae = ConvolutionAutoEncoder.rebuild_for_testing(mini_batch_size=1, filemodel="model/CAE.model")
    clusters = [None] * 201
    counter = 1
    for cluster_id in os.listdir(vietnamesemds_path):
        _, id = cluster_id.split("_")
        cluster = Cluster.load_from_folder(cluster_id, vietnamesemds_path + cluster_id + "/")
        print("Cluster ", counter)
        counter += 1
        for document in cluster.list_documents:
            for sentence in document.list_sentences:
                sentence_matrix = w2v.cae_prepare_data(sentence.content)
                if sentence_matrix is None:
                    continue  # skip sentences the embedding step cannot encode