def main(transform_func=None, n=10):
    parser = StanfordParser(
        path_to_jar="/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser.jar",
        path_to_models_jar="/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models.jar",
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )
    test_sents = treebank.sents()[-n:]
    print "len(test_sents) = %d" % (len(test_sents))

    if transform_func and callable(transform_func):
        print "transforming it using ", transform_func
        test_sents = [[transform_func(w) for w in s] for s in test_sents]  # transform it

    print test_sents[:10]

    print "predicting"
    pred_parses = parser.parse_sents(test_sents)

    # align the gold parses with the last n test sentences
    gold_parses = treebank.parsed_sents()[-n:]

    print "evaluating"
    correct_n = gold_n = predicted_n = 0.0
    for gparse, pparse in zip(gold_parses, pred_parses):
        cn, gn, pn = precision_and_recall_stat(get_nodes_with_range(gparse),
                                               get_nodes_with_range(pparse))
        correct_n += cn
        gold_n += gn
        predicted_n += pn

    print "Precision: %f, Recall: %f" % (correct_n / predicted_n, correct_n / gold_n)
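# The helpers used above (get_nodes_with_range, precision_and_recall_stat) are not part
# of this snippet. A minimal sketch of what they might look like, assuming labelled
# constituent-span (bracket) scoring; names and details are illustrative, not the
# author's actual implementation:
from nltk.tree import Tree


def get_nodes_with_range(tree):
    """Return the set of (label, start, end) spans for every constituent in `tree`."""
    spans = set()

    def walk(node, start):
        if not isinstance(node, Tree):  # a leaf covers exactly one token
            return start + 1
        end = start
        for child in node:
            end = walk(child, end)
        spans.add((node.label(), start, end))
        return end

    walk(tree, 0)
    return spans


def precision_and_recall_stat(gold_spans, pred_spans):
    """Counts needed for labelled precision/recall: (correct, gold, predicted)."""
    return len(gold_spans & pred_spans), len(gold_spans), len(pred_spans)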
def __init__(self):
    nltk.download('treebank')
    nltk.download('stopwords')
    ids = nltk.corpus.treebank.fileids()
    self.sents = []
    for id in ids:
        self.sents += list(treebank.sents(id))
    self.wc = {}
    stop_words = set(stopwords.words('english'))
    for i in range(len(self.sents)):
        # drop trace tokens such as '*-1' before counting
        self.sents[i] = [word for word in self.sents[i] if word[0] != '*']
        for j in range(len(self.sents[i])):
            if self.sents[i][j] not in self.wc:
                self.wc[self.sents[i][j]] = []
            self.wc[self.sents[i][j]].append((i, j))
    self.word_set = []
    frequency = 10
    for i in self.wc:
        if len(self.wc[i]) >= frequency and i.isalpha() and i not in stop_words:
            self.word_set.append(i)
    self.n = len(self.word_set)
    self.n_s = len(self.word_set)
    print(len(self.sents))
    print(len(self.wc))
    print(len(self.word_set))
def get_word2vec(
        train_fn="data/rap/input.txt",
        saved_model_fn="save/save/GoogleNews-vectors-negative300.bin"):
    try:
        print "loading word2vec model at {0}".format(saved_model_fn)
        model = Word2Vec.load_word2vec_format(saved_model_fn, binary=True)
        print "model loaded"
        return model
    except IOError:
        print "no word2vec model found at {0}".format(saved_model_fn)
        with open(train_fn) as f:
            data = f.read()
        clean = TextLoader.clean_str(data)
        lines = [line.split(" ") for line in clean.split('\n')]
        full_data = brown.sents() + movie_reviews.sents() + treebank.sents() + lines
        print "training word2vec model"
        model = Word2Vec(workers=8)
        model.build_vocab(full_data)
        for i in xrange(0, 5):
            print "epoch " + str(i + 1)
            # full_data = shuffle(full_data)
            pb = ProgressBar(maxval=len(full_data))
            chunk_size = len(full_data) / 100
            j = 0
            pb.start()
            while j + chunk_size < len(full_data):
                model.train(full_data[j:j + chunk_size])
                j += chunk_size
                pb.update(j)
        print "done training"
        model.save(saved_model_fn)
        return model
def test():
    model = torch.load('./ckpt/model0.pt')
    leafmodel = LeafNet()
    x = treebank.sents('wsj_0003.mrg')[0]
    y = treebank.parsed_sents('wsj_0003.mrg')[0]
    preprocess(y)
    # embed_x is the list of embedding vectors of x
    embed_x = []
    x_list = []
    l = int(len(x))
    for i in range(0, l):
        txlist = []
        x[i] = x[i].lower()
        txlist.append(x[i])
        tembed = torch.Tensor(get_embed(x[i]))
        embed_x.append(tembed)
        pred = leafmodel(embed_x[i])
        gt = (torch.argmax(pred)).item()
        txlist.append(gt)
        x_list.append(txlist)
    # we got the (sentence, gt) list and the embedding vector list for the leaves
    xscore = 0.0
    while len(x_list) != 1:
        x_list, embed_x, tscore = calculate_score(x_list, embed_x, model)
        xscore = xscore + tscore
    x_list = str(x_list).replace('[', '(').replace(']', ')').replace('\'', '').replace(',', '')
    x_list_tree = Tree.fromstring(x_list)
    draw_trees(x_list_tree)
    draw_trees(y)
def create_input_dataset():
    print 'Loading input'
    input_data = []
    tags = []
    sents = wsj.sents()
    json_file = open('data.json', 'w')
    counter = 0
    for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {}
            len_sentence = len(sentence)
            # context window: w[j-1], w[j-2], w[j], w[j+1], w[j+2], padded with '*'
            temp = [
                sents[i][j - 1] if j > 0 else '*',
                sents[i][j - 2] if j > 1 else '*',
                sents[i][j],
                sents[i][j + 1] if j < len_sentence - 1 else '*',
                sents[i][j + 2] if j < len_sentence - 2 else '*',
            ]
            datapoint['wn'] = temp
            datapoint['index'] = j
            datapoint['i'] = counter
            counter += 1
            datapoint['t_minus_one'] = prev[1] if prev is not None else '*'
            datapoint['t_minus_two'] = prev_prev[1] if prev_prev is not None else '*'
            prev_prev = prev
            prev = word
            # print datapoint, word[1]
            datapoint['tag'] = word[1]
            json_file.write(json.dumps(datapoint))
            json_file.write('\n')
            input_data.append(datapoint)
            tags.append(word[1])
    print 'Done'
    json_file.close()
    return input_data, tags
def create_filter_index():
    sents = [" ".join(sent) for sent in treebank.sents()[:no_comp_reqs]]
    ids = list(range(no_comp_reqs))
    tags = ["" for sent in sents]
    filter_index, _, _, _ = glossary_extraction(sents, ids, tags,
                                                tag_mode="load tagger",
                                                filter_mode="threshold",
                                                threshold_coverage=1)
    with open('../temp/filter_index.pickle', 'wb') as f:
        pickle.dump(filter_index, f)
def build_index(out_filename, in_filename=None):
    '''Builds data files for word lookup.
    Can take an optional input file to add to the data pool which is processed (not working).
    Data is then dumped to a pickle file.'''
    sents_data = []
    try:
        with open(in_filename) as in_file:
            sents_data += sent_tokenize(in_file.read())
    except:
        print("Warning: Failed to load external file for building.")

    sents_data += brown.sents() + treebank.sents()

    # get sentences, chop off their ambiguous heads, and look at their words!
    mysents = [sent[1:] for sent in sents_data]
    # flatten sublists of words into a single list of words
    mywords = [word for sent in mysents for word in sent]
    cfd = ConditionalFreqDist((word.lower(), word) for word in mywords)
    # look up the most frequent form of a lowercase word with cfd['word'].max(),
    # but check that the word exists in cfd first

    # made pickle file too large and slow
    # wordlist = set(words.words())
    # wordlist.update(brown.words())
    # wordlist.update(treebank.words())
    # common_words_lower = set([w for w in wordlist if w.islower()])
    # common_words_titlecase = set([w.lower() for w in wordlist if (w.istitle() and w not in common_words_lower)])

    out_file = open(out_filename, 'wb')
    pickle.dump(cfd, out_file, 2)
    # pickle.dump(common_words_lower, out_file, 2)
    # pickle.dump(common_words_titlecase, out_file, 2)
    out_file.close()
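# A minimal usage sketch for the pickled distribution, assuming it is loaded back the
# same way it was dumped; the helper name `truecase_word` is illustrative only, not
# part of the original module:
import pickle


def truecase_word(cfd, word):
    """Return the most frequent surface form of `word`, falling back to the input."""
    key = word.lower()
    if key in cfd:
        return cfd[key].max()
    return word

# with open('index.pickle', 'rb') as f:
#     cfd = pickle.load(f)
# print(truecase_word(cfd, 'monday'))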
def collect_data_from_ptb_brow_duc2004():
    start_collect = time.time()
    samples = []

    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        samples.append(words)
    sys.stdout.write("Finish collecting training data from Penn Tree Bank")
    sys.stdout.flush()

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        senttmp = " ".join(brown_sents[i])
        words = nltk.word_tokenize(senttmp)
        samples.append(words)
    sys.stdout.write("Finish collecting training data from Brown")
    sys.stdout.flush()

    # DUC data
    folder_path = "/Users/HyNguyen/Documents/Research/Data/duc2004/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs"
    clusters_name = os.listdir(folder_path)
    for cluster_name in clusters_name:
        if cluster_name[0] == ".":  # skip hidden files such as .DS_Store
            continue
        files_name = os.listdir(folder_path + "/" + cluster_name)
        for file_name in files_name:
            if file_name[0] == ".":  # skip hidden files such as .DS_Store
                continue
            file_path = folder_path + "/" + cluster_name + "/" + file_name
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
                text_tag = root._children[3]
                if text_tag.tag == "TEXT":
                    text = text_tag.text.replace("\n", "")
                    sentences = nltk.tokenize.sent_tokenize(text)
                    for sentence in sentences:
                        words = nltk.word_tokenize(sentence)
                        samples.append(words)
            except:
                print "exception parse XML: ", file_name
                continue
    sys.stdout.write("Finish collecting training data from DUC2004")
    sys.stdout.flush()
    sys.stdout.write("length of samples" + str(len(samples)))
    sys.stdout.flush()
    end_collect = time.time()
    sys.stdout.write("Total time for collecting training data: " + str(end_collect - start_collect))
    sys.stdout.flush()
    return samples
def read_wsj_from_treebank(self, index):
    from nltk.corpus import treebank
    self.__reset()
    self.__input_text = 'wsj_000' + str(index) + '.mrg'
    self.__sents = treebank.sents(self.__input_text)
    self.__tagged_sents = treebank.parsed_sents(self.__input_text)
    if self.__verbose:
        self.__print_all()
    return self.__tagged_sents
def read_treebank(input_vocab_size=10000, output_vocab_size=10000, seq_len=10):
    all_sents = []
    for fname in treebank.fileids():
        sents = treebank.sents(fname)
        if sents:
            all_sents.extend(sents)
    return read_dataset(all_sents, input_vocab_size, output_vocab_size, seq_len)
def create_dataset():
    # print 'Loading dataset'
    dataset = []
    tags = []
    sents = wsj.sents()
    for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {}
            len_sentence = len(sentence)
            # 'wn' is the word context window: w[j-1], w[j-2], w[j], w[j+1], w[j+2],
            # padded with '*' at sentence boundaries
            temp = [
                sents[i][j - 1] if j > 0 else '*',
                sents[i][j - 2] if j > 1 else '*',
                sents[i][j],
                sents[i][j + 1] if j < len_sentence - 1 else '*',
                sents[i][j + 2] if j < len_sentence - 2 else '*',
            ]
            datapoint['wn'] = temp
            datapoint['index'] = j
            datapoint['t_minus_one'] = prev[1] if prev is not None else '*'
            datapoint['t_minus_two'] = prev_prev[1] if prev_prev is not None else '*'
            prev_prev = prev
            prev = word
            # print datapoint, word[1]
            dataset.append(datapoint)
            tags.append(word[1])
    # print 'Done'
    return dataset, tags
def create_dataset():
    print "Loading dataset"
    dataset = []
    tags = []
    sents = wsj.sents()
    for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {}
            len_sentence = len(sentence)
            temp = [
                sents[i][j - 1] if j > 0 else "*",
                sents[i][j - 2] if j > 1 else "*",
                sents[i][j],
                sents[i][j + 1] if j < len_sentence - 1 else "*",
                sents[i][j + 2] if j < len_sentence - 2 else "*",
            ]
            datapoint["wn"] = temp
            datapoint["index"] = j
            datapoint["t_minus_one"] = prev[1] if prev is not None else "*"
            datapoint["t_minus_two"] = prev_prev[1] if prev_prev is not None else "*"
            prev_prev = prev
            prev = word
            # print datapoint, word[1]
            dataset.append(datapoint)
            tags.append(word[1])
    print "Done"
    return dataset, tags
def generator(self):
    for index, file in enumerate(self.file_ids):
        if index % 10 == 0:
            print("Processed " + str(index) + " of " + str(len(self.file_ids)) + " files")
        parsed_sentences = treebank.parsed_sents(file)
        sentences = treebank.sents(file)
        for i in range(len(parsed_sentences)):
            yield {
                'file': file,
                'id': i,
                'raw': sentences[i],
                'parsed': parsed_sentences[i]
            }
def create_dataset():
    print 'Loading dataset'
    dataset = []
    tags = []
    sents = wsj.sents()
    for i, sentence in enumerate(wsj.tagged_sents()[:10]):
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {}
            len_sentence = len(sentence)
            # note: this variant puts the current word first in the window
            temp = [
                sents[i][j],
                sents[i][j - 1] if j > 0 else '*',
                sents[i][j - 2] if j > 1 else '*',
                sents[i][j + 1] if j < len_sentence - 1 else '*',
                sents[i][j + 2] if j < len_sentence - 2 else '*',
            ]
            datapoint['wn'] = temp
            datapoint['index'] = j
            datapoint['t_minus_one'] = prev[1] if prev is not None else '*'
            datapoint['t_minus_two'] = prev_prev[1] if prev_prev is not None else '*'
            prev_prev = prev
            prev = word
            # print datapoint, word[1]
            dataset.append(datapoint)
            tags.append(word[1])
    print 'Done'
    return dataset, tags
def statistic_freq():
    wordvectors = WordVectors.load("model/wordvector.txt")
    freq_array = [0] * 500

    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        freq_array[len(words)] += 1

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        senttmp = " ".join(brown_sents[i])
        words = nltk.word_tokenize(senttmp)
        freq_array[len(words)] += 1

    # DUC data
    folder_path = "/Users/HyNguyen/Documents/Research/Data/DUC20042005/duc2004/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs"
    clusters_name = os.listdir(folder_path)
    for cluster_name in clusters_name:
        if cluster_name[0] == ".":  # skip hidden files such as .DS_Store
            continue
        files_name = os.listdir(folder_path + "/" + cluster_name)
        for file_name in files_name:
            if file_name[0] == ".":  # skip hidden files such as .DS_Store
                continue
            file_path = folder_path + "/" + cluster_name + "/" + file_name
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
                text_tag = root._children[3]
                if text_tag.tag == "TEXT":
                    text = text_tag.text.replace("\n", "")
                    sentences = nltk.tokenize.sent_tokenize(text)
                    for sentence in sentences:
                        words = nltk.word_tokenize(sentence)
                        freq_array[len(words)] += 1
            except:
                print "exception parse XML: ", file_name
                continue
        print("Finish cluster name:", cluster_name, " , Wordvector size: ",
              str(wordvectors.embed_matrix.shape[0]))

    plt.plot(range(200), freq_array[:200], color='red', marker='.')
    plt.show()
def get_trees_sentences():
    trees = []
    sentences = []
    for file in treebank.fileids():
        for tree in treebank.parsed_sents(file):
            tree_str = str(tree)
            trees.append(tree_str)
        for sentence in treebank.sents(file):
            s = " ".join(words for words in sentence)
            sentences.append(s)
    assert len(trees) == len(sentences)
    sentences = list(map(lambda x: x.lower(), sentences))
    return (trees, sentences)
def collect_word_from_data():
    vocab = {}

    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        for word in treebank_sents[i]:
            vocab[str(word)] = 1
    print("Finish Penn Tree Bank corpus, vocab size: ", str(len(vocab.keys())))

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        for word in brown_sents[i]:
            vocab[str(word)] = 1
    print("Finish Brown corpus, vocab size: ", str(len(vocab.keys())))

    def parse_xml(file_path):
        try:
            tree = ET.parse(file_path)
            return tree
        except:
            return None

    # dailymail data
    with open("../data/sentence.score.dailymail.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word)] = 1

    # duc04 data
    with open("../data/sentence.score.duc04.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word)] = 1

    # duc05 data
    with open("../data/sentence.score.duc05.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word)] = 1

    print("Finish reading vocab size: ", str(len(vocab.keys())))
    return vocab
def _init_train(self):
    lemmas = [tup[0].split() for tup in self.db.loadProcessed("lemmatized")]
    model = FastText(min_count=5)
    model.build_vocab(brown.sents())
    model.train(
        brown.sents(),
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words,
        epochs=model.epochs,
    )
    model.build_vocab(treebank.sents(), update=True)
    model.train(
        treebank.sents(),
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words,
        epochs=model.epochs,
    )
    model.build_vocab(movie_reviews.sents(), update=True)
    model.train(
        movie_reviews.sents(),
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words,
        epochs=model.epochs,
    )
    model.build_vocab(lemmas, update=True)
    model.train(
        lemmas,
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words,
        epochs=model.epochs,
    )
    return model
def preprocess_corpora():
    brown_words = brown.tagged_words(simplify_tags=True)
    treebank_words = treebank.tagged_words(simplify_tags=True)
    '''
    # this takes forever.
    bwog_corpus = nltk.corpus.PlaintextCorpusReader('../bwog-corpus-txt', '.*\.txt')
    bwog_sents = bwog_corpus.sents(bwog_corpus.fileids())
    bwog_words = []
    for s_i in xrange(0, len(bwog_sents)/100000):
        # TODO: skip punctuation
        simp_tagged_sent = [(word, simp_tag(tag)) for word, tag in nltk.pos_tag(bwog_sents[s_i])]
        bwog_words.extend(simp_tagged_sent)
    '''
    all_tagged_words = brown_words + treebank_words  # + bwog_words
    all_sents = brown.sents() + treebank.sents()  # + bwog_sents
    compute_concordance(all_tagged_words)
def collect_word_from_data():
    vocab = {}

    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        for word in treebank_sents[i]:
            vocab[str(word).lower()] = 1
    print("Finish Penn Tree Bank corpus, vocab size: ", str(len(vocab.keys())))

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        for word in brown_sents[i]:
            vocab[str(word).lower()] = 1
    print("Finish Brown corpus, vocab size: ", str(len(vocab.keys())))

    # dailymail data
    with open("../data/sentence.score.dailymail.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word).lower()] = 1

    # duc04 data
    with open("../data/sentence.score.duc04.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word).lower()] = 1

    # duc05 data
    with open("../data/sentence.score.duc05.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word).lower()] = 1

    print("Finish reading vocab size: ", str(len(vocab.keys())))
    return vocab
def main():
    k = 10  # k-fold cross validation
    correctSum = 0.
    totalSum = 0.
    untagged = corpus.sents()
    tagged = corpus.tagged_sents()  # optional parameter: tagset='universal'
    share = int(len(tagged) / k)
    print("####", k, "Fold Cross Validation ####")
    for i in range(k):
        print("Round", i + 1, end='\t')
        testRange = (i * share, (i + 1) * share)
        test_data = untagged[testRange[0]:testRange[1]]
        train_data = tagged[:testRange[0]] + tagged[testRange[1]:]
        ans_data = tagged[testRange[0]:testRange[1]]
        eva = validation(corpus, train_data, test_data, ans_data)
        correctSum += eva[0]
        totalSum += eva[1]
    print("### Average accuracy: %.4f" % (correctSum / totalSum), "###")
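# `validation` is not shown above. A minimal sketch of what it might do, assuming a
# simple unigram/bigram backoff tagger and a (correct count, total count) return
# convention; the tagger choice and details are assumptions, not the author's code:
import nltk


def validation(corpus, train_data, test_data, ans_data):
    tagger = nltk.BigramTagger(train_data, backoff=nltk.UnigramTagger(train_data))
    correct = 0
    total = 0
    for sent, gold in zip(test_data, ans_data):
        predicted = tagger.tag(sent)
        for (word, ptag), (_, gtag) in zip(predicted, gold):
            total += 1
            if ptag == gtag:
                correct += 1
    return correct, total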
def syntax(tagtokens):
    print "\n"
    print "Step 3: Syntax Analysis\n"
    dataset_size = len(treebank.parsed_sents())
    split_size = int(dataset_size * 0.97)
    learning_set = treebank.parsed_sents()[:split_size]
    test_set = treebank.parsed_sents()[split_size:]

    # create a set containing the raw sentences
    sents = treebank.sents()
    raw_test_set = [[w for w in sents[i]] for i in range(split_size, dataset_size)]

    # construct the PCFG
    tbank_productions = []
    for sent in learning_set:
        for production in sent.productions():
            tbank_productions.append(production)
    for word, tag in tagtokens:
        t = Tree.fromstring("(" + tag + " " + word + ")")
        for production in t.productions():
            tbank_productions.append(production)
    tbank_grammar = nltk.grammar.induce_pcfg(Nonterminal('S'), tbank_productions)
    chart_parser = nltk.ChartParser(tbank_grammar)

    trees = chart_parser.parse(tokens)
    count = 1
    for tree in trees:
        print tree  # show first few trees
        if (count == 3):
            break
        else:
            count += 1
    return trees
def main():
    # Read sentence i of the treebank, with n1 <= i < n2
    n1 = 0
    n2 = 10
    # Parse sentence j of the treebank, with m1 <= j < m2
    # NB: usually, one should _NOT_ test a parser on the same sentences used to train it!!!
    # But, in this example, the grammar is so small that it is unlikely to parse a new
    # "unknown" sentence. The "unknown" sentence should use the same terminal symbols
    # that the grammar "knows", and should be parseable with the productions the grammar "knows"...
    m1 = 0
    m2 = 1

    # Induce a grammar from a subset of the treebank parsed sentences. Allocate parsers
    cfg_earley_parser, pcfg_pchart_parser = generate_grammar_and_parsers(
        treebank.parsed_sents()[n1:n2])

    # Parse sentences from the treebank
    for i in range(m1, m2):
        sentence = treebank.sents()[i]  # the sentence to parse
        print("Parsing:", sentence)
        gold_tree = treebank.parsed_sents()[i]  # the right parse tree
        # Parse the sentence with the parsers we defined; see:
        # http://www.nltk.org/book/ch08-extras.html
        # http://www.nltk.org/book/ch08.html
        earley(cfg_earley_parser, sentence, gold_tree)  # does not return a tree, just shows all of them
        tree = pchart(pcfg_pchart_parser, sentence, gold_tree)  # get the best tree
        print("\nBEST TREE WITH PROB.: %.12e" % tree.prob())
        tree.draw()  # draw the tree
# Extracts the Penn Treebank sample from NLTK.
from nltk.corpus import treebank
from operator import itemgetter
import codecs

words = treebank.sents()
tagged_words = [map(itemgetter(1), sent) for sent in treebank.tagged_sents()]
parsed_sents = treebank.parsed_sents()
total_sents = len(parsed_sents)

f = codecs.open('../data/penn_treebank', 'w', 'utf-8')
assert (len(words) == len(tagged_words) and len(words) == len(parsed_sents)), \
    ' '.join(map(str, [len(words), len(tagged_words), len(parsed_sents)]))
f.write(str(total_sents) + '\n')
for i in xrange(total_sents):
    sent_len = len(words[i])
    f.write(str(sent_len) + '\n')
    sent = ' '.join(words[i])
    pos = ' '.join(tagged_words[i])
    assert (sent.count('\n') == 0 and pos.count('\n') == 0 and
            len(sent.split(' ')) == sent_len and len(pos.split(' ')) == sent_len)
    f.write(sent + '\n')
    f.write(pos + '\n')
    tree = str(parsed_sents[i]).split('\n')
    f.write(str(len(tree)) + '\n')
    f.write('\n'.join(tree) + '\n')
from nltk.corpus import treebank as wsj

# file_in = open('wsj_0003.pos', 'r')
# content = file_in.readlines()
# file_in.close()
# print content[:5]

file_out_tag = open('tagged_sent_sample', 'w')
file_out_sent = open('untagged_sent_sample', 'w')
out1 = wsj.sents()[:20]
out2 = wsj.tagged_sents()[:20]

line = ''
for i in out1:
    # file_out_sent.write('\n\n')
    line = ' '.join(i)
    file_out_sent.write(line)
    file_out_sent.write('\n')
    # print line

for i in out2:
    file_out_tag.write('\n\n')
    words = ''
    for j in i:
        # print j
        words = '/'.join(j)
        # print words
        file_out_tag.write(words)
        file_out_tag.write('\n')
        elif token[1] in ["RB", "RBR", "RBS"]:
            mapped_tag = "SRB"
        else:
            mapped_tag = "MISC"
        mapped_sentence.append((token[0], mapped_tag))
    return mapped_sentence


# Training step
train_sents = treebank.tagged_sents()[:500]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)

# Testing step
test_sents = treebank.sents()[500:1000]
tagged_sents = treebank.tagged_sents()[500:1000]

id = 0
file = open('Method-A-predictions.tsv', 'w')
w = csv.writer(file, delimiter='\t')
for actual_tagged_sent, actual_sent in zip(tagged_sents, test_sents):
    predicted_sent = t2.tag(actual_sent)
    evaluate(coarse_map(actual_tagged_sent), coarse_map(predicted_sent))
    row = id, coarse_map(actual_tagged_sent), coarse_map(predicted_sent)
    w.writerow(row)
    id += 1

table = []
total_tokens = 0
total_correct_tokens = 0
for k1, v1 in total_count.items():
import nltk
import MeCab
from nltk import Tree
from nltk.corpus import brown, gutenberg, treebank
from nltk.tokenize.api import TokenizerI

# <markdowncell>
# ### Corpora
#
# NLTK has several built-in corpora and resources

# <codecell>
treebank.sents()

# <codecell>
nltk.download('treebank')

# <codecell>
print treebank.parsed_sents()[0]

# <markdowncell>
# NLTK's CorpusReader classes manage files:

# <codecell>
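# The cell above is empty in this excerpt. A plausible illustration of the point being
# made (a CorpusReader exposing the files it manages), using the standard NLTK corpus
# API; this is a sketch, not the original cell:
treebank.fileids()[:5]            # e.g. ['wsj_0001.mrg', 'wsj_0002.mrg', ...]
treebank.abspath('wsj_0001.mrg')  # full path of a single file
treebank.root                     # directory the reader was constructed from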
######## UNIGRAM TAGGER ##########

from nltk.tag import UnigramTagger
from nltk.corpus import treebank

# We use the first 3000 sentences of the treebank corpus as the training set to
# initialize the UnigramTagger class.
# A unigram tagger can be trained by giving it a list of tagged sentences at initialization.
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
print treebank.sents()[0]
print tagger.tag(treebank.sents()[0])

test_sents = treebank.tagged_sents()[3000:]
print tagger.evaluate(test_sents)

# A UnigramTagger can also be built from an explicit word -> tag model
tagger = UnigramTagger(model={'Pierre': 'NN'})
tagger.tag(treebank.sents()[0])
'''
Created on Jul 2, 2015

@author: dongx
'''
import nltk
from nltk.corpus import brown, treebank
from nltk.tag import untag, tnt, DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger, brill, brill_trainer
from nltk.corpus.reader import ChunkedCorpusReader

# training data
sent = treebank.sents()[0]
brown_train_sents = brown.tagged_sents(categories='news')[1001:]
brown_test_sents = brown.tagged_sents(categories='news')[:1000]


# chain multiple taggers together, each backing off to the previous one
def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
    if not backoff:
        backoff = tagger_classes[0](tagged_sents)
        del tagger_classes[0]
    for cls in tagger_classes:
        tagger = cls(tagged_sents, backoff=backoff)
        backoff = tagger
    return backoff


def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    templates = [
        brill.Template(brill.Pos([-1])),  # a rule can be generated using the previous part-of-speech tag
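        # The rest of the template list and the trainer call are cut off in this
        # excerpt. A typical completion following the usual nltk.tag.brill_trainer
        # recipe; the exact templates and arguments below are assumptions, not the
        # original code:
        brill.Template(brill.Pos([1])),
        brill.Template(brill.Pos([-2])),
        brill.Template(brill.Pos([2])),
        brill.Template(brill.Word([-1])),
        brill.Template(brill.Word([1])),
    ]
    trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, deterministic=True)
    return trainer.train(train_sents, **kwargs)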
def validate(model, leafmodel):
    model.eval()
    loss_list = []
    tick = time.time()
    total_avg_loss = 0
    for name in val_data_list:
        print('loaded file ', name)
        filex = treebank.sents(name)
        filey = treebank.parsed_sents(name)
        for s in range(0, len(filex)):
            print('working on sentence ', s)
            x = filex[s]
            y = filey[s]
            preprocess(y)
            # embed_x is the list of embedding vectors of x
            embed_x = []
            x_list = []
            l = int(len(x))
            for i in range(0, l):
                txlist = []
                x[i] = x[i].lower()
                txlist.append(x[i])
                tembed = torch.Tensor(get_embed(x[i]))
                embed_x.append(tembed)
                pred = leafmodel(embed_x[i])
                gt = (torch.argmax(pred)).item()
                txlist.append(gt)
                x_list.append(txlist)
            # we got the (sentence, gt) list and the embedding vector list for the leaves
            xscore = 0.0
            while len(x_list) != 1:
                x_list, embed_x, tscore = calculate_score(x_list, embed_x, model)
                xscore = xscore + tscore
            x_list = str(x_list).replace('[', '(').replace(']', ')').replace('\'', '').replace(',', '')
            x_list_tree = Tree.fromstring(x_list)
            # print('xscore is .....', xscore)
            yscore, _, celoss = compute_gtscore(y, model)
            delta_loss = compute_delta(x_list_tree, y)
            flist = []
            flist.append(delta_loss)
            delta_loss = torch.Tensor(flist)
            delta_loss = delta_loss.detach()
            # print('xscore is .....', xscore)
            yscore, _, celoss = compute_gtscore(y, model)
            # print('yscore is .....', yscore)
            # print('classification celosss is ....', celoss)
            loss = (xscore - yscore) + celoss + delta_loss
            total_avg_loss = total_avg_loss + loss.item()
    total_avg_loss = total_avg_loss / (len(filex) * len(data_list))
    print('validated for this epoch')
    return total_avg_loss
bigram.evaluate(test_sents)

# # Find most frequent nouns
# The most frequent nouns usually provide information on the subject of a text.
# Below, the most frequent nouns of an already tagged text of the *Treebank* corpus
# are determined. Let's see if we can conclude the text's subject.

# In[20]:
from nltk.corpus import treebank
from nltk import FreqDist
from nltk import bigrams

print("\nTreebank sentences: ", treebank.sents(fileids="wsj_0003.mrg"))

# In[21]:
tagged0003 = treebank.tagged_words(tagset="universal", fileids="wsj_0003.mrg")
print("File tagged0003: ", tagged0003)

# In[22]:
fdist = FreqDist(a[0].lower() for a in tagged0003 if a[1] == "NOUN")
# fdist.tabulate(20)
print(fdist.most_common(20))
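# `bigrams` is imported above but not used in this excerpt. A small sketch of how it
# could extend the same analysis to frequent noun pairs; this is an assumption about
# intent, not part of the original notebook:
nouns0003 = [a[0].lower() for a in tagged0003 if a[1] == "NOUN"]
noun_pair_dist = FreqDist(bigrams(nouns0003))
print(noun_pair_dist.most_common(10))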
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    # Load Word2Vec from Google
    w2v = word2vec.Word2Vec.load_word2vec_format(
        "/Users/HyNguyen/Documents/Research/Data/GoogleNews-vectors-negative300.bin", binary=True)

    # Create object WordVectors
    wordvectors = WordVectors(300, np.empty((0, 300), dtype=float), {})
    # wordvectors = WordVectors.load("model/wordvector.txt")

    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        wordvectors.add_wordvector_from_w2vmodel(w2v, words)
    print("Finish penn tree bank corpus, Wordvector size: ", str(wordvectors.embed_matrix.shape[0]))

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        if i % 1000 == 0:
            print("brown, process line: ", i)
        senttmp = " ".join(brown_sents[i])
        words = nltk.word_tokenize(senttmp)
for command in commands:
    c.execute(command)

print "Building clean words list..."
words = [w.lower() for w in brown.words()
         if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words()
              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words()
              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words()
              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words()
              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words()
              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print "Building clean sentences list"
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s
                              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s
                              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s
                              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s
                              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s
                              if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))


def singles(words):
    if len(words) < 1:
        return
    for w in words:
        if re.match("[a-zA-Z'-]+", w) and w.strip() != "''":
            yield w
categories_df = {cat: pd.read_csv(f"./data/{cat}.csv") for cat in categories}
negative_sample_size = int(len(categories_df[selected_category]) / 4)
print(f"Selected Category: {selected_category}")

for category in categories_df:
    categories_df[category].drop('URL', 1, inplace=True)
    if category != selected_category:
        categories_df[category] = categories_df[category].sample(negative_sample_size)
    categories_df[category] = categories_df[category].assign(
        **{selected_category: category == selected_category})
    print("{} has {} samples;".format(category, len(categories_df[category])))
    # print(categories_df[category].head())

treebank_background = pd.DataFrame(
    map(lambda sent: ' '.join(sent),
        random.sample(list(treebank.sents()), negative_sample_size)),
    columns=["excerpt"]).assign(description=False)
# print("Treebank has {} samples.".format(len(treebank_background)))

# print("categories_df")
corpus = pd.concat(categories_df.values(), ignore_index=True, sort=False)
# DataFrame.append returns a new frame, so keep the result
corpus = corpus.append(treebank_background, ignore_index=True, sort=False)
corpus.fillna(value='', inplace=True)
# print(corpus)

pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear'))
X, y = corpus.excerpt, corpus[selected_category]

# cross validation
cv_results = cross_validate(pipeline, X, y, cv=5)
print("false positives: ", falsePositives, "out of", amountPosTest, "tests") print("false negatives: ", falseNegative, "out of", amountNegTest, "tests") print("correctly parsed noun phrases:", posSucess, "out of", posSucess + falseNegative, "in gold standard") #----------------------------------- MAIN ----------------------------------- sentences = treebank.parsed_sents() train_len = round(0.9 * len(sentences)) data_train = sentences[0:train_len] data_test = sentences[train_len + 1:-1] sents = treebank.sents() raw_sents = [[w for w in sents[i]] for i in range(train_len + 1, len(sentences))] train_rules = chunker(data_train) test_rules = chunker(data_test) # All the rules with a probability under the value of # prob_thresh will be discarded prob_thresh = 0 rulesExclusivelyInTrain, rulesExclusivelyInTest, trainAmount, testAmount = accuracy( train_rules, test_rules, prob_thresh) print('Noun phrase rules that occur exclusively in the train corpus: ', rulesExclusivelyInTrain)
def train(model, leafmodel, optimizer, total_epochs):
    model.train()
    loss_list = []
    val_loss_list = []
    for t in range(total_epochs):
        tick = time.time()
        total_avg_loss = 0
        print('epoch is .....', t)
        for name in train_data_list:
            print('loaded file ', name)
            filex = treebank.sents(name)
            filey = treebank.parsed_sents(name)
            for s in range(0, len(filex)):
                print('working on sentence ', s)
                x = filex[s]
                y = filey[s]
                preprocess(y)
                # embed_x is the list of embedding vectors of x
                embed_x = []
                x_list = []
                l = int(len(x))
                optimizer.zero_grad()
                for i in range(0, l):
                    txlist = []
                    x[i] = x[i].lower()
                    txlist.append(x[i])
                    tembed = torch.Tensor(get_embed(x[i]))
                    embed_x.append(tembed)
                    pred = leafmodel(embed_x[i])
                    gt = (torch.argmax(pred)).item()
                    txlist.append(gt)
                    x_list.append(txlist)
                # we got the (sentence, gt) list and the embedding vector list for the leaves
                xscore = 0.0
                while len(x_list) != 1:
                    x_list, embed_x, tscore = calculate_score(x_list, embed_x, model)
                    xscore = xscore + tscore
                x_list = str(x_list).replace('[', '(').replace(']', ')').replace('\'', '').replace(',', '')
                x_list_tree = Tree.fromstring(x_list)
                # print('xscore is .....', xscore)
                yscore, _, celoss = compute_gtscore(y, model)
                delta_loss = compute_delta(x_list_tree, y)
                flist = []
                flist.append(delta_loss)
                delta_loss = torch.Tensor(flist)
                delta_loss = delta_loss.detach()
                # print('yscore is .....', yscore)
                # print('classification celosss is ....', celoss)
                loss = (xscore + delta_loss - yscore) + celoss
                loss.backward()
                optimizer.step()
                total_avg_loss = total_avg_loss + loss.item()
        total_avg_loss = total_avg_loss / (len(filex) * len(data_list))
        loss_list.append(total_avg_loss)
        print('************************************************')
        tock = time.time()
        print('epoch_time ====', (tock - tick))
        val_loss = validate(model, leafmodel)
        val_loss_list.append(val_loss)
        torch.save(model, './ckpt/model' + str(t) + '.pt')
    loss_array = np.array(loss_list)
    val_loss_array = np.array(val_loss_list)
    np.savetxt("train_loss.csv", loss_array, delimiter=",")
    np.savetxt("val_loss.csv", val_loss_array, delimiter=",")
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

# train
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
print(treebank.sents()[0])
print(tagger.tag(treebank.sents()[0]))

# test
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
from nltk.corpus import treebank as wsj

# file_in = open('wsj_0003.pos', 'r')
# content = file_in.readlines()
# file_in.close()
# print content[:5]

file_out_tag = open('tagged_sent_sample', 'w')
file_out_sent = open('untagged_sent_sample', 'w')
out1 = wsj.sents()[:10]
out2 = wsj.tagged_sents()[:10]

line = ''
for i in out1:
    # file_out_sent.write('\n\n')
    line = ' '.join(i)
    file_out_sent.write(line)
    file_out_sent.write('\n')
    # print line

for i in out2:
    file_out_tag.write('\n\n')
    words = ''
    for j in i:
        # print j
        words = '/'.join(j)
        # print words
        file_out_tag.write(words)
        file_out_tag.write('\n')
print("ABC to sentences") genesis_corp_sents = genesis.sents() print("Genesis to sents") frame_net_corp_sents = fn.sents() print("Frame_net to sents") state_union_corp_sents = state_union.sents() print('state union to sents') subject_corp_sents = subjectivity.sents() print('Subjectvity to sents') brown_corp_sents = brown.sents() print("Brown corpus to sents") movie_reviews_corp_sents = movie_reviews.sents() print("Movie reviews to sents ") guttenberg_corp_sents = gutenberg.sents() print("Guttenberg to sents") treebank_corb_sents = treebank.sents() print("Freebank to sents") reuters_corp_sents = reuters.sents() print("Reuters to sents") webtext_corp_sents = webtext.sents() print("Webtext to sents") logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) print("Cleaning data ...") discard_punctuation_and_lowercased_sents_condll2007 = [[ word.lower() for word in sent if word not in punctuation ] for sent in conll2007_corp_sents]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
from nltk.corpus import brown, movie_reviews, treebank

if __name__ == '__main__':
    brown_sentences = Word2Vec(brown.sents())
    movie_sentences = Word2Vec(movie_reviews.sents())
    treebank_sentences = Word2Vec(treebank.sents())

    print brown_sentences.most_similar('money', topn=5)
    print movie_sentences.most_similar('money', topn=5)
    print treebank_sentences.most_similar('money', topn=5)
def build_corpus(selected_category):
    categories_df = {cat: pd.read_csv(f"../data/{cat}.csv") for cat in categories}
    negative_sample_size = int(len(categories_df[selected_category]) / 4)
    print(f"Selected Category: {selected_category}")
    for category in categories_df:
        categories_df[category].drop('URL', 1, inplace=True)
        if category != selected_category:
            categories_df[category] = categories_df[category].sample(negative_sample_size)
        categories_df[category] = categories_df[category].assign(
            **{selected_category: category == selected_category})
        print("{} has {} samples;".format(category, len(categories_df[category])))
        # print(categories_df[category].head())
    treebank_background = pd.DataFrame(
        map(lambda sent: ' '.join(sent),
            random.sample(list(treebank.sents()), negative_sample_size)),
        columns=["excerpt"]).assign(description=False)
    # print("Treebank has {} samples.".format(len(treebank_background)))
    # print("categories_df")
    corpus = pd.concat(categories_df.values(), ignore_index=True, sort=False)
    # DataFrame.append returns a new frame, so keep the result
    corpus = corpus.append(treebank_background, ignore_index=True, sort=False)
    corpus.fillna(value='', inplace=True)
    return corpus
class _BertTxtContainer:
    # stores the longest stretch of text that BERT can evaluate at once
    def __init__(self):
        self.tokN = 0
        self.txt = ''

    def addTxtArr(self, inputTxt):
        tokenized = tokenizer.tokenize(inputTxt)
        if len(tokenized) + self.tokN <= 512:
            self.txt += inputTxt + ' '
            self.tokN += len(tokenized)
            return True
        return False


# compute perplexity on the Penn Treebank
N = len(treebank.sents())
perplexity = []
print('Sentences:', N)
bert_txt = _BertTxtContainer()
c = 0
for sent in treebank.sents()[:N]:
    c += 1
    sentTxt = ' '.join(sent)
    # if the chunk is full, compute its perplexity and start a new container with this sentence
    if not bert_txt.addTxtArr(sentTxt):
        perplexity.append(BERT_model.get_score(bert_txt.txt))
        # print('tokN', bert_txt.tokN, 'toks:', bert_txt.txt)
        bert_txt = _BertTxtContainer()
        bert_txt.addTxtArr(sentTxt)
    print(100 * c / N, '%')
# score the last, still-unflushed chunk as well
if bert_txt.txt:
    perplexity.append(BERT_model.get_score(bert_txt.txt))
import nltk
from nltk.tag import hmm
from nltk.probability import LaplaceProbDist
from typing import List, Dict, AnyStr
from numpy import mean
import argparse
from nltk.corpus import treebank
import re

treebank_sentence = [' '.join(sentence) for sentence in treebank.sents()]
lower_cased = [sentence.lower() for sentence in treebank_sentence]

# pattern matching every character that is NOT lowercase a-z, comma, period or whitespace
allowed_states = re.compile('[^a-z,.\s]')
# strip disallowed characters from the first sentence
print(allowed_states.sub('', lower_cased[0]))
''.join([allowed_states.sub('', character) for character in lower_cased[0]])

for i in treebank_sentence[0]:
    print(type(i))

for i in additional_text_transitions.keys():
    if i not in tagger._transitions.keys():
        print(i)
from __future__ import division
from gensim.models import word2vec
from nltk.corpus import brown, treebank
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# total number of tokens in the treebank sample
tb_lens = 0
for i, sent in enumerate(treebank.sents()):
    tb_lens += len(sent)

model = word2vec.Word2Vec(iter=1, size=300, window=10, min_count=1, sg=0)
model.build_vocab(brown.sents())
# tb_lens is a word count, so pass it as total_words rather than total_examples
model.train(treebank.sents(), total_words=tb_lens, epochs=model.iter)

print(" < -o- > vocab length:")
print(len(model.wv.vocab))
model.save('model')
import nltk
from gensim.models import Word2Vec
from nltk.corpus import brown, movie_reviews, treebank

b = Word2Vec(brown.sents())
mr = Word2Vec(movie_reviews.sents())
t = Word2Vec(treebank.sents())

print(b.most_similar('money', topn=5))
print('aew')
with open('tc_tags.txt', 'w') as f:
    for tag in tc_tags:
        f.write('%s\n' % tag)

treebank_tags = []
for t in treebank.tagged_words():
    treebank_tags.append(t[1])

with open('treebank_tags.txt', 'w') as f:
    for tag in treebank_tags:
        f.write('%s\n' % tag)

with open('tc_sent_lengths.txt', 'w') as f:
    for sent in tc['sents']:
        f.write('%s\n' % len(sent))

with open('treebank_sent_lengths.txt', 'w') as f:
    for sent in treebank.sents():
        f.write('%s\n' % len(sent))

# tc_tags_series = pd.Series(tc_tags)
# tc_tag_freq = tc_tags_series.value_counts()
# tc_tag_freq.plot(kind='bar')
# treebank_tags_series = pd.Series(treebank_tags)
# treebank_tag_freq = treebank_tags_series.value_counts()
# treebank_tag_freq.plot(kind='bar')
# plt.show()
from nltk.corpus import treebank
from nltk.tree import *
import nltk
from nltk.grammar import *
import numpy

treebank.ensure_loaded()

# building the grammar and test set
tbank_productions = treebank.parsed_sents()
grammar_used = tbank_productions[:int(len(tbank_productions) * 0.8)]

# convert the trees to Chomsky normal form
for t in grammar_used:
    t.chomsky_normal_form()

tbank_productions2 = list(treebank.sents())
test_part = tbank_productions2[int(len(tbank_productions) * 0.8):]

# productions
productions = []
for t in grammar_used:
    productions += Tree.productions(t)

# induce PCFG
S = nltk.Nonterminal("S")
grammar = nltk.induce_pcfg(S, productions)
prod = grammar.productions()


# helper function to get the probability of a production
def findProb(lhsa, rhsa, prod):
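    # The body of findProb is cut off in this excerpt. A minimal sketch of what such a
    # lookup might do, assuming `prod` holds the weighted productions of the induced
    # PCFG; this completion is an assumption, not the original code:
    for p in prod:
        if str(p.lhs()) == str(lhsa) and tuple(str(s) for s in p.rhs()) == tuple(str(s) for s in rhsa):
            return p.prob()
    return 0.0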
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

training = treebank.tagged_sents()[:7000]
unitagger = UnigramTagger(training)
print(treebank.sents()[0])
print(unitagger.tag(treebank.sents()[0]))
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger

unitag = UnigramTagger(model={'Vinken': 'NN'})
print(unitag.tag(treebank.sents()[0]))
brown_POS_tags = set()
for word, pos in brown.tagged_words(tagset='universal'):
    brown_vocab.add(word.lower())
    brown_POS_tags.add(pos)

hmm = SequentialHMM(brown_vocab, brown_POS_tags)
brown_corpus = [[(word.lower(), pos) for word, pos in sent]
                for sent in brown.tagged_sents(tagset='universal')]

print("Not allowing unknown words:")
hmm.train(brown_corpus)
print("1 example in treebank:")
for sent in treebank.sents():
    new_sent = [word.lower() for word in sent if word.lower() in brown_vocab]
    if len(new_sent) == len(sent):
        print(hmm.decode(new_sent))
        break
print("--------")

print("Allowing unknown words:")
hmm = SequentialHMM(brown_vocab, brown_POS_tags, allow_unknown=True)
hmm.train(brown_corpus)
print("5 examples in treebank:")
for sent in treebank.sents()[:5]:
import nltk
from nltk.tag import BigramTagger
from nltk.corpus import treebank

training_1 = treebank.tagged_sents()[:7000]
bigramtagger = BigramTagger(training_1)
print(treebank.sents()[0])
print(bigramtagger.tag(treebank.sents()[0]))

testing_1 = treebank.tagged_sents()[2000:]
print(bigramtagger.evaluate(testing_1))