def CKY_parser():
    """Parse the sentences of wsj_1964 with a Viterbi (CKY) parser.

    Builds the PCFG via make_PCFG_grammar(), prints the most probable
    parse for every sentence the grammar covers, and reports how many
    sentences were skipped for lack of coverage.
    """
    PCFG_grammar = make_PCFG_grammar()
    # ViterbiParser implements probabilistic CKY over the induced grammar.
    parser = ViterbiParser(PCFG_grammar)
    # Sample sentences to parse.
    sentences = treebank.parsed_sents('wsj_1964.mrg')
    skipped_sentences = 0
    for sentence in sentences:
        tokens = sentence.leaves()
        try:
            # check_coverage raises ValueError on out-of-vocabulary tokens;
            # FIX: catch only that, not a bare except that hides real bugs.
            PCFG_grammar.check_coverage(tokens)
            for parse in parser.parse(tokens):
                print(parse)
        except ValueError:
            skipped_sentences += 1
            continue
    print("Total skipped sentences:", skipped_sentences)
def test_sentences(grammar):
    """Parse each sentence in the test set and report the parser's
    POS-tagging accuracy against the treebank reference tags."""
    for t in test:
        print("Processing: " + str(t))
        reference = list(treebank.tagged_words(t))
        tokens = list(treebank.words(t))
        print("fixing grammar.....")
        # Checks if grammar covers all words in the sentence and adds them
        # to the grammar if necessary.
        fixed_grammar = get_fixed_grammer(grammar, tokens)
        print("fixed grammar")
        print("Building Parser....")
        parser = ViterbiParser(fixed_grammar)
        print("Parsing...")
        # parse_all returns every parse; the most likely tree is at index 0.
        start = time.time()
        parses = parser.parse_all(tokens)
        print("Time")
        # BUG FIX: elapsed time is now - start; the original printed
        # start - time.time(), a negative value.
        print(time.time() - start)
        # POS tags read off the leaves of the best parse.
        leafs = parses[0].pos()
        # Tagging accuracy versus the gold reference.
        correct_tags = 0.0
        for i in range(len(leafs)):
            if leafs[i] == reference[i]:
                correct_tags += 1.0
        print(str(correct_tags / len(leafs)))
def evaluate_sentence(sentence: str, grammar: PCFG):
    """Parse every prefix of *sentence*'s POS-tag sequence, printing the
    probabilities of the parses found for each prefix.

    BUG FIX: the annotation was ``string`` (the stdlib module, or a
    NameError), not the builtin ``str``.
    """
    sentence = sentence.split()
    print(sentence, flush=True)
    # Parse over the tag sequence, not the words themselves.
    pos = [pos for word, pos in pos_tag(sentence)]
    print(pos, flush=True)
    parser = ViterbiParser(grammar, trace=0)
    # accumulate yields successive space-joined prefixes: t0, "t0 t1", ...
    for line in accumulate(pos, lambda total, token: total + ' ' + token):
        line = line.split()
        print(line)
        print([tree.prob() for tree in list(parser.parse(line))], flush=True)
def test():
    """ A test to check if the changes I made have the intended effect """
    import nltk
    from nltk.parse import ViterbiParser

    sent = 'I saw the man with my telescope'
    tokens = sent.split()
    grammar = nltk.toy_pcfg1
    parser = ViterbiParser(grammar)
    parser.trace(3)
    # BUG FIX: ViterbiParser.nbest_parse() was removed in NLTK 3;
    # parse_all() returns the same list of parse trees.
    parses = parser.parse_all(tokens)
    print(parses)
def main():
    """Induce a PCFG from the full treebank and Viterbi-parse a sample
    sentence, printing the total probability of S-rooted parses."""
    print(len(treebank.fileids()))
    # BUG FIX: the list initializer was commented out in the original,
    # so 'productions +=' below raised NameError.
    productions = []
    for item in treebank.fileids():
        # Goes through all trees
        for tree in treebank.parsed_sents(item):
            # Normalize trees so the induced grammar is binary (CNF).
            tree.collapse_unary(collapsePOS=False)   # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)   # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)  # This is a PCFG
    parser = ViterbiParser(grammar)
    parser.trace(0)
    sent = "John will join the board"
    tokens = sent.split()
    try:
        # check_coverage raises ValueError for out-of-vocabulary words;
        # FIX: catch only that instead of a bare except.
        grammar.check_coverage(tokens)
    except ValueError:
        print("Some words not covered")
        return
    print("All words covered")
    parses = parser.parse_all(tokens)
    if parses:
        print(len(parses))
        print(parses[0].label())
        # Total probability mass of parses whose root label is 'S'.
        p = sum(t.prob() for t in parses if t.label() == 'S')
    else:
        p = 0
    print("Probability:", p)
def main():
    """Tag and parse a user-supplied sentence, caching the tagger and PCFG
    on disk, then append the result to the CSV data file."""
    data = pd.read_csv(data_file_path)
    data = data.drop(columns=["Unnamed: 0"])
    (sentence, sentence_tokens) = readsentence()  # take input from user and save text, tokenized text
    # Load a previously saved tagger if present; otherwise train and save one.
    # FIX: use 'with' (original leaked handles and shadowed builtin input()).
    if os.path.exists('mytagger.pkl'):
        with open('mytagger.pkl', 'rb') as fin:
            mytagger = load(fin)
    else:
        mytagger = traintagger()
        with open('mytagger.pkl', 'wb') as fout:
            dump(mytagger, fout, -1)
    tagged_tokens = mytagger.tag(sentence_tokens)
    print(tagged_tokens)
    # Same caching scheme for the induced PCFG.
    if os.path.exists('mypcfg.pickle'):
        with open('mypcfg.pickle', 'rb') as fin:
            mypcfg = load(fin)
    else:
        mypcfg = buildpcfg()
        with open('mypcfg.pickle', 'wb') as fout:
            dump(mypcfg, fout)
    try:
        tree = sequence_matching(tagged_tokens)
        print("Sequence matching was used")
    except Exception:
        # Fall back to full parsing when sequence matching fails.
        parser = ViterbiParser(mypcfg)
        tree = parser.parse(tagged_tokens)
        print("Vitberi parser was used")
    finally:
        if not isinstance(tree, Tree):
            # parser.parse returns an iterator of trees; show the best one.
            Tree.pretty_print(next(tree))
        else:
            print(tree)
    df2 = {'sentence': sentence, 'sentence tokens': sentence_tokens, 'tagged tokens': tagged_tokens}
    # FIX: DataFrame.append was removed in pandas 2.0; concat is the
    # supported equivalent.
    data = pd.concat([data, pd.DataFrame([df2])], ignore_index=True)
    data.to_csv(data_file_path)
    print("Previous data:")
    print(data)
def parseCKY(sentence, grammar):
    """Viterbi-parse *sentence*, replacing words the grammar does not
    cover with the 'UNK' terminal; return the frozen parse trees found."""
    # Tokenize the sentence.
    tokens = sentence.split()
    # Replace every uncovered token with 'UNK'.
    change_words = []
    for i, ele in enumerate(tokens):
        try:
            grammar.check_coverage([ele])
        except ValueError:
            # FIX: check_coverage raises ValueError for uncovered words;
            # the original bare except also swallowed unrelated errors.
            change_words.append(tokens[i])
            tokens[i] = 'UNK'
    parsers = [ViterbiParser(grammar)]
    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print('\nsentence: %s\n ' % (sentence))
        t = time.time()
        parses = parser.parse_all(tokens)
        times.append(time.time() - t)
        # Sum of parse probabilities (0 when the sentence has no parse).
        p = sum(tree.prob() for tree in parses) if parses else 0
        average_p.append(p)
        num_parses.append(len(parses))
        for tree in parses:
            all_parses[tree.freeze()] = 1
    parses = all_parses.keys()
    if parses:
        p = sum(tree.prob() for tree in parses) / len(parses)
    else:
        p = 0
    return parses
def parse_sentence(self, sent):
    """
    Parse sent using induced grammar
    Visualize the most likely parse tree for sent
    :return: None. Save parsing results to pcfg.txt
    """
    if self.grammar is None:
        raise ValueError("PCFG hasn't been induced yet.")
    # other parser option(s): e.g., parser = pchart.InsideChartParser(self.grammar)
    parser = ViterbiParser(self.grammar)
    parser.trace(3)  # http://www.nltk.org/api/nltk.parse.html
    # FIX: the original reassigned sys.stdout to an unclosed file and never
    # restored it, breaking all later printing.  Redirect only for the
    # duration of parsing and always restore.
    original_stdout = sys.stdout
    with open('pcfg.txt', 'w') as out:
        sys.stdout = out
        try:
            for parse in parser.parse(sent):
                print(parse)
                # visualize the tree (draw() returns None, so don't print it)
                parse.draw()
        finally:
            sys.stdout = original_stdout
def Parser_Section():
    """Parse a demo sentence with toy_pcfg1, printing parse time and the
    total probability of all parses found."""
    demos = [('I saw John through the telescope', toy_pcfg1)]
    sent, grammar = demos[0]
    # Tokenize the sentence.
    tokens = sent.split()
    parser = ViterbiParser(grammar)
    parser.trace(0)  # Use this to change verbosity
    t = time.time()
    parses = parser.parse_all(tokens)
    print("Time:", time.time() - t)
    # Sum of parse probabilities (0 when no parse was found).
    # FIX: dropped the unused 'lp' local; sum() replaces reduce().
    p = sum(tree.prob() for tree in parses) if parses else 0
    print("Probability:", p)
def perplexity():
    """Run the Viterbi parser over the held-out treebank files and compute
    the perplexity of the model on the sentences it can parse."""
    PCFG_grammar = make_PCFG_grammar()
    parser = ViterbiParser(PCFG_grammar)

    all_p = []
    skipped_sentence = 0
    for item in treebank.fileids()[1964:]:
        trees = treebank.parsed_sents(item)
        for tree in trees:
            tokens = tree.leaves()
            try:
                # Raises ValueError for out-of-vocabulary sentences.
                PCFG_grammar.check_coverage(tokens)
                for parse in parser.parse(tokens):
                    # FIX: read the probability directly from the tree
                    # instead of regex-scraping it out of str(parse).
                    all_p.append(parse.prob())
            except ValueError:
                skipped_sentence += 1
                continue

    # FIX: guard against zero parsed sentences (previously ZeroDivisionError).
    if not all_p:
        print("No sentences parsed; perplexity undefined.")
        print("Skipped sentences:", skipped_sentence)
        return

    # Perplexity = (prod_i 1/p_i) ** (1/N)
    perplexity = 1
    N = float(len(all_p))
    for p in all_p:
        perplexity = perplexity * (1 / p)
    perplexity = pow(perplexity, 1 / float(N))
    print("Perplexity:", perplexity)
    print("All parse probabilities:", all_p)
    print("Skipped sentences:", skipped_sentence)
    print("PCFG grammar:", PCFG_grammar)
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is prompted to
    select which demo to run, and how many parses should be found; and
    then each parser is run on the same demo, and a summary of the results
    are displayed.
    """
    import sys
    import time

    from nltk import tokenize
    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    toy_pcfg1 = PCFG.fromstring(
        """
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """
    )

    toy_pcfg2 = PCFG.fromstring(
        """
    S -> NP VP [1.0]
    VP -> V NP [.59]
    VP -> V [.40]
    VP -> VP PP [.01]
    NP -> Det N [.41]
    NP -> Name [.28]
    NP -> NP PP [.31]
    PP -> P NP [1.0]
    V -> 'saw' [.21]
    V -> 'ate' [.51]
    V -> 'ran' [.28]
    N -> 'boy' [.11]
    N -> 'cookie' [.12]
    N -> 'table' [.13]
    N -> 'telescope' [.14]
    N -> 'hill' [.5]
    Name -> 'Jack' [.52]
    Name -> 'Bob' [.48]
    P -> 'with' [.61]
    P -> 'under' [.39]
    Det -> 'the' [.41]
    Det -> 'a' [.31]
    Det -> 'my' [.28]
    """
    )

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
    print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        # FIX: catch only the expected failures, not a bare except.
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    # BUG FIX: the original did 'time = time.time() - t', shadowing the
    # time module with a float.
    elapsed = time.time() - t
    average = (
        reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
    )
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)
def init_viterbi(self):
    """Construct and return a ViterbiParser over this object's grammar."""
    grammar = self.grammar
    return ViterbiParser(grammar)
from nltk import induce_pcfg
from nltk.parse import pchart
from nltk.parse import ViterbiParser
from nltk.treetransforms import *
from nltk import *

# Induce a PCFG from (roughly) the first 90% of each treebank file.
productions = []
for item in treebank._fileids:
    # NOTE(review): len(item) is the length of the *filename*, not the
    # number of trees in the file — probably intended
    # len(treebank.parsed_sents(item)); confirm before relying on the split.
    length = int(len(item) * 0.9)
    for tree in treebank.parsed_sents(item)[:length]:
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=2)
        productions += tree.productions()

S = Nonterminal('S')
grammar = induce_pcfg(S, productions)

parser = pchart.InsideChartParser(grammar)
# BUG FIX: the original passed the undefined name 'grammar2' (NameError).
parserv = ViterbiParser(grammar)

# Parse the held-out tail of each file with both parsers.
for item in treebank._fileids:
    start = int(len(item) * 0.9)
    for tree in treebank.parsed_sents(item)[start:]:
        sent = tree.leaves()
        print(tree.pos())
        for parse in parser.parse(sent):
            print(parse)
        for parse in parserv.parse(sent):
            print(parse)
def create_viterbi_parser(grammar, pickle_it=False, filename="viterbi"):
    """Build a non-tracing ViterbiParser for *grammar*.

    :param grammar: the PCFG to parse with
    :param pickle_it: when True, persist the parser under var_dir
    :param filename: basename used for the pickle file
    :return: the constructed parser
    """
    parser = ViterbiParser(grammar)
    parser.trace(0)
    if pickle_it:
        # FIX: use a context manager so the file handle is closed;
        # the original passed an inline open() that was never closed.
        with open("%s%s-parser.p" % (var_dir, filename), "wb") as fh:
            pickle.dump(parser, fh)
    return parser
#print productions

############ create PCFG from the productions #######
from nltk import Nonterminal
from nltk import induce_pcfg

S = Nonterminal('SENT')
grammar = induce_pcfg(S, productions)
print(grammar)

######### Parser with CYK dynamic algorithm ########
from nltk.parse import pchart
from nltk.parse import ViterbiParser
from nltk.treetransforms import un_chomsky_normal_form

parser = ViterbiParser(grammar)
parser.trace(2)
parses_bank = []
test_file = open(args.test_dir, 'wb')
test_output_file = open(args.output_dir, 'wb')
for i in range(valid_idx, test_idx + 1):
    # take the leaves of each tree of testset and store
    # them in the test file
    tokens = treebank[i][0].leaves()
    sentence = u" ".join(tokens)
    test_file.write((sentence + u"\n").encode('utf-8'))
    # BUG FIX: the original used a Python 2 print *statement*
    # ("print 'parsing :', sentence"), a SyntaxError under Python 3.
    print('parsing :', sentence)
def __init__(self):
    """Build the word→tag map, a treebank-induced Viterbi parser, and the
    POS / chunk / IOB / relation / anchor tag inventories."""
    # Map each word to the set of universal tags it appears with.
    self.wordToTags = defaultdict(set)
    convertedTaggedWords = [
        (w, nltk.tag.mapping.map_tag('en-ptb', 'universal', t))
        for w, t in treebank.tagged_words()
    ]
    for word, tag in convertedTaggedWords:
        self.wordToTags[word].add(tag)

    # Induce a PCFG from every treebank parse.
    productions = list()
    S = nltk.Nonterminal('S')
    for tree in treebank.parsed_sents():
        productions += tree.productions()
    # create the grammar
    pcfg = nltk.induce_pcfg(S, productions)
    # print(pcfg)
    self.viterb = ViterbiParser(pcfg)
    self.mostRecentTree = None

    # IDIOM FIX: set literals replace dozens of repetitive .add() calls;
    # the element sets are identical to the original.
    # pos tags
    self.validPosTags = {
        "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS",
        "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$",
        "PR", "PBR", "PBS", "RP", "SYM", "TO", "UH", "VB", "VBZ", "VBP",
        "VBD", "VBG", "WDT", "WP", "WP$", "WRB", ".", ",", ":", "(", ")",
    }
    # chunk tags
    self.validChunkTags = {"NP", "PP", "VP", "ADVP", "ADJP", "SBAR",
                           "PRT", "INTJ", "PNP"}
    # IOB tags
    self.validIOBTags = {"I-", "O-", "B-"}
    # relation tags
    self.relationTags = {"SBJ", "OBJ", "PRD", "TMP", "CLR", "LOC",
                         "DIR", "EXT", "PRP"}
    # anchor tags
    self.anchorTags = {"A1", "P1"}
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is prompted to
    select which demo to run, and how many parses should be found; and
    then each parser is run on the same demo, and a summary of the results
    are displayed.
    """
    import sys, time
    from nltk import tokenize
    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ('I saw the man with my telescope', toy_pcfg1),
        ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print('%3s: %s' % (i + 1, demos[i][0]))
        print('     %r' % demos[i][1])
    print()
    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        # FIX: catch only the expected failures, not a bare except.
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    # BUG FIX: the original did 'time = time.time() - t', shadowing the
    # time module with a float.
    elapsed = time.time() - t
    average = (
        reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
    )
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print('Time (secs)   # Parses   Average P(parse)')
    print('-----------------------------------------')
    print('%11.4f%11d%19.14f' % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print('------------------------------------------')
    print('%11s%11d%19.14f' % ('n/a', len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print('Draw parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees

        print('  please wait...')
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print('Print parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print(parse)
# (from the test_set)
for word, tag in treebank.tagged_words():
    t = Tree.fromstring("(" + tag + " " + word + ")")
    for production in t.productions():
        tbank_productions.append(production)

# BUG FIX: the original used Python 2 print statements
# ("print tbank_productions[2]" etc.), a SyntaxError under Python 3.
print(tbank_productions[2])

# Automatically build the grammar (mainly computing rule probabilities)
# from the production-rule list tbank_productions.
tbank_grammar = induce_pcfg(Nonterminal('S'), tbank_productions)
print(tbank_grammar)

# PARSING
parser = ViterbiParser(tbank_grammar)
s = time.time()
# parse the second raw training sentence
for t in parser.parse(raw_test_set[1]):
    print(t)
# measure parsing time
s = time.time() - s

# gold standard for the second dataset entry
print(test_set[1])

'''Your task is to build a function to measure the accuracy of the parser
that has been built. Accuracy comes in 2 kinds: exact match and partial
match (averaged recall and precision). Find out on your own how to compute
recall and precision for parsing from a valid reference.
'''
def do_cky(grammar):
    """Evaluate a Viterbi (CKY) parser over the test set, reporting mean
    labeled precision, labeled recall, F1 and tagging accuracy."""
    global test
    global posx
    viterbi = ViterbiParser(grammar)  # initialize the parser with the grammar (PCFG)
    resultados = []
    for t in test[:1]:  # for each sentence of the test set
        try:
            sent = t.leaves()  # the sentence's words
            if len(sent) <= 18:  # only sentences of up to 18 tokens (punctuation included)
                ssent = []
                for s in sent:
                    # check grammar coverage for each word
                    try:
                        grammar.check_coverage([s])
                        ssent.append(s)
                    except ValueError:  # unknown word
                        ssent.append("UNK")
                saida = []
                for i in viterbi.parse(ssent):  # parse the test sentence
                    saida.append(i)
                # two evaluations: constituent spans and per-word tags
                list_eval_val = []
                list_eval_test = []
                list_tag_val = []
                list_tag_test = []
                posx = 0
                make_tree_evaluation(saida[0][0], list_eval_test, list_tag_test, 0)  # parser output
                posx = 0
                make_tree_evaluation(t, list_eval_val, list_tag_val, 0)  # gold tree
                # sort by visit order
                list_eval_test.sort(key=lambda tup: tup[3])
                list_eval_val.sort(key=lambda tup: tup[3])
                # number of matching constituents
                acertos = len(set(list_eval_test).intersection(set(list_eval_val)))
                # labeled precision
                lp = acertos / len(list_eval_test)
                # labeled recall
                lr = acertos / len(list_eval_val)
                # f1
                f1 = 0
                if lp > 0 and lr > 0:
                    f1 = 2 * lp * lr / (lp + lr)
                # tagging accuracy
                ta = len([i for i, j in zip(list_tag_test, list_tag_val) if i == j])
                ta /= len(list_tag_val)
                # store the result
                resultados.append({'lp': lp, 'lr': lr, 'f1': f1, 'ta': ta})
            else:
                print("Sentença com mais de 18 palavras.")
        except Exception:
            print("Árvore mal formada.")
    # FIX: guard against an empty result list (every sentence skipped or
    # failed), which previously raised ZeroDivisionError.
    if not resultados:
        print("Nenhum resultado para calcular médias.")
        return
    # compute the mean of each metric
    media_lp = sum(item['lp'] for item in resultados) / len(resultados)
    media_lr = sum(item['lr'] for item in resultados) / len(resultados)
    media_f1 = sum(item['f1'] for item in resultados) / len(resultados)
    media_ta = sum(item['ta'] for item in resultados) / len(resultados)
    print("media_lp", media_lp, "media_lr", media_lr, "media_f1", media_f1, "media_ta", media_ta)
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is prompted to
    select which demo to run, and how many parses should be found; and
    then each parser is run on the same demo, and a summary of the results
    are displayed.
    """
    import sys
    import time

    from nltk import tokenize
    from nltk.grammar import toy_pcfg1, toy_pcfg2
    from nltk.parse import ViterbiParser

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
    print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        # FIX: catch only the expected failures, not a bare except.
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    # BUG FIX: the original did 'time = time.time() - t', shadowing the
    # time module with a float.
    elapsed = time.time() - t
    average = (
        reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
    )
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)