def multi_f(sentence):
    grammar_file = sys.argv[1]
    pcfg = PCFG()
    pcfg.load_model(grammar_file)
    parser = Parser(pcfg)
    tree = parser.parse(sentence)
    print(dumps(tree))
def sqrt_PCFG(G: PCFG):
    """
    Input: a PCFG G
    Output: a PCFG that is the sqrt of G
    """
    WCFG_rules = {}
    for S in G.rules:
        WCFG_rules[S] = {
            F: (G.rules[S][F][0], G.rules[S][F][1] ** 0.5)
            for F in G.rules[S]
        }
    # Yeah, I know... not exactly a PCFG (probabilities do not sum to 1),
    # but it fits the bill
    WCFG = PCFG(start=G.start, rules=WCFG_rules)
    partition_function = compute_partition_function(WCFG)
    PCFG_rules = {}
    for S in WCFG.rules:
        new_rules_S = {}
        for F in WCFG.rules[S]:
            args_F = WCFG.rules[S][F][0]
            w = WCFG.rules[S][F][1]
            multiplier = prod(partition_function[arg] for arg in args_F)
            new_rules_S[F] = (args_F, w * multiplier / partition_function[S])
        PCFG_rules[S] = new_rules_S
    return PCFG(G.start, PCFG_rules)
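# compute_partition_function is used above but not shown. A minimal sketch,
# assuming the same rules[S][F] = (args, weight) encoding as sqrt_PCFG:
# Z[S] is the least solution of Z[S] = sum_F w_F * prod(Z[arg] for arg in args_F),
# approximated here by monotone fixed-point iteration from 0 (this is a
# hypothetical stand-in, not the project's actual implementation).
from math import prod

def compute_partition_function(G, iterations=100):
    Z = {S: 0.0 for S in G.rules}
    for _ in range(iterations):
        Z = {
            S: sum(w * prod(Z[arg] for arg in args)
                   for (args, w) in G.rules[S].values())
            for S in G.rules
        }
    return Z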
def pcfg(sentence):
    pcfg = PCFG()
    pcfg.readCFGRules(FilePath.ROOT + "rules.txt")
    # pcfg.showRules()
    pcfg.parse(sentence)
    pcfg.showTrees()
def a_star(G: PCFG):
    """
    A generator that enumerates all programs using A*.
    Assumes that the PCFG only generates programs of bounded depth.
    """
    frontier = []
    initial_non_terminals = deque()
    initial_non_terminals.append(G.start)
    heappush(
        frontier,
        (
            -G.max_probability[G.start].probability[(G.__hash__(), G.start)],
            (None, initial_non_terminals, 1),
        ),
    )
    # A frontier is a heap of pairs (-max_probability, (partial_program, non_terminals, probability))
    # describing a partial program:
    # max_probability is the most likely program completing the partial program,
    # partial_program is the list of primitives and variables describing the leftmost derivation,
    # non_terminals is the queue of non-terminals appearing from left to right, and
    # probability is the probability of the partial program

    while len(frontier) != 0:
        max_probability, (partial_program, non_terminals, probability) = heappop(frontier)
        if len(non_terminals) == 0:
            yield partial_program
        else:
            S = non_terminals.pop()
            for P in G.rules[S]:
                args_P, w = G.rules[S][P]
                new_partial_program = (P, partial_program)
                new_non_terminals = non_terminals.copy()
                new_probability = probability * w
                new_max_probability = new_probability
                for arg in args_P:
                    new_non_terminals.append(arg)
                    new_max_probability *= G.max_probability[arg].probability[(G.__hash__(), arg)]
                heappush(
                    frontier,
                    (
                        -new_max_probability,
                        (new_partial_program, new_non_terminals, new_probability),
                    ),
                )
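# Usage sketch (not part of the original snippet; assumes `grammar` is a PCFG
# whose max_probability table has been precomputed, as a_star requires).
# a_star yields programs as nested (primitive, parent) pairs ending in None,
# most probable first by the A* invariant; unroll() flattens one such program.
from itertools import islice

def unroll(partial_program):
    primitives = []
    while partial_program is not None:
        primitive, partial_program = partial_program
        primitives.append(primitive)
    return primitives

for partial in islice(a_star(grammar), 10):
    print(unroll(partial))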
def __init__(self, corpus_train):
    self.PCFG = PCFG(corpus_train)
    self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_tags, self.PCFG.freq_tokens)
    self.tag_to_id = {tag: i for (i, tag) in enumerate(self.PCFG.list_all_tags)}

    self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
    for tag in self.PCFG.lexicon:
        for word in self.PCFG.lexicon[tag]:
            self.lexicon_inverted[word][tag] = self.PCFG.lexicon[tag][word]

    # self.grammar_dicts[X][Y][Z] stores P(rule X -> Y Z)
    self.grammar_dicts = {}
    for (root_tag, rules) in self.PCFG.grammar.items():
        # root_tag is the left-hand tag of the grammar rule
        idx_root_tag = self.tag_to_id[root_tag]
        dico = {}
        for (split, proba) in rules.items():
            # split is the right-hand side, and proba the probability of the rule
            idx_left_tag = self.tag_to_id[split[0]]
            idx_right_tag = self.tag_to_id[split[1]]
            if idx_left_tag in dico:
                dico[idx_left_tag][idx_right_tag] = proba
            else:
                dico[idx_left_tag] = {idx_right_tag: proba}
        self.grammar_dicts[idx_root_tag] = dico
def __init__(self, corpus):
    # PCFG and OOV classes
    self.pcfg = PCFG(corpus)
    self.oov = OOV(self.pcfg.lexicon, self.pcfg.list_all_tags, self.pcfg.tokens)
    # Initialize CYK probability matrix
    self.proba_matrix = None
    self.cyk_matrix = None
def run(args):
    data = loader.load_treebanks(TREEBANK_PATH)
    train_data, dev_data, test_data = loader.train_test_split(data, 0.8, 0.1, 0.1)
    words, embeddings = loader.load_word_embeddings(EMBEDDING_PATH)

    pcfg = PCFG(train_data)
    pcfg.train(train_data)
    pcfg.set_oov(OOV, words, embeddings)

    if args.generate_output:
        output = pcfg.generate_output(test_data)
    if args.evaluation:
        accs, nb_no_parse = pcfg.predict(test_data[:2])
    if args.parse:
        corpus = []
        with open(args.txt_path, 'r') as f:
            corpus = f.read().split('\n')
        pcfg.parse_from_txt(corpus)
def build_model():
    pcfg = PCFG()
    if exists(MODEL):
        pcfg.load_model(MODEL)
    else:
        print("Building the Grammar Model")
        start = time()
        if not exists(TEMP_DIR):
            makedirs(TEMP_DIR)
        # Normalise the treebanks
        if not exists(QUESTIONBANK_NORM):
            normalize_questionbank(QUESTIONBANK_DATA, QUESTIONBANK_PENN_DATA)
            gen_norm(QUESTIONBANK_NORM, [QUESTIONBANK_PENN_DATA])
        if not exists(PENNTREEBANK_NORM):
            gen_norm(PENNTREEBANK_NORM, glob(PENNTREEBANK_GLOB))
        # Keep a part of the treebanks for testing: roughly every 100th tree of
        # suitable length is held out; everything else goes to the model treebank
        i = 0
        with open(MODEL_TREEBANK, 'w') as model, open(TEST_DAT, 'w') as dat, open(TEST_KEY, 'w') as key:
            for treebank in [QUESTIONBANK_NORM, PENNTREEBANK_NORM]:
                for tree in open(treebank):
                    i += 1
                    if (i % 100) == 0:
                        sentence, n = get_sentence(loads(tree))
                        if n > 7 and n < 20:
                            dat.write(sentence + '\n')
                            key.write(tree)
                        else:
                            i -= 1
                            model.write(tree)
                    else:
                        model.write(tree)
        # Learn PCFG
        pcfg.learn_from_treebanks([MODEL_TREEBANK])
        pcfg.save_model(MODEL)
        print("Time: (%.2f)s\n" % (time() - start))
    return pcfg
def __init__(self, corpus_train):
    self.PCFG = PCFG(corpus_train)
    self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_symbols, self.PCFG.freq_tokens)

    # note: if the id of a symbol is above self.PCFG.nb_tags,
    # it's an artificial symbol introduced by Chomsky normalization
    self.symbol_to_id = {
        symbol: i for (i, symbol) in enumerate(self.PCFG.list_all_symbols)
    }

    # instead of storing tags, we store grammar rules with their corresponding
    # indices in grammar_ids, with an additional hierarchical level for speed;
    # in other words, self.grammar_ids[X][Y][Z] stores P(rule X -> Y Z),
    # where self.grammar_ids, self.grammar_ids[X], and self.grammar_ids[X][Y]
    # are all dictionaries
    self.grammar_ids = {}
    for (root_tag, rules) in self.PCFG.grammar.items():
        # root_tag is the left-hand symbol of the grammar rule
        idx_root_tag = self.symbol_to_id[root_tag]
        dico = {}
        for (split, proba) in rules.items():
            # split is the right-hand side, and proba the probability of the rule
            idx_left_tag = self.symbol_to_id[split[0]]
            idx_right_tag = self.symbol_to_id[split[1]]
            if idx_left_tag in dico:
                dico[idx_left_tag][idx_right_tag] = proba
            else:
                dico[idx_left_tag] = {idx_right_tag: proba}
        self.grammar_ids[idx_root_tag] = dico

    # for a given word, which tags can emit it, and with which probabilities
    # P(tag -> word)? this is what self.lexicon_inverted stores
    self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
    for tag in self.PCFG.lexicon:
        for word in self.PCFG.lexicon[tag]:
            self.lexicon_inverted[word][tag] = self.PCFG.lexicon[tag][word]
def run(args):
    has_effect = False
    if args:
        try:
            train_corpus, val_corpus, test_corpus = data.get_train_val_test()
            words, embeddings = data.get_polyglot_words_embeddings()

            parser = PCFG()
            parser.learn_probabilities_and_rules(train_corpus)
            parser.set_oov_module(OovModule, words, embeddings)

            if args.inference:
                get_gold(parser, test_corpus, filename='evaluation_data.gold')
                get_predictions(parser, test_corpus, filename='evaluation_data.parser_output')
            if args.evaluation:
                evaluation('evaluation_data.gold', 'evaluation_data.parser_output')
            if args.parse:
                parser.parse_from_txt(args.txt_path)
        except Exception as e:
            logger.exception(e)
            logger.error("Uh-oh, the script halted with an error.")
    else:
        if not has_effect:
            logger.error(
                "Script halted without any effect. To run code, use command:\n"
                "python3 main.py <args>"
            )
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# -------------------------------------------#
# main.py                                     #
# author: sean lee                            #
# qq: 929325776                               #
# email: [email protected]                    #
# -------------------------------------------#

from pcfg import PCFG

parser = PCFG()
parser.fit('./corpus/toy/train.txt')
parser.parse("the man saw the dog")

'''
print(parser.N_dict)
print(parser.NR_dict)
print(parser.TR_dict)
'''
    # (fragment: tail of CKY_chart) blank out unused cells and dump the chart
    # to the work file before returning
    for index, token in enumerate(tokens):
        resulting_chart[index, index] = token
    for row in range(1, L):
        for col in range(0, row):
            resulting_chart[row][col] = ' '
    for row in range(0, L):
        for col in range(row + 2, L + 1):
            if not resulting_chart[row, col]:
                resulting_chart[row, col] = ' '
    f.write(str(resulting_chart))
    f.write('\n')
    f.close()
    return (chart, best_probability[0])


if __name__ == "__main__":
    try:
        os.remove('workfile.txt')
    except OSError:
        pass
    sentences_to_parse = 10
    pcfg = PCFG.load(PCFG_SOURCE)
    with open(TOKEN_SOURCE, "r") as source:
        for index, line in enumerate(source):
            if index == sentences_to_parse:
                break
            tokens = line.split()
            (_, best_probability) = CKY_chart(tokens, pcfg)
            print("{:.4f}: {}".format(best_probability.bw, " ".join(tokens)))
class Timeout(object):
    def __init__(self, seconds=1, error_message="Timeout"):
        self.seconds = seconds
        self.error_message = error_message

    def handle_timeout(self, signum, frame):
        raise TimeoutError(self.error_message)

    def __enter__(self):
        signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)

    def __exit__(self, exc_type, exc_value, traceback):
        signal.alarm(0)


if __name__ == "__main__":
    pcfg = PCFG.load(PCFG_SOURCE)
    print("PCFG loaded.")
    with open(TREE_SOURCE, "r") as source:
        for tree in Tree.from_stream(source):
            tokens = [leaf.decode("ASCII") for leaf in tree.leaves()]
            try:
                with Timeout(TIMEOUT):
                    (_, bw_prob) = CYK_chart(pcfg, tokens)
                    print(bw_prob)
            except TimeoutError:
                pass
def train(train_data_filename, train_rare_filename, pcfg_model_filename, rare_words_rule):
    print('train PCFG model')
    pcfg = PCFG()
    for l in open(train_data_filename):
        t = json.loads(l)
        pcfg.count(t)
    pcfg.count_word()

    print('process rare words')
    process_rare_words(open(train_data_filename),
                       open(train_rare_filename, 'w'),
                       pcfg.rare_words,
                       rare_words_rule)

    print('train PCFG model again')
    new_pcfg = PCFG()
    for l in open(train_rare_filename):
        t = json.loads(l)
        new_pcfg.count(t)
    new_pcfg.cal_rule_params()
    new_pcfg.write(open(pcfg_model_filename, 'w'))
    return new_pcfg
def main():
    train_data_filename = 'parse_train.dat'
    train_rare_filename = 'p1.train.rare.dat'
    pcfg_model_filename = 'parser_train.counts.out'

    pcfg = PCFG()
    for l in open(train_data_filename):
        t = json.loads(l)
        pcfg.count(t)
    pcfg.count_word()

    process_rare_words(open(train_data_filename),
                       open(train_rare_filename, 'w'),
                       pcfg.rare_words,
                       rare_words_rule_p1)

    new_pcfg = PCFG()
    for l in open(train_rare_filename):
        t = json.loads(l)
        new_pcfg.count(t)
    new_pcfg.cal_rule_params()
    new_pcfg.write(open(pcfg_model_filename, 'w'))
import math
import pprint

import torch
import torch.nn as nn
import torch.nn.functional as F

from pcfg import PCFG


def p(s):
    print(s)
    return s


gen_a = lambda terminals_per_category: PCFG.fromstring(
    " A -> " + " | ".join([
        "\"a" + str(i) + "\" [" + str(1 / terminals_per_category) + "]"
        for i in range(terminals_per_category)
    ]))

generate_test_grammar = lambda p1, terminals_per_category: PCFG.fromstring(
    " S -> NP-s VP-s [" + str((1 - p1) / 2) + "] | NP-s S VP-s [" + str(p1 / 2) +
    "] | NP-p VP-p [" + str((1 - p1) / 2) + "] | NP-p S VP-p [" + str(p1 / 2) + "]\n" +
    " NP-s -> N-s [" + str(1) + "]\n" +
    " NP-p -> N-p [" + str(1) + "]\n" +
    " VP-s -> V-s [" + str(1) + "]\n" +
    " VP-p -> V-p [" + str(1) + "]\n" +
    " N-s -> " + " | ".join([
        "\"n" + str(i) + "-s\" [" + str(1 / terminals_per_category) + "]"
        for i in range(terminals_per_category)
    ]) + "\n" +
    " N-p -> " + " | ".join([
        "\"n" + str(i) + "-p\" [" + str(1 / terminals_per_category) + "]"
        for i in range(terminals_per_category)
    # the snippet was truncated here; the tail below is reconstructed by
    # symmetry with the N-s/N-p terminal rules
    ]) + "\n" +
    " V-s -> " + " | ".join([
        "\"v" + str(i) + "-s\" [" + str(1 / terminals_per_category) + "]"
        for i in range(terminals_per_category)
    ]) + "\n" +
    " V-p -> " + " | ".join([
        "\"v" + str(i) + "-p\" [" + str(1 / terminals_per_category) + "]"
        for i in range(terminals_per_category)
    ]))
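# Sanity check (not in the original snippet): gen_a(2) hands PCFG.fromstring
#   A -> "a0" [0.5] | "a1" [0.5]
# and, assuming the same generate() API used elsewhere in this collection,
# sampling from it yields single-terminal sentences:
toy = gen_a(2)
for sentence in toy.generate(3):
    print(sentence)  # "a0" or "a1"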
    sys.exit(1)

# load the train file to trees
trees = []
f = open(trainfilename, 'r')
for line in f:
    trees.append(nltk.Tree.fromstring(line))

# preprocess the tree forms: ignore functional labels and binarize to CNF
for tree in trees:
    # ignore_func_labels(tree)
    tree.chomsky_normal_form(horzMarkov=2)
    # tree.chomsky_normal_form()

# learn PCFG
lexicon, grammar, vocabulary, symbols = PCFG(trees)
# print(grammar)

# for OOV
oovwords = OOV(embedfilename, vocabulary)

# parse new sentences using CYK based on the learned PCFG
# parser = CYKSolver(lexicon, grammar, vocabulary, symbols, oovwords)
# i = 0
for line in sys.stdin:
    # print('start parse')
    # print(line)
    # start = time.time()
    if line == '\n':
        continue
    # cyksolver = CYK(line.split(), lexicon, grammar, vocabulary, symbols, embedfilename)
from typing import TextIO
from pcfg import PCFG
import os

BASEPATH: str = os.path.dirname(__file__)

f: TextIO
with open(os.path.join(BASEPATH, "subject_adjectives.txt")) as f:
    subject_adjectives: PCFG = PCFG.fromstring(f.read())

n: int = int(input("How many sentences do you want generated? "))

sentence: str
for sentence in subject_adjectives.generate(n):
    print()
    print(sentence.capitalize())
def __init__(self, grammar_path, expand_binaries=False):
    self.grammar = PCFG.from_file(grammar_path, expand_binaries)
def multi_f(sentence):
    grammar_file = sys.argv[1]
    pcfg = PCFG()
    pcfg.load_model(grammar_file)
    parser = Parser(pcfg)
    return parser.parse(sentence)
from pcfg import PCFG
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--corpus", help="training treebank corpus", type=str)
parser.add_argument("--sentences", help="raw token sentences", type=str)
parser.add_argument("--outfile", help="name of the output file", type=str)
args = parser.parse_args()

grammar = PCFG(args.corpus)
grammar.parse_corpus()
grammar.predict(args.sentences, args.outfile)
import os
import sys
from json import dumps
from multiprocessing import Pool
from sys import stderr, stdin
from time import time

# NOTE: the snippet omitted its imports; the stdlib ones above are assumed,
# and PCFG / Parser come from the surrounding project.


def multi_f(sentence):
    grammar_file = sys.argv[1]
    pcfg = PCFG()
    pcfg.load_model(grammar_file)
    parser = Parser(pcfg)
    return parser.parse(sentence)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("usage: python3 parser.py GRAMMAR")
        exit()
    start = time()
    grammar_file = sys.argv[1]
    print("Loading grammar from " + grammar_file + " ...", file=stderr)
    pcfg = PCFG()
    pcfg.load_model(grammar_file)
    parser = Parser(pcfg)
    print("Parsing sentences ...", file=stderr)
    with Pool(processes=os.cpu_count()) as pool:
        trees = pool.map(multi_f, stdin.readlines())
    for t in trees:
        print(dumps(t))
    print("Time: (%.2f)s\n" % (time() - start), file=stderr)
        ) (VP (VB gave) (NP (DT the) (NN lecture) ) ) )"""

# uncomment to use the above simple trees for debugging:
# trees = [TRANSFORM(Tree.from_string(t)) for t in (t0, t1)]
# grammar = PCFG.from_trees(trees)

# let's get some input to build a grammar:
grammar = PCFG.from_trees(list(TRANSFORM(t) for t in
                               Tree.from_stream(GzipFile('bigger_treebank_2.txt.gz'))))
print("Read {} rules in grammar.".format(len(grammar)))

trees = list(TRANSFORM(t) for t in Tree.from_stream(open('end_of_wsj.txt')))
print("Read {} trees.".format(len(trees)))

# now try and parse our trees:
results = []
for idx, tree in enumerate(trees):
    tokens = [(t,) for t in tree.terminals()]
    # print('Sentence {}\tTokens: "{}"'.format(idx, ' '.join(tree.terminals())))
    chart = Chart(grammar, tokens)
    chart.pretty_print()
    has_parse = chart.extract_parse()
    if not has_parse:
        print('Sentence {}\tTokens: "{}" has no parse!'.format(idx, ' '.join(tree.terminals())))
if args.fnc == "generate":
    print("### GENERATING WORDS ###")
    print("model:", args.model, args.n)
    print("language:", args.lang)

    if args.model == "nphone":
        lm = NgramModel(args.n, corpus, 1)
    elif args.model == "nsyll":
        if args.lex.startswith("celexes/syll"):
            lm = NsyllModel(args.n, corpus, 1)
        else:
            print("Use syll__ file for this model")
            sys.exit()
    elif args.model == "pcfg":
        if args.lex.startswith("celexes/pcfg"):
            print(call(["./pcfg/io", "-d", "1", "-g", args.grammar, args.lex],
                       stdout=open('grammars/gram_pcfg.wlt', 'w')))
            lm = PCFG('grammars/gram_pcfg.wlt')
            corpus = [re.sub(" ", "", x) for x in corpus]
        else:
            print("Use pcfg__ file for this model")
            sys.exit()

    lm.create_model(corpus, args.smoothing)
    o = ("Lexicons/lex_" + args.lex.split("/")[-1][:-4] + "_cv" + str(args.cv) +
         "_iter" + str(args.iter) + "_m" + args.model + "_n" + str(args.n) +
         "_smoothing" + str(args.smoothing) + ".txt")
    lexfile = write_lex_file(o, corpus, args.cv, args.iter, lm, args.homo)
    print("null lexicons written to", lexfile)
    print("### WRITING RESULTS ###")
    write_all(lexfile, args.graph, args.lang)
else:
    o = ("evaluation/eval_" + args.lex.split("/")[-1][:-4] + "_cv" + str(args.cv) +
         "_iter" + str(args.iter) + "_m" + args.model + "_n" + str(args.n) +
         "_smoothing" + str(args.smoothing) + ".txt")
    out = open(o, 'w')
# stdlib
import argparse

# project
from pcfg import PCFG

if __name__ == "__main__":
    print("Welcome to my parser!")
    print("Please wait while loading the model ...")
    pcfg = PCFG()
    pcfg.from_path('sequoia-corpus+fct.mrg_strict.txt')
    pcfg.fit()
    print("Model loaded!")
    while True:
        print("Please enter a phrase to parse!")
        phrase = str(input('>>> '))
        tokenized = phrase.split()
        parsed = pcfg.pcky(tokenized)
        if not parsed:
            print("Sorry, we couldn't parse your line :(")
        else:
            print(parsed)
        print(">>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<\n")