def main(config):
    """Parse an induced grammar, optionally dump it, and Viterbi-parse a text file.

    @param config: namespace with attributes `grammar`, `output`, `textfile`,
        `output_parse`, and `number_parses` (presumably from argparse — confirm).
    """
    grammar_string = parse_induced_grammar(config.grammar)
    if config.output:
        with open(config.output, 'w') as f:
            f.write(grammar_string)
    grammar = PCFG.fromstring(grammar_string)
    # NOTE(review): mutates a private nltk attribute to force the start symbol;
    # original author was unsure whether this is allowed or breaks things.
    grammar._start = Nonterminal('TOP')
    if config.textfile:
        # Create directory for parse trees if it does not already exist.
        # exist_ok avoids the check-then-create race of the original.
        os.makedirs(config.output_parse, exist_ok=True)
        parser = ViterbiParser(grammar)
        with open(config.textfile, 'r') as f:
            lines = f.read().splitlines()
        for i, line in enumerate(lines):
            if i == config.number_parses:
                break
            print(f"Parsing sentence {i+1}")
            sent = line.split()
            for t in parser.parse(sent):
                TreeView(t)._cframe.print_to_file(f"{config.output_parse}/tree_{i}")
def pcfg_data_likelihood(cfg_path, weights, data, counts, epsilon=1e-10):
    """Compute the log-likelihood of the real programs dataset using
    PCFG with user-specified weights.

    @param cfg_path: string
                     path to PCFG dump
    @param weights: np.array
                    parameters of CFG.
    @param data: list of code segments
                 each code segment is a list of strings (space-sep)
    @param counts: each data point is not weighted equally;
                   we weight by occurrence
    @param epsilon: default to use for empty trees [default: 1e-10]
    @return log_lik: float
                     (negated) log likelihood of dataset.
    """
    # Space of possible integers: parts of the language require a countably
    # infinite number of possibilities. We only care about encoding the real
    # program space, so we explicitly model only the integers in the real set.
    integer_domain = get_integer_domain(data)
    pcfg = build_pcfg(cfg_path, weights, integer_domain, True)
    parser = ViterbiParser(pcfg)

    log_like = 0
    missing = 0
    for code, cnt in zip(data, counts):
        generator = parser.parse(code)
        try:
            # An unparseable program yields an empty iterator; treat that the
            # same as a missing generator instead of leaking StopIteration.
            tree = next(generator) if generator is not None else None
        except StopIteration:
            tree = None
        if tree is not None:
            ll = tree.logprob()
        else:
            # This program is not covered by the pCFG.
            ll = np.log(epsilon)
            # Fix: the original incremented `missing` on EVERY iteration;
            # only uncovered programs should be counted here.
            missing += 1
        log_like += -ll * cnt
    return log_like
def parse_command(self, seqs, keep=3):
    """Viterbi-parse every sequence, keep the most probable trees, and
    render all accumulated parses into one merged PDF.

    @param seqs: iterable of (sequence, id) pairs.
    @param keep: number of top trees to retain per sequence; -1 keeps all.
    """
    nonterminals = get_nonterminals(self._pcfg)
    viterbi = ViterbiParser(self._pcfg)
    for seq, id in seqs:
        candidates = []
        for option in get_parse_options(seq, nonterminals):
            try:
                candidates.extend((t, option) for t in viterbi.parse(option))
            except ValueError:
                print(option)
        print(candidates)
        # Most probable trees first.
        candidates.sort(key=lambda pair: -pair[0].prob())
        print(seq, sum(pair[0].prob() for pair in candidates), len(candidates))
        if keep != -1:
            candidates = candidates[:keep]
            print('now', len(candidates))
        for tree, option in candidates:
            self._parsed_trees.append((option, tree, id))
    print(len(seqs), len(self._parsed_trees))
    output_files = []
    for i, (option, tree, ind) in enumerate(list(self._parsed_trees)):
        rendered = save_tree(tree, None, 'parse{}'.format(i), postscript=False,
                             prob=tree.prob(), csb_id=ind)
        output_files.append(rendered)
    merge_pdfs(output_files, 'merged_parse.pdf')
def parse(parser: ViterbiParser, sentence):
    """Parse one tokenized sentence, printing each tree and the elapsed time."""
    started = time.time()
    parser.trace(trace=1)
    for tree in parser.parse(sentence):
        print(tree)
    print(
        f"Time elapsed for sentence of length {len(sentence)}: {time.time() - started}"
    )
def parsing(sample, g):
    """Viterbi-parse each tokenized sentence in `sample` with grammar `g`,
    printing the sentence and the log-probability of each parse found.

    @param sample: iterable of token lists.
    @param g: an nltk PCFG.
    """
    from nltk.parse.viterbi import ViterbiParser
    from nltk.draw.tree import draw_trees
    parser = ViterbiParser(g)
    for s in sample:
        # Fix: converted Python 2 print statements to Python 3 calls,
        # consistent with the rest of the file.
        print(" ".join(s))
        # Fix: ViterbiParser.parse returns an iterator of trees, not a single
        # tree-or-None, so iterate instead of truth-testing the result.
        for t in parser.parse(s):
            print(t.logprob())
def parse_treebank(parser: ViterbiParser, sentences, limit=3):
    """Parse the first `limit` treebank sentences, printing trees and timings.

    @param parser: a trained ViterbiParser.
    @param sentences: treebank file ids accepted by `treebank.parsed_sents`.
    @param limit: how many sentences to parse [default: 3, matching the
        original hard-coded slice — now generalized as a parameter].
    """
    start_time = time.time()
    parser.trace(trace=1)
    for sentence in treebank.parsed_sents(sentences[:limit]):
        # Use the gold tree's leaves as the token input.
        tokens = sentence.leaves()
        for tree in parser.parse(tokens):
            print(tree)
        print(
            f"Time elapsed for sentence of length {len(tokens)}: {time.time() - start_time}"
        )
def test_PCFG(grammar, shapes=False):
    '''
    Test whether the grammar can parse a sentence
    '''
    if shapes:
        sent = "in the middle center is a green square".split()
    else:
        sent = "2 2 2 12 2 12 2 2 12 2".split()
    parser = ViterbiParser(grammar)
    # Draw every parse tree the grammar admits for the canned sentence.
    for tree in parser.parse(sent):
        tree.draw()
def run_parser(corpus):
    """
    Runs the parser on a corpus.

    @param corpus: List of lists with input tokens
    """
    for sentence in corpus:
        parser = Parser(getGrammar(sentence))
        tree = parser.parse(splitSentence(sentence))
        extractDepParse(tree, sentence)
def sanity_test():
    """Unit Test to make sure this stuff is working.

    Builds the teacher PCFG for problem 1 and CKY-parses a tiny
    program; this function should NOT break.
    """
    from ..rubric_utils.load_params import (
        get_pcfg_params,
        get_pcfg_path,
        get_codeorg_data_root,
    )
    data_root = get_codeorg_data_root(1, 'raw')
    theta = get_pcfg_params(1, author='teacher', random=False)
    cfg_path = get_pcfg_path(1, author='teacher')
    data, _counts = load_real_asts(data_root, 1, True)
    domain = get_integer_domain(data)
    # CKY parser for p-cfgs...
    parser = ViterbiParser(build_pcfg(cfg_path, theta, domain, False))
    tree = next(parser.parse(['Move', '(', '50', ')']))
    # print(tree.logprob())
    print(tree)
def analyse_viterbi(pcfg, messages):
    """Infer the Viterbi parses of a set of messages and evaluate coverage.

    Computes per-message log2 likelihoods, tree depths, failed parses,
    parse coverage, and the mean log2 likelihood of the parsed messages.

    @param pcfg: an nltk PCFG.
    @param messages: iterable of messages; each message is an iterable of symbols.
    @return eval_stats: dict with keys 'log2likelihoods', 'unparsed_count',
        'parsed_count', 'failedparses', 'coverage', 'average_log2likelihood'.
    """
    # Terminals are the rhs symbols of lexical productions (rhs starts with a str).
    terminals = {
        prod.rhs()[0]
        for prod in pcfg.productions()
        if isinstance(prod.rhs()[0], str)
    }

    parser = ViterbiParser(pcfg)
    message_count = len(messages)
    trees = []
    tree_depths = []
    logprobs = []
    failed_parses = []
    for sent in messages:
        sent = list(sent)
        # Only attempt a parse when every symbol is in the grammar's lexicon;
        # otherwise ViterbiParser would raise on the unknown token.
        if all(sym in terminals for sym in sent):
            tree_list = list(parser.parse(sent))
        else:
            tree_list = []
        if len(tree_list) == 1:
            # Parsed: Viterbi yields (at most) the single best tree.
            tree = tree_list[0]
            trees.append(to_parse_string(tree))
            tree_depths.append(tree_depth(tree))
            # Convert the tree's natural log to log base 2 for description length.
            logprobs.append(tree.logprob() / np.log(2))
        else:
            # Unified NO_PARSE branch (the original duplicated this block).
            logprobs.append(None)
            tree_depths.append(None)
            failed_parses.append(sent)

    # Final statistics over successfully parsed messages only.
    parsed_logprobs = list(ignore_none(logprobs))
    parsed_count = len(parsed_logprobs)
    unparsed_count = message_count - parsed_count

    eval_stats = {
        'log2likelihoods': logprobs,  # corresponds to {data: frequencies}
        'unparsed_count': unparsed_count,
        'parsed_count': parsed_count,
        'failedparses': failed_parses,
    }

    # Evaluation coverage (guard against an empty message set).
    coverage = parsed_count / message_count if message_count else float('nan')
    eval_stats['coverage'] = coverage * 100
    # Fix: the original called mean() on a list containing Nones (raises);
    # average over the filtered values, with NaN when nothing parsed.
    eval_stats['average_log2likelihood'] = (
        mean(parsed_logprobs) if parsed_logprobs else float('nan')
    )
    return eval_stats
def parse(parser: ViterbiParser, sentence):
    """Yield every parse tree the parser produces for *sentence*."""
    yield from parser.parse(sentence)