def create_taskgrammar(grammar, task, encoders):
    """Derive a task-specific CFG from `grammar`, save it and return it.

    The productions of the nonterminal named `task` become productions of a
    fresh start symbol ``S``.  When such a production begins with a
    nonterminal whose symbol ends in ``_TASK``, the productions of that
    nonterminal are used in its place.  Every grammar production whose
    left-hand side is referenced by an already selected rule is then pulled
    in; an ``ENCODERS`` production is replaced by one that lists only the
    given `encoders` (or the terminal ``'E'`` when `encoders` is empty).

    The productions of the resulting grammar are written, one per line, to
    ``TASK_GRAMMAR_PATH``.
    """
    logger.info('Creating specific grammar for task %s' % task)
    root = Nonterminal('S')
    selected = []

    # Seed the rule list with the start productions for this task.
    for task_rule in grammar.productions(Nonterminal(task)):
        head = task_rule.rhs()[0]
        if is_nonterminal(head) and head.symbol().endswith('_TASK'):
            selected.extend(
                Production(root, sub_rule.rhs())
                for sub_rule in grammar.productions(head)
            )
        else:
            selected.append(Production(root, task_rule.rhs()))

    # NOTE: `selected` grows while it is being scanned, so rules appended
    # inside the inner loop are themselves visited by that same loop.
    for candidate in grammar.productions():
        for chosen in selected:
            owner = candidate.lhs()
            if owner not in chosen.rhs() or candidate in selected:
                continue
            if owner.symbol() != 'ENCODERS':
                selected.append(candidate)
            elif len(encoders) > 0:
                # Use encoders only for types of features in the dataset
                selected.append(
                    Production(owner, [Nonterminal(e) for e in encoders]))
            else:
                selected.append(Production(owner, ['E']))

    task_grammar = CFG(root, selected)

    with open(TASK_GRAMMAR_PATH, 'w') as fout:
        fout.write('\n'.join([str(rule) for rule in task_grammar.productions()]))

    return task_grammar
def get_all_pronouns(tree):
    """Return the pronoun strings that appear in `tree`.

    A production contributes its first right-hand-side element when its
    left-hand side equals the ``PRP`` or ``PossPro`` nonterminal and that
    first element is exactly a ``str``.
    """
    personal = Nonterminal('PRP')
    possessive = Nonterminal('PossPro')
    return [
        rule._rhs[0]
        for rule in tree.productions()
        if (rule._lhs == personal or rule._lhs == possessive)
        and type(rule._rhs[0]) is str
    ]
def reinsert_unary_chains(tree, old_grammar):
    """Put unary chains from `old_grammar` back into `tree`, in place.

    For every ``Tree`` node (at most two children each), the function looks
    for a chain of non-lexical unary productions of `old_grammar` leading
    from the node's label to a symbol that has a production onto the node's
    children.  The intermediate symbols of that chain are inserted as new
    single-child nodes between the node and its original children.
    """
    unary_rules = [
        rule for rule in old_grammar.productions()
        if len(rule) == 1 and rule.is_nonlexical()
    ]

    pending = [tree]
    while pending:
        current = pending.pop()
        if not isinstance(current, Tree):
            continue
        assert len(current) <= 2

        original_children = current.copy()
        child_symbols = [
            child if isinstance(child, str) else Nonterminal(child.label())
            for child in current
        ]

        # Breadth-first search over unary chains starting at this label;
        # `frontier` is the FIFO queue of chains still to be tried.
        frontier = []
        chain = [Nonterminal(current.label())]
        while Production(chain[-1], child_symbols) not in old_grammar.productions():
            frontier.extend(
                chain + [rule.rhs()[0]]
                for rule in unary_rules
                if rule.lhs() == chain[-1]
            )
            chain = frontier.pop(0)

        # Once a chain has been found, add it back in.
        current[0:] = []  # remove children
        tail = current
        for symbol in chain[1:]:
            link = Tree(symbol.symbol(), [])
            tail[0:] = [link]
            tail = link
        tail[0:] = list(original_children)

        pending.extend(tail)
def main(args):
    """Run CYK on ``args.sentence`` with a grammar specialised to that sentence.

    The sentence is lower-cased (and written back to ``args.sentence``) and
    split on whitespace.  The loaded grammar then has its ``HS`` and ``ES``
    productions removed, the rules returned by ``getTerminalProbability``
    added, and one ``TOKEN -> 'token'`` production of probability 1 added
    per input token.  The edited production list is serialised to text and
    re-read with ``PCFG.fromstring`` so the grammar object is rebuilt from
    the new rules before ``CYK`` is called.

    NOTE(review): uses ``unicode``, so this block targets Python 2.
    """
    sentence = args.sentence.lower()
    args.sentence = sentence
    tokens = sentence.split()

    grammar = loadGrammar(args)
    nonterm = getnonterm(grammar)
    terminalProductionRules = getTerminalProbability(args, grammar, nonterm)

    # Remove the HS rules first, then the ES rules, from the live list.
    for symbol in ('HS', 'ES'):
        for rule in grammar.productions(Nonterminal(symbol)):
            grammar.productions().remove(rule)

    grammar.productions().extend(terminalProductionRules)
    for token in tokens:
        grammar.productions().append(
            ProbabilisticProduction(
                Nonterminal(token.upper()), [unicode(token)], prob=1))

    # Serialise every production as "LHS -> sym sym  [p]"; terminals are
    # single-quoted.  Lines are collected and joined once instead of being
    # concatenated onto a growing string.
    lines = []
    for p in grammar.productions():
        rhs_text = ''.join(
            ("'" + str(r) + "' ") if is_terminal(r) else (str(r) + ' ')
            for r in p.rhs()
        )
        lines.append(
            str(p.lhs()) + ' -> ' + rhs_text
            + ' [' + '{0:.8f}'.format(p.prob()) + ']')

    grammar = PCFG.fromstring('\n'.join(lines))

    CYK(tokens, nonterm, grammar)
def test_production_from_grammar(self):
    """Check which productions ``parse_cfg`` yields for a toy grammar.

    Productions built from ``Nonterminal`` objects and bare terminal
    strings must be present; look-alikes built from plain strings on the
    left-hand side, or from terminals that still carry their quote
    characters, must be absent.
    """
    grammar_str = """
    S -> NP VP
    PP -> P NP
    NP -> Det N | NP PP
    VP -> V NP | VP PP
    Det -> 'a' | 'the'
    N -> 'dog' | 'cat'
    V -> 'chased' | 'sat'
    P -> 'on' | 'in'
    """
    grammar = parse_cfg(grammar_str)
    productions = grammar.productions()

    found_msg = "Expect to find '{}', but can not see in \n{}"
    # The negative checks get their own message; previously they reused the
    # "Expect to find ..." text of the preceding positive check.
    absent_msg = "Expect NOT to find '{}', but it is present in \n{}"

    expected = (
        Production(lhs=Nonterminal("S"),
                   rhs=[Nonterminal("NP"), Nonterminal("VP")]),
        Production(lhs=Nonterminal("N"), rhs=['dog']),
    )
    for production in expected:
        self.assertIn(production, productions,
                      found_msg.format(production, grammar_str))

    unexpected = (
        Production(lhs="S", rhs=["NP", "VP"]),
        Production(lhs=Nonterminal("N"), rhs=["'dog'"]),
    )
    for production in unexpected:
        self.assertNotIn(production, productions,
                         absent_msg.format(production, grammar_str))
def traverse(self, node):
    """Return ``(log_prob, rule_count)`` for the subtree rooted at `node`.

    A node of height 2 contributes ``(0.0, 0)``.  Otherwise the production
    ``label -> child labels`` is looked up in ``self.grammar``: its log
    probability is used when found, ``log(eps)`` when not.  The node counts
    as one rule, and the results of all children are added on top.
    """
    assert (self.grammar != None)
    if node.height() == 2:
        return (0.0, 0)

    lhs = Nonterminal(node.label())
    candidates = self.grammar.productions(lhs)
    child_symbols = tuple(Nonterminal(child.label()) for child in node)

    log_prob = 0.0
    for rule in candidates:
        if rule.lhs() == lhs and rule.rhs() == child_symbols:
            log_prob += math.log(rule.prob())
            break
    else:
        # No matching production: fall back to the smoothing constant.
        log_prob += math.log(eps)

    rule_count = 1
    for child in node:
        child_prob, child_count = self.traverse(child)
        log_prob += child_prob
        rule_count += child_count
    return (log_prob, rule_count)
def test_current_production(self):
    """``current_production`` of a parsed tree is the production at its root."""
    cases = [
        (
            " (S (sentence (type_1_sentence_coord_1 (type_1_sentence_coord_2 "
            "(type_2_sentence (THERE There) (AUX is) (Noun_Phrase (det (DET an)) "
            "(Noun_w_support (Adj_phrase (Adj_core (JJ small)) (AND and) "
            "(Adj_phrase (Adj_core (JJ red)))) (Noun_Count (NN apple))))))) "
            "(PERIOD .))) ",
            Production(Nonterminal("S"), [Nonterminal("sentence")]),
        ),
    ]
    for source, expected in cases:
        parsed = Tree.parse(source)
        self.assertEqual(expected, current_production(parsed))
def generate_grammar_and_parsers(parsed_sents):
    """Build a CFG Earley parser and a PCFG chart parser from parsed sentences.

    :param parsed_sents: iterable of parse trees exposing ``productions()``.
    :return: ``(cfg_earley_parser, pcfg_pchart_parser)``.

    The CFG is built from the distinct productions; the PCFG is induced
    from the full list, because repetitions carry the frequency
    information.  Both grammars are printed.
    """
    # Every production of every tree, repetitions included.
    tbank_productions_with_repet = [
        production
        for parsed_sent in parsed_sents
        for production in parsed_sent.productions()
    ]
    # De-duplicate while keeping first-seen order: a plain set would hand
    # CFG an unordered collection and make the rule order vary per run.
    tbank_productions = list(dict.fromkeys(tbank_productions_with_repet))
    print("Num. of unique productions read:", len(tbank_productions))

    # Build a CFG from the productions
    print("\nBuilding a CFG...")
    cfg_grammar = CFG(Nonterminal('S'), tbank_productions)
    print(cfg_grammar, end="\n\n")

    # CFG - An Earley parser
    cfg_earley_parser = EarleyChartParser(cfg_grammar, trace=3)

    # Build a PCFG from the productions (here repetitions are needed!)
    print("Building a PCFG...")
    pcfg_grammar = induce_pcfg(Nonterminal('S'), tbank_productions_with_repet)
    print(pcfg_grammar, end="\n\n")

    # Bottom-up chart parser for the PCFG;
    # see: http://www.nltk.org/_modules/nltk/parse/pchart.html
    pcfg_pchart_parser = InsideChartParser(pcfg_grammar)

    return cfg_earley_parser, pcfg_pchart_parser
def Parse(self, sent):
    """
    Implement the CKY algorithm for PCFGs, populating the dynamic
    programming table with log probabilities of every constituent spanning
    a sub-span of a given test sentence (i, j) and storing the appropriate
    back-pointers.

    :param sent: list of tokens; it is NOT modified (a padded copy is used).
    :return: whatever ``self.BuildTree`` returns for the span covering the
        whole sentence with root ``S``.
    """
    # Pad a copy with a " " token so the loop below runs one position past
    # the last real word; appending to `sent` itself would change the
    # caller's list and break a second call with the same list.
    tokens = list(sent) + [" "]
    dynamic_table = defaultdict(float)
    backpointers = defaultdict(tuple)

    for j in range(0, len(tokens)):
        # Pre-terminal entries for the token at position j.
        for rule in self._r2l_lex[(tokens[j],)]:
            dynamic_table[(j, j + 1, rule.lhs())] = log(rule.prob())

        for i in range(j - 1, -1, -1):
            for k in range(i + 1, j):
                # Symbols already derived over (i, k) and over (k, j).
                left_symbols = []
                right_symbols = []
                for key in dynamic_table:
                    if key[0] == i and key[1] == k:
                        left_symbols.append(key[2])
                    if key[0] == k and key[1] == j:
                        right_symbols.append(key[2])

                for b in left_symbols:
                    for c in right_symbols:
                        for rule in self._r2l[(b, c)]:
                            target = (i, j, rule.lhs())
                            score = (log(rule.prob())
                                     + dynamic_table[(i, k, rule.rhs()[0])]
                                     + dynamic_table[(k, j, rule.rhs()[1])])
                            # Keep the best-scoring analysis per (span, symbol).
                            if target not in dynamic_table or dynamic_table[target] < score:
                                dynamic_table[target] = score
                                backpointers[target] = (k, rule.rhs()[0], rule.rhs()[1])

    root = (0, len(tokens) - 1, Nonterminal("S"))
    # Debug output for one specific sentence.
    if tokens == ["Terms", "were", "n't", "disclosed", ".", " "]:
        print(dynamic_table[root])
    return self.BuildTree(dynamic_table, tokens, backpointers, root)
def BuildTree(cky_table, sent):
    """Return the tree rooted in ``S`` over the whole sentence, or ``None``.

    ``None`` is returned when ``S`` is not among the keys of
    ``cky_table[0][len(sent) - 1]``; otherwise the work is delegated to
    ``BuildTreeHelper``.
    """
    start = Nonterminal("S")
    last = len(sent) - 1
    if start in cky_table[0][last].keys():
        return BuildTreeHelper(cky_table, sent, 0, last, start)
    # The full span cannot be derived from S.
    return None
def convert(tree):
    """Convert an ``nltk.tree.Tree`` (recursively) into an ``AnnotatedTree``.

    Anything that is not an NLTK tree is wrapped as a child-less
    ``AnnotatedTree`` whose symbol is the value itself.
    """
    if not isinstance(tree, nltk.tree.Tree):
        return AnnotatedTree(symbol=tree)

    node_symbol = Nonterminal(tree.label())
    converted_children = [convert(child) for child in tree]
    node_rule = Production(Nonterminal(tree.label()), _child_names(tree))
    return AnnotatedTree(
        symbol=node_symbol,
        children=converted_children,
        rule=node_rule,
        rule_selection_id=_find_rule_selection_id(node_rule),
    )
def terminal_distance(grammar, x):
    """Return the terminal distance for the symbol descriptor `x`.

    Due to masking that enforces a minimal ring length, the distances for
    the ring-related tokens override the values derived purely from the
    grammar; every other token is delegated to ``grammar.terminal_dist``.
    """
    token = x['token']
    if token == Nonterminal('aliphatic_ring'):
        return 8
    if token == Nonterminal('cycle_bond'):
        return max(2, 7 - x['ring_size'])
    if token == Nonterminal('cycle_double_bond'):
        # need to go at least to cycle_bond -> num1 -> number
        return max(3, 7 - x['ring_size'])
    return grammar.terminal_dist(token)
def str2production(str):
    """Build a ``Production`` from text of the form ``"LHS -> sym sym ..."``.

    The text is split at the first ``->``; the right-hand side is split on
    whitespace.  The result wraps both an NLTK production (every symbol as
    a ``Nonterminal``) and the plain-string left- and right-hand sides.

    Note: the parameter is named ``str`` and shadows the builtin here.
    """
    head_text, _, body_text = str.partition('->')
    head_text = head_text.strip()
    body_symbols = [symbol.strip() for symbol in body_text.split()]

    nltk_rule = nltk.grammar.Production(
        Nonterminal(head_text),
        [Nonterminal(symbol) for symbol in body_symbols],
    )
    return Production(nltk_rule, head_text, body_symbols)
def BuildTree(cky_table, sent):
    """
    Follow the back-pointers from the largest span ``(0, len(sent))`` down
    to smaller sub-spans ``(i, k)``, ``(k, j)`` until the preterminal level
    ``(i, i+1)`` is reached, and return the resulting tree.

    Returns ``None`` when ``S`` is not present in the table entry for the
    full span.
    """
    end = len(sent)
    if Nonterminal('S') in cky_table[(0, end)]:
        return InvertedGrammar.recursive_build(
            cky_table, sent, Nonterminal("S"), 0, end)
    return None
def __init__(self, grammar=grammar_zinc_new, checks=False):
    """Pre-compute ``self.term_dist`` for the symbols reachable from ``smiles``.

    ``self.term_dist`` maps a ``frozendict`` symbol descriptor (at least a
    ``'token'`` key) to a number.  Terminals get 0; the root ``smiles``
    starts at infinity.  The table is then relaxed repeatedly: for every
    symbol with a positive distance, each production allowed by
    ``self.get_mask_from_token`` is applied with ``apply_rule`` and the
    candidate distance is 1 plus the sum of the distances of the resulting
    symbols; the smallest candidate is kept.  Iteration stops when a full
    pass leaves the table unchanged.

    :param grammar: object exposing ``GCFG`` (a grammar with
        ``productions()``).
    :param checks: forwarded to ``apply_rule``; when true, also asserts that
        each mask allows at least one production.
    """
    # self.mask_gen = get_mask_gen()
    # self.mask_gen.do_terminal_mask = False
    self.term_dist = {}
    self.d_term_dist = {}  # initialised empty; not filled in this method
    self.grammar = grammar
    self.GCFG = self.grammar.GCFG
    self.checks = checks

    for p in self.GCFG.productions():
        for s in p.rhs():
            if is_terminal(s):
                # terminals have term distance 0
                self.term_dist[frozendict({'token': s})] = 0

    # the 'None' nonterminal is treated like a terminal: distance 0
    self.term_dist[frozendict({'token': Nonterminal('None')})] = 0

    # seed the search with the root symbol
    self.term_dist[frozendict({'token': Nonterminal('smiles')})] = float('inf')

    while True:  # iterate to convergence
        # print('*** and one more pass... ***')
        # Scan a snapshot so new symbols can be added to the live table.
        last_term_dist = copy.copy(self.term_dist)
        for sym in last_term_dist.keys():
            if is_terminal(sym['token']):
                self.term_dist[sym] = 0
            if self.term_dist[sym] > 0:
                mask = self.get_mask_from_token(sym)
                # [p for ip, p in enumerate(self.GCFG.productions()) if mask[ip]]
                if self.checks:
                    assert (not all([x == 0 for x in mask]))
                for ip, p in enumerate(self.GCFG.productions()):
                    if mask[ip]:
                        # print('trying', sym, p)
                        this_exp = apply_rule([sym], 0, p, None, self.checks)
                        # one step for applying p, plus the cost of each result
                        this_term_dist = 1
                        for this_sym in this_exp:
                            if frozendict(this_sym) not in self.term_dist:
                                # first sighting: unknown distance for now
                                self.term_dist[frozendict(this_sym)] = float('inf')
                                print('added ', this_sym, 'from', sym, 'via', p)
                            # if 'ring_size' in sym and sym['ring_size'] > 6:
                            #     print('aaa')
                            this_term_dist += self.term_dist[frozendict(this_sym)]
                        if this_term_dist < self.term_dist[frozendict(sym)]:
                            # if 'ring_size' in sym and sym['ring_size'] > 6:
                            #     print('aaa')
                            print('improving:', p, self.term_dist[frozendict(sym)], this_term_dist, [self.term_dist[frozendict(this_sym)] for this_sym in this_exp])
                            self.term_dist[frozendict(sym)] = this_term_dist

        # fixed point reached: nothing was added or improved in this pass
        if last_term_dist == self.term_dist:
            break
def read_productions(self, productions_filename):
    """Load a PCFG from a UTF-8 text file into ``self.grammar``.

    Each non-blank line has three ``+``-separated fields: the left-hand
    side symbol, the space-separated right-hand side symbols, and the
    probability.  Every symbol is wrapped in a ``Nonterminal``.  The start
    symbol of the resulting grammar is ``S``.

    :param productions_filename: path of the file to read.
    :raises IndexError: if a non-blank line has fewer than three fields.
    :raises ValueError: if the probability field is not a number.
    """
    productions = []
    with io.open(productions_filename, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (typically a trailing newline at the end of
                # the file) carry no rule; skip them instead of crashing.
                continue
            components = line.split(u'+')
            lhs = Nonterminal(components[0])
            rhs = tuple(
                Nonterminal(nt.strip()) for nt in components[1].split(u' ')
            )
            prob = float(components[2])
            productions.append(ProbabilisticProduction(lhs, rhs, prob=prob))
    self.grammar = PCFG(Nonterminal('S'), productions)
def train():
    """Induce a PCFG from 80% of the treebank and evaluate CKY on the rest.

    For every held-out sentence the predicted tree is built from the CKY
    back-pointers, cleaned, un-binarised and scored against the gold tree.
    Precision, recall and F1 are printed per sentence and, averaged over
    ALL held-out sentences (failed ones count as zero), at the end.
    """
    files = tb.fileids()
    data = list(tb.parsed_sents(files))

    # 80:20 split
    split = int(len(data) * 0.8)
    train_data = data[:split]
    test_data = data[split:]

    P_grammar, P_non_terms, P_vocab, P_term_parents, P_parents_count = pcfg.pcfg(
        train_data)

    total_precision = 0
    total_recall = 0
    total_f1_score = 0

    for i, test in enumerate(test_data):
        print('Test', i)
        try:
            words = test.leaves()
            # The parser gets copies so the induced grammar stays intact.
            scores, backs = cky_parsing(words, copy(P_grammar),
                                        copy(P_non_terms), copy(P_vocab),
                                        copy(P_term_parents),
                                        copy(P_parents_count))

            start = Tree(Nonterminal('S'), [])
            if scores[0][len(words)][Nonterminal('S')] == 0:
                # No S over the whole sentence: pick another root.
                start = get_start(scores, len(words))

            predicted_tree = build_tree(start, 0, len(words), backs,
                                        P_non_terms)
            clean_tree(predicted_tree)
            predicted_tree.un_chomsky_normal_form()

            precision, recall, f1_score = evaluate(words, predicted_tree, test)
            print(precision, recall, f1_score)

            total_precision += precision
            total_recall += recall
            total_f1_score += f1_score
        except Exception:
            # Best-effort evaluation: a sentence that cannot be parsed or
            # scored is reported and skipped.  `Exception` (not a bare
            # `except`) keeps Ctrl-C and SystemExit working.
            print('***************Failed', i)
            continue

    total_precision /= len(test_data)
    total_recall /= len(test_data)
    total_f1_score /= len(test_data)

    print('Precision', total_precision)
    print('Recall', total_recall)
    print('F1_score', total_f1_score)
def pcfg_bcl(C, alpha=ALPHA, gd_thr=LPG_DIFF_THRESHOLD, mc_thr=MC_THRESHOLD):
    """Learn a PCFG from the corpus `C` by iterated biclustering.

    The three keyword arguments overwrite the module-level settings
    ``ALPHA``, ``LPG_DIFF_THRESHOLD`` and ``MC_THRESHOLD``; the symbol
    counters and the ``ignore_mc_ec`` flag are reset as well.  Starting
    from a grammar with the single rule ``_START_ -> ""``, learning and
    attaching steps alternate until the table is finished or no further
    rule is found.  The post-processed grammar is printed and returned.
    """
    global ALPHA, LPG_DIFF_THRESHOLD, MC_THRESHOLD
    global and_symb_count, or_symb_count, ignore_mc_ec

    print("\ninitializing...")

    ALPHA, LPG_DIFF_THRESHOLD, MC_THRESHOLD = alpha, gd_thr, mc_thr
    and_symb_count = or_symb_count = 0
    ignore_mc_ec = False

    # create an empty grammar G
    start = Nonterminal("_START_")
    G = PCFG(start, [ProbabilisticProduction(start, [""], prob=1.)])

    # create a table T
    T = _create_t(C)

    # repeat until no further rule to be learned
    iteration = 0
    while True:
        if _finished(T):
            break
        iteration += 1
        print("\niter. n° %d" % (iteration,))
        found, G, C, T, N = _learning_by_biclustering(G, C, T)
        if not found:
            print("NO MORE RULES CAN BE LEARNED")
            break
        G, C, T = _attaching(N, G, C, T)

    G = _postprocessing(G, C)
    print("\n", G)  # DEBUG
    return G
def binarize(grammar):
    """Return a copy of `grammar` in which no right-hand side exceeds two symbols.

    A longer rule ``A -> X1 X2 ... Xn`` is unrolled into a chain of binary
    rules linked by new nonterminals named ``A|<Xk-...-Xn>`` (terminals are
    prefixed with ``@`` in these names).  Shorter rules are kept as is.
    """
    binarized = []
    for rule in grammar.productions():
        rhs = rule.rhs()
        if len(rhs) <= 2:
            binarized.append(rule)
            continue

        # this rule needs to be broken down
        names = [
            '@' + sym if isinstance(sym, str) else sym.symbol()
            for sym in rhs
        ]
        head = rule.lhs()
        for k in range(1, len(rhs) - 1):
            bridge = Nonterminal(
                rule.lhs().symbol() + '|<' + '-'.join(names[k:]) + '>')
            binarized.append(Production(head, (rhs[k - 1], bridge)))
            head = bridge
        # the last link carries the final two original symbols
        binarized.append(Production(head, rhs[-2:]))

    return CFG(grammar.start(), binarized)
def convert_hybrid(grammar):
    '''
    Rewrite rules whose right-hand side has more than one symbol and
    contains a terminal, e.g. [A -> 'b' C], into [A -> B C] & [B -> 'b'],
    where B is a dummy nonterminal named after the terminal.
    All other rules are copied unchanged into the new grammar.
    '''
    converted = []
    for rule in grammar.productions():
        body = rule.rhs()
        if not (rule.is_lexical() and len(body) > 1):
            converted.append(rule)
            continue

        symbols = []
        for item in body:
            if is_terminal(item):
                proxy = Nonterminal(item)
                symbols.append(proxy)
                # lexical rule for the dummy nonterminal
                converted.append(Production(proxy, (item, )))
            else:
                symbols.append(item)
        # the original rule, now with nonterminals only on its rhs
        converted.append(Production(rule.lhs(), tuple(symbols)))

    return CFG(grammar.start(), converted)
def parse(self, tokens):
    """Parse `tokens`, first adding low-probability rules for unknown words.

    Every token without an entry in the grammar's lexical index gets a
    production ``POS -> token`` (POS from ``nltk.pos_tag``) with
    probability 0.000001; the grammar indexes are rebuilt if anything was
    added.  Returns a tree, or ``None`` when the parent parser yields
    nothing (or returns an unexpected type, which is reported via
    ``error``).
    """
    grammar = self._grammar
    unknown = [
        (tok, pos)
        for tok, pos in nltk.pos_tag(tokens)
        if not grammar._lexical_index.get(tok)
    ]
    for tok, pos in unknown:
        grammar._productions.append(
            ProbabilisticProduction(Nonterminal(pos), [tok], prob=0.000001))
    if unknown:
        grammar._calculate_indexes()

    # may return a generator, in which case 'next' yields the ProbabilisticTree
    result = super(PCFGViterbiParser, self).parse(tokens)

    if isinstance(result, nltk.tree.Tree):
        print('returning a tree')
        return result

    if isinstance(result, types.GeneratorType):
        try:
            return next(result)
        except StopIteration:
            print(u'Couldn\'t parse {}'.format(' '.join(tokens)))
            return None

    error("Type of tree is: {}".format(type(result)))
def code_to_sample(code, grammar, items=[Nonterminal("S")]):
    """Rebuild an expression and its productions from a parse-tree encoding.

    Input:
        code - parse tree encoding in string format; each character is the
            index of the production chosen for the next nonterminal
        grammar - PCFG object that was used to generate the code
        items - symbols still to be expanded. Default: [Nonterminal("S")]
            (the default list is shared between calls but never modified)
    Output:
        frags - expression in list form. Call "".join(frags) to get string.
        productions - list of used productions, ordered top to bottom,
            left to right.
        code0 - the unconsumed rest of `code`; empty after a complete
            top-level call. If not, something went wrong.
    """
    remaining = code
    frags = []
    productions = []

    if len(items) != 1:
        # Expand a sequence one symbol at a time, threading the code through.
        for item in items:
            frag, used, remaining = code_to_sample(remaining, grammar, [item])
            frags.extend(frag)
            productions.extend(used)
    elif isinstance(items[0], Nonterminal):
        prod = grammar.productions(lhs=items[0])[int(remaining[0])]
        productions.append(prod)
        frag, used, remaining = code_to_sample(remaining[1:], grammar, prod.rhs())
        frags.extend(frag)
        productions.extend(used)
    else:
        # A terminal is emitted as is.
        frags.append(items[0])

    return frags, productions, remaining
def demo2():
    """Open ``CFGDemo`` on a small English grammar and a sample sentence."""
    from nltk import Nonterminal, Production, CFG

    S, VP, NP, PP, P, N, Name, V, Det = map(
        Nonterminal, "S VP NP PP P N Name V Det".split())

    rule_table = [
        # Syntactic Productions
        (S, [NP, VP]),
        (NP, [Det, N]),
        (NP, [NP, PP]),
        (VP, [VP, PP]),
        (VP, [V, NP, PP]),
        (VP, [V, NP]),
        (PP, [P, NP]),
        (PP, []),
        (PP, ["up", "over", NP]),
        # Lexical Productions
        (NP, ["I"]),
        (Det, ["the"]),
        (Det, ["a"]),
        (N, ["man"]),
        (V, ["saw"]),
        (P, ["in"]),
        (P, ["with"]),
        (N, ["park"]),
        (N, ["dog"]),
        (N, ["statue"]),
        (Det, ["my"]),
    ]
    productions = tuple(Production(lhs, rhs) for lhs, rhs in rule_table)

    words = "I saw a man in the park".split()
    CFGDemo(CFG(S, productions), words).mainloop()
def demo2():
    """Open ``CFGDemo`` on a small English grammar and a sample sentence.

    Works with both NLTK naming schemes: the grammar class is imported as
    ``CFG`` when available and as ``ContextFreeGrammar`` otherwise.
    """
    from nltk import Nonterminal, Production
    try:
        from nltk import CFG
    except ImportError:
        # Older NLTK releases only provide the class under its long name.
        from nltk import ContextFreeGrammar as CFG

    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s)
                                           for s in nonterminals.split()]

    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        Production(PP, []),
        Production(PP, ['up', 'over', NP]),

        # Lexical Productions
        Production(NP, ['I']),
        Production(Det, ['the']),
        Production(Det, ['a']),
        Production(N, ['man']),
        Production(V, ['saw']),
        Production(P, ['in']),
        Production(P, ['with']),
        Production(N, ['park']),
        Production(N, ['dog']),
        Production(N, ['statue']),
        Production(Det, ['my']),
    )

    grammar = CFG(S, productions)

    text = 'I saw a man in the park'.split()
    d = CFGDemo(grammar, text)
    d.mainloop()
def productions(self):
    """Return this node's production followed by those of its ``Tree`` children.

    The node's own production maps its label to ``self.children_name()``;
    children that are not ``Tree`` instances contribute nothing.
    """
    collected = [Production(Nonterminal(self._label), self.children_name())]
    for child in self._child:
        if isinstance(child, Tree):
            collected += child.productions()
    return collected
def process_hybrid_productions(productions):
    """Find multi-symbol productions containing terminals and their replacements.

    For each production with more than one right-hand-side symbol, every
    terminal is replaced by a dummy nonterminal named after it, and a unit
    production ``dummy -> terminal`` is recorded.  Nothing is modified in
    place; the caller gets two lists back.

    :return: ``(to_remove, to_add)`` - the affected original productions,
        and the unit productions plus rewritten productions replacing them.
    """
    replacements = []
    originals = []
    for rule in productions:
        body = rule.rhs()
        if len(body) <= 1:
            continue

        rewritten = []
        found_terminal = False
        for symbol in body:
            if not is_terminal(symbol):
                rewritten.append(symbol)
                continue
            stand_in = Nonterminal(symbol)  # dummy nonterminal
            replacements.append(Production(stand_in, [symbol]))
            rewritten.append(stand_in)
            found_terminal = True

        if found_terminal:
            # The input is left untouched; removal is up to the caller.
            replacements.append(Production(rule.lhs(), rewritten))
            originals.append(rule)
    return originals, replacements
def if_then_else_demo():
    """
    Run ``RecursiveDescentApp`` on a small arithmetic-expression grammar.

    Despite the function name, the grammar built here is not an
    if-then-else grammar: it describes sums and products of the identifiers
    ``a``, ``b`` and ``c`` with optional parentheses, and the demo parses
    ``a * b + c``.

    The grammar class is imported as ``CFG`` when available and as
    ``ContextFreeGrammar`` otherwise, so both NLTK naming schemes work.
    """
    from nltk.grammar import Nonterminal, Production
    try:
        from nltk.grammar import CFG
    except ImportError:
        # Older NLTK releases only provide the class under its long name.
        from nltk.grammar import ContextFreeGrammar as CFG

    nonterminals = 'E E1 PLUS T T1 TIMES F LPAREN RPAREN ID'
    (E, E1, PLUS, T, T1, TIMES, F, LPAREN, RPAREN,
     ID) = [Nonterminal(s) for s in nonterminals.split()]

    productions = (
        Production(E, [T, E1]),
        Production(E1, [PLUS, T, E1]),
        Production(E1, []),
        Production(T, [F, T1]),
        Production(T1, [TIMES, F, T1]),
        Production(T1, []),
        Production(F, [LPAREN, E, RPAREN]),
        Production(F, [ID]),
        Production(PLUS, ['+']),
        Production(TIMES, ['*']),
        Production(LPAREN, ['(']),
        Production(RPAREN, [')']),
        Production(ID, ['a']),
        Production(ID, ['b']),
        Production(ID, ['c']),
    )
    grammar = CFG(E, productions)

    text = "a * b + c".split()
    RecursiveDescentApp(grammar, text).mainloop()
def create_rule_series_helper(nonterminal_list):
    """Return the chain of binary productions covering `nonterminal_list`.

    For symbols ``[A, B, C]`` the result is ``A_B_C -> A B_C`` followed by
    ``B_C -> B C``: each production splits off the first symbol and refers
    to the remaining ones through a nonterminal whose name joins their
    symbols with ``_``.  Fewer than two symbols yield an empty list.
    """
    rules = []
    for start in range(len(nonterminal_list) - 1):
        head = nonterminal_list[start]
        # combine the remaining symbols into one name
        tail_name = '_'.join(
            get_symbol(sym) for sym in nonterminal_list[start + 1:])
        head_name = get_symbol(head) + '_' + tail_name
        rules.append(
            Production(Nonterminal(head_name), (head, Nonterminal(tail_name))))
    return rules
def demo():
    """Open a ``CFGEditor`` window on a small English grammar.

    The callback passed to the editor prints the grammar it receives.  The
    grammar text is parsed with ``CFG.fromstring`` when available and with
    the older ``parse_cfg`` function otherwise.
    """
    try:
        from nltk import CFG
        parse_cfg = CFG.fromstring
    except ImportError:
        # Older NLTK releases expose the parser as a module-level function.
        from nltk import parse_cfg

    grammar = parse_cfg("""
    S -> NP VP
    PP -> P NP
    NP -> Det N
    NP -> NP PP
    VP -> V NP
    VP -> VP PP
    Det -> 'a'
    Det -> 'the'
    Det -> 'my'
    NP -> 'I'
    N -> 'dog'
    N -> 'man'
    N -> 'park'
    N -> 'statue'
    V -> 'saw'
    P -> 'in'
    P -> 'up'
    P -> 'over'
    P -> 'with'
    """)

    def cb(grammar):
        # Single-argument call form: same output on Python 2 and 3.
        print(grammar)

    top = Tk()
    editor = CFGEditor(top, grammar, cb)
    Label(top, text='\nTesting CFG Editor\n').pack()
    Button(top, text='Quit', command=top.destroy).pack()
    top.mainloop()
def __call__(self, last_actions):
    """
    Returns the 'smart' mask

    On the first call the per-sequence symbol lists ``self.S`` are created
    with the root symbol ``smiles`` (plus its terminal distance) and copied
    into ``self.Stree``.  Each action is then passed to
    ``self.process_one_action``, which returns the updated sequence and the
    mask row for it.

    :param last_actions: one action per sequence
    :return: array with one row per action and one column per grammar
        production
    :raises StopIteration: once ``self.t`` has reached ``self.MAX_LEN``
    """
    if self.t >= self.MAX_LEN:
        raise StopIteration("maximum sequence length exceeded for decoder")

    batch = len(last_actions)
    mask = np.zeros([batch, len(self.grammar.GCFG.productions())])

    if self.S is None:
        # populate the sequences with the root symbol
        self.S = [[{'token': Nonterminal('smiles')}] for _ in range(batch)]
        for seq in self.S:
            root = seq[0]
            root['term_dist'] = self.term_dist_calc(root)
        self.Stree = [list(seq) for seq in self.S]

    for row, action in enumerate(last_actions):
        updated_seq, row_mask = self.process_one_action(self.S[row], action)
        self.S[row] = updated_seq
        mask[row, :] = row_mask

    self.t += 1
    self.prev_actions = last_actions
    self.mask = mask
    return mask
def load(self, filepath):
    """Read a CFG file, build an NLTK chart parser and rule lookup tables.

    Besides ``self.cfg_parser`` the following attributes are set:

    - ``head_to_rules``: head nonterminal -> list of right-hand sides, each
      a tuple of ``Nonterminal`` objects and unquoted terminal strings
    - ``valid_tokens``: set of all terminal strings
    - ``rule_ranges``: head -> ``(first, last + 1)`` indices of its rules in
      file order
    - ``first_head``: head of the first rule line
    - ``total_num_rules``: number of right-hand sides read

    :param filepath: path of the grammar file (one ``HEAD -> a | b`` per line).
    """
    # Read through a context manager so the file handle is closed.
    with open(filepath) as grammar_file:
        cfg_string = grammar_file.read()

    # parse from nltk
    cfg_grammar = nltk.CFG.fromstring(cfg_string)
    # self.cfg_parser = cfg_parser = nltk.RecursiveDescentParser(cfg_grammar)
    self.cfg_parser = cfg_parser = nltk.ChartParser(cfg_grammar)

    # our info for rule matching
    self.head_to_rules = head_to_rules = {}
    self.valid_tokens = valid_tokens = set()
    rule_ranges = {}
    total_num_rules = 0
    first_head = None

    for line in cfg_string.split('\n'):
        stripped = line.strip()
        # Skip blank lines and '#' comment lines (NLTK accepts the latter,
        # so they may legitimately appear in the file).
        if len(stripped) == 0 or stripped.startswith('#'):
            continue

        # Split at the first arrow only, so a terminal that itself contains
        # '->' does not break the unpacking.
        head, rules = line.split('->', 1)
        head = Nonterminal(head.strip())  # remove space
        rules = [_.strip() for _ in rules.split('|')]  # split and remove space
        rules = [
            tuple([Nonterminal(_) if not _.startswith("'") else _[1:-1]
                   for _ in rule.split()])
            for rule in rules
        ]
        head_to_rules[head] = rules

        for rule in rules:
            for t in rule:
                if isinstance(t, str):
                    valid_tokens.add(t)

        if first_head is None:
            first_head = head

        rule_ranges[head] = (total_num_rules, total_num_rules + len(rules))
        total_num_rules += len(rules)

    self.first_head = first_head
    self.rule_ranges = rule_ranges
    self.total_num_rules = total_num_rules
def __init__(self, grammar):
    """
    grammar -- a binarised NLTK PCFG.

    Builds two lookup tables from the grammar's productions, both mapping
    a right-hand side to ``{lhs symbol: log probability}``:
    ``prods_lps_lex`` for lexical productions (keyed by the rhs as stored)
    and ``prods_lps_unlex`` for the others (keyed by the tuple of rhs
    symbol names).
    """
    assert grammar.is_binarised()

    self.grammar = grammar
    self.start_sym = grammar.start().symbol()

    self._pi = defaultdict(dict)
    self._bp = defaultdict(dict)

    # Dicts with logprobs of lexical and unlexical productions
    lexical = defaultdict(dict)
    unlexical = defaultdict(dict)
    self.prods_lps_lex = lexical
    self.prods_lps_unlex = unlexical

    for prod in grammar.productions():
        head = prod.lhs().symbol()  # a str
        if prod.is_lexical():
            lexical[prod.rhs()][head] = prod.logprob()
        else:
            names = tuple(sym.symbol() for sym in prod.rhs())  # a tuple of str
            unlexical[names][head] = prod.logprob()
def slade(t, g):
    """Print an integer encoding of the parse tree `t` relative to grammar `g`.

    For every non-string node of `t` (visited in pre-order) whose left-hand
    side occurs more than once among ``noRepeats(t.productions())``, the
    index of the node's right-hand side within the grammar's rules for that
    left-hand side is appended to ``xs``.  The productions, the text
    ``ENCODING`` and ``xs`` are printed, and the tree is drawn.

    NOTE(review): Python 2 only (``print`` statements, ``dict.iteritems``).
    NOTE(review): ``int(...)`` below assumes the symbols of repeated
    left-hand sides are numeric - confirm against the grammars in use.
    """
    # Grammar index: lhs -> list of distinct rhs, in grammar order.
    prods2 = {}
    for p in g.productions():
        if(p.lhs() not in prods2):
            prods2[p.lhs()] = []
        # If rhs is not in the lhs key, add it to the rhs list
        if(p.rhs() not in prods2.get(p.lhs())):
            prods2[p.lhs()].append(p.rhs())
    lcount = {}
    xs = [];
    # Dict with key = lhs, value = list of rhs (built from the tree, not the grammar)
    prods = {}
    # Meant to avoid repeated nonterminals; only referenced by commented-out code
    rhs = set()
    # This set holds every "inside" nonterminal (lhs seen more than once)
    inside = set()
    lastInside = -1
    x = -1
    i = -1
    # Count occurrences of each lhs among the tree's productions
    for p in noRepeats(t.productions()):
        lcount[p.lhs()] = lcount.get(p.lhs(), 0) + 1
        print p
    # add inside nonterminals to the inside set
    for key, value in lcount.iteritems():
        if value > 1:
            inside.add(key)
    # Node positions; despite the variable name, the order requested is pre-order
    postOrder = t.treepositions(order='preorder')
    for n in postOrder:
        if type(t[n]) is not str:
            currentProd = t[n].productions()[0]
            # Add prods to dict where key is lhs and value a list of the rhs
            if(currentProd.lhs() not in prods):
                prods[currentProd.lhs()] = []
            # If rhs is not in the lhs key, add it to the rhs list
            if(currentProd.rhs() not in prods.get(currentProd.lhs())):
                prods[currentProd.lhs()].append(currentProd.rhs())
            # Remember last inside used, to add the +x
            if(currentProd.lhs() in inside):
                lastInside = int(Nonterminal.symbol(currentProd.lhs()))
                # position of this rhs among the grammar's rules for the lhs
                x = prods2[currentProd.lhs()].index(currentProd.rhs())
                xs.append(x)
                i = 0
            else:
                i = prods[currentProd.lhs()].index(currentProd.rhs())
            # # change node name based on Inside
            # if(Nonterminal.symbol(currentProd.lhs()) != 'S'):
            #     if(int(Nonterminal.symbol(currentProd.lhs())) == lastInside + 1):
            #         t[n].node = t[n].node + "(" + str(i) + ")" + " + " + str(x)
            #         xs.append(x);
            #         # t[n].node = t[n].node + str(i) + str(x)
            #     else:
            #         t[n].node = t[n].node + "(" + str(i) + ")"
            #         # t[n].node = t[n].node + str(i)
    # Print parse tree productions
    # # print "Productions: "
    # noRep = noRepeats(t.productions())
    # for p in noRep:
    #     if p.rhs() in rhs:
    #         noRep.remove(p)
    #     rhs.add(p.rhs())
    # total = 0
    # for p in noRep:
    #     total += p.__len__() + 1
    #
    # `p` here is whatever the loops above bound last
    print p
    print "ENCODING"
    print xs
    # draw parse tree
    t.draw()