def nonTerminal(self):
    """Collect the nonterminal symbols of ``self.grammar`` into ``self.non_terminal``.

    For every production, the left-hand side and each right-hand-side symbol
    is added to the set when ``is_nonterminal`` accepts it.
    """
    for production in self.grammar:
        head = production.lhs()
        if is_nonterminal(head):
            self.non_terminal.add(head)
        # Walk the right-hand side directly instead of by index.
        for symbol in production.rhs():
            if is_nonterminal(symbol):
                self.non_terminal.add(symbol)
def rule_adds_atom(p):
    """Classify production ``p`` by what its right-hand side contains.

    Returns:
        int: ``1`` if a terminal on the right-hand side (lower-cased) is one
        of the listed atom names, or a nonterminal whose symbol contains
        ``'valence'`` is present; ``2`` if instead a nonterminal whose symbol
        contains ``'segment'`` is present; ``0`` otherwise.
    """
    atoms = ('c', 'n', 'o', 's', 'f', 'cl', 'br', 'i')
    rhs = p.rhs()

    terminal_hits = [sym.lower() in atoms for sym in rhs if is_terminal(sym)]
    if any(terminal_hits):
        return 1

    valence_hits = ['valence' in sym._symbol for sym in rhs if is_nonterminal(sym)]
    if any(valence_hits):
        return 1

    segment_hits = ['segment' in sym._symbol for sym in rhs if is_nonterminal(sym)]
    if any(segment_hits):
        return 2

    return 0
def is_cnf(production):
    """Tell whether ``production`` has one of the two accepted right-hand-side shapes.

    Accepted shapes are a single terminal, or exactly two nonterminals.
    Any other right-hand-side length yields ``False``.
    """
    symbols = production.rhs()
    count = len(symbols)
    if count == 1:
        return grammar.is_terminal(symbols[0])
    if count == 2:
        first, second = symbols
        return grammar.is_nonterminal(first) and grammar.is_nonterminal(second)
    return False
def _generate_one(grammar, item, depth, maxlen): if depth > 0 and maxlen > 0: if is_nonterminal(item): for prod in grammar.productions(lhs=item): for frag in _generate_all(grammar, prod.rhs(), depth - 1, maxlen): yield frag else: yield [item] if depth > 0 and maxlen == 0 and is_nonterminal(item): # has empty production if any(prod.rhs() == tuple() for prod in grammar.productions(item)): yield []
def process_one_action(self, this_S, a):
    """Apply production ``a`` to the token sequence and build the mask for the next step.

    Args:
        this_S: list of token dicts; each has a ``'token'`` entry holding a
            grammar symbol (the terminal-distance branch also reads
            ``'term_dist'``).
        a: index into ``self.grammar.GCFG.productions()``, or ``None`` to
            skip the expansion step.

    Returns:
        tuple: ``(this_S, mask)``.  ``this_S`` is the (possibly expanded)
        sequence.  ``mask`` is the grammar mask alone when no nonterminal is
        left in the sequence, otherwise the elementwise product of the
        grammar, terminal-distance and ring masks.

    Side effects:
        Stores ``self.grammar_mask`` and, unless returning early,
        ``self.terminal_mask`` and ``self.ring_mask``.  When ``self.checks``
        is set, the expanded token dict in the caller's list gets a
        ``'children'`` entry.
    """
    if a is not None:
        # 1. Apply the expansion from last prod rule
        this_rule = self.grammar.GCFG.productions()[a]
        # find the token to apply the expansion to
        for this_index, old_token in enumerate(this_S):
            if is_nonterminal(old_token['token']):
                break
        # A rule whose left-hand side is 'Nothing' leaves the sequence untouched.
        if this_rule.lhs() != Nonterminal('Nothing'):
            new_tokens = apply_rule(this_S, this_index, this_rule, self.grammar, self.checks)#apply_rule(old_token, this_rule, self.t)
            # do the replacement
            if self.checks:
                # Mutates the dict inside the caller's list before the list is rebuilt.
                this_S[this_index]['children'] = new_tokens
            this_S = this_S[:this_index] + new_tokens + this_S[this_index + 1:]

    # 2. generate masks for next prod rule
    # find the index of the next token to expand, which is the first nonterminal in sequence
    for this_index, this_token in enumerate(this_S):
        if is_nonterminal(this_token['token']):
            break
        # Overwritten on every non-matching token so that, if the loop ends
        # without a break, this_token is the 'Nothing' placeholder.
        # NOTE(review): with an empty this_S, this_token is never bound — confirm callers never pass one.
        this_token = {'token': nltk.grammar.Nonterminal('Nothing')}

    # get the formal grammar mask
    self.grammar_mask = self.get_grammar_mask(this_token)

    if this_token['token'] == nltk.grammar.Nonterminal('Nothing'):
        # # we only get to this point if the sequence is fully expanded
        return this_S, self.grammar_mask

    # get the terminal distance mask
    if self.do_terminal_mask:
        term_distance = sum([x['term_dist'] for x in this_S])#sum([self.term_dist_calc(x) for x in this_S])
        steps_left = self.MAX_LEN - self.t - 1
        self.terminal_mask = np.zeros_like(self.grammar_mask)
        rule_dist = self.term_dist_calc.rule_d_term_dist(this_token)
        new_term_dist = rule_dist + term_distance
        # Allow only rules whose resulting distance stays below the remaining step budget.
        self.terminal_mask[new_term_dist < steps_left - 1] = 1
    else:
        self.terminal_mask = np.ones_like(self.grammar_mask)

    # if we're expanding a ring numeric token
    self.ring_mask = self.get_ring_mask(this_token, this_S, this_index)

    mask = self.grammar_mask * self.terminal_mask * self.ring_mask

    if self.checks:
        # At least one rule must remain selectable.
        assert(not all([x == 0 for x in mask]))

    return this_S, mask
def is_unit(production):
    """Return ``True`` when the right-hand side is exactly one nonterminal."""
    rhs = production.rhs()
    return bool(len(rhs) == 1 and grammar.is_nonterminal(rhs[0]))
def remove_unitary_productions(cfg_grammar):
    """Eliminate productions of the form ``A -> B`` where ``B`` is a nonterminal.

    The first such production found is dropped and every production of ``B``
    is re-issued with ``A`` as its left-hand side.  The function then recurses
    on the rebuilt grammar, because the substitution can itself create new
    single-nonterminal productions.

    Note: cycles are NOT detected.
    """
    productions = cfg_grammar.productions()

    # Locate the first production whose only right-hand-side symbol is a nonterminal.
    unary = next(
        (p for p in productions if len(p) == 1 and is_nonterminal(p.rhs()[0])),
        None,
    )
    if unary is None:
        # Base case: nothing left to replace.
        return cfg_grammar

    # Every production of B becomes a production of A.
    target = unary.rhs()[0]
    promoted = [
        Production(unary.lhs(), b_prod.rhs())
        for b_prod in cfg_grammar.productions(lhs=target)
    ]
    kept = [p for p in productions if p != unary]
    return remove_unitary_productions(CFG(cfg_grammar.start(), kept + promoted))
def children(g, parent):
    """Get the nonterminals used on the right-hand side of ``parent``.

    Parameters
    ----------
    g : nltk.CFG
    parent : nltk.Production or nltk.Nonterminal
        A single production, or a nonterminal whose productions in ``g``
        are all inspected.

    Returns
    -------
    children : set of Nonterminal

    See Also
    --------
    nltk.CFG, nltk.Nonterminal, nltk.Production
    """
    sources = [parent] if isinstance(parent, Production) else g.productions(parent)
    return {
        symbol
        for prod in sources
        for symbol in prod.rhs()
        if is_nonterminal(symbol)
    }
def create_taskgrammar(grammar, task, encoders):
    """Build a grammar restricted to ``task``, write it to disk and return it.

    Args:
        grammar: source grammar; queried through ``productions(...)``.
        task: name of the nonterminal whose productions seed the new grammar.
        encoders: names used to replace the right-hand side of the
            ``ENCODERS`` production; when empty, ``['E']`` is used instead.

    Returns:
        CFG: grammar with start symbol ``S`` built from the collected
        productions.  Its productions are also written, one per line, to
        ``TASK_GRAMMAR_PATH``.
    """
    logger.info('Creating specific grammar for task %s' % task)
    productions = grammar.productions(Nonterminal(task))
    start_token = Nonterminal('S')
    new_productions = []

    # Re-root the task's productions at S.  A production that starts with a
    # '*_TASK' nonterminal is replaced by that nonterminal's own productions.
    for start_production in productions:
        first_token = start_production.rhs()[0]
        if is_nonterminal(first_token) and first_token.symbol().endswith('_TASK'):
            for new_start_production in grammar.productions(first_token):
                new_productions.append(Production(start_token, new_start_production.rhs()))
        else:
            new_productions.append(Production(start_token, start_production.rhs()))

    # Pull in every production whose left-hand side is used by something already collected.
    # NOTE(review): new_productions is appended to while it is being iterated, so the
    # inner loop also visits the entries added during this pass — statement order matters here.
    for production in grammar.productions():
        for new_production in new_productions:
            if production.lhs() in new_production.rhs() and production not in new_productions:
                if production.lhs().symbol() == 'ENCODERS':  # Use encoders only for types of features in the dataset
                    # NOTE(review): the appended object differs from `production`, so the
                    # `not in new_productions` guard above does not stop a second append
                    # if several collected rules reference ENCODERS — confirm.
                    if len(encoders) > 0:
                        new_productions.append(Production(production.lhs(), [Nonterminal(e) for e in encoders]))
                    else:
                        new_productions.append(Production(production.lhs(), ['E']))
                else:
                    new_productions.append(production)

    task_grammar = CFG(start_token, new_productions)

    with open(TASK_GRAMMAR_PATH, 'w') as fout:
        fout.write('\n'.join([str(x) for x in task_grammar.productions()]))

    return task_grammar
def apply(self, chart, grammar, edge):
    """Yield new edges predicted from a complete ``edge`` and advanced over it.

    For each production whose right-hand side starts with ``edge.lhs()``, an
    edge is built from the production at ``edge.start()`` and its dot is
    moved to ``edge.end()``.  For a ``FeatureTreeEdge`` the first
    right-hand-side symbol must be a nonterminal that unifies with the found
    symbol.  Only edges newly inserted into ``chart`` are yielded.
    """
    if edge.is_incomplete():
        return

    found = edge.lhs()
    for prod in grammar.productions(rhs=found):
        bindings = {}
        if isinstance(edge, FeatureTreeEdge):
            first_symbol = prod.rhs()[0]
            if not is_nonterminal(first_symbol):
                continue
            # We rename vars here, because we don't want variables
            # from the two different productions to match.
            symbols = (prod.lhs(),) + prod.rhs()
            taken = find_variables(symbols, fs_class=FeatStruct)
            found = found.rename_variables(used_vars=taken)
            if unify(first_symbol, found, bindings, rename_vars=False) is None:
                continue

        predicted = FeatureTreeEdge.from_production(prod, edge.start())
        advanced = predicted.move_dot_forward(edge.end(), bindings)
        if chart.insert(advanced, (edge,)):
            yield advanced
def apply(self, chart, grammar, left_edge, right_edge):
    """Combine an incomplete ``left_edge`` with an adjacent complete ``right_edge``.

    When the symbol expected next by ``left_edge`` matches the left-hand side
    of ``right_edge`` (by unification for a ``FeatureTreeEdge``, by equality
    otherwise), the dot of ``left_edge`` is moved to ``right_edge.end()``.
    The resulting edge is yielded if the chart accepts it as new.
    """
    # Make sure the rule is applicable.
    applicable = (
        left_edge.end() == right_edge.start()
        and left_edge.is_incomplete()
        and right_edge.is_complete()
        and isinstance(left_edge, FeatureTreeEdge)
    )
    if not applicable:
        return

    found = right_edge.lhs()
    nextsym = left_edge.nextsym()

    if not isinstance(right_edge, FeatureTreeEdge):
        if nextsym != found:
            return
        # Create a copy of the bindings.
        bindings = left_edge.bindings()
    else:
        if not is_nonterminal(nextsym):
            return
        if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]:
            return
        # Create a copy of the bindings.
        bindings = left_edge.bindings()
        # We rename vars here, because we don't want variables
        # from the two different productions to match.
        found = found.rename_variables(used_vars=left_edge.variables())
        # Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs);
        # a None result means they do not unify.
        if unify(nextsym, found, bindings, rename_vars=False) is None:
            return

    # Construct the new edge.
    combined = left_edge.move_dot_forward(right_edge.end(), bindings)

    # Add it to the chart, with appropriate child pointers.
    if chart.insert_with_backpointer(combined, left_edge, right_edge):
        yield combined
def apply(self, chart, grammar, edge):
    """Yield edges predicted for the symbol an incomplete ``edge`` expects next.

    Each production of the expected nonterminal gets a fresh edge at
    ``edge.end()``, provided a leading terminal matches the input leaf at
    that position and the production's left-hand side unifies with the
    expected symbol.  Work already done for the same (symbol, position) on
    the same chart and grammar is skipped via ``self._done``.
    """
    if edge.is_complete():
        return

    nextsym = edge.nextsym()
    index = edge.end()
    if not is_nonterminal(nextsym):
        return

    # If we've already applied this rule to an edge with the same
    # next & end, and the chart & grammar have not changed, then
    # just return (no new edges to add).
    bound_next = edge.next_with_bindings()
    cache_key = (bound_next, index)
    seen_chart, seen_grammar = self._done.get(cache_key, (None, None))
    if seen_chart is chart and seen_grammar is grammar:
        return

    for prod in grammar.productions(lhs=nextsym):
        # If the left corner in the predicted production is
        # leaf, it must match with the input.
        rhs = prod.rhs()
        if rhs and is_terminal(rhs[0]):
            if index >= chart.num_leaves():
                continue
            if rhs[0] != chart.leaf(index):
                continue

        # We rename vars here, because we don't want variables
        # from the two different productions to match.
        if unify(prod.lhs(), bound_next, rename_vars=True):
            predicted = FeatureTreeEdge.from_production(prod, edge.end())
            if chart.insert(predicted, ()):
                yield predicted

    # Record the fact that we've applied this rule.
    self._done[cache_key] = (chart, grammar)
def apply(self, chart, grammar, edge):
    """Predict productions that start with a complete ``edge`` and step over it.

    Every production whose right-hand side begins with ``edge.lhs()`` yields
    an edge starting at ``edge.start()`` with its dot moved to ``edge.end()``.
    For a ``FeatureTreeEdge`` the leading symbol must be a nonterminal that
    unifies with the found symbol.  Edges already in ``chart`` are not yielded.
    """
    if edge.is_incomplete():
        return

    found = edge.lhs()
    for prod in grammar.productions(rhs=found):
        bindings = {}
        if isinstance(edge, FeatureTreeEdge):
            leading = prod.rhs()[0]
            if not is_nonterminal(leading):
                continue
            # We rename vars here, because we don't want variables
            # from the two different productions to match.
            in_use = find_variables((prod.lhs(),) + prod.rhs(), fs_class=FeatStruct)
            found = found.rename_variables(used_vars=in_use)
            unified = unify(leading, found, bindings, rename_vars=False)
            if unified is None:
                continue

        fresh = FeatureTreeEdge.from_production(prod, edge.start())
        stepped = fresh.move_dot_forward(edge.end(), bindings)
        if chart.insert(stepped, (edge,)):
            yield stepped
def is_separated(g):
    """Check if grammar is separated

    Grammar is separated if all its productions start with a terminal
    And for all nonterminals, no two productions start with the same terminal

    An empty production does not start with a terminal, so a grammar that
    contains one is reported as not separated.

    Parameters
    ----------
    g : nltk.CFG

    Returns
    -------
    bool

    See Also
    --------
    nltk.CFG, nltk.Nonterminal
    """
    for nont in nonterminals(g):
        starts = set()
        for prod in g.productions(nont):
            rhs = prod.rhs()
            if not rhs:
                # Previously raised IndexError on rhs()[0].
                return False
            start = rhs[0]
            if is_nonterminal(start) or start in starts:
                return False
            starts.add(start)
    return True
def process_unit_productions(productions, nonterminal_dict):
    """Run one pass of unit-production (``A -> B``) processing.

    Args:
        productions: productions to scan.
        nonterminal_dict: maps a nonterminal ``B`` to the list of ``A`` seen
            in ``A -> B``; updated in place.

    Returns:
        tuple: ``(to_add, nonterminal_dict, need_another_loop, to_remove)``.
        ``to_add`` holds the new ``A -> C`` productions derived from
        ``B -> C``, ``to_remove`` holds the unit productions encountered, and
        ``need_another_loop`` is ``1`` if anything changed, else ``0``.
    """
    # A set mirroring the production list keeps the membership test fast.
    known = set(productions)
    changed = 0
    additions = []
    removals = []

    for prod in productions:
        head = prod.lhs()
        body = prod.rhs()
        if len(body) == 1 and is_nonterminal(body[0]):
            # A -> B where B is a nonterminal: remember A as a parent of B.
            removals.append(prod)
            parents = nonterminal_dict.setdefault(body[0], [])
            if head not in parents:
                parents.append(head)
                changed = 1
        elif head in nonterminal_dict:
            # B -> C: lift it to A -> C for every recorded A -> B.
            for parent in nonterminal_dict[head]:
                lifted = Production(parent, body)
                if lifted not in known:
                    known.add(lifted)
                    additions.append(lifted)
                    changed = 1

    return additions, nonterminal_dict, changed, removals
def _expand(symbol, grammar):
    """Expand ``symbol`` one step, returning ``(rhs_tuple, log_probability)``.

    A nonterminal is rewritten with one of its productions, drawn with
    ``choice`` using the productions' probabilities as weights.  Any other
    symbol is returned unchanged as a one-element tuple with log-probability
    ``0.0``.
    """
    if not is_nonterminal(symbol):
        return ((symbol,), 0.0)

    rules = grammar.productions(lhs=symbol)
    weights = [candidate.prob() for candidate in rules]
    picked = choice(rules, p=weights)
    return (picked.rhs(), log(picked.prob()))
def apply_production(sent, prod):
    """Return a copy of ``sent`` with each occurrence of ``prod.lhs()`` expanded.

    Every nonterminal in ``sent`` that equals the production's left-hand side
    is replaced by the production's right-hand-side symbols.  All other items
    are copied through unchanged.
    """
    expanded = []
    for symbol in sent:
        matches = is_nonterminal(symbol) and symbol == prod.lhs()
        if matches:
            expanded.extend(prod.rhs())
        else:
            expanded.append(symbol)
    return expanded
def SampleFromPCFG(grammar, start=None):
    """Sample a symbol sequence from ``grammar`` by repeated expansion.

    Args:
        grammar: grammar to sample from; ``grammar.start()`` is used when
            ``start`` is ``None``.
        start: optional symbol to begin the derivation from.

    Returns:
        tuple: ``(symbols, lprob)`` where ``symbols`` is the tuple left once
        no nonterminal remains and ``lprob`` is the sum of the values
        returned by ``_expand_seq`` for each expansion round.
    """
    # Identity test: `== None` would defer to a custom __eq__ on the start symbol.
    symbols = (grammar.start(),) if start is None else (start,)
    lprob = 0.0
    while any(is_nonterminal(symbol) for symbol in symbols):
        symbols, step_lprob = _expand_seq(symbols, grammar)
        lprob += step_lprob
    return (symbols, lprob)
def apply(self, chart, grammar, edge):
    """Yield fresh edges for productions that begin with a complete ``edge``'s symbol.

    For each production whose right-hand side starts with ``edge.lhs()``, a
    new edge is created at ``edge.start()`` and yielded if the chart accepts
    it as new.  For a ``FeatureTreeEdge`` the production is skipped unless
    its first right-hand-side symbol is a nonterminal.
    """
    if edge.is_incomplete():
        return

    for prod in grammar.productions(rhs=edge.lhs()):
        if isinstance(edge, FeatureTreeEdge) and not is_nonterminal(prod.rhs()[0]):
            continue
        predicted = FeatureTreeEdge.from_production(prod, edge.start())
        if chart.insert(predicted, ()):
            yield predicted
def generate(symbol, grammar_dict):
    """Randomly expand ``symbol`` into a space-separated string.

    Args:
        symbol: key to look up in ``grammar_dict``.
        grammar_dict: maps a symbol to a sequence of alternative rules, each
            rule being an iterable of elements.

    Returns:
        ``symbol`` itself when it has no entry in ``grammar_dict``.  Otherwise
        one rule is picked at random, each nonterminal element is expanded
        recursively (by its ``symbol()``), and the pieces are joined with
        single spaces.
    """
    if symbol not in grammar_dict:
        return symbol

    chosen_rule = random.choice(grammar_dict[symbol])
    pieces = [
        generate(part.symbol(), grammar_dict) if grammar.is_nonterminal(part) else part
        for part in chosen_rule
    ]
    return ' '.join(pieces)
def check_is_nonterminal(*nts):
    """
    Asserts that all of one or more objects are Nonterminals.

    :param nts: Objects, each of which may or may not be a Nonterminal
    :raises TypeError: for the first object that is not a Nonterminal; the
        message names that object
    :return: None
    """
    for nt in nts:
        if not gr.is_nonterminal(nt):
            # Report the offending object (the message used to format an empty dict).
            raise TypeError("{} must be a nonterminal".format(nt))
    return
def _get_arg_product_rules(self, a_doc_id, a_arg, a_rel, a_parses):
    """Extract syntactic production rules for the given arg.

    Args:
      a_doc_id (str): id of the document
      a_arg (str): argument to extract productions for
      a_rel (dict): discourse relation to extract features for
      a_parses (dict): parsed sentences

    Returns:
      set: string forms of the productions that have at least one
        nonterminal on their right-hand side

    """
    ret = set()
    # obtain token indices for each arg sentence
    snt_id = None
    snt2tok = self._get_snt2tok(a_rel[a_arg][TOK_LIST])
    # obtain set of leaves corresponding to that argument
    arg_leaves = set()
    subt_leaves = set()
    processed_leaves = set()
    itree = itree_str = inode_path = None
    # NOTE(review): `iteritems()` exists only on Python 2 dicts — confirm the target interpreter.
    for snt_id, toks in snt2tok.iteritems():
        itree_str = a_parses[a_doc_id][SENTENCES][snt_id][PARSE_TREE]
        itree = Tree.fromstring(itree_str)
        if not itree.leaves():
            print("Invalid parse tree for sentence {:d}".format(snt_id),
                  file=sys.stderr)
            continue
        # obtain all terminal syntactic nodes from the arg
        for itok in toks:
            inode_path = itree.leaf_treeposition(itok)
            arg_leaves.add(itree[inode_path])
        # check all subtrees (not efficient, but easy to implement)
        for s_t in itree.subtrees():
            subt_leaves.update(s_t.leaves())
            # only a subtree that lies fully inside the argument and still
            # contributes leaves not handled yet triggers an update
            if subt_leaves.issubset(arg_leaves) and \
               not subt_leaves.issubset(processed_leaves):
                # NOTE(review): productions are taken from the whole sentence
                # tree (`itree`), not from the matching subtree `s_t` — confirm
                # this is intended.
                ret.update(
                    str(p) for p in itree.productions()
                    if any(
                        is_nonterminal(n) for n in p.rhs()))
                processed_leaves.update(subt_leaves)
            subt_leaves.clear()
            # stop as soon as every argument leaf has been covered
            if processed_leaves == arg_leaves:
                break
        # the leaf sets are per sentence, reset them for the next one
        arg_leaves.clear()
        processed_leaves.clear()
    return ret
def _get_arg_product_rules(self, a_doc_id, a_arg, a_rel, a_parses):
    """Extract syntactic production rules for the given arg.

    Args:
      a_doc_id (str): id of the document
      a_arg (str): argument to extract productions for
      a_rel (dict): discourse relation to extract features for
      a_parses (dict): parsed sentences

    Returns:
      set: string forms of the productions that have at least one
        nonterminal on their right-hand side

    """
    ret = set()
    # obtain token indices for each arg sentence
    snt_id = None
    snt2tok = self._get_snt2tok(a_rel[a_arg][TOK_LIST])
    # obtain set of leaves corresponding to that argument
    arg_leaves = set()
    subt_leaves = set()
    processed_leaves = set()
    itree = itree_str = inode_path = None
    # NOTE(review): `iteritems()` exists only on Python 2 dicts — confirm the target interpreter.
    for snt_id, toks in snt2tok.iteritems():
        itree_str = a_parses[a_doc_id][SENTENCES][snt_id][PARSE_TREE]
        itree = Tree.fromstring(itree_str)
        if not itree.leaves():
            print("Invalid parse tree for sentence {:d}".format(snt_id),
                  file=sys.stderr)
            continue
        # obtain all terminal syntactic nodes from the arg
        for itok in toks:
            inode_path = itree.leaf_treeposition(itok)
            arg_leaves.add(itree[inode_path])
        # check all subtrees (not efficient, but easy to implement)
        for s_t in itree.subtrees():
            subt_leaves.update(s_t.leaves())
            # only a subtree that lies fully inside the argument and still
            # contributes leaves not handled yet triggers an update
            if subt_leaves.issubset(arg_leaves) and \
               not subt_leaves.issubset(processed_leaves):
                # NOTE(review): productions are taken from the whole sentence
                # tree (`itree`), not from the matching subtree `s_t` — confirm
                # this is intended.
                ret.update(str(p) for p in itree.productions()
                           if any(is_nonterminal(n) for n in p.rhs()))
                processed_leaves.update(subt_leaves)
            subt_leaves.clear()
            # stop as soon as every argument leaf has been covered
            if processed_leaves == arg_leaves:
                break
        # the leaf sets are per sentence, reset them for the next one
        arg_leaves.clear()
        processed_leaves.clear()
    return ret
def generate_production(grammar):
    """Index the binary nonterminal productions of ``grammar`` by right-hand side.

    Args:
        grammar: object of type "nltk.grammar.CFG" containing the CNF grammar

    Returns:
        dict: maps ``(left_symbol, right_symbol)`` (the ``symbol()`` of each
        right-hand-side nonterminal) to the list of productions with that
        right-hand side.  Productions of any other shape are ignored.
    """
    index = {}
    for production in grammar.productions():
        rhs = production.rhs()
        if len(rhs) != 2:
            continue
        left, right = rhs
        if not (is_nonterminal(left) and is_nonterminal(right)):
            continue
        key = (left.symbol(), right.symbol())
        index.setdefault(key, []).append(production)
    return index
def gen_frame_line(self, nt):
    """Generate a string for nonterminal ``nt`` from ``self.cfg``.

    The productions of ``nt`` are tried in random order.  The first one
    accepted is expanded: nonterminals recursively, terminals appended
    followed by a space.  Once any production has been seen that refers to a
    nonterminal without productions, the validity flag stays cleared and the
    result is the string ``"ERROR"``.
    """
    candidates = self.cfg.productions(lhs=nt)
    shuffled = random.sample(candidates, len(candidates))

    sentence = ''
    valid = True
    for prod in shuffled:
        symbols = prod.rhs()
        # The flag is deliberately not reset per production.
        dangling = any(
            is_nonterminal(sym) and len(self.cfg.productions(lhs=sym)) < 1
            for sym in symbols
        )
        if dangling:
            valid = False
        if not valid:
            continue
        for sym in symbols:
            if is_nonterminal(sym):
                sentence += self.gen_frame_line(sym)
            else:
                sentence += sym + ' '
        break

    return sentence if valid else "ERROR"
def pcky(sentence, grammar):
    """Fill a probability chart for ``sentence`` and report whether it is derivable.

    The sentence is tokenized with ``word_tokenize``.  Cell
    ``table[i][j][A]`` holds a probability for nonterminal ``A`` over tokens
    ``i..j``.  Single-token cells come from productions whose right-hand side
    starts with the token; longer spans are filled from binary productions
    over every split point.  The token positions, each applied binary rule
    and the final verdict are printed.

    Returns:
        bool: ``True`` when the start symbol has a positive entry for the
        whole sentence, ``False`` otherwise.
    """
    tokens = word_tokenize(sentence)
    n = len(tokens)

    header = '[0]' + ''.join(
        ' ' + token + ' [{}]'.format(pos + 1) for pos, token in enumerate(tokens)
    )
    print(header)

    non_terminal = {
        prod.lhs() for prod in grammar.productions() if is_nonterminal(prod.lhs())
    }
    table = [
        [{nt: 0 for nt in non_terminal} for _ in range(n + 1)]
        for _ in range(n + 1)
    ]

    # Seed the cells that cover exactly one token.
    for i, token in enumerate(tokens):
        for prod in grammar.productions(rhs=token):
            table[i][i + 1][prod.lhs()] = prod.prob()

    rule_line = '[{}] {}:({:.2f}) [{}] {}:({:.2f}) [{}] -> [{}] {}:({:.5f}) [{}]'
    for span in range(2, n + 1):
        for start in range(n - span + 1):
            end = start + span
            cell = table[start][end]
            for split in range(start + 1, end):
                # Only entries with positive probability can take part in a rule.
                left = [(nt, p) for nt, p in table[start][split].items() if p > 0]
                right = [(nt, p) for nt, p in table[split][end].items() if p > 0]
                for nt1, p1 in left:
                    candidates = grammar.productions(rhs=nt1)
                    for nt2, p2 in right:
                        for prod in candidates:
                            if prod.rhs() != (nt1, nt2):
                                continue
                            cell[prod.lhs()] = prod.prob() * p1 * p2
                            print(rule_line.format(
                                start, nt1, p1, split, nt2, p2, end,
                                start, prod.lhs(), cell[prod.lhs()], end))

    if table[0][n][grammar.start()] > 0:
        print('The sentence is derived from the grammar')
        return True
    print('The sentence is not derived from the grammar')
    return False
def pcky(sentence, grammar):
    """Fill a probability chart for ``sentence`` and report whether it is derivable.

    The sentence is tokenized with ``word_tokenize``.  Cell
    ``table[i][j][A]`` holds a probability for nonterminal ``A`` over tokens
    ``i..j``.  Single-token cells come from productions whose right-hand side
    starts with the token; longer spans are filled from binary productions
    over every split point.  The token positions, each applied binary rule
    and the final verdict are printed.

    Returns:
        bool: ``True`` when the start symbol has a positive entry for the
        whole sentence, ``False`` otherwise.
    """
    tokens = word_tokenize(sentence)
    n = len(tokens)

    header = '[0]' + ''.join(
        ' ' + token + ' [{}]'.format(pos + 1) for pos, token in enumerate(tokens)
    )
    print(header)

    non_terminal = {
        prod.lhs() for prod in grammar.productions() if is_nonterminal(prod.lhs())
    }
    table = [
        [{nt: 0 for nt in non_terminal} for _ in range(n + 1)]
        for _ in range(n + 1)
    ]

    # Seed the cells that cover exactly one token.
    for i, token in enumerate(tokens):
        for prod in grammar.productions(rhs=token):
            table[i][i + 1][prod.lhs()] = prod.prob()

    rule_line = '[{}] {}:({:.2f}) [{}] {}:({:.2f}) [{}] -> [{}] {}:({:.5f}) [{}]'
    for span in range(2, n + 1):
        for start in range(n - span + 1):
            end = start + span
            cell = table[start][end]
            for split in range(start + 1, end):
                # Only entries with positive probability can take part in a rule.
                left = [(nt, p) for nt, p in table[start][split].items() if p > 0]
                right = [(nt, p) for nt, p in table[split][end].items() if p > 0]
                for nt1, p1 in left:
                    candidates = grammar.productions(rhs=nt1)
                    for nt2, p2 in right:
                        for prod in candidates:
                            if prod.rhs() != (nt1, nt2):
                                continue
                            cell[prod.lhs()] = prod.prob() * p1 * p2
                            print(rule_line.format(
                                start, nt1, p1, split, nt2, p2, end,
                                start, prod.lhs(), cell[prod.lhs()], end))

    if table[0][n][grammar.start()] > 0:
        print('The sentence is derived from the grammar')
        return True
    print('The sentence is not derived from the grammar')
    return False
def eliminate_singular_rules(prods, root_token):
    """Eliminate rules whose right-hand side is a single nonterminal, by substitution.

    Args:
        prods: productions to process.
        root_token: root symbol, passed to ``recursively_replace_root_singulars``.

    Returns:
        The productions left after the replacement helpers have run; the
        function calls itself again while the input still contains a
        single-nonterminal rule.

    Raises:
        ValueError: if a production has an empty right-hand side.
    """
    # eliminates all rules whose rhs has only one member, by substitution
    terminals, nonterminals = get_terminals_nonterminals(prods)
    # find all singular rules p with p.lhs = nt
    # check if there are any non-singular rules or whether we can eliminate nt
    singles = {nt: [] for nt in nonterminals}
    others = {nt: [] for nt in nonterminals}
    for prod in prods:
        if is_singular(prod):
            singles[prod.lhs()].append(prod)
        elif len(prod.rhs()) >= 1:
            others[prod.lhs()].append(prod)
        else:
            raise ValueError("rhs must have at least one member! " + str(prod))

    # NOTE(review): set(others) is the set of ALL keys (every nonterminal), including
    # those whose list is empty, so `lhs in lhs_has_others` below is always True —
    # confirm whether only nonterminals with non-singular rules were meant.
    lhs_has_others = set(others)
    new_prods = prods
    # first replace all singular rules starting with root token:
    new_prods = recursively_replace_root_singulars(new_prods, root_token)

    # for each lhs with singulars:
    for lhs, these_singles in singles.items():
        new_prods = recursively_replace_lhs(
            [p for p in new_prods if p not in these_singles], lhs,
            these_singles, lhs in lhs_has_others)
        if len(these_singles) > 0:
            # after we replaced one lhs, need to index the remaining rules all over again
            break

    # check if we still have any singles left, and repeat until done
    # NOTE(review): this inspects the input `prods`, not `new_prods`, so one extra
    # recursive call is made after the last replacement — confirm intended.
    if any([
            len(prod.rhs()) == 1 and is_nonterminal(prod.rhs()[0])
            for prod in prods
    ]):
        return eliminate_singular_rules(new_prods, root_token)
    else:
        return new_prods
def endings(g, n):
    """Get right hand sides that consist only of terminals

    Parameters
    ----------
    g : nltk.CFG
    n : nltk.Nonterminal

    Returns
    -------
    endings : set of tuple
        Right-hand sides of the productions of `n` that contain no
        nonterminal (an empty right-hand side qualifies)

    See Also
    --------
    nltk.CFG, nltk.Nonterminal
    """
    res = set()
    for prod in g.productions(n):
        rhs = prod.rhs()
        if not any(is_nonterminal(item) for item in rhs):
            res.add(rhs)
    # The result was built but never returned before.
    return res
def get_symbol(element):
    """Return ``element.symbol()`` for a nonterminal, otherwise ``element`` itself."""
    return element.symbol() if is_nonterminal(element) else element
def apply_rule(S, this_index, this_rule, grammar, checks=False):
    """Expand the token at ``S[this_index]`` with ``this_rule`` and return the new tokens.

    Args:
        S: sequence of token dicts, each with a ``'token'`` entry.
        this_index: position in ``S`` of the token being expanded.
        this_rule: production whose right-hand side supplies the new tokens.
        grammar: only tested against ``None`` here, to decide whether ring
            tokens get an id or ``None`` as their ``'num'``.
        checks: when true, run the consistency asserts.

    Returns:
        list of dict: one ``{'token': symbol}`` dict per right-hand-side
        symbol, annotated where applicable with ``'num'``, ``'size'``,
        ``'is_cycle_numeral'`` and ``'term_dist'``.

    Raises:
        ValueError: with ``checks`` set, if a 'cycle'/'num' token carries no
            ``'size'``.
    """
    # Work on a shallow copy so the caller's token dict is not modified.
    this_token = dict(S[this_index])
    this_inner_token = this_token['token']
    # do some safety checks
    if checks:
        assert (this_inner_token == this_rule.lhs())
        if ('cycle' in this_inner_token._symbol or 'num' in this_inner_token._symbol) \
                and 'size' not in this_token:
            # 'cycle' and 'num' tokens only appear in cycles, where they are assigned ring_sizes
            raise ValueError("'cycle' and 'num' tokens only appear in cycles, where they are assigned ring_sizes")

    # get the expansion
    new_tokens = [{'token': x} for x in this_rule.rhs()]

    propagate_strings = ['cycle', 'num']
    num_nonterms = ['num', 'num1']  # not referenced again in this function
    # if the expansion is a new ring, assign the numeral to use
    num_map ={}  # not referenced again in this function
    if 'ring' in this_token['token']._symbol:
        num_id = uuid.uuid4()
        num1_id = uuid.uuid4()
        for x in new_tokens:
            if is_nonterminal(x['token']) and \
                    any([ps in x['token']._symbol for ps in propagate_strings]):
                # New ring: the size counter starts at 1.
                x['size'] = 1
                if grammar is not None:
                    if x['token'] == Nonterminal('num1'): # this is very hacky, to do better want modular aromatic cycles
                        x['num'] = num1_id
                    else:
                        x['num'] = num_id
                else:
                    x['num'] = None
            else:
                # this_token is a local copy and is not read again on this path,
                # so these two assignments do not affect the returned tokens.
                this_token['num'] = None
                this_token['size'] = 0
    elif 'num' in this_token:
        if this_token['token']._symbol in ['num', 'num1']:
            # tag the resulting terminal so we know it's a cycle numeral, not a charge numeral
            for x in new_tokens:
                x['is_cycle_numeral'] = True
        # if this_token is a cycle propagation token, propagate the numeral and size counter
        for x in new_tokens:
            if is_nonterminal(x['token']) and \
                    any([ps in x['token']._symbol for ps in propagate_strings]):
                x['num'] = this_token['num']
                x['size'] = this_token['size'] + rule_adds_atom(this_rule)

    if checks:
        # Every propagating nonterminal must carry both annotations by now.
        for x in new_tokens:
            if is_nonterminal(x['token']) and \
                    any([ps in x['token']._symbol for ps in propagate_strings]):
                assert('num' in x and 'size' in x)

    for x in new_tokens:
        # Best effort: a token for which term_dist_calc fails gets no 'term_dist'.
        # NOTE(review): the bare except also swallows NameError and KeyboardInterrupt.
        try:
            x['term_dist'] = term_dist_calc(x)
        except:
            pass

    return new_tokens
def check_canonical(g):
    """Check grammar for canonical rules violation

    These rules are (in simple words):

    #. Starting symbol must not appear in any rhs
    #. There should be no unproductive or unreachable nonterminals
    #. All nonterminals, except starting one, must have more than one production
    #. For all nonterminals, except starting one, all its productions must
       not end with the same terminal
    #. Every pair of nonterminals, except with starting one, must produce
       different languages
    #. For all nonterminals, except starting one, all its productions must
       not end with the same nonterminal

    Parameters
    ----------
    g : nltk.CFG
        Must be also separated grammar

    Returns
    -------
    broken_rules : set of int
        Set with the numbers of the rules that have been broken
        So, if this set is empty - grammar may be canonical

    Raises
    ------
    ValueError
        If `g` is not a separated grammar

    See Also
    --------
    nltk.CFG, nltk.Nonterminal
    """
    if not is_separated(g):
        raise ValueError("Non-separated grammar was given")

    nonts = nonterminals(g)
    start = g.start()
    broken_rules = set()

    ends = {nont: set() for nont in nonts}
    counts = {nont: 0 for nont in nonts}
    for prod in g.productions():
        ends[prod.lhs()].add(prod.rhs()[-1])
        counts[prod.lhs()] += 1
        if any(item == start for item in prod.rhs()):
            broken_rules.add(1)

    # Rules 4 and 6 exempt the starting symbol, like rule 3 below.
    for nont, end in ends.items():
        if nont == start or len(end) != 1:
            continue
        (last,) = end  # read the single element without emptying the set
        broken_rules.add(6 if is_nonterminal(last) else 4)

    for nont, num in counts.items():
        if nont != start and num == 1:
            broken_rules.add(3)

    trash1 = unproductive(g)
    trash2 = unreachable(g)
    if trash1 or trash2:
        broken_rules.add(2)

    for n1, n2 in itertools.combinations(nonts, 2):
        if nonterm_equal(g, n1, n2):
            broken_rules.add(5)

    return broken_rules
def is_propagator(x):
    """Tell whether ``x`` is a nonterminal whose symbol contains one of ``propagate_strings``."""
    if not is_nonterminal(x):
        return False
    matches = [marker in x._symbol for marker in propagate_strings]
    return any(matches)
def min_pnet(g):
    """Generate a minimal Pnet that can be restored back to grammar

    Also calculates `t` and `h` values for a grammar
    These values are properties of the grammar, and used in restoration
    They are accessible via `graph` field of the Pnet

    Parameters
    ----------
    g : nltk.CFG

    Returns
    -------
    net : Pnet
        Net with `graph['t']` and `graph['h']` set

    See Also
    --------
    pnet.Pnet, nltk.grammar
    """
    nont_sents = _minimal_different_sents(g)
    # t: length of the longest sentence over all nonterminals
    t = len(max((s for sents in nont_sents.values() for s in sents), key=len))
    # one net per nonterminal, built from its sentences
    nets = {nont: Pnet(sents) for nont, sents in nont_sents.items()}

    start = g.start()
    res = Pnet(prod.rhs() for prod in g.productions(start))
    res.graph['t'] = t
    completed = {start}

    # Keep replacing nonterminal-labelled edges until a full pass meets no
    # nonterminal for the first time.
    change = True
    while change:
        change = False
        # iterate over a snapshot: the edge set is modified inside the loop
        for (s, e, k) in list(res.edges(keys=True)):
            if is_nonterminal(k):
                if k in completed:
                    # already expanded once: substitute its sentence net
                    res.remove_edge(s, e, k)
                    res.insert(nets[k], s, e)
                else:
                    # first encounter: substitute a net built from its productions
                    change = True
                    completed.add(k)
                    temp = Pnet(prod.rhs() for prod in g.productions(k))
                    res.remove_edge(s, e, k)
                    res.insert(temp, s, e)

    tree = res.subnet_tree()
    # h: longest simple edge path between the first elements of a subnet and
    # of one of its successors in the subnet tree
    # NOTE(review): assumes subnet[0] is the subnet's start node — confirm against Pnet.subnet_tree.
    h = 0
    for subnet in tree.nodes():
        parent_start = subnet[0]
        for child_net in tree.successors(subnet):
            child_start = child_net[0]
            pathlen = len(max(nx.all_simple_edge_paths(res, parent_start, child_start), key=len))
            h = max(h, pathlen)
    res.graph['h'] = h
    return res
def is_singular(prod):
    """Tell whether ``prod`` rewrites to exactly one symbol that is a nonterminal."""
    rhs = prod.rhs()
    if len(rhs) != 1:
        return False
    return is_nonterminal(rhs[0])