def __init__(self, pcfg_filename):
    start = None
    self.probability_of_production = {}

    # Read the file:
    with open(pcfg_filename, "r") as f:
        lines = f.readlines()

    # Parse each production rule of the form:  LHS -> RHS1 [RHS2] [probability]
    productions = []
    for line in lines:
        matches = re.match(r"(\S+)\s*->\s*(\S+)(\s+\S+)?\s+\[([0-9.]+)\]?", line)
        groups = matches.groups()
        assert len(groups) == 4

        lhs = nltk.Nonterminal(groups[0].strip())
        if groups[2] is None:
            # Unary rule: the single right-hand-side symbol is a (possibly quoted) terminal.
            production = nltk.Production(lhs, [groups[1].strip('\'')])
        else:
            # Binary rule: both right-hand-side symbols are nonterminals.
            production = nltk.Production(lhs, [
                nltk.Nonterminal(groups[1].strip()),
                nltk.Nonterminal(groups[2].strip())
            ])
        probability = float(groups[3].strip())

        # The first LHS encountered is taken to be the start symbol.
        if start is None:
            start = lhs
        productions.append(production)
        self.probability_of_production[production] = probability

    self.grammar = nltk.grammar.CFG(start, productions, False)
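# Illustrative sketch (not from the original sources): the regular expression above expects
# grammar lines of roughly this shape, with binary nonterminal rules and quoted terminals.
# A hypothetical input file might contain:
#
#   S -> NP VP [0.9]
#   NP -> Det N [0.6]
#   N -> 'dog' [0.3]
#
# A quick standalone check of the pattern:
import re

sample_lines = ["S -> NP VP [0.9]", "N -> 'dog' [0.3]"]
pattern = r"(\S+)\s*->\s*(\S+)(\s+\S+)?\s+\[([0-9.]+)\]?"
for sample in sample_lines:
    print(re.match(pattern, sample).groups())
# ('S', 'NP', ' VP', '0.9')
# ('N', "'dog'", None, '0.3')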
def __count_productions_recursively(self, node: nltk.Tree) -> nltk.Production:
    """Recursively counts the productions used in a (binarized) parse tree."""
    label = node.label()

    # Traverse the tree:
    if len(node) == 2:
        # Non-leaf node: recurse into both children and build the binary production.
        left = self.__count_productions_recursively(node[0])
        right = self.__count_productions_recursively(node[1])
        production = nltk.Production(nltk.Nonterminal(label),
                                     [left.lhs(), right.lhs()])
    else:
        # Leaf node: the single child is a terminal token.
        token = node[0]
        self.token_count += 1
        if token not in self.count_per_token:
            self.count_per_token[token] = 1
        else:
            self.count_per_token[token] += 1
        production = nltk.Production(nltk.Nonterminal(label), [token])

    # Update our count of this particular production.
    if production not in self.count_per_production:
        self.count_per_production[production] = 1
    else:
        self.count_per_production[production] += 1

    # Update our count of all productions with this LHS.
    lhs = production.lhs()
    if lhs not in self.lhs_count:
        self.lhs_count[lhs] = 1
    else:
        self.lhs_count[lhs] += 1

    return production
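# Illustrative sketch (assumption, not part of the original class): the recursive counter
# above expects trees in Chomsky normal form, where every internal node has either two
# nonterminal children or a single terminal child. NLTK can produce such trees directly:
import nltk

tree = nltk.Tree.fromstring(
    "(S (NP (DT the) (NN dog)) (VP (VBZ chased) (NP (DT the) (NN cat))))")
tree.collapse_unary(collapsePOS=True)  # remove unary chains such as NP -> NN
tree.chomsky_normal_form()             # binarize nodes with more than two children
# Each node of `tree` now satisfies the len(node) == 2 / leaf dichotomy assumed above.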
def __init__(self, pcfg_filename):
    start = None
    self.log_probability_of_production = {}
    self.min_log_probability = None

    # Read the file:
    with open(pcfg_filename, "r") as f:
        lines = f.readlines()

    # Parse each production rule of the form:  LHS -> RHS1 [RHS2] [probability]
    productions = []
    for line in lines:
        matches = re.match(r"(\S+)\s*->\s*(\S+)(\s+\S+)?\s+\[([0-9.]+)\]?", line)
        groups = matches.groups()
        assert len(groups) == 4

        lhs = nltk.Nonterminal(groups[0].strip())
        if groups[2] is None:
            # Unary rule: the single right-hand-side symbol is a (possibly quoted) terminal.
            production = nltk.Production(lhs, [groups[1].strip('\'')])
        else:
            # Binary rule: both right-hand-side symbols are nonterminals.
            production = nltk.Production(lhs, [
                nltk.Nonterminal(groups[1].strip()),
                nltk.Nonterminal(groups[2].strip())
            ])
        log_probability = math.log(float(groups[3].strip()))

        # The first LHS encountered is taken to be the start symbol.
        if start is None:
            start = lhs
        productions.append(production)
        self.log_probability_of_production[production] = log_probability

        # Track the smallest (most negative) log probability seen so far.
        if self.min_log_probability is None or math.fabs(log_probability) > math.fabs(self.min_log_probability):
            self.min_log_probability = log_probability

    self.grammar = nltk.grammar.CFG(start, productions, False)

    # Make it much less probable than the actual minimum log probability but still finite:
    # log probabilities are negative, so doubling the value makes it less probable.
    self.min_log_probability = self.min_log_probability * 2
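# Worked example (assumption about the intended floor, matching the comment above): if the
# least probable rule in the file has probability 0.001, its log probability is about -6.9.
# Doubling that gives about -13.8, i.e. a probability of roughly 1e-6, which unseen
# productions can be scored with while keeping the value finite.
import math

least_probable = math.log(0.001)   # ~ -6.91
penalty = least_probable * 2       # ~ -13.82, and exp(penalty) ~ 1e-6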
def convert2_nltk_CFG(G):
    terminals, NTs, P, S = G
    Prod = copy(P)

    # This dummy rule is here to ensure full coverage of terminals
    # when parsing the grammar for testing.
    Prod["DUMMY"] = [list(map(lambda x: (x, ), terminals))]

    assert len(S) > 0  # need a start symbol
    if len(S) > 1:
        # Introduce NT0 productions that expand to each original start symbol.
        if "NT0" not in Prod.keys():
            Prod["NT0"] = []
        for Si in S:
            Prod["NT0"].append([(Si, )])
    assert "NT0" in S
    start = nltk.Nonterminal("NT0")
    nltk_nts = nltk.nonterminals(" ".join(list(NTs)))

    # Only look at nonterminals that have productions.
    productions = []
    for NT in Prod.keys():
        for rule in Prod[NT]:
            rhs = rule_to_tuple(rule, NTs)
            prod = nltk.Production(nltk.Nonterminal(NT), rhs)
            productions.append(prod)

    return nltk.grammar.CFG(start, productions)
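# Illustrative sketch (assumed input shape, since rule_to_tuple() is defined elsewhere):
# G is expected to be a 4-tuple of terminals, nonterminals, a production map, and the
# start symbols, roughly like this hypothetical grammar:
#
#   terminals = {"a", "b"}
#   NTs = {"NT0", "NT1"}
#   P = {"NT0": [[("NT1",), ("a",)]],   # NT0 -> NT1 'a'
#        "NT1": [[("b",)]]}             # NT1 -> 'b'
#   S = {"NT0"}
#   grammar = convert2_nltk_CFG((terminals, NTs, P, S))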
def hide_some_tokens(self):
    # Compute the relative frequency of every production from the raw counts:
    for production in self.count_per_production.keys():
        if self.count_per_production[production] == 0 or self.lhs_count[production.lhs()] == 0:
            self.probability_per_production[production] = 0
        else:
            self.probability_per_production[production] = (
                self.count_per_production[production] / self.lhs_count[production.lhs()])

    # Sort the productions by probability, least probable first:
    sorted_productions = sorted(self.probability_per_production.items(),
                                key=operator.itemgetter(1))

    hide_target = int(round(self.hide_proportion * self.token_count, 0))
    count_per_unk_production = {}

    # Look for the least probable productions and "delete" them by reducing their count to 0.
    # Their weight is transferred to a corresponding <UNK> production,
    # which may pool over several productions that share the same LHS.
    i = 0
    while hide_target > 0:
        production = sorted_productions[i][0]
        if len(production.rhs()) == 1:
            # We have a terminal:
            lhs = production.lhs()
            count = int(sorted_productions[i][1] * self.lhs_count[lhs])
            assert self.count_per_production[production] == count
            hide_target -= count

            # Substitute this token production with an <UNK> production.
            unk_production = nltk.Production(lhs, ['<UNK>'])

            # Transfer the weight of this production to the corresponding <UNK> production:
            self.count_per_production[production] = 0
            self.probability_per_production[production] = 0
            if unk_production in count_per_unk_production:
                count_per_unk_production[unk_production] += count
            else:
                count_per_unk_production[unk_production] = count
        i += 1

    # We couldn't add to or remove from the production collection while iterating over it,
    # so make the necessary insertions now.
    for unk_production in count_per_unk_production.keys():
        self.count_per_production[unk_production] = count_per_unk_production[unk_production]
        if self.count_per_production[unk_production] == 0 or self.lhs_count[unk_production.lhs()] == 0:
            self.probability_per_production[unk_production] = 0
        else:
            self.probability_per_production[unk_production] = (
                self.count_per_production[unk_production] / self.lhs_count[unk_production.lhs()])

    # Validation: the number of tokens must be the same before and after "hiding" tokens.
    count = 0
    for production in self.count_per_production.keys():
        if len(production.rhs()) == 1:
            count += self.count_per_production[production]
    assert count == self.token_count
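# Illustrative sketch (standalone, outside the class): the <UNK> substitution above boils
# down to replacing a rare lexical production with an <UNK> production that inherits its
# count, pooled per LHS:
import nltk

lhs = nltk.Nonterminal('NN')
rare = nltk.Production(lhs, ['aardvark'])             # seen once, selected for hiding
unk = nltk.Production(lhs, ['<UNK>'])
counts = {rare: 1}
counts[unk] = counts.get(unk, 0) + counts.pop(rare)   # transfer the weight
print(counts)                                         # {NN -> '<UNK>': 1}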
def noleft_immediate(rules, lhs):
    # Split the rules for `lhs` into immediately left-recursive rules and the rest.
    (left, not_left) = group_rhs(rules, lhs, lhs)
    new_rules = []
    if len(left) > 0:
        new_lhs = new_nonterminal(lhs)
        # A -> beta  becomes  A -> beta A'
        for r in not_left:
            new_rules.append(nltk.Production(lhs, r.rhs() + (new_lhs, )))
        # A -> A alpha  becomes  A' -> alpha A'
        for r in left:
            old_rhs = r.rhs()
            if len(old_rhs) > 0:
                new_rules.append(
                    nltk.Production(new_lhs, r.rhs()[1:] + (new_lhs, )))
            else:
                new_rules.append(nltk.Production(lhs, old_rhs))
        # A' -> epsilon
        new_rules.append(nltk.Production(new_lhs, ()))
    else:
        new_rules = rules
    return new_rules
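# Worked example (standard immediate-left-recursion removal, which the function above
# implements via the group_rhs() and new_nonterminal() helpers defined elsewhere):
#
#   A -> A b        A  -> c A'
#   A -> c    ==>   A' -> b A'
#                   A' -> ()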
def expand_match(rules, expand_rules, lhs, match):
    # Split the rules for `lhs` into those whose RHS starts with `match` and the rest.
    (matched, not_matched) = group_rhs(expand_rules, lhs, match)
    new_rules = []
    # Replace each rule  lhs -> match beta  with  lhs -> gamma beta
    # for every rule  match -> gamma.
    for r in matched:
        for match_rule in lhs_rules(rules, match):
            new_rules.append(
                nltk.Production(lhs, match_rule.rhs() + r.rhs()[1:]))
    # Rules that do not start with `match` are kept unchanged.
    new_rules.extend(not_matched)
    return new_rules
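# Worked example (assuming the group_rhs() and lhs_rules() helpers behave as their names
# suggest): expanding the leading occurrence of B in A's rules substitutes every
# right-hand side of B for it:
#
#   A -> B c                   A -> d c
#   A -> f           ==>       A -> e c
#   with B -> d | e            A -> f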
def pcfg_train(trees, vocab):
    """Takes a collection of nltk.tree.Tree objects, for example some portion of
    nltk.corpus.treebank.parsed_sents(), and returns an nltk.PCFG object whose
    rule probabilities are induced from the trees."""
    all_productions = []
    for t in trees:
        for p in t.productions():
            all_productions.append(nltk.Production(p.lhs(), p.rhs()))
    pcfg = nltk.induce_pcfg(nltk.Nonterminal('S'), all_productions)
    return pcfg
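# Illustrative usage (assumes the Penn Treebank sample shipped with NLTK has been fetched
# via nltk.download('treebank'); the vocab argument is unused above):
import nltk
from nltk.corpus import treebank

trees = treebank.parsed_sents()[:200]
grammar = pcfg_train(trees, vocab=None)
print(grammar.start(), len(grammar.productions()))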