def _trees(self, edge, complete, memo, tree_class):
  """
  Build derivation trees for a completed chart `edge`, labeling each node with
  a (Token, rule-name) pair so that composed semantics are carried up the tree.
  """
  assert complete, "CCGChart cannot build incomplete trees"

  if edge in memo:
    return memo[edge]

  if isinstance(edge, CCGLeafEdge):
    word = tree_class(edge.token(), [self._tokens[edge.start()]])
    leaf = tree_class((edge.token(), "Leaf"), [word])
    memo[edge] = [leaf]
    return [leaf]

  memo[edge] = []
  trees = []

  for cpl in self.child_pointer_lists(edge):
    child_choices = [self._trees(cp, complete, memo, tree_class)
                     for cp in cpl]
    for children in itertools.product(*child_choices):
      lhs = (Token(self._tokens[edge.start():edge.end()], edge.lhs(),
                   compute_semantics(children, edge)),
             str(edge.rule()))
      trees.append(tree_class(lhs, children))

  memo[edge] = trees
  return trees
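# --- Illustrative sketch (not part of the chart API) ---
# The enumeration pattern above -- recurse into each child pointer, then take
# the Cartesian product of the children's tree lists -- can be shown on a toy
# chart of plain dicts. `toy_trees` and `toy_chart` below are hypothetical
# stand-ins for `_trees` / `child_pointer_lists`, kept only to make the
# memoized product-of-subtrees idea concrete.
import itertools

def toy_trees(node, chart, memo):
  if node in memo:
    return memo[node]
  if node not in chart:                       # leaf node: a single trivial tree
    memo[node] = [(node,)]
    return memo[node]
  trees = []
  for child_pointer_list in chart[node]:      # each way to expand `node`
    child_choices = [toy_trees(c, chart, memo) for c in child_pointer_list]
    for children in itertools.product(*child_choices):
      trees.append((node,) + children)        # one tree per combination
  memo[node] = trees
  return trees

# Two derivations of S: S -> (NP, VP), where VP itself has two expansions.
toy_chart = {"S": [("NP", "VP")], "VP": [("V",), ("V", "NP")]}
assert len(toy_trees("S", toy_chart, {})) == 2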
def generateParseTree(cons, chart):
  """
  Helper function that returns an NLTK Tree object representing a parse.
  """
  token = Token(None, cons.cat, None)
  if isinstance(cons, AtomicConstituent):
    return nltk.tree.Tree((token, u"Leaf"),
                          [nltk.tree.Tree(token, [cons.word])])
  else:
    if cons.rule == CCGParser.typeRaising:
      return nltk.tree.Tree((token, cons.rule.name),
                            [CCGParser.generateParseTree(cons.ptrs[0], chart)])
    else:
      return nltk.tree.Tree((token, cons.rule.name),
                            [CCGParser.generateParseTree(cons.ptrs[0], chart),
                             CCGParser.generateParseTree(cons.ptrs[1], chart)])
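# --- Illustrative sketch (assumes only the public nltk.tree.Tree API) ---
# A hand-built miniature of the structure generateParseTree produces: leaf
# constituents become a (token, "Leaf") tree wrapping the word, and binary
# rules become a (token, rule-name) tree over the two recursively built
# children. The string labels below stand in for real Token objects, and the
# ">" rule name is just a placeholder for forward application.
import nltk

leaf_the = nltk.tree.Tree(("NP/N", "Leaf"), [nltk.tree.Tree("NP/N", ["the"])])
leaf_dog = nltk.tree.Tree(("N", "Leaf"), [nltk.tree.Tree("N", ["dog"])])
np_tree = nltk.tree.Tree(("NP", ">"), [leaf_the, leaf_dog])
print(np_tree)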
def parse(self, tokens, return_aux=False):
  """
  Args:
    tokens: list of string tokens
    return_aux: return auxiliary information (`weights`, `valid_edges`)

  Returns:
    parses: list of CCG derivation results. If `return_aux` is True, each
      element is instead a tuple with the parse as its first element,
      followed by:
        weight: float parse weight
        edges: `tokens`-length list of the leaf edges used to generate this
          parse
  """
  tokens = list(tokens)
  lex = self._lexicon

  # Collect potential leaf edges for each index. May be multiple per token.
  edge_cands = [[nchart.CCGLeafEdge(i, l_token, token)
                 for l_token in lex.categories(token)]
                for i, token in enumerate(tokens)]

  # Run a parse for each element of the product of possible leaf edges, and
  # merge the results.
  results = []
  used_edges = []
  for edge_sequence in itertools.product(*edge_cands):
    chart = nchart.CCGChart(list(tokens))
    for leaf_edge in edge_sequence:
      chart.insert(leaf_edge, ())

    partial_results = list(self._parse_inner(chart))
    results.extend(partial_results)

    if return_aux:
      # Track which edge values were used to generate these parses.
      used_edges.extend([edge_sequence] * len(partial_results))

  ## Monkey-patches.
  for result in results:
    root, operation = result.label()
    sem = root.semantics()
    new_sem = None

    if not sem:
      continue

    # #1: post-hoc type raise on negation
    #   (not(\x.foo(x)))(a) ==> not(foo(a))
    # TODO this is probably logically inconsistent, and should be fixed
    # elsewhere upstream (e.g. in clevros.lexicon:attempt_candidate_parse)
    if isinstance(sem, l.ApplicationExpression) \
        and isinstance(sem.pred, l.NegatedExpression):
      new_sem = l.NegatedExpression(
          l.ApplicationExpression(sem.pred.term, sem.args[0]))

    if new_sem is not None:
      new_sem = new_sem.simplify()
      new_root = Token(root._token, root.categ(), new_sem, root.weight())
      result.set_label((new_root, operation))

  # Score using Bayes' rule, calculated with lexicon weights.
  cat_priors = self._lexicon.observed_category_distribution()
  total_cat_masses = self._lexicon.total_category_masses()

  def score_parse(parse):
    score = 0.0
    for _, token in parse.pos():
      if total_cat_masses[token.categ()] == 0:
        return -np.inf
      # TODO not the same scoring logic as in novel word induction -- an
      # ideal Bayesian model would have these aligned. (No smoothing here.)
      likelihood = max(token.weight(), 1e-6) / total_cat_masses[token.categ()]
      logp = 0.5 * np.log(cat_priors[token.categ()])
      logp += np.log(likelihood)

      score += logp
    return score

  results = sorted(results, key=score_parse, reverse=True)
  if not return_aux:
    return results
  return [(parse, score_parse(parse), used_edges_i)
          for parse, used_edges_i in zip(results, used_edges)]
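# --- Illustrative sketch of score_parse's arithmetic (hypothetical numbers) ---
# For each leaf token the parse uses, the score adds
#   0.5 * log prior(categ) + log( max(weight, 1e-6) / total_mass(categ) ),
# i.e. an unnormalized log of prior(categ)^0.5 * p(token | categ), with the
# category prior damped by the 0.5 exponent. The category names, masses, and
# leaf weights below are made up.
import numpy as np

cat_priors = {"N": 0.6, "NP/N": 0.4}
total_cat_masses = {"N": 3.0, "NP/N": 2.0}
leaves = [("the", "NP/N", 1.5), ("dog", "N", 2.0)]   # (word, categ, weight)

score = 0.0
for _word, categ, weight in leaves:
  likelihood = max(weight, 1e-6) / total_cat_masses[categ]
  score += 0.5 * np.log(cat_priors[categ]) + np.log(likelihood)
print(score)   # higher (less negative) means a more probable parse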
def infer_listener_rsa(lexicon, entry, alpha=1.0, step_size=5.0):
  """
  Infer the semantic form of an entry in a weighted lexicon via RSA inference.

  Args:
    lexicon: CCGLexicon
    entry: string word
    alpha: RSA optimality parameter
    step_size: total amount of weight mass to add to potential token weights.
      The share of that mass allocated to each token is determined by the RSA
      probability of the entry -> token mapping.

  Returns:
    tokens: list of tokens with newly inferred weights
  """
  # Literal listener weights p(s|u) are encoded in lexicon token weights --
  # no need to calculate them here.

  # Derive pragmatic speaker weights p(u|s).
  speaker_weights = defaultdict(dict)
  # Iterate over tokens allowed by the listener (for now, anything in the
  # lexicon).
  for token in lexicon._entries[entry]:
    # Gather possible ways to express the meaning.
    # TODO cache this reverse lookup?
    semantic_weights = {}
    for alt_word, alt_tokens in lexicon._entries.items():
      for alt_token in alt_tokens:
        if alt_token.categ() == token.categ() \
            and alt_token.semantics() == token.semantics():
          semantic_weights[alt_token._token] = np.exp(alpha * alt_token.weight())

    # Normalize.
    total = sum(semantic_weights.values())
    speaker_weights[token] = {k: v / total
                              for k, v in semantic_weights.items()}

  # Derive pragmatic listener weights: transpose and renormalize.
  pl_weights = {}
  total = 0
  for token, word_weights in speaker_weights.items():
    for word, weight in word_weights.items():
      if word == entry:
        pl_weights[token] = weight
        total += weight
  pl_weights = {k: v / total for k, v in pl_weights.items()}

  # Create a list of reweighted tokens: add `step_size` weight mass in total,
  # allocating it according to the inferred weights.
  new_tokens = [Token(token=entry, categ=t.categ(), semantics=t.semantics(),
                      weight=t.weight() + step_size * p)
                for t, p in pl_weights.items()]
  return new_tokens
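# --- Illustrative sketch of the RSA update on plain dicts (toy numbers) ---
# Literal-listener weights come straight from the lexicon; the pragmatic
# speaker exponentiates them (scaled by alpha) and normalizes over the words
# that share a meaning; the pragmatic listener transposes those distributions
# back onto meanings for the observed word and renormalizes. The two meanings
# "m1", "m2" and the weights below are made up; each meaning is assumed to have
# exactly one competing word for simplicity.
import numpy as np

alpha = 1.0
# Lexicon weights for the word being re-inferred, per candidate meaning.
listener_weights = {"m1": 1.0, "m2": 0.2}
# Weight of the single alternative word expressing each meaning.
alt_word_weights = {"m1": 2.0, "m2": 0.1}

pl_weights = {}
for meaning, w in listener_weights.items():
  # Speaker: p(word | meaning) over {this word, its alternative}.
  scores = {"word": np.exp(alpha * w),
            "alt": np.exp(alpha * alt_word_weights[meaning])}
  total = sum(scores.values())
  pl_weights[meaning] = scores["word"] / total   # keep only the observed word

# Listener: renormalize over meanings.
z = sum(pl_weights.values())
pl_weights = {m: v / z for m, v in pl_weights.items()}
print(pl_weights)   # meanings with weaker competitors gain probability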