Example No. 1
    def _trees(self, edge, complete, memo, tree_class):
        """Recursively enumerate all parse trees rooted at `edge`, memoizing
        the result per edge."""
        assert complete, "CCGChart cannot build incomplete trees"

        if edge in memo:
            return memo[edge]

        # A leaf edge yields a two-level tree: a (token, "Leaf") node
        # dominating the word itself.
        if isinstance(edge, CCGLeafEdge):
            word = tree_class(edge.token(), [self._tokens[edge.start()]])
            leaf = tree_class((edge.token(), "Leaf"), [word])
            memo[edge] = [leaf]
            return [leaf]

        # Seed the memo with an empty list before recursing, so that cyclic
        # edge references terminate instead of looping forever.
        memo[edge] = []
        trees = []

        for cpl in self.child_pointer_lists(edge):
            child_choices = [
                self._trees(cp, complete, memo, tree_class) for cp in cpl
            ]
            # Every combination of child derivations yields a distinct tree.
            for children in itertools.product(*child_choices):
                lhs = (
                    Token(
                        self._tokens[edge.start():edge.end()],
                        edge.lhs(),
                        compute_semantics(children, edge),
                    ),
                    str(edge.rule()),
                )
                trees.append(tree_class(lhs, children))

        memo[edge] = trees
        return trees
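
The pattern above — memoize per edge, seed an empty list to break cycles, then take the Cartesian product of child derivations — can be seen in isolation in this toy sketch. `ToyEdge` and `enumerate_trees` are illustrative stand-ins, not NLTK API:

import itertools

class ToyEdge:
    def __init__(self, label, child_pointer_lists=()):
        self.label = label
        # Each entry is one way to build this edge: a tuple of child edges.
        self.child_pointer_lists = child_pointer_lists

def enumerate_trees(edge, memo):
    if edge in memo:
        return memo[edge]
    if not edge.child_pointer_lists:  # leaf case
        memo[edge] = [edge.label]
        return memo[edge]
    memo[edge] = []  # guard against cycles, as in _trees above
    trees = []
    for cpl in edge.child_pointer_lists:
        child_choices = [enumerate_trees(cp, memo) for cp in cpl]
        for children in itertools.product(*child_choices):
            trees.append((edge.label, children))
    memo[edge] = trees
    return trees

# Two leaves combined by a single parent edge yield exactly one tree.
a, b = ToyEdge("a"), ToyEdge("b")
root = ToyEdge("S", [(a, b)])
print(enumerate_trees(root, {}))  # [('S', ('a', 'b'))]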
Example No. 2
    def generateParseTree(cons, chart):
        """
		Helper function that returns an NLTK Tree object representing a parse.
		"""
        token = Token(None, cons.cat, None)
        if isinstance(cons, AtomicConstituent):
            # Leaf: a (token, "Leaf") node dominating the word itself.
            return nltk.tree.Tree((token, "Leaf"),
                                  [nltk.tree.Tree(token, [cons.word])])
        elif cons.rule == CCGParser.typeRaising:
            # Unary rule: type raising has a single backpointer.
            return nltk.tree.Tree(
                (token, cons.rule.name),
                [CCGParser.generateParseTree(cons.ptrs[0], chart)])
        else:
            # Binary rule: recurse on both backpointers.
            return nltk.tree.Tree((token, cons.rule.name), [
                CCGParser.generateParseTree(cons.ptrs[0], chart),
                CCGParser.generateParseTree(cons.ptrs[1], chart)
            ])
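
For reference, the nested-Tree shape this helper produces can be built by hand with `nltk.tree.Tree` directly. A toy sketch, where the category strings and the rule name ">T" are placeholders rather than parser output:

import nltk

# Leaf node: (token, "Leaf") over the word, mirroring the helper above.
leaf = nltk.tree.Tree(("NP", "Leaf"), [nltk.tree.Tree("NP", ["dogs"])])
# Unary (type-raised) node with a single child, as in the elif branch.
raised = nltk.tree.Tree(("S/(S\\NP)", ">T"), [leaf])
print(raised)  # nested tree: a type-raised node over a leaf node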
Example No. 3
    def parse(self, tokens, return_aux=False):
        """
    Args:
      tokens: list of string tokens
      return_aux: return auxiliary information (`weights`, `valid_edges`)

    Returns:
      parses: list of CCG derivation results
      if return_aux, the list is actually a tuple with `parses` as its first
      element and the other following elements:
        weight: float parse weight
        edges: `tokens`-length list of the edge tokens used to generate this
          parse
    """
        tokens = list(tokens)
        lex = self._lexicon

        # Collect potential leaf edges for each index. May be multiple per
        # token.
        edge_cands = [[
            nchart.CCGLeafEdge(i, l_token, token)
            for l_token in lex.categories(token)
        ] for i, token in enumerate(tokens)]

        # Run a parse for each of the product of possible leaf nodes,
        # and merge results.
        results = []
        used_edges = []
        for edge_sequence in itertools.product(*edge_cands):
            chart = nchart.CCGChart(list(tokens))
            for leaf_edge in edge_sequence:
                chart.insert(leaf_edge, ())

            partial_results = list(self._parse_inner(chart))
            results.extend(partial_results)

            if return_aux:
                # Track which edge values were used to generate these parses.
                used_edges.extend([edge_sequence] * len(partial_results))

        ## Monkey-patches.
        for result in results:
            root, operation = result.label()
            sem = root.semantics()
            new_sem = None
            if not sem:
                continue

            # #1: post-hoc type raise on negation
            # (not(\x.foo(x)))(a) ==> not(foo(a))
            # TODO this is probably logically inconsistent, and should be fixed
            # elsewhere upstream (e.g. in clevros.lexicon:attempt_candidate_parse)
            if isinstance(sem, l.ApplicationExpression) \
                and isinstance(sem.pred, l.NegatedExpression):
                new_sem = l.NegatedExpression(
                    l.ApplicationExpression(sem.pred.term, sem.args[0]))

            if new_sem is not None:
                new_sem = new_sem.simplify()
                new_root = Token(root._token, root.categ(), new_sem,
                                 root.weight())
                result.set_label((new_root, operation))

        # Score using Bayes' rule, calculated with lexicon weights.
        cat_priors = self._lexicon.observed_category_distribution()
        total_cat_masses = self._lexicon.total_category_masses()

        def score_parse(parse):
            score = 0.0
            for _, token in parse.pos():
                if total_cat_masses[token.categ()] == 0:
                    return -np.inf
                # TODO: this is not the same scoring logic as in novel word
                # induction; an ideal Bayesian model would align the two.
                # (No smoothing here.)
                likelihood = max(token.weight(),
                                 1e-6) / total_cat_masses[token.categ()]
                logp = 0.5 * np.log(cat_priors[token.categ()])
                logp += np.log(likelihood)

                score += logp
            return score

        if not return_aux:
            return sorted(results, key=score_parse, reverse=True)
        # Sort parses together with their edge sequences so the two stay
        # aligned after ranking.
        scored = [(parse, score_parse(parse), used_edges_i)
                  for parse, used_edges_i in zip(results, used_edges)]
        return sorted(scored, key=lambda entry: entry[1], reverse=True)
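
The per-leaf scoring rule in `score_parse` combines a category prior with a weight-based likelihood. A self-contained numerical sketch of that rule, with all priors, masses, and leaf weights being illustrative numbers:

import numpy as np

# Illustrative category priors and total weight masses per category.
cat_priors = {"N": 0.6, "NP/N": 0.4}
total_cat_masses = {"N": 2.0, "NP/N": 1.0}
# Leaf tokens of one parse, as (category, token weight) pairs.
leaves = [("NP/N", 0.7), ("N", 1.2)]

score = 0.0
for categ, weight in leaves:
    likelihood = max(weight, 1e-6) / total_cat_masses[categ]
    score += 0.5 * np.log(cat_priors[categ]) + np.log(likelihood)
print(score)  # log-score; higher-scoring parses rank first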
Example No. 4
def infer_listener_rsa(lexicon, entry, alpha=1.0, step_size=5.0):
    """
  Infer the semantic form of an entry in a weighted lexicon via RSA
  inference.

  Args:
    lexicon: CCGLexicon
    entry: string word
    alpha: RSA optimality parameter
    step_size: total amount of weight mass to add to potential token
      weights. percent of allocated mass for each token is determined by
      RSA probability of entry -> token mapping

  Returns:
    tokens: list of tokens with newly inferred weights
  """

    # Literal listener weights p(s|u) are encoded in the lexicon token
    # weights, so there is nothing to calculate here.

    # Derive pragmatic speaker weights p(u|s).
    speaker_weights = defaultdict(dict)
    # Iterate over tokens allowed by the listener (for now, anything in the
    # lexicon).
    for token in lexicon._entries[entry]:
        # gather possible ways to express the meaning
        # TODO cache this reverse lookup?
        semantic_weights = {}

        for alt_word, alt_tokens in lexicon._entries.items():
            for alt_token in alt_tokens:
                if alt_token.categ() == token.categ() \
                    and alt_token.semantics() == token.semantics():
                    semantic_weights[alt_token._token] = np.exp(
                        alpha * alt_token.weight())

        # Normalize.
        total = sum(semantic_weights.values())
        speaker_weights[token] = {
            k: v / total
            for k, v in semantic_weights.items()
        }

    # Derive pragmatic listener weights: transpose the speaker table and
    # renormalize over tokens.
    pl_weights = {}
    for token, word_weights in speaker_weights.items():
        if entry in word_weights:
            pl_weights[token] = word_weights[entry]

    # Normalize over tokens so the listener distribution sums to 1.
    total = sum(pl_weights.values())
    pl_weights = {k: v / total for k, v in pl_weights.items()}

    # Create a list of reweighted tokens
    # Add `step_size` weight mass in total to tokens, allocating according to
    # inferred weights.
    new_tokens = [
        Token(token=entry,
              categ=t.categ(),
              semantics=t.semantics(),
              weight=t.weight() + step_size * p)
        for t, p in pl_weights.items()
    ]
    return new_tokens
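
To see the speaker/listener update numerically, here is a self-contained sketch of the same computation over a toy two-word lexicon; the words, meanings, and weights are all illustrative:

import numpy as np

alpha = 1.0
# Literal weights: word -> {meaning: weight}, standing in for the lexicon.
lex = {"big":   {"SIZE": 2.0, "IMPORTANT": 1.0},
       "large": {"SIZE": 1.5}}
entry = "big"

# Pragmatic speaker p(u|s): softmax over words that can express each meaning.
speaker = {}
for meaning in lex[entry]:
    scores = {w: np.exp(alpha * ms[meaning])
              for w, ms in lex.items() if meaning in ms}
    total = sum(scores.values())
    speaker[meaning] = {w: s / total for w, s in scores.items()}

# Pragmatic listener p(s|u=entry): transpose and renormalize over meanings.
pl = {m: ws[entry] for m, ws in speaker.items()}
total = sum(pl.values())
pl = {m: v / total for m, v in pl.items()}
print(pl)  # SIZE competes with "large", so IMPORTANT gains listener mass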