Example #1
    def can_generate_greedy(self, tree, da):
        """Check if the candidate generator can generate a given tree greedily, always
        pursuing the first viable path.

        This is for debugging purposes only.
        Uses `get_all_successors` and always continues with the first successor that is
        still a subtree of the target tree.
        """
        self.init_run(da)
        cur_subtree = TreeData()
        found = True

        while found and cur_subtree != tree:
            found = False
            for succ in self.get_all_successors(cur_subtree):
                # use the first successor that is still a subtree of the target tree
                if tree.common_subtree_size(succ) == len(succ):
                    cur_subtree = succ
                    found = True
                    break

        # we have hit a dead end
        if cur_subtree != tree:
            log_info('Did not find tree: ' + str(tree) + ' for DA: ' + str(da))
            return False

        # everything alright
        log_info('Found tree: %s for DA: %s' % (str(tree), str(da)))
        return True
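
The greedy check above commits to the first successor that is still a subtree of the target, so it can dead-end even when the target is reachable by another path. A minimal standalone sketch of the same control flow, with string prefixes standing in for subtrees (all names here are illustrative, not part of tgen):

def can_generate_greedy_toy(target, successors):
    """Always follow the first successor that is still a prefix of the target."""
    cur = ''
    found = True
    while found and cur != target:
        found = False
        for succ in successors(cur):
            if target.startswith(succ):  # still "a subtree" of the target
                cur = succ
                found = True
                break
    return cur == target

# successors grow the string by one character, tried in a fixed order
print(can_generate_greedy_toy('cab', lambda s: [s + c for c in 'abc']))  # True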
Example #2
    def ids_to_tree(self, emb, postprocess=True):
        """Create a fake (flat) t-tree from token embeddings (IDs).

        @param emb: source embeddings (token IDs)
        @param postprocess: postprocess the sentence (capitalize sentence start, merge plural \
            markers)? True by default.
        @return: the corresponding tree
        """

        tree = TreeData()
        tokens = self.ids_to_strings(emb)

        for token in tokens:
            if token in ['<GO>', '<STOP>', '<VOID>']:
                continue
            if postprocess:
                # casing (only applied if the generator output is lowercased)
                if self.lowercase and (len(tree) == 1 or tree.nodes[-1].t_lemma in ['.', '?', '!']):
                    token = token[0].upper() + token[1:]
                # plural merging (if plural tokens come up)
                if token == '<-s>' and tree.nodes[-1].t_lemma is not None:
                    token = self._singular_to_plural(tree.nodes[-1].t_lemma)
                    tree.remove_node(len(tree) - 1)
                elif token == '<-s>':
                    continue

            tree.create_child(0, len(tree), NodeData(token, 'x'))

        return tree
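
The two postprocessing rules above (sentence-start capitalization and '<-s>' plural merging) can be sketched outside the class; the function below is illustrative only, with a naive pluralizer standing in for tgen's _singular_to_plural:

def postprocess_tokens(tokens, singular_to_plural=lambda w: w + 's'):
    """Capitalize sentence starts and merge '<-s>' plural markers."""
    out = []
    for token in tokens:
        if token in ['<GO>', '<STOP>', '<VOID>']:
            continue
        # capitalize the first token and any token after sentence-final punctuation
        if not out or out[-1] in ['.', '?', '!']:
            token = token[0].upper() + token[1:]
        # merge the plural marker into the preceding token
        if token == '<-s>':
            if out:
                out.append(singular_to_plural(out.pop()))
            continue
        out.append(token)
    return out

print(postprocess_tokens(['<GO>', 'there', 'are', 'flight', '<-s>', '.', '<STOP>']))
# ['There', 'are', 'flights', '.']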
Example #3
    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.
        """
        # read input
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        log_info('Reading t-trees from ' + ttree_file + '...')
        ttree_doc = read_ttrees(ttree_file)
        trees = trees_from_doc(ttree_doc, self.language, self.selector)

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network to keep any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DA.parse('inform()')
        self.train_das.append(empty_da)

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([
                self.tree_embs.get_embeddings(tree)
                for tree in self.train_trees
            ])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False,
                                            binarize_numeric=True)
            self.X = [
                self.tree_feats.get_features(tree, {})
                for tree in self.train_trees
            ]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [
            self.da_feats.get_features(None, {'da': da})
            for da in self.train_das
        ]
        self.y = self.da_vect.fit_transform(self.y)

        # initialize I/O shapes
        self.input_shape = [list(self.X[0].shape)]
        self.num_outputs = len(self.da_vect.get_feature_names())

        # initialize NN classifier
        self._init_neural_network()
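
The output side above turns each DA into a dict of binary features and vectorizes the dicts into a fixed-width 1-hot matrix. A sketch of that step using scikit-learn's DictVectorizer (tgen bundles its own variant with a binarize_numeric option; the feature names below are made up):

from sklearn.feature_extraction import DictVectorizer

# made-up feature dicts, as Features.get_features() might produce for two DAs
feats = [{'dat_inform': 1, 'svp_food=Chinese': 1},
         {'dat_inform': 1, 'svp_area=north': 1}]
vect = DictVectorizer(sparse=False)
y = vect.fit_transform(feats)
print(vect.get_feature_names())  # ['dat_inform', 'svp_area=north', 'svp_food=Chinese']
print(y)                         # [[1. 0. 1.]
                                 #  [1. 1. 0.]]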
Example #4
    def can_generate(self, tree, da):
        """Check if the candidate generator can generate a given tree at all.

        This is for debugging purposes only.
        Checks whether `get_all_successors` can reach the given tree via some path
        (only successors that are subtrees of the given tree are put on the open list).
        """
        self.init_run(da)
        open_list = CandidateList({TreeData(): 1})
        found = False
        tree_no = 0

        while open_list and not found:
            cur_st, _ = open_list.pop()
            if cur_st == tree:
                found = True
                break
            for succ in self.get_all_successors(cur_st):
                tree_no += 1
                # only push on the open list if the successor is still a subtree of the target tree
                if tree.common_subtree_size(succ) == len(succ):
                    open_list.push(succ, len(succ))

        if not found:
            log_info('Did not find tree: ' + str(tree) + ' for DA: ' +
                     str(da) + (' (total %d trees)' % tree_no))
            return False
        log_info('Found tree: %s for DA: %s (as %d-th tree)' %
                 (str(tree), str(da), tree_no))
        return tree_no
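
Unlike the greedy variant, this check keeps a whole open list, so one bad successor cannot cut off the search. The same pattern with a plain heapq standing in for CandidateList, and string prefixes standing in for subtrees (illustrative only):

import heapq

def can_generate_toy(target, successors):
    """Best-first search that only queues candidates still matching the target."""
    open_list = [(0, '')]  # (priority, candidate); lowest priority pops first
    seen = set()
    while open_list:
        _, cur = heapq.heappop(open_list)
        if cur == target:
            return True
        for succ in successors(cur):
            # only queue successors that are still a "subtree" (here: prefix) of the target
            if target.startswith(succ) and succ not in seen:
                seen.add(succ)
                heapq.heappush(open_list, (len(succ), succ))
    return False

print(can_generate_toy('cab', lambda s: [s + c for c in 'abc']))  # True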
Example #5
    def _beam_search(self, enc_inputs, da):
        """Run beam search decoding."""

        # true "batches" not implemented
        assert len(enc_inputs[0]) == 1

        # run greedy decoder for comparison (debugging purposes)
        log_debug("GREEDY DEC WOULD RETURN:\n" +
                  " ".join(self.tree_embs.ids_to_strings(
                      [out_tok[0] for out_tok in self._greedy_decoding(enc_inputs, None)[0]])))

        # initialize
        self._init_beam_search(enc_inputs)
        empty_tree_emb = self.tree_embs.get_embeddings(TreeData())
        dec_inputs = cut_batch_into_steps([empty_tree_emb])

        paths = [self.DecodingPath(stop_token_id=self.tree_embs.STOP, dec_inputs=[dec_inputs[0]])]

        # beam search steps
        for step in xrange(len(dec_inputs)):

            new_paths = []

            for path in paths:
                out_probs, st = self._beam_search_step(path.dec_inputs, path.dec_states)
                new_paths.extend(path.expand(self.beam_size, out_probs, st))

            def cmp_func(p, q):
                """Length-weighted comparison of two paths' logprobs."""
                return cmp(p.logprob / (len(p) ** self.length_norm_weight),
                           q.logprob / (len(q) ** self.length_norm_weight))

            paths = sorted(new_paths, cmp=cmp_func, reverse=True)[:self.beam_size]

            if all([p.dec_inputs[-1] == self.tree_embs.VOID for p in paths]):
                break  # stop decoding if we have reached the end in all paths

            log_debug(("\nBEAM SEARCH STEP %d\n" % step) +
                      "\n".join([("%f\t" % p.logprob) +
                                 " ".join(self.tree_embs.ids_to_strings([inp[0] for inp in p.dec_inputs]))
                                 for p in paths]) + "\n")

        # rerank paths by their distance to the input DA
        if self.classif_filter or self.context_bleu_weight:
            paths = self._rerank_paths(paths, da)

        # measure slot error on the top k paths
        if self.slot_err_stats:
            for path in paths[:self.sample_top_k]:
                self.slot_err_stats.append(
                        da, self.tree_embs.ids_to_strings([inp[0] for inp in path.dec_inputs]))

        # select the "best" path -- either the best, or one in top k
        if self.sample_top_k > 1:
            best_path = self._sample_path(paths[:self.sample_top_k])
        else:
            best_path = paths[0]

        # return just the best path (as token IDs)
        return np.array(best_path.dec_inputs)
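
The ranking step above divides each path's log-probability by its length raised to length_norm_weight, so longer paths are not penalized simply for summing more log-probability terms. The same ordering expressed with a sort key instead of the Python 2 cmp function (the numbers are made up):

def path_score(logprob, length, alpha):
    """Length-normalized path score: higher is better."""
    return logprob / (length ** alpha)

candidates = [(-4.0, 4), (-4.5, 6), (-3.0, 2)]  # made-up (logprob, length) pairs
beam_size = 2
best = sorted(candidates,
              key=lambda p: path_score(p[0], p[1], alpha=0.8),
              reverse=True)[:beam_size]
print(best)  # [(-4.5, 6), (-4.0, 4)] -- the longer path wins after normalization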
Example #6
    def _get_greedy_decoder_output(self, enc_inputs, dec_inputs, compute_cost=False):
        """Run greedy decoding with the given inputs; return decoder outputs and the cost
        (if required). For ensemble decoding, the greedy search is implemented as a beam
        search with a beam size of 1.

        @param enc_inputs: encoder inputs (list of token IDs)
        @param dec_inputs: decoder inputs (list of token IDs)
        @param compute_cost: if True, decoding cost is computed (the dec_inputs must be valid trees)
        @return: a tuple of (list of decoder outputs, decoding cost); the cost is None if not required
        """
        # TODO batches and cost computation not implemented
        assert len(enc_inputs[0]) == 1 and not compute_cost

        self._init_beam_search(enc_inputs)

        # for simplicity, this is implemented exactly like a beam search, but with a single path
        empty_tree_emb = self.tree_embs.get_embeddings(TreeData())
        dec_inputs = cut_batch_into_steps([empty_tree_emb])
        path = self.DecodingPath(stop_token_id=self.tree_embs.STOP, dec_inputs=[dec_inputs[0]])

        for step in xrange(len(dec_inputs)):
            out_probs, st = self._beam_search_step(path.dec_inputs, path.dec_states)
            path = path.expand(1, out_probs, st)[0]

            if path.dec_inputs[-1] == self.tree_embs.VOID:
                break  # stop decoding if we have reached the end of path

        # return just token IDs, ignore cost computation here
        return np.array(path.dec_inputs), None
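
With a beam of size 1, each expansion step reduces to taking the single most probable token. A standalone sketch of that reduction (step_probs and stop_id are illustrative stand-ins for the real decoder outputs):

def greedy_expand(step_probs, stop_id):
    """Pick the argmax token at every step; stop at the stop token."""
    path = []
    for out_probs in step_probs:
        tok = max(range(len(out_probs)), key=lambda i: out_probs[i])
        path.append(tok)
        if tok == stop_id:
            break
    return path

print(greedy_expand([[0.1, 0.7, 0.2], [0.2, 0.1, 0.7]], stop_id=2))  # [1, 2]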
Example #7
    def ids_to_tree(self, emb):
        """Rebuild a tree from the embeddings (token IDs).

        @param emb: source embeddings (token IDs)
        @return: the corresponding tree
        """

        tree = TreeData()
        tree.nodes = []  # override the technical root -- the tree will be created including the technical root
        tree.parents = []

        # build the tree recursively (start at position 2 to skip the <GO> symbol and 1st opening bracket)
        self._create_subtree(tree, -1, emb, 2)
        return tree
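
The recursion implies that the ID sequence encodes the tree in bracketed form, with one opening bracket per subtree. A toy parser for such sequences, using nested Python lists in place of TreeData (the bracket tokens are illustrative, not tgen's actual _create_subtree):

def parse_tree(tokens, pos=0):
    """Parse ['(', label, <children>, ')'] into nested lists; return (node, next_pos)."""
    assert tokens[pos] == '('
    node = [tokens[pos + 1]]
    pos += 2
    while tokens[pos] == '(':
        child, pos = parse_tree(tokens, pos)
        node.append(child)
    assert tokens[pos] == ')'
    return node, pos + 1

print(parse_tree(['(', 'a', '(', 'b', ')', '(', 'c', ')', ')'])[0])
# ['a', ['b'], ['c']]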
Example #8
    def _greedy_decoding(self, enc_inputs, gold_trees):
        """Run greedy decoding with the given encoder inputs; optionally use given gold trees
        as decoder inputs for cost computation."""

        # prepare decoder inputs (either fake, or true but used just for cost computation)
        if gold_trees is None:
            empty_tree_emb = self.tree_embs.get_embeddings(TreeData())
            dec_inputs = cut_batch_into_steps([empty_tree_emb for _ in enc_inputs[0]])
        else:
            dec_inputs = cut_batch_into_steps([self.tree_embs.get_embeddings(tree)
                                               for tree in gold_trees])

        # run the decoding per se
        dec_output_ids, dec_cost = self._get_greedy_decoder_output(
                enc_inputs, dec_inputs, compute_cost=gold_trees is not None)

        return dec_output_ids, dec_cost
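
cut_batch_into_steps, used above to prepare the decoder inputs, essentially turns batch-major sequences into one slice per decoding step; a sketch of the idea (not tgen's actual implementation):

def cut_batch_into_steps_toy(batch):
    """Transpose [batch x steps] token lists into per-step slices."""
    return [list(step) for step in zip(*batch)]

print(cut_batch_into_steps_toy([[1, 2, 3], [4, 5, 6]]))
# [[1, 4], [2, 5], [3, 6]] -- one list per time step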
Example #9
    def ids_to_tree(self, emb, postprocess=True):
        """Create a fake (flat) t-tree from token embeddings (IDs).

        @param emb: source embeddings (token IDs)
        @param postprocess: postprocess the sentence (capitalize sentence start, merge plural \
            markers)? True by default.
        @return: the corresponding tree
        """

        tree = TreeData()
        tokens = self.ids_to_strings(emb)

        for token in tokens:
            if token in ['<GO>', '<STOP>', '<VOID>']:
                continue
            tree.create_child(0, len(tree), NodeData(token, 'x'))

        return tree
Example #10
    def _init_training(self, das, trees, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.

        @param das: name of source file with training DAs, or list of DAs
        @param trees: name of source file with corresponding trees/sentences, or list of trees
        @param data_portion: portion of the training data to be used (0.0-1.0)
        """
        # read input from files or take it directly from parameters
        if not isinstance(das, list):
            log_info('Reading DAs from ' + das + '...')
            das = read_das(das)
        if not isinstance(trees, list):
            log_info('Reading t-trees from ' + trees + '...')
            ttree_doc = read_ttrees(trees)
            if self.mode == 'tokens':
                tokens = tokens_from_doc(ttree_doc, self.language,
                                         self.selector)
                trees = self._tokens_to_flat_trees(tokens)
            elif self.mode == 'tagged_lemmas':
                tls = tagged_lemmas_from_doc(ttree_doc, self.language,
                                             self.selector)
                trees = self._tokens_to_flat_trees(tls, use_tags=True)
            else:
                trees = trees_from_doc(ttree_doc, self.language, self.selector)
        elif self.mode in ['tokens', 'tagged_lemmas']:
            trees = self._tokens_to_flat_trees(
                trees, use_tags=self.mode == 'tagged_lemmas')

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # ignore contexts, if they are contained in the DAs
        if isinstance(self.train_das[0], tuple):
            self.train_das = [da for (context, da) in self.train_das]
        # delexicalize if DAs are lexicalized and we don't want that
        if self.delex_slots:
            self.train_das = [
                da.get_delexicalized(self.delex_slots) for da in self.train_das
            ]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network to keep any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DA.parse('inform()')
        self.train_das.append(empty_da)

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([
                self.tree_embs.get_embeddings(tree)
                for tree in self.train_trees
            ])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False,
                                            binarize_numeric=True)
            self.X = [
                self.tree_feats.get_features(tree, {})
                for tree in self.train_trees
            ]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [
            self.da_feats.get_features(None, {'da': da})
            for da in self.train_das
        ]
        self.y = self.da_vect.fit_transform(self.y)
        log_info('Number of binary classes: %d.' %
                 len(self.da_vect.get_feature_names()))

        # initialize I/O shapes
        if not self.tree_embs:
            self.input_shape = list(self.X[0].shape)
        else:
            self.input_shape = self.tree_embs.get_embeddings_shape()
        self.num_outputs = len(self.da_vect.get_feature_names())

        # initialize NN classifier
        self._init_neural_network()
        # initialize the NN variables
        self.session.run(tf.global_variables_initializer())
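
The delexicalization step above replaces concrete slot values with placeholders so that the classifier generalizes across values. A toy version over plain (intent, slot, value) triples (tgen's DA class provides get_delexicalized; everything here is illustrative):

def delexicalize(da, delex_slots):
    """Replace values of the given slots with 'X-<slot>' placeholders."""
    return [(intent, slot, 'X-' + slot if slot in delex_slots else value)
            for (intent, slot, value) in da]

da = [('inform', 'name', 'Golden Wok'), ('inform', 'food', 'Chinese')]
print(delexicalize(da, {'name'}))
# [('inform', 'name', 'X-name'), ('inform', 'food', 'Chinese')]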
Example #11
from tgen.planner import CandidateList
from tgen.tree import TreeData, NodeData
import random
import zlib

random.seed(1206)

l = CandidateList()
for i in xrange(10000):
    #    l[str(i)] = random.randint(0, 100)
    #    l[str(random.randint(0,1000))] = random.randint(0, 100)
    #    l[(str(random.randint(0,1000)), str(random.randint(0,1000)))] = random.randint(0, 100)
    #    tree = TreeData()
    #    tree.create_child(0, 1, NodeData(str(random.randint(0, 1000)), str(random.randint(0, 1000))))
    #    l[tree] = random.randint(0, 100)
    tree = TreeData()
    for j in xrange(random.randint(1, 10)):
        tree.create_child(random.randint(0, len(tree) - 1),
                          random.randint(0, 1) == 1,
                          NodeData(str(random.randint(0, 1000)),
                                   str(random.randint(0, 1000))))
    l[tree] = random.randint(0, 100)
x = []
while l:
    x.append(l.pop())
print zlib.crc32(str(x))
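
This last snippet reads like a determinism stress test for CandidateList: with a fixed random seed it queues 10,000 random trees, drains the list in priority order, and checksums the popped sequence with CRC32, so any change to the heap ordering or to TreeData hashing would show up as a different checksum.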