예제 #1
0
    def _create_subtree(self, tree, parent_idx, emb, pos):
        """Recursive subroutine used for `ids_to_tree()`, do not use otherwise.

        Solves a subtree (starting just after the opening bracket, returning a position
        just after the corresponding closing bracket). Parsing of the current subtree
        also ends at a STOP or VOID ID or at the end of `emb`; in those cases the
        returned position is the one where parsing stopped.

        @param tree: the tree to work on (will be enhanced by the subtree)
        @param parent_idx: the ID of the parent for the current subtree
        @param emb: the source embeddings
        @param pos: starting position in the source embeddings
        @return: the final position used in the current subtree
        """

        if pos >= len(emb):  # avoid running out of the tree (for invalid trees)
            return pos

        # create the node with empty data first; the data is filled in at the end,
        # once the t-lemma and formeme have been read
        node_idx = tree.create_child(parent_idx, len(tree), NodeData(None, None))
        t_lemma = None
        formeme = None

        while pos < len(emb) and emb[pos] not in (self.BR_CLOSE, self.STOP, self.VOID):

            if emb[pos] == self.BR_OPEN:
                # recurse into subtree (the call returns the position to continue from)
                pos = self._create_subtree(tree, node_idx, emb, pos + 1)

            elif emb[pos] == self.UNK_T_LEMMA:
                # unknown t-lemma marker: only used if no t-lemma has been seen yet
                if t_lemma is None:
                    t_lemma = self.id_to_string[self.UNK_T_LEMMA]
                pos += 1

            elif emb[pos] == self.UNK_FORMEME:
                # unknown formeme marker: only used if no formeme has been seen yet
                if formeme is None:
                    formeme = self.id_to_string[self.UNK_FORMEME]
                pos += 1

            elif emb[pos] >= self.MIN_VALID:
                # remember the t-lemma and formeme for normal nodes
                # (the first regular ID is the t-lemma, the next one the formeme)
                token = self.id_to_string.get(emb[pos])
                if t_lemma is None:
                    t_lemma = token
                elif formeme is None:
                    formeme = token

                # move the node to its correct position
                # (which we now know it's at the current end of the tree)
                if node_idx != len(tree) - 1:
                    tree.move_node(node_idx, len(tree) - 1)
                    node_idx = len(tree) - 1
                pos += 1

            else:
                # any other ID (below MIN_VALID and not handled above) carries no
                # node information; skip it, otherwise `pos` would never advance
                # and this loop would never terminate on malformed input
                pos += 1

        if pos < len(emb) and emb[pos] == self.BR_CLOSE:
            # skip this closing bracket so that we don't process it next time
            pos += 1

        # fill in the t-lemma and formeme that we've found
        if t_lemma is not None or formeme is not None:
            tree.nodes[node_idx] = NodeData(t_lemma, formeme)

        return pos
예제 #2
0
    def lexicalize(self, gen_trees, abst_file):
        """Replace `X-<slot>` placeholders in the generated outputs by surface forms.

        The abstraction instructions are loaded from `abst_file` (one set per generated
        item). For every placeholder for which an instruction is found, a surface form
        is requested from `get_surface_form()` and written into the corresponding tree
        node; how the node is located and which context is passed along depends on
        `self.mode` ('tagged_lemmas', 'trees', or anything else, treated as tokens).
        In 'tokens' mode, nodes whose value contains spaces are split afterwards into
        one node per word. The trees are modified in place.

        @param gen_trees: list of TreeData objects representing generated trees/tokens/tagged lemmas
        @param abst_file: abstraction/delexicalization instructions file path
        @return: None
        """
        all_absts = smart_load_absts(abst_file, len(gen_trees))

        for sent_no, (tree, absts) in enumerate(zip(gen_trees, all_absts), start=1):
            log_debug("Lexicalizing sentence %d: %s" % (sent_no, str(tree)))
            sent = self._tree_to_sentence(tree)
            log_debug(str(sent))

            for tok_pos, tok in enumerate(sent):
                # only `X-<slot>` placeholders are candidates for lexicalization
                if not tok or not tok.startswith('X-'):
                    continue
                slot = tok[2:]

                # no instruction for this slot -> leave the placeholder as it is
                abst = self._first_abst(absts, slot)
                if not abst:
                    continue

                has_next = tok_pos < len(sent) - 1

                if self.mode == 'tagged_lemmas':
                    # the item following the placeholder is passed along as the tag
                    tag = sent[tok_pos + 1] if has_next else None
                    val = self.get_surface_form(sent, tok_pos, slot, abst.value, tag=tag)
                    tree.nodes[tok_pos + 1] = NodeData(t_lemma=val, formeme='x')

                elif self.mode == 'trees':
                    # the item following the placeholder is passed along as the formeme;
                    # sentence position p maps to tree node p // 2 + 1, whose formeme is kept
                    formeme = sent[tok_pos + 1] if has_next else None
                    val = self.get_surface_form(sent, tok_pos, slot, abst.value,
                                                formeme=formeme)
                    node_pos = tok_pos // 2 + 1
                    tree.nodes[node_pos] = NodeData(t_lemma=val,
                                                    formeme=tree[node_pos].formeme)

                else:
                    # tokens: the whole value goes into one node first (split below)
                    val = self.get_surface_form(sent, tok_pos, slot, abst.value)
                    tree.nodes[tok_pos + 1] = NodeData(t_lemma=val, formeme='x')

                # keep the chosen value in the sentence so that later calls to
                # get_surface_form() see it
                sent[tok_pos] = val

            # tokens mode only: break multi-word nodes into one node per word
            if self.mode == 'tokens':
                node_pos = 1
                while node_pos < len(tree):
                    lemma = tree[node_pos].t_lemma
                    if ' ' not in lemma:
                        node_pos += 1
                        continue
                    tree.remove_node(node_pos)
                    words = lemma.split(' ')
                    for offset, word in enumerate(words):
                        tree.create_child(0, node_pos + offset,
                                          NodeData(t_lemma=word, formeme='x'))
                    # continue right after the nodes that were just inserted
                    node_pos += len(words)
예제 #3
0
    def ids_to_tree(self, emb, postprocess=True):
        """Create a fake (flat) t-tree from token embeddings (IDs).

        All tokens except '<GO>', '<STOP>' and '<VOID>' become children of node 0,
        in their original order, with the formeme set to 'x'.

        @param emb: source embeddings (token IDs)
        @param postprocess: postprocess the sentence (capitalize sentence start, merge plural \
            markers)? True by default.
        @return: the corresponding tree
        """

        tree = TreeData()
        tokens = self.ids_to_strings(emb)

        for token in tokens:
            if token in ('<GO>', '<STOP>', '<VOID>'):
                continue
            if postprocess:
                # casing (only if set to lowercase): capitalize the first token and
                # any token following sentence-final punctuation. The parentheses
                # matter: `and` binds tighter than `or`, so without them the
                # punctuation check would apply even when lowercasing is off.
                if self.lowercase and (len(tree) == 1 or
                                       tree.nodes[-1].t_lemma in ('.', '?', '!')):
                    # slicing (instead of token[0]) keeps an empty token from raising
                    token = token[:1].upper() + token[1:]
                # plural merging (if plural tokens come up): replace the previous
                # node by its plural form
                if token == '<-s>' and tree.nodes[-1].t_lemma is not None:
                    token = self._singular_to_plural(tree.nodes[-1].t_lemma)
                    tree.remove_node(len(tree) - 1)
                elif token == '<-s>':
                    # nothing to attach the plural marker to -> drop it
                    continue

            tree.create_child(0, len(tree), NodeData(token, 'x'))

        return tree
예제 #4
0
    def ids_to_tree(self, emb, postprocess=True):
        """Build a fake (flat) t-tree out of a sequence of token IDs.

        The IDs are converted to strings via `ids_to_strings()`; every string except
        the special markers '<GO>', '<STOP>' and '<VOID>' is appended as a child of
        node 0 with the formeme 'x'.

        @param emb: source embeddings (token IDs)
        @param postprocess: accepted as part of the signature; this implementation \
            does not read it
        @return: the corresponding tree
        """

        flat_tree = TreeData()
        special_markers = ('<GO>', '<STOP>', '<VOID>')

        for word in self.ids_to_strings(emb):
            if word not in special_markers:
                # always append at the current end of the tree
                flat_tree.create_child(0, len(flat_tree), NodeData(word, 'x'))

        return flat_tree
예제 #5
0
from tgen.planner import CandidateList
from tgen.tree import TreeData, NodeData
import random
import zlib

# Deterministic smoke test for CandidateList: fill it with randomly built trees
# and random integer values, pop everything, and print a CRC32 checksum of the
# popped sequence. Written so that it is valid on both Python 2 and Python 3
# (`range`, single-argument `print(...)`, bytes passed to `zlib.crc32`).

# fixed seed so that every run produces the same sequence of random numbers
random.seed(1206)

l = CandidateList()
for _ in range(10000):
    # (Earlier variants of this test used plain strings, pairs of strings, or
    # single-node trees as keys instead of the random trees built below.)
    tree = TreeData()
    for _ in range(random.randint(1, 10)):
        # NOTE: the order of the random calls below determines the checksum;
        # do not reorder them
        tree.create_child(
            random.randint(0, len(tree) - 1),  # random existing node as parent
            random.randint(0, 1) == 1,  # random boolean as the second argument
            NodeData(str(random.randint(0, 1000)),
                     str(random.randint(0, 1000))))
    l[tree] = random.randint(0, 100)

# empty the list, keeping the items in the order in which pop() returns them
x = []
while l:
    x.append(l.pop())

# zlib.crc32 needs bytes on Python 3; assumes str(x) is plain ASCII, in which case
# encoding leaves the Python 2 input unchanged -- TODO confirm for the repr of the
# popped items
print(zlib.crc32(str(x).encode('utf-8')))