def _create_subtree(self, tree, parent_idx, emb, pos):
    """Recursive subroutine used for `ids_to_tree()`, do not use otherwise.

    Solves a subtree (starting just after the opening bracket, returning a position
    just after the corresponding closing bracket).

    @param tree: the tree to work on (will be enhanced by the subtree)
    @param parent_idx: the ID of the parent for the current subtree
    @param emb: the source embeddings
    @param pos: starting position in the source embeddings
    @return: the final position used in the current subtree
    """
    if pos >= len(emb):  # avoid running out of the tree (for invalid trees)
        return pos
    # create a placeholder node; its data are filled in at the end, once the
    # t-lemma/formeme have been collected
    node_idx = tree.create_child(parent_idx, len(tree), NodeData(None, None))
    t_lemma = None
    formeme = None
    while pos < len(emb) and emb[pos] not in [self.BR_CLOSE, self.STOP, self.VOID]:
        if emb[pos] == self.BR_OPEN:
            # recurse into subtree
            pos = self._create_subtree(tree, node_idx, emb, pos + 1)
        elif emb[pos] == self.UNK_T_LEMMA:
            if t_lemma is None:
                t_lemma = self.id_to_string[self.UNK_T_LEMMA]
            pos += 1
        elif emb[pos] == self.UNK_FORMEME:
            if formeme is None:
                formeme = self.id_to_string[self.UNK_FORMEME]
            pos += 1
        elif emb[pos] >= self.MIN_VALID:
            # remember the t-lemma and formeme for normal nodes
            token = self.id_to_string.get(emb[pos])
            if t_lemma is None:
                t_lemma = token
            elif formeme is None:
                formeme = token
            # move the node to its correct position
            # (which we now know it's at the current end of the tree)
            if node_idx != len(tree) - 1:
                tree.move_node(node_idx, len(tree) - 1)
                node_idx = len(tree) - 1
            pos += 1
        else:
            # FIX: skip any other special/invalid token id (< MIN_VALID and not
            # handled above); previously such a token never advanced `pos`, so
            # a malformed embedding sequence would loop forever here
            pos += 1
    if pos < len(emb) and emb[pos] == self.BR_CLOSE:
        # skip this closing bracket so that we don't process it next time
        pos += 1
    # fill in the t-lemma and formeme that we've found
    if t_lemma is not None or formeme is not None:
        tree.nodes[node_idx] = NodeData(t_lemma, formeme)
    return pos
def lexicalize(self, gen_trees, abst_file):
    """Lexicalize nodes in the generated trees (which may represent trees, tokens, or tagged lemmas).
    Expects lexicalization file (and surface forms file) to be loaded in the Lexicalizer object,
    otherwise nothing will happen. The actual operation depends on the generator mode.

    Modifies `gen_trees` in place (replaces 'X-<slot>' placeholder nodes with surface forms).

    @param gen_trees: list of TreeData objects representing generated trees/tokens/tagged lemmas
    @param abst_file: abstraction/delexicalization instructions file path
    @return: None
    """
    # one list of abstraction instructions per generated sentence
    abstss = smart_load_absts(abst_file, len(gen_trees))
    for sent_no, (tree, absts) in enumerate(zip(gen_trees, abstss)):
        log_debug("Lexicalizing sentence %d: %s" % ((sent_no + 1), str(tree)))
        # flat token view of the tree (shape depends on self.mode)
        sent = self._tree_to_sentence(tree)
        log_debug(str(sent))
        for idx, tok in enumerate(sent):
            if tok and tok.startswith('X-'):  # we would like to lexicalize
                slot = tok[2:]  # strip the 'X-' prefix to get the slot name
                # check if we have a value to substitute; if yes, do it
                abst = self._first_abst(absts, slot)
                if abst:
                    # tagged lemmas: one token with appropriate value
                    # (the following sentence item is presumably the POS tag -- verify
                    # against _tree_to_sentence)
                    if self.mode == 'tagged_lemmas':
                        tag = sent[idx+1] if idx < len(sent) - 1 else None
                        val = self.get_surface_form(sent, idx, slot, abst.value, tag=tag)
                        tree.nodes[idx+1] = NodeData(t_lemma=val, formeme='x')
                    # trees: one node with appropriate value, keep formeme
                    # (sentence alternates lemma/formeme, hence old_div(idx,2)+1 maps the
                    # sentence position back to the tree node index -- TODO confirm)
                    elif self.mode == 'trees':
                        formeme = sent[idx+1] if idx < len(sent) - 1 else None
                        val = self.get_surface_form(sent, idx, slot, abst.value, formeme=formeme)
                        tree.nodes[old_div(idx,2)+1] = NodeData(t_lemma=val,
                                                                formeme=tree[old_div(idx,2)+1].formeme)
                    # tokens: one token with all words from the value (postprocessed below)
                    else:
                        val = self.get_surface_form(sent, idx, slot, abst.value)
                        tree.nodes[idx+1] = NodeData(t_lemma=val, formeme='x')
                    sent[idx] = val  # save value to be used in LM next time
        # postprocess tokens (split multi-word nodes)
        if self.mode == 'tokens':
            idx = 1  # skip the technical root node
            while idx < len(tree):
                if ' ' in tree[idx].t_lemma:
                    # replace the multi-word node by one node per word, all attached
                    # to the root (the tree is flat in 'tokens' mode)
                    value = tree[idx].t_lemma
                    tree.remove_node(idx)
                    for shift, tok in enumerate(value.split(' ')):
                        tree.create_child(0, idx + shift, NodeData(t_lemma=tok, formeme='x'))
                    # jump past the newly inserted nodes (`shift` is the last loop value)
                    idx += shift
                idx += 1
def ids_to_tree(self, emb, postprocess=True):
    """Create a fake (flat) t-tree from token embeddings (IDs).

    @param emb: source embeddings (token IDs)
    @param postprocess: postprocess the sentence (capitalize sentence start, merge plural \
        markers)? True by default.
    @return: the corresponding tree
    """
    tree = TreeData()
    tokens = self.ids_to_strings(emb)
    for token in tokens:
        if token in ['<GO>', '<STOP>', '<VOID>']:
            continue
        if postprocess:
            # casing (only if set to lowercase): capitalize at sentence start
            # (tree holds just the root) or right after sentence-final punctuation.
            # FIX: parenthesized the condition -- previously `A and B or C` also
            # capitalized after punctuation when self.lowercase was off,
            # contradicting the intended "only if set to lowercase" behavior
            if self.lowercase and (len(tree) == 1 or tree.nodes[-1].t_lemma in ['.', '?', '!']):
                token = token[0].upper() + token[1:]
            # plural merging (if plural tokens come up): replace the preceding
            # node's lemma by its plural form and drop the marker
            if token == '<-s>' and tree.nodes[-1].t_lemma is not None:
                token = self._singular_to_plural(tree.nodes[-1].t_lemma)
                tree.remove_node(len(tree) - 1)
            elif token == '<-s>':
                # plural marker with nothing to attach to -- just skip it
                continue
        # flat tree: every token becomes a child of the root
        tree.create_child(0, len(tree), NodeData(token, 'x'))
    return tree
def ids_to_tree(self, emb, postprocess=True):
    """Create a fake (flat) t-tree from token embeddings (IDs).

    @param emb: source embeddings (token IDs)
    @param postprocess: kept for interface compatibility; this variant applies \
        no postprocessing. True by default.
    @return: the corresponding tree
    """
    special = ('<GO>', '<STOP>', '<VOID>')
    tree = TreeData()
    # attach every real token as a child of the technical root, in order
    for tok in self.ids_to_strings(emb):
        if tok in special:
            continue
        tree.create_child(0, len(tree), NodeData(tok, 'x'))
    return tree
# Stress test for CandidateList keyed by TreeData objects: insert 10000 random
# trees with random priorities, pop everything back out, and print a CRC32 of
# the resulting order so runs can be compared for reproducibility.
# NOTE: Python 2 only (`xrange`, `print` statement).

from tgen.planner import CandidateList
from tgen.tree import TreeData, NodeData

import random
import zlib

# fixed seed -> deterministic key/priority sequence -> stable final checksum
random.seed(1206)

l = CandidateList()

for i in xrange(10000):
    # earlier key-type experiments, kept for reference:
    # l[str(i)] = random.randint(0, 100)
    # l[str(random.randint(0,1000))] = random.randint(0, 100)
    # l[(str(random.randint(0,1000)), str(random.randint(0,1000)))] = random.randint(0, 100)
    # tree = TreeData()
    # tree.create_child(0, 1, NodeData(str(random.randint(0, 1000)), str(random.randint(0, 1000))))
    # l[tree] = random.randint(0, 100)

    # build a random tree of 1-10 nodes under random parents
    # (TreeData starts with a root node, so len(tree) >= 1 and the
    # randint(0, len(tree) - 1) parent choice is always valid)
    tree = TreeData()
    for j in xrange(random.randint(1, 10)):
        tree.create_child(
            random.randint(0, len(tree) - 1),
            random.randint(0, 1) == 1,
            NodeData(str(random.randint(0, 1000)), str(random.randint(0, 1000))))
    l[tree] = random.randint(0, 100)

# drain the list; pop() presumably yields entries in priority order -- verify
# against CandidateList
x = []
while l:
    x.append(l.pop())

# checksum of the full pop sequence (keys + priorities)
print zlib.crc32(str(x))