def can_generate_greedy(self, tree, da):
    """Check if the candidate generator can generate a given tree greedily, always
    pursuing the first viable path.

    This is for debugging purposes only.

    Uses `get_all_successors` and always goes on with the first one that increases
    coverage of the current tree.
    """
    self.init_run(da)
    cur_subtree = TreeData()
    found = True

    while found and cur_subtree != tree:
        found = False
        for succ in self.get_all_successors(cur_subtree):
            # use the first successor that is still a subtree of the target tree
            if tree.common_subtree_size(succ) == len(succ):
                cur_subtree = succ
                found = True
                break

    # we have hit a dead end
    if cur_subtree != tree:
        log_info('Did not find tree: ' + str(tree) + ' for DA: ' + str(da))
        return False

    # everything alright
    log_info('Found tree: %s for DA: %s' % (str(tree), str(da)))
    return True
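
# A minimal debugging sketch (hypothetical usage): given a trained candidate
# generator `candgen` and parallel lists `train_trees`/`train_das`, count how
# many gold trees remain reachable when always taking the first viable successor.
def check_greedy_coverage(candgen, train_trees, train_das):
    ok = 0
    for tree, da in zip(train_trees, train_das):
        if candgen.can_generate_greedy(tree, da):
            ok += 1
    log_info('Greedy-reachable: %d / %d trees' % (ok, len(train_trees)))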
def ids_to_tree(self, emb, postprocess=True):
    """Create a fake (flat) t-tree from token embeddings (IDs).

    @param emb: source embeddings (token IDs)
    @param postprocess: postprocess the sentence (capitalize sentence start, merge plural \
        markers)? True by default.
    @return: the corresponding tree
    """
    tree = TreeData()
    tokens = self.ids_to_strings(emb)

    for token in tokens:
        if token in ['<GO>', '<STOP>', '<VOID>']:
            continue
        if postprocess:
            # casing (only if set to lowercase; capitalize at sentence start
            # and after sentence-final punctuation)
            if self.lowercase and (len(tree) == 1 or
                                   tree.nodes[-1].t_lemma in ['.', '?', '!']):
                token = token[0].upper() + token[1:]
            # plural merging (if plural tokens come up)
            if token == '<-s>' and tree.nodes[-1].t_lemma is not None:
                token = self._singular_to_plural(tree.nodes[-1].t_lemma)
                tree.remove_node(len(tree) - 1)
            elif token == '<-s>':
                continue
        tree.create_child(0, len(tree), NodeData(token, 'x'))

    return tree
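
# Hypothetical round trip (assuming a fitted extractor `tree_embs` with
# lowercase=True and a flat tree `flat_tree` within the configured maximum
# length): casing and plural markers are merged back during decoding.
ids = tree_embs.get_embeddings(flat_tree)
restored = tree_embs.ids_to_tree(ids)  # postprocess=True by default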
def _init_training(self, das_file, ttree_file, data_portion):
    """Initialize training.

    Store input data, initialize 1-hot feature representations for input and output
    and transform training data accordingly, initialize the classification neural
    network.
    """
    # read input
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    log_info('Reading t-trees from ' + ttree_file + '...')
    ttree_doc = read_ttrees(ttree_file)
    trees = trees_from_doc(ttree_doc, self.language, self.selector)

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    empty_da = DA.parse('inform()')
    self.train_das.append(empty_da)

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree)
                           for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.X = [self.tree_feats.get_features(tree, {})
                  for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(self.X)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    self.y = [self.da_feats.get_features(None, {'da': da})
              for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)

    # initialize I/O shapes
    self.input_shape = [list(self.X[0].shape)]
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
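
# A small sketch of the 1-hot transformation used above (the feature dicts are
# made up; tgen's DictVectorizer mirrors the scikit-learn API):
vect = DictVectorizer(sparse=False, binarize_numeric=True)
feats = [{'dat_inform': 1}, {'dat_request': 1}]
y = vect.fit_transform(feats)  # dense 2x2 binary matrix, one column per feature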
def can_generate(self, tree, da):
    """Check if the candidate generator can generate a given tree at all.

    This is for debugging purposes only.

    Tries if get_all_successors always returns a successor that leads to the given
    tree (puts on the open list only successors that are subtrees of the given tree).
    """
    self.init_run(da)
    open_list = CandidateList({TreeData(): 1})
    found = False
    tree_no = 0

    while open_list and not found:
        cur_st, _ = open_list.pop()
        if cur_st == tree:
            found = True
            break
        for succ in self.get_all_successors(cur_st):
            tree_no += 1
            # only push on the open list if the successor is still a subtree of the target tree
            if tree.common_subtree_size(succ) == len(succ):
                open_list.push(succ, len(succ))

    if not found:
        log_info('Did not find tree: ' + str(tree) + ' for DA: ' + str(da) +
                 (' (total %d trees)' % tree_no))
        return False

    log_info('Found tree: %s for DA: %s (as %d-th tree)' % (str(tree), str(da), tree_no))
    return tree_no
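
# Hypothetical usage: unlike can_generate_greedy, can_generate runs a full
# best-first search and returns the number of candidate trees generated, so it
# can double as a rough measure of search effort (candgen/tree/da stand for a
# trained generator and one training instance):
n = candgen.can_generate(tree, da)
if n is not False:
    log_info('Tree reachable after generating %d candidates' % n)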
def _beam_search(self, enc_inputs, da):
    """Run beam search decoding."""

    # true "batches" not implemented
    assert len(enc_inputs[0]) == 1

    # run greedy decoder for comparison (debugging purposes)
    log_debug("GREEDY DEC WOULD RETURN:\n" +
              " ".join(self.tree_embs.ids_to_strings(
                  [out_tok[0] for out_tok in self._greedy_decoding(enc_inputs, None)[0]])))

    # initialize
    self._init_beam_search(enc_inputs)
    empty_tree_emb = self.tree_embs.get_embeddings(TreeData())
    dec_inputs = cut_batch_into_steps([empty_tree_emb])

    paths = [self.DecodingPath(stop_token_id=self.tree_embs.STOP, dec_inputs=[dec_inputs[0]])]

    # beam search steps
    for step in xrange(len(dec_inputs)):

        new_paths = []

        for path in paths:
            out_probs, st = self._beam_search_step(path.dec_inputs, path.dec_states)
            new_paths.extend(path.expand(self.beam_size, out_probs, st))

        def cmp_func(p, q):
            """Length-weighted comparison of two paths' logprobs."""
            return cmp(p.logprob / (len(p) ** self.length_norm_weight),
                       q.logprob / (len(q) ** self.length_norm_weight))

        paths = sorted(new_paths, cmp=cmp_func, reverse=True)[:self.beam_size]

        if all([p.dec_inputs[-1] == self.tree_embs.VOID for p in paths]):
            break  # stop decoding if we have reached the end in all paths

        log_debug(("\nBEAM SEARCH STEP %d\n" % step) +
                  "\n".join([("%f\t" % p.logprob) +
                             " ".join(self.tree_embs.ids_to_strings([inp[0] for inp in p.dec_inputs]))
                             for p in paths]) + "\n")

    # rerank paths by their distance to the input DA
    if self.classif_filter or self.context_bleu_weight:
        paths = self._rerank_paths(paths, da)

    # measure slot error on the top k paths
    if self.slot_err_stats:
        for path in paths[:self.sample_top_k]:
            self.slot_err_stats.append(
                da, self.tree_embs.ids_to_strings([inp[0] for inp in path.dec_inputs]))

    # select the "best" path -- either the best, or one in top k
    if self.sample_top_k > 1:
        best_path = self._sample_path(paths[:self.sample_top_k])
    else:
        best_path = paths[0]

    # return just the best path (as token IDs)
    return np.array(best_path.dec_inputs)
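
# A standalone sketch of the length-weighted ranking used above: paths are
# ordered by logprob / len(path) ** alpha, where alpha is length_norm_weight.
# The (logprob, length) pairs below are made-up values.
def length_norm_score(logprob, length, alpha):
    return logprob / (length ** alpha)

cands = [(-4.0, 6), (-3.5, 4)]
best = max(cands, key=lambda c: length_norm_score(c[0], c[1], 0.8))
print best  # (-4.0, 6): the longer path wins after normalization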
def _get_greedy_decoder_output(self, enc_inputs, dec_inputs, compute_cost=False):
    """Run greedy decoding with the given inputs; return decoder outputs and the cost
    (if required). For ensemble decoding, the greedy search is implemented as a beam
    search with a beam size of 1.

    @param enc_inputs: encoder inputs (list of token IDs)
    @param dec_inputs: decoder inputs (list of token IDs)
    @param compute_cost: if True, decoding cost is computed (the dec_inputs must be valid trees)
    @return: a tuple of the list of decoder outputs + decoding cost (None if not required)
    """
    # TODO batches and cost computation not implemented
    assert len(enc_inputs[0]) == 1 and not compute_cost

    self._init_beam_search(enc_inputs)

    # for simplicity, this is implemented exactly like a beam search, but with a single path
    empty_tree_emb = self.tree_embs.get_embeddings(TreeData())
    dec_inputs = cut_batch_into_steps([empty_tree_emb])

    path = self.DecodingPath(stop_token_id=self.tree_embs.STOP, dec_inputs=[dec_inputs[0]])

    for step in xrange(len(dec_inputs)):
        out_probs, st = self._beam_search_step(path.dec_inputs, path.dec_states)
        path = path.expand(1, out_probs, st)[0]
        if path.dec_inputs[-1] == self.tree_embs.VOID:
            break  # stop decoding if we have reached the end of the path

    # return just token IDs, ignore cost computation here
    return np.array(path.dec_inputs), None
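
# Hypothetical sanity check (seq2seq/enc_inputs/da are placeholders; assumes
# beam_size == 1 and no reranker, filter, or top-k sampling configured): the
# beam search should then reproduce the greedy decoder's token IDs exactly.
greedy_ids, _ = seq2seq._greedy_decoding(enc_inputs, None)
beam_ids = seq2seq._beam_search(enc_inputs, da)
assert (greedy_ids == beam_ids).all()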
def ids_to_tree(self, emb):
    """Rebuild a tree from the embeddings (token IDs).

    @param emb: source embeddings (token IDs)
    @return: the corresponding tree
    """
    tree = TreeData()
    tree.nodes = []  # override the technical root -- the tree will be created including the technical root
    tree.parents = []

    # build the tree recursively (start at position 2 to skip the <GO> symbol and 1st opening bracket)
    self._create_subtree(tree, -1, emb, 2)
    return tree
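
# Illustration (hypothetical): these tree embeddings are bracketed sequences,
# roughly <GO> ( root ( child ... ) ... ) <STOP>, so decoding skips <GO> and
# the technical root's opening bracket before descending recursively. For a
# tree within the configured size limits, a round trip should hold
# (orig_tree/tree_embs are placeholders):
restored = tree_embs.ids_to_tree(tree_embs.get_embeddings(orig_tree))
assert restored == orig_tree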
def _greedy_decoding(self, enc_inputs, gold_trees):
    """Run greedy decoding with the given encoder inputs; optionally use given gold
    trees as decoder inputs for cost computation."""

    # prepare decoder inputs (either fake, or true but used just for cost computation)
    if gold_trees is None:
        empty_tree_emb = self.tree_embs.get_embeddings(TreeData())
        dec_inputs = cut_batch_into_steps([empty_tree_emb for _ in enc_inputs[0]])
    else:
        dec_inputs = cut_batch_into_steps([self.tree_embs.get_embeddings(tree)
                                           for tree in gold_trees])

    # run the decoding itself
    dec_output_ids, dec_cost = self._get_greedy_decoder_output(
        enc_inputs, dec_inputs, compute_cost=gold_trees is not None)

    return dec_output_ids, dec_cost
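
# Hypothetical usage (seq2seq/enc_inputs/gold_trees are placeholders): without
# gold trees, only the outputs are of interest; with gold trees, the same call
# also yields the decoding cost where the underlying decoder supports it.
out_ids, _ = seq2seq._greedy_decoding(enc_inputs, None)
out_ids, cost = seq2seq._greedy_decoding(enc_inputs, gold_trees)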
def ids_to_tree(self, emb, postprocess=True):
    """Create a fake (flat) t-tree from token embeddings (IDs).

    @param emb: source embeddings (token IDs)
    @param postprocess: accepted for API compatibility, but unused here
    @return: the corresponding tree
    """
    tree = TreeData()
    tokens = self.ids_to_strings(emb)

    for token in tokens:
        if token in ['<GO>', '<STOP>', '<VOID>']:
            continue
        tree.create_child(0, len(tree), NodeData(token, 'x'))

    return tree
def _init_training(self, das, trees, data_portion):
    """Initialize training.

    Store input data, initialize 1-hot feature representations for input and output
    and transform training data accordingly, initialize the classification neural
    network.

    @param das: name of source file with training DAs, or list of DAs
    @param trees: name of source file with corresponding trees/sentences, or list of trees
    @param data_portion: portion of the training data to be used (0.0-1.0)
    """
    # read input from files or take it directly from parameters
    if not isinstance(das, list):
        log_info('Reading DAs from ' + das + '...')
        das = read_das(das)
    if not isinstance(trees, list):
        log_info('Reading t-trees from ' + trees + '...')
        ttree_doc = read_ttrees(trees)
        if self.mode == 'tokens':
            tokens = tokens_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tokens)
        elif self.mode == 'tagged_lemmas':
            tls = tagged_lemmas_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tls, use_tags=True)
        else:
            trees = trees_from_doc(ttree_doc, self.language, self.selector)
    elif self.mode in ['tokens', 'tagged_lemmas']:
        trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # ignore contexts, if they are contained in the DAs
    if isinstance(self.train_das[0], tuple):
        self.train_das = [da for (context, da) in self.train_das]
    # delexicalize if DAs are lexicalized and we don't want that
    if self.delex_slots:
        self.train_das = [da.get_delexicalized(self.delex_slots) for da in self.train_das]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    empty_da = DA.parse('inform()')
    self.train_das.append(empty_da)

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree)
                           for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.X = [self.tree_feats.get_features(tree, {})
                  for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(self.X)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    self.y = [self.da_feats.get_features(None, {'da': da})
              for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)

    log_info('Number of binary classes: %d.' % len(self.da_vect.get_feature_names()))

    # initialize I/O shapes
    if not self.tree_embs:
        self.input_shape = list(self.X[0].shape)
    else:
        self.input_shape = self.tree_embs.get_embeddings_shape()
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
    # initialize the NN variables
    self.session.run(tf.global_variables_initializer())
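
# Sketch of the DA delexicalization step above (the DA string and slot set are
# made up; DA.parse and get_delexicalized are used as in the code above):
da = DA.parse('inform(food=Chinese)')
delex_da = da.get_delexicalized(set(['food']))  # the 'food' value is replaced by a placeholder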
from tgen.planner import CandidateList
from tgen.tree import TreeData, NodeData
import random
import zlib

random.seed(1206)

l = CandidateList()

for i in xrange(10000):
    # l[str(i)] = random.randint(0, 100)
    # l[str(random.randint(0, 1000))] = random.randint(0, 100)
    # l[(str(random.randint(0, 1000)), str(random.randint(0, 1000)))] = random.randint(0, 100)
    # tree = TreeData()
    # tree.create_child(0, 1, NodeData(str(random.randint(0, 1000)), str(random.randint(0, 1000))))
    # l[tree] = random.randint(0, 100)
    tree = TreeData()
    for j in xrange(random.randint(1, 10)):
        tree.create_child(random.randint(0, len(tree) - 1),
                          random.randint(0, 1) == 1,
                          NodeData(str(random.randint(0, 1000)), str(random.randint(0, 1000))))
    l[tree] = random.randint(0, 100)

x = []
while l:
    x.append(l.pop())

print zlib.crc32(str(x))
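
# A smaller deterministic illustration of the same structure (assuming, as the
# best-first search above relies on, that pop() returns the lowest-valued entry
# first as a (key, value) pair):
pq = CandidateList({'b': 2, 'a': 1})
pq['c'] = 0
print pq.pop()  # expected: ('c', 0)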