def _beam_search(self, enc_inputs, da):
    """Run beam search decoding.

    @param enc_inputs: encoder inputs (only a single instance is supported, see the assertion)
    @param da: the input DA; passed on to path reranking and to slot error statistics
    @return: the decoder inputs of the selected path, converted to a numpy array
    """
    # true "batches" not implemented
    assert len(enc_inputs[0]) == 1

    # run greedy decoder for comparison (debugging purposes)
    greedy_output = self._greedy_decoding(enc_inputs, None)[0]
    log_debug("GREEDY DEC WOULD RETURN:\n" +
              " ".join(self.tree_embs.ids_to_strings([out_tok[0] for out_tok in greedy_output])))

    # initialize
    self._init_beam_search(enc_inputs)
    empty_tree_emb = self.tree_embs.get_embeddings(TreeData())
    dec_inputs = cut_batch_into_steps([empty_tree_emb])

    paths = [self.DecodingPath(stop_token_id=self.tree_embs.STOP, dec_inputs=[dec_inputs[0]])]

    def norm_logprob(path):
        """Length-weighted log-probability of a path (the beam ranking criterion)."""
        return path.logprob / (len(path) ** self.length_norm_weight)

    # beam search steps
    for step in range(len(dec_inputs)):

        new_paths = []
        for path in paths:
            out_probs, st = self._beam_search_step(path.dec_inputs, path.dec_states)
            new_paths.extend(path.expand(self.beam_size, out_probs, st))

        # keep only the beam_size best expansions; a key function (instead of the
        # Python-2-only cmp= argument) yields the same stable, descending order
        paths = sorted(new_paths, key=norm_logprob, reverse=True)[:self.beam_size]

        if all(p.dec_inputs[-1] == self.tree_embs.VOID for p in paths):
            break  # stop decoding if we have reached the end in all paths

        log_debug(("\nBEAM SEARCH STEP %d\n" % step) +
                  "\n".join([("%f\t" % p.logprob) +
                             " ".join(self.tree_embs.ids_to_strings(
                                 [inp[0] for inp in p.dec_inputs]))
                             for p in paths]) + "\n")

    # rerank paths by their distance to the input DA
    if self.classif_filter or self.context_bleu_weight:
        paths = self._rerank_paths(paths, da)

    # measure slot error on the top k paths
    if self.slot_err_stats:
        for path in paths[:self.sample_top_k]:
            self.slot_err_stats.append(
                da, self.tree_embs.ids_to_strings([inp[0] for inp in path.dec_inputs]))

    # select the "best" path -- either the best, or one in top k
    if self.sample_top_k > 1:
        best_path = self._sample_path(paths[:self.sample_top_k])
    else:
        best_path = paths[0]

    # return just the best path (as token IDs)
    return np.array(best_path.dec_inputs)
def get_all_successors(self, cand_tree):
    """Create all trees that arise from the candidate tree by adding a single node.

    The expansion respects the current CDFs and the node number limits (if set).
    NB: This assumes projectivity (will never create a non-projective tree).

    @param cand_tree: The current candidate tree to be expanded
    @return: list of successor trees (possibly empty)
    """
    # TODO possibly avoid creating TreeNode instances for iterating
    nodes = TreeNode(cand_tree).get_descendants(add_self=1, ordered=1)
    limits = self.cur_limits
    level_counts = defaultdict(int)

    if limits is not None:
        # the tree is already as big as allowed -- nothing to add
        if len(nodes) >= limits['total']:
            return []
        # count the nodes on the individual depth levels
        for node in nodes:
            level_counts[node.get_depth()] += 1

    successors = []

    # each node is considered as a potential parent of the new node
    for parent_idx, parent in enumerate(nodes):
        parent_id = self._parent_node_id(parent)

        # this parent cannot take any more children
        if len(parent.get_children()) >= self.max_children.get(parent_id, 0):
            continue
        if parent_id not in self.cur_cdfs:
            continue

        # the level below this parent is already full
        if limits is not None:
            child_depth = parent.get_depth() + 1
            if level_counts[child_depth] >= limits[child_depth]:
                continue

        # all formeme/t-lemma/direction variants of the new child
        for formeme, t_lemma, right in [cdf_item[0] for cdf_item in self.cur_cdfs[parent_id]]:
            # variant 1: the child directly follows/precedes the parent
            new_tree = cand_tree.clone()
            new_tree.create_child(parent_idx, right, NodeData(t_lemma, formeme))
            successors.append(new_tree)

            # variants 2..n: the child is placed after/before the subtree of each of the
            # parent's existing right/left children (for right/left child, respectively)
            offset = 1 if right else 0
            sibling_idxs = cand_tree.children_idxs(parent_idx,
                                                   left_only=not right, right_only=right)
            for sibling_idx in sibling_idxs:
                new_tree = cand_tree.clone()
                bound = new_tree.subtree_bound(sibling_idx, right)
                new_tree.create_child(parent_idx, bound + offset, NodeData(t_lemma, formeme))
                successors.append(new_tree)

    # with a tree classifier at hand, drop all successors that talk about something
    # not present in the current DA
    if self.classif and successors:
        verdicts = self.classif.is_subset_of_cur_da(successors)
        kept = [tree for tree, is_sub in zip(successors, verdicts) if is_sub]
        if len(successors) > len(kept):
            log_debug('Tree classification reduced successors %d -> %d'
                      % (len(successors), len(kept)))
        successors = kept

    return successors
def _init_training(self, das_file, ttree_file, data_portion):
    """Prepare training: run the parent class initialization, set up the neural network
    and reset the per-iteration weight bookkeeping.

    @param das_file: passed on unchanged to the parent class initialization
    @param ttree_file: passed on unchanged to the parent class initialization
    @param data_portion: passed on unchanged to the parent class initialization
    """
    # the parent class loads data, determines number of features etc.
    parent = super(SimpleNNRanker, self)
    parent._init_training(das_file, ttree_file, data_portion)
    self._init_neural_network()

    # start with an empty weight history
    self.w_after_iter = []
    self.update_weights_sum()

    # log the initial state
    log_debug('\n***\nINIT:')
    initial_feats = self._feat_val_str()
    log_debug(initial_feats)
    log_info('Training ...')
def lexicalize(self, gen_trees, abst_file):
    """Fill in concrete values for delexicalized ('X-<slot>') nodes of the generated trees
    (which may represent trees, tokens, or tagged lemmas). The trees are changed in place.

    Expects lexicalization file (and surface forms file) to be loaded in the Lexicalizer
    object, otherwise nothing will happen. The actual operation depends on the generator
    mode (`self.mode`).

    @param gen_trees: list of TreeData objects representing generated trees/tokens/tagged lemmas
    @param abst_file: abstraction/delexicalization instructions file path
    @return: None
    """
    abstss = smart_load_absts(abst_file, len(gen_trees))

    for sent_no, (tree, absts) in enumerate(zip(gen_trees, abstss), start=1):
        log_debug("Lexicalizing sentence %d: %s" % (sent_no, str(tree)))
        sent = self._tree_to_sentence(tree)
        log_debug(str(sent))

        for idx, tok in enumerate(sent):
            # only delexicalized placeholders are of interest
            if not (tok and tok.startswith('X-')):
                continue
            slot = tok[2:]
            # is there a value to substitute for this slot?
            abst = self._first_abst(absts, slot)
            if not abst:
                continue

            if self.mode == 'tagged_lemmas':
                # tagged lemmas: one token with appropriate value
                tag = sent[idx+1] if idx < len(sent) - 1 else None
                val = self.get_surface_form(sent, idx, slot, abst.value, tag=tag)
                tree.nodes[idx+1] = NodeData(t_lemma=val, formeme='x')
            elif self.mode == 'trees':
                # trees: one node with appropriate value, keep formeme
                formeme = sent[idx+1] if idx < len(sent) - 1 else None
                val = self.get_surface_form(sent, idx, slot, abst.value, formeme=formeme)
                tree.nodes[old_div(idx,2)+1] = NodeData(t_lemma=val,
                                                        formeme=tree[old_div(idx,2)+1].formeme)
            else:
                # tokens: one token with all words from the value (postprocessed below)
                val = self.get_surface_form(sent, idx, slot, abst.value)
                tree.nodes[idx+1] = NodeData(t_lemma=val, formeme='x')

            sent[idx] = val  # save value to be used in LM next time

        # postprocess tokens (split multi-word nodes into one node per word)
        if self.mode == 'tokens':
            idx = 1
            while idx < len(tree):
                lemma = tree[idx].t_lemma
                if ' ' in lemma:
                    words = lemma.split(' ')
                    tree.remove_node(idx)
                    for shift, word in enumerate(words):
                        tree.create_child(0, idx + shift, NodeData(t_lemma=word, formeme='x'))
                    # skip over the newly inserted nodes
                    idx += len(words) - 1
                idx += 1
def _rerank_paths(self, paths, da):
    """Rerank the n-best decoded paths according to the reranking classifier and/or BLEU
    against context.

    The `logprob` attributes of the paths are adjusted in place.

    @param paths: the decoded paths to be reranked
    @param da: the input DA; `da[0]` is used as the context for BLEU (if enabled), \
        the whole value is handed to the classifier filter (if enabled)
    @return: a new list of the same paths, sorted by the adjusted logprob (best first)
    """
    trees = [self.tree_embs.ids_to_tree(np.array(path.dec_inputs).transpose()[0])
             for path in paths]

    # rerank using BLEU against context if set to do so
    if self.context_bleu_weight:
        bm = BLEUMeasure(max_ngram=2)
        bleus = []
        for path, tree in zip(paths, trees):
            bm.reset()
            bm.append([(n.t_lemma, None) for n in tree.nodes[1:]], [da[0]])
            bleu = (bm.ngram_precision()
                    if self.context_bleu_metric == 'ngram_prec'
                    else bm.bleu())
            bleus.append(bleu)
            path.logprob += self.context_bleu_weight * bleu

        log_debug(("BLEU for context: %s\n\n" % " ".join([form for form, _ in da[0]])) +
                  "\n".join([("%.5f\t" % b) + " ".join([n.t_lemma for n in t.nodes[1:]])
                             for b, t in zip(bleus, trees)]))

    # add distances to logprob so that non-fitting will be heavily penalized
    if self.classif_filter:
        self.classif_filter.init_run(da)
        fits = self.classif_filter.dist_to_cur_da(trees)
        for path, fit in zip(paths, fits):
            path.logprob -= self.misfit_penalty * fit

        log_debug(("Misfits for DA: %s\n\n" % str(da)) +
                  "\n".join([("%.5f\t" % fit) +
                             " ".join([unicode(n.t_lemma) for n in tree.nodes[1:]])
                             for fit, tree in zip(fits, trees)]))

    # adjust paths for length (if set to do so)
    if self.length_norm_weight:
        for path in paths:
            path.logprob /= len(path) ** self.length_norm_weight

    # a key function gives the same stable descending order as a cmp-based sort on logprob,
    # and unlike the cmp= argument it is not limited to Python 2
    return sorted(paths, key=lambda p: p.logprob, reverse=True)
def generate_tree(self, da, gen_doc=None):
    """Generate a tree for a single DA and optionally store it in a document.

    @param da: the input DA
    @param gen_doc: the document where the tree should be saved (defaults to None)
    @return: the generated tree
    """
    log_debug("GENERATE TREE FOR DA: " + unicode(da))
    generated = self.process_das([da])[0]
    log_debug("RESULT: %s" % unicode(generated))

    # no document given -- just hand back the result
    if not gen_doc:
        return generated

    # store the tree and the DA it was generated from in the target zone of the document
    target_zone = self.get_target_zone(gen_doc)
    target_zone.ttree = generated.create_ttree()
    target_zone.sentence = unicode(da)
    return generated
def _rerank_paths(self, paths, da):
    """Rerank the n-best decoded paths according to the reranking classifier and/or BLEU
    against context.

    Note that the `logprob` attributes of the given paths are modified in place.

    @param paths: the decoded paths to be reranked
    @param da: the input DA; its first item (`da[0]`) serves as the BLEU context (if BLEU \
        reranking is on), the whole value initializes the classifier filter (if it is on)
    @return: the paths in a new list, ordered from the highest adjusted logprob to the lowest
    """
    trees = []
    for path in paths:
        token_ids = np.array(path.dec_inputs).transpose()[0]
        trees.append(self.tree_embs.ids_to_tree(token_ids))

    # rerank using BLEU against context if set to do so
    if self.context_bleu_weight:
        use_ngram_prec = self.context_bleu_metric == 'ngram_prec'
        bm = BLEUMeasure(max_ngram=2)
        bleus = []
        for path, tree in zip(paths, trees):
            bm.reset()
            bm.append([(n.t_lemma, None) for n in tree.nodes[1:]], [da[0]])
            bleu = bm.ngram_precision() if use_ngram_prec else bm.bleu()
            bleus.append(bleu)
            path.logprob += self.context_bleu_weight * bleu

        bleu_lines = [("%.5f\t" % b) + " ".join([n.t_lemma for n in t.nodes[1:]])
                      for b, t in zip(bleus, trees)]
        log_debug(("BLEU for context: %s\n\n" % " ".join([form for form, _ in da[0]])) +
                  "\n".join(bleu_lines))

    # add distances to logprob so that non-fitting will be heavily penalized
    if self.classif_filter:
        self.classif_filter.init_run(da)
        fits = self.classif_filter.dist_to_cur_da(trees)
        for path, fit in zip(paths, fits):
            path.logprob -= self.misfit_penalty * fit

        misfit_lines = [("%.5f\t" % fit) + " ".join([unicode(n.t_lemma) for n in tree.nodes[1:]])
                        for fit, tree in zip(fits, trees)]
        log_debug(("Misfits for DA: %s\n\n" % str(da)) + "\n".join(misfit_lines))

    # adjust paths for length (if set to do so)
    if self.length_norm_weight:
        for path in paths:
            path.logprob /= len(path) ** self.length_norm_weight

    # sort via a key function: same stable descending order as comparing logprobs pairwise,
    # without relying on the cmp= argument that only Python 2 supports
    return sorted(paths, key=lambda p: p.logprob, reverse=True)
def _update_nn(self, bad_feats, good_feats, rate):
    """Run one NN update, wrapping the features to match the array-based update call,
    and log the cost plus norms of the parameters, gradients and parts of layer 1.

    @param bad_feats: a pair of feature values for the bad example
    @param good_feats: a pair of feature values for the good example
    @param rate: the value passed as the last argument of the update call
    """
    # TODO: this is just adding another dimension to fit the parallelized scoring
    # (even if updates are not parallelized). Make it nicer.
    cost_gcost = self.nn.update([bad_feats[0]], [bad_feats[1]],
                                [good_feats[0]], [good_feats[1]],
                                rate)
    log_debug('Cost:' + str(cost_gcost[0]))

    param_vals = [param.get_value() for param in self.nn.params]
    log_debug('Param norms : ' + str(self._l2s(param_vals)))
    log_debug('Gparam norms: ' + str(self._l2s(cost_gcost[1:])))

    # row ranges of the layer 1 matrix that are reported separately
    row_ranges = [(0, 100), (100, 200), (200, 350), (350, 500), (500, None)]

    def split_rows(matrix):
        """Cut the matrix into the row blocks given by row_ranges."""
        return [matrix[start:end, :] for start, end in row_ranges]

    log_debug('Layer 1 parts :' + str(self._l2s(split_rows(param_vals[2]))))
    log_debug('Layer 1 gparts:' + str(self._l2s(split_rows(cost_gcost[3]))))
def get_surface_form(self, sentence, pos, possible_forms):
    """Select one of the possible surface forms for the given position in the sentence,
    based on the scores obtained by running `self._logits` on the sentence.

    @param sentence: the sentence (a sliceable sequence of tokens)
    @param pos: position in the sentence for which the form is selected
    @param possible_forms: the candidate forms (strings) to choose from
    @return: the selected form -- sampled according to the forms' probabilities if \
        `self._sample` is set, the most probable one otherwise
    """
    log_debug("Pos: %d, forms: %s" % (pos, unicode(", ".join(possible_forms))))

    # don't use whole sentence if it's too long: drop the beginning so that the target
    # position ends up at index max_sent_len - 1. The shift must be applied to the sentence
    # *and* the position consistently (slicing with the already-updated position would
    # leave the sentence untouched).
    if pos >= self.max_sent_len:
        shift = pos - self.max_sent_len + 1
        sentence = sentence[shift:]
        pos -= shift

    # get unnormalized scores for the whole vocabulary
    inputs = np.array([self._sent_to_ids(sentence)[:-1]], dtype=np.int32)
    logits = self.session.run([self._logits], {self._inputs: inputs})

    # pick out scores for possible forms (unknown forms fall back to the <UNK> entry)
    unk_id = self.vocab.get('<UNK>')
    form_ids = [self.vocab.get(form.lower(), unk_id) for form in possible_forms]
    scores = [logits[0][pos][form_id] for form_id in form_ids]
    probs = softmax(scores)

    # log the vocabulary ID of each individual form
    log_debug("Vocab: %s" % unicode(", ".join([unicode(form_id) for form_id in form_ids])))
    log_debug("Scores: %s, Probs: %s" % (unicode(", ".join(["%.3f" % s for s in scores])),
                                         unicode(", ".join(["%.3f" % p for p in probs]))))

    # sample from the prob. dist.
    if self._sample:
        return np.random.choice(possible_forms, p=probs)

    # get just the most probable option
    max_idx, _ = max(enumerate(probs), key=operator.itemgetter(1))
    return possible_forms[max_idx]
def generate_tree(self, da, gen_doc=None):
    """Generate a tree for a single DA and optionally store it in a "document".

    @param da: the input DA
    @param gen_doc: where the result should be saved -- either a plain list (only the \
        generated tokens are appended), or a document object (the full tree is stored); \
        defaults to None
    @return: the generated tree
    """
    log_debug("GENERATE TREE FOR DA: " + unicode(da))
    generated = self.process_das([da])[0]
    log_debug("RESULT: %s" % unicode(generated))

    if isinstance(gen_doc, list):
        # plain lists: tokens only, disregarding syntax. The technical root is skipped,
        # just "lemmas" are taken, and None is kept in place of POS tags.
        tokens = [(node.t_lemma, None) for node in generated.nodes[1:]]
        gen_doc.append(tokens)
        return generated

    if gen_doc:
        # full Pytreex documents (full trees)
        target_zone = self.get_target_zone(gen_doc)
        target_zone.ttree = generated.create_ttree()
        target_zone.sentence = unicode(da)

    return generated
def _check_pending_request(self, sc, job_no, req):
    """Check whether the given request has finished (i.e., a job is loaded or a job has
    processed the given data portion).

    A finished request is removed from the set of pending requests. If it was a loading
    request (`job_no` is None), the worker is also added to the pool of free services.

    @param sc: a ServiceConn object that stores the worker connection parameters
    @param job_no: current job number (is None for jobs loading)
    @param req: the request itself
    @return: the value of the finished request, or None for unfinished requests
    """
    is_numbered_job = job_no is not None
    if is_numbered_job:
        log_debug('Checking %d' % job_no)

    # nothing to do until the request has finished
    if not req.ready:
        return None

    if is_numbered_job:
        log_debug('Ready %d' % job_no)
        log_info('Retrieved finished request %d' % job_no)
    if req.error:
        reported_no = job_no if is_numbered_job else -1
        log_info('Error found on request: job #%d, worker %s:%d'
                 % (reported_no, sc.host, sc.port))
    result = req.value

    # remove from list of pending requests
    # TODO return to pool of free requests (but needs to store the results somewhere)
    self.pending_requests.remove((sc, job_no, req))
    if not is_numbered_job:
        self.free_services.append(sc)

    return result
def _update_nn(self, bad_feats, good_feats, rate):
    """Run one NN update, wrapping the features to match the array-based update call,
    and log the cost and the norms of the parameters and gradients.

    @param bad_feats: a pair of feature values for the bad example
    @param good_feats: a pair of feature values for the good example
    @param rate: the value passed as the last argument of the update call
    """
    # TODO: this is just adding another dimension to fit the parallelized scoring
    # (even if updates are not parallelized). Make it nicer.
    update_args = ([bad_feats[0]], [bad_feats[1]], [good_feats[0]], [good_feats[1]], rate)
    cost_gcost = self.nn.update(*update_args)

    cost, gparams = cost_gcost[0], cost_gcost[1:]
    log_debug('Cost:' + str(cost))

    current_params = [param.get_value() for param in self.nn.params]
    log_debug('Param norms : ' + str(self._l2s(current_params)))
    log_debug('Gparam norms: ' + str(self._l2s(gparams)))
def append(self, gold_tree, open_list, close_list):
    """Update the statistics with one generator run: check whether the gold-standard tree
    is the top item of the close list, and whether it is contained in the close and open
    lists.

    @param gold_tree: the gold-standard tree
    @param open_list: the open list of the generator
    @param close_list: the close list of the generator
    """
    self.total += 1

    # is the gold tree the top item of the close list?
    if gold_tree == close_list.peek()[0]:
        self.gold_best += 1
        log_debug('GOLD TREE IS BEST')

    # is the gold tree present on either list?
    found_on_close = gold_tree in close_list
    if found_on_close:
        self.gold_on_close += 1
        log_debug('GOLD TREE IS ON CLOSE LIST')

    found_on_open = gold_tree in open_list
    if found_on_open:
        self.gold_on_open += 1
        log_debug('GOLD TREE IS ON OPEN LIST')
def _training_pass(self, pass_no):
    """Run a single training pass over all batches of the training data and print
    the pass statistics (duration, summed cost, summed classification difference).

    @param pass_no: number of the current pass (for logging and statistics)
    """
    started_at = time.time()

    log_debug('\n***\nTR %05d:' % pass_no)
    log_debug("Train order: " + str(self.train_order))

    total_cost = 0
    total_diff = 0

    for tree_nos in self.batches():
        log_debug('TREE-NOS: ' + str(tree_nos))
        log_debug("\n".join(unicode(self.train_trees[i]) + "\n" + unicode(self.train_das[i])
                            for i in tree_nos))
        log_debug('Y: ' + str(self.y[tree_nos]))

        # classify with the current parameters first, update afterwards
        results = self.classif.classif(self.X[tree_nos])
        cost_gcost = self.classif.update(self.X[tree_nos], self.y[tree_nos], self.alpha)
        batch_cost = cost_gcost[0]

        # binarize the outputs (threshold 0.5) and compare them with the targets
        bin_result = np.array([[1. if r > 0.5 else 0. for r in result] for result in results])
        batch_diff = np.sum(np.abs(self.y[tree_nos] - bin_result))

        log_debug('R: ' + str(bin_result))
        log_debug('COST: %f' % batch_cost)
        log_debug('DIFF: %d' % batch_diff)

        total_cost += batch_cost
        total_diff += batch_diff

    # print statistics
    elapsed = datetime.timedelta(seconds=(time.time() - started_at))
    self._print_pass_stats(pass_no, elapsed, total_cost, total_diff)
def get_all_successors(self, cand_tree):
    """Return every tree obtainable from the candidate tree by attaching one new node,
    subject to the current CDFs and node number limits (if any).

    NB: This assumes projectivity (will never create a non-projective tree).

    @param cand_tree: The current candidate tree to be expanded
    @return: list of all successor trees (empty if nothing can be added)
    """
    # TODO possibly avoid creating TreeNode instances for iterating
    nodes = TreeNode(cand_tree).get_descendants(add_self=1, ordered=1)
    nodes_on_level = defaultdict(int)
    limited = self.cur_limits is not None

    if limited:
        # maximum number of nodes reached -- no successors at all
        if len(nodes) >= self.cur_limits['total']:
            return []
        # remember number of nodes on all levels
        for node in nodes:
            depth = node.get_depth()
            nodes_on_level[depth] += 1

    created = []

    for node_num, node in enumerate(nodes):
        parent_id = self._parent_node_id(node)

        # nodes that can't have more children are skipped
        child_count = len(node.get_children())
        if child_count >= self.max_children.get(parent_id, 0) or parent_id not in self.cur_cdfs:
            continue

        # so are nodes above levels where the maximum number of nodes has been reached
        if limited:
            child_depth = node.get_depth() + 1
            if nodes_on_level[child_depth] >= self.cur_limits[child_depth]:
                continue

        # go over all formeme/t-lemma/direction variants of a new child under this parent
        variants = [item[0] for item in self.cur_cdfs[parent_id]]
        for formeme, t_lemma, right in variants:

            # first, the child directly following/preceding the parent
            succ_tree = cand_tree.clone()
            succ_tree.create_child(node_num, right, NodeData(t_lemma, formeme))
            created.append(succ_tree)

            # then, if the parent already has some left/right children, all possible
            # positions before/after their subtrees (for left/right child, respectively)
            for child_idx in cand_tree.children_idxs(node_num,
                                                     left_only=not right, right_only=right):
                succ_tree = cand_tree.clone()
                position = succ_tree.subtree_bound(child_idx, right)
                if right:
                    position = position + 1
                else:
                    position = position + 0
                succ_tree.create_child(node_num, position, NodeData(t_lemma, formeme))
                created.append(succ_tree)

    # if we have the tree classifier available, discard all successors that talk about
    # something not present in the current DA
    if self.classif and created:
        orig_len = len(created)
        is_subset = self.classif.is_subset_of_cur_da(created)
        created = [tree for tree, is_sub in zip(created, is_subset) if is_sub]
        final_len = len(created)
        if final_len < orig_len:
            log_debug('Tree classification reduced successors %d -> %d' % (orig_len, final_len))

    return created
def _training_pass(self, pass_no):
    """Perform one training pass through the whole training data, print statistics.

    If `self.train_summary_dir` is set, the Tensorboard summary computed for the last batch
    of the pass is written out under the pass number.

    @param pass_no: number of the current pass (for logging, statistics and summaries)
    @return: a pair of the summed cost and the summed classification difference of the pass
    """
    pass_start_time = time.time()

    log_debug('\n***\nTR %05d:' % pass_no)
    log_debug("Train order: " + str(self.train_order))

    pass_cost = 0
    pass_diff = 0
    # Evaluated summary of the last processed batch. It stays None if there are no batches,
    # so that the write-out below cannot hit an unbound local.
    train_summary = None

    for tree_nos in self._batches():

        log_debug('TREE-NOS: ' + str(tree_nos))
        log_debug("\n".join(unicode(self.train_trees[i]) + "\n" + unicode(self.train_das[i])
                            for i in tree_nos))
        log_debug('Y: ' + str(self.y[tree_nos]))

        fd = {self.targets: self.y[tree_nos]}
        self._add_inputs_to_feed_dict(self.X[tree_nos], fd)
        if self.train_summary_dir:  # also compute Tensorboard summaries
            results, cost, _, train_summary = self.session.run(
                [self.outputs, self.cost, self.train_func, self.train_summary_op],
                feed_dict=fd)
        else:
            results, cost, _ = self.session.run([self.outputs, self.cost, self.train_func],
                                                feed_dict=fd)

        # binarize the outputs (threshold 0) and compare them with the targets
        bin_result = np.array([[1. if r > 0 else 0. for r in result] for result in results])
        batch_diff = np.sum(np.abs(self.y[tree_nos] - bin_result))

        log_debug('R: ' + str(bin_result))
        log_debug('COST: %f' % cost)
        log_debug('DIFF: %d' % batch_diff)

        pass_cost += cost
        pass_diff += batch_diff

    # print and return statistics
    self._print_pass_stats(pass_no,
                           datetime.timedelta(seconds=(time.time() - pass_start_time)),
                           pass_cost, pass_diff)

    # Tensorboard: iteration summary (only if at least one batch produced one)
    if self.train_summary_dir and train_summary is not None:
        self.train_summary_writer.add_summary(train_summary, pass_no)

    return pass_cost, pass_diff
def seq2seq_gen(args):
    """Sequence-to-sequence generation: load a trained generator, generate outputs for all
    input DAs, optionally evaluate them against a reference file and write them out.

    @param args: list of command-line arguments (see the argument parser below)
    """
    ap = ArgumentParser()
    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for tokens only)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            # each input becomes a (context, DA) pair
            das = list(zip(contexts, das))

    # prepare evaluation
    if args.eval_file is None or args.eval_file.endswith('.txt'):  # just tokens
        gen_doc = []
    else:  # Trees: depending on PyTreex
        from pytreex.core.document import Document
        eval_doc = read_ttrees(args.eval_file)
        # generate into a fresh document if the selectors clash, into the evaluation
        # document otherwise
        if args.ref_selector == args.target_selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    if args.eval_file:
        tgen.init_slot_err_stats()

    # generate
    log_info('Generating...')
    tgen.selector = args.target_selector  # override target selector for generation
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        tgen.generate_tree(da, gen_doc)

    # evaluate
    if args.eval_file is not None:
        log_info(tgen.get_slot_err_stats())
        # evaluate the generated tokens (F1 and BLEU scores)
        if args.eval_file.endswith('.txt'):
            lexicalize_tokens(gen_doc, lexicalization_from_doc(args.abstr_file))
            eval_tokens(das, read_tokens(args.eval_file, ref_mode=True), gen_doc)
        # evaluate the generated trees against golden trees
        else:
            eval_trees(das,
                       ttrees_from_doc(eval_doc, tgen.language, args.ref_selector),
                       ttrees_from_doc(gen_doc, tgen.language, args.target_selector),
                       eval_doc, tgen.language, tgen.selector)

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            write_tokens(gen_doc, args.output_file)
        else:
            write_ttrees(gen_doc, args.output_file)
def train(self, das_file, ttree_file, data_portion=1.0, context_file=None, validation_files=None): """Run parallel perceptron training, start and manage workers.""" # initialize the ranker instance log_info('Initializing...') # run server to process registering clients self._init_server() # spawn training jobs log_info('Spawning jobs...') host_short, _ = self.host.split('.', 1) # short host name for job names for j in range(self.jobs_number): # set up debugging logfile only if we have it on the head debug_logfile = ('"PRT%02d.debug-out.txt.gz"' % j) if is_debug_stream() else 'None' job = Job(header='from tgen.parallel_seq2seq_train import run_training', code=('run_training("%s", %d, %s)' % (self.host, self.port, debug_logfile)), name=self.experiment_id + ("PRT%02d-%s-%d" % (j, host_short, self.port)), work_dir=self.work_dir) job.submit(memory=self.job_memory, queue=self.queue_settings) self.jobs.append(job) # run the training passes try: cur_assign = 0 results = [None] * self.jobs_number rnd_seeds = [rnd.random() for _ in range(self.jobs_number)] # assign training and wait for it to finish while cur_assign < self.jobs_number or self.pending_requests: log_debug('Starting loop over services.') # check if some of the pending computations have finished for sc, job_no, req in list(self.pending_requests): res = self._check_pending_request(sc, job_no, req) if res is not None: results[job_no] = res, sc # check for free services and assign new computation while cur_assign < self.jobs_number and self.free_services: log_debug('Assigning request %d' % cur_assign) sc = self.free_services.popleft() log_info('Assigning request %d to %s:%d' % (cur_assign, sc.host, sc.port)) if validation_files is not None: validation_files = ','.join([os.path.relpath(f, self.work_dir) for f in validation_files.split(',')]) train_func = async(sc.conn.root.train) req = train_func(rnd_seeds[cur_assign], os.path.relpath(das_file, self.work_dir), os.path.relpath(ttree_file, self.work_dir), 
data_portion, os.path.relpath(context_file, self.work_dir) if context_file else None, validation_files) self.pending_requests.add((sc, cur_assign, req)) cur_assign += 1 log_debug('Assigned %d' % cur_assign) # sleep for a while log_debug('Sleeping.') time.sleep(self.poll_interval) log_info("Results:\n" + "\n".join("%.5f %s:%d" % (cost, sc.host, sc.port) for cost, sc in results)) self.model_temp_path = os.path.join(self.work_dir, self.TEMPFILE_NAME) results.sort(key=lambda res: res[0]) # average the computed models if self.average_models: log_info('Creating ensemble models...') # use only top k if required results_for_ensemble = (results[:self.average_models_top_k] if self.average_models_top_k > 0 else results) ensemble_model = self.build_ensemble_model(results_for_ensemble) log_info('Saving the ensemble model temporarily to %s...' % self.model_temp_path) ensemble_model.save_to_file(self.model_temp_path) # select the best result on devel data + save it else: best_cost, best_sc = results[0] log_info('Best cost: %f (computed at %s:%d).' % (best_cost, best_sc.host, best_sc.port)) log_info('Saving best generator temporarily to %s...' % self.model_temp_path) # use relative path (working directory of worker jobs is different) best_sc.conn.root.save_model(os.path.relpath(self.model_temp_path, self.work_dir)) # kill all jobs finally: for job in self.jobs: job.delete()
def _training_pass(self, pass_no):
    """Go through all training batches once, updating the classifier on each of them,
    and print the statistics of the pass (duration, total cost, total difference).

    @param pass_no: number of the current pass (for logging and statistics)
    """
    start = time.time()
    log_debug('\n***\nTR %05d:' % pass_no)
    log_debug("Train order: " + str(self.train_order))

    cost_sum, diff_sum = 0, 0
    for tree_nos in self.batches():
        # log the batch contents
        log_debug('TREE-NOS: ' + str(tree_nos))
        batch_dump = "\n".join(
            unicode(self.train_trees[i]) + "\n" + unicode(self.train_das[i]) for i in tree_nos)
        log_debug(batch_dump)
        log_debug('Y: ' + str(self.y[tree_nos]))

        # classification comes before the update, i.e. it uses the previous parameters
        results = self.classif.classif(self.X[tree_nos])
        cost_gcost = self.classif.update(self.X[tree_nos], self.y[tree_nos], self.alpha)

        # outputs above 0.5 count as 1, everything else as 0
        rows = []
        for result in results:
            rows.append([1. if r > 0.5 else 0. for r in result])
        bin_result = np.array(rows)

        log_debug('R: ' + str(bin_result))
        log_debug('COST: %f' % cost_gcost[0])
        diff = np.sum(np.abs(self.y[tree_nos] - bin_result))
        log_debug('DIFF: %d' % diff)

        cost_sum += cost_gcost[0]
        diff_sum += diff

    # print statistics
    duration = datetime.timedelta(seconds=(time.time() - start))
    self._print_pass_stats(pass_no, duration, cost_sum, diff_sum)
def seq2seq_gen(args):
    """Sequence-to-sequence generation: load a trained generator, generate a tree for each
    input DA, then optionally evaluate, lexicalize and write out the results.

    @param args: list of command-line arguments (see the argument specifications below)
    """
    # (names, options) for all command-line arguments, in the order they are registered
    arg_specs = (
        (('-e', '--eval-file'),
         dict(type=str, help='A ttree/text file for evaluation')),
        (('-a', '--abstr-file'),
         dict(type=str,
              help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')),
        (('-r', '--ref-selector'),
         dict(type=str, default='',
              help='Selector for reference trees in the evaluation file')),
        (('-t', '--target-selector'),
         dict(type=str, default='',
              help='Target selector for generated trees in the output file')),
        (('-d', '--debug-logfile'),
         dict(type=str, help='Debug output file name')),
        (('-w', '--output-file'),
         dict(type=str, help='Output tree/text file')),
        (('-b', '--beam-size'),
         dict(type=int, help='Override beam size for beam search decoding')),
        (('-c', '--context-file'),
         dict(type=str, help='Input ttree/text file with context utterances')),
        (('seq2seq_model_file',),
         dict(type=str, help='Trained Seq2Seq generator model')),
        (('da_test_file',),
         dict(type=str, help='Input DAs for generation')),
    )
    ap = ArgumentParser()
    for names, options in arg_specs:
        ap.add_argument(*names, **options)
    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator, overriding the beam size if requested
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files
    das = read_das(args.da_test_file)
    if args.context_file:
        if tgen.use_context or tgen.context_bleu_weight:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            # pair up each DA with its context
            das = list(zip(contexts, das))
        else:
            log_warn('Generator is not trained to use context, ignoring context input file.')

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
    log_info(tgen.get_slot_err_stats())

    text_eval = bool(args.eval_file) and args.eval_file.endswith('.txt')

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not text_eval:
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language, args.ref_selector,
                                args.target_selector or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if text_eval:
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True), gen_trees)

    # write output .yaml.gz or .txt
    if args.output_file is None:
        return
    log_info('Writing output...')
    if args.output_file.endswith('.txt'):
        write_tokens(gen_trees, args.output_file)
    else:
        out_doc = create_ttree_doc(gen_trees, eval_doc, tgen.language,
                                   args.target_selector or tgen.selector)
        write_ttrees(out_doc, args.output_file)
def asearch_gen(args):
    """A*search generation.

    Loads a candidate generator and a ranker, generates a tree for each input DA using
    the A*search planner and, if an evaluation file is given, compares the generated trees
    with the gold ones and logs the statistics.

    @param args: list of command-line arguments: options -e (evaluation t-trees file), \
-s (selector of the gold trees in the evaluation file), -d (debug log file), \
-w (output t-trees file), -c (configuration file), followed by exactly three file names \
(candidate generator model, ranker model, input DAs)
    """
    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        # the module docstring serves as the usage message; do not crash if there is none
        sys.exit('Invalid arguments.\n' + (__doc__ or ''))
    fname_cand_model, fname_rank_model, fname_da_test = files

    # imported only after the arguments have been validated
    from pytreex.core.document import Document

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        # generate into the evaluation document unless the gold trees use the generator's
        # own selector (presumably to avoid clashing with the gold trees -- TODO confirm)
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        gold_trees = trees_from_doc(eval_doc, tgen.language, eval_selector)
        for num, (da, gold_tree) in enumerate(zip(das, gold_trees), start=1):
            log_debug("\n\nTREE No. %03d" % num)
            gen_tree = tgen.generate_tree(da, gen_doc)
            lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list)
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" +
                          tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree) + "\n")

        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' %
                 lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(eval_doc.bundles, eval_ttrees,
                                                          gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f" %
                            p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))

            # collect overall stats
            evaler.append(eval_ttree, gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))

        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
                 evaler.common_substruct_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
def _training_pass(self, pass_no):
    """Perform one training pass through the whole training data, print statistics.

    @param pass_no: number of the current pass (used in log messages and as the step \
number of the Tensorboard summary)
    @return: a tuple of (total cost, total difference between binarized outputs and targets) \
accumulated over all batches of the pass
    """
    pass_start_time = time.time()

    log_debug('\n***\nTR %05d:' % pass_no)
    log_debug("Train order: " + str(self.train_order))

    pass_cost = 0
    pass_diff = 0
    # evaluated Tensorboard summary of the most recent batch (stays None if there are no batches)
    train_summary = None

    for tree_nos in self._batches():

        log_debug('TREE-NOS: ' + str(tree_nos))
        log_debug("\n".join(unicode(self.train_trees[i]) + "\n" + unicode(self.train_das[i])
                            for i in tree_nos))
        targets = self.y[tree_nos]
        log_debug('Y: ' + str(targets))

        fd = {self.targets: targets}
        self._add_inputs_to_feed_dict(self.X[tree_nos], fd)

        fetches = [self.outputs, self.cost, self.train_func]
        if self.train_summary_dir:  # also compute Tensorboard summaries
            fetches.append(self.train_summary_op)
            results, cost, _, train_summary = self.session.run(fetches, feed_dict=fd)
        else:
            results, cost, _ = self.session.run(fetches, feed_dict=fd)

        # binarize the outputs: 1 for positive values, 0 otherwise
        bin_result = (np.asarray(results) > 0).astype(float)
        batch_diff = np.sum(np.abs(targets - bin_result))

        log_debug('R: ' + str(bin_result))
        log_debug('COST: %f' % cost)
        log_debug('DIFF: %d' % batch_diff)

        pass_cost += cost
        pass_diff += batch_diff

    # print and return statistics
    self._print_pass_stats(pass_no, datetime.timedelta(seconds=(time.time() - pass_start_time)),
                           pass_cost, pass_diff)
    # Tensorboard: iteration summary (only the summary of the last batch is written)
    if self.train_summary_dir and train_summary is not None:
        self.train_summary_writer.add_summary(train_summary, pass_no)

    return pass_cost, pass_diff
def asearch_gen(args):
    """A*search generation.

    Loads a candidate generator and a ranker, generates a tree for each input DA using
    the A*search planner and, if an evaluation file is given, compares the generated trees
    with the gold ones and logs the statistics.

    @param args: list of command-line arguments: options -e (evaluation t-trees file), \
-s (selector of the gold trees in the evaluation file), -d (debug log file), \
-w (output t-trees file), -c (configuration file), followed by exactly three file names \
(candidate generator model, ranker model, input DAs)
    """
    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        # the module docstring serves as the usage message; do not crash if there is none
        sys.exit('Invalid arguments.\n' + (__doc__ or ''))
    fname_cand_model, fname_rank_model, fname_da_test = files

    # imported only after the arguments have been validated
    from pytreex.core.document import Document

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        # generate into the evaluation document unless the gold trees use the generator's
        # own selector (presumably to avoid clashing with the gold trees -- TODO confirm)
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        gold_trees = trees_from_doc(eval_doc, tgen.language, eval_selector)
        for num, (da, gold_tree) in enumerate(zip(das, gold_trees), start=1):
            log_debug("\n\nTREE No. %03d" % num)
            gen_tree = tgen.generate_tree(da, gen_doc)
            lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list)
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" +
                          tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree) + "\n")

        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' %
                 lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(eval_doc.bundles, eval_ttrees,
                                                          gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f" %
                            p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))

            # collect overall stats
            evaler.append(eval_ttree, gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))

        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
                 evaler.common_substruct_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
def seq2seq_gen(args):
    """Sequence-to-sequence generation.

    Parses the command-line arguments, loads a trained Seq2Seq generator, generates one tree
    for each input DA and then optionally writes the delexicalized outputs, evaluates,
    lexicalizes and writes out the final results.

    @param args: list of command-line arguments (options followed by the Seq2Seq model file \
name and the input DAs file name)
    """

    def write_trees_or_tokens(output_file, das, gen_trees, base_doc, language, selector):
        """Decide to write t-trees or tokens based on the output file name.

        @param output_file: output file name; tokens are written if it ends with '.txt', \
t-trees otherwise
        @param das: the input DAs (without contexts), used for token postprocessing
        @param gen_trees: the generated trees
        @param base_doc: document passed on to create_ttree_doc (or None)
        @param language: language passed on to create_ttree_doc
        @param selector: selector passed on to create_ttree_doc
        """
        if output_file.endswith('.txt'):
            gen_toks = [t.to_tok_list() for t in gen_trees]
            postprocess_tokens(gen_toks, das)
            write_tokens(gen_toks, output_file)
        else:
            write_ttrees(create_ttree_doc(gen_trees, base_doc, language, selector), output_file)

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))
    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-D', '--delex-output-file', type=str,
                    help='Output file for trees/text before lexicalization')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')
    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')
    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size
    uses_context = tgen.use_context or tgen.context_bleu_weight
    out_selector = args.target_selector or tgen.selector

    # read input files (DAs, contexts)
    # NB: whenever the generator uses context, `das` holds (context, DA) pairs after this step
    das = read_das(args.da_test_file)
    if args.context_file:
        if not uses_context:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = list(zip(contexts, das))
    elif uses_context:
        log_warn('Generator is trained to use context. ' +
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
        if num % 100 == 0:
            log_info("Generated tree %d" % num)
    log_info(tgen.get_slot_err_stats())

    if args.delex_output_file is not None:
        log_info('Writing delex output...')
        # `das` may still hold (context, DA) pairs here, but writing tokens needs plain DAs
        # (the same as in the final output below)
        plain_das = [da for _, da in das] if uses_context else das
        write_trees_or_tokens(args.delex_output_file, plain_das, gen_trees, None,
                              tgen.language, out_selector)

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language, args.ref_selector,
                                out_selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # we won't need contexts anymore, but we do need DAs
    if uses_context:
        das = [da for _, da in das]

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True),
                    [t.to_tok_list() for t in gen_trees])

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        write_trees_or_tokens(args.output_file, das, gen_trees, eval_doc,
                              tgen.language, out_selector)
def seq2seq_gen(args):
    """Sequence-to-sequence generation.

    Command-line entry point: loads a trained Seq2Seq generator, generates a tree for every
    input DA (paired with a context if the generator uses one), and then optionally evaluates
    the trees, lexicalizes them, evaluates the resulting tokens and writes the output.

    @param args: list of command-line arguments (options followed by the Seq2Seq model file \
name and the input DAs file name)
    """
    parser = ArgumentParser(prog=' '.join(sys.argv[0:2]))
    parser.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    parser.add_argument(
        '-a', '--abstr-file', type=str,
        help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    parser.add_argument('-r', '--ref-selector', type=str, default='',
                        help='Selector for reference trees in the evaluation file')
    parser.add_argument('-t', '--target-selector', type=str, default='',
                        help='Target selector for generated trees in the output file')
    parser.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    parser.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    parser.add_argument('-b', '--beam-size', type=int,
                        help='Override beam size for beam search decoding')
    parser.add_argument('-c', '--context-file', type=str,
                        help='Input ttree/text file with context utterances')
    parser.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    parser.add_argument('da_test_file', type=str, help='Input DAs for generation')
    opts = parser.parse_args(args)

    if opts.debug_logfile:
        set_debug_stream(file_stream(opts.debug_logfile, mode='w'))

    # the generator, with an optional beam size override
    tgen = Seq2SeqBase.load_from_file(opts.seq2seq_model_file)
    if opts.beam_size is not None:
        tgen.beam_size = opts.beam_size
    uses_context = tgen.use_context or tgen.context_bleu_weight

    # inputs: DAs, paired with contexts whenever the generator uses them
    das = read_das(opts.da_test_file)
    if opts.context_file and not uses_context:
        log_warn('Generator is not trained to use context, ignoring context input file.')
    elif opts.context_file:
        if opts.context_file.endswith('.txt'):
            contexts = read_tokens(opts.context_file)
        else:
            context_doc = read_ttrees(opts.context_file)
            contexts = tokens_from_doc(context_doc, tgen.language, tgen.selector)
        das = list(zip(contexts, das))
    elif uses_context:
        log_warn('Generator is trained to use context. ' +
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]

    # generation proper, with a progress message after every 100 trees
    log_info('Generating...')
    gen_trees = []
    for tree_no, gen_input in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % tree_no)
        gen_trees.append(tgen.generate_tree(gen_input))
        if tree_no % 100 == 0:
            log_info("Generated tree %d" % tree_no)
    log_info(tgen.get_slot_err_stats())

    out_selector = opts.target_selector or tgen.selector
    eval_is_text = bool(opts.eval_file) and opts.eval_file.endswith('.txt')

    # tree-based evaluation against golden trees (still delexicalized)
    eval_doc = None
    if opts.eval_file and not eval_is_text:
        eval_doc = read_ttrees(opts.eval_file)
        Evaluator().process_eval_doc(eval_doc, gen_trees, tgen.language,
                                     opts.ref_selector, out_selector)

    # lexicalization (only if requested and supported by the generator)
    if opts.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, opts.abstr_file)

    # from now on, only the DAs themselves are needed (drop the contexts)
    if uses_context:
        das = [pair[1] for pair in das]

    # token-based evaluation of the generated & lexicalized outputs (F1 and BLEU scores)
    if eval_is_text:
        ref_toks = read_tokens(opts.eval_file, ref_mode=True)
        eval_tokens(das, ref_toks, [tree.to_tok_list() for tree in gen_trees])

    # output: plain tokens for .txt, t-trees (.yaml.gz) otherwise
    if opts.output_file is None:
        return
    log_info('Writing output...')
    if opts.output_file.endswith('.txt'):
        gen_toks = [tree.to_tok_list() for tree in gen_trees]
        postprocess_tokens(gen_toks, das)
        write_tokens(gen_toks, opts.output_file)
    else:
        out_doc = create_ttree_doc(gen_trees, eval_doc, tgen.language, out_selector)
        write_ttrees(out_doc, opts.output_file)