def _init_training(self, das_file, ttree_file, data_portion): """Initialize training. Store input data, initialize 1-hot feature representations for input and output and transform training data accordingly, initialize the classification neural network. """ # read input log_info('Reading DAs from ' + das_file + '...') das = read_das(das_file) log_info('Reading t-trees from ' + ttree_file + '...') ttree_doc = read_ttrees(ttree_file) trees = trees_from_doc(ttree_doc, self.language, self.selector) # make training data smaller if necessary train_size = int(round(data_portion * len(trees))) self.train_trees = trees[:train_size] self.train_das = das[:train_size] # add empty tree + empty DA to training data # (i.e. forbid the network to keep any of its outputs "always-on") train_size += 1 self.train_trees.append(TreeData()) empty_da = DialogueAct() empty_da.parse('inform()') self.train_das.append(empty_da) self.train_order = range(len(self.train_trees)) log_info('Using %d training instances.' % train_size) # initialize input features/embeddings if self.tree_embs: self.dict_size = self.tree_embs.init_dict(self.train_trees) self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees]) else: self.tree_feats = Features(['node: presence t_lemma formeme']) self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True) self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees] self.X = self.tree_vect.fit_transform(self.X) # initialize output features self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence']) self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True) self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das] self.y = self.da_vect.fit_transform(self.y) # initialize I/O shapes self.input_shape = [list(self.X[0].shape)] self.num_outputs = len(self.da_vect.get_feature_names()) # initialize NN classifier self._init_neural_network()
def load_surface_forms(self, surface_forms_fname): """Load all proper name surface forms from a file.""" log_info('Loading surface forms from %s...' % surface_forms_fname) with file_stream(surface_forms_fname) as fh: data = json.load(fh) for slot, values in data.iteritems(): sf_all = {} sf_formeme = {} sf_tag = {} for value in values.keys(): for surface_form in values[value]: form, tag = surface_form.split("\t") if slot == 'street': # add street number placeholders to addresses value += ' _' slot = 'address' # store the value globally + for all possible tag subsets/formemes sf_all[value] = sf_all.get(value, []) + [form] sf_tag[value] = sf_tag.get(value, {}) sf_formeme[value] = sf_formeme.get(value, {}) for tag_subset in self._get_tag_subsets(tag): sf_tag[value][tag_subset] = sf_tag[value].get( tag_subset, []) + [form] for formeme in self._get_compatible_formemes(tag): sf_formeme[value][formeme] = sf_formeme[value].get( formeme, []) + [form] self._sf_all[slot] = sf_all self._sf_by_formeme[slot] = sf_formeme self._sf_by_tag[slot] = sf_tag
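# A hedged sketch of the input layout load_surface_forms() above appears to expect:
# a JSON object mapping slots to values, each value holding a list of tab-separated
# "form<TAB>tag" strings. The slot names, values, forms and tags below are made up
# for illustration only.
EXAMPLE_SURFACE_FORMS = {
    "food": {
        "Chinese": ["Chinese\tJJ", "Chinese\tNN"],
    },
    "street": {
        "Main Street": ["Main Street\tNN"],  # street values get a ' _' number placeholder
    },
}
# hypothetical usage: lexicalizer.load_surface_forms('surface_forms.json')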
def save_to_file(self, model_fname): """This will actually just move the best generator (which is saved in a temporary file) to the final location.""" log_info('Moving generator to %s...' % model_fname) orig_model_fname = self.model_temp_path shutil.move(orig_model_fname, model_fname) orig_tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', orig_model_fname) tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname) if os.path.isfile(orig_tf_session_fname): shutil.move(orig_tf_session_fname, tf_session_fname) # move the reranking classifier model files as well, if they exist orig_clfilter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', orig_model_fname) orig_clfilter_tf_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tfsess', orig_clfilter_fname) if os.path.isfile(orig_clfilter_fname) and os.path.isfile( orig_clfilter_tf_fname): clfilter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', model_fname) clfilter_tf_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tfsess', clfilter_fname) shutil.move(orig_clfilter_fname, clfilter_fname) shutil.move(orig_clfilter_tf_fname, clfilter_tf_fname)
def train(self, train_sents, valid_sents=None): """Train the RNNLM on the given data (list of lists of tokens). @param train_sents: training data (list of lists of tokens, lexicalized) @param valid_sents: validation data (list of lists of tokens, lexicalized, may be None \ if no validation should be performed) """ self._init_training(train_sents, valid_sents) top_perp = float('nan') for iter_no in xrange(1, self.passes + 1): # preparing parameters iter_alpha = self.alpha * np.exp(-self.alpha_decay * iter_no) self._train_order = range(len(self._train_data)) if self.randomize: rnd.shuffle(self._train_order) # training self._training_pass(iter_no, iter_alpha) # validation if (self.validation_freq and iter_no > self.min_passes and iter_no % self.validation_freq == 0): perp = self._valid_perplexity() log_info("Perplexity: %.3f" % perp) # if we have the best model so far, save it as a checkpoint (overwrite previous) if math.isnan(top_perp) or perp < top_perp: top_perp = perp self._save_checkpoint() self._restore_checkpoint() # restore the best parameters so far
def percrank_train(args): opts, files = getopt(args, 'c:d:s:j:w:e:r:') candgen_model = None train_size = 1.0 parallel = False jobs_number = 0 work_dir = None experiment_id = None for opt, arg in opts: if opt == '-d': set_debug_stream(file_stream(arg, mode='w')) elif opt == '-s': train_size = float(arg) elif opt == '-c': candgen_model = arg elif opt == '-j': parallel = True jobs_number = int(arg) elif opt == '-w': work_dir = arg elif opt == '-e': experiment_id = arg elif opt == '-r' and arg: rnd.seed(arg) if len(files) != 4: sys.exit(__doc__) fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = files log_info('Training perceptron ranker...') rank_config = Config(fname_rank_config) if candgen_model: rank_config['candgen_model'] = candgen_model if rank_config.get('nn'): from tgen.rank_nn import SimpleNNRanker, EmbNNRanker if rank_config['nn'] in ['emb', 'emb_trees', 'emb_prev']: ranker_class = EmbNNRanker else: ranker_class = SimpleNNRanker else: ranker_class = PerceptronRanker log_info('Using %s for ranking' % ranker_class.__name__) if not parallel: ranker = ranker_class(rank_config) else: rank_config['jobs_number'] = jobs_number if work_dir is None: work_dir, _ = os.path.split(fname_rank_config) ranker = ParallelRanker(rank_config, work_dir, experiment_id, ranker_class) ranker.train(fname_train_das, fname_train_ttrees, data_portion=train_size) # avoid the "maximum recursion depth exceeded" error sys.setrecursionlimit(100000) ranker.save_to_file(fname_rank_model)
def read_system_training_data(filename): insts = [] for inst in pd.read_csv(filename, index_col=None, encoding='UTF-8').to_dict('records'): insts.append({ 'dataset': 'E2E', 'mr': DA.parse_diligent_da(inst['mr']).to_cambridge_da_string(), 'delex_mr': DA.parse_diligent_da(inst['mr']).get_delexicalized( set(['name', 'near'])).to_cambridge_da_string(), 'system': 'HUMAN', 'system_ref': None, 'orig_ref': inst['ref'], 'informativeness': None, 'naturalness': None, 'quality': None, 'is_real': 0 }) log_info( "Using %d different training human references to create fake pairs" % len(insts)) return insts
def _delex_texts(self):
    """Delexicalize texts in the buffers and save them separately in the member variables,
    along with the delexicalization instructions used for the operation."""
    self._delexed_texts = []
    self._absts = []
    for text_idx, (text, da) in enumerate(zip(self._sents, self._das)):
        delex_text = []
        absts = []
        # do the delexicalization, keep track of which slots we used
        for tok_idx, (form, lemma, tag) in enumerate(text):
            slot = da.has_value(lemma)
            if slot and slot in self._abst_slots:
                delex_text.append(('X-' + slot, 'X-' + slot, tag))
                absts.append(Abst(slot, lemma, form, tok_idx, tok_idx + 1))
            else:
                delex_text.append((form, lemma, tag))
        # fix coordinated delexicalized values
        self._delex_fix_coords(delex_text, da, absts)
        covered_slots = set([a.slot for a in absts])
        # check and warn if we left something non-delexicalized
        for dai in da:
            if (dai.slot in self._abst_slots
                    and dai.value not in [None, 'none', 'dont_care']
                    and dai.slot not in covered_slots):
                log_info("Cannot delexicalize slot %s at %d:\nDA: %s\nTx: %s\n" %
                         (dai.slot, text_idx, unicode(da),
                          " ".join([form for form, _, _ in text])))
        # save the delexicalized text and the delexicalization instructions
        self._delexed_texts.append(delex_text)
        self._absts.append(absts)
def save_to_file(self, model_fname): log_info("Saving classifier to %s..." % model_fname) with file_stream(model_fname, 'wb', encoding=None) as fh: pickle.dump(self.__class__, fh, protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(self.get_all_settings(), fh, protocol=pickle.HIGHEST_PROTOCOL)
def create_fake_pairs(fake_insts, data_len):
    """Given fake instances (ordered by the level of distortion & in the same order across
    the distortion levels: A-0, B-0..., A-1, B-1..., A-2, B-2... etc.), this creates pairs
    of instances for ranking (e.g. A-0 is better than A-2 etc.)."""
    log_info('Creating fake pairs...')
    # create a new dataframe with the same columns, plus 2nd system reference
    fake_pairs = []
    max_distort = len(fake_insts) / data_len  # should be an integer
    for inst_no in xrange(data_len):
        # add perfect vs. imperfect
        distort_levels = [(0, lev) for lev in range(1, max_distort)]
        # sample 5 pairs of different degrees of distortion
        pairs = list(combinations(range(1, max_distort), 2))
        distort_levels += [pairs[i]
                           for i in np.random.choice(len(pairs), 5, replace=False)]
        # choose the instances based on the distortion levels, create the pair instances
        for better, worse in distort_levels:
            new_inst = dict(fake_insts.iloc[inst_no + better * data_len])
            new_inst['system_ref2'] = fake_insts.iloc[inst_no + worse * data_len]['system_ref']
            del new_inst['informativeness']
            del new_inst['naturalness']
            del new_inst['quality']
            # add both naturalness and quality, ignore informativeness here
            for quant in ['naturalness', 'quality']:
                fake_pairs.append(dict(new_inst, **{quant: 1}))
    log_info('Created %d fake pairs.' % len(fake_pairs))
    return pd.DataFrame.from_records(fake_pairs)
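# A toy illustration of the row layout create_fake_pairs() relies on: fake instances are
# stored distortion-level-major, so instance i at distortion level d sits at row
# i + d * data_len. All numbers here are hypothetical.
def _fake_pair_rows(inst_no, better, worse, data_len):
    """Row indices of the 'better' and 'worse' member of one fake pair."""
    return inst_no + better * data_len, inst_no + worse * data_len

# with data_len = 2 (rows: A-0, B-0, A-1, B-1, A-2, B-2),
# _fake_pair_rows(0, 0, 2, 2) -> (0, 4), i.e. pairing A-0 (better) with A-2 (worse)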
def train(self, fnames, train_trees, valid_trees=None): """Train the lexicalizer (including its LM, if applicable). @param fnames: file names for surface forms (JSON) and training data lexicalization \ instructions @param train_trees: loaded generator training data (TreeData trees/lists of lemma-tag \ or form-tag pairs) """ log_info('Training lexicalizer...') if not fnames: return valid_abst_fname = None if ',' in fnames: fnames = fnames.split(',') if len(fnames) == 3: surface_forms_fname, train_abst_fname, valid_abst_fname = fnames else: surface_forms_fname, train_abst_fname = fnames else: surface_forms_fname, train_abst_fname = fnames, None self.load_surface_forms(surface_forms_fname) if train_abst_fname and not isinstance(self._form_select, RandomFormSelect): log_info( 'Training lexicalization LM from training trees and %s...' % train_abst_fname) self._form_select.train(*self._prepare_train_toks( train_trees, train_abst_fname, valid_trees, valid_abst_fname))
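# The `fnames` argument parsed above accepts three shapes (file names are hypothetical):
#   'surface_forms.json'                                -> surface forms only, no LM training
#   'surface_forms.json,train-abst.txt'                 -> surface forms + training lexicalization instructions
#   'surface_forms.json,train-abst.txt,valid-abst.txt'  -> as above, plus validation instructions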
def load_from_file(model_fname): """Load the generator from a file (actually two files, one for configuration and one for the TensorFlow graph, which must be stored separately). @param model_fname: file name (for the configuration file); TF graph must be stored with a \ different extension """ log_info("Loading generator from %s..." % model_fname) with file_stream(model_fname, 'rb', encoding=None) as fh: data = pickle.load(fh) ret = Seq2SeqGen(cfg=data['cfg']) ret.load_all_settings(data) if ret.classif_filter: classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', model_fname) if os.path.isfile(classif_filter_fname): ret.classif_filter = RerankingClassifier.load_from_file(classif_filter_fname) else: log_warn("Classification filter data not found, ignoring.") ret.classif_filter = False # re-build TF graph and restore the TF session tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname) ret._init_neural_network() ret.saver.restore(ret.session, tf_session_fname) return ret
def save_to_file(self, model_fname): """Save the whole ensemble into a file (get all settings and parameters, dump them in a pickle).""" # TODO support for lexicalizer log_info("Saving generator to %s..." % model_fname) with file_stream(model_fname, 'wb', encoding=None) as fh: pickle.dump(self.__class__, fh, protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(self.cfg, fh, protocol=pickle.HIGHEST_PROTOCOL) gens_dump = [] for gen in self.gens: setting = gen.get_all_settings() parset = gen.get_model_params() setting['classif_filter'] = self.classif_filter is not None gens_dump.append((setting, parset)) pickle.dump(gens_dump, fh, protocol=pickle.HIGHEST_PROTOCOL) if self.classif_filter: pickle.dump(self.classif_filter.get_all_settings(), fh, protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(self.classif_filter.get_model_params(), fh, protocol=pickle.HIGHEST_PROTOCOL)
def evaluate_file(self, das_file, ttree_file):
    """Evaluate the reranking classifier on a given pair of DA/tree files (show the
    total Hamming distance and total number of DAIs).

    @param das_file: DA file path
    @param ttree_file: trees/sentences file path
    @return: a tuple (total number of DAIs, total Hamming distance, list of classified DAs)
    """
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    log_info('Reading t-trees/tokens from ' + ttree_file + '...')
    trees = read_trees_or_tokens(ttree_file, self.mode, self.language, self.selector)
    if self.mode in ['tokens', 'tagged_lemmas']:
        trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')
    tot_len = 0
    tot_dist = 0
    classif_das = []
    for da, tree in zip(das, trees):
        tot_len += len(da)
        dist, classif = self.dist_to_da(da, [tree], return_classif=True)
        tot_dist += dist[0]
        classif_das.append(DA.parse_features(classif[0]))
    return tot_len, tot_dist, classif_das
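# A minimal usage sketch, assuming an already trained classifier and hypothetical file
# names; the helper only post-processes the three values returned by evaluate_file().
def hamming_per_dai(clf, das_file, trees_file):
    """Average Hamming distance per DAI over a DA/tree file pair (sketch)."""
    tot_dais, tot_dist, _ = clf.evaluate_file(das_file, trees_file)
    return float(tot_dist) / tot_dais if tot_dais else 0.0

# hamming_per_dai(rerank_cl, 'devel-das.txt', 'devel-text.pickle.gz')  # hypothetical call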
def save_to_file(self, model_fname): """Save the generator to a file (actually two files, one for configuration and one for the TensorFlow graph, which must be stored separately). @param model_fname: file name (for the configuration file); TF graph will be stored with a \ different extension """ log_info("Saving generator to %s..." % model_fname) if self.classif_filter: classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', model_fname) self.classif_filter.save_to_file(classif_filter_fname) if self.lexicalizer: lexicalizer_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.lexic\1', model_fname) self.lexicalizer.save_to_file(lexicalizer_fname) with file_stream(model_fname, 'wb', encoding=None) as fh: pickle.dump(self.get_all_settings(), fh, protocol=pickle.HIGHEST_PROTOCOL) tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname) if hasattr(self, 'checkpoint_path') and self.checkpoint_path: shutil.copyfile(self.checkpoint_path, tf_session_fname) else: self.saver.save(self.session, tf_session_fname)
def get_sys_outputs(data): """Get instances with individual system outputs (regardless of pairs).""" sys_outputs = {} mrs = {} for inst in data: mrs[inst['mr']] = inst['delex_mr'] sys_outputs[(inst['mr'], inst['system'])] = inst['system_ref'] sys_outputs[(inst['mr'], inst['system2'])] = inst['system_ref2'] sys_outs_list = [] for (mr, sys_name), output in sys_outputs.iteritems(): sys_outs_list.append({ 'dataset': 'E2E', 'mr': mr, 'delex_mr': mrs[mr], 'system': sys_name, 'system_ref': None, 'orig_ref': output, 'informativeness': None, 'naturalness': None, 'quality': None, 'is_real': 0 }) log_info('Using %d different system outputs to create fake pairs.' % len(sys_outs_list)) return sys_outs_list
def exposed_init_training(self, cfg): """Create the Seq2SeqGen object.""" cfg = pickle.loads(cfg) tstart = time.time() log_info('Initializing training...') self.seq2seq = Seq2SeqGen(cfg) log_info('Training initialized. Time taken: %f secs.' % (time.time() - tstart))
def _save_checkpoint(self): """Save a checkpoint to a temporary path; set `self.checkpoint_path` to the path where it is saved; if called repeatedly, will always overwrite the last checkpoint.""" if not self.checkpoint_path: fh, path = tempfile.mkstemp(".ckpt", "tgen-", self.checkpoint_path) self.checkpoint_path = path log_info('Saving checkpoint to %s' % self.checkpoint_path) self.saver.save(self.session, self.checkpoint_path)
def save_to_file(self, lexicalizer_fname): """Save the lexicalizer model to a file (and a second file with the LM, if needed).""" log_info("Saving lexicalizer to %s..." % lexicalizer_fname) with file_stream(lexicalizer_fname, 'wb', encoding=None) as fh: pickle.dump(self.get_all_settings(), fh, protocol=pickle.HIGHEST_PROTOCOL) if not isinstance(self._form_select, RandomFormSelect): self._form_select.save_model(lexicalizer_fname)
def _save_checkpoint(self): """Save a checkpoint to a temporary path; set `self.checkpoint_path` to the path where it is saved; if called repeatedly, will always overwrite the last checkpoint.""" if not self.checkpoint_path: path = tempfile.mkdtemp(suffix="", prefix="tftreecl-") self.checkpoint_path = os.path.join(path, "ckpt") log_info('Saving checkpoint to %s' % self.checkpoint_path) self.saver.save(self.session, self.checkpoint_path)
def add_fake_data(train_data, real_data, add_from='', create_pairs=''):
    """Add fake data to the training set (return just the training set if there's
    nothing to add).

    @param train_data: training data (correct CV part if applicable)
    @param real_data: basis on which the fake data should be created
    @param add_from: T = include human refs from training data, \
        S = include system outputs in training data (in addition to real_data)
    @param create_pairs: create training pairs to rank ('' - not at all, \
        'add' - in addition to regular fakes, 'only' - exclusively)
    @return the enhanced (or unchanged) training set
    """
    if 'T' in add_from:
        log_info("Will create fake data from human references in training data.")
        human_data = train_data.copy()
        refs = human_data['orig_ref'].str.split(' <\|> ').apply(pd.Series, 1).stack()
        refs.index = refs.index.droplevel(-1)
        refs.name = 'orig_ref'
        del human_data['orig_ref']
        human_data = human_data.join(refs).reset_index()
        human_data = human_data.groupby(['mr', 'orig_ref'],  # delete scores
                                        as_index=False).agg(lambda vals: None)
        real_data = pd.concat((real_data, human_data), sort=True)
        train_data['orig_ref'] = ''
    if 'S' in add_from:
        log_info("Will create fake data from system outputs in training data.")
        # we keep the scores here, but use the outputs as orig references
        sys_outs = train_data.copy()
        del sys_outs['orig_ref']  # delete original human refs first
        sys_outs = sys_outs.rename(columns={'system_ref': 'orig_ref'})
        real_data = pd.concat((real_data, sys_outs), sort=True)
    # there is some fake data to be created and added
    if len(real_data):
        log_info("Creating fake data...")
        fake_data = create_fake_data(real_data, train_data.columns,
                                     score_type=('hter' if args.hter_score else 'nlg'))
        log_info("Created %d fake instances." % len(fake_data))
        # now we can add fake pairwise rankings
        if create_pairs:
            fake_pairs = create_fake_pairs(fake_data, len(real_data))
            if create_pairs == 'only':
                log_info('Only keeping fake pairs, forgetting individual instances.')
                return pd.concat([fake_pairs, train_data], sort=True)
            return pd.concat([fake_data, fake_pairs, train_data], sort=True)
        return pd.concat([fake_data, train_data])
    # no fake data to be added -> return just the original
    return train_data
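# A hedged usage sketch of the switches handled above; `train_df` and `real_df` are
# hypothetical pandas DataFrames, not data shipped with the repository.
#   add_fake_data(train_df, real_df, add_from='TS', create_pairs='add')
# where
#   'T' in add_from      -> also derive fakes from human references in the training data
#   'S' in add_from      -> also derive fakes from system outputs in the training data
#   create_pairs='add'   -> keep individual fake instances and add fake pairwise rankings
#   create_pairs='only'  -> keep only the fake pairs (individual fake instances are dropped)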
def train(self, das, trees, data_portion=1.0, valid_das=None, valid_trees=None): """Run training on the given training data. @param das: name of source file with training DAs, or list of DAs @param trees: name of source file with corresponding trees/sentences, or list of trees @param data_portion: portion of the training data to be used (defaults to 1.0) @param valid_das: validation data DAs @param valid_trees: list of lists of corresponding paraphrases (same length as valid_das) """ log_info('Training reranking classifier...') # initialize training self._init_training(das, trees, data_portion) if self.mode in ['tokens', 'tagged_lemmas' ] and valid_trees is not None: valid_trees = [ self._tokens_to_flat_trees( paraphrases, use_tags=self.mode == 'tagged_lemmas') for paraphrases in valid_trees ] # start training top_comb_cost = float('nan') for iter_no in xrange(1, self.passes + 1): self.train_order = range(len(self.train_trees)) if self.randomize: rnd.shuffle(self.train_order) pass_cost, pass_diff = self._training_pass(iter_no) if self.validation_freq and iter_no > self.min_passes and iter_no % self.validation_freq == 0: valid_diff = 0 if valid_das: valid_diff = np.sum([ np.sum(self.dist_to_da(d, t)) for d, t in zip(valid_das, valid_trees) ]) # cost combining validation and training data performance # (+ "real" cost with negligible weight) comb_cost = 1000 * valid_diff + 100 * pass_diff + pass_cost log_info('Combined validation cost: %8.3f' % comb_cost) # if we have the best model so far, save it as a checkpoint (overwrite previous) if math.isnan(top_comb_cost) or comb_cost < top_comb_cost: top_comb_cost = comb_cost self._save_checkpoint() # restore last checkpoint (best performance on devel data) self.restore_checkpoint()
def convert_model(model_fname): reset_default_graph() param_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.params.gz', model_fname) log_info('Converting %s to %s...' % (model_fname, param_fname)) model = Seq2SeqBase.load_from_file(model_fname) with file_stream(param_fname, 'wb', encoding=None) as fh: pickle.dump(model.get_model_params(), fh, protocol=pickle.HIGHEST_PROTOCOL)
def seq2seq_train(args):
    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))
    ap.add_argument('-s', '--train-size', type=float,
                    help='Portion of the training data to use (default: 1.0)', default=1.0)
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-j', '--jobs', type=int, help='Number of parallel jobs to use')
    ap.add_argument('-w', '--work-dir', type=str, help='Main working directory for parallel jobs')
    ap.add_argument('-e', '--experiment-id', type=str,
                    help='Experiment ID for parallel jobs (used as job name prefix)')
    ap.add_argument('-r', '--random-seed', type=str, help='Initial random seed (used as string).')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')
    ap.add_argument('-v', '--valid-data', type=str,
                    help='Validation data paths (2-3 comma-separated files: DAs, trees/sentences, contexts)')
    ap.add_argument('-l', '--lexic-data', type=str,
                    help='Lexicalization data paths (1-2 comma-separated files: surface forms, ' +
                         'training lexic. instructions)')
    ap.add_argument('-t', '--tb-summary-dir', '--tensorboard-summary-dir', '--tensorboard', type=str,
                    help='Directory where Tensorboard summaries are saved during training')
    ap.add_argument('seq2seq_config_file', type=str, help='Seq2Seq generator configuration file')
    ap.add_argument('da_train_file', type=str, help='Input training DAs')
    ap.add_argument('tree_train_file', type=str, help='Input training trees/sentences')
    ap.add_argument('seq2seq_model_file', type=str,
                    help='File name where to save the trained Seq2Seq generator model')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))
    if args.random_seed:
        rnd.seed(args.random_seed)

    log_info('Training sequence-to-sequence generator...')

    config = Config(args.seq2seq_config_file)
    if args.tb_summary_dir:  # override Tensorboard setting
        config['tb_summary_dir'] = args.tb_summary_dir

    if args.jobs:  # parallelize when training
        config['jobs_number'] = args.jobs
        if not args.work_dir:
            work_dir, _ = os.path.split(args.seq2seq_config_file)
        generator = ParallelSeq2SeqTraining(config, args.work_dir or work_dir, args.experiment_id)
    else:  # just a single training instance
        generator = Seq2SeqGen(config)

    generator.train(args.da_train_file, args.tree_train_file,
                    data_portion=args.train_size,
                    context_file=args.context_file,
                    validation_files=args.valid_data,
                    lexic_files=args.lexic_data)

    sys.setrecursionlimit(100000)
    generator.save_to_file(args.seq2seq_model_file)
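# A hypothetical invocation of this sub-command (assuming the usual run_tgen.py entry
# point; all file names below are placeholders):
#   ./run_tgen.py seq2seq_train -s 0.8 -v devel-das.txt,devel-text.txt \
#       config/seq2seq.py train-das.txt train-text.txt model.pickle.gz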
def expand(self): log_info("Expanding...") for da_key, (da, orig_pos) in self.orig_da_positions.iteritems(): if da_key not in self.transl_da_positions: print >> sys.stderr, "DA key not found: %s" % da_key print >> sys.stderr, "Original positions: %s" % ", ".join( [str(p) for p in orig_pos]) continue _, transl_pos = self.transl_da_positions[da_key] self.expand_da(da, orig_pos, transl_pos)
def write_outputs(self): log_info("Writing outputs...") write_texts(self.out_texts_file, self.out_texts) write_toks(self.out_delex_texts_file, self.out_delex_texts, capitalize=False, detok=False, lowercase=True) write_das(self.out_das_file, self.out_das) write_das(self.out_delex_das_file, self.out_delex_das)
def _create_delex_texts(self):
    """Delexicalize texts in the buffers and save them separately in the member variables,
    along with the delexicalization instructions used for the operation."""
    self._delex_texts = []
    self._absts = []
    for text_idx, (text, da) in enumerate(zip(self._texts, self._das)):
        delex_text = []
        absts = []
        # do the delexicalization, keep track of which slots we used
        for tok_idx, (form, lemma, tag) in enumerate(text):
            # abstract away from numbers
            abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
            abst_lemma = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma)
            # try to find if the surface form belongs to some slot
            slot, value = self._rev_sf_dict.get((abst_form, abst_lemma, tag), (None, None))
            # if we found a slot, get back the numbers
            if slot:
                for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)', lemma):
                    value = re.sub(r'_', num_match.group(1), value, count=1)
            # fall back to directly comparing against the DA value
            else:
                slot = da.has_value(lemma)
                value = lemma
            # if we found something, delexicalize it (check if the value corresponds to the DA!)
            if (slot and slot in self._abst_slots
                    and da.value_for_slot(slot) not in [None, 'none', 'dont_care']
                    and value in da.value_for_slot(slot)):
                delex_text.append(('X-' + slot, 'X-' + slot, tag))
                absts.append(Abst(slot, value, form, tok_idx, tok_idx + 1))
            # otherwise keep the token as it is
            else:
                delex_text.append((form, lemma, tag))
        # fix coordinated delexicalized values
        self._delex_fix_coords(delex_text, da, absts)
        covered_slots = set([a.slot for a in absts])
        # check and warn if we left something non-delexicalized
        for dai in da:
            if (dai.slot in self._abst_slots
                    and dai.value not in [None, 'none', 'dont_care']
                    and dai.slot not in covered_slots):
                log_info("Cannot delexicalize slot %s at %d:\nDA: %s\nTx: %s\n" %
                         (dai.slot, text_idx, str(da),
                          " ".join([form for form, _, _ in text])))
        # save the delexicalized text and the delexicalization instructions
        self._delex_texts.append(delex_text)
        self._absts.append(absts)
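# A self-contained sketch of the number-abstraction round trip used above: digits are
# replaced by '_' before the reverse surface-form lookup, and the matched numbers are
# substituted back into the recovered value one by one. The example strings are made up.
import re

def _abstract_numbers(form):
    return re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())

def _restore_numbers(value, lemma):
    for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)', lemma):
        value = re.sub(r'_', num_match.group(1), value, count=1)
    return value

# _abstract_numbers('Baker Street 221')                  -> 'baker street _'
# _restore_numbers('baker street _', 'Baker Street 221') -> 'baker street 221'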
def _init_training(self, das_file, ttree_file, data_portion): # load data, determine number of features etc. etc. super(SimpleNNRanker, self)._init_training(das_file, ttree_file, data_portion) self._init_neural_network() self.w_after_iter = [] self.update_weights_sum() log_debug('\n***\nINIT:') log_debug(self._feat_val_str()) log_info('Training ...')
def _load_contexts(self, das, context_file): """Load input context utterances from a .yaml.gz/.pickle.gz/.txt file and add them to the given DAs (each returned item is then a tuple of context + DA).""" # read contexts, combine them with corresponding DAs for easier handling if context_file is None: raise ValueError('Expected context utterances file name!') log_info('Reading context utterances from %s...' % context_file) if context_file.endswith('.txt'): contexts = read_tokens(context_file) else: contexts = tokens_from_doc(read_ttrees(context_file), self.language, self.selector) return [(context, da) for context, da in zip(contexts, das)]
def load_from_file(lexicalizer_fname): """Load the lexicalizer model from a file (and a second file with the LM, if needed).""" log_info("Loading lexicalizer from %s..." % lexicalizer_fname) with file_stream(lexicalizer_fname, 'rb', encoding=None) as fh: data = pickle.load(fh) ret = Lexicalizer(cfg=data['cfg']) ret.__dict__.update(data) ret._form_select = ret._form_select(data['cfg']) if not isinstance(ret._form_select, RandomFormSelect): ret._form_select.load_model(lexicalizer_fname) return ret
def load_from_file(model_fname): log_info("Loading classifier from %s..." % model_fname) with file_stream(model_fname, 'rb', encoding=None) as fh: typeid = pickle.load(fh) if typeid != E2EPatternClassifier: raise ValueError('Wrong type identifier in file %s' % model_fname) cfg = pickle.load(fh) ret = E2EPatternClassifier(cfg) ret.__dict__.update(cfg) # load the trained settings return ret
def convert(args): """Main conversion function (using command-line arguments as parsed by Argparse).""" log_info('Loading...') reader = Reader(args.tagger_model, args.abst_slots) reader.load_surface_forms(args.surface_forms) log_info('Processing input files...') insts = reader.process_dataset(args.input_data) log_info('Loaded %d data items.' % len(insts)) # write all data groups # outputs: plain delex, plain lex, interleaved delex & lex, CoNLL-U delex & lex, DAs, abstrs writer = Writer() log_info('Writing %s (size: %d)...' % (args.out_prefix, len(insts))) writer.write_absts(args.out_prefix + '-abst.txt', insts) writer.write_das(args.out_prefix + '-das_l.txt', insts) writer.write_das(args.out_prefix + '-das.txt', insts, delex=True) writer.write_text(args.out_prefix + '-text_l.txt', 'plain', insts) writer.write_text(args.out_prefix + '-text.txt', 'plain', insts, delex=True) writer.write_text(args.out_prefix + '-tls_l.txt', 'interleaved', insts) writer.write_text(args.out_prefix + '-tls.txt', 'interleaved', insts, delex=True) writer.write_text(args.out_prefix + '-text_l.conll', 'conll', insts) writer.write_text(args.out_prefix + '-text.conll', 'conll', insts, delex=True)
def percrank_train(args): opts, files = getopt(args, 'c:d:s:j:w:e:') candgen_model = None train_size = 1.0 parallel = False jobs_number = 0 work_dir = None experiment_id = None for opt, arg in opts: if opt == '-d': set_debug_stream(file_stream(arg, mode='w')) elif opt == '-s': train_size = float(arg) elif opt == '-c': candgen_model = arg elif opt == '-j': parallel = True jobs_number = int(arg) elif opt == '-w': work_dir = arg elif opt == '-e': experiment_id = arg if len(files) != 4: sys.exit(__doc__) fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = files log_info('Training perceptron ranker...') rank_config = Config(fname_rank_config) if candgen_model: rank_config['candgen_model'] = candgen_model if rank_config.get('nn'): if rank_config['nn'] == 'emb': ranker_class = EmbNNRanker else: ranker_class = SimpleNNRanker else: ranker_class = PerceptronRanker if not parallel: ranker = ranker_class(rank_config) else: rank_config['jobs_number'] = jobs_number if work_dir is None: work_dir, _ = os.path.split(fname_rank_config) ranker = ParallelRanker(rank_config, work_dir, experiment_id, ranker_class) ranker.train(fname_train_das, fname_train_ttrees, data_portion=train_size) ranker.save_to_file(fname_rank_model)
def _load_trees(self, ttree_file, selector=None): """Load input trees/sentences from a .yaml.gz/.pickle.gz (trees) or .txt (sentences) file.""" log_info('Reading t-trees/sentences from ' + ttree_file + '...') if ttree_file.endswith('.txt'): if not self.use_tokens: raise ValueError("Cannot read trees from a .txt file (%s)!" % ttree_file) return read_tokens(ttree_file) else: ttree_doc = read_ttrees(ttree_file) if selector is None: selector = self.selector if self.use_tokens: return tokens_from_doc(ttree_doc, self.language, selector) else: return trees_from_doc(ttree_doc, self.language, selector)
def save_to_file(self, model_fname): """Save the generator to a file (actually two files, one for configuration and one for the TensorFlow graph, which must be stored separately). @param model_fname: file name (for the configuration file); TF graph will be stored with a \ different extension """ log_info("Saving classifier to %s..." % model_fname) with file_stream(model_fname, 'wb', encoding=None) as fh: pickle.dump(self.get_all_settings(), fh, protocol=pickle.HIGHEST_PROTOCOL) tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname) if self.checkpoint_path: shutil.copyfile(self.checkpoint_path, tf_session_fname) else: self.saver.save(self.session, tf_session_fname)
def _delex_texts(self):
    """Delexicalize texts in the buffers and save them separately in the member variables,
    along with the delexicalization instructions used for the operation."""
    self._delexed_texts = []
    self._absts = []
    for text_idx, (text, da) in enumerate(zip(self._sents, self._das)):
        delex_text = []
        absts = []
        # do the delexicalization, keep track of which slots we used
        for tok_idx, (form, lemma, tag) in enumerate(text):
            # abstract away from numbers
            abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
            abst_lemma = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma)
            # try to find if the surface form belongs to some slot
            slot, value = self._rev_sf_dict.get((abst_form, abst_lemma, tag), (None, None))
            # if we found a slot, get back the numbers
            if slot:
                for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)', lemma):
                    value = re.sub(r'_', num_match.group(1), value, count=1)
            # fall back to directly comparing against the DA value
            else:
                slot = da.has_value(lemma)
                value = lemma
            # if we found something, delexicalize it
            if (slot and slot in self._abst_slots
                    and da.value_for_slot(slot) not in [None, 'none', 'dont_care']):
                delex_text.append(('X-' + slot, 'X-' + slot, tag))
                absts.append(Abst(slot, value, form, tok_idx, tok_idx + 1))
            # otherwise keep the token as it is
            else:
                delex_text.append((form, lemma, tag))
        # fix coordinated delexicalized values
        self._delex_fix_coords(delex_text, da, absts)
        covered_slots = set([a.slot for a in absts])
        # check and warn if we left something non-delexicalized
        for dai in da:
            if (dai.slot in self._abst_slots
                    and dai.value not in [None, 'none', 'dont_care']
                    and dai.slot not in covered_slots):
                log_info("Cannot delexicalize slot %s at %d:\nDA: %s\nTx: %s\n" %
                         (dai.slot, text_idx, unicode(da),
                          " ".join([form for form, _, _ in text])))
        # save the delexicalized text and the delexicalization instructions
        self._delexed_texts.append(delex_text)
        self._absts.append(absts)
def load_from_file(model_fname): """Load the reranker from a file (actually two files, one for configuration and one for the TensorFlow graph, which must be stored separately). @param model_fname: file name (for the configuration file); TF graph must be stored with a \ different extension """ log_info("Loading reranker from %s..." % model_fname) with file_stream(model_fname, 'rb', encoding=None) as fh: data = pickle.load(fh) ret = RerankingClassifier(cfg=data['cfg']) ret.load_all_settings(data) # re-build TF graph and restore the TF session tf_session_fname = os.path.abspath(re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)) ret._init_neural_network() ret.saver.restore(ret.session, tf_session_fname) return ret
def candgen_train(args): opts, files = getopt(args, 'p:lnc:sd:t:') prune_threshold = 1 parent_lemmas = False node_limits = False comp_type = None comp_limit = None comp_slots = False tree_classif = False for opt, arg in opts: if opt == '-p': prune_threshold = int(arg) elif opt == '-d': set_debug_stream(file_stream(arg, mode='w')) elif opt == '-l': parent_lemmas = True elif opt == '-n': node_limits = True elif opt == '-c': comp_type = arg if ':' in comp_type: comp_type, comp_limit = comp_type.split(':', 1) comp_limit = int(comp_limit) elif opt == '-t': tree_classif = Config(arg) elif opt == '-s': comp_slots = True if len(files) != 3: sys.exit("Invalid arguments.\n" + __doc__) fname_da_train, fname_ttrees_train, fname_cand_model = files log_info('Training candidate generator...') candgen = RandomCandidateGenerator({'prune_threshold': prune_threshold, 'parent_lemmas': parent_lemmas, 'node_limits': node_limits, 'compatible_dais_type': comp_type, 'compatible_dais_limit': comp_limit, 'compatible_slots': comp_slots, 'tree_classif': tree_classif}) candgen.train(fname_da_train, fname_ttrees_train) candgen.save_to_file(fname_cand_model)
def _init_neural_network(self): """Create the neural network for classification, according to the self.nn_shape parameter (as set in configuration).""" layers = [] if self.tree_embs: layers.append([Embedding('emb', self.dict_size, self.emb_size, 'uniform_005')]) # feedforward networks if self.nn_shape.startswith('ff'): if self.tree_embs: layers.append([Flatten('flat')]) num_ff_layers = 2 if self.nn_shape[-1] in ['0', '1', '3', '4']: num_ff_layers = int(self.nn_shape[-1]) layers += self._ff_layers('ff', num_ff_layers) # convolutional networks elif 'conv' in self.nn_shape or 'pool' in self.nn_shape: assert self.tree_embs # convolution makes no sense without embeddings num_conv = 0 if 'conv' in self.nn_shape: num_conv = 1 if 'conv2' in self.nn_shape: num_conv = 2 pooling = None if 'maxpool' in self.nn_shape: pooling = T.max elif 'avgpool' in self.nn_shape: pooling = T.mean layers += self._conv_layers('conv', num_conv, pooling) layers.append([Flatten('flat')]) layers += self._ff_layers('ff', 1) # input types: integer 3D for tree embeddings (batch + 2D embeddings), # float 2D (matrix) for binary input (batch + features) input_types = (T.itensor3,) if self.tree_embs else (T.fmatrix,) # create the network, connect layers self.classif = ClassifNN(layers, self.input_shape, input_types, normgrad=False) log_info("Network shape:\n\n" + str(self.classif))
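# A rough mapping from self.nn_shape strings to architectures, inferred from the branches
# above (not an exhaustive or authoritative list of supported values):
NN_SHAPE_EXAMPLES = {
    'ff': '2 feed-forward layers (the default count)',
    'ff1': '1 feed-forward layer',
    'ff3': '3 feed-forward layers',
    'conv-ff': 'embeddings -> 1 convolutional layer -> flatten -> 1 feed-forward layer',
    'conv2-maxpool-ff': 'embeddings -> 2 convolutional layers -> max-pooling -> flatten -> 1 feed-forward layer',
}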
def convert(args): """Main conversion function (using command-line arguments as parsed by Argparse).""" log_info('Loading...') analyzer = MorphoAnalyzer(args.tagger_model, args.abst_slots) analyzer.load_surface_forms(args.surface_forms) log_info('Processing input files...') analyzer.process_files(args.input_text_file, args.input_da_file, args.skip_hello) log_info('Loaded %d data items.' % analyzer.buf_length()) # outputs: plain delex, plain lex, interleaved delex & lex, CoNLL-U delex & lex, DAs, abstrs # TODO maybe do relexicalization, but not now (no time) if args.split: # get file name prefixes and compute data sizes for all the parts to be split out_names = re.split(r'[, ]+', args.out_prefix) data_sizes = [int(part_size) for part_size in args.split.split(':')] assert len(out_names) == len(data_sizes) # compute sizes for all but the 1st part (+ round them) total = float(sum(data_sizes)) remain = analyzer.buf_length() for part_no in xrange(len(data_sizes) - 1, 0, -1): part_size = int(round(analyzer.buf_length() * (data_sizes[part_no] / total))) data_sizes[part_no] = part_size remain -= part_size # put whatever remained into the 1st part data_sizes[0] = remain else: # use just one part -- containing all the data data_sizes = [analyzer.buf_length()] out_names = [args.out_prefix] # write all data parts offset = 0 for part_size, part_name in zip(data_sizes, out_names): log_info('Writing %s (size: %d)...' % (part_name, part_size)) subrange = slice(offset, offset + part_size) analyzer.write_absts(part_name + '-abst.txt', subrange) analyzer.write_das(part_name + '-das_l.txt', subrange) analyzer.write_das(part_name + '-das.txt', subrange, delex=True) analyzer.write_text(part_name + '-text_l.txt', 'plain', subrange) analyzer.write_text(part_name + '-text.txt', 'plain', subrange, delex=True) analyzer.write_text(part_name + '-tls_l.txt', 'interleaved', subrange) analyzer.write_text(part_name + '-tls.txt', 'interleaved', subrange, delex=True) analyzer.write_text(part_name + '-text_l.conll', 'conll', subrange) analyzer.write_text(part_name + '-text.conll', 'conll', subrange, delex=True) offset += part_size
def train(self, das_file, ttree_file, data_portion=1.0, context_file=None, validation_files=None):
    """The main training process – initialize and perform a specified number of training
    passes, validating every couple of iterations.

    @param das_file: training data file with DAs
    @param ttree_file: training data file with output t-trees/sentences
    @param data_portion: portion of training data to be actually used, defaults to 1.0
    @param context_file: path to training file with contexts (trees/sentences)
    @param validation_files: paths to validation data (DAs, trees/sentences, possibly contexts)
    """
    # load and prepare data and initialize the neural network
    self._init_training(das_file, ttree_file, data_portion, context_file, validation_files)

    # do the training passes
    for iter_no in xrange(1, self.passes + 1):
        self.train_order = range(len(self.train_enc))
        if self.randomize:
            rnd.shuffle(self.train_order)
        self._training_pass(iter_no)

        # validate every couple of iterations
        if iter_no % self.validation_freq == 0 and self.validation_size > 0:
            cur_train_out = self.process_das(self.train_das[:self.batch_size])
            log_info("Current train output:\n" +
                     "\n".join([" ".join(n.t_lemma for n in tree.nodes[1:])
                                if self.use_tokens else unicode(tree)
                                for tree in cur_train_out]))
            cur_valid_out = self.process_das(self.valid_das[:self.batch_size])
            cur_cost = self._compute_valid_cost(cur_valid_out, self.valid_trees)
            log_info("Current validation output:\n" +
                     "\n".join([" ".join(n.t_lemma for n in tree.nodes[1:])
                                if self.use_tokens else unicode(tree)
                                for tree in cur_valid_out]))
            log_info('IT %d validation cost: %5.4f' % (iter_no, cur_cost))

            # if we have the best model so far, save it as a checkpoint (overwrite previous)
            if math.isnan(self.top_k_costs[0]) or cur_cost < self.top_k_costs[0]:
                self._save_checkpoint()

            if self._should_stop(iter_no, cur_cost):
                log_info("Stopping criterion met.")
                break
def load_from_file(model_fname): """Load the whole ensemble from a file (load settings and model parameters, then build the ensemble network).""" log_info("Loading ensemble generator from %s..." % model_fname) with file_stream(model_fname, 'rb', encoding=None) as fh: typeid = pickle.load(fh) if typeid != Seq2SeqEnsemble: raise ValueError('Wrong type identifier in file %s' % model_fname) cfg = pickle.load(fh) ret = Seq2SeqEnsemble(cfg) gens_dump = pickle.load(fh) if 'classif_filter' in cfg: rerank_settings = pickle.load(fh) rerank_params = pickle.load(fh) else: rerank_settings = None rerank_params = None ret.build_ensemble(gens_dump, rerank_settings, rerank_params) return ret
def _load_valid_data(self, valid_data_paths): """Load validation data from separate files (comma-separated list of files with DAs, trees, and optionally contexts is expected).""" # parse validation data file specification valid_data_paths = valid_data_paths.split(',') if len(valid_data_paths) == 3: # with contexts (this does not determine if they're used) valid_das_file, valid_trees_file, valid_context_file = valid_data_paths else: valid_das_file, valid_trees_file = valid_data_paths # load the validation data log_info('Reading DAs from ' + valid_das_file + '...') self.valid_das = read_das(valid_das_file) self.valid_trees = self._load_trees(valid_trees_file, selector=self.ref_selectors) if self.use_context: self.valid_das = self._load_contexts(self.valid_das, valid_context_file) # reorder validation data for multiple references (see also _cut_valid_data) valid_size = len(self.valid_trees) if self.multiple_refs: num_refs, refs_stored = self._check_multiple_ref_type(valid_size) # serial: different instances next to each other, then synonymous in the same order if refs_stored == 'serial': valid_tree_chunks = [chunk for chunk in chunk_list(self.valid_trees, valid_size / num_refs)] self.valid_trees = [[chunk[i] for chunk in valid_tree_chunks] for i in xrange(valid_size / num_refs)] if len(self.valid_das) > len(self.valid_trees): self.valid_das = self.valid_das[0:valid_size / num_refs] # parallel: synonymous instances next to each other elif refs_stored == 'parallel': self.valid_trees = [chunk for chunk in chunk_list(self.valid_trees, num_refs)] if len(self.valid_das) > len(self.valid_trees): self.valid_das = self.valid_das[::num_refs] # no multiple references; make lists of size 1 to simplify working with the data else: self.valid_trees = [[tree] for tree in self.valid_trees]
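# A toy sketch (hypothetical data) of the two multiple-reference layouts handled above,
# with 4 DAs and num_refs = 2, i.e. valid_size = 8:
#   serial:   [A1, B1, C1, D1, A2, B2, C2, D2]
#   parallel: [A1, A2, B1, B2, C1, C2, D1, D2]
# both regroup to [[A1, A2], [B1, B2], [C1, C2], [D1, D2]].
def _regroup_serial(refs, num_refs):
    block = len(refs) // num_refs
    return [[refs[i + r * block] for r in range(num_refs)] for i in range(block)]

def _regroup_parallel(refs, num_refs):
    return [refs[i:i + num_refs] for i in range(0, len(refs), num_refs)]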
def save_to_file(self, model_fname): """Save the whole ensemble into a file (get all settings and parameters, dump them in a pickle).""" log_info("Saving generator to %s..." % model_fname) with file_stream(model_fname, 'wb', encoding=None) as fh: pickle.dump(self.__class__, fh, protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(self.cfg, fh, protocol=pickle.HIGHEST_PROTOCOL) gens_dump = [] for gen in self.gens: setting = gen.get_all_settings() parset = gen.get_model_params() setting['classif_filter'] = self.classif_filter is not None gens_dump.append((setting, parset)) pickle.dump(gens_dump, fh, protocol=pickle.HIGHEST_PROTOCOL) if self.classif_filter: pickle.dump(self.classif_filter.get_all_settings(), fh, protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(self.classif_filter.get_model_params(), fh, protocol=pickle.HIGHEST_PROTOCOL)
def _training_pass(self, iter_no):
    """Perform one pass through the training data (epoch).

    @param iter_no: pass number (for logging)
    """
    it_cost = 0.0
    it_learning_rate = self.alpha * np.exp(-self.alpha_decay * iter_no)
    log_info('IT %d alpha: %8.5f' % (iter_no, it_learning_rate))

    for batch_no in self.train_order:
        # feed data into the TF session:
        # initial state
        initial_state = np.zeros([self.batch_size, self.emb_size])
        feed_dict = {self.initial_state: initial_state,
                     self.learning_rate: it_learning_rate}
        # encoder inputs
        for i in xrange(len(self.train_enc[batch_no])):
            feed_dict[self.enc_inputs[i]] = self.train_enc[batch_no][i]
        # decoder inputs
        for i in xrange(len(self.train_dec[batch_no])):
            feed_dict[self.dec_inputs[i]] = self.train_dec[batch_no][i]
        # the last target output (padding, to have the same number of steps as there are
        # decoder inputs) is always 'VOID' for all instances of the batch
        feed_dict[self.targets[-1]] = len(self.train_dec[batch_no][0]) * [self.tree_embs.VOID]

        # run the TF session (one optimizer step == train_func) and get the cost
        # (1st value returned is None, throw it away)
        _, cost = self.session.run([self.train_func, self.cost], feed_dict=feed_dict)
        it_cost += cost

    log_info('IT %d total cost: %8.5f' % (iter_no, it_cost))
def eval_tokens(das, eval_tokens, gen_tokens): """Evaluate generated tokens and print out statistics.""" postprocess_tokens(eval_tokens, das) postprocess_tokens(gen_tokens, das) evaluator = BLEUMeasure() for pred_sent, gold_sents in zip(gen_tokens, eval_tokens): evaluator.append(pred_sent, gold_sents) log_info("BLEU score: %.4f" % (evaluator.bleu() * 100)) evaluator = Evaluator() for pred_sent, gold_sents in zip(gen_tokens, eval_tokens): for gold_sent in gold_sents: # effectively an average over all gold paraphrases evaluator.append(gold_sent, pred_sent) log_info("TOKEN precision: %.4f, Recall: %.4f, F1: %.4f" % evaluator.p_r_f1(EvalTypes.TOKEN)) log_info("Sentence length stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaluator.size_stats()) log_info("Common subphrase stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" % evaluator.common_substruct_stats())
def rerank_cl_eval(args):
    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))
    ap.add_argument('-l', '--language', type=str,
                    help='Override classifier language (for t-tree input files)')
    ap.add_argument('-s', '--selector', type=str,
                    help='Override classifier selector (for t-tree input files)')
    ap.add_argument('fname_cl_model', type=str, help='Path to trained reranking classifier model')
    ap.add_argument('fname_test_da', type=str, help='Path to test DA file')
    ap.add_argument('fname_test_sent', type=str, help='Path to test trees file (must be trees!)')
    args = ap.parse_args(args)

    log_info("Loading reranking classifier...")
    rerank_cl = RerankingClassifier.load_from_file(args.fname_cl_model)
    if args.language is not None:
        rerank_cl.language = args.language
    if args.selector is not None:
        rerank_cl.selector = args.selector

    log_info("Evaluating...")
    tot_len, dist, _ = rerank_cl.evaluate_file(args.fname_test_da, args.fname_test_sent)
    log_info("Penalty: %d, Total DAIs: %d." % (dist, tot_len))
def seq2seq_gen(args): """Sequence-to-sequence generation""" ap = ArgumentParser(prog=' '.join(sys.argv[0:2])) ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation') ap.add_argument('-a', '--abstr-file', type=str, help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)') ap.add_argument('-r', '--ref-selector', type=str, default='', help='Selector for reference trees in the evaluation file') ap.add_argument('-t', '--target-selector', type=str, default='', help='Target selector for generated trees in the output file') ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name') ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file') ap.add_argument('-b', '--beam-size', type=int, help='Override beam size for beam search decoding') ap.add_argument('-c', '--context-file', type=str, help='Input ttree/text file with context utterances') ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model') ap.add_argument('da_test_file', type=str, help='Input DAs for generation') args = ap.parse_args(args) if args.debug_logfile: set_debug_stream(file_stream(args.debug_logfile, mode='w')) # load the generator tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file) if args.beam_size is not None: tgen.beam_size = args.beam_size # read input files (DAs, contexts) das = read_das(args.da_test_file) if args.context_file: if not tgen.use_context and not tgen.context_bleu_weight: log_warn('Generator is not trained to use context, ignoring context input file.') else: if args.context_file.endswith('.txt'): contexts = read_tokens(args.context_file) else: contexts = tokens_from_doc(read_ttrees(args.context_file), tgen.language, tgen.selector) das = [(context, da) for context, da in zip(contexts, das)] elif tgen.use_context or tgen.context_bleu_weight: log_warn('Generator is trained to use context. ' + 'Using empty contexts, expect lower performance.') das = [([], da) for da in das] # generate log_info('Generating...') gen_trees = [] for num, da in enumerate(das, start=1): log_debug("\n\nTREE No. %03d" % num) gen_trees.append(tgen.generate_tree(da)) if num % 100 == 0: log_info("Generated tree %d" % num) log_info(tgen.get_slot_err_stats()) # evaluate the generated trees against golden trees (delexicalized) eval_doc = None if args.eval_file and not args.eval_file.endswith('.txt'): eval_doc = read_ttrees(args.eval_file) evaler = Evaluator() evaler.process_eval_doc(eval_doc, gen_trees, tgen.language, args.ref_selector, args.target_selector or tgen.selector) # lexicalize, if required if args.abstr_file and tgen.lexicalizer: log_info('Lexicalizing...') tgen.lexicalize(gen_trees, args.abstr_file) # we won't need contexts anymore, but we do need DAs if tgen.use_context or tgen.context_bleu_weight: das = [da for _, da in das] # evaluate the generated & lexicalized tokens (F1 and BLEU scores) if args.eval_file and args.eval_file.endswith('.txt'): eval_tokens(das, read_tokens(args.eval_file, ref_mode=True), [t.to_tok_list() for t in gen_trees]) # write output .yaml.gz or .txt if args.output_file is not None: log_info('Writing output...') if args.output_file.endswith('.txt'): gen_toks = [t.to_tok_list() for t in gen_trees] postprocess_tokens(gen_toks, das) write_tokens(gen_toks, args.output_file) else: write_ttrees(create_ttree_doc(gen_trees, eval_doc, tgen.language, args.target_selector or tgen.selector), args.output_file)
def asearch_gen(args): """A*search generation""" from pytreex.core.document import Document opts, files = getopt(args, 'e:d:w:c:s:') eval_file = None fname_ttrees_out = None cfg_file = None eval_selector = '' for opt, arg in opts: if opt == '-e': eval_file = arg elif opt == '-s': eval_selector = arg elif opt == '-d': set_debug_stream(file_stream(arg, mode='w')) elif opt == '-w': fname_ttrees_out = arg elif opt == '-c': cfg_file = arg if len(files) != 3: sys.exit('Invalid arguments.\n' + __doc__) fname_cand_model, fname_rank_model, fname_da_test = files log_info('Initializing...') candgen = RandomCandidateGenerator.load_from_file(fname_cand_model) ranker = PerceptronRanker.load_from_file(fname_rank_model) cfg = Config(cfg_file) if cfg_file else {} cfg.update({'candgen': candgen, 'ranker': ranker}) tgen = ASearchPlanner(cfg) log_info('Generating...') das = read_das(fname_da_test) if eval_file is None: gen_doc = Document() else: eval_doc = read_ttrees(eval_file) if eval_selector == tgen.selector: gen_doc = Document() else: gen_doc = eval_doc # generate and evaluate if eval_file is not None: # generate + analyze open&close lists lists_analyzer = ASearchListsAnalyzer() for num, (da, gold_tree) in enumerate(zip(das, trees_from_doc(eval_doc, tgen.language, eval_selector)), start=1): log_debug("\n\nTREE No. %03d" % num) gen_tree = tgen.generate_tree(da, gen_doc) lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list) if gen_tree != gold_tree: log_debug("\nDIFFING TREES:\n" + tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree) + "\n") log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %4f' % lists_analyzer.stats()) # evaluate the generated trees against golden trees eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector) gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector) log_info('Evaluating...') evaler = Evaluator() for eval_bundle, eval_ttree, gen_ttree, da in zip(eval_doc.bundles, eval_ttrees, gen_ttrees, das): # add some stats about the tree directly into the output file add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore', "P: %.4f R: %.4f F1: %.4f" % p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree))) # collect overall stats evaler.append(eval_ttree, gen_ttree, ranker.score(TreeData.from_ttree(eval_ttree), da), ranker.score(TreeData.from_ttree(gen_ttree), da)) # print overall stats log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1()) log_info("DEP precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP)) log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.size_stats()) log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats()) log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" % evaler.common_substruct_stats()) # just generate else: for da in das: tgen.generate_tree(da, gen_doc) # write output if fname_ttrees_out is not None: log_info('Writing output...') write_ttrees(gen_doc, fname_ttrees_out)
def sample_gen(args): from pytreex.core.document import Document opts, files = getopt(args, 'r:n:o:w:') num_to_generate = 1 oracle_eval_file = None fname_ttrees_out = None for opt, arg in opts: if opt == '-n': num_to_generate = int(arg) elif opt == '-o': oracle_eval_file = arg elif opt == '-w': fname_ttrees_out = arg if len(files) != 2: sys.exit(__doc__) fname_cand_model, fname_da_test = files # load model log_info('Initializing...') candgen = RandomCandidateGenerator.load_from_file(fname_cand_model) ranker = candgen tgen = SamplingPlanner({'candgen': candgen, 'ranker': ranker}) # generate log_info('Generating...') gen_doc = Document() das = read_das(fname_da_test) for da in das: for _ in xrange(num_to_generate): # repeat generation n times tgen.generate_tree(da, gen_doc) # evaluate if needed if oracle_eval_file is not None: log_info('Evaluating oracle F1...') log_info('Loading gold data from ' + oracle_eval_file) gold_trees = ttrees_from_doc(read_ttrees(oracle_eval_file), tgen.language, tgen.selector) gen_trees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector) log_info('Gold data loaded.') correct, predicted, gold = 0, 0, 0 for gold_tree, gen_trees in zip(gold_trees, chunk_list(gen_trees, num_to_generate)): # find best of predicted trees (in terms of F1) _, tc, tp, tg = max([(f1_from_counts(c, p, g), c, p, g) for c, p, g in map(lambda gen_tree: corr_pred_gold(gold_tree, gen_tree), gen_trees)], key=lambda x: x[0]) correct += tc predicted += tp gold += tg # evaluate oracle F1 log_info("Oracle Precision: %.6f, Recall: %.6f, F1: %.6f" % p_r_f1_from_counts(correct, predicted, gold)) # write output if fname_ttrees_out is not None: log_info('Writing output...') write_ttrees(gen_doc, fname_ttrees_out)
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(__doc__)
    action = sys.argv[1]
    args = sys.argv[2:]

    log_info('Running on %s version %s' % (platform.python_implementation(),
                                           platform.python_version()))

    if action == 'candgen_train':
        candgen_train(args)
    elif action == 'percrank_train':
        percrank_train(args)
    elif action == 'sample_gen':
        sample_gen(args)
    elif action == 'asearch_gen':
        asearch_gen(args)
    elif action == 'seq2seq_train':
        seq2seq_train(args)
    elif action == 'seq2seq_gen':
        seq2seq_gen(args)
    elif action == 'treecl_train':
        treecl_train(args)
def _init_training(self, das_file, ttree_file, data_portion, context_file, validation_files): """Load training data, prepare batches, build the NN. @param das_file: training DAs (file path) @param ttree_file: training t-trees (file path) @param data_portion: portion of the data to be actually used for training @param context_file: training contexts (file path) @param validation_files: validation file paths (or None) """ # read training data log_info('Reading DAs from ' + das_file + '...') das = read_das(das_file) trees = self._load_trees(ttree_file) if self.use_context: das = self._load_contexts(das, context_file) # make training data smaller if necessary train_size = int(round(data_portion * len(trees))) self.train_trees = trees[:train_size] self.train_das = das[:train_size] # load separate validation data files... if validation_files: self._load_valid_data(validation_files) # ... or save part of the training data for validation: elif self.validation_size > 0: self._cut_valid_data() # will set train_trees, valid_trees, train_das, valid_das log_info('Using %d training, %d validation instances.' % (len(self.train_das), len(self.valid_das))) # initialize embeddings if self.use_context: self.da_embs = ContextDAEmbeddingSeq2SeqExtract(cfg=self.cfg) else: self.da_embs = DAEmbeddingSeq2SeqExtract(cfg=self.cfg) if self.use_tokens: self.tree_embs = TokenEmbeddingSeq2SeqExtract(cfg=self.cfg) else: self.tree_embs = TreeEmbeddingSeq2SeqExtract(cfg=self.cfg) self.da_dict_size = self.da_embs.init_dict(self.train_das) self.tree_dict_size = self.tree_embs.init_dict(self.train_trees) self.max_tree_len = self.tree_embs.get_embeddings_shape()[0] self.max_da_len = self.da_embs.get_embeddings_shape()[0] # prepare training batches self.train_enc = [cut_batch_into_steps(b) for b in grouper([self.da_embs.get_embeddings(da) for da in self.train_das], self.batch_size, None)] self.train_dec = [cut_batch_into_steps(b) for b in grouper([self.tree_embs.get_embeddings(tree) for tree in self.train_trees], self.batch_size, None)] # train the classifier for filtering n-best lists if self.classif_filter: self.classif_filter.train(self.train_das, self.train_trees, valid_das=self.valid_das, valid_trees=self.valid_trees) self.classif_filter.restore_checkpoint() # restore the best performance on devel data # convert validation data to flat trees to enable F1 measuring if self.validation_size > 0 and self.use_tokens: self.valid_trees = self._valid_data_to_flat_trees(self.valid_trees) # initialize top costs self.top_k_costs = [float('nan')] * self.top_k self.checkpoint_path = None # build the NN self._init_neural_network() # initialize the NN variables self.session.run(tf.initialize_all_variables())
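# A minimal sketch of the batching step above: sequences are grouped into batches and
# transposed so that train_enc[batch][step] holds one time step for the whole batch.
# grouper() and cut_batch_into_steps() are the project's own helpers; the functions below
# only illustrate the assumed shapes, ignoring the padding of incomplete batches.
def _group(seqs, batch_size):
    return [seqs[i:i + batch_size] for i in range(0, len(seqs), batch_size)]

def _to_steps(batch):
    # [instance][step] -> [step][instance]
    return [list(step) for step in zip(*batch)]

# _to_steps([[1, 2, 3], [4, 5, 6]]) -> [[1, 4], [2, 5], [3, 6]]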