def run(self):
    """ Trains the model. """
    # vocabulary
    self._vocab_source = Vocab(
        filename=self._model_configs["data"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["data"]["source_bpecodes"],
        reverse_seq=False)
    self._vocab_target = Vocab(
        filename=self._model_configs["data"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["data"]["target_bpecodes"],
        reverse_seq=self._model_configs["train"]["reverse_target"])
    # build dataset
    dataset = Dataset(
        self._vocab_source,
        self._vocab_target,
        train_features_file=self._model_configs["data"]["train_features_file"],
        train_labels_file=self._model_configs["data"]["train_labels_file"],
        eval_features_file=self._model_configs["data"]["eval_features_file"],
        eval_labels_file=self._model_configs["data"]["eval_labels_file"])

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.TRAIN,
                              dataset=dataset,
                              name=self._model_configs["problem_name"])
    train_op = estimator_spec.train_op
    hooks = estimator_spec.training_hooks

    # build training session; pass the session config so the GPU
    # options above actually take effect (it was built but unused)
    sess = tf.train.MonitoredSession(
        session_creator=tf.train.ChiefSessionCreator(config=config),
        hooks=hooks)

    train_text_inputter = ParallelTextInputter(
        dataset,
        "train_features_file",
        "train_labels_file",
        self._model_configs["train"]["batch_size"],
        self._model_configs["train"]["batch_tokens_size"],
        self._model_configs["train"]["shuffle_every_epoch"])
    train_data = train_text_inputter.make_feeding_data(
        input_fields=estimator_spec.input_fields,
        maximum_features_length=self._model_configs["train"]["maximum_features_length"],
        maximum_labels_length=self._model_configs["train"]["maximum_labels_length"])

    eidx = 0
    while True:
        if sess.should_stop():
            break
        tf.logging.info("STARTUP Epoch {}".format(eidx))
        for data in train_data:
            if sess.should_stop():
                break
            sess.run(train_op, feed_dict=data["feed_dict"])
        eidx += 1
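# For reference, a minimal sketch of the model_configs dictionary the
# trainer above reads. The key names mirror the lookups in run();
# every value is an illustrative placeholder, not a setting taken
# from the repository.
model_configs = {
    "problem_name": "seq2seq",
    "data": {
        "source_words_vocabulary": "vocab.src.txt",
        "target_words_vocabulary": "vocab.trg.txt",
        "source_bpecodes": None,
        "target_bpecodes": None,
        "train_features_file": "train.src.txt",
        "train_labels_file": "train.trg.txt",
        "eval_features_file": "dev.src.txt",
        "eval_labels_file": "dev.trg.txt",
    },
    "train": {
        "reverse_target": False,
        "batch_size": 80,
        "batch_tokens_size": None,
        "shuffle_every_epoch": True,
        "maximum_features_length": 50,
        "maximum_labels_length": 50,
    },
}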
def _prepare(self):
    """ Prepares for evaluation.

    Builds the model with reuse=True, mode=EVAL and preprocesses
    data file(s). Furthermore, if the decay_type of the optimizer is
    "loss_decay", creates the controller variables/operations.
    """
    features_file = self._dataset["features_file"]
    labels_file = self._dataset["labels_file"]
    vocab_source = self._dataset["vocab_source"]
    vocab_target = self._dataset["vocab_target"]
    text_inputter = ParallelTextInputter(
        LineReader(data=features_file,
                   preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x)),
        LineReader(data=labels_file,
                   preprocessing_fn=lambda x: vocab_target.convert_to_idlist(x)),
        vocab_source.pad_id,
        vocab_target.pad_id,
        batch_size=self._batch_size,
        batch_tokens_size=None,
        shuffle_every_epoch=None,
        bucketing=True)
    estimator_spec = model_fn(
        model_configs=self._model_configs,
        mode=ModeKeys.EVAL,
        vocab_source=vocab_source,
        vocab_target=vocab_target,
        name=self._model_name,
        reuse=True,
        verbose=False)
    self._eval_feeding_data = text_inputter.make_feeding_data(
        input_fields=estimator_spec.input_fields, in_memory=True)
    self._loss_op = estimator_spec.loss
    # for learning rate decay
    self._half_lr = False
    self._start_decay_at = 0
    if self._model_configs["optimizer_params"]["optimizer.lr_decay"]["decay_type"] == "loss_decay":
        self._half_lr = True
        lr_tensor_dict = get_dict_from_collection(Constants.LEARNING_RATE_VAR_NAME)
        self._learning_rate = lr_tensor_dict[Constants.LEARNING_RATE_VAR_NAME]
        self._max_patience = self._model_configs["optimizer_params"]["optimizer.lr_decay"]["patience"]
        self._start_decay_at = self._model_configs["optimizer_params"]["optimizer.lr_decay"]["start_decay_at"]
        assert self._start_decay_at >= self._start_at, (
            "start_decay_at in optimizer.lr_decay should be no less than "
            "start_at in LossMetricSpec.")
        self._half_lr_op = lr_tensor_dict[Constants.LR_AUTO_HALF_OP_NAME]
        self._bad_count = 0
        self._min_loss = 10000.
def _prepare(self):
    """ Prepares for evaluation.

    Builds the model with reuse=True, mode=EVAL and preprocesses
    data file(s). Furthermore, if the decay_type of the optimizer is
    "loss_decay", creates the controller variables/operations.
    """
    text_inputter = ParallelTextInputter(
        dataset=self._dataset,
        features_field_name="eval_features_file",
        labels_field_name="eval_labels_file",
        batch_size=self._batch_size,
        batch_tokens_size=None,
        shuffle_every_epoch=None,
        bucketing=True)
    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.EVAL,
                              dataset=self._dataset,
                              name=self._model_name,
                              reuse=True,
                              verbose=False)
    self._eval_feeding_data = text_inputter.make_feeding_data(
        input_fields=estimator_spec.input_fields, in_memory=True)
    self._loss_op = estimator_spec.loss
    # for learning rate decay
    self._half_lr = False
    self._start_decay_at = 0
    if self._model_configs["optimizer_params"]["optimizer.lr_decay"]["decay_type"] == "loss_decay":
        self._half_lr = True
        lr_tensor_dict = get_dict_from_collection(
            Constants.LEARNING_RATE_VAR_NAME)
        self._learning_rate = lr_tensor_dict[Constants.LEARNING_RATE_VAR_NAME]
        self._max_patience = self._model_configs["optimizer_params"]["optimizer.lr_decay"]["patience"]
        self._start_decay_at = self._model_configs["optimizer_params"]["optimizer.lr_decay"]["start_decay_at"]
        assert self._start_decay_at >= self._start_at, (
            "start_decay_at in optimizer.lr_decay should be no less than "
            "start_at in LossMetricSpec.")
        # doubling the divide factor halves the effective learning rate
        div_factor = lr_tensor_dict[Constants.LR_ANNEAL_DIV_FACTOR_NAME]
        self._half_lr_op = div_factor.assign(div_factor * 2.)
        self._bad_count = 0
        self._min_loss = 10000.
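# The attributes initialized under the "loss_decay" branch above
# (_min_loss, _bad_count, _max_patience, _half_lr_op) drive a
# patience-based halving schedule. The hook that consumes them is not
# part of this section, so the following is a hypothetical sketch of
# that logic, using only the state set up in _prepare().
def _maybe_half_lr(self, session, global_step, current_loss):
    if not self._half_lr or global_step < self._start_decay_at:
        return
    if current_loss < self._min_loss:
        # new best evaluation loss: reset the patience counter
        self._min_loss = current_loss
        self._bad_count = 0
    else:
        self._bad_count += 1
        if self._bad_count >= self._max_patience:
            # patience exhausted: run the op that doubles the divide
            # factor, i.e. halves the effective learning rate
            session.run(self._half_lr_op)
            self._bad_count = 0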
def testParallelInputterEval(self):
    vocab_src = Vocab(vocab_src_file)
    vocab_trg = Vocab(vocab_trg_file)
    dataset = Dataset(vocab_src, vocab_trg,
                      train_src_file, train_trg_file,
                      eval_src_file, eval_trg_file)
    inputter = ParallelTextInputter(dataset,
                                    "eval_features_file",
                                    "eval_labels_file",
                                    batch_size=13,
                                    maximum_features_length=None,
                                    maximum_labels_length=None)
    eval_iter1 = EvalTextIterator(eval_src_file, eval_trg_file,
                                  vocab_src, vocab_trg,
                                  batch_size=13)
    eval_iter2 = TrainTextIterator(eval_src_file, eval_trg_file + "0",
                                   vocab_src, vocab_trg,
                                   batch_size=13,
                                   maxlen_src=1000, maxlen_trg=1000)
    input_fields = dataset.input_fields
    eval_data = inputter.make_feeding_data()
    for a, b, c in zip(eval_iter1, eval_iter2, eval_data):
        x1 = a[0][0]
        x_len1 = a[0][1]
        y1 = a[1][0]
        y_len1 = a[1][1]
        x2 = b[0][0]
        x_len2 = b[0][1]
        y2 = b[1][0]
        y_len2 = b[1][1]
        x_new = c[1][input_fields[Constants.FEATURE_IDS_NAME]]
        x_len_new = c[1][input_fields[Constants.FEATURE_LENGTH_NAME]]
        y_new = c[1][input_fields[Constants.LABEL_IDS_NAME]]
        y_len_new = c[1][input_fields[Constants.LABEL_LENGTH_NAME]]
        # compare element-wise; `x1.all() == x_new.all()` would only
        # compare two scalar booleans, not the array contents
        assert (x1 == x_new).all() and (x2 == x_new).all()
        assert (x_len1 == x_len_new).all() and (x_len2 == x_len_new).all()
        assert (y1 == y_new).all() and (y2 == y_new).all()
        assert (y_len1 == y_len_new).all() and (y_len2 == y_len_new).all()
    print("Test Passed...")
def testParallelInputterTrain(self):
    vocab_src = Vocab(vocab_src_file)
    vocab_trg = Vocab(vocab_trg_file)
    dataset = Dataset(vocab_src, vocab_trg,
                      train_src_file, train_trg_file,
                      eval_src_file, eval_trg_file)
    inputter = ParallelTextInputter(dataset,
                                    "train_features_file",
                                    "train_labels_file",
                                    batch_size=13,
                                    maximum_features_length=20,
                                    maximum_labels_length=20)
    inputter._cache_size = 10
    train_iter = TrainTextIterator(train_src_file, train_trg_file,
                                   vocab_src, vocab_trg,
                                   batch_size=13,
                                   maxlen_src=20, maxlen_trg=20)
    train_iter.k = 10
    input_fields = dataset.input_fields
    train_data = inputter.make_feeding_data()
    for a, b in zip(train_iter, train_data):
        x = a[0][0]
        x_len = a[0][1]
        y = a[1][0]
        y_len = a[1][1]
        x_new = b[1][input_fields[Constants.FEATURE_IDS_NAME]]
        x_len_new = b[1][input_fields[Constants.FEATURE_LENGTH_NAME]]
        y_new = b[1][input_fields[Constants.LABEL_IDS_NAME]]
        y_len_new = b[1][input_fields[Constants.LABEL_LENGTH_NAME]]
        # compare element-wise; `x.all() == x_new.all()` would only
        # compare two scalar booleans, not the array contents
        assert (x == x_new).all()
        assert (x_len == x_len_new).all()
        assert (y == y_new).all()
        assert (y_len == y_len_new).all()
    print("Test Passed...")
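# Both tests above compare arrays element-wise with (x == y).all().
# Where numpy is available, numpy.testing expresses the same check with
# an informative failure diff; a tiny self-contained example (the arrays
# here are placeholders, not data from the tests):
import numpy as np

a = np.array([[1, 2], [3, 4]])
b = np.array([[1, 2], [3, 4]])
# fails with a diff of mismatching elements instead of a bare AssertionError
np.testing.assert_array_equal(a, b)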
def run(self):
    """ Evaluates data files. """
    # build vocabularies
    self._vocab_source = Vocab(
        filename=self._model_configs["eval"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["eval"]["source_bpecodes"],
        reverse_seq=False)
    self._vocab_target = Vocab(
        filename=self._model_configs["eval"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["eval"]["target_bpecodes"],
        reverse_seq=self._model_configs["train"]["reverse_target"])
    # build dataset
    dataset = Dataset(
        self._vocab_source,
        self._vocab_target,
        eval_features_file=[p["features_file"]
                            for p in self._model_configs["eval_data"]],
        eval_labels_file=[p["labels_file"]
                          for p in self._model_configs["eval_data"]])
    # update evaluation model config
    self._model_configs, metric_str = update_eval_metric(
        self._model_configs, self._model_configs["eval"]["metric"])
    tf.logging.info("Evaluating using {}".format(metric_str))
    # build model
    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.EVAL,
                              dataset=dataset,
                              name=self._model_configs["problem_name"])
    sess = self._build_default_session()
    # disable bucketing when any eval file requests attention output
    do_bucketing = (sum([p["output_attention"]
                         for p in self._model_configs["eval_data"]]) == 0)
    text_inputter = ParallelTextInputter(
        dataset=dataset,
        features_field_name="eval_features_file",
        labels_field_name="eval_labels_file",
        batch_size=self._model_configs["eval"]["batch_size"],
        bucketing=do_bucketing)
    # reload the latest checkpoint
    checkpoint_path = tf.train.latest_checkpoint(
        self._model_configs["model_dir"])
    if checkpoint_path:
        tf.logging.info("reloading models...")
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
    else:
        raise OSError("File not found. Failed to load checkpoint from: {}"
                      .format(self._model_configs["model_dir"]))
    tf.logging.info("Start evaluation.")
    overall_start_time = time.time()
    for eval_data, param in zip(
            text_inputter.make_feeding_data(
                input_fields=estimator_spec.input_fields, in_memory=True),
            self._model_configs["eval_data"]):
        tf.logging.info("Evaluation Source File: {}.".format(param["features_file"]))
        tf.logging.info("Evaluation Target File: {}.".format(param["labels_file"]))
        start_time = time.time()
        result = evaluate_with_attention(
            sess=sess,
            eval_op=estimator_spec.loss,
            eval_data=eval_data,
            vocab_source=self._vocab_source,
            vocab_target=self._vocab_target,
            attention_op=estimator_spec.predictions
            if param["output_attention"] else None,
            output_filename_prefix=param["labels_file"].strip().split("/")[-1])
        tf.logging.info("FINISHED {}. Elapsed Time: {}."
                        .format(param["features_file"],
                                str(time.time() - start_time)))
        tf.logging.info("Evaluation Score ({} on {}): {}"
                        .format(metric_str, param["features_file"], result))
    tf.logging.info("Total Elapsed Time: %s"
                    % str(time.time() - overall_start_time))
def run(self):
    """ Trains the model. """
    # vocabulary
    vocab_source = Vocab(
        filename=self._model_configs["data"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["data"]["source_bpecodes"],
        reverse_seq=self._model_configs["train"]["features_r2l"])
    vocab_target = Vocab(
        filename=self._model_configs["data"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["data"]["target_bpecodes"],
        reverse_seq=self._model_configs["train"]["labels_r2l"])
    eval_dataset = {
        "vocab_source": vocab_source,
        "vocab_target": vocab_target,
        "features_file": self._model_configs["data"]["eval_features_file"],
        "labels_file": self._model_configs["data"]["eval_labels_file"]}

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.TRAIN,
                              vocab_source=vocab_source,
                              vocab_target=vocab_target,
                              name=self._model_configs["problem_name"])
    train_ops = estimator_spec.train_ops
    hooks = estimator_spec.training_hooks

    # build training session
    sess = tf.train.MonitoredSession(
        session_creator=tf.train.ChiefSessionCreator(
            scaffold=tf.train.Scaffold(),
            checkpoint_dir=None,
            master="",
            config=config),
        hooks=tuple(hooks) + tuple(
            build_eval_metrics(self._model_configs, eval_dataset,
                               model_name=estimator_spec.name)))

    train_text_inputter = ParallelTextInputter(
        LineReader(
            data=self._model_configs["data"]["train_features_file"],
            maximum_length=self._model_configs["train"]["maximum_features_length"],
            preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x)),
        LineReader(
            data=self._model_configs["data"]["train_labels_file"],
            maximum_length=self._model_configs["train"]["maximum_labels_length"],
            preprocessing_fn=lambda x: vocab_target.convert_to_idlist(x)),
        vocab_source.pad_id,
        vocab_target.pad_id,
        batch_size=self._model_configs["train"]["batch_size"],
        batch_tokens_size=self._model_configs["train"]["batch_tokens_size"],
        shuffle_every_epoch=self._model_configs["train"]["shuffle_every_epoch"],
        fill_full_batch=True,
        bucketing=True)
    train_data = train_text_inputter.make_feeding_data(
        input_fields=estimator_spec.input_fields)

    eidx = [0, 0]
    update_cycle = [self._model_configs["train"]["update_cycle"], 1]

    def step_fn(step_context):
        # accumulate gradients over update_cycle batches, then apply
        # them in a single parameter update
        step_context.session.run(train_ops["zeros_op"])
        try:
            while update_cycle[0] != update_cycle[1]:
                data = next(train_data)
                step_context.session.run(train_ops["collect_op"],
                                         feed_dict=data["feed_dict"])
                update_cycle[1] += 1
            data = next(train_data)
            update_cycle[1] = 1
            return step_context.run_with_hooks(train_ops["train_op"],
                                               feed_dict=data["feed_dict"])
        except StopIteration:
            eidx[1] += 1

    while not sess.should_stop():
        if eidx[0] != eidx[1]:
            tf.logging.info("STARTUP Epoch {}".format(eidx[1]))
            eidx[0] = eidx[1]
        sess.run_step_fn(step_fn)
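# The zeros_op / collect_op / train_op triple consumed by step_fn above
# implements gradient accumulation: gradients from update_cycle batches
# are summed into buffer variables before a single parameter update.
# How these ops are built is outside this section; the sketch below is
# one plausible TF 1.x construction. Only the three dict keys come from
# the code above; the function name and buffer variables are
# assumptions, and dense gradients are assumed for simplicity.
import tensorflow as tf

def build_accum_ops(loss, optimizer):
    params = tf.trainable_variables()
    grads = tf.gradients(loss, params)
    # one non-trainable buffer per parameter to sum gradients into
    buffers = [tf.Variable(tf.zeros_like(p), trainable=False)
               for p in params]
    zeros_op = tf.group(*[b.assign(tf.zeros_like(b)) for b in buffers])
    collect_op = tf.group(*[b.assign_add(g)
                            for b, g in zip(buffers, grads)])
    # the final batch's gradients are added on top of the buffers and
    # applied in one update, matching how step_fn feeds the last batch
    # to train_op without running collect_op first
    train_op = optimizer.apply_gradients(
        [(b + g, p) for b, g, p in zip(buffers, grads, params)])
    return {"zeros_op": zeros_op,
            "collect_op": collect_op,
            "train_op": train_op}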
def run(self):
    """ Evaluates data files. """
    # build vocabularies
    self._vocab_source = Vocab(
        filename=self._model_configs["eval"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["eval"]["source_bpecodes"],
        reverse_seq=False)
    self._vocab_target = Vocab(
        filename=self._model_configs["eval"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["eval"]["target_bpecodes"],
        reverse_seq=self._model_configs["train"]["reverse_target"])
    # build dataset
    dataset = Dataset(
        self._vocab_source,
        self._vocab_target,
        eval_features_file=[p["features_file"]
                            for p in self._model_configs["eval_data"]],
        eval_labels_file=[p["labels_file"]
                          for p in self._model_configs["eval_data"]])
    # update evaluation model config
    self._model_configs, metric_str = update_eval_metric(
        self._model_configs, self._model_configs["eval"]["metric"])
    tf.logging.info("Evaluating using {}".format(metric_str))
    # build model
    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.EVAL,
                              dataset=dataset,
                              name=self._model_configs["problem_name"])
    sess = self._build_default_session()
    # disable bucketing when any eval file requests attention output
    text_inputter = ParallelTextInputter(
        dataset=dataset,
        features_field_name="eval_features_file",
        labels_field_name="eval_labels_file",
        batch_size=self._model_configs["eval"]["batch_size"],
        bucketing=(sum([p["output_attention"]
                        for p in self._model_configs["eval_data"]]) == 0))
    # reload the latest checkpoint
    checkpoint_path = tf.train.latest_checkpoint(
        self._model_configs["model_dir"])
    if checkpoint_path:
        tf.logging.info("reloading models...")
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_path)
    else:
        raise OSError("File not found. Failed to load checkpoint from: {}"
                      .format(self._model_configs["model_dir"]))
    tf.logging.info("Start evaluation.")
    overall_start_time = time.time()
    for eval_data, param in zip(
            text_inputter.make_feeding_data(
                input_fields=estimator_spec.input_fields, in_memory=True),
            self._model_configs["eval_data"]):
        tf.logging.info("Evaluation Source File: {}.".format(param["features_file"]))
        tf.logging.info("Evaluation Target File: {}.".format(param["labels_file"]))
        start_time = time.time()
        result = evaluate_with_attention(
            sess=sess,
            loss_op=estimator_spec.loss,
            eval_data=eval_data,
            vocab_source=self._vocab_source,
            vocab_target=self._vocab_target,
            attention_op=estimator_spec.predictions
            if param["output_attention"] else None,
            output_filename_prefix=param["labels_file"].strip().split("/")[-1])
        tf.logging.info("FINISHED {}. Elapsed Time: {}."
                        .format(param["features_file"],
                                str(time.time() - start_time)))
        tf.logging.info("Evaluation Score ({} on {}): {}"
                        .format(metric_str, param["features_file"], result))
    tf.logging.info("Total Elapsed Time: %s"
                    % str(time.time() - overall_start_time))
def run(self):
    """ Trains the model. """
    # vocabulary
    self._vocab_source = Vocab(
        filename=self._model_configs["data"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["data"]["source_bpecodes"],
        reverse_seq=False)
    self._vocab_target = Vocab(
        filename=self._model_configs["data"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["data"]["target_bpecodes"],
        reverse_seq=self._model_configs["train"]["reverse_target"])
    # build dataset
    dataset = Dataset(
        self._vocab_source,
        self._vocab_target,
        train_features_file=self._model_configs["data"]["train_features_file"],
        train_labels_file=self._model_configs["data"]["train_labels_file"],
        eval_features_file=self._model_configs["data"]["eval_features_file"],
        eval_labels_file=self._model_configs["data"]["eval_labels_file"])

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    estimator_spec = model_fn(model_configs=self._model_configs,
                              mode=ModeKeys.TRAIN,
                              dataset=dataset,
                              name=self._model_configs["problem_name"])
    train_ops = estimator_spec.train_ops
    hooks = estimator_spec.training_hooks

    # build training session
    sess = tf.train.MonitoredSession(
        session_creator=tf.train.ChiefSessionCreator(
            scaffold=tf.train.Scaffold(),
            checkpoint_dir=None,
            master="",
            config=config),
        hooks=hooks)

    train_text_inputter = ParallelTextInputter(
        dataset,
        "train_features_file",
        "train_labels_file",
        self._model_configs["train"]["batch_size"],
        self._model_configs["train"]["batch_tokens_size"],
        self._model_configs["train"]["shuffle_every_epoch"],
        fill_full_batch=True)
    train_data = train_text_inputter.make_feeding_data(
        input_fields=estimator_spec.input_fields,
        maximum_features_length=self._model_configs["train"]["maximum_features_length"],
        maximum_labels_length=self._model_configs["train"]["maximum_labels_length"])

    eidx = [0, 0]
    update_cycle = [self._model_configs["train"]["update_cycle"], 1]

    def step_fn(step_context):
        # accumulate gradients over update_cycle batches, then apply
        # them in a single parameter update
        step_context.session.run(train_ops["zeros_op"])
        try:
            while update_cycle[0] != update_cycle[1]:
                data = next(train_data)
                step_context.session.run(train_ops["collect_op"],
                                         feed_dict=data["feed_dict"])
                update_cycle[1] += 1
            data = next(train_data)
            update_cycle[1] = 1
            return step_context.run_with_hooks(train_ops["train_op"],
                                               feed_dict=data["feed_dict"])
        except StopIteration:
            eidx[1] += 1

    while not sess.should_stop():
        if eidx[0] != eidx[1]:
            tf.logging.info("STARTUP Epoch {}".format(eidx[1]))
            eidx[0] = eidx[1]
        sess.run_step_fn(step_fn)