def run(self):
    """ Runs ensemble model.

    Loads the source and target vocabularies named in the "infer" section
    of the model configs, builds the ensemble graph with
    `model_fn_ensemble` over `self._model_dirs`, and decodes every entry of
    `self._model_configs["infer_data"]` one after another. For entries
    whose "labels_file" is not None, the BLEU score of the produced output
    file is logged. Elapsed time is logged per file and for the whole run.
    """
    infer_options = self._model_configs["infer"]

    # vocabularies
    vocab_source = Vocab(
        filename=infer_options["source_words_vocabulary"],
        bpe_codes=infer_options["source_bpecodes"])
    vocab_target = Vocab(
        filename=infer_options["target_words_vocabulary"],
        bpe_codes=infer_options["target_bpecodes"])

    # ensemble graph and session
    estimator_spec = model_fn_ensemble(
        self._model_dirs, vocab_source, vocab_target,
        weight_scheme=self._weight_scheme,
        inference_options=infer_options)
    predict_op = estimator_spec.predictions
    sess = self._build_default_session()

    # one line reader per file to be decoded
    infer_files = self._model_configs["infer_data"]
    line_readers = []
    for data_entry in infer_files:
        line_readers.append(LineReader(
            data=data_entry["features_file"],
            preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x)))
    text_inputter = TextLineInputter(
        line_readers=line_readers,
        padding_id=vocab_source.pad_id,
        batch_size=infer_options["batch_size"])

    sess.run(tf.global_variables_initializer())
    tf.logging.info("Start inference.")
    overall_start_time = time.time()

    feeding_per_file = text_inputter.make_feeding_data(
        estimator_spec.input_fields)
    for feeding_data, param in zip(feeding_per_file, infer_files):
        source_file = param["features_file"]
        tf.logging.info(
            "Infer Source Features File: {}.".format(source_file))
        start_time = time.time()
        infer(sess=sess,
              prediction_op=predict_op,
              infer_data=feeding_data,
              output=param["output_file"],
              vocab_source=vocab_source,
              vocab_target=vocab_target,
              delimiter=infer_options["delimiter"],
              output_attention=False,
              to_char_level=infer_options["char_level"],
              verbose=True)
        tf.logging.info("FINISHED {}. Elapsed Time: {}.".format(
            param["features_file"], str(time.time() - start_time)))

        # BLEU is only computed when references are available.
        if param["labels_file"] is None:
            continue
        bleu_score = multi_bleu_score_from_file(
            hypothesis_file=param["output_file"],
            references_files=param["labels_file"],
            char_level=infer_options["char_level"])
        tf.logging.info("BLEU score (%s): %.2f"
                        % (param["features_file"], bleu_score))

    total_elapsed = str(time.time() - overall_start_time)
    tf.logging.info("Total Elapsed Time: %s" % total_elapsed)
def run(self):
    """Infers data files.

    Builds the vocabularies, applies the inference parameters (beam size,
    maximum labels length, length penalty) to the model configs, builds
    the model in INFER mode and restores the latest checkpoint found in
    `self._model_configs["model_dir"]`. Every entry of
    `self._model_configs["infer_data"]` is then decoded in turn, and a BLEU
    score is logged for entries whose "labels_file" is not None.

    Raises:
        OSError: if no checkpoint is found in the model directory.
    """
    # build datasets
    infer_cfg = self._model_configs["infer"]
    train_cfg = self._model_configs["train"]
    vocab_source = Vocab(
        filename=infer_cfg["source_words_vocabulary"],
        bpe_codes=infer_cfg["source_bpecodes"],
        reverse_seq=train_cfg["features_r2l"])
    vocab_target = Vocab(
        filename=infer_cfg["target_words_vocabulary"],
        bpe_codes=infer_cfg["target_bpecodes"],
        reverse_seq=train_cfg["labels_r2l"])
    self._model_configs = update_infer_params(
        self._model_configs,
        beam_size=infer_cfg["beam_size"],
        maximum_labels_length=infer_cfg["maximum_labels_length"],
        length_penalty=infer_cfg["length_penalty"])

    # build model
    estimator_spec = model_fn(
        model_configs=self._model_configs,
        mode=ModeKeys.INFER,
        vocab_source=vocab_source,
        vocab_target=vocab_target,
        name=self._model_configs["problem_name"])
    predict_op = estimator_spec.predictions
    sess = self._build_default_session()

    # The configs were handed to helpers above, so look the section up again
    # instead of reusing the earlier alias.
    infer_cfg = self._model_configs["infer"]
    infer_files = self._model_configs["infer_data"]
    line_readers = []
    for data_entry in infer_files:
        line_readers.append(LineReader(
            data=data_entry["features_file"],
            preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x)))
    text_inputter = TextLineInputter(
        line_readers=line_readers,
        padding_id=vocab_source.pad_id,
        batch_size=infer_cfg["batch_size"])

    # reload
    model_dir = self._model_configs["model_dir"]
    checkpoint_path = tf.train.latest_checkpoint(model_dir)
    if not checkpoint_path:
        raise OSError(
            "File NOT Found. Fail to find checkpoint file from: {}".format(
                self._model_configs["model_dir"]))
    tf.logging.info("reloading models...")
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint_path)

    tf.logging.info("Start inference.")
    overall_start_time = time.time()

    feeding_per_file = text_inputter.make_feeding_data(
        input_fields=estimator_spec.input_fields)
    for infer_data, param in zip(feeding_per_file, infer_files):
        tf.logging.info(
            "Infer Source File: {}.".format(param["features_file"]))
        start_time = time.time()
        infer(sess=sess,
              prediction_op=predict_op,
              infer_data=infer_data,
              output=param["output_file"],
              vocab_source=vocab_source,
              vocab_target=vocab_target,
              delimiter=infer_cfg["delimiter"],
              output_attention=param["output_attention"],
              tokenize_output=infer_cfg["char_level"],
              verbose=True)
        tf.logging.info("FINISHED {}. Elapsed Time: {}.".format(
            param["features_file"], str(time.time() - start_time)))

        # BLEU is only computed when references are available.
        if param["labels_file"] is None:
            continue
        bleu_score = multi_bleu_score_from_file(
            hypothesis_file=param["output_file"],
            references_files=param["labels_file"],
            char_level=infer_cfg["char_level"])
        tf.logging.info("BLEU score (%s): %.2f"
                        % (param["features_file"], bleu_score))

    total_elapsed = str(time.time() - overall_start_time)
    tf.logging.info("Total Elapsed Time: %s" % total_elapsed)
def run(self):
    """ Trains the model.

    Builds the vocabularies and the training graph (`model_fn` in TRAIN
    mode), then runs the training loop inside a
    `tf.train.MonitoredSession` whose hooks are the model's training hooks
    followed by the hooks returned by `build_eval_metrics`.

    One training step (see `step_fn`) runs `train_ops["zeros_op"]`, then
    `train_ops["collect_op"]` on `update_cycle - 1` batches, and finally
    `train_ops["train_op"]` on one more batch through the session hooks.
    The loop runs until the monitored session reports `should_stop()`.

    The session is used as a context manager so that it is closed when the
    loop ends; closing is what triggers the hooks' `end()` callbacks.
    """
    # vocabulary
    vocab_source = Vocab(
        filename=self._model_configs["data"]["source_words_vocabulary"],
        bpe_codes=self._model_configs["data"]["source_bpecodes"],
        reverse_seq=self._model_configs["train"]["features_r2l"])
    vocab_target = Vocab(
        filename=self._model_configs["data"]["target_words_vocabulary"],
        bpe_codes=self._model_configs["data"]["target_bpecodes"],
        reverse_seq=self._model_configs["train"]["labels_r2l"])
    eval_dataset = {
        "vocab_source": vocab_source,
        "vocab_target": vocab_target,
        "features_file": self._model_configs["data"]["eval_features_file"],
        "labels_file": self._model_configs["data"]["eval_labels_file"]
    }

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    estimator_spec = model_fn(
        model_configs=self._model_configs,
        mode=ModeKeys.TRAIN,
        vocab_source=vocab_source,
        vocab_target=vocab_target,
        name=self._model_configs["problem_name"])
    train_ops = estimator_spec.train_ops
    hooks = estimator_spec.training_hooks

    # build training session
    session_creator = tf.train.ChiefSessionCreator(
        scaffold=tf.train.Scaffold(),
        checkpoint_dir=None,
        master="",
        config=config)
    session_hooks = tuple(hooks) + tuple(
        build_eval_metrics(self._model_configs, eval_dataset,
                           model_name=estimator_spec.name))

    # The `with` block guarantees the monitored session is closed when
    # training stops. Note that MonitoredSession's context manager treats
    # OutOfRangeError / StopIteration escaping the block as a normal end
    # of training.
    with tf.train.MonitoredSession(
            session_creator=session_creator,
            hooks=session_hooks) as sess:
        train_text_inputter = ParallelTextInputter(
            LineReader(
                data=self._model_configs["data"]["train_features_file"],
                maximum_length=self._model_configs["train"]
                ["maximum_features_length"],
                preprocessing_fn=lambda x:
                vocab_source.convert_to_idlist(x)),
            LineReader(
                data=self._model_configs["data"]["train_labels_file"],
                maximum_length=self._model_configs["train"]
                ["maximum_labels_length"],
                preprocessing_fn=lambda x:
                vocab_target.convert_to_idlist(x)),
            vocab_source.pad_id,
            vocab_target.pad_id,
            batch_size=self._model_configs["train"]["batch_size"],
            batch_tokens_size=self._model_configs["train"]
            ["batch_tokens_size"],
            shuffle_every_epoch=self._model_configs["train"]
            ["shuffle_every_epoch"],
            fill_full_batch=True,
            bucketing=True)
        train_data = train_text_inputter.make_feeding_data(
            input_fields=estimator_spec.input_fields)

        # Both are lists so that `step_fn` can mutate them in place:
        #   eidx         = [last announced epoch, current epoch]
        #   update_cycle = [batches per update, 1-based position in cycle]
        eidx = [0, 0]
        update_cycle = [self._model_configs["train"]["update_cycle"], 1]

        def step_fn(step_context):
            """Runs one parameter update built from `update_cycle[0]` batches.

            Returns the result of running "train_op" with hooks, or None
            when the data iterator raised StopIteration (in which case the
            epoch counter is advanced instead).
            """
            step_context.session.run(train_ops["zeros_op"])
            try:
                # NOTE(review): `.next()` is the Python 2 iterator protocol;
                # confirm what `make_feeding_data` returns before porting.
                while update_cycle[0] != update_cycle[1]:
                    data = train_data.next()
                    step_context.session.run(train_ops["collect_op"],
                                             feed_dict=data["feed_dict"])
                    update_cycle[1] += 1
                data = train_data.next()
                update_cycle[1] = 1
                return step_context.run_with_hooks(
                    train_ops["train_op"], feed_dict=data["feed_dict"])
            except StopIteration:
                # Presumably the inputter signals the end of an epoch this
                # way and can be iterated again afterwards -- TODO confirm.
                eidx[1] += 1

        while not sess.should_stop():
            if eidx[0] != eidx[1]:
                tf.logging.info("STARTUP Epoch {}".format(eidx[1]))
                eidx[0] = eidx[1]
            sess.run_step_fn(step_fn)
def run(self):
    """Evaluates data files.

    Builds the vocabularies, updates the model configs for the metric named
    in `self._model_configs["eval"]["metric"]`, builds the model in EVAL
    mode and restores the latest checkpoint found in
    `self._model_configs["model_dir"]`. Every entry of
    `self._model_configs["eval_data"]` is then evaluated with
    `evaluate_with_attention` and the resulting score is logged.

    Raises:
        OSError: if no checkpoint is found in the model directory.
    """
    # build datasets
    eval_cfg = self._model_configs["eval"]
    train_cfg = self._model_configs["train"]
    vocab_source = Vocab(
        filename=eval_cfg["source_words_vocabulary"],
        bpe_codes=eval_cfg["source_bpecodes"],
        reverse_seq=train_cfg["features_r2l"])
    vocab_target = Vocab(
        filename=eval_cfg["target_words_vocabulary"],
        bpe_codes=eval_cfg["target_bpecodes"],
        reverse_seq=train_cfg["labels_r2l"])

    # update evaluation model config
    self._model_configs, metric_str = update_eval_metric(
        self._model_configs, eval_cfg["metric"])
    tf.logging.info("Evaluating using {}".format(metric_str))

    # build model
    estimator_spec = model_fn(
        model_configs=self._model_configs,
        mode=ModeKeys.EVAL,
        vocab_source=vocab_source,
        vocab_target=vocab_target,
        name=self._model_configs["problem_name"])
    sess = self._build_default_session()

    # reload
    checkpoint_path = tf.train.latest_checkpoint(
        self._model_configs["model_dir"])
    if not checkpoint_path:
        raise OSError(
            "File NOT Found. Fail to load checkpoint file from: {}"
            .format(self._model_configs["model_dir"]))
    tf.logging.info("reloading models...")
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint_path)

    tf.logging.info("Start evaluation.")
    overall_start_time = time.time()

    for data_param in self._model_configs["eval_data"]:
        tf.logging.info("Evaluation Source File: {}."
                        .format(data_param["features_file"]))
        tf.logging.info("Evaluation Target File: {}."
                        .format(data_param["labels_file"]))

        eval_inputter = ParallelTextInputter(
            LineReader(
                data=data_param["features_file"],
                preprocessing_fn=lambda x: vocab_source.convert_to_idlist(x)),
            LineReader(
                data=data_param["labels_file"],
                preprocessing_fn=lambda x: vocab_target.convert_to_idlist(x)),
            vocab_source.pad_id,
            vocab_target.pad_id,
            batch_size=self._model_configs["eval"]["batch_size"],
            bucketing=False)
        eval_data = eval_inputter.make_feeding_data(
            input_fields=estimator_spec.input_fields, in_memory=True)

        start_time = time.time()

        # The attention op is only passed on when this data entry asks for it.
        if data_param["output_attention"]:
            attention_op = estimator_spec.predictions
        else:
            attention_op = None
        # Output files are prefixed with the labels file's base name.
        output_prefix = data_param["labels_file"].strip().rsplit("/", 1)[-1]

        result = evaluate_with_attention(
            sess=sess,
            loss_op=estimator_spec.loss,
            eval_data=eval_data,
            vocab_source=vocab_source,
            vocab_target=vocab_target,
            attention_op=attention_op,
            output_filename_prefix=output_prefix)

        tf.logging.info("FINISHED {}. Elapsed Time: {}.".format(
            data_param["features_file"], str(time.time() - start_time)))
        tf.logging.info("Evaluation Score ({} on {}): {}".format(
            metric_str, data_param["features_file"], result))

    total_elapsed = str(time.time() - overall_start_time)
    tf.logging.info("Total Elapsed Time: %s" % total_elapsed)