def predict(override_cfg, model_dir):
    """Write the model's answers for the eval split to a JSON file.

    The config stored for `model_dir` is loaded, `override_cfg` is merged on
    top of it, and the resulting estimator is run over
    `cfg.dataset.eval_split`. The collected `{id: answer}` mapping is dumped
    to `FLAGS.predict_path`.

    Args:
      override_cfg: Config values merged over the loaded config with
        `utils.merge`.
      model_dir: Directory handed to `_load_config`.
    """
    assert FLAGS.predict_path

    cfg = utils.merge(_load_config(model_dir), override_cfg)

    # Single un-shuffled, un-cached pass over the eval split.
    input_fn = data.get_input_fn(
        split=cfg.dataset.eval_split,
        max_length=None,
        repeat=False,
        shuffle=False,
        cache=False,
        limit=None,
        data_path=cfg.dataset.data_path,
        vocab_path=cfg.dataset.vocab_path,
        is_tpu=False,
        use_generator=True,
        is_training=False)
    estimator = model.get_estimator(**cfg)

    answers = {}
    for index, result in enumerate(estimator.predict(input_fn)):
        example_id = result["id"]
        answer = result["answer"]
        answers[example_id] = answer
        # Log every 100th prediction as a progress indicator.
        if index % 100 == 0:
            tf.logging.info("Prediction %s | %s: %s" %
                            (index, example_id, answer))

    # Dump results to a file
    with tf.gfile.GFile(FLAGS.predict_path, "w") as out_file:
        json.dump(answers, out_file)
def evaluate(override_cfg, model_dir, continuous=True):
    """Evaluate the model stored in `model_dir`.

    The config is loaded from `model_dir`; if loading raises
    `tf.errors.NotFoundError` a new config is built instead. `override_cfg` is
    merged on top, TPU use is switched off and the dataset max length is
    cleared before the eval input and estimator are constructed.

    Args:
      override_cfg: Config values merged over the base config with
        `utils.merge`.
      model_dir: Directory handed to `_load_config` / `model.build_config`.
      continuous: If true, evaluate every checkpoint yielded by
        `tf.contrib.training.checkpoints_iterator(cfg.model_dir)`; otherwise
        run a single evaluation.

    Returns:
      The metrics from the single evaluation, or, in continuous mode, the
      metrics of the last checkpoint evaluated once the iterator is exhausted
      (None if it yielded nothing).
    """
    tf.logging.info("model_dir = " + model_dir)

    try:
        cfg = _load_config(model_dir)
    except tf.errors.NotFoundError:
        tf.logging.info(
            "Model directory does not exist yet. Creating new config.")
        cfg = model.build_config(
            model_dir=model_dir, data_path=FLAGS.data_path)
    tf.logging.info(cfg)
    tf.logging.info(override_cfg)
    cfg = utils.merge(cfg, override_cfg)

    # Evaluation-only overrides applied after merging.
    cfg.tpu.enable = False
    cfg.dataset.max_length = None

    # Construct inputs and estimator
    _, eval_input = data.build_dataset(cfg.dataset, is_tpu=cfg.tpu.enable)
    estimator = model.get_estimator(**cfg)

    if not continuous:
        return estimator.evaluate(input_fn=eval_input)

    latest_metrics = None
    for checkpoint in tf.contrib.training.checkpoints_iterator(cfg.model_dir):
        latest_metrics = estimator.evaluate(
            input_fn=eval_input, checkpoint_path=checkpoint)
        tf.logging.info(pprint.pformat(latest_metrics))
    return latest_metrics
def train_run(estimator_params, x_train_prep, y_train, x_test_prep, y_test,
              temp_dir):
    """Fit an estimator and evaluate it on the train and test data.

    Args:
      estimator_params: Keyword arguments forwarded to `get_estimator`.
      x_train_prep: Training features passed to `estimator.fit`.
      y_train: Training targets passed to `estimator.fit`.
      x_test_prep: Test features, used for evaluation only.
      y_test: Test targets, used for evaluation only.
      temp_dir: Directory object exposing `mkdir(parents=..., exist_ok=...)`;
        it is created if missing and forwarded to
        `evaluate_binary_classifier`.

    Returns:
      A tuple `(estimator, estimator_tags, metrics, artifacts)`, where the
      last two come from `evaluate_binary_classifier`.
    """
    temp_dir.mkdir(parents=True, exist_ok=True)

    _logger.info("Fitting the estimator")
    estimator, estimator_tags = get_estimator(**estimator_params)
    estimator.fit(x_train_prep, y_train)

    # Both splits are evaluated with the same fitted model.
    splits = {
        'train': {'x': x_train_prep, 'y': y_train},
        'test': {'x': x_test_prep, 'y': y_test},
    }
    metrics, artifacts = evaluate_binary_classifier(
        model=estimator, data=splits, temp_dir=temp_dir)

    return estimator, estimator_tags, metrics, artifacts
def train_and_eval(cfg, do_eval=True, report_fn=None):
    """Train for `cfg.num_epochs` epochs, optionally evaluating after each.

    The config is first written to `<cfg.model_dir>/config.json`. Each epoch
    calls `estimator.train` once; when `do_eval` is true it is followed by
    `estimator.evaluate`, and `report_fn` (if given) receives the metrics.

    Args:
      cfg: Config exposing `model_dir`, `dataset`, `tpu.enable`,
        `steps_per_epoch` and `num_epochs`; it is also unpacked as keyword
        arguments into `model.get_estimator`.
      do_eval: Whether to evaluate after every training epoch.
      report_fn: Optional callable invoked with the evaluation metrics of each
        epoch. Only used when `do_eval` is true.

    Returns:
      With `do_eval`, the metrics of the last evaluation (None when
      `cfg.num_epochs` is 0); otherwise an empty dict.

    Raises:
      ValueError: If neither `cfg.dataset.num_repeats` nor
        `cfg.steps_per_epoch` is set.
    """
    tf.logging.info("cfg.model_dir = " + cfg.model_dir)

    # Save out config to model directory
    assert FLAGS.mode == "train"
    tf.gfile.MakeDirs(cfg.model_dir)
    with tf.gfile.GFile(os.path.join(cfg.model_dir, "config.json"), "w") as f:
        json.dump(cfg, f)

    if not cfg.dataset.num_repeats and not cfg.steps_per_epoch:
        raise ValueError("Must have a fixed num repeats or epoch step size.")

    # Construct inputs and estimator
    train_input, eval_input = data.build_dataset(
        cfg.dataset, is_tpu=cfg.tpu.enable)
    estimator = model.get_estimator(**cfg)

    eval_metrics = None
    for i in range(cfg.num_epochs):
        tf.logging.info("Starting epoch %s/%s" % (i + 1, cfg.num_epochs))
        # A falsy steps_per_epoch is legal when num_repeats is set (see the
        # check above). Normalize it to None in BOTH modes: previously only
        # the eval path did so, and the train-only path forwarded e.g. 0,
        # which tf.estimator.Estimator.train rejects (steps must be > 0).
        # NOTE(review): assumes model.get_estimator returns a
        # tf.estimator.Estimator-compatible object.
        train_metrics = estimator.train(
            input_fn=train_input, steps=cfg.steps_per_epoch or None)
        tf.logging.info(pprint.pformat(train_metrics))

        if do_eval:
            eval_metrics = estimator.evaluate(input_fn=eval_input)
            tf.logging.info(pprint.pformat(eval_metrics))
            if report_fn:
                report_fn(eval_metrics)

    return eval_metrics if do_eval else dict()
def main(argv=None):
    """Run the CLV model.

    Parses the command line, then either delegates to `run_btyd` (for model
    types listed in `PROBABILISTIC_MODEL_TYPES`) or builds an estimator and
    runs `tf.estimator.train_and_evaluate` on it.

    Args:
      argv: Full argument vector including the program name at index 0.
        Defaults to `sys.argv`.
    """
    if argv is None:
        argv = sys.argv
    args = create_parser().parse_args(args=argv[1:])

    # Set logging mode
    tf.logging.set_verbosity(tf.logging.INFO)

    # execute non-estimator models
    if args.model_type in PROBABILISTIC_MODEL_TYPES:
        run_btyd(args.model_type, args.data_src, args.threshold_date,
                 args.predict_end)
        return

    model_dir = args.job_dir
    if args.hypertune:
        # if tuning, join the trial number to the output path
        tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
        trial = tf_config.get('task', {}).get('trial', '')
        model_dir = os.path.join(args.job_dir, trial)

    print('Running training with model {}'.format(args.model_type))

    # data path
    data_folder = '{}/'.format(args.data_src)

    # Calculate train steps and checkpoint steps based on approximate
    # training set size, batch size, and requested number of training
    # epochs.
    steps_per_epoch = args.train_size / args.batch_size
    train_steps = steps_per_epoch * args.num_epochs
    epochs_per_eval = args.num_epochs / NUM_EVAL
    checkpoint_steps = int(steps_per_epoch * epochs_per_eval)

    # create RunConfig
    run_config = tf.estimator.RunConfig(
        save_checkpoints_steps=checkpoint_steps)

    hidden_units = [int(width) for width in args.hidden_units.split()]

    # Hyperparameters
    params = tf.contrib.training.HParams(
        num_epochs=args.num_epochs,
        train_steps=train_steps,
        batch_size=args.batch_size,
        hidden_units=hidden_units,
        learning_rate=args.learning_rate,
        ignore_crosses=args.ignore_crosses,
        buffer_size=args.buffer_size,
        # The flag arrives as text; only the exact string 'True' enables it.
        learning_rate_decay=(args.learning_rate_decay == 'True'),
        l1_regularization=args.l1_regularization,
        l2_regularization=args.l2_regularization,
        optimizer=args.optimizer,
        # A dropout of exactly 0.0 is passed on as None.
        dropout=(args.dropout if args.dropout != 0.0 else None),
        checkpoint_steps=checkpoint_steps)

    print(params)
    print('')
    print('Dataset Size:', args.train_size)
    print('Batch Size:', args.batch_size)
    print('Steps per Epoch:', steps_per_epoch)
    print('Total Train Steps:', train_steps)
    print('Required Evaluation Steps:', NUM_EVAL)
    print('Perform evaluation step after each', epochs_per_eval, 'epochs')
    print('Save Checkpoint After', checkpoint_steps, 'steps')
    print('**********************************************')

    # Creates the relevant estimator (canned or custom)
    estimator = get_estimator(estimator_name=args.model_type,
                              config=run_config,
                              params=params,
                              model_dir=model_dir)

    def train_input_fn():
        return read_train(data_folder, params)

    def eval_input_fn():
        return read_eval(data_folder, params)

    # Creates the training and eval specs by reading the relevant datasets
    # Note that TrainSpec needs max_steps otherwise it runs forever.
    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn,
        max_steps=train_steps)

    exporter = tf.estimator.LatestExporter(
        name='estimate',
        serving_input_receiver_fn=csv_serving_input_fn,
        exports_to_keep=1,
        as_text=True)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn,
        exporters=[exporter],
        steps=1000,
        throttle_secs=1,
        start_delay_secs=1)

    if args.resume:
        print('Resuming training...')
    else:
        print('Removing previous trained model...')
        shutil.rmtree(model_dir, ignore_errors=True)

    # Runs the training and evaluation using the chosen estimator.
    # Saves model data into export/estimate/1234567890/...
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)