def run(): """Run the model training and return evaluation output.""" resolver = contrib_cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu) contrib_distribute.initialize_tpu_system(resolver) strategy = contrib_distribute.TPUStrategy(resolver) model_cls = MODELS[FLAGS.model] if FLAGS.use_synthetic_data: data = SyntheticDataset(FLAGS.batch_size) else: data = Cifar10Dataset(FLAGS.batch_size) with strategy.scope(): model = model_cls(weights=None, input_shape=data.input_shape, classes=data.num_classes) optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001) model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]) history = model.fit( data.train_dataset, epochs=FLAGS.epochs, steps_per_epoch=data.num_train_images // FLAGS.batch_size, validation_data=data.test_dataset, validation_steps=data.num_test_images // FLAGS.batch_size) return history.history
def _create_tpu_strategy(steps_per_run, **kwargs):
  # `steps_per_run` and `**kwargs` were referenced but undefined in the
  # original body; they are surfaced here as parameters.
  resolver = cluster_resolver.TPUClusterResolver("")
  tpu_lib.initialize_tpu_system(resolver)
  strategy = tpu_lib.TPUStrategy(resolver, steps_per_run=steps_per_run,
                                 **kwargs)
  return strategy
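# Hedged usage sketch for _create_tpu_strategy() above; steps_per_run=100 and
# the one-layer model are illustrative assumptions. Variables created under
# the strategy scope are replicated across TPU cores.
def _example_strategy_usage():
  strategy = _create_tpu_strategy(steps_per_run=100)
  with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
  return model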
def _get_tpu_estimator():
  tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
      FLAGS.tpu_name, zone=None, project=None)
  tpu_grpc_url = tpu_cluster_resolver.get_master()

  run_config = contrib_tpu_python_tpu_tpu_config.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.work_dir,
      save_checkpoints_steps=max(1000, FLAGS.iterations_per_loop),
      save_summary_steps=FLAGS.summary_steps,
      keep_checkpoint_max=FLAGS.keep_checkpoint_max,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=contrib_tpu_python_tpu_tpu_config.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=contrib_tpu_python_tpu_tpu_config
          .InputPipelineConfig.PER_HOST_V2))

  return contrib_tpu_python_tpu_tpu_estimator.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size * FLAGS.num_tpu_cores,
      eval_batch_size=FLAGS.train_batch_size * FLAGS.num_tpu_cores,
      params=FLAGS.flag_values_dict())
def run(): """Run the model training and return evaluation output.""" resolver = contrib_cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu) contrib_distribute.initialize_tpu_system(resolver) strategy = contrib_distribute.TPUStrategy(resolver, steps_per_run=100) if FLAGS.fake_data: print("Using fake data") x_train = np.random.random((BATCH_SIZE, IMG_ROWS, IMG_COLS)) y_train = np.zeros([BATCH_SIZE, 1], dtype=np.int32) x_test, y_test = x_train, y_train else: # the data, split between train and test sets print("Using real data") (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data() x_train = x_train.reshape(x_train.shape[0], IMG_ROWS, IMG_COLS, 1) x_test = x_test.reshape(x_test.shape[0], IMG_ROWS, IMG_COLS, 1) input_shape = (IMG_ROWS, IMG_COLS, 1) x_train = x_train.astype("float32") x_test = x_test.astype("float32") x_train /= 255 x_test /= 255 print("x_train shape:", x_train.shape) print(x_train.shape[0], "train samples") print(x_test.shape[0], "test samples") # convert class vectors to binary class matrices y_train = tf.keras.utils.to_categorical(y_train, NUM_CLASSES) y_test = tf.keras.utils.to_categorical(y_test, NUM_CLASSES) with strategy.scope(): model = mnist_model(input_shape) model.compile( loss=tf.keras.losses.categorical_crossentropy, optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05), metrics=["accuracy"], ) callbacks = [] if FLAGS.model_dir: callbacks = [tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir)] model.fit( x_train, y_train, batch_size=BATCH_SIZE, callbacks=callbacks, epochs=EPOCHS, verbose=1, validation_data=(x_test, y_test), ) return model.evaluate(x_test, y_test, batch_size=BATCH_SIZE, verbose=1)
def main(unused_argv):
  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')

  tpu_cluster_resolver = (
      contrib_cluster_resolver.TPUClusterResolver(
          tpu=[FLAGS.tpu_name], zone=FLAGS.tpu_zone,
          project=FLAGS.gcp_project))
  tpu_grpc_url = tpu_cluster_resolver.get_master()

  config = contrib_tpu.RunConfig(
      master=tpu_grpc_url,
      evaluation_master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards))

  kwargs = {}
  if FLAGS.train_batch_size:
    kwargs['batch_size'] = FLAGS.train_batch_size

  train_and_eval_dict = model_lib.create_estimator_and_inputs(
      run_config=config,
      hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
      pipeline_config_path=FLAGS.pipeline_config_path,
      train_steps=FLAGS.num_train_steps,
      sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
      sample_1_of_n_eval_on_train_examples=(
          FLAGS.sample_1_of_n_eval_on_train_examples),
      use_tpu_estimator=True,
      use_tpu=FLAGS.use_tpu,
      num_shards=FLAGS.num_shards,
      save_final_config=FLAGS.mode == 'train',
      **kwargs)
  estimator = train_and_eval_dict['estimator']
  train_input_fn = train_and_eval_dict['train_input_fn']
  eval_input_fns = train_and_eval_dict['eval_input_fns']
  eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
  train_steps = train_and_eval_dict['train_steps']

  if FLAGS.mode == 'train':
    estimator.train(input_fn=train_input_fn, max_steps=train_steps)

  # Continuously evaluating.
  if FLAGS.mode == 'eval':
    if FLAGS.eval_training_data:
      name = 'training_data'
      input_fn = eval_on_train_input_fn
    else:
      name = 'validation_data'
      # Currently only a single eval input is allowed.
      input_fn = eval_input_fns[0]
    model_lib.continuous_eval(estimator, FLAGS.model_dir, input_fn,
                              train_steps, name, FLAGS.max_eval_retries)
def _create_tpu_strategy(steps_per_run, use_single_core=False, **kwargs):
  # `steps_per_run`, `use_single_core`, and `**kwargs` were referenced but
  # undefined in the original body; they are surfaced here as parameters.
  resolver = cluster_resolver.TPUClusterResolver("")
  topology = tpu_lib.initialize_tpu_system(resolver)
  device_assignment = None
  if use_single_core:
    device_assignment = device_assignment_lib.DeviceAssignment(
        topology,
        core_assignment=device_assignment_lib.SINGLE_CORE_ASSIGNMENT)
  strategy = tpu_lib.TPUStrategy(
      resolver,
      steps_per_run=steps_per_run,
      device_assignment=device_assignment,
      **kwargs)
  return strategy
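# Hedged usage sketch: the use_single_core path above pins all computation to
# one TPU core via SINGLE_CORE_ASSIGNMENT, which can help when debugging
# numerics; steps_per_run=1 is an illustrative assumption.
def _example_single_core_usage():
  return _create_tpu_strategy(steps_per_run=1, use_single_core=True)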
def create_estimator(model_fn,
                     model_dir,
                     hparams,
                     use_tpu=False,
                     master='',
                     tpu_cluster=None,
                     save_checkpoint_steps=300,
                     save_summary_steps=300,
                     keep_checkpoint_max=None,
                     warm_start_from=None):
  """Creates an estimator."""

  def wrapped_model_fn(features, labels, mode, params, config):
    """Wrap model_fn to restore labels value if present in features."""
    # Workaround for Estimator API that forces 'labels' to be None when in
    # predict mode.
    # https://github.com/tensorflow/tensorflow/issues/17824
    # See also infer_util.labels_to_features_wrapper
    if labels is None and hasattr(features, 'labels'):
      labels = features.labels
    return model_fn(features, labels, mode, params, config)

  if tpu_cluster:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        tpu_cluster)
    master = None
  else:
    tpu_cluster_resolver = None

  config = contrib_tpu.RunConfig(
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=save_checkpoint_steps),
      master=master,
      cluster=tpu_cluster_resolver,
      save_summary_steps=save_summary_steps,
      save_checkpoints_steps=save_checkpoint_steps,
      keep_checkpoint_max=keep_checkpoint_max,
      keep_checkpoint_every_n_hours=1)

  params = copy.deepcopy(hparams)
  params.del_hparam('batch_size')
  return contrib_tpu.TPUEstimator(
      use_tpu=use_tpu,
      model_fn=wrapped_model_fn,
      model_dir=model_dir,
      params=params,
      train_batch_size=hparams.batch_size,
      eval_batch_size=hparams.eval_batch_size,
      predict_batch_size=hparams.predict_batch_size,
      config=config,
      warm_start_from=warm_start_from,
      eval_on_tpu=False)
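# Hedged usage sketch for create_estimator(); the HParams fields mirror the
# batch-size attributes the function reads, but the values, the my_model_fn
# argument, and the contrib_training alias are assumptions for illustration.
def _example_create_estimator(my_model_fn):
  hparams = contrib_training.HParams(
      batch_size=64, eval_batch_size=64, predict_batch_size=64)
  return create_estimator(
      model_fn=my_model_fn,
      model_dir='/tmp/model',
      hparams=hparams,
      use_tpu=False)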
def construct_estimator(flags_obj, params, schedule_manager):
  """Construct an estimator from either Estimator or TPUEstimator.

  Args:
    flags_obj: The FLAGS object parsed from command line.
    params: A dict of run specific parameters.
    schedule_manager: A schedule.Manager object containing the run schedule.

  Returns:
    An estimator object to be used for training and eval.
  """
  if not params["use_tpu"]:
    distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_core.get_num_gpus(flags_obj),
        all_reduce_alg=flags_obj.all_reduce_alg)
    return tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=flags_obj.model_dir,
        params=params,
        config=tf.estimator.RunConfig(train_distribute=distribution_strategy))

  tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
      tpu=flags_obj.tpu,
      zone=flags_obj.tpu_zone,
      project=flags_obj.tpu_gcp_project)
  tpu_config = contrib_tpu.TPUConfig(
      iterations_per_loop=schedule_manager.single_iteration_train_steps,
      num_shards=flags_obj.num_tpu_shards)
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=flags_obj.model_dir,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config)
  return contrib_tpu.TPUEstimator(
      model_fn=model_fn,
      use_tpu=params["use_tpu"] and flags_obj.tpu != tpu_util.LOCAL,
      train_batch_size=schedule_manager.batch_size,
      eval_batch_size=schedule_manager.batch_size,
      params={
          # TPUEstimator needs to populate batch_size itself due to sharding.
          key: value for key, value in params.items() if key != "batch_size"
      },
      config=run_config)
def main(argv):
  del argv  # Unused

  if FLAGS.use_tpu:
    assert FLAGS.model_dir.startswith("gs://"), ("'model_dir' should be a "
                                                 "GCS bucket path!")

  # Fetch the data
  (train_x, train_y), (test_x, test_y) = load_data()

  # Feature columns describe how to use the input.
  my_feature_columns = []
  for key in train_x.keys():
    my_feature_columns.append(tf.feature_column.numeric_column(key=key))

  # Resolve TPU cluster and runconfig for this.
  tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(FLAGS.tpu)

  run_config = contrib_tpu.RunConfig(
      model_dir=FLAGS.model_dir,
      cluster=tpu_cluster_resolver,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=contrib_tpu.TPUConfig(FLAGS.iterations),
  )

  # Build 2 hidden layer DNN with 10, 10 units respectively.
  classifier = contrib_tpu.TPUEstimator(
      model_fn=my_model,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      predict_batch_size=FLAGS.batch_size,
      config=run_config,
      params={
          # Name of the feature columns in the input data.
          "feature_columns": my_feature_columns,
          # Two hidden layers of 10 nodes each.
          "hidden_units": [10, 10],
          # The model must choose between 3 classes.
          "n_classes": 3,
          "use_tpu": FLAGS.use_tpu,
      })

  # Train the Model.
  classifier.train(
      input_fn=lambda params: train_input_fn(
          train_x, train_y, params["batch_size"]),
      max_steps=FLAGS.train_steps)
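# A minimal sketch of the train_input_fn assumed above. TPUEstimator passes
# the per-shard batch size to the input function in params["batch_size"];
# drop_remainder=True is required on TPU so every batch has a static shape.
def train_input_fn(features, labels, batch_size):
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
  dataset = dataset.shuffle(1000).repeat().batch(
      batch_size, drop_remainder=True)
  return dataset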
def freeze_graph_tpu(model_path):
  """Custom freeze_graph implementation for Cloud TPU."""
  assert model_path
  assert FLAGS.tpu_name
  if FLAGS.tpu_name.startswith('grpc://'):
    tpu_grpc_url = FLAGS.tpu_name
  else:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=None, project=None)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
  sess = tf.Session(tpu_grpc_url)

  output_names = []
  with sess.graph.as_default():
    # Replicate the inference function for each TPU core.
    replicated_features = []
    feature_type = tf.bool if FLAGS.bool_features else tf.float32
    for i in range(FLAGS.num_tpu_cores):
      name = 'pos_tensor_%d' % i
      features = tf.placeholder(feature_type, [None], name=name)
      replicated_features.append((features,))
    outputs = contrib_tpu.replicate(tpu_model_inference_fn,
                                    replicated_features)

    # The replicate op assigns names like output_0_shard_0 to the output
    # names. Give them human readable names.
    for i, (policy_output, value_output, _) in enumerate(outputs):
      policy_name = 'policy_output_%d' % i
      value_name = 'value_output_%d' % i
      output_names.extend([policy_name, value_name])
      tf.identity(policy_output, policy_name)
      tf.identity(value_output, value_name)

    tf.train.Saver().restore(sess, model_path)

  out_graph = tf.graph_util.convert_variables_to_constants(
      sess, sess.graph.as_graph_def(), output_names)

  metadata = make_model_metadata({
      'engine': 'tpu',
      'num_replicas': FLAGS.num_tpu_cores,
  })

  minigo_model.write_graph_def(out_graph, metadata, model_path + '.minigo')
def main(unused_argv):
  assert FLAGS.tpu_name
  if FLAGS.tpu_name.startswith('grpc://'):
    tpu_grpc_url = FLAGS.tpu_name
  else:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=None, project=None)
    tpu_grpc_url = tpu_cluster_resolver.get_master()
  sess = tf.Session(tpu_grpc_url)

  with sess.graph.as_default():
    contrib_tpu.initialize_system()
    contrib_tpu.shutdown_system()
    output_names = ['ConfigureDistributedTPU', 'ShutdownDistributedTPU']

  model_def = tf.graph_util.convert_variables_to_constants(
      sess, sess.graph.as_graph_def(), output_names)
  print(model_def)
def main(argv):
  del argv  # Unused.
  tf.logging.set_verbosity(tf.logging.INFO)

  tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=contrib_tpu.TPUConfig(FLAGS.iterations, FLAGS.num_shards),
  )

  estimator = contrib_tpu.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      predict_batch_size=FLAGS.batch_size,
      params={"data_dir": FLAGS.data_dir},
      config=run_config)
  # TPUEstimator.train *requires* a max_steps argument.
  estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)
  # TPUEstimator.evaluate *requires* a steps argument.
  # Note that the number of examples used during evaluation is
  # --eval_steps * --batch_size.
  # So if you change --batch_size then change --eval_steps too.
  if FLAGS.eval_steps:
    estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.eval_steps)

  # Run prediction on top few samples of test data.
  if FLAGS.enable_predict:
    predictions = estimator.predict(input_fn=predict_input_fn)

    for pred_dict in predictions:
      template = 'Prediction is "{}" ({:.1f}%).'
      class_id = pred_dict['class_ids']
      probability = pred_dict['probabilities'][class_id]
      print(template.format(class_id, 100 * probability))
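# Hedged sketch of the shape a TPU-compatible model_fn takes (the actual
# model_fn used above is defined elsewhere): it must return a
# TPUEstimatorSpec, and the optimizer must be wrapped in CrossShardOptimizer
# so gradients are averaged across shards. The one-layer network is an
# illustrative assumption.
def example_model_fn(features, labels, mode, params):
  del params  # Unused in this sketch.
  logits = tf.layers.dense(features["x"], 10)
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  optimizer = contrib_tpu.CrossShardOptimizer(
      tf.train.GradientDescentOptimizer(learning_rate=0.01))
  train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
  return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op)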
def main(argv):
  del argv

  tpu_address = FLAGS.tpu
  if not any(pref in FLAGS.tpu for pref in ['http://', 'grpc://']):
    tpu_address = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu).master()
    # Swap the resolved port for the model-server port: 8470 for gRPC, 8473
    # for REST. The resolver returns an address ending in ':<port>', so strip
    # the same number of characters as ':1234'.
    tpu_address = '{}:{}'.format(tpu_address[:-len(':1234')],
                                 '8470' if FLAGS.grpc else '8473')
    # Drop the scheme prefix (e.g. 'grpc://'), which is 7 characters long.
    tpu_address = tpu_address[len('abcd://'):]
  tf.logging.info('ModelServer at: {}'.format(tpu_address))

  if FLAGS.grpc:
    grpc_channel = grpc.insecure_channel(tpu_address)
    stub = prediction_service_pb2_grpc.PredictionServiceStub(grpc_channel)
    run_grpc_load_test(FLAGS.num_requests, FLAGS.qps, generate_grpc_request(),
                       stub)
  else:
    payload = generate_rest_payload()
    run_rest_load_test(FLAGS.num_requests, FLAGS.qps, tpu_address, payload)
def build_run_config():
  """Return RunConfig for TPU estimator."""
  tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
  iterations_per_loop = (eval_steps if FLAGS.mode == 'eval'
                         else FLAGS.iterations_per_loop)
  save_checkpoints_steps = FLAGS.save_checkpoints_steps or iterations_per_loop
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      keep_checkpoint_max=None,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=iterations_per_loop,
          num_shards=FLAGS.num_shards,
          per_host_input_for_training=contrib_tpu.InputPipelineConfig
          .PER_HOST_V2))
  return run_config
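# Hedged usage sketch pairing build_run_config() with a TPUEstimator; the
# model_fn and the train/eval batch-size flags are assumptions here.
def _example_estimator_from_run_config():
  return contrib_tpu.TPUEstimator(
      model_fn=model_fn,  # assumed to be defined elsewhere in this file
      config=build_run_config(),
      use_tpu=True,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)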
def main(argv):
  del argv  # Unused.

  tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=3600,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_shards),
  )

  estimator = contrib_tpu.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size)
  estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)
def get_estimator(**kwargs):
  """Construct an estimator."""
  cfg = utils.Config(kwargs)
  if cfg.tpu.get('name'):
    tf.logging.info('Using cluster resolver.')
    cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        cfg.tpu.name, zone=cfg.tpu.zone, project=cfg.tpu.gcp_project)
    master = None
  else:
    cluster_resolver = None
    master = cfg.master

  tf.logging.info('Config:\n %s' % cfg)
  if cfg.tpu.enable:
    if not cfg.steps_per_epoch:
      raise ValueError('steps_per_epoch must be nonzero on TPU.')
    exp = contrib_tpu.TPUEstimator(
        model_fn=model_fn,
        config=contrib_tpu.RunConfig(
            cluster=cluster_resolver,
            master=master,
            model_dir=cfg.model_dir,
            tpu_config=contrib_tpu.TPUConfig(
                iterations_per_loop=cfg.steps_per_epoch)),
        use_tpu=True,
        eval_on_tpu=False,
        # TPU requires these args, but they are ignored inside the input
        # function, which reads train_batch_size or eval_batch_size directly.
        train_batch_size=cfg.dataset.train_batch_size,
        eval_batch_size=cfg.dataset.eval_batch_size,
        params=cfg,
    )
  else:
    exp = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=cfg.model_dir, params=cfg)
  return exp
def __init__(self, config, tasks):
  self._config = config
  self._tasks = tasks
  self._preprocessor = preprocessing.Preprocessor(config, self._tasks)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  tpu_cluster_resolver = None
  if config.use_tpu and config.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=config.checkpoints_dir,
      save_checkpoints_steps=config.save_checkpoints_steps,
      save_checkpoints_secs=None,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=config.iterations_per_loop,
          num_shards=config.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  (self._train_input_fn,
   self.train_steps, sizes) = self._preprocessor.prepare_train()
  task_weights = task_weighting.get_task_weights(config, sizes)

  model_fn = model_fn_builder(
      config=config,
      tasks=self._tasks,
      task_weights=task_weights,
      num_train_steps=self.train_steps)

  self._estimator = contrib_tpu.TPUEstimator(
      use_tpu=config.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=config.train_batch_size,
      eval_batch_size=config.eval_batch_size,
      predict_batch_size=config.predict_batch_size)
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  if (not FLAGS.do_train and not FLAGS.do_eval_dev and not FLAGS.do_eval_test
      and not FLAGS.do_predict):
    raise ValueError(
        "At least one of `do_train`, `do_eval_dev`, `do_eval_test` or "
        "`do_predict` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  label_list = ["Yes", "No"]
  if FLAGS.from_three_class_model:
    label_list.append("Neutral")

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    train_examples = get_train()
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    file_based_convert_examples_to_features(train_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            train_file)
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  eval_on = []
  if FLAGS.do_eval_dev:
    eval_on.append((get_dev(), "dev"))
  if FLAGS.do_eval_test:
    eval_on.append((get_test(), "test"))

  for eval_examples, name in eval_on:
    eval_file = os.path.join(FLAGS.output_dir, "%s.tf_record" % name)
    file_based_convert_examples_to_features(eval_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            eval_file)

    tf.logging.info("***** Running %s *****" % name)
    tf.logging.info("  Num examples = %d", len(eval_examples))
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    # This tells the estimator to run through the entire set.
    eval_steps = None
    # However, if running eval on the TPU, you will need to specify the
    # number of steps.
    if FLAGS.use_tpu:
      # Eval will be slightly WRONG on the TPU because it will truncate
      # the last batch.
      eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

    output_eval_file = os.path.join(FLAGS.output_dir,
                                    "%s_eval_results.txt" % name)
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** %s eval results *****" % name)
      for key in sorted(result.keys()):
        tf.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

  if FLAGS.do_predict:
    predict_examples = get_custom(FLAGS.predict_input_file)
    predict_file = os.path.join(FLAGS.output_dir,
                                "predict.tf_record.%s" % FLAGS.exp_name)
    file_based_convert_examples_to_features(predict_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            predict_file)

    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d", len(predict_examples))

    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    output_predictions = []
    counter = 0
    for result in estimator.predict(predict_input_fn,
                                    yield_single_examples=True):
      probs_result = result["probabilities"]
      answer = bool(probs_result[0] >= probs_result[1])
      output_predictions.append(
          json.dumps({
              "passage": predict_examples[counter].text_a,
              "question": predict_examples[counter].text_b,
              "title": "Steal Dataset",
              "answer": answer,
              "soft_answer": [float(probs_result[0]),
                              float(probs_result[1])]
          }))
      counter += 1

    tf.logging.info("%d in original file, %d predicted",
                    len(predict_examples), counter)

    with tf.gfile.GFile(FLAGS.predict_output_file, "w") as f:
      f.write("\n".join(output_predictions))
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  albert_config = modeling.AlbertConfig.from_json_file(
      FLAGS.albert_config_file)

  validate_flags_or_throw(albert_config)

  tf.gfile.MakeDirs(FLAGS.output_dir)

  tokenizer = fine_tuning_utils.create_vocab(
      vocab_file=FLAGS.vocab_file,
      do_lower_case=FLAGS.do_lower_case,
      spm_model_file=FLAGS.spm_model_file,
      hub_module=FLAGS.albert_hub_module_handle)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  if FLAGS.do_train:
    iterations_per_loop = int(
        min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps))
  else:
    iterations_per_loop = FLAGS.iterations_per_loop
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      keep_checkpoint_max=0,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  train_examples = squad_utils.read_squad_examples(
      input_file=FLAGS.train_file, is_training=True)
  num_train_steps = int(
      len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
  if FLAGS.do_train:
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    rng = random.Random(12345)
    rng.shuffle(train_examples)

  model_fn = squad_utils.v2_model_fn_builder(
      albert_config=albert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu,
      max_seq_length=FLAGS.max_seq_length,
      start_n_top=FLAGS.start_n_top,
      end_n_top=FLAGS.end_n_top,
      dropout_prob=FLAGS.dropout_prob,
      hub_module=FLAGS.albert_hub_module_handle)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    # We write to a temporary file to avoid storing very large constant
    # tensors in memory.
    if not tf.gfile.Exists(FLAGS.train_feature_file):
      train_writer = squad_utils.FeatureWriter(
          filename=os.path.join(FLAGS.train_feature_file), is_training=True)
      squad_utils.convert_examples_to_features(
          examples=train_examples,
          tokenizer=tokenizer,
          max_seq_length=FLAGS.max_seq_length,
          doc_stride=FLAGS.doc_stride,
          max_query_length=FLAGS.max_query_length,
          is_training=True,
          output_fn=train_writer.process_feature,
          do_lower_case=FLAGS.do_lower_case)
      train_writer.close()

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num orig examples = %d", len(train_examples))
    # tf.logging.info("  Num split examples = %d", train_writer.num_features)
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    del train_examples

    train_input_fn = squad_utils.input_fn_builder(
        input_file=FLAGS.train_feature_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.train_batch_size,
        is_v2=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

  if FLAGS.do_predict:
    with tf.gfile.Open(FLAGS.predict_file) as predict_file:
      prediction_json = json.load(predict_file)["data"]
    eval_examples = squad_utils.read_squad_examples(
        input_file=FLAGS.predict_file, is_training=False)

    if (tf.gfile.Exists(FLAGS.predict_feature_file) and
        tf.gfile.Exists(FLAGS.predict_feature_left_file)):
      tf.logging.info("Loading eval features from {}".format(
          FLAGS.predict_feature_left_file))
      with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin:
        eval_features = pickle.load(fin)
    else:
      eval_writer = squad_utils.FeatureWriter(
          filename=FLAGS.predict_feature_file, is_training=False)
      eval_features = []

      def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

      squad_utils.convert_examples_to_features(
          examples=eval_examples,
          tokenizer=tokenizer,
          max_seq_length=FLAGS.max_seq_length,
          doc_stride=FLAGS.doc_stride,
          max_query_length=FLAGS.max_query_length,
          is_training=False,
          output_fn=append_feature,
          do_lower_case=FLAGS.do_lower_case)
      eval_writer.close()

      with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout:
        pickle.dump(eval_features, fout)

    tf.logging.info("***** Running predictions *****")
    tf.logging.info("  Num orig examples = %d", len(eval_examples))
    tf.logging.info("  Num split examples = %d", len(eval_features))
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    predict_input_fn = squad_utils.input_fn_builder(
        input_file=FLAGS.predict_feature_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.predict_batch_size,
        is_v2=True)

    def get_result(checkpoint):
      """Evaluate the checkpoint on SQuAD v2.0."""
      # If running eval on the TPU, you will need to specify the number of
      # steps.
      reader = tf.train.NewCheckpointReader(checkpoint)
      global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
      all_results = []
      for result in estimator.predict(
          predict_input_fn,
          yield_single_examples=True,
          checkpoint_path=checkpoint):
        if len(all_results) % 1000 == 0:
          tf.logging.info("Processing example: %d" % (len(all_results)))
        unique_id = int(result["unique_ids"])
        start_top_log_probs = (
            [float(x) for x in result["start_top_log_probs"].flat])
        start_top_index = [int(x) for x in result["start_top_index"].flat]
        end_top_log_probs = (
            [float(x) for x in result["end_top_log_probs"].flat])
        end_top_index = [int(x) for x in result["end_top_index"].flat]
        cls_logits = float(result["cls_logits"].flat[0])
        all_results.append(
            squad_utils.RawResultV2(
                unique_id=unique_id,
                start_top_log_probs=start_top_log_probs,
                start_top_index=start_top_index,
                end_top_log_probs=end_top_log_probs,
                end_top_index=end_top_index,
                cls_logits=cls_logits))

      output_prediction_file = os.path.join(
          FLAGS.output_dir, "predictions.json")
      output_nbest_file = os.path.join(
          FLAGS.output_dir, "nbest_predictions.json")
      output_null_log_odds_file = os.path.join(
          FLAGS.output_dir, "null_odds.json")

      result_dict = {}
      cls_dict = {}
      squad_utils.accumulate_predictions_v2(
          result_dict, cls_dict, eval_examples, eval_features, all_results,
          FLAGS.n_best_size, FLAGS.max_answer_length, FLAGS.start_n_top,
          FLAGS.end_n_top)

      return squad_utils.evaluate_v2(
          result_dict, cls_dict, prediction_json, eval_examples,
          eval_features, all_results, FLAGS.n_best_size,
          FLAGS.max_answer_length, output_prediction_file, output_nbest_file,
          output_null_log_odds_file), int(global_step)

    def _find_valid_cands(curr_step):
      filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
      candidates = []
      for filename in filenames:
        if filename.endswith(".index"):
          ckpt_name = filename[:-6]
          idx = ckpt_name.split("-")[-1]
          if idx != "best" and int(idx) > curr_step:
            candidates.append(filename)
      return candidates

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
    key_name = "f1"
    writer = tf.gfile.GFile(output_eval_file, "w")
    if tf.gfile.Exists(checkpoint_path + ".index"):
      result = get_result(checkpoint_path)
      best_perf = result[0][key_name]
      global_step = result[1]
    else:
      global_step = -1
      best_perf = -1
      checkpoint_path = None

    while global_step < num_train_steps:
      steps_and_files = {}
      filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
      for filename in filenames:
        if filename.endswith(".index"):
          ckpt_name = filename[:-6]
          cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
          if cur_filename.split("-")[-1] == "best":
            continue
          gstep = int(cur_filename.split("-")[-1])
          if gstep not in steps_and_files:
            tf.logging.info("Add {} to eval list.".format(cur_filename))
            steps_and_files[gstep] = cur_filename
      tf.logging.info("found {} files.".format(len(steps_and_files)))
      if not steps_and_files:
        tf.logging.info("found 0 file, global step: {}. Sleeping."
                        .format(global_step))
        time.sleep(60)
      else:
        for ele in sorted(steps_and_files.items()):
          step, checkpoint_path = ele
          if global_step >= step:
            if len(_find_valid_cands(step)) > 1:
              for ext in ["meta", "data-00000-of-00001", "index"]:
                src_ckpt = checkpoint_path + ".{}".format(ext)
                tf.logging.info("removing {}".format(src_ckpt))
                tf.gfile.Remove(src_ckpt)
            continue
          result, global_step = get_result(checkpoint_path)
          tf.logging.info("***** Eval results *****")
          for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
          if result[key_name] > best_perf:
            best_perf = result[key_name]
            for ext in ["meta", "data-00000-of-00001", "index"]:
              src_ckpt = checkpoint_path + ".{}".format(ext)
              tgt_ckpt = checkpoint_path.rsplit(
                  "-", 1)[0] + "-best.{}".format(ext)
              tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt))
              tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True)
              writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt))
          writer.write("best {} = {}\n".format(key_name, best_perf))
          tf.logging.info("  best {} = {}\n".format(key_name, best_perf))
          if len(_find_valid_cands(global_step)) > 2:
            for ext in ["meta", "data-00000-of-00001", "index"]:
              src_ckpt = checkpoint_path + ".{}".format(ext)
              tf.logging.info("removing {}".format(src_ckpt))
              tf.gfile.Remove(src_ckpt)
          writer.write("=" * 50 + "\n")

    checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
    result, global_step = get_result(checkpoint_path)
    tf.logging.info("***** Final Eval results *****")
    for key in sorted(result.keys()):
      tf.logging.info("  %s = %s", key, str(result[key]))
      writer.write("%s = %s\n" % (key, str(result[key])))
    writer.write("best perf happened at step: {}".format(global_step))
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  processors = {
      "cola": classifier_utils.ColaProcessor,
      "mnli": classifier_utils.MnliProcessor,
      "mismnli": classifier_utils.MisMnliProcessor,
      "mrpc": classifier_utils.MrpcProcessor,
      "rte": classifier_utils.RteProcessor,
      "sst-2": classifier_utils.Sst2Processor,
      "sts-b": classifier_utils.StsbProcessor,
      "qqp": classifier_utils.QqpProcessor,
      "qnli": classifier_utils.QnliProcessor,
      "wnli": classifier_utils.WnliProcessor,
  }

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)

  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    raise ValueError(
        "At least one of `do_train`, `do_eval` or `do_predict` must be True.")

  if not FLAGS.albert_config_file and not FLAGS.albert_hub_module_handle:
    raise ValueError("At least one of `--albert_config_file` and "
                     "`--albert_hub_module_handle` must be set")

  if FLAGS.albert_config_file:
    albert_config = modeling.AlbertConfig.from_json_file(
        FLAGS.albert_config_file)
    if FLAGS.max_seq_length > albert_config.max_position_embeddings:
      raise ValueError(
          "Cannot use sequence length %d because the ALBERT model "
          "was only trained up to sequence length %d" %
          (FLAGS.max_seq_length, albert_config.max_position_embeddings))
  else:
    albert_config = None  # Get the config from TF-Hub.

  tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()

  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  processor = processors[task_name](
      use_spm=True if FLAGS.spm_model_file else False,
      do_lower_case=FLAGS.do_lower_case)

  label_list = processor.get_labels()

  tokenizer = fine_tuning_utils.create_vocab(
      vocab_file=FLAGS.vocab_file,
      do_lower_case=FLAGS.do_lower_case,
      spm_model_file=FLAGS.spm_model_file,
      hub_module=FLAGS.albert_hub_module_handle)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  if FLAGS.do_train:
    iterations_per_loop = int(
        min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps))
  else:
    iterations_per_loop = FLAGS.iterations_per_loop
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=int(FLAGS.save_checkpoints_steps),
      keep_checkpoint_max=0,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  train_examples = None
  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)

  model_fn = classifier_utils.model_fn_builder(
      albert_config=albert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=FLAGS.train_step,
      num_warmup_steps=FLAGS.warmup_step,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu,
      task_name=task_name,
      hub_module=FLAGS.albert_hub_module_handle,
      optimizer=FLAGS.optimizer)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    cached_dir = FLAGS.cached_dir
    if not cached_dir:
      cached_dir = FLAGS.output_dir
    train_file = os.path.join(cached_dir, task_name + "_train.tf_record")
    if not tf.gfile.Exists(train_file):
      classifier_utils.file_based_convert_examples_to_features(
          train_examples, label_list, FLAGS.max_seq_length, tokenizer,
          train_file, task_name)
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", FLAGS.train_step)
    train_input_fn = classifier_utils.file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True,
        task_name=task_name,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.train_batch_size)
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_step)

  if FLAGS.do_eval:
    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
    num_actual_eval_examples = len(eval_examples)
    if FLAGS.use_tpu:
      # TPU requires a fixed batch size for all batches, therefore the number
      # of examples must be a multiple of the batch size, or else examples
      # will get dropped. So we pad with fake examples which are ignored
      # later on. These do NOT count towards the metric (all tf.metrics
      # support a per-instance weight, and these get a weight of 0.0).
      while len(eval_examples) % FLAGS.eval_batch_size != 0:
        eval_examples.append(classifier_utils.PaddingInputExample())

    cached_dir = FLAGS.cached_dir
    if not cached_dir:
      cached_dir = FLAGS.output_dir
    eval_file = os.path.join(cached_dir, task_name + "_eval.tf_record")
    if not tf.gfile.Exists(eval_file):
      classifier_utils.file_based_convert_examples_to_features(
          eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
          eval_file, task_name)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(eval_examples), num_actual_eval_examples,
                    len(eval_examples) - num_actual_eval_examples)
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    # This tells the estimator to run through the entire set.
    eval_steps = None
    # However, if running eval on the TPU, you will need to specify the
    # number of steps.
    if FLAGS.use_tpu:
      assert len(eval_examples) % FLAGS.eval_batch_size == 0
      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    eval_input_fn = classifier_utils.file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder,
        task_name=task_name,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.eval_batch_size)

    best_trial_info_file = os.path.join(FLAGS.output_dir, "best_trial.txt")

    def _best_trial_info():
      """Returns information about which checkpoints have been evaled so far."""
      if tf.gfile.Exists(best_trial_info_file):
        with tf.gfile.GFile(best_trial_info_file, "r") as best_info:
          global_step, best_metric_global_step, metric_value = (
              best_info.read().split(":"))
          global_step = int(global_step)
          best_metric_global_step = int(best_metric_global_step)
          metric_value = float(metric_value)
      else:
        metric_value = -1
        best_metric_global_step = -1
        global_step = -1
      tf.logging.info(
          "Best trial info: Step: %s, Best Value Step: %s, "
          "Best Value: %s", global_step, best_metric_global_step,
          metric_value)
      return global_step, best_metric_global_step, metric_value

    def _remove_checkpoint(checkpoint_path):
      for ext in ["meta", "data-00000-of-00001", "index"]:
        src_ckpt = checkpoint_path + ".{}".format(ext)
        tf.logging.info("removing {}".format(src_ckpt))
        tf.gfile.Remove(src_ckpt)

    def _find_valid_cands(curr_step):
      filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
      candidates = []
      for filename in filenames:
        if filename.endswith(".index"):
          ckpt_name = filename[:-6]
          idx = ckpt_name.split("-")[-1]
          if int(idx) > curr_step:
            candidates.append(filename)
      return candidates

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")

    if task_name == "sts-b":
      key_name = "pearson"
    elif task_name == "cola":
      key_name = "matthew_corr"
    else:
      key_name = "eval_accuracy"

    global_step, best_perf_global_step, best_perf = _best_trial_info()
    writer = tf.gfile.GFile(output_eval_file, "w")
    while global_step < FLAGS.train_step:
      steps_and_files = {}
      filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
      for filename in filenames:
        if filename.endswith(".index"):
          ckpt_name = filename[:-6]
          cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
          if cur_filename.split("-")[-1] == "best":
            continue
          gstep = int(cur_filename.split("-")[-1])
          if gstep not in steps_and_files:
            tf.logging.info("Add {} to eval list.".format(cur_filename))
            steps_and_files[gstep] = cur_filename
      tf.logging.info("found {} files.".format(len(steps_and_files)))
      if not steps_and_files:
        tf.logging.info("found 0 file, global step: {}. Sleeping.".format(
            global_step))
        time.sleep(60)
      else:
        for checkpoint in sorted(steps_and_files.items()):
          step, checkpoint_path = checkpoint
          if global_step >= step:
            if (best_perf_global_step != step and
                len(_find_valid_cands(step)) > 1):
              _remove_checkpoint(checkpoint_path)
            continue
          result = estimator.evaluate(
              input_fn=eval_input_fn,
              steps=eval_steps,
              checkpoint_path=checkpoint_path)
          global_step = result["global_step"]
          tf.logging.info("***** Eval results *****")
          for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
          writer.write("best = {}\n".format(best_perf))
          if result[key_name] > best_perf:
            best_perf = result[key_name]
            best_perf_global_step = global_step
          elif len(_find_valid_cands(global_step)) > 1:
            _remove_checkpoint(checkpoint_path)
          writer.write("=" * 50 + "\n")
          writer.flush()
          with tf.gfile.GFile(best_trial_info_file, "w") as best_info:
            best_info.write("{}:{}:{}".format(
                global_step, best_perf_global_step, best_perf))
    writer.close()

    for ext in ["meta", "data-00000-of-00001", "index"]:
      src_ckpt = "model.ckpt-{}.{}".format(best_perf_global_step, ext)
      tgt_ckpt = "model.ckpt-best.{}".format(ext)
      tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt))
      tf.io.gfile.rename(
          os.path.join(FLAGS.output_dir, src_ckpt),
          os.path.join(FLAGS.output_dir, tgt_ckpt),
          overwrite=True)

  if FLAGS.do_predict:
    predict_examples = processor.get_test_examples(FLAGS.data_dir)
    num_actual_predict_examples = len(predict_examples)
    if FLAGS.use_tpu:
      # TPU requires a fixed batch size for all batches, therefore the number
      # of examples must be a multiple of the batch size, or else examples
      # will get dropped. So we pad with fake examples which are ignored
      # later on.
      while len(predict_examples) % FLAGS.predict_batch_size != 0:
        predict_examples.append(classifier_utils.PaddingInputExample())

    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
    classifier_utils.file_based_convert_examples_to_features(
        predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
        predict_file, task_name)

    tf.logging.info("***** Running prediction *****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    predict_drop_remainder = True if FLAGS.use_tpu else False
    predict_input_fn = classifier_utils.file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=predict_drop_remainder,
        task_name=task_name,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.predict_batch_size)

    checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
    result = estimator.predict(
        input_fn=predict_input_fn, checkpoint_path=checkpoint_path)

    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
    output_submit_file = os.path.join(FLAGS.output_dir, "submit_results.tsv")
    with tf.gfile.GFile(output_predict_file, "w") as pred_writer,\
        tf.gfile.GFile(output_submit_file, "w") as sub_writer:
      sub_writer.write("index" + "\t" + "prediction\n")
      num_written_lines = 0
      tf.logging.info("***** Predict results *****")
      for (i, (example, prediction)) in\
          enumerate(zip(predict_examples, result)):
        probabilities = prediction["probabilities"]
        if i >= num_actual_predict_examples:
          break
        output_line = "\t".join(
            str(class_probability)
            for class_probability in probabilities) + "\n"
        pred_writer.write(output_line)

        if task_name != "sts-b":
          actual_label = label_list[int(prediction["predictions"])]
        else:
          actual_label = str(prediction["predictions"])
        sub_writer.write(example.guid + "\t" + actual_label + "\n")
        num_written_lines += 1
    assert num_written_lines == num_actual_predict_examples
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train`, `do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    num_train_steps = int(FLAGS.train_data_size / FLAGS.train_batch_size)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu,
      num_choices=FLAGS.num_choices,
      add_masking=FLAGS.include_mlm)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=FLAGS.train_file,
        is_training=True,
        drop_remainder=True,
        add_masking=FLAGS.include_mlm)
    estimator.train(input_fn=train_input_fn, steps=num_train_steps)

  if FLAGS.do_eval:
    # This tells the estimator to run through the entire set.
    if FLAGS.eval_data_size < 0:
      eval_steps = None
    else:
      eval_steps = int(FLAGS.eval_data_size / FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    # Note that we are masking inputs for eval as well as training and this
    # will decrease eval performance.
    eval_input_fn = file_based_input_fn_builder(
        input_file=FLAGS.eval_file,
        is_training=False,
        drop_remainder=eval_drop_remainder,
        add_masking=FLAGS.include_mlm)

    # checkpoints_iterator blocks until a new checkpoint appears.
    for ckpt in contrib_training.checkpoints_iterator(estimator.model_dir):
      try:
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        tf.logging.info("********** Eval results:*******\n")
        for key in sorted(result.keys()):
          tf.logging.info("%s = %s" % (key, str(result[key])))
      except tf.errors.NotFoundError:
        tf.logging.error("Checkpoint path '%s' no longer exists.", ckpt)
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = {"sst-2": SST2Processor, "mnli": MNLIProcessor} tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True.") bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) tf.gfile.MakeDirs(FLAGS.output_dir) task_name = FLAGS.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name,)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = tokenization.FullTokenizer( vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 run_config = contrib_tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, tpu_config=contrib_tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) train_examples = None num_train_steps = None num_warmup_steps = None if FLAGS.do_train: train_examples = processor.get_train_examples(FLAGS.data_dir) num_train_steps = int( len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) model_fn = model_fn_builder( bert_config=bert_config, num_labels=len(label_list), init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu) # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. estimator = contrib_tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: train_file = os.path.join(FLAGS.output_dir, "train.tf_record") file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True, num_labels=len(label_list)) estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples(FLAGS.data_dir) num_actual_eval_examples = len(eval_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. 
These do NOT count towards the metric (all tf.metrics # support a per-instance weight, and these get a weight of 0.0). while len(eval_examples) % FLAGS.eval_batch_size != 0: eval_examples.append(PaddingInputExample()) eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") file_based_convert_examples_to_features(eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) tf.logging.info("***** Running evaluation *****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(eval_examples), num_actual_eval_examples, len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) # This tells the estimator to run through the entire set. eval_steps = None # However, if running eval on the TPU, you will need to specify the # number of steps. if FLAGS.use_tpu: assert len(eval_examples) % FLAGS.eval_batch_size == 0 eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) eval_drop_remainder = True if FLAGS.use_tpu else False eval_input_fn = file_based_input_fn_builder( input_file=eval_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=eval_drop_remainder, num_labels=len(label_list)) result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") with tf.gfile.GFile(output_eval_file, "w") as writer: tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if FLAGS.do_predict: predict_examples = processor.get_test_examples(FLAGS.predict_input_file) num_actual_predict_examples = len(predict_examples) if FLAGS.use_tpu: # TPU requires a fixed batch size for all batches, therefore the number # of examples must be a multiple of the batch size, or else examples # will get dropped. So we pad with fake examples which are ignored # later on. while len(predict_examples) % FLAGS.predict_batch_size != 0: predict_examples.append(PaddingInputExample()) predict_file = os.path.join(FLAGS.output_dir, FLAGS.exp_name + ".predict.tf_record") file_based_convert_examples_to_features(predict_examples, label_list, FLAGS.max_seq_length, tokenizer, predict_file) tf.logging.info("***** Running prediction*****") tf.logging.info(" Num examples = %d (%d actual, %d padding)", len(predict_examples), num_actual_predict_examples, len(predict_examples) - num_actual_predict_examples) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_drop_remainder = True if FLAGS.use_tpu else False predict_input_fn = file_based_input_fn_builder( input_file=predict_file, seq_length=FLAGS.max_seq_length, is_training=False, drop_remainder=predict_drop_remainder, num_labels=len(label_list)) result = estimator.predict(input_fn=predict_input_fn) if FLAGS.predict_output_file is None: predict_output_file = os.path.join(FLAGS.output_dir, "test_results.tsv") else: predict_output_file = FLAGS.predict_output_file with tf.gfile.GFile(predict_output_file, "w") as writer: num_written_lines = 0 tf.logging.info("***** Predict results *****") for (i, prediction) in enumerate(result): probabilities = prediction["probabilities"] if i >= num_actual_predict_examples: break output_line = "\t".join( str(class_probability) for class_probability in probabilities) + "\n" writer.write(output_line) num_written_lines += 1 assert num_written_lines == num_actual_predict_examples
def main(argv):
  del argv  # Unused.

  if FLAGS.start_profiler_server:
    # Starts the profiler server. It will perform profiling when it
    # receives a profiling request.
    profiler.start_profiler_server(FLAGS.profiler_port_number)

  if FLAGS.use_tpu:
    if FLAGS.distribution_strategy is None:
      tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
          FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
      tpu_grpc_url = tpu_cluster_resolver.get_master()
      tf.Session.reset(tpu_grpc_url)
    else:
      raise RuntimeError(
          'Distribution strategy must be None when --use_tpu is True.')
  else:
    tpu_cluster_resolver = None

  if FLAGS.mode not in ['train', 'eval', 'train_and_eval']:
    raise ValueError('Unrecognized --mode: %s' % FLAGS.mode)

  # Check data paths.
  if FLAGS.mode in ('train',
                    'train_and_eval') and FLAGS.training_file_pattern is None:
    raise RuntimeError('You must specify --training_file_pattern for training.')
  if FLAGS.mode in ('eval', 'train_and_eval'):
    if FLAGS.validation_file_pattern is None:
      raise RuntimeError('You must specify --validation_file_pattern '
                         'for evaluation.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')
  if FLAGS.mode == 'train_and_eval':
    if FLAGS.distribution_strategy is not None:
      raise RuntimeError('You must use --distribution_strategy=None for '
                         'train_and_eval.')

  # Parse hparams.
  hparams = retinanet_model.default_hparams()
  config_file = FLAGS.config_file
  hparams.num_epochs = FLAGS.num_epochs
  if config_file and tf.gfile.Exists(config_file):
    # Load params from file.
    with tf.gfile.Open(config_file, 'r') as f:
      values_map = json.load(f)
      hparams.override_from_dict(values_map)
  hparams.parse(FLAGS.hparams)

  # The following is for spatial partitioning. `features` has one tensor while
  # `labels` has 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
  # partition is performed on `features` and all partitionable tensors of
  # `labels`; see the partition logic below.
  # In the TPUEstimator context, `shard` and `replica` mean the same thing;
  # following the API, this code uses both terms.
  if FLAGS.use_spatial_partition:
    # Checks that input_partition_dims agrees with num_cores_per_replica.
    if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
      raise RuntimeError('--num_cores_per_replica must be a product of array '
                         'elements in --input_partition_dims.')

    labels_partition_dims = {
        'mean_num_positives': None,
        'source_ids': None,
        'groundtruth_data': None,
        'image_scales': None,
    }
    # The Input Partition Logic: We partition only the partition-able tensors.
    # Spatial partition requires that the to-be-partitioned tensors must have
    # a dimension that is a multiple of `partition_dims`. Depending on the
    # `partition_dims` and the `image_size` and the `max_level` in hparams,
    # some high-level anchor labels (i.e., `cls_targets` and `box_targets`)
    # cannot be partitioned. For example, when `partition_dims` is
    # [1, 4, 2, 1], image size is 1536 and `max_level` is 9, `cls_targets_8`
    # has a shape of [batch_size, 6, 6, 9], which cannot be partitioned
    # (6 % 4 != 0). In this case, the level-8 and level-9 target tensors are
    # not partition-able, and the highest partition-able level is 7.
    image_size = hparams.get('image_size')
    for level in range(hparams.get('min_level'), hparams.get('max_level') + 1):

      def _can_partition(spatial_dim):
        partitionable_index = np.where(
            spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
        return len(partitionable_index[0]) == len(FLAGS.input_partition_dims)

      spatial_dim = image_size // (2**level)
      if _can_partition(spatial_dim):
        labels_partition_dims['box_targets_%d' %
                              level] = FLAGS.input_partition_dims
        labels_partition_dims['cls_targets_%d' %
                              level] = FLAGS.input_partition_dims
      else:
        labels_partition_dims['box_targets_%d' % level] = None
        labels_partition_dims['cls_targets_%d' % level] = None

    num_cores_per_replica = FLAGS.num_cores_per_replica
    input_partition_dims = [FLAGS.input_partition_dims, labels_partition_dims]
    num_shards = FLAGS.num_cores // num_cores_per_replica
  else:
    num_cores_per_replica = None
    input_partition_dims = None
    num_shards = FLAGS.num_cores

  config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False)
  if FLAGS.use_xla and not FLAGS.use_tpu:
    config_proto.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
  if FLAGS.auto_mixed_precision and FLAGS.distribution_strategy:
    config_proto.graph_options.rewrite_options.auto_mixed_precision = (
        rewriter_config_pb2.RewriterConfig.ON)

  if FLAGS.distribution_strategy is None:
    # Uses TPUEstimator.
    params = dict(
        hparams.values(),
        num_shards=num_shards,
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        use_tpu=FLAGS.use_tpu,
        resnet_checkpoint=FLAGS.resnet_checkpoint,
        val_json_file=FLAGS.val_json_file,
        mode=FLAGS.mode,
    )
    tpu_config = contrib_tpu.TPUConfig(
        FLAGS.iterations_per_loop,
        num_shards=num_shards,
        num_cores_per_replica=num_cores_per_replica,
        input_partition_dims=input_partition_dims,
        per_host_input_for_training=contrib_tpu.InputPipelineConfig
        .PER_HOST_V2)
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        evaluation_master=FLAGS.eval_master,
        model_dir=FLAGS.model_dir,
        log_step_count_steps=FLAGS.iterations_per_loop,
        session_config=config_proto,
        tpu_config=tpu_config,
    )
  else:
    if FLAGS.num_gpus < 0:
      raise ValueError('`num_gpus` cannot be negative.')

    def _per_device_batch_size(batch_size, num_gpus):
      """Calculate the per-GPU batch size for the Estimator.

      Args:
        batch_size: Global batch size to be divided among devices.
        num_gpus: How many GPUs are used per worker.

      Returns:
        Batch size per device.

      Raises:
        ValueError: if batch_size is not divisible by the number of devices.
      """
      if num_gpus <= 1:
        return batch_size
      remainder = batch_size % num_gpus
      if remainder:
        raise ValueError(
            'Batch size must be a multiple of the number of GPUs per worker.')
      return int(batch_size / num_gpus)

    # Uses Estimator.
    params = dict(
        hparams.values(),
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        use_tpu=FLAGS.use_tpu,
        resnet_checkpoint=FLAGS.resnet_checkpoint,
        val_json_file=FLAGS.val_json_file,
        mode=FLAGS.mode,
        use_bfloat16=False,
        auto_mixed_precision=FLAGS.auto_mixed_precision,
        dataset_max_intra_op_parallelism=FLAGS.dataset_max_intra_op_parallelism,
        dataset_private_threadpool_size=FLAGS.dataset_private_threadpool_size,
    )

    if FLAGS.distribution_strategy == 'mirrored':
      params['batch_size'] = _per_device_batch_size(FLAGS.train_batch_size,
                                                    FLAGS.num_gpus)
      if FLAGS.num_gpus == 0:
        devices = ['device:CPU:0']
      else:
        devices = ['device:GPU:{}'.format(i) for i in range(FLAGS.num_gpus)]
      if FLAGS.all_reduce_alg:
        dist_strat = tf.distribute.MirroredStrategy(
            devices=devices,
            cross_device_ops=contrib_distribute.AllReduceCrossDeviceOps(
                FLAGS.all_reduce_alg, num_packs=2))
      else:
        dist_strat = tf.distribute.MirroredStrategy(devices=devices)
      run_config = tf.estimator.RunConfig(
          session_config=config_proto,
          train_distribute=dist_strat,
          eval_distribute=dist_strat)
    elif FLAGS.distribution_strategy == 'multi_worker_mirrored':
      local_device_protos = device_lib.list_local_devices()
      params['batch_size'] = _per_device_batch_size(
          FLAGS.train_batch_size,
          sum([1 for d in local_device_protos if d.device_type == 'GPU']))
      if FLAGS.worker_hosts is None:
        tf_config_json = json.loads(os.environ.get('TF_CONFIG', '{}'))
        # Replaces master with chief.
        if tf_config_json:
          if 'master' in tf_config_json['cluster']:
            tf_config_json['cluster']['chief'] = tf_config_json['cluster'].pop(
                'master')
          if tf_config_json['task']['type'] == 'master':
            tf_config_json['task']['type'] = 'chief'
          os.environ['TF_CONFIG'] = json.dumps(tf_config_json)
        tf_config_json = json.loads(os.environ['TF_CONFIG'])
        worker_hosts = tf_config_json['cluster']['worker']
        worker_hosts.extend(tf_config_json['cluster'].get('chief', []))
      else:
        # Set the TF_CONFIG environment variable.
        worker_hosts = FLAGS.worker_hosts.split(',')
        os.environ['TF_CONFIG'] = json.dumps({
            'cluster': {
                'worker': worker_hosts
            },
            'task': {
                'type': 'worker',
                'index': FLAGS.task_index
            }
        })
      dist_strat = tf.distribute.experimental.MultiWorkerMirroredStrategy(
          communication=_COLLECTIVE_COMMUNICATION_OPTIONS[FLAGS.all_reduce_alg])
      run_config = tf.estimator.RunConfig(
          session_config=config_proto, train_distribute=dist_strat)
    else:
      raise ValueError('Unrecognized distribution strategy.')

  if FLAGS.mode == 'train':
    if FLAGS.model_dir is not None:
      if not tf.gfile.Exists(FLAGS.model_dir):
        tf.gfile.MakeDirs(FLAGS.model_dir)
      with tf.gfile.Open(os.path.join(FLAGS.model_dir, 'hparams.json'),
                         'w') as f:
        json.dump(hparams.values(), f, sort_keys=True, indent=2)
    tf.logging.info(params)
    if FLAGS.distribution_strategy is None:
      total_steps = int(
          (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
          FLAGS.train_batch_size)
      train_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          config=run_config,
          params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              FLAGS.training_file_pattern, is_training=True),
          max_steps=total_steps)

      # Run evaluation after training finishes.
      eval_params = dict(
          params,
          input_rand_hflip=False,
          resnet_checkpoint=None,
          is_training_bn=False,
      )
      eval_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      if FLAGS.eval_after_training:
        if FLAGS.val_json_file is None:
          raise RuntimeError('You must specify --val_json_file for evaluation.')
        eval_results = evaluation.evaluate(
            eval_estimator,
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            num_eval_samples=FLAGS.eval_samples,
            eval_batch_size=FLAGS.eval_batch_size,
            validation_json_file=FLAGS.val_json_file)
        tf.logging.info('Eval results: %s' % eval_results)
        output_dir = os.path.join(FLAGS.model_dir, 'train_eval')
        tf.gfile.MakeDirs(output_dir)
        summary_writer = tf.summary.FileWriter(output_dir)
        evaluation.write_summary(eval_results, summary_writer, total_steps)
    else:
      train_estimator = tf.estimator.Estimator(
          model_fn=retinanet_model.est_retinanet_model_fn,
          model_dir=FLAGS.model_dir,
          config=run_config,
          params=params)
      if FLAGS.distribution_strategy == 'mirrored':
        total_steps = int(
            (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
            FLAGS.train_batch_size)
        tf.logging.info('Starting `MirroredStrategy` training...')
        train_estimator.train(
            input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, is_training=True),
            max_steps=total_steps)
      elif FLAGS.distribution_strategy == 'multi_worker_mirrored':
        total_steps = int(
            (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
            (len(worker_hosts) * FLAGS.train_batch_size))
        train_spec = tf.estimator.TrainSpec(
            input_fn=dataloader.InputReader(
                FLAGS.training_file_pattern, is_training=True),
            max_steps=total_steps)
        # NOTE: placeholder eval_spec; `tf.data.Dataset` is the class itself,
        # not a usable eval input_fn, so this spec cannot actually evaluate.
        eval_spec = tf.estimator.EvalSpec(input_fn=tf.data.Dataset)
        tf.logging.info('Starting `MultiWorkerMirroredStrategy` training...')
        tf.estimator.train_and_evaluate(train_estimator, train_spec, eval_spec)
      else:
        raise ValueError('Unrecognized distribution strategy.')
  elif FLAGS.mode == 'eval':
    # Eval only runs on the CPU or GPU host with batch_size = 1.
    # Override the default options: disable randomization in the input
    # pipeline and don't run on the TPU.
    # Also, disable use_bfloat16 for eval on CPU/GPU.
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')
    eval_params = dict(
        params,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
    )
    if FLAGS.distribution_strategy is None:
      # Uses TPUEstimator.
      eval_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
    else:
      # Uses Estimator.
      if FLAGS.distribution_strategy == 'multi_worker_mirrored':
        raise ValueError(
            '--distribution_strategy=multi_worker_mirrored is not supported '
            'for eval.')
      elif FLAGS.distribution_strategy == 'mirrored':
        eval_estimator = tf.estimator.Estimator(
            model_fn=retinanet_model.est_retinanet_model_fn,
            model_dir=FLAGS.model_dir,
            config=run_config,
            params=params)
      else:
        raise ValueError('Unrecognized distribution strategy.')

    def terminate_eval():
      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
                      FLAGS.eval_timeout)
      return True

    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    summary_writer = tf.summary.FileWriter(output_dir)

    # Run evaluation when there's a new checkpoint.
    for ckpt in contrib_training.checkpoints_iterator(
        FLAGS.model_dir,
        min_interval_secs=FLAGS.min_eval_interval,
        timeout=FLAGS.eval_timeout,
        timeout_fn=terminate_eval):
      tf.logging.info('Starting to evaluate.')
      try:
        eval_results = evaluation.evaluate(
            eval_estimator,
            input_fn=dataloader.InputReader(
                FLAGS.validation_file_pattern, is_training=False),
            num_eval_samples=FLAGS.eval_samples,
            eval_batch_size=FLAGS.eval_batch_size,
            validation_json_file=FLAGS.val_json_file)
        tf.logging.info('Eval results: %s' % eval_results)

        # Terminate the eval job when the final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        total_step = int(
            (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
            FLAGS.train_batch_size)
        evaluation.write_summary(eval_results, summary_writer, current_step)
        if current_step >= total_step:
          tf.logging.info(
              'Evaluation finished after training step %d' % current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint' % ckpt)
  elif FLAGS.mode == 'train_and_eval':
    if FLAGS.distribution_strategy is not None:
      raise ValueError(
          'Distribution strategy is not implemented for --mode=train_and_eval.')
    if FLAGS.val_json_file is None:
      raise RuntimeError('You must specify --val_json_file for evaluation.')

    output_dir = os.path.join(FLAGS.model_dir, 'train_and_eval')
    tf.gfile.MakeDirs(output_dir)
    summary_writer = tf.summary.FileWriter(output_dir)
    num_cycles = int(FLAGS.num_epochs * FLAGS.num_examples_per_epoch /
                     FLAGS.num_steps_per_eval)
    for cycle in range(num_cycles):
      tf.logging.info('Starting training cycle, epoch: %d.' % cycle)
      train_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          config=run_config,
          params=params)
      train_estimator.train(
          input_fn=dataloader.InputReader(
              FLAGS.training_file_pattern, is_training=True),
          steps=FLAGS.num_steps_per_eval)

      tf.logging.info('Starting evaluation cycle, epoch: %d.' % cycle)
      # Run evaluation after every epoch.
      eval_params = dict(
          params,
          input_rand_hflip=False,
          resnet_checkpoint=None,
          is_training_bn=False,
      )
      eval_estimator = contrib_tpu.TPUEstimator(
          model_fn=retinanet_model.tpu_retinanet_model_fn,
          use_tpu=FLAGS.use_tpu,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size,
          predict_batch_size=FLAGS.eval_batch_size,
          config=run_config,
          params=eval_params)
      eval_results = evaluation.evaluate(
          eval_estimator,
          input_fn=dataloader.InputReader(
              FLAGS.validation_file_pattern, is_training=False),
          num_eval_samples=FLAGS.eval_samples,
          eval_batch_size=FLAGS.eval_batch_size,
          validation_json_file=FLAGS.val_json_file)
      tf.logging.info('Evaluation results: %s' % eval_results)
      current_step = int(cycle * FLAGS.num_steps_per_eval)
      evaluation.write_summary(eval_results, summary_writer, current_step)
  else:
    tf.logging.info('Mode not found.')

  if FLAGS.model_dir:
    tf.logging.info('Exporting saved model.')
    eval_params = dict(
        params,
        use_tpu=True,
        input_rand_hflip=False,
        resnet_checkpoint=None,
        is_training_bn=False,
        use_bfloat16=False,
    )
    eval_estimator = contrib_tpu.TPUEstimator(
        model_fn=retinanet_model.tpu_retinanet_model_fn,
        use_tpu=True,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.inference_batch_size,
        config=run_config,
        params=eval_params)

    export_path = eval_estimator.export_saved_model(
        export_dir_base=FLAGS.model_dir,
        serving_input_receiver_fn=build_serving_input_fn(
            hparams.image_size, FLAGS.inference_batch_size))
    if FLAGS.add_warmup_requests:
      inference_warmup.write_warmup_requests(
          export_path,
          FLAGS.model_name,
          hparams.image_size,
          batch_sizes=[FLAGS.inference_batch_size])
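To see concretely which pyramid levels the `_can_partition` check above admits, here is a stand-alone sketch using an equivalent divisibility test; the image size, level range, and partition dims are the illustrative values from the comment, not the real flags:

import numpy as np

# Illustrative values, not the real FLAGS.
input_partition_dims = [1, 4, 2, 1]
image_size, min_level, max_level = 1536, 3, 9

def can_partition(spatial_dim):
  # A level is partitionable when its spatial dim is divisible by every
  # partition factor.
  return bool(np.all(spatial_dim % np.array(input_partition_dims) == 0))

for level in range(min_level, max_level + 1):
  spatial_dim = image_size // (2**level)
  print("level %d: spatial_dim=%d partitionable=%s" %
        (level, spatial_dim, can_partition(spatial_dim)))
# With these numbers, levels 8 and 9 (spatial dims 6 and 3) fail the
# divisibility test, so level 7 is the highest partitionable level,
# matching the comment in the code above.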
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  processors = {
      "cola": classifier_utils.ColaProcessor,
      "mnli": classifier_utils.MnliProcessor,
      "mismnli": classifier_utils.MisMnliProcessor,
      "mrpc": classifier_utils.MrpcProcessor,
      "rte": classifier_utils.RteProcessor,
      "sst-2": classifier_utils.Sst2Processor,
      "sts-b": classifier_utils.StsbProcessor,
      "qqp": classifier_utils.QqpProcessor,
      "qnli": classifier_utils.QnliProcessor,
      "wnli": classifier_utils.WnliProcessor,
  }

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)

  if not FLAGS.albert_config_file and not FLAGS.albert_hub_module_handle:
    raise ValueError("At least one of `--albert_config_file` and "
                     "`--albert_hub_module_handle` must be set")

  if FLAGS.albert_config_file:
    albert_config = modeling.AlbertConfig.from_json_file(
        FLAGS.albert_config_file)
    if FLAGS.max_seq_length > albert_config.max_position_embeddings:
      raise ValueError(
          "Cannot use sequence length %d because the ALBERT model "
          "was only trained up to sequence length %d" %
          (FLAGS.max_seq_length, albert_config.max_position_embeddings))
  else:
    albert_config = None  # Get the config from TF-Hub.

  tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()
  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  processor = processors[task_name](
      use_spm=True if FLAGS.spm_model_file else False,
      do_lower_case=FLAGS.do_lower_case)
  label_list = processor.get_labels()

  tokenizer = fine_tuning_utils.create_vocab(
      vocab_file=FLAGS.vocab_file,
      do_lower_case=FLAGS.do_lower_case,
      spm_model_file=FLAGS.spm_model_file,
      hub_module=FLAGS.albert_hub_module_handle)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  iterations_per_loop = FLAGS.iterations_per_loop
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=int(FLAGS.save_checkpoints_steps),
      keep_checkpoint_max=0,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  model_fn = classifier_utils.model_fn_builder(
      albert_config=albert_config,
      num_labels=len(label_list),
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=FLAGS.train_step,
      num_warmup_steps=FLAGS.warmup_step,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu,
      task_name=task_name,
      hub_module=FLAGS.albert_hub_module_handle,
      optimizer=FLAGS.optimizer)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  # if FLAGS.do_predict:
  eval_examples = processor.get_dev_examples(FLAGS.data_dir)
  num_actual_eval_examples = len(eval_examples)
  if FLAGS.use_tpu:
    # TPU requires a fixed batch size for all batches, therefore the number
    # of examples must be a multiple of the batch size, or else examples
    # will get dropped. So we pad with fake examples, which are ignored
    # later on.
    while len(eval_examples) % FLAGS.predict_batch_size != 0:
      eval_examples.append(classifier_utils.PaddingInputExample())

  error_analysis_file = os.path.join(FLAGS.output_dir,
                                     "error_analysis.tf_record")
  classifier_utils.file_based_convert_examples_to_features(
      eval_examples, label_list, FLAGS.max_seq_length, tokenizer,
      error_analysis_file, task_name)

  tf.logging.info("***** Running error analysis on dev set *****")
  tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                  len(eval_examples), num_actual_eval_examples,
                  len(eval_examples) - num_actual_eval_examples)
  tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

  error_analysis_drop_remainder = True if FLAGS.use_tpu else False
  error_analysis_input_fn = classifier_utils.file_based_input_fn_builder(
      input_file=error_analysis_file,
      seq_length=FLAGS.max_seq_length,
      is_training=False,
      drop_remainder=error_analysis_drop_remainder,
      task_name=task_name,
      use_tpu=FLAGS.use_tpu,
      bsz=FLAGS.predict_batch_size)

  checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
  result = estimator.predict(
      input_fn=error_analysis_input_fn, checkpoint_path=checkpoint_path)

  output_error_analysis_predict_file = os.path.join(
      FLAGS.output_dir, "error_analysis_test_results.tsv")
  output_error_analysis_submit_file = os.path.join(
      FLAGS.output_dir, "error_analysis_submit_results.tsv")
  with tf.gfile.GFile(output_error_analysis_predict_file, "w") as pred_writer,\
      tf.gfile.GFile(output_error_analysis_submit_file, "w") as sub_writer:
    sub_writer.write("index" + "\t" + "text_a" + "\t" + "text_b" + "\t" +
                     "prediction" + "\t" + "label" + "\n")
    num_written_lines = 0
    tf.logging.info("***** Error analysis results *****")
    for (i, (example, prediction)) in \
        enumerate(zip(eval_examples, result)):
      probabilities = prediction["probabilities"]
      if i >= num_actual_eval_examples:
        break
      output_line = "\t".join(
          str(class_probability)
          for class_probability in probabilities) + "\n"
      pred_writer.write(output_line)
      if task_name != "sts-b":
        actual_label = label_list[int(prediction["predictions"])]
      else:
        actual_label = str(prediction["predictions"])
      sub_writer.write(example.guid + "\t" + str(example.text_a) + "\t" +
                       str(example.text_b) + "\t" + str(actual_label) + "\t" +
                       str(example.label) + "\n")
      num_written_lines += 1
  assert num_written_lines == num_actual_eval_examples
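The submit-file branch above decodes labels differently for classification tasks and the sts-b regression task. A small sketch of just that decoding step, with a made-up label list and predictions:

label_list = ["entailment", "neutral", "contradiction"]  # illustrative

def decode_label(task_name, prediction, label_list):
  # Classification tasks emit an integer class id into label_list;
  # sts-b emits a float similarity score, reported as a string.
  if task_name != "sts-b":
    return label_list[int(prediction["predictions"])]
  return str(prediction["predictions"])

assert decode_label("mnli", {"predictions": 2}, label_list) == "contradiction"
assert decode_label("sts-b", {"predictions": 3.75}, label_list) == "3.75"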
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  label_list = ["Yes", "No"]
  if FLAGS.from_three_class_model:
    label_list.append("Neutral")

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  predict_examples = get_custom(FLAGS.predict_input_file)
  predict_file = os.path.join(FLAGS.init_checkpoint2,
                              "predict.tf_record.%s" % FLAGS.exp_name)
  file_based_convert_examples_to_features(predict_examples, label_list,
                                          FLAGS.max_seq_length, tokenizer,
                                          predict_file)

  tf.logging.info("***** Running prediction *****")
  tf.logging.info("  Num examples = %d", len(predict_examples))

  predict_input_fn = file_based_input_fn_builder(
      input_file=predict_file,
      seq_length=FLAGS.max_seq_length,
      is_training=False,
      drop_remainder=False)

  all_model_ans = []
  for output_dir in [FLAGS.init_checkpoint1, FLAGS.init_checkpoint2]:
    all_answers = []
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
      tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=output_dir,
        save_checkpoints_steps=1000,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=None,
        num_train_steps=None,
        num_warmup_steps=None,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=32,
        eval_batch_size=8,
        predict_batch_size=FLAGS.predict_batch_size)

    for result in estimator.predict(predict_input_fn,
                                    yield_single_examples=True):
      probs_result = result["probabilities"]
      all_answers.append(bool(probs_result[0] >= probs_result[1]))
    all_model_ans.append(all_answers)

  first_ans, second_ans = all_model_ans[0], all_model_ans[1]
  assert len(first_ans) == len(second_ans)
  matching = 0
  counter = 0
  for a1, a2 in zip(first_ans, second_ans):
    if a1 == a2:
      matching += 1
    counter += 1
  tf.logging.info("Agreement = %.4f (%d / %d)",
                  (float(matching) / counter), matching, counter)
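The agreement computation at the end reduces to an elementwise comparison of the two models' boolean answers. A hedged sketch of that reduction with dummy answers:

def agreement(first_ans, second_ans):
  """Fraction of positions where two answer lists agree."""
  assert len(first_ans) == len(second_ans)
  matching = sum(1 for a1, a2 in zip(first_ans, second_ans) if a1 == a2)
  return float(matching) / len(first_ans), matching, len(first_ans)

rate, matching, total = agreement([True, False, True], [True, True, True])
assert (matching, total) == (2, 3)
print("Agreement = %.4f (%d / %d)" % (rate, matching, total))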
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  tf.gfile.MakeDirs(FLAGS.output_dir)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  tf.logging.info("*** Input Files ***")
  for input_file in input_files:
    tf.logging.info("  %s" % input_file)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=FLAGS.num_train_steps,
      num_warmup_steps=FLAGS.num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  if FLAGS.do_train:
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=True)
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)

  if FLAGS.do_eval:
    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
    eval_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=False)
    result = estimator.evaluate(
        input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))
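`input_fn_builder` above follows the Estimator convention of returning a closure that the estimator later calls with a `params` dict carrying the per-host batch size. A minimal sketch of that builder pattern in plain Python (no TF; the fake list-of-batches return value stands in for a real tf.data pipeline):

def toy_input_fn_builder(records, is_training):
  """Returns an input_fn closure, mirroring the builder pattern above."""
  def input_fn(params):
    batch_size = params["batch_size"]  # injected by the estimator
    data = list(records)
    if is_training:
      data = data * 2  # stand-in for repeat/shuffle
    # Yield only full batches, as drop_remainder=True would.
    return [data[i:i + batch_size]
            for i in range(0, len(data) - batch_size + 1, batch_size)]
  return input_fn

fn = toy_input_fn_builder(range(10), is_training=False)
assert all(len(batch) == 4 for batch in fn({"batch_size": 4}))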
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  albert_config = modeling.AlbertConfig.from_json_file(
      FLAGS.albert_config_file)

  validate_flags_or_throw(albert_config)

  tf.gfile.MakeDirs(FLAGS.output_dir)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file,
      do_lower_case=FLAGS.do_lower_case,
      spm_model_file=FLAGS.spm_model_file)

  # Multiple GPUs.
  NUM_GPUS = FLAGS.num_gpu_cores if FLAGS.strategy_type == 'mirror' else 1
  using_customized_optimizer = None
  if NUM_GPUS > 1 and FLAGS.strategy_type == "mirror":
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
        [str(i) for i in list(range(NUM_GPUS))])
    # https://github.com/tensorflow/tensorflow/issues/21470#issuecomment-422506263
    strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus=NUM_GPUS,
        cross_device_ops=AllReduceCrossDeviceOps('nccl', num_packs=NUM_GPUS),
    )
    using_customized_optimizer = True
    tf.logging.info('Use MirroredStrategy with %d devices.',
                    strategy.num_replicas_in_sync)
  else:
    strategy = tf.distribute.OneDeviceStrategy("GPU:0")
    using_customized_optimizer = False
    tf.logging.info('Single device mode.')

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  if FLAGS.do_train:
    iterations_per_loop = int(
        min(FLAGS.iterations_per_loop, FLAGS.save_checkpoints_steps))
  else:
    iterations_per_loop = FLAGS.iterations_per_loop
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host),
      train_distribute=strategy,
      eval_distribute=strategy,  # May raise an error during evaluation.
  )

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None
  train_examples = squad_utils.read_squad_examples(
      input_file=FLAGS.train_file, is_training=True)
  num_train_steps = int(
      len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
  if FLAGS.do_train:
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    rng = random.Random(12345)
    rng.shuffle(train_examples)

  model_fn = squad_utils.v2_model_fn_builder(
      albert_config=albert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu,
      max_seq_length=FLAGS.max_seq_length,
      start_n_top=FLAGS.start_n_top,
      end_n_top=FLAGS.end_n_top,
      dropout_prob=FLAGS.dropout_prob,
      customized=using_customized_optimizer,
      optimizer=FLAGS.optimizer)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tf.logging.info("Use TPUEstimator")
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)
  else:
    tf.logging.info("Use normal Estimator")
    estimator = Estimator(
        model_fn=model_fn,
        params={},
        config=run_config,
    )

  if FLAGS.do_train:
    # We write to a temporary file to avoid storing very large constant
    # tensors in memory.
    if not tf.gfile.Exists(FLAGS.train_feature_file):
      train_writer = squad_utils.FeatureWriter(
          filename=os.path.join(FLAGS.train_feature_file), is_training=True)
      squad_utils.convert_examples_to_features(
          examples=train_examples,
          tokenizer=tokenizer,
          max_seq_length=FLAGS.max_seq_length,
          doc_stride=FLAGS.doc_stride,
          max_query_length=FLAGS.max_query_length,
          is_training=True,
          output_fn=train_writer.process_feature,
          do_lower_case=FLAGS.do_lower_case)
      train_writer.close()

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num orig examples = %d", len(train_examples))
    # tf.logging.info("  Num split examples = %d", train_writer.num_features)
    tf.logging.info(f"  Batch size = {FLAGS.train_batch_size} * {NUM_GPUS}")
    tf.logging.info("  Num steps = %d", num_train_steps)
    del train_examples

    train_input_fn = squad_utils.input_fn_builder(
        input_file=FLAGS.train_feature_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.train_batch_size,
        is_v2=True)

    time_hist = TimeHistory()
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    total_time = sum(time_hist.times)

  if FLAGS.do_predict:
    with tf.gfile.Open(FLAGS.predict_file) as predict_file:
      prediction_json = json.load(predict_file)["data"]

    eval_examples = squad_utils.read_squad_examples(
        input_file=FLAGS.predict_file, is_training=False)

    if (tf.gfile.Exists(FLAGS.predict_feature_file) and
        tf.gfile.Exists(FLAGS.predict_feature_left_file)):
      tf.logging.info("Loading eval features from {}".format(
          FLAGS.predict_feature_left_file))
      with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin:
        eval_features = pickle.load(fin)
    else:
      eval_writer = squad_utils.FeatureWriter(
          filename=FLAGS.predict_feature_file, is_training=False)
      eval_features = []

      def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

      squad_utils.convert_examples_to_features(
          examples=eval_examples,
          tokenizer=tokenizer,
          max_seq_length=FLAGS.max_seq_length,
          doc_stride=FLAGS.doc_stride,
          max_query_length=FLAGS.max_query_length,
          is_training=False,
          output_fn=append_feature,
          do_lower_case=FLAGS.do_lower_case)
      eval_writer.close()

      with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout:
        pickle.dump(eval_features, fout)

    tf.logging.info("***** Running predictions *****")
    tf.logging.info("  Num orig examples = %d", len(eval_examples))
    tf.logging.info("  Num split examples = %d", len(eval_features))
    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

    predict_input_fn = squad_utils.input_fn_builder(
        input_file=FLAGS.predict_feature_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False,
        use_tpu=FLAGS.use_tpu,
        bsz=FLAGS.predict_batch_size,
        is_v2=True)

    def get_result(checkpoint):
      """Evaluate the checkpoint on SQuAD v2.0."""
      # If running eval on the TPU, you will need to specify the number of
      # steps.
      reader = tf.train.NewCheckpointReader(checkpoint)
      global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)
      all_results = []
      for result in estimator.predict(
          predict_input_fn,
          yield_single_examples=True,
          checkpoint_path=checkpoint):
        if len(all_results) % 1000 == 0:
          tf.logging.info("Processing example: %d" % (len(all_results)))
        unique_id = int(result["unique_ids"])
        start_top_log_probs = (
            [float(x) for x in result["start_top_log_probs"].flat])
        start_top_index = [int(x) for x in result["start_top_index"].flat]
        end_top_log_probs = (
            [float(x) for x in result["end_top_log_probs"].flat])
        end_top_index = [int(x) for x in result["end_top_index"].flat]
        cls_logits = float(result["cls_logits"].flat[0])
        all_results.append(
            squad_utils.RawResultV2(
                unique_id=unique_id,
                start_top_log_probs=start_top_log_probs,
                start_top_index=start_top_index,
                end_top_log_probs=end_top_log_probs,
                end_top_index=end_top_index,
                cls_logits=cls_logits))

      output_prediction_file = os.path.join(FLAGS.output_dir,
                                            "predictions.json")
      output_nbest_file = os.path.join(FLAGS.output_dir,
                                       "nbest_predictions.json")
      output_null_log_odds_file = os.path.join(FLAGS.output_dir,
                                               "null_odds.json")

      result_dict = {}
      cls_dict = {}
      squad_utils.accumulate_predictions_v2(
          result_dict, cls_dict, eval_examples, eval_features, all_results,
          FLAGS.n_best_size, FLAGS.max_answer_length, FLAGS.start_n_top,
          FLAGS.end_n_top)

      return squad_utils.evaluate_v2(
          result_dict, cls_dict, prediction_json, eval_examples,
          eval_features, all_results, FLAGS.n_best_size,
          FLAGS.max_answer_length, output_prediction_file, output_nbest_file,
          output_null_log_odds_file), int(global_step)

    def _find_valid_cands(curr_step):
      filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
      candidates = []
      for filename in filenames:
        if filename.endswith(".index"):
          ckpt_name = filename[:-6]
          idx = ckpt_name.split("-")[-1]
          if idx != "best" and int(idx) > curr_step:
            candidates.append(filename)
      return candidates

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
    key_name = "f1"
    writer = tf.gfile.GFile(output_eval_file, "w")

    writer.write("===== Hyperparameters =====\n")
    writer.write("Training batch size: {}\n".format(FLAGS.train_batch_size))
    writer.write("Max sequence length: {}\n".format(FLAGS.max_seq_length))
    writer.write("Learning rate: {}\n".format(FLAGS.learning_rate))
    writer.write("Num of GPU cores: {}\n".format(NUM_GPUS))
    if FLAGS.do_train:
      # `time_hist` only exists when training ran in this process.
      avg_time_per_batch = np.mean(time_hist.times)
      writer.write("Total time: {}\n".format(total_time))
      writer.write("Speed: {}\n".format(FLAGS.train_batch_size * NUM_GPUS /
                                        avg_time_per_batch))
    if num_train_steps and num_warmup_steps:
      writer.write("Training steps: {}\n".format(num_train_steps))
      writer.write("Warmup steps: {}\n".format(num_warmup_steps))

    if tf.gfile.Exists(checkpoint_path + ".index"):
      result = get_result(checkpoint_path)
      best_perf = result[0][key_name]
      global_step = result[1]
    else:
      global_step = -1
      best_perf = -1
      checkpoint_path = None

    while global_step < num_train_steps:
      steps_and_files = {}
      filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
      for filename in filenames:
        if filename.endswith(".index"):
          ckpt_name = filename[:-6]
          cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
          if cur_filename.split("-")[-1] == "best":
            continue
          gstep = int(cur_filename.split("-")[-1])
          if gstep not in steps_and_files:
            tf.logging.info("Add {} to eval list.".format(cur_filename))
            steps_and_files[gstep] = cur_filename
      tf.logging.info("found {} files.".format(len(steps_and_files)))
      if not steps_and_files:
        tf.logging.info(
            "found 0 file, global step: {}. Sleeping.".format(global_step))
        time.sleep(1)
      else:
        for ele in sorted(steps_and_files.items()):
          step, checkpoint_path = ele
          if global_step >= step:
            if len(_find_valid_cands(step)) > 1:
              for ext in ["meta", "data-00000-of-00001", "index"]:
                src_ckpt = checkpoint_path + ".{}".format(ext)
                tf.logging.info("removing {}".format(src_ckpt))
                tf.gfile.Remove(src_ckpt)
            continue
          result, global_step = get_result(checkpoint_path)
          tf.logging.info("***** Eval results *****")
          for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
          if result[key_name] > best_perf:
            best_perf = result[key_name]
            for ext in ["meta", "data-00000-of-00001", "index"]:
              src_ckpt = checkpoint_path + ".{}".format(ext)
              tgt_ckpt = checkpoint_path.rsplit(
                  "-", 1)[0] + "-best.{}".format(ext)
              tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt))
              tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True)
              writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt))
          writer.write("best {} = {}\n".format(key_name, best_perf))
          tf.logging.info("  best {} = {}\n".format(key_name, best_perf))
          if len(_find_valid_cands(global_step)) > 2:
            for ext in ["meta", "data-00000-of-00001", "index"]:
              src_ckpt = checkpoint_path + ".{}".format(ext)
              tf.logging.info("removing {}".format(src_ckpt))
              tf.gfile.Remove(src_ckpt)
          writer.write("=" * 50 + "\n")

    checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
    result, global_step = get_result(checkpoint_path)
    tf.logging.info("***** Final Eval results *****")
    tf.logging.info(f"num_gpu_cores = {NUM_GPUS}")
    writer.write("===== Evaluations =====\n")
    for key in sorted(result.keys()):
      tf.logging.info("  %s = %s", key, str(result[key]))
      writer.write("%s = %s\n" % (key, str(result[key])))
    writer.write("best perf happened at step: {}".format(global_step))
def main(unused_argv):
  del unused_argv  # Unused

  tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
      FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  params = {
      'input_perm': [0, 1, 2, 3],
      'output_perm': [0, 1, 2, 3],
  }

  batch_axis = 0
  if FLAGS.transpose_enabled:
    params['input_perm'] = [3, 0, 1, 2]
    params['output_perm'] = [1, 2, 3, 0]
    batch_axis = 3

  if FLAGS.eval_total_size > 0:
    eval_size = FLAGS.eval_total_size
  else:
    eval_size = _NUM_EVAL_IMAGES
  eval_steps = eval_size // FLAGS.eval_batch_size

  iterations = (eval_steps if FLAGS.mode == 'eval' else FLAGS.iterations)

  eval_batch_size = (None if FLAGS.mode == 'train' else FLAGS.eval_batch_size)

  tpu_config = contrib_tpu.TPUConfig(
      iterations_per_loop=iterations, num_shards=FLAGS.num_shards)

  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      save_summary_steps=FLAGS.save_summary_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement),
      tpu_config=tpu_config)

  inception_classifier = contrib_tpu.TPUEstimator(
      model_fn=inception_model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      params=params,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=eval_batch_size,
      batch_axis=(batch_axis, 0))

  # Input pipelines are slightly different (with regard to shuffling and
  # preprocessing) between training and evaluation.
  imagenet_train = InputPipeline(is_training=True, data_dir=FLAGS.data_dir)
  imagenet_eval = InputPipeline(is_training=False, data_dir=FLAGS.data_dir)

  if FLAGS.moving_average:
    eval_hooks = [LoadEMAHook(FLAGS.model_dir)]
  else:
    eval_hooks = []

  if FLAGS.mode == 'eval':
    # Run evaluation when there is a new checkpoint.
    for checkpoint in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # Includes compilation time.
        eval_results = inception_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            hooks=eval_hooks,
            checkpoint_path=checkpoint)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)

        # Terminate the eval job when the final checkpoint is reached.
        current_step = int(os.path.basename(checkpoint).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        checkpoint)
  elif FLAGS.mode == 'train_and_eval':
    for cycle in range(FLAGS.train_steps // FLAGS.train_steps_per_eval):
      tf.logging.info('Starting training cycle %d.' % cycle)
      inception_classifier.train(
          input_fn=imagenet_train.input_fn, steps=FLAGS.train_steps_per_eval)

      tf.logging.info('Starting evaluation cycle %d.' % cycle)
      eval_results = inception_classifier.evaluate(
          input_fn=imagenet_eval.input_fn, steps=eval_steps, hooks=eval_hooks)
      tf.logging.info('Evaluation results: %s' % eval_results)
  else:
    tf.logging.info('Starting training ...')
    inception_classifier.train(
        input_fn=imagenet_train.input_fn, steps=FLAGS.train_steps)

  if FLAGS.export_dir is not None:
    tf.logging.info('Starting to export model.')
    inception_classifier.export_saved_model(
        export_dir_base=FLAGS.export_dir,
        serving_input_receiver_fn=image_serving_input_fn)
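The eval job above stops once the checkpoint step reaches train_steps, recovering the step from the checkpoint filename itself. A sketch of that parsing and stopping rule, assuming the usual "model.ckpt-<step>" naming:

import os

def should_stop(checkpoint, train_steps):
  # Checkpoint basenames look like "model.ckpt-25000"; the step number is
  # the suffix after the "-".
  current_step = int(os.path.basename(checkpoint).split('-')[1])
  return current_step >= train_steps

assert not should_stop("/tmp/model_dir/model.ckpt-1000", 25000)
assert should_stop("/tmp/model_dir/model.ckpt-25000", 25000)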
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train`, `do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    num_train_steps = int(
        FLAGS.train_data_size / FLAGS.train_batch_size) * FLAGS.epochs
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)

    if not tf.gfile.Exists(FLAGS.train_file):
      tf.logging.info("DANITER: File doesn't exist, creating tfrecord data")
      examples = model_builder.load_hellaswag(FLAGS.train_raw_data)
      tf.logging.info("DANITER: Read raw data as json")
      model_builder.file_based_convert_examples_for_bilinear(
          examples, 512, tokenizer, FLAGS.train_file, do_copa=True)

    train_input_fn = file_based_input_fn_builder(
        input_file=FLAGS.train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, steps=num_train_steps)

  if FLAGS.do_eval:
    # This tells the estimator to run through the entire set.
    if FLAGS.eval_data_size < 0:
      eval_steps = None
    else:
      eval_steps = int(FLAGS.eval_data_size / FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    if not tf.gfile.Exists(FLAGS.eval_file):
      examples = model_builder.load_hellaswag(FLAGS.eval_raw_data)
      model_builder.file_based_convert_examples_for_bilinear(
          examples, 512, tokenizer, FLAGS.eval_file, do_copa=True)
    eval_input_fn = file_based_input_fn_builder(
        input_file=FLAGS.eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=eval_drop_remainder)

    def _find_valid_cands(curr_step):
      filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
      candidates = []
      for filename in filenames:
        if filename.endswith(".index"):
          ckpt_name = filename[:-6]
          idx = ckpt_name.split("-")[-1]
          if idx != "best" and int(idx) > curr_step:
            candidates.append(filename)
      return candidates

    tf.logging.info("Evaluating all models in the output dir")
    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
    key_name = "eval_accuracy"
    tf.logging.info("Checkpoint path " + checkpoint_path)
    if tf.gfile.Exists(checkpoint_path + ".index"):
      tf.logging.info("Found a best model... not good")
      result = estimator.evaluate(
          input_fn=eval_input_fn,
          steps=eval_steps,
          checkpoint_path=checkpoint_path)
      best_perf = result[key_name]
      global_step = result["global_step"]
    else:
      tf.logging.info("Setting global step to -1")
      global_step = -1
      best_perf = -1
      checkpoint_path = None

    tf.logging.info("Opening writer " + output_eval_file)
    writer = tf.gfile.GFile(output_eval_file, "w")

    steps_and_files = {}
    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
    tf.logging.info("Models found " + "\n".join(filenames))
    for filename in filenames:
      if filename.endswith(".index"):
        ckpt_name = filename[:-6]
        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
        if cur_filename.split("-")[-1] == "best":
          continue
        gstep = int(cur_filename.split("-")[-1])
        if gstep not in steps_and_files:
          tf.logging.info("Add {} to eval list.".format(cur_filename))
          steps_and_files[gstep] = cur_filename
    tf.logging.info("found {} files.".format(len(steps_and_files)))
    # steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
    if not steps_and_files:
      tf.logging.info(
          "found 0 file, global step: {}. Sleeping.".format(global_step))
    else:
      for ele in sorted(steps_and_files.items()):
        step, checkpoint_path = ele
        if global_step >= step:
          if len(_find_valid_cands(step)) > 1:
            for ext in ["meta", "data-00000-of-00001", "index"]:
              src_ckpt = checkpoint_path + ".{}".format(ext)
              tf.logging.info("removing {}".format(src_ckpt))
              # Why should we remove checkpoints?
              # tf.gfile.Remove(src_ckpt)
          tf.logging.info("Skipping already-evaluated candidate")
          continue
        result = estimator.evaluate(
            input_fn=eval_input_fn,
            steps=eval_steps,
            checkpoint_path=checkpoint_path)
        global_step = result["global_step"]
        tf.logging.info("***** Eval results *****")
        for key in sorted(result.keys()):
          tf.logging.info("  %s = %s", key, str(result[key]))
          writer.write("%s = %s\n" % (key, str(result[key])))
        writer.write("best = {}\n".format(best_perf))
        if len(_find_valid_cands(global_step)) > 1:
          for ext in ["meta", "data-00000-of-00001", "index"]:
            src_ckpt = checkpoint_path + ".{}".format(ext)
            tf.logging.info("removing {}".format(src_ckpt))
            # tf.gfile.Remove(src_ckpt)
        writer.write("=" * 50 + "\n")
    writer.close()
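`_find_valid_cands` above decides whether newer checkpoints exist before an older candidate may be skipped or deleted. An isolated, runnable sketch of that filename filter, using fake filenames instead of a real output dir:

def find_valid_cands(filenames, curr_step):
  """Checkpoint .index files whose step suffix exceeds curr_step."""
  candidates = []
  for filename in filenames:
    if filename.endswith(".index"):
      ckpt_name = filename[:-6]       # strip ".index"
      idx = ckpt_name.split("-")[-1]  # step suffix, or "best"
      if idx != "best" and int(idx) > curr_step:
        candidates.append(filename)
  return candidates

files = ["model.ckpt-100.index", "model.ckpt-200.index",
         "model.ckpt-best.index", "model.ckpt-200.meta"]
assert find_valid_cands(files, 150) == ["model.ckpt-200.index"]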
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  albert_config = modeling.AlbertConfig.from_json_file(
      FLAGS.albert_config_file)

  tf.gfile.MakeDirs(FLAGS.output_dir)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  tf.logging.info("*** Input Files ***")
  for input_file in input_files:
    tf.logging.info("  %s" % input_file)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      keep_checkpoint_max=FLAGS.keep_checkpoint_max,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  model_fn = model_fn_builder(
      albert_config=albert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=FLAGS.num_train_steps,
      num_warmup_steps=FLAGS.num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu,
      optimizer=FLAGS.optimizer,
      poly_power=FLAGS.poly_power,
      start_warmup_step=FLAGS.start_warmup_step)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  if FLAGS.do_train:
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=True)
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)

  if FLAGS.do_eval:
    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
    global_step = -1

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    writer = tf.gfile.GFile(output_eval_file, "w")

    eval_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=False)

    best_perf = 0
    key_name = "masked_lm_accuracy"
    while global_step < FLAGS.num_train_steps:
      if estimator.latest_checkpoint() is None:
        tf.logging.info("No checkpoint found yet. Sleeping.")
        time.sleep(1)
      else:
        result = estimator.evaluate(
            input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)
        global_step = result["global_step"]
        tf.logging.info("***** Eval results *****")
        checkpoint_path = estimator.latest_checkpoint()
        for key in sorted(result.keys()):
          tf.logging.info("  %s = %s", key, str(result[key]))
          writer.write("%s = %s\n" % (key, str(result[key])))
        if result[key_name] > best_perf:
          best_perf = result[key_name]
          for ext in ["meta", "data-00000-of-00001", "index"]:
            src_ckpt = checkpoint_path + ".{}".format(ext)
            tgt_ckpt = checkpoint_path.rsplit(
                "-", 1)[0] + "-best.{}".format(ext)
            tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt))
            tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True)
            writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt))
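The do_eval branch above is a poll loop: sleep while no checkpoint exists, otherwise evaluate the latest one and track the best masked_lm_accuracy until global_step catches up with num_train_steps. A compressed sketch of that control flow with a stubbed estimator (the lambdas stand in for the real estimator calls):

import time

def eval_until_done(latest_checkpoint, evaluate, num_train_steps):
  """Polls until evaluation reaches num_train_steps; returns best metric."""
  global_step, best_perf = -1, 0
  while global_step < num_train_steps:
    if latest_checkpoint() is None:
      time.sleep(0.01)  # the real loop sleeps 1s between polls
      continue
    result = evaluate()
    global_step = result["global_step"]
    best_perf = max(best_perf, result["masked_lm_accuracy"])
  return best_perf

steps = iter([100, 200])
best = eval_until_done(
    latest_checkpoint=lambda: "ckpt",
    evaluate=lambda: {"global_step": next(steps), "masked_lm_accuracy": 0.4},
    num_train_steps=200)
assert best == 0.4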