def main():
    """Parse command-line options, then build and train the GoogLEnet model.

    The CIFAR batches are read from ``--data`` through the project helpers
    ``prerprocess_train`` / ``prerprocess_test``; training runs on the first
    GPU when one is reported by ``tfe.num_gpus()``, otherwise on the CPU.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--data",
                     help="cifar data path",
                     default="../data/cifar-10-batches-py")
    cli.add_argument("--epochs",
                     type=int,
                     help="number of learning epoch, default is 10",
                     default=10)
    cli.add_argument("--saving",
                     help="wheter saving or not(each verbose iteration)",
                     action="store_true")
    cli.add_argument("--batch_size",
                     type=int,
                     help="batch size(default is 32)",
                     default=32)
    cli.add_argument("--verbose",
                     type=int,
                     help="verbosity cycle(default is 1 epoch)",
                     default=1)
    cli.add_argument("--no_tqdm",
                     help="whether to use tqdm process bar",
                     action="store_true")
    cli.add_argument("--lr",
                     type=float,
                     help="learning rate, default is 0.001",
                     default=1e-3)
    options = cli.parse_args()

    data_root = options.data
    X_train, y_train = prerprocess_train(data_root)
    X_test, y_test = prerprocess_test(data_root)

    if tfe.num_gpus() > 0:
        device = 'gpu:0'
    else:
        device = 'cpu:0'

    net = GoogLEnet(learning_rate=options.lr, device_name=device)
    # net.load() can be called here to restore the latest saved model.

    # Push a single training sample through the model before summary().
    # NOTE(review): the second positional argument (True) is presumably a
    # "training" flag -- confirm against GoogLEnet's call signature.
    net(tf.convert_to_tensor(X_train[:1]), True)
    net.summary()

    # --no_tqdm disables the progress bar; otherwise use the "normal" style.
    progress_style = None if options.no_tqdm else "normal"

    net.fit(X_train,
            y_train,
            X_test,
            y_test,
            epochs=options.epochs,
            verbose=options.verbose,
            batch_size=options.batch_size,
            saving=options.saving,
            tqdm_option=progress_style)
def benchmarkEagerL2hmc(self):
    """Time eager-mode L2HMC training and report the measured throughput.

    A warm-up phase runs first so that one-off initialization cost is not
    included in the timed ``l2hmc.fit`` call.
    """
    hp = get_default_hparams()
    dynamics = l2hmc.Dynamics(
        x_dim=hp.x_dim,
        loglikelihood_fn=l2hmc.get_scg_energy_fn(),
        n_steps=hp.n_steps,
        eps=hp.eps)
    # TODO(lxuechen): Add learning rate decay
    optimizer = tf.train.AdamOptimizer(learning_rate=hp.learning_rate)

    # Untimed warm-up iterations.
    l2hmc.warmup(dynamics, optimizer, n_iters=hp.n_warmup_iters)

    # Timed section.
    began = time.time()
    l2hmc.fit(dynamics,
              optimizer,
              n_samples=hp.n_samples,
              n_iters=hp.n_iters)
    wall_time = time.time() - began

    throughput = hp.n_samples / wall_time
    if tfe.num_gpus() > 0:
        device_tag = "gpu"
    else:
        device_tag = "cpu"

    self.report_benchmark(
        name="eager_train_%s" % device_tag,
        iters=hp.n_iters,
        extras={"examples_per_sec": throughput},
        wall_time=wall_time)
def main(_):
    """Train and evaluate the MNIST model in eager mode with checkpointing.

    Selects the device and data format, builds the datasets, model and
    optimizer, restores the latest checkpoint found in ``FLAGS.model_dir``
    (if any), then alternates training and evaluation for
    ``FLAGS.train_epochs`` epochs, saving a checkpoint after each epoch.

    Args:
        _: Ignored positional argument (never referenced in the body).
    """
    tfe.enable_eager_execution()

    # Automatically determine device and data_format
    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    # If data_format is defined in FLAGS, overwrite automatically set value.
    # (The original assigned data_format to itself, so the flag was ignored.)
    if FLAGS.data_format is not None:
        data_format = FLAGS.data_format
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    train_ds = mnist_dataset.train(FLAGS.data_dir).shuffle(60000).batch(
        FLAGS.batch_size)
    test_ds = mnist_dataset.test(FLAGS.data_dir).batch(FLAGS.batch_size)

    # Create the model and optimizer
    model = mnist.Model(data_format)
    optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)

    # Create file writers for writing TensorBoard summaries.
    if FLAGS.output_dir:
        # Create directories to which summaries will be written
        # tensorboard --logdir=<output_dir>
        # can then be used to see the recorded summaries.
        train_dir = os.path.join(FLAGS.output_dir, 'train')
        test_dir = os.path.join(FLAGS.output_dir, 'eval')
        tf.gfile.MakeDirs(FLAGS.output_dir)
    else:
        train_dir = None
        test_dir = None
    summary_writer = tf.contrib.summary.create_file_writer(
        train_dir, flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')

    # Create and restore checkpoint (if one exists on the path)
    checkpoint_prefix = os.path.join(FLAGS.model_dir, 'ckpt')
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tfe.Checkpoint(
        model=model, optimizer=optimizer, step_counter=step_counter)
    # Restore variables on creation if a checkpoint exists.
    checkpoint.restore(tf.train.latest_checkpoint(FLAGS.model_dir))

    # Train and evaluate for a set number of epochs.
    with tf.device(device):
        for _ in range(FLAGS.train_epochs):
            start = time.time()
            with summary_writer.as_default():
                train(model, optimizer, train_ds, step_counter,
                      FLAGS.log_interval)
            end = time.time()
            # save_counter is incremented by checkpoint.save() below, hence
            # the +1 to label the epoch that has just finished.
            print('\nTrain time for epoch #%d (%d total steps): %f' %
                  (checkpoint.save_counter.numpy() + 1,
                   step_counter.numpy(),
                   end - start))
            with test_summary_writer.as_default():
                test(model, test_ds)
            checkpoint.save(checkpoint_prefix)
def benchmarkEagerLinearRegression(self):
    """Benchmark one pass of eager linear-regression training.

    Fits on a synthetic dataset of 200 batches of 64 examples; the first 10
    batches are used for an untimed burn-in fit beforehand.
    """
    num_batches = 200
    batch_size = 64
    dataset = linear_regression.synthetic_dataset(
        w=tf.random_uniform([3, 1]),
        b=tf.random_uniform([1]),
        noise_level=0.01,
        batch_size=batch_size,
        num_batches=num_batches)
    warmup_dataset = dataset.take(10)

    model = linear_regression.LinearModel()

    with tf.device(device()):
        sgd = tf.train.GradientDescentOptimizer(learning_rate=0.1)

        # Untimed burn-in.
        linear_regression.fit(model, warmup_dataset, sgd)

        began = time.time()
        linear_regression.fit(model, dataset, sgd)
        wall_time = time.time() - began

        throughput = num_batches * batch_size / wall_time
        if tfe.num_gpus() > 0:
            device_tag = "gpu"
        else:
            device_tag = "cpu"
        self.report_benchmark(
            name="eager_train_%s" % device_tag,
            iters=num_batches,
            extras={"examples_per_sec": throughput},
            wall_time=wall_time)
def main(_):
    """Fit a linear model to synthetic data and print the learned parameters.

    Args:
        _: Ignored positional argument (never referenced in the body).
    """
    tfe.enable_eager_execution()

    # Ground-truth parameters the synthetic data is generated from.
    true_w = [[-2.0], [4.0], [1.0]]
    true_b = [0.5]
    noise_level = 0.01

    # Training hyper-parameters.
    batch_size = 64
    learning_rate = 0.1

    print("True w: %s" % true_w)
    print("True b: %s\n" % true_b)

    model = LinearModel()
    dataset = synthetic_dataset(true_w, true_b, noise_level, batch_size, 20)

    if tfe.num_gpus():
        device = "gpu:0"
    else:
        device = "cpu:0"
    print("Using device: %s" % device)

    with tf.device(device):
        sgd = tf.train.GradientDescentOptimizer(learning_rate)
        fit(model, dataset, sgd, verbose=True, logdir=FLAGS.logdir)

    learned_w = model.variables[0].numpy()
    print("\nAfter training: w = %s" % learned_w)
    learned_b = model.variables[1].numpy()
    print("\nAfter training: b = %s" % learned_b)
def _generate_synthetic_snli_data_batch(sequence_length, batch_size,
                                        vocab_size):
    """Generate a fake batch of SNLI data for testing.

    Args:
        sequence_length: First dimension of the random premise/hypothesis
            tensors.
        batch_size: Number of items in the batch.
        vocab_size: Exclusive upper bound of the random word indices.

    Returns:
        A tuple ``(labels, prem, prem_trans, hypo, hypo_trans)`` of int64
        tensors, copied to the GPU when ``tfe.num_gpus()`` is non-zero.
    """
    # Fixed transition pattern used for both premise and hypothesis; it is
    # repeated once per batch item and transposed so batch is the last axis.
    transition_row = [3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3, 2, 3, 3, 2, 2, 3,
                      3, 3, 2, 2, 2, 2, 3, 2, 2]
    token_shape = (sequence_length, batch_size)

    with tf.device("cpu:0"):
        labels = tf.random_uniform(
            [batch_size], minval=1, maxval=4, dtype=tf.int64)
        prem = tf.random_uniform(
            token_shape, maxval=vocab_size, dtype=tf.int64)
        prem_trans = tf.constant(
            np.array([transition_row] * batch_size, dtype=np.int64).T)
        hypo = tf.random_uniform(
            token_shape, maxval=vocab_size, dtype=tf.int64)
        hypo_trans = tf.constant(
            np.array([transition_row] * batch_size, dtype=np.int64).T)

    batch = (labels, prem, prem_trans, hypo, hypo_trans)
    if tfe.num_gpus():
        batch = tuple(tensor.gpu() for tensor in batch)
    return batch
def main(_):
    """Run the eager linear-regression example end to end.

    Generates noisy samples from known weights and bias, trains a
    ``LinearModel`` with gradient descent, and prints the fitted values.

    Args:
        _: Ignored positional argument (never referenced in the body).
    """
    tfe.enable_eager_execution()

    # Known parameters of the data-generating process.
    true_w, true_b = [[-2.0], [4.0], [1.0]], [0.5]
    noise_level = 0.01

    # Optimisation settings.
    batch_size, learning_rate = 64, 0.1

    print("True w: %s" % true_w)
    print("True b: %s\n" % true_b)

    model = LinearModel()
    dataset = synthetic_dataset(true_w, true_b, noise_level, batch_size, 20)

    device = "cpu:0" if not tfe.num_gpus() else "gpu:0"
    print("Using device: %s" % device)

    with tf.device(device):
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        fit(model, dataset, optimizer, verbose=True, logdir=FLAGS.logdir)

    # variables[0] is reported as w and variables[1] as b.
    for message, index in (("\nAfter training: w = %s", 0),
                           ("\nAfter training: b = %s", 1)):
        print(message % model.variables[index].numpy())
def train():
    """Train the ERCNN emotion classifier, plot its losses, print accuracy.

    Checkpoints are saved to / restored from
    ``models_checkpoints/EmotionCNN/``. After fitting, the train and eval
    loss histories are plotted and both accuracies are printed.
    """
    # Where trained variables are saved to and restored from.
    checkpoint_directory = 'models_checkpoints/EmotionCNN/'

    # Prefer the first GPU when one is available.
    if tfe.num_gpus() > 0:
        device = 'gpu:0'
    else:
        device = 'cpu:0'

    optimizer = tf.train.AdamOptimizer()

    # Instantiating the model does not initialize its variables yet.
    model = ERCNN(num_classes=7,
                  device=device,
                  checkpoint_directory=checkpoint_directory)

    training_data, eval_data = data_process()

    model.fit(training_data,
              eval_data,
              optimizer,
              num_epochs=500,
              early_stopping_rounds=5,
              verbose=10,
              train_from_scratch=False)
    model.save_model()

    # Loss curves: train in blue, eval ("Dev") in red.
    curves = (('train_loss', 'b', 'Train loss'),
              ('eval_loss', 'r', 'Dev loss'))
    for history_key, colour, legend_label in curves:
        losses = model.history[history_key]
        plt.plot(range(len(losses)), losses, color=colour,
                 label=legend_label)
    plt.title('Model performance during training', fontsize=15)
    plt.xlabel('Number of epochs', fontsize=15)
    plt.ylabel('Loss', fontsize=15)
    plt.legend(fontsize=15)
    plt.show()

    train_acc = model.compute_accuracy(training_data)
    eval_acc = model.compute_accuracy(eval_data)
    print('Train accuracy: ', train_acc.result().numpy())
    print('Eval accuracy: ', eval_acc.result().numpy())
def benchmarkEagerLinearRegression(self):
    """Benchmark ten epochs of eager linear-regression training.

    Each epoch is one ``linear_regression.fit`` pass over a synthetic
    dataset of 200 batches of 64 examples; an untimed burn-in fit on the
    first 10 batches runs beforehand.
    """
    num_epochs, num_batches, batch_size = 10, 200, 64
    dataset = linear_regression.synthetic_dataset(
        w=tf.random_uniform([3, 1]),
        b=tf.random_uniform([1]),
        noise_level=0.01,
        batch_size=batch_size,
        num_batches=num_batches)
    warmup_dataset = dataset.take(10)

    model = linear_regression.LinearModel()

    with tf.device(device()):
        sgd = tf.train.GradientDescentOptimizer(learning_rate=0.1)

        # Untimed burn-in.
        linear_regression.fit(model, warmup_dataset, sgd)

        began = time.time()
        epochs_left = num_epochs
        while epochs_left > 0:
            linear_regression.fit(model, dataset, sgd)
            epochs_left -= 1
        wall_time = time.time() - began

        throughput = num_epochs * num_batches * batch_size / wall_time
        device_tag = "gpu" if tfe.num_gpus() > 0 else "cpu"
        self.report_benchmark(
            name="eager_train_%s" % device_tag,
            iters=num_epochs * num_batches,
            extras={"examples_per_sec": throughput},
            wall_time=wall_time)
def _report(self, test_name, start, num_iters, batch_size):
    """Report average per-iteration time and throughput for a benchmark.

    Args:
        test_name: Label embedded in the reported benchmark name.
        start: ``time.time()`` value taken when the timed loop began.
        num_iters: Number of iterations executed since ``start``.
        batch_size: Examples processed per iteration.
    """
    elapsed = time.time() - start
    avg_time = elapsed / num_iters

    if tfe.num_gpus():
        dev = 'gpu'
    else:
        dev = 'cpu'

    name = 'eager_%s_%s_batch_%d_%s' % (
        test_name, dev, batch_size, data_format())
    throughput = batch_size / avg_time

    self.report_benchmark(
        iters=num_iters,
        wall_time=avg_time,
        name=name,
        extras={'examples_per_sec': throughput})
def main(_):
    """Train RetinaNet on a GPU, checkpointing whenever validation improves.

    Args:
        _: Ignored positional argument (never referenced in the body).
    """
    assert tfe.num_gpus() > 0, 'Make sure the GPU device exists'
    device_name = '/gpu:{}'.format(args.cuda_device)
    print('\n==> ==> ==> Using device {}'.format(device_name))

    # Load the train and validation datasets with identical settings.
    # TODO edit this when in real training
    dataset_options = dict(num_epochs=1,
                           batch_size=conf.batch_size,
                           buffer_size=10000)
    train_ds = dataset_generator('train', conf.input_size, **dataset_options)
    val_ds = dataset_generator('val', conf.input_size, **dataset_options)

    # Create the model and optimizer
    model = RetinaNet()
    optimizer = tf.train.RMSPropOptimizer(conf.learning_rate)

    # TensorBoard summaries go to <summary_dir>/train and <summary_dir>/val.
    train_dir = os.path.join(conf.summary_dir, 'train')
    val_dir = os.path.join(conf.summary_dir, 'val')
    tf.gfile.MakeDirs(conf.summary_dir)

    train_summary_writer = tf.contrib.summary.create_summary_file_writer(
        train_dir, flush_millis=10000, name='train')
    val_summary_writer = tf.contrib.summary.create_summary_file_writer(
        val_dir, flush_millis=10000, name='val')
    checkpoint_prefix = os.path.join(conf.checkpoint_dir, 'ckpt')

    latest = tf.train.latest_checkpoint(conf.checkpoint_dir)
    with tfe.restore_variables_on_create(latest):
        with tf.device(device_name):
            # epoch and best_loss are variables so they are saved in, and
            # restored from, the checkpoint together with the model.
            epoch = tfe.Variable(1., name='epoch')
            best_loss = tfe.Variable(tf.float32.max, name='best_loss')
            print('==> ==> ==> Start training from epoch {:.0f}...\n'.format(
                epoch.numpy()))

            while epoch <= conf.num_epochs + 1:
                gs = tf.train.get_or_create_global_step()

                with train_summary_writer.as_default():
                    train_one_epoch(model, optimizer, train_ds,
                                    epoch.numpy())

                with val_summary_writer.as_default():
                    eval_loss = validate(model, val_ds, epoch.numpy())

                # NOTE(review): the source formatting was lost; the save is
                # assumed to happen only when the validation loss improves.
                if eval_loss < best_loss:
                    # assign() the value; do NOT rebind best_loss directly
                    # (that would be a shallow copy).
                    best_loss.assign(eval_loss)
                    all_variables = (model.variables +
                                     optimizer.variables() +
                                     [gs] + [epoch] + [best_loss])
                    saver = tfe.Saver(all_variables)
                    saver.save(checkpoint_prefix, global_step=gs)

                epoch.assign_add(1)
def train(self, x, y, val, execution_mode=None):
    """Train ``self.model`` with Adagrad, early stopping and a CSV log.

    For every epoch the training data is iterated batch by batch, the model
    is scored on the validation data, the best weights so far are saved, and
    one row of metrics is recorded. Training stops early once the validation
    score has failed to improve for more than 10 epochs. The log is written
    to ``performance_log.csv`` when training ends.

    Args:
        x: Training features, passed to ``make_iterator``.
        y: Training targets, passed to ``make_iterator`` and to ``f1``.
        val: Indexable pair; ``val[0]`` and ``val[1]`` are passed to
            ``self.validate``.
        execution_mode: Currently unused (the code that consumed it is
            commented out).
    """
    device = '/gpu:0' if tfe.num_gpus() else '/cpu:0'
    # NOTE: the '*_roc' keys are kept for compatibility with existing logs;
    # the train value stored under 'train_roc' is actually computed by f1().
    log = {
        'epoch_list': [],
        'train_binary_crossentropy': [],
        'train_roc': [],
        'val_roc': []
    }
    # with tfe.execution_mode(execution_mode):
    optimizer = tf.train.AdagradOptimizer(self.lr)
    weight_file = '/nn_modules/best_eager_MLP_weight'
    no_improve = 0  # epochs without improvement, used for early stopping
    max_score = 0
    with tf.device(device):  # select the hardware device
        for epoch in range(self.epoch):  # epoch loop
            # Build a fresh iterator over the training data.
            train_iterator = make_iterator(
                (x, y), batch_size=self.batch_size)
            loss_history = []
            full_y_pred = []
            while True:  # batch loop; the exception below ends the epoch
                try:
                    batch_x, batch_y = train_iterator.get_next()
                    grads, loss, batch_pre = self.compute_gradients_and_loss(
                        batch_x, batch_y)
                    self.apply_gradients(optimizer, grads)
                    loss_history.append(loss.numpy())
                    full_y_pred.append(batch_pre.numpy())
                    # tfe.async_wait()
                except tf.errors.OutOfRangeError:
                    break
            full_y_pred = np.concatenate(full_y_pred)

            val_score = self.validate(val[0], val[1])
            if val_score > max_score:
                max_score = val_score
                no_improve = 0
                self.model.save_weights(weight_file)
            else:
                no_improve += 1
                if no_improve > 10:
                    # Report the epoch at which training stops (the original
                    # printed the patience counter here instead).
                    print("early stop at epoch %d" % (epoch))
                    # NOTE(review): in the original the reload followed the
                    # `break`; it is assumed the intent was to restore the
                    # best weights when stopping early -- confirm.
                    self.model.load_weights(weight_file)
                    break

            train_score = f1(y, full_y_pred)
            epoch_loss = np.mean(loss_history)
            log['epoch_list'].append(epoch)
            log['train_binary_crossentropy'].append(epoch_loss)
            log['train_roc'].append(train_score)
            log['val_roc'].append(val_score)
            print(
                "epoch=%d,loss=%.6f,train_roc=%.6f,val_roc=%.6f,time=%s" %
                (epoch, epoch_loss, train_score, val_score, time.asctime()))

    datafe = pd.DataFrame(log)
    datafe.to_csv('performance_log.csv')
def _report(self, test_name, start, num_iters, batch_size):
    """Publish the mean iteration time of a benchmark run.

    Args:
        test_name: Label embedded in the reported benchmark name.
        start: ``time.time()`` value taken when the timed loop began.
        num_iters: Number of iterations executed since ``start``.
        batch_size: Examples processed per iteration; also used to derive
            the reported examples-per-second figure.
    """
    avg_time = (time.time() - start) / num_iters
    dev = 'cpu' if not tfe.num_gpus() else 'gpu'
    name_parts = (test_name, dev, batch_size, data_format())
    name = 'eager_%s_%s_batch_%d_%s' % name_parts
    extras = dict(examples_per_sec=batch_size / avg_time)
    self.report_benchmark(
        iters=num_iters, wall_time=avg_time, name=name, extras=extras)
def main():
    """Enable eager execution and start training on the best device.

    ``train`` receives ``'/gpu:0'`` when at least one GPU is reported and
    ``'/cpu:0'`` otherwise.
    """
    tfe.enable_eager_execution()

    # Fall back to the CPU when no GPU is available.
    device = '/cpu:0' if tfe.num_gpus() <= 0 else '/gpu:0'

    train(device)
def __init__(self, param_dict):
    """Store hyper-parameters, build the model and the padding table.

    Args:
        param_dict: Configuration mapping. The keys read here are ``lr``,
            ``epoch``, ``batch_size``, ``val_batch_size``, ``drop_rate``,
            ``reg_rate``, ``weight_file_path``, ``pre_train``,
            ``feature_name`` (an object exposing ``feature_all()``),
            ``appId_list_encoded_length`` and ``size_of_space`` (a mapping
            with ``max_usage_len``, ``max_cate_len`` and
            ``max_usage_full_len``).
    """
    self.param = param_dict
    print(self.param)
    self.lr = param_dict['lr']
    self.epoch = param_dict['epoch']
    self.batch_size = param_dict['batch_size']
    self.val_batch_size = param_dict['val_batch_size']
    self.drop_rate = param_dict['drop_rate']
    self.reg_rate = param_dict['reg_rate']
    self.weight_file_path = param_dict['weight_file_path']
    self.pre_train = param_dict['pre_train']
    self.device = '/gpu:0' if tfe.num_gpus() else '/cpu:0'

    # NOTE(review): the source formatting was lost; the loss and the model
    # are assumed to be created inside the default-session context.
    with tf.Session().as_default() as sess:
        self.sess = sess
        self.loss = tf.keras.losses.categorical_crossentropy
        # with tf.Graph().as_default():
        self.model = czx_NN_subclass(param_dict)

    # padded_dict maps each feature name to its padding spec: [1] for every
    # feature by default, overridden below for the list-valued features.
    self.padded_dict = {}
    for col in self.param['feature_name'].feature_all():
        # Compare by value: `is not` tests object identity, which is not
        # guaranteed to hold for equal strings.
        if col != 'appId_list_encoded':
            self.padded_dict[col] = [1]
    self.padded_dict['appId_list_encoded'] = self.param[
        'appId_list_encoded_length']

    # (feature name, key into param['size_of_space']) in the original
    # assignment order.
    space = self.param['size_of_space']
    padded_lengths = (
        ('usage_appId_list', 'max_usage_len'),
        ('usage_duration_list', 'max_usage_len'),
        ('usage_times_list', 'max_usage_len'),
        ('usage_use_date_list', 'max_usage_len'),
        ('all_activedApp_cate_list', 'max_cate_len'),
        ('usage_appId_duration_list', 'max_usage_len'),
        ('usage_appId_times_list', 'max_usage_len'),
        ('usage_appId_mean_dura_list', 'max_usage_len'),
        ('usage_appId_full_list', 'max_usage_full_len'),
        ('usage_duration_full_list', 'max_usage_full_len'),
        ('usage_time_full_list', 'max_usage_full_len'),
    )
    for feature, size_key in padded_lengths:
        self.padded_dict[feature] = space[size_key]
def main(_):
    """Train the GAN from TFRecord files, checkpointing after every epoch.

    Args:
        _: Ignored positional argument (never referenced in the body).
    """
    printer = pprint.PrettyPrinter()
    printer.pprint(flags.FLAGS.__flags)

    filenames = glob.glob(data_dir)

    # Default to the GPU layout; fall back to the CPU layout when the GPU is
    # disabled by flag or absent.
    use_cpu = FLAGS.no_gpu or tfe.num_gpus() <= 0
    if use_cpu:
        device, data_format = '/cpu:0', 'channels_last'
    else:
        device, data_format = '/gpu:0', 'channels_first'
    print('Using device %s, and data format %s.' % (device, data_format))

    # Make sure the checkpoint and sample directories exist.
    for directory in (FLAGS.checkpoint_dir, FLAGS.sample_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Everything that must be both checkpointed and passed to the trainer.
    model_objects = {
        'generator': Generator(data_format),
        'discriminator': Discriminator(data_format),
        'generator_optimizer': tf.train.AdamOptimizer(
            FLAGS.generator_learning_rate, FLAGS.beta1, FLAGS.beta2),
        'discriminator_optimizer': tf.train.AdamOptimizer(
            FLAGS.discriminator_learning_rate, FLAGS.beta1, FLAGS.beta2),
        'step_counter': tf.train.get_or_create_global_step()
    }

    summary_writer = tf.contrib.summary.create_file_writer(
        FLAGS.summary_dir, flush_millis=1000)

    checkpoint = tfe.Checkpoint(**model_objects)
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
    latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if latest_cpkt:
        print('Using latest checkpoint at ' + latest_cpkt)
    checkpoint.restore(latest_cpkt)

    records = tf.data.TFRecordDataset(filenames)
    dataset = records.map(read_and_decode_with_labels)
    dataset = dataset.shuffle(10000)
    dataset = dataset.apply(
        tf.contrib.data.batch_and_drop_remainder(FLAGS.batch_size))

    with tf.device(device):
        for epoch in range(FLAGS.epoch):
            began = time.time()
            with summary_writer.as_default():
                train_one_epoch(dataset=dataset,
                                batch_size=FLAGS.batch_size,
                                log_interval=FLAGS.log_interval,
                                z_dim=FLAGS.z_dim,
                                device=device,
                                epoch=epoch,
                                **model_objects)
            finished = time.time()
            checkpoint.save(checkpoint_prefix)
            print('\nTrain time for epoch #%d (step %d): %f' %
                  (checkpoint.save_counter.numpy(),
                   checkpoint.step_counter.numpy(),
                   finished - began))
def __init__(self, param_dict):
    """Copy the hyper-parameters onto the instance and build the model.

    Args:
        param_dict: Configuration mapping; ``lr``, ``epoch``,
            ``batch_size``, ``drop_rate`` and ``reg_rate`` are read here and
            the whole mapping is forwarded to ``czx_NN_subclass``.
    """
    # Each listed key becomes an attribute of the same name.
    for key in ('lr', 'epoch', 'batch_size', 'drop_rate', 'reg_rate'):
        setattr(self, key, param_dict[key])

    if tfe.num_gpus():
        self.device = '/gpu:0'
    else:
        self.device = '/cpu:0'

    self.model = czx_NN_subclass(param_dict)
def main(_):
    """Train and evaluate the MNIST model for 10 epochs with checkpoints.

    Args:
        _: Ignored positional argument (never referenced in the body).
    """
    tfe.enable_eager_execution()

    # GPU layout unless the GPU is disabled by flag or absent.
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        device, data_format = '/cpu:0', 'channels_last'
    else:
        device, data_format = '/gpu:0', 'channels_first'
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    train_ds = dataset.train(FLAGS.data_dir)
    train_ds = train_ds.shuffle(60000).batch(FLAGS.batch_size)
    test_ds = dataset.test(FLAGS.data_dir).batch(FLAGS.batch_size)

    # Create the model and optimizer
    model = mnist.Model(data_format)
    optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)

    # Summaries are written under <output_dir>/train and <output_dir>/eval
    # (view them with: tensorboard --logdir=<output_dir>). Without an
    # output_dir both writers are created with a None directory.
    train_dir = test_dir = None
    if FLAGS.output_dir:
        train_dir = os.path.join(FLAGS.output_dir, 'train')
        test_dir = os.path.join(FLAGS.output_dir, 'eval')
        tf.gfile.MakeDirs(FLAGS.output_dir)

    summary_writer = tf.contrib.summary.create_file_writer(
        train_dir, flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')

    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tfe.Checkpoint(
        model=model, optimizer=optimizer, step_counter=step_counter)
    # Restore variables on creation if a checkpoint exists.
    latest = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    checkpoint.restore(latest)

    with tf.device(device):
        for _ in range(10):
            began = time.time()
            with summary_writer.as_default():
                train(model, optimizer, train_ds, step_counter,
                      FLAGS.log_interval)
            elapsed = time.time() - began
            # save_counter only advances in checkpoint.save() below.
            print('\nTrain time for epoch #%d (%d total steps): %f' %
                  (checkpoint.save_counter.numpy() + 1,
                   step_counter.numpy(),
                   elapsed))
            with test_summary_writer.as_default():
                test(model, test_ds)
            checkpoint.save(checkpoint_prefix)
def main(_):
    """Train and evaluate the MNIST model, saving variables every epoch.

    Runs epochs 1 through 10. Variables created during an epoch are
    restored from the latest checkpoint in ``FLAGS.checkpoint_dir``.

    Args:
        _: Ignored positional argument (never referenced in the body).
    """
    tfe.enable_eager_execution()

    use_cpu = FLAGS.no_gpu or tfe.num_gpus() <= 0
    if use_cpu:
        device, data_format = '/cpu:0', 'channels_last'
    else:
        device, data_format = '/gpu:0', 'channels_first'
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    train_ds = dataset.train(FLAGS.data_dir).shuffle(60000)
    train_ds = train_ds.batch(FLAGS.batch_size)
    test_ds = dataset.test(FLAGS.data_dir).batch(FLAGS.batch_size)

    # Create the model and optimizer
    model = mnist.Model(data_format)
    optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)

    # Summary directories (tensorboard --logdir=<output_dir>); None when no
    # output_dir was given.
    train_dir = test_dir = None
    if FLAGS.output_dir:
        train_dir = os.path.join(FLAGS.output_dir, 'train')
        test_dir = os.path.join(FLAGS.output_dir, 'eval')
        tf.gfile.MakeDirs(FLAGS.output_dir)

    summary_writer = tf.contrib.summary.create_file_writer(
        train_dir, flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')

    with tf.device(device):
        epoch = 1
        while epoch < 11:
            latest = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
            with tfe.restore_variables_on_create(latest):
                global_step = tf.train.get_or_create_global_step()
                began = time.time()
                with summary_writer.as_default():
                    train(model, optimizer, train_ds, FLAGS.log_interval)
                finished = time.time()
                print('\nTrain time for epoch #%d (global step %d): %f' %
                      (epoch, global_step.numpy(), finished - began))

            with test_summary_writer.as_default():
                test(model, test_ds)

            all_variables = (model.variables + optimizer.variables() +
                             [global_step])
            saver = tfe.Saver(all_variables)
            saver.save(checkpoint_prefix, global_step=global_step)
            epoch += 1
def main(_):
    """Train the GAN for 100 epochs, saving all variables after each one.

    Args:
        _: Ignored positional argument (never referenced in the body).
    """
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        device, data_format = '/cpu:0', 'channels_last'
    else:
        device, data_format = '/gpu:0', 'channels_first'
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    data = input_data.read_data_sets(FLAGS.data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(data.train.images)
    dataset = dataset.shuffle(60000)
    dataset = dataset.batch(FLAGS.batch_size)

    # Create the models and optimizers
    generator = Generator(data_format)
    discriminator = Discriminator(data_format)
    with tf.variable_scope('generator'):
        generator_optimizer = tf.train.AdamOptimizer(FLAGS.lr)
    with tf.variable_scope('discriminator'):
        discriminator_optimizer = tf.train.AdamOptimizer(FLAGS.lr)

    # Prepare summary writer and checkpoint info
    summary_writer = tf.contrib.summary.create_summary_file_writer(
        FLAGS.output_dir, flush_millis=1000)
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
    latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if latest_cpkt:
        print('Using latest checkpoint at ' + latest_cpkt)

    with tf.device(device):
        for epoch in range(1, 101):
            # The checkpoint path was looked up once, before the loop.
            with tfe.restore_variables_on_create(latest_cpkt):
                global_step = tf.train.get_or_create_global_step()
                began = time.time()
                with summary_writer.as_default():
                    train_one_epoch(generator, discriminator,
                                    generator_optimizer,
                                    discriminator_optimizer,
                                    dataset, FLAGS.log_interval,
                                    FLAGS.noise)
                finished = time.time()
                print('\nTrain time for epoch #%d (global step %d): %f' % (
                    epoch, global_step.numpy(), finished - began))

            all_variables = (
                generator.variables
                + discriminator.variables
                + generator_optimizer.variables()
                + discriminator_optimizer.variables()
                + [global_step])
            saver = tfe.Saver(all_variables)
            saver.save(checkpoint_prefix, global_step=global_step)
def test_spinn(embed, test_data, config):
    """Evaluate a checkpointed SPINN model on the test split and print results.

    The model is restored from the latest checkpoint found in
    ``config.logdir`` and evaluated once via ``_evaluate_on_dataset``. One
    formatted log line (loss and accuracy) and the F1 value are printed.
    Nothing is returned.

    Args:
        embed: The embedding matrix as a float32 numpy array with shape
            [vocabulary_size, word_vector_len]. word_vector_len is the
            length of a word embedding vector.
        test_data: An instance of `data_chemprot.ChemprotData`, for the test
            split.
        config: A configuration object. See the argument to this Python
            binary for details.
    """
    use_gpu = tfe.num_gpus() > 0 and not config.force_cpu
    device = "gpu:0" if use_gpu else "cpu:0"
    print("Using device: %s" % device)

    log_header = (
        " Time Epoch Iteration Progress (%Epoch) Loss Dev/Loss"
        " Accuracy Dev/Accuracy")
    dev_log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} "
        "{:8.6f} {:12.4f} {:12.4f}")

    summary_writer = tf.contrib.summary.create_file_writer(
        config.logdir, flush_millis=10000)

    with tf.device(device), \
         summary_writer.as_default(), \
         tf.contrib.summary.always_record_summaries():
        model = ChemprotClassifier(config, embed)

        # Restore the model from the most recent checkpoint (looked up once
        # and reused for both the message and the restore).
        latest_checkpoint = tf.train.latest_checkpoint(config.logdir)
        print("Latest checkpoint", latest_checkpoint)
        tfe.restore_network_checkpoint(model, latest_checkpoint)

        start = time.time()
        print(log_header)

        # Evaluate on the split passed in by the caller. The original
        # referenced `dev_data`, which is not defined in this function,
        # leaving the `test_data` argument unused.
        (dev_loss, dev_frac_correct, dev_f1, dev_lables, dev_logits,
         dev_pmids, dev_ent1s, dev_ent2s) = _evaluate_on_dataset(
             test_data, config.batch_size, model, use_gpu)

        # Only the elapsed-time, loss and accuracy columns carry real values;
        # the remaining columns are constant placeholders.
        print(
            dev_log_template.format(time.time() - start, 0, 0, 1, 0, 1 / 1,
                                    0, dev_loss, 0,
                                    dev_frac_correct * 100.0))
        print(dev_f1)
def main(_):
    """Train and evaluate ``MNISTModel`` for epochs 1-10, saving each epoch.

    Args:
        _: Ignored positional argument (never referenced in the body).
    """
    tfe.enable_eager_execution()

    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        device, data_format = '/cpu:0', 'channels_last'
    else:
        device, data_format = '/gpu:0', 'channels_first'
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets; only the training split is shuffled and batched
    # here.
    train_ds, test_ds = load_data(FLAGS.data_dir)
    train_ds = train_ds.shuffle(60000)
    train_ds = train_ds.batch(FLAGS.batch_size)

    # Create the model and optimizer
    model = MNISTModel(data_format)
    optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)

    # Summary directories; both stay None when no output_dir was given.
    train_dir = test_dir = None
    if FLAGS.output_dir:
        train_dir = os.path.join(FLAGS.output_dir, 'train')
        test_dir = os.path.join(FLAGS.output_dir, 'eval')
        tf.gfile.MakeDirs(FLAGS.output_dir)

    summary_writer = tf.contrib.summary.create_summary_file_writer(
        train_dir, flush_secs=10)
    test_summary_writer = tf.contrib.summary.create_summary_file_writer(
        test_dir, flush_secs=10, name='test')
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')

    with tf.device(device):
        for epoch in range(1, 11):
            latest = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
            with tfe.restore_variables_on_create(latest):
                global_step = tf.train.get_or_create_global_step()
                began = time.time()
                with summary_writer.as_default():
                    train_one_epoch(model, optimizer, train_ds,
                                    FLAGS.log_interval)
                finished = time.time()
                print('\nTrain time for epoch #%d (global step %d): %f' % (
                    epoch, global_step.numpy(), finished - began))

            with test_summary_writer.as_default():
                test(model, test_ds)

            all_variables = (
                model.variables
                + tfe.get_optimizer_variables(optimizer)
                + [global_step])
            saver = tfe.Saver(all_variables)
            saver.save(checkpoint_prefix, global_step=global_step)
def main(_):
    """Run 10 train/evaluate epochs of ``MNISTModel`` with per-epoch saves.

    Args:
        _: Ignored positional argument (never referenced in the body).
    """
    tfe.enable_eager_execution()

    use_cpu = FLAGS.no_gpu or tfe.num_gpus() <= 0
    device = '/cpu:0' if use_cpu else '/gpu:0'
    data_format = 'channels_last' if use_cpu else 'channels_first'
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    (train_ds, test_ds) = load_data(FLAGS.data_dir)
    train_ds = train_ds.shuffle(60000).batch(FLAGS.batch_size)

    # Create the model and optimizer
    model = MNISTModel(data_format)
    optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)

    # Summary directories; None when no output_dir was given.
    if not FLAGS.output_dir:
        train_dir = None
        test_dir = None
    else:
        train_dir = os.path.join(FLAGS.output_dir, 'train')
        test_dir = os.path.join(FLAGS.output_dir, 'eval')
        tf.gfile.MakeDirs(FLAGS.output_dir)

    summary_writer = tf.contrib.summary.create_file_writer(
        train_dir, flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')

    with tf.device(device):
        for epoch in range(1, 11):
            # Variables created during the epoch are restored from the most
            # recent checkpoint, looked up afresh for every epoch.
            with tfe.restore_variables_on_create(
                    tf.train.latest_checkpoint(FLAGS.checkpoint_dir)):
                global_step = tf.train.get_or_create_global_step()
                began = time.time()
                with summary_writer.as_default():
                    train_one_epoch(model, optimizer, train_ds,
                                    FLAGS.log_interval)
                elapsed = time.time() - began
                print('\nTrain time for epoch #%d (global step %d): %f' % (
                    epoch, global_step.numpy(), elapsed))

            with test_summary_writer.as_default():
                test(model, test_ds)

            to_save = model.variables + optimizer.variables() + [global_step]
            tfe.Saver(to_save).save(
                checkpoint_prefix, global_step=global_step)
def main():
    """Turn on eager execution, pick a device and hand it to ``train``."""
    tfe.enable_eager_execution()

    # Use the first GPU when available, otherwise the CPU.
    if tfe.num_gpus() <= 0:
        device = '/cpu:0'
    else:
        device = '/gpu:0'

    train(device)
    return
def main(_):
    """Run 100 GAN training epochs, writing a Saver checkpoint after each.

    Args:
        _: Ignored positional argument (never referenced in the body).
    """
    use_cpu = FLAGS.no_gpu or tfe.num_gpus() <= 0
    device = '/cpu:0' if use_cpu else '/gpu:0'
    data_format = 'channels_last' if use_cpu else 'channels_first'
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    data = input_data.read_data_sets(FLAGS.data_dir)
    images = tf.data.Dataset.from_tensor_slices(data.train.images)
    dataset = images.shuffle(60000).batch(FLAGS.batch_size)

    # Create the models and optimizers
    generator = Generator(data_format)
    discriminator = Discriminator(data_format)
    with tf.variable_scope('generator'):
        generator_optimizer = tf.train.AdamOptimizer(FLAGS.lr)
    with tf.variable_scope('discriminator'):
        discriminator_optimizer = tf.train.AdamOptimizer(FLAGS.lr)

    # Prepare summary writer and checkpoint info
    summary_writer = tf.contrib.summary.create_summary_file_writer(
        FLAGS.output_dir, flush_millis=1000)
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
    latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if latest_cpkt:
        print('Using latest checkpoint at ' + latest_cpkt)

    with tf.device(device):
        epoch = 1
        while epoch <= 100:
            with tfe.restore_variables_on_create(latest_cpkt):
                global_step = tf.train.get_or_create_global_step()
                began = time.time()
                with summary_writer.as_default():
                    train_one_epoch(generator, discriminator,
                                    generator_optimizer,
                                    discriminator_optimizer, dataset,
                                    FLAGS.log_interval, FLAGS.noise)
                elapsed = time.time() - began
                print('\nTrain time for epoch #%d (global step %d): %f' %
                      (epoch, global_step.numpy(), elapsed))

            # Models first, then optimizers, then the step counter.
            all_variables = generator.variables + discriminator.variables
            all_variables = all_variables + generator_optimizer.variables()
            all_variables = (all_variables +
                             discriminator_optimizer.variables())
            all_variables = all_variables + [global_step]
            tfe.Saver(all_variables).save(checkpoint_prefix,
                                          global_step=global_step)
            epoch += 1
def main(_):
    """Train the GAN for 100 epochs using ``tfe.Checkpoint`` for saving.

    Args:
        _: Ignored positional argument (never referenced in the body).
    """
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        device, data_format = '/cpu:0', 'channels_last'
    else:
        device, data_format = '/gpu:0', 'channels_first'
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    data = input_data.read_data_sets(FLAGS.data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(data.train.images)
    dataset = dataset.shuffle(60000)
    dataset = dataset.batch(FLAGS.batch_size)

    # Models, optimizers and step counter: checkpointed as a group and also
    # forwarded to train_one_epoch as keyword arguments.
    model_objects = {
        'generator': Generator(data_format),
        'discriminator': Discriminator(data_format),
        'generator_optimizer': tf.train.AdamOptimizer(FLAGS.lr),
        'discriminator_optimizer': tf.train.AdamOptimizer(FLAGS.lr),
        'step_counter': tf.train.get_or_create_global_step(),
    }

    # Prepare summary writer and checkpoint info
    summary_writer = tf.contrib.summary.create_summary_file_writer(
        FLAGS.output_dir, flush_millis=1000)
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
    latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if latest_cpkt:
        print('Using latest checkpoint at ' + latest_cpkt)
    checkpoint = tfe.Checkpoint(**model_objects)
    # Restore variables on creation if a checkpoint exists.
    checkpoint.restore(latest_cpkt)

    with tf.device(device):
        for _ in range(100):
            began = time.time()
            with summary_writer.as_default():
                train_one_epoch(dataset=dataset,
                                log_interval=FLAGS.log_interval,
                                noise_dim=FLAGS.noise,
                                **model_objects)
            elapsed = time.time() - began
            checkpoint.save(checkpoint_prefix)
            print('\nTrain time for epoch #%d (step %d): %f' %
                  (checkpoint.save_counter.numpy(),
                   checkpoint.step_counter.numpy(),
                   elapsed))
def main(_):
    """Run 100 epochs of GAN training, checkpointing after every epoch.

    Device and data format follow `FLAGS.no_gpu` and GPU availability. All
    trainable objects live in `model_objects`, which feeds both the
    checkpoint and the per-epoch training call.

    Args:
        _: Unused positional argument.
    """
    cpu_only = FLAGS.no_gpu or tfe.num_gpus() <= 0
    device = '/cpu:0' if cpu_only else '/gpu:0'
    data_format = 'channels_last' if cpu_only else 'channels_first'
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    data = input_data.read_data_sets(FLAGS.data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(data.train.images)
    dataset = dataset.shuffle(60000)
    dataset = dataset.batch(FLAGS.batch_size)

    # Create the models and optimizers.
    model_objects = {
        'generator': Generator(data_format),
        'discriminator': Discriminator(data_format),
        'generator_optimizer': tf.train.AdamOptimizer(FLAGS.lr),
        'discriminator_optimizer': tf.train.AdamOptimizer(FLAGS.lr),
        'step_counter': tf.train.get_or_create_global_step(),
    }

    # Prepare summary writer and checkpoint info
    summary_writer = tf.contrib.summary.create_summary_file_writer(
        FLAGS.output_dir, flush_millis=1000)
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
    latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if latest_cpkt:
        print('Using latest checkpoint at ' + latest_cpkt)
    checkpoint = tfe.Checkpoint(**model_objects)
    # Restore variables on creation if a checkpoint exists.
    checkpoint.restore(latest_cpkt)

    with tf.device(device):
        for _ in range(100):
            epoch_start = time.time()
            with summary_writer.as_default():
                train_one_epoch(
                    dataset=dataset,
                    log_interval=FLAGS.log_interval,
                    noise_dim=FLAGS.noise,
                    **model_objects)
            epoch_end = time.time()
            checkpoint.save(checkpoint_prefix)
            # save_counter was just incremented by save(), so it doubles as
            # the number of the epoch that finished.
            print('\nTrain time for epoch #%d (step %d): %f' %
                  (checkpoint.save_counter.numpy(),
                   checkpoint.step_counter.numpy(),
                   epoch_end - epoch_start))
def benchmarkEagerSpinnSNLIClassifier(self):
    """Benchmark eager-mode training of the SPINN SNLI classifier.

    Builds a classifier over a random embedding matrix, trains on one fixed
    synthetic batch for a few untimed burn-in iterations, then times
    `benchmark_iterations` further training steps and reports the result
    through `self.report_benchmark`.
    """
    test_device = "gpu:0" if tfe.num_gpus() else "cpu:0"
    with tf.device(test_device):
        burn_in_iterations = 2
        benchmark_iterations = 10

        vocab_size = 1000
        batch_size = 128
        sequence_length = 15
        d_embed = 200
        d_out = 4

        embed = tf.random_normal((vocab_size, d_embed))

        config = _test_spinn_config(d_embed, d_out)
        model = spinn.SNLIClassifier(config, embed)
        trainer = spinn.SNLIClassifierTrainer(model, config.lr)

        (labels, prem, prem_trans, hypo,
         hypo_trans) = _generate_synthetic_snli_data_batch(
             sequence_length, batch_size, vocab_size)

        # Untimed warm-up so one-off initialization cost is not measured.
        for _ in range(burn_in_iterations):
            trainer.train_batch(labels, prem, prem_trans, hypo, hypo_trans)

        # Collect garbage up front so a GC pause is less likely to land in
        # the timed region.
        gc.collect()
        start_time = time.time()
        # `range` matches the burn-in loop above and needs no six.moves
        # import on Python 3; for 10 iterations the cost is the same on
        # Python 2.
        for _ in range(benchmark_iterations):
            trainer.train_batch(labels, prem, prem_trans, hypo, hypo_trans)
        wall_time = time.time() - start_time

        # Named "examples"_per_sec to conform with other benchmarks.
        # NOTE: the value is training iterations (batches) per second.
        extras = {"examples_per_sec": benchmark_iterations / wall_time}
        self.report_benchmark(
            name="Eager_SPINN_SNLIClassifier_Benchmark",
            iters=benchmark_iterations,
            wall_time=wall_time,
            extras=extras)
def device_and_data_format():
    """Return the `(device, data_format)` pair for the current machine.

    Returns:
        `('/gpu:0', 'channels_first')` when at least one GPU is reported,
        otherwise `('/cpu:0', 'channels_last')`.
    """
    if tfe.num_gpus():
        return ('/gpu:0', 'channels_first')
    return ('/cpu:0', 'channels_last')
def setUp(self):
    """Select the test device and create a temporary data directory."""
    super(SpinnTest, self).setUp()
    # Prefer the first GPU when one is reported; otherwise run on CPU.
    if tfe.num_gpus():
        self._test_device = "gpu:0"
    else:
        self._test_device = "cpu:0"
    self._temp_data_dir = tempfile.mkdtemp()
def device():
    """Return the device string for the first GPU if present, else the CPU."""
    if tfe.num_gpus():
        return "/device:GPU:0"
    return "/device:CPU:0"
def data_format():
    """Return "channels_first" when a GPU is reported, else "channels_last"."""
    if tfe.num_gpus():
        return "channels_first"
    return "channels_last"
def data_format():
    """Choose the image data format based on GPU availability.

    Returns:
        "channels_first" if `tfe.num_gpus()` is non-zero, otherwise
        "channels_last".
    """
    has_gpu = bool(tfe.num_gpus())
    if not has_gpu:
        return "channels_last"
    return "channels_first"
def device():
    """Choose the device string based on GPU availability.

    Returns:
        "/device:GPU:0" if `tfe.num_gpus()` is non-zero, otherwise
        "/device:CPU:0".
    """
    has_gpu = bool(tfe.num_gpus())
    if not has_gpu:
        return "/device:CPU:0"
    return "/device:GPU:0"
help='Predict the class of an input image', type=str) parser.add_argument('--test', help='Evaluate accuracy on the test set', action='store_true') parser.add_argument('--validation', help='Evaluate accuracy on the validation set', action='store_true') args = parser.parse_args() cfg = Configuration() net = AlexNet(cfg, training=False) testset = ImageNetDataset(cfg, 'test') if tfe.num_gpus() > 2: # set 2 to 0 if you want to run on the gpu # but currently running on gpu is impossible # because tf.in_top_k does not have a cuda implementation with tf.device('/gpu:0'): tester = Tester(cfg, net, testset) if args.classify: tester.classify_image(args.classify) elif args.validation: tester.test('validation') else: tester.test('test') else: tester = Tester(cfg, net, testset)
writer = tf.contrib.summary.create_file_writer("./tb/") writer.set_as_default() buffer = ExperienceBuffer(REPLAY_SIZE) agent = Agent(env, buffer) epsilon = EPSILON_START total_rewards = [] frame_idx = 0 ts_frame = 0 ts = time.time() best_mean_reward = None speed = 0 mean_reward = 0 device = "gpu:0" if tfe.num_gpus() else "cpu:0" print("Using device: %s" % device) with tf.device(device): optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE) while True: with tf.device('/cpu:0'): global_step.assign_add(1) frame_idx += 1 epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME) reward = agent.play_step(net, epsilon) if reward is not None: total_rewards.append(reward)
def main(argv):
    """Train and evaluate the MNIST CNN in eager mode.

    Parses command-line flags, enables eager execution, prints a summary of
    the run configuration, then alternates one training epoch with one
    evaluation pass for `flags.train_epochs` epochs, saving a checkpoint
    after each.

    Args:
        argv: Full argument vector; `argv[0]` is skipped when parsing.
    """
    parser = MNISTEagerArgParser()
    flags = parser.parse_args(args=argv[1:])

    # TF v1.7
    tfe.enable_eager_execution()

    # Automatically determine device and data_format
    if flags.no_gpu or tfe.num_gpus() <= 0:
        device, data_format = '/cpu:0', 'channels_last'
    else:
        device, data_format = '/gpu:0', 'channels_first'

    # If data_format is defined in FLAGS, overwrite automatically set value.
    if flags.data_format is not None:
        data_format = flags.data_format

    # Log Info
    thin_rule = "-" * 64
    print(thin_rule)
    print("TEST INFO - EAGER")
    print(thin_rule)
    print("TF version:\t {}".format(tf.__version__))
    print("Eager execution:\t {}".format(tf.executing_eagerly()))
    print("Dataset:\t MNIST")
    print("Model:\t CNN")
    print('Device:\t {}'.format(device))
    if data_format == 'channels_first':
        print("Data format:\t NCHW (channel first)")
    else:
        print("Data format:\t NHWC (channel last)")
    print("=" * 64)

    # Load the datasets
    train_ds = mnist_dataset.train(flags.data_dir)
    train_ds = train_ds.shuffle(60000).batch(flags.batch_size)
    test_ds = mnist_dataset.test(flags.data_dir).batch(flags.batch_size)

    # Create the model and optimizer
    # model = create_model(data_format)
    model = MNISTModel(data_format)
    optimizer = tf.train.MomentumOptimizer(flags.lr, flags.momentum)

    # Create file writers for writing TensorBoard summaries.
    train_dir = None
    test_dir = None
    if flags.output_dir:
        # Create directories to which summaries will be written
        # tensorboard --logdir=<output_dir>
        # can then be used to see the recorded summaries.
        train_dir = os.path.join(flags.output_dir, 'train')
        test_dir = os.path.join(flags.output_dir, 'eval')
        tf.gfile.MakeDirs(flags.output_dir)
    summary_writer = tf.contrib.summary.create_file_writer(
        train_dir, flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')

    # Create and restore checkpoint (if one exists on the path)
    checkpoint_prefix = os.path.join(flags.model_dir, 'ckpt')
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tfe.Checkpoint(
        model=model, optimizer=optimizer, step_counter=step_counter)
    # Restore variables on creation if a checkpoint exists.
    checkpoint.restore(tf.train.latest_checkpoint(flags.model_dir))

    # Train and evaluate for a set number of epochs.
    with tf.device(device):
        for _ in range(flags.train_epochs):
            start = time.time()
            with summary_writer.as_default():
                train(model, optimizer, train_ds, step_counter,
                      flags.log_interval)
            elapsed = time.time() - start
            # save_counter has not been bumped for this epoch yet, hence +1.
            epoch_number = checkpoint.save_counter.numpy() + 1
            print('\nTrain time for epoch #%d (%d total steps): %f' %
                  (epoch_number, step_counter.numpy(), elapsed))
            with test_summary_writer.as_default():
                test(model, test_ds)
            checkpoint.save(checkpoint_prefix)
def device():
    """Return '/gpu:0' when a GPU is reported, otherwise '/cpu:0'."""
    if tfe.num_gpus():
        return '/gpu:0'
    return '/cpu:0'
# data
OBJECTS = ['hand']
# Derived from OBJECTS so the class count cannot drift from the label list.
NUM_OBJECTS = len(OBJECTS)
MAX_DETECTIONS_PER_IMAGE = 1
VIDEO_IN = 'video.mp4'

# data preprocessing
DIM_OUTPUT_PER_GRID_PER_ANCHOR = 5 + NUM_OBJECTS
GRID_H, GRID_W = 13, 13
GRID_SIZE = 416 // GRID_H
ANCHORS = np.array([[0.09112895, 0.06958421],
                    [0.21102316, 0.16803947],
                    [0.42625895, 0.26609842],
                    [0.25476474, 0.49848],
                    [0.52668947, 0.59138947]])
NUM_ANCHORS = ANCHORS.shape[0]
# Map from [0, 1] space to grid-cell units ([0, GRID_H] x [0, GRID_W],
# i.e. [0, 13] for the 13x13 grid above).
ANCHORS *= np.array([GRID_H, GRID_W])
IMG_OUT_H, IMG_OUT_W = GRID_H * GRID_SIZE, GRID_W * GRID_SIZE

# prediction
CHECKPOINT_DIR = 'checkpoints'
CHECKPOINT_PREFIX = os.path.join(CHECKPOINT_DIR, "ckpt")
DIR_TEST = 'test_input'
DIR_IMGS_OUT = 'imgs_out'
THRESHOLD_OUT_PROB = 0.5
THRESHOLD_IOU_NMS = 0.5

# Select the compute device once at import time and announce the choice.
if tfe.num_gpus() > 0:
    DEVICE = '/gpu:0'
    print('Using GPU')
else:
    DEVICE = '/cpu:0'
    print('Using CPU')
def train_or_infer_spinn(embed,
                         word2index,
                         train_data,
                         dev_data,
                         test_data,
                         config):
    """Perform Training or Inference on a SPINN model.

    Args:
        embed: The embedding matrix as a float32 numpy array with shape
            [vocabulary_size, word_vector_len]. word_vector_len is the length
            of a word embedding vector.
        word2index: A `dict` mapping word to word index.
        train_data: An instance of `data.SnliData`, for the train split.
        dev_data: Same as above, for the dev split.
        test_data: Same as above, for the test split.
        config: A configuration object. See the argument to this Python
            binary for details.

    Returns:
        If `config.inference_premise` and `config.inference_hypothesis` are
        not `None`, i.e., inference mode: the logits for the possible labels
        of the SNLI data set, as a `Tensor` of three floats.
        else:
        The trainer object.

    Raises:
        ValueError: if only one of config.inference_premise and
            config.inference_hypothesis is specified.
    """
    # TODO(cais): Refactor this function into separate one for training and
    # inference.
    use_gpu = tfe.num_gpus() > 0 and not config.force_cpu
    device = "gpu:0" if use_gpu else "cpu:0"
    print("Using device: %s" % device)

    # Supplying exactly one of the two inference sentences is a usage error.
    if ((config.inference_premise and not config.inference_hypothesis) or
            (not config.inference_premise and config.inference_hypothesis)):
        raise ValueError(
            "--inference_premise and --inference_hypothesis must be both "
            "specified or both unspecified, but only one is specified.")

    if config.inference_premise:
        # Inference mode.
        inference_sentence_pair = [
            data.encode_sentence(config.inference_premise, word2index),
            data.encode_sentence(config.inference_hypothesis, word2index)]
    else:
        inference_sentence_pair = None

    log_header = (
        " Time Epoch Iteration Progress (%Epoch) Loss Dev/Loss"
        " Accuracy Dev/Accuracy")
    log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} {} "
        "{:12.4f} {}")
    dev_log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} "
        "{:8.6f} {:12.4f} {:12.4f}")

    summary_writer = tf.contrib.summary.create_file_writer(
        config.logdir, flush_millis=10000)

    with tf.device(device), \
            summary_writer.as_default(), \
            tf.contrib.summary.always_record_summaries():
        model = SNLIClassifier(config, embed)
        global_step = tf.train.get_or_create_global_step()
        trainer = SNLIClassifierTrainer(model, config.lr)
        checkpoint = tfe.Checkpoint(trainer=trainer, global_step=global_step)
        checkpoint.restore(tf.train.latest_checkpoint(config.logdir))

        if inference_sentence_pair:
            # Inference mode.
            # Each encoded sentence unpacks into (tokens, transitions); the
            # unpacking already binds hypo_trans, so no separate re-read of
            # inference_sentence_pair[1][1] is needed.
            prem, prem_trans = inference_sentence_pair[0]
            hypo, hypo_trans = inference_sentence_pair[1]
            inference_logits = model(
                tf.constant(prem),
                tf.constant(prem_trans),
                tf.constant(hypo),
                tf.constant(hypo_trans),
                training=False)
            # Keep the first row and drop its logit at index 0.
            # NOTE(review): presumably index 0 is a non-label slot; confirm
            # against data.POSSIBLE_LABELS.
            inference_logits = inference_logits[0][1:]
            max_index = tf.argmax(inference_logits)
            print("\nInference logits:")
            for i, (label, logit) in enumerate(
                    zip(data.POSSIBLE_LABELS, inference_logits)):
                winner_tag = " (winner)" if max_index == i else ""
                print(" {0:<16}{1:.6f}{2}".format(
                    label + ":", logit, winner_tag))
            return inference_logits

        train_len = train_data.num_batches(config.batch_size)
        start = time.time()
        iterations = 0
        mean_loss = tfe.metrics.Mean()
        accuracy = tfe.metrics.Accuracy()
        print(log_header)
        for epoch in xrange(config.epochs):
            batch_idx = 0
            for label, prem, prem_trans, hypo, hypo_trans in (
                    _get_dataset_iterator(train_data, config.batch_size)):
                if use_gpu:
                    label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
                    # prem_trans and hypo_trans are used for dynamic control
                    # flow and can remain on CPU. Same in
                    # _evaluate_on_dataset().

                iterations += 1
                batch_train_loss, batch_train_logits = trainer.train_batch(
                    label, prem, prem_trans, hypo, hypo_trans)
                # Weight each batch loss by its size when accumulating the
                # running mean.
                batch_size = tf.shape(label)[0]
                mean_loss(batch_train_loss.numpy(),
                          weights=batch_size.gpu() if use_gpu else batch_size)
                accuracy(tf.argmax(batch_train_logits, axis=1), label)

                if iterations % config.save_every == 0:
                    checkpoint.save(os.path.join(config.logdir, "ckpt"))

                if iterations % config.dev_every == 0:
                    dev_loss, dev_frac_correct = _evaluate_on_dataset(
                        dev_data, config.batch_size, trainer, use_gpu)
                    print(dev_log_template.format(
                        time.time() - start,
                        epoch, iterations, 1 + batch_idx, train_len,
                        100.0 * (1 + batch_idx) / train_len,
                        mean_loss.result(), dev_loss,
                        accuracy.result() * 100.0, dev_frac_correct * 100.0))
                    tf.contrib.summary.scalar("dev/loss", dev_loss)
                    tf.contrib.summary.scalar("dev/accuracy",
                                              dev_frac_correct)
                elif iterations % config.log_every == 0:
                    mean_loss_val = mean_loss.result()
                    accuracy_val = accuracy.result()
                    print(log_template.format(
                        time.time() - start,
                        epoch, iterations, 1 + batch_idx, train_len,
                        100.0 * (1 + batch_idx) / train_len,
                        mean_loss_val, " " * 8, accuracy_val * 100.0,
                        " " * 12))
                    tf.contrib.summary.scalar("train/loss", mean_loss_val)
                    tf.contrib.summary.scalar("train/accuracy", accuracy_val)
                    # Reset metrics.
                    mean_loss = tfe.metrics.Mean()
                    accuracy = tfe.metrics.Accuracy()

                batch_idx += 1
            if (epoch + 1) % config.lr_decay_every == 0:
                trainer.decay_learning_rate(config.lr_decay_by)

        test_loss, test_frac_correct = _evaluate_on_dataset(
            test_data, config.batch_size, trainer, use_gpu)
        print("Final test loss: %g; accuracy: %g%%" %
              (test_loss, test_frac_correct * 100.0))

    return trainer
def device_and_data_format():
    """Pick the device string and matching image data format.

    Returns:
        A 2-tuple `(device, data_format)`: GPU 0 with 'channels_first' when
        `tfe.num_gpus()` is non-zero, otherwise the CPU with 'channels_last'.
    """
    if not tfe.num_gpus():
        return ('/cpu:0', 'channels_last')
    return ('/gpu:0', 'channels_first')
def main(argv):
    """Train and evaluate the MNIST model in eager mode.

    Parses flags, enables eager execution, builds the datasets, model,
    optimizer, summary writers and checkpoint, then runs
    `flags.train_epochs` rounds of train / evaluate / save.

    Args:
        argv: Full argument vector; `argv[0]` is skipped when parsing.
    """
    parser = MNISTEagerArgParser()
    flags = parser.parse_args(args=argv[1:])

    tfe.enable_eager_execution()

    # Automatically determine device and data_format
    if flags.no_gpu or tfe.num_gpus() <= 0:
        device, data_format = '/cpu:0', 'channels_last'
    else:
        device, data_format = '/gpu:0', 'channels_first'
    # If data_format is defined in FLAGS, overwrite automatically set value.
    if flags.data_format is not None:
        data_format = flags.data_format
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    shuffled_train = mnist_dataset.train(flags.data_dir).shuffle(60000)
    train_ds = shuffled_train.batch(flags.batch_size)
    test_ds = mnist_dataset.test(flags.data_dir).batch(flags.batch_size)

    # Create the model and optimizer
    model = mnist.create_model(data_format)
    optimizer = tf.train.MomentumOptimizer(flags.lr, flags.momentum)

    # Create file writers for writing TensorBoard summaries.
    train_dir = test_dir = None
    if flags.output_dir:
        # Create directories to which summaries will be written
        # tensorboard --logdir=<output_dir>
        # can then be used to see the recorded summaries.
        train_dir = os.path.join(flags.output_dir, 'train')
        test_dir = os.path.join(flags.output_dir, 'eval')
        tf.gfile.MakeDirs(flags.output_dir)
    summary_writer = tf.contrib.summary.create_file_writer(
        train_dir, flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')

    # Create and restore checkpoint (if one exists on the path)
    checkpoint_prefix = os.path.join(flags.model_dir, 'ckpt')
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tfe.Checkpoint(
        model=model, optimizer=optimizer, step_counter=step_counter)
    # Restore variables on creation if a checkpoint exists.
    latest_checkpoint = tf.train.latest_checkpoint(flags.model_dir)
    checkpoint.restore(latest_checkpoint)

    # Train and evaluate for a set number of epochs.
    with tf.device(device):
        for _ in range(flags.train_epochs):
            epoch_start = time.time()
            with summary_writer.as_default():
                train(model, optimizer, train_ds, step_counter,
                      flags.log_interval)
            epoch_end = time.time()
            # The save for this epoch happens below, so the 1-based epoch
            # number is save_counter + 1.
            print('\nTrain time for epoch #%d (%d total steps): %f' %
                  (checkpoint.save_counter.numpy() + 1,
                   step_counter.numpy(),
                   epoch_end - epoch_start))
            with test_summary_writer.as_default():
                test(model, test_ds)
            checkpoint.save(checkpoint_prefix)
def train_or_infer_spinn(embed,
                         word2index,
                         train_data,
                         dev_data,
                         test_data,
                         config):
    """Perform Training or Inference on a SPINN model.

    Args:
        embed: The embedding matrix as a float32 numpy array with shape
            [vocabulary_size, word_vector_len]. word_vector_len is the length
            of a word embedding vector.
        word2index: A `dict` mapping word to word index.
        train_data: An instance of `data.SnliData`, for the train split.
        dev_data: Same as above, for the dev split.
        test_data: Same as above, for the test split.
        config: A configuration object. See the argument to this Python
            binary for details.

    Returns:
        If `config.inference_premise` and `config.inference_hypothesis` are
        not `None`, i.e., inference mode: the logits for the possible labels
        of the SNLI data set, as a `Tensor` of three floats.
        else:
        The trainer object.

    Raises:
        ValueError: if only one of config.inference_premise and
            config.inference_hypothesis is specified.
    """
    # TODO(cais): Refactor this function into separate one for training and
    # inference.
    use_gpu = tfe.num_gpus() > 0 and not config.force_cpu
    device = "gpu:0" if use_gpu else "cpu:0"
    print("Using device: %s" % device)

    # The two inference flags must be given together or not at all.
    if ((config.inference_premise and not config.inference_hypothesis) or
            (not config.inference_premise and config.inference_hypothesis)):
        raise ValueError(
            "--inference_premise and --inference_hypothesis must be both "
            "specified or both unspecified, but only one is specified.")

    if config.inference_premise:
        # Inference mode.
        inference_sentence_pair = [
            data.encode_sentence(config.inference_premise, word2index),
            data.encode_sentence(config.inference_hypothesis, word2index)]
    else:
        inference_sentence_pair = None

    log_header = (
        " Time Epoch Iteration Progress (%Epoch) Loss Dev/Loss"
        " Accuracy Dev/Accuracy")
    log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} {} "
        "{:12.4f} {}")
    dev_log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} "
        "{:8.6f} {:12.4f} {:12.4f}")

    summary_writer = tf.contrib.summary.create_file_writer(
        config.logdir, flush_millis=10000)

    with tf.device(device), \
            summary_writer.as_default(), \
            tf.contrib.summary.always_record_summaries():
        model = SNLIClassifier(config, embed)
        global_step = tf.train.get_or_create_global_step()
        trainer = SNLIClassifierTrainer(model, config.lr)
        # Restore trainer and step counter from the newest checkpoint in
        # logdir, if there is one.
        checkpoint = tfe.Checkpoint(trainer=trainer, global_step=global_step)
        checkpoint.restore(tf.train.latest_checkpoint(config.logdir))

        if inference_sentence_pair:
            # Inference mode.
            # Tuple unpacking binds both halves of each encoded sentence;
            # the former extra assignment of hypo_trans from
            # inference_sentence_pair[1][1] was a dead store and is removed.
            prem, prem_trans = inference_sentence_pair[0]
            hypo, hypo_trans = inference_sentence_pair[1]
            inference_logits = model(
                tf.constant(prem),
                tf.constant(prem_trans),
                tf.constant(hypo),
                tf.constant(hypo_trans),
                training=False)
            # First row only, with the logit at index 0 discarded.
            # NOTE(review): index 0 looks like a non-label slot; verify
            # against data.POSSIBLE_LABELS.
            inference_logits = inference_logits[0][1:]
            max_index = tf.argmax(inference_logits)
            print("\nInference logits:")
            for i, (label, logit) in enumerate(
                    zip(data.POSSIBLE_LABELS, inference_logits)):
                winner_tag = " (winner)" if max_index == i else ""
                print(" {0:<16}{1:.6f}{2}".format(
                    label + ":", logit, winner_tag))
            return inference_logits

        train_len = train_data.num_batches(config.batch_size)
        start = time.time()
        iterations = 0
        mean_loss = tfe.metrics.Mean()
        accuracy = tfe.metrics.Accuracy()
        print(log_header)
        for epoch in xrange(config.epochs):
            batch_idx = 0
            for label, prem, prem_trans, hypo, hypo_trans in (
                    _get_dataset_iterator(train_data, config.batch_size)):
                if use_gpu:
                    label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
                    # prem_trans and hypo_trans are used for dynamic control
                    # flow and can remain on CPU. Same in
                    # _evaluate_on_dataset().

                iterations += 1
                batch_train_loss, batch_train_logits = trainer.train_batch(
                    label, prem, prem_trans, hypo, hypo_trans)
                # The running mean loss is weighted by batch size.
                batch_size = tf.shape(label)[0]
                mean_loss(batch_train_loss.numpy(),
                          weights=batch_size.gpu() if use_gpu else batch_size)
                accuracy(tf.argmax(batch_train_logits, axis=1), label)

                if iterations % config.save_every == 0:
                    checkpoint.save(os.path.join(config.logdir, "ckpt"))

                # A dev evaluation takes precedence over the plain training
                # log line when both intervals coincide.
                if iterations % config.dev_every == 0:
                    dev_loss, dev_frac_correct = _evaluate_on_dataset(
                        dev_data, config.batch_size, trainer, use_gpu)
                    print(dev_log_template.format(
                        time.time() - start,
                        epoch, iterations, 1 + batch_idx, train_len,
                        100.0 * (1 + batch_idx) / train_len,
                        mean_loss.result(), dev_loss,
                        accuracy.result() * 100.0, dev_frac_correct * 100.0))
                    tf.contrib.summary.scalar("dev/loss", dev_loss)
                    tf.contrib.summary.scalar("dev/accuracy",
                                              dev_frac_correct)
                elif iterations % config.log_every == 0:
                    mean_loss_val = mean_loss.result()
                    accuracy_val = accuracy.result()
                    print(log_template.format(
                        time.time() - start,
                        epoch, iterations, 1 + batch_idx, train_len,
                        100.0 * (1 + batch_idx) / train_len,
                        mean_loss_val, " " * 8, accuracy_val * 100.0,
                        " " * 12))
                    tf.contrib.summary.scalar("train/loss", mean_loss_val)
                    tf.contrib.summary.scalar("train/accuracy", accuracy_val)
                    # Reset metrics.
                    mean_loss = tfe.metrics.Mean()
                    accuracy = tfe.metrics.Accuracy()

                batch_idx += 1
            if (epoch + 1) % config.lr_decay_every == 0:
                trainer.decay_learning_rate(config.lr_decay_by)

        test_loss, test_frac_correct = _evaluate_on_dataset(
            test_data, config.batch_size, trainer, use_gpu)
        print("Final test loss: %g; accuracy: %g%%" %
              (test_loss, test_frac_correct * 100.0))

    return trainer
def train_spinn(embed, train_data, dev_data, test_data, config):
    """Train a SPINN model.

    Args:
        embed: The embedding matrix as a float32 numpy array with shape
            [vocabulary_size, word_vector_len]. word_vector_len is the length
            of a word embedding vector.
        train_data: An instance of `data.SnliData`, for the train split.
        dev_data: Same as above, for the dev split.
        test_data: Same as above, for the test split.
        config: A configuration object. See the argument to this Python
            binary for details.

    Returns:
        1. Final loss value on the test split.
        2. Final fraction of correct classifications on the test split.
    """
    use_gpu = tfe.num_gpus() > 0 and not config.force_cpu
    device = "gpu:0" if use_gpu else "cpu:0"
    print("Using device: %s" % device)

    log_header = (
        " Time Epoch Iteration Progress (%Epoch) Loss Dev/Loss"
        " Accuracy Dev/Accuracy")
    log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} {} "
        "{:12.4f} {}")
    dev_log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} "
        "{:8.6f} {:12.4f} {:12.4f}")

    summary_writer = tf.contrib.summary.create_file_writer(
        config.logdir, flush_millis=10000)
    train_len = train_data.num_batches(config.batch_size)
    # Variables created inside this scope are restored from the newest
    # checkpoint in logdir, if one exists.
    with tf.device(device), \
            tfe.restore_variables_on_create(
                tf.train.latest_checkpoint(config.logdir)), \
            summary_writer.as_default(), \
            tf.contrib.summary.always_record_summaries():
        model = SNLIClassifier(config, embed)
        global_step = tf.train.get_or_create_global_step()
        trainer = SNLIClassifierTrainer(model, config.lr)

        start = time.time()
        iterations = 0
        mean_loss = tfe.metrics.Mean()
        accuracy = tfe.metrics.Accuracy()
        print(log_header)
        for epoch in xrange(config.epochs):
            batch_idx = 0
            for label, prem, prem_trans, hypo, hypo_trans in (
                    _get_dataset_iterator(train_data, config.batch_size)):
                if use_gpu:
                    label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
                    # prem_trans and hypo_trans are used for dynamic control
                    # flow and can remain on CPU. Same in
                    # _evaluate_on_dataset().

                iterations += 1
                batch_train_loss, batch_train_logits = trainer.train_batch(
                    label, prem, prem_trans, hypo, hypo_trans)
                # Weight each batch loss by its size in the running mean.
                batch_size = tf.shape(label)[0]
                mean_loss(batch_train_loss.numpy(),
                          weights=batch_size.gpu() if use_gpu else batch_size)
                accuracy(tf.argmax(batch_train_logits, axis=1), label)

                if iterations % config.save_every == 0:
                    all_variables = (
                        model.variables + [trainer.learning_rate] +
                        [global_step])
                    saver = tfe.Saver(all_variables)
                    saver.save(os.path.join(config.logdir, "ckpt"),
                               global_step=global_step)

                if iterations % config.dev_every == 0:
                    dev_loss, dev_frac_correct = _evaluate_on_dataset(
                        dev_data, config.batch_size, model, trainer, use_gpu)
                    print(dev_log_template.format(
                        time.time() - start,
                        epoch, iterations, 1 + batch_idx, train_len,
                        100.0 * (1 + batch_idx) / train_len,
                        mean_loss.result(), dev_loss,
                        accuracy.result() * 100.0, dev_frac_correct * 100.0))
                    tf.contrib.summary.scalar("dev/loss", dev_loss)
                    tf.contrib.summary.scalar("dev/accuracy",
                                              dev_frac_correct)
                elif iterations % config.log_every == 0:
                    mean_loss_val = mean_loss.result()
                    accuracy_val = accuracy.result()
                    print(log_template.format(
                        time.time() - start,
                        epoch, iterations, 1 + batch_idx, train_len,
                        100.0 * (1 + batch_idx) / train_len,
                        mean_loss_val, " " * 8, accuracy_val * 100.0,
                        " " * 12))
                    tf.contrib.summary.scalar("train/loss", mean_loss_val)
                    tf.contrib.summary.scalar("train/accuracy", accuracy_val)
                    # Reset metrics.
                    mean_loss = tfe.metrics.Mean()
                    accuracy = tfe.metrics.Accuracy()

                batch_idx += 1
            if (epoch + 1) % config.lr_decay_every == 0:
                trainer.decay_learning_rate(config.lr_decay_by)

        test_loss, test_frac_correct = _evaluate_on_dataset(
            test_data, config.batch_size, model, trainer, use_gpu)
        print("Final test loss: %g; accuracy: %g%%" %
              (test_loss, test_frac_correct * 100.0))

    # Hand the final test metrics back to the caller, as the docstring
    # promises; previously the function fell off the end and returned None.
    return test_loss, test_frac_correct