def main():
    """Train an LSTM regressor interactively and report its test error.

    Asks on stdin for a sequence length and a batch size, loads
    ``train_data.csv`` / ``test_data.csv`` through ``load_data``, trains for
    ``params["n_epochs"]`` epochs with the session wrapped in a TensorBoard
    debugger session, prints de-normalised predictions for the test set and
    finally writes the graph definition to ``hellotensor.pbtxt``.

    Side effects:
        Assigns the module global ``sequence_length``; reads the module
        global ``batch_sizes`` (presumably filled in by
        ``calculate_batch_sizes`` -- TODO confirm); resets the default
        TensorFlow graph; writes ``hellotensor.pbtxt`` to the working
        directory.
    """
    global sequence_length, batch_sizes
    sequence_length = int(input("\n\n enter a sequence length: "))

    (x_train, y_train, x_test, y_test,
     max_test, min_test, max_train, min_train) = load_data(
         "train_data.csv", "test_data.csv")
    n_train = len(x_train)

    # Shuffle inputs and targets together so every (x, y) pair stays aligned.
    temp_train = list(zip(x_train, y_train))
    shuffle(temp_train)
    x_train, y_train = zip(*temp_train)
    temp_test = list(zip(x_test, y_test))
    shuffle(temp_test)
    x_test, y_test = zip(*temp_test)
    x_train, y_train = np.array(x_train), np.array(y_train)
    x_test, y_test = np.array(x_test), np.array(y_test)

    # x = ( n*sequence_length ) || y = ( n*1 )
    print("\n shape of the input: ", x_train.shape)
    print("\n shape of the output: ", y_train.shape)

    calculate_batch_sizes(n_train)
    batch_size = 1
    while batch_size not in batch_sizes:
        print("\n Choose one of the following batch sizes to be used \n",
              batch_sizes)
        batch_size = int(input("\n enter a batch size: "))

    tf.reset_default_graph()
    n_batches = int(len(y_train) / batch_size)
    print("\n number of batches: ", n_batches, "\n")

    x = tf.placeholder(tf.float32, [None, batch_size, sequence_length])
    x_batches = np.reshape(x_train, [n_batches, batch_size, sequence_length])
    y = tf.placeholder(tf.float32, [None, batch_size, 1])
    y_batches = np.reshape(y_train, [n_batches, batch_size, 1])

    params = dict()
    params["n_layers"] = 3
    params["neurons_layer1"] = sequence_length
    params["neurons_layer2"] = sequence_length * 3
    params["neurons_layer3"] = 1
    params["learning_rate"] = 0.01
    params["n_epochs"] = 4500
    params["optimizer"] = "AdamOptimizer"
    print("\n Parameters of the network are:\n")
    for key in params:
        print("\t", key, ": ", params[key])
    print("\n\n")

    my_lstm_cell = create_model(params)
    print("\n x's shape: ", x.shape)
    print("\n y's shape: ", y.shape, "\n")
    rnn_output, _ = tf.nn.dynamic_rnn(cell=my_lstm_cell,
                                      inputs=x,
                                      dtype=tf.float32)
    print("\n shape of Network's output: ", rnn_output.shape, "\n")

    # Summed squared error over all fed batches.
    loss = tf.reduce_sum(tf.square(rnn_output - y))
    optimizer = tf.train.AdamOptimizer(learning_rate=params["learning_rate"])
    training_op = optimizer.minimize(loss)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        sess = tf_debug.TensorBoardDebugWrapperSession(sess, "rohanasus:6006")
        init.run()

        # The loss is sampled every 100 epochs.  The epoch-0 sample is printed
        # but kept out of the average, as in the original accumulation.
        sampled_losses = []
        for ep in range(params["n_epochs"] + 1):
            sess.run(training_op, feed_dict={x: x_batches, y: y_batches})
            if ep % 100 == 0:
                mse = loss.eval(feed_dict={x: x_batches, y: y_batches})
                if ep != 0:
                    print("Epoch: ", ep, "\tLoss(MSE): ", mse)
                    sampled_losses.append(float(mse))
                else:
                    print("Epoch: ", ep, " \tLoss(MSE): ", mse)
        # Average over exactly the samples that were summed, and do it before
        # printing so the reported number really is the average.
        if sampled_losses:
            avg_loss = sum(sampled_losses) / len(sampled_losses)
        else:
            avg_loss = float(0)
        print("\n Average loss while training: ", avg_loss)

        # Drop trailing samples that do not fill a whole batch.  The explicit
        # check matters: ``arr[:-0]`` is an *empty* slice, so a test set that
        # already divides evenly must be left untouched.
        remove = x_test.shape[0] % batch_size
        if remove:
            x_test = x_test[:-remove]
            y_test = y_test[:-remove]
        print("\n x_test's shape: ", x_test.shape, "\n")

        print("\nPredicted value \tActal value\n")
        y_pred = sess.run(
            rnn_output,
            feed_dict={x: x_test.reshape(-1, batch_size, sequence_length)})
        temp_pred = np.reshape(np.array(y_pred), [-1, 1])

        # Undo the min/max normalisation so values are printed in data units.
        value_range = max_test - min_test
        total_error = float(0)
        for predicted, actual in zip(temp_pred[:, 0], y_test):
            print("Predicted: ", int(predicted * value_range + min_test),
                  "\tActual: ", int(actual * value_range + min_test))
            total_error = total_error + abs(predicted - actual) * value_range
        # Guard against an empty test set (fewer samples than one batch).
        n_eval = len(y_test)
        avg_error = float(total_error / n_eval) if n_eval else float(0)
        accuracy = float(100 - avg_error)
        print("\n Average error while Testing: ", avg_error)
        print("\n Accuracy while testing: ", accuracy)

        tf.train.write_graph(sess.graph_def, '.', 'hellotensor.pbtxt')
def learning(data, data_info, just_restore=False):
    """Build a DAE, optionally restore it from a checkpoint, and train it.

    Args:
        data: dataset to train on; ``data.train`` and ``data.test`` must
            expose ``sequences`` and ``num_sequences``.
        data_info: meta information about this dataset, passed straight
            through to the ``DAE`` constructor.
        just_restore: when true, return right after variable
            initialisation / checkpoint restore without training.

    Returns:
        nn: the ``DAE`` instance that was restored and/or trained.

    Side effects:
        Reads hyper-parameters from ``fl.FLAGS``, writes TensorBoard
        summaries to ``fl.FLAGS.summary_dir`` and checkpoints to
        ``fl.FLAGS.chkpt_dir + '/chkpt-final'``.  Terminates the process via
        ``exit(1)`` when the dropout or noise-variance flag is negative.
    """
    # The flag used to be called ``test``, which shadowed the ``test(...)``
    # callable invoked below and would have made that branch fail if enabled.
    # NOTE(review): assumes a module-level ``test(nn, path)`` exists - confirm.
    run_tests = False
    debug = False

    with tf.Graph().as_default():
        tf.set_random_seed(fl.FLAGS.seed)
        start_time = time.time()

        # Read the flags
        variance = fl.FLAGS.variance_of_noise
        num_hidden = fl.FLAGS.num_hidden_layers
        dropout = fl.FLAGS.dropout
        learning_rate = fl.FLAGS.learning_rate
        batch_size = fl.FLAGS.batch_size
        hidden_shapes = [fl.FLAGS.layer1_width for _ in range(num_hidden)]

        # Check if the flags make sense
        if dropout < 0 or variance < 0:
            print('ERROR! Have got negative values in the flags!')
            exit(1)

        # Allow TensorFlow to change device allocation when needed
        config = tf.ConfigProto(
            allow_soft_placement=True)  # log_device_placement=True)
        # Adjust configuration so that multiple executions are possible
        config.gpu_options.allow_growth = True

        # Start a session
        sess = tf.Session(config=config)
        if debug:
            sess = tf_debug.TensorBoardDebugWrapperSession(
                sess, "taras-All-Series:6064")

        # Create a neural network: input and output layers have the same width
        io_width = fl.FLAGS.frame_size * fl.FLAGS.chunk_length
        shape = [io_width] + hidden_shapes + [io_width]
        nn = DAE(shape, sess, variance, data_info)
        print('\nDAE with the following shape was created : ', shape)

        # Initialize input_producer
        sess.run(tf.local_variables_initializer())
        max_val = nn.max_val

        with tf.variable_scope("Train"):
            # ---------- Define optimizer and training operator ----------
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

            # Do gradient clipping
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(
                tf.gradients(nn._loss, tvars), 1e12)
            train_op = optimizer.apply_gradients(
                zip(grads, tvars),
                global_step=tf.train.get_or_create_global_step())

            # Prepare for making a summary for TensorBoard
            train_error = tf.placeholder(dtype=tf.float32, shape=(),
                                         name='train_error')
            eval_error = tf.placeholder(dtype=tf.float32, shape=(),
                                        name='eval_error')
            train_summary_op = tf.summary.scalar('Train_error', train_error)
            eval_summary_op = tf.summary.scalar('Validation_error', eval_error)
            summary_dir = fl.FLAGS.summary_dir
            summary_writer = tf.summary.FileWriter(
                summary_dir, graph=tf.get_default_graph())

            num_batches = int(data.train.num_sequences / batch_size)

            # Initialize the part of the graph with the input data
            sess.run(
                nn._train_data.initializer,
                feed_dict={nn._train_data_initializer: data.train.sequences})
            sess.run(
                nn._valid_data.initializer,
                feed_dict={nn._valid_data_initializer: data.test.sequences})

            # Start input enqueue threads.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            if fl.FLAGS.pretrain:
                layers_amount = len(nn.shape) - 2

                # One optimizer shared by all layer-wise trainers
                pretrain_optimizer = tf.train.AdamOptimizer(
                    learning_rate=learning_rate)

                # Make an array of the trainers for all the layers
                trainers = [
                    pretrain_optimizer.minimize(
                        ut.loss_reconstruction(
                            nn.run_less_layers(nn._input_, i + 1),
                            nn.run_less_layers(nn._input_, i + 1,
                                               is_target=True),
                            max_val,
                            pretrain=True),
                        global_step=tf.train.get_or_create_global_step(),
                        name='Layer_wise_optimizer_' + str(i))
                    for i in range(layers_amount)
                ]

                # Initialize all the variables
                sess.run(tf.global_variables_initializer())
            else:
                print("Initializing variables ...\n")
                sess.run(tf.global_variables_initializer())

            # Create a saver
            saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)
            chkpt_file = fl.FLAGS.chkpt_dir + '/chkpt-final'

            # restore model, if needed
            if fl.FLAGS.restore:
                saver.restore(sess, chkpt_file)
                print("Model restored from the file " + str(chkpt_file) + '.')

            if just_restore:
                coord.request_stop()
                return nn

            # A few initializations for the early stopping
            delta = fl.FLAGS.delta_for_early_stopping  # error tolerance
            best_error = 10000
            num_valid_batches = int(data.test.num_sequences / batch_size)

            # Bound before the ``try`` so the ``except`` handler below can
            # always read them, even if the input queue runs dry before the
            # first checkpoint (or before the loop) is reached.
            step = 0
            save_path = None

            try:  # running enqueue threads.
                # Pretrain
                if fl.FLAGS.pretrain:
                    layerwise_pretrain(nn, trainers, layers_amount,
                                       num_batches)

                # Train the whole network jointly
                print('\nFinetune the whole network on ', num_batches,
                      ' batches with ', batch_size,
                      ' training examples in each for',
                      fl.FLAGS.training_epochs, ' epochs...')
                print("")
                print(" ______________ ______")
                print("| Epoch | RMSE |")
                print("|------------ |------|")

                while not coord.should_stop():
                    _, train_error_ = sess.run(
                        [train_op, nn._reconstruction_loss], feed_dict={})

                    # Everything below runs once per epoch boundary.
                    if step % num_batches == 0:
                        epoch = step * 1.0 / num_batches
                        train_summary = sess.run(
                            train_summary_op,
                            feed_dict={train_error: np.sqrt(train_error_)})

                        # Print results on screen
                        epoch_str = "| {0:3.0f} ".format(epoch)[:5]
                        perc_str = "({0:3.2f}".format(
                            epoch * 100.0 / fl.FLAGS.training_epochs)[:5]
                        error_str = "%) |{0:5.2f}".format(
                            train_error_)[:10] + "|"
                        print(epoch_str, perc_str, error_str)

                        if epoch % 5 == 0 and run_tests:
                            rmse = test(nn,
                                        fl.FLAGS.data_dir + '/test_1.binary')
                            print(
                                "\nOur RMSE for the first test sequence is : ",
                                rmse)
                            rmse = test(nn,
                                        fl.FLAGS.data_dir + '/test_2.binary')
                            print(
                                "\nOur RMSE for the second test sequenceis : ",
                                rmse)

                        if epoch > 0:
                            summary_writer.add_summary(train_summary, step)

                            # Evaluate on the validation sequences
                            error_sum = 0
                            for _ in range(num_valid_batches):
                                curr_err = sess.run([nn._valid_loss],
                                                    feed_dict={})
                                error_sum += curr_err[0]
                            new_error = error_sum / (num_valid_batches)
                            eval_sum = sess.run(
                                eval_summary_op,
                                feed_dict={eval_error: np.sqrt(new_error)})
                            summary_writer.add_summary(eval_sum, step)

                            # Early stopping
                            if fl.FLAGS.early_stopping:
                                if (new_error - best_error) / best_error > delta:
                                    print('After ' + str(step) +
                                          ' steps started overfitting')
                                    break
                                if new_error < best_error:
                                    best_error = new_error
                                    # Keep the best model seen so far
                                    save_path = saver.save(sess, chkpt_file)

                            if epoch % 5 == 0:
                                # Periodic checkpoint
                                save_path = saver.save(sess, chkpt_file)
                                print('Done training for %d epochs' % (epoch))
                                print("The model was saved in file: %s" %
                                      save_path)
                    step += 1

            except tf.errors.OutOfRangeError:
                # The input queue is exhausted: training is over.
                if not fl.FLAGS.early_stopping:
                    # Save the model
                    save_path = saver.save(sess, chkpt_file)
                print('Done training for %d epochs, %d steps.' %
                      (fl.FLAGS.training_epochs, step))
                if save_path is not None:
                    print("The final model was saved in file: %s" % save_path)
            finally:
                # When done, ask the threads to stop.
                coord.request_stop()

            # Wait for threads to finish.
            coord.join(threads)

            duration = (time.time() - start_time) / 60  # minutes
            print("The training was running for %.3f min" % (duration))

            return nn
def main(_):
    """Train a two-layer MNIST classifier, optionally under a tfdbg wrapper.

    The cross-entropy is deliberately computed in a numerically unstable way
    so that the debugger has inf/nan values to find.  Depending on the flags
    the session is wrapped in either the CLI debugger or the TensorBoard
    debugger; asking for both raises ``ValueError``.
    """
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir,
                                      one_hot=True,
                                      fake_data=FLAGS.fake_data)

    def next_feed(train):
        """Return a feed dict with a training batch or the whole test set."""
        if not (train or FLAGS.fake_data):
            return {x: mnist.test.images, y_: mnist.test.labels}
        batch_xs, batch_ys = mnist.train.next_batch(
            FLAGS.train_batch_size, fake_data=FLAGS.fake_data)
        return {x: batch_xs, y_: batch_ys}

    sess = tf.InteractiveSession()

    # Create the MNIST neural network graph.

    # Input placeholders.
    with tf.name_scope("input"):
        x = tf.placeholder(tf.float32, [None, IMAGE_SIZE * IMAGE_SIZE],
                           name="x-input")
        y_ = tf.placeholder(tf.float32, [None, NUM_LABELS], name="y-input")

    def weight_variable(shape):
        """Weights drawn from a seeded truncated normal (stddev 0.1)."""
        return tf.Variable(
            tf.truncated_normal(shape, stddev=0.1, seed=RAND_SEED))

    def bias_variable(shape):
        """Biases initialised to the constant 0.1."""
        return tf.Variable(tf.constant(0.1, shape=shape))

    def nn_layer(input_tensor, input_dim, output_dim, layer_name,
                 act=tf.nn.relu):
        """Fully connected layer ``act(input @ W + b)`` in its own scope."""
        # The name scope groups the layer's ops together in the graph.
        with tf.name_scope(layer_name):
            # Variables holding the layer's parameters.
            with tf.name_scope("weights"):
                layer_weights = weight_variable([input_dim, output_dim])
            with tf.name_scope("biases"):
                layer_biases = bias_variable([output_dim])
            with tf.name_scope("Wx_plus_b"):
                pre_activation = (
                    tf.matmul(input_tensor, layer_weights) + layer_biases)
            return act(pre_activation)

    hidden = nn_layer(x, IMAGE_SIZE**2, HIDDEN_SIZE, "hidden")
    logits = nn_layer(hidden, HIDDEN_SIZE, NUM_LABELS, "output", tf.identity)
    y = tf.nn.softmax(logits)

    with tf.name_scope("cross_entropy"):
        # This hand-written cross-entropy is the source of the bad numerical
        # values seen while training this graph: the log of zero yields inf
        # (first visible in "cross_entropy/Log:0" during the 4th run() call),
        # and multiplying those infs by zeros yields nans (first visible in
        # "cross_entropy/mul:0").
        #
        # The built-in, numerically stable implementation avoids the issue:
        #   diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
        #                                                  logits=logits)
        diff = -(y_ * tf.log(y))
        with tf.name_scope("total"):
            cross_entropy = tf.reduce_mean(diff)

    with tf.name_scope("train"):
        train_step = tf.train.AdamOptimizer(
            FLAGS.learning_rate).minimize(cross_entropy)

    with tf.name_scope("accuracy"):
        with tf.name_scope("correct_prediction"):
            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        with tf.name_scope("accuracy"):
            accuracy = tf.reduce_mean(
                tf.cast(correct_prediction, tf.float32))

    sess.run(tf.global_variables_initializer())

    use_cli_debugger = FLAGS.debug
    debugger_address = FLAGS.tensorboard_debug_address
    if use_cli_debugger and debugger_address:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")
    if use_cli_debugger:
        sess = tf_debug.LocalCLIDebugWrapperSession(sess,
                                                    ui_type=FLAGS.ui_type)
    elif debugger_address:
        sess = tf_debug.TensorBoardDebugWrapperSession(sess, debugger_address)

    # From here on ``sess`` may be a debug wrapper around the real session;
    # with --debug every run() call below launches the CLI.
    for step in range(FLAGS.max_steps):
        acc = sess.run(accuracy, feed_dict=next_feed(False))
        print("Accuracy at step %d: %s" % (step, acc))
        sess.run(train_step, feed_dict=next_feed(True))
# Next element of the input pipeline.
gt_wav_op, melspec = iterator.get_next()

# feed forward (inference mode)
pred_wav_op = model(gt_wav_op, melspec, is_training=False)

# Audio summaries for the predicted and the reference waveform.
tf.summary.audio('audio/pred', pred_wav_op, hp.signal.sr)
tf.summary.audio('audio/gt', gt_wav_op, hp.signal.sr)
# tf.summary.histogram('hist/wav', gt_wav)
# tf.summary.histogram('hist/out', pred_wav)
summ_op = tf.summary.merge_all()

session_config = tf.ConfigProto(device_count={'CPU': 1, 'GPU': 1})
with tf.Session(config=session_config) as sess:
    if debug:
        # Session wrapper that streams tensors to the TensorBoard debugger.
        debugger_address = 'localhost:{}'.format(hp.debug_port)
        sess = tf_debug.TensorBoardDebugWrapperSession(sess, debugger_address)

    # Resolve the checkpoint: an explicit name is looked up inside the log
    # directory, otherwise the most recent checkpoint there is used.
    if ckpt:
        ckpt = '{}/{}'.format(hp.logdir, ckpt)
    else:
        ckpt = tf.train.latest_checkpoint(hp.logdir)

    sess.run(tf.global_variables_initializer())

    if not ckpt:
        print('No checkpoint found at {}.'.format(hp.logdir))
    else:
        var_list = None
        if hp.train.use_ema:
            # Map each EMA shadow name onto its variable so the averaged
            # weights are the ones restored.
            var_list = {
                model.ema.average_name(v): v
                for v in tf.trainable_variables('iaf_vocoder')
            }
        tf.train.Saver(var_list=var_list).restore(sess, ckpt)
        print('Successfully loaded checkpoint {}'.format(ckpt))
def run_model(
        self,
        model: modeling.CANTRIPModel,
        train: Cohort,
        devel: Cohort,
        test: Cohort,
        weights: typing.Union[float, int, typing.Sequence[typing.Union[float, int]]] = 1):
    """
    Train the given model and evaluate it on the development and test cohorts.

    This function:
    (1) optionally clears previous summaries/checkpoints (``FLAGS.clear_prev``)
    (2) builds the optimizer, summarizer and saver for the model
    (3) trains for ``FLAGS.num_epochs`` epochs, re-balancing the training
        cohort each epoch when ``FLAGS.correct_imbalance`` asks for it, and
        writes batch/epoch summaries for TensorBoard
    (4) evaluates on the development and testing cohorts after every epoch
        and checkpoints the model whenever development MCC improves

    :param model: an instantiated CANTRIP model
    :param train: the cohort to use for training this experimental run
    :param devel: the cohort to use for validating this experimental run
    :param test: the cohort to use for testing this experimental run
    :param weights: sample weights
    :return: ``(model, best_train_metrics, best_devel_metrics,
        best_test_metrics)``; the train/test dicts are empty and the devel
        dict is ``{'MCC': 0}`` if no epoch reached a development MCC above 0
    """
    # Save summaries and checkpoints into the directories passed to the script
    model_summaries_dir, model_checkpoint_path = self.get_model_file()

    # Clear any previous summaries/checkpoints if asked
    if FLAGS.clear_prev:
        nio.delete_dir_quiet(model_summaries_dir)
        nio.delete_dir_quiet(os.path.dirname(model_checkpoint_path))
        print('Deleted previous model summaries/checkpoints')

    # Make output directories so we don't blow up when saving
    nio.make_dirs_quiet(os.path.dirname(model_checkpoint_path))

    devel_batches = devel.batched(batch_size=FLAGS.batch_size, permute=False)
    test_batches = test.batched(batch_size=FLAGS.batch_size, permute=False)

    epoch_steps = len(train.to_list()) // FLAGS.batch_size

    optimizer = optimization.BERTOptimizer(
        model,
        lr_decay=True,
        l1_reg=FLAGS.use_l1_reg,
        l2_reg=FLAGS.use_l2_reg,
        num_train_steps=epoch_steps * 10,
        steps_per_epoch=epoch_steps,
        num_warmup_steps=epoch_steps * min(3, FLAGS.num_epochs - 1),
        init_lr=FLAGS.learning_rate,
        weights=weights,
        normalize_weights=FLAGS.use_focal_loss,
        focal_loss=FLAGS.use_focal_loss)

    summarizer = summarization.CANTRIPSummarizer(model, optimizer)

    # Now that everything has been defined in TensorFlow's computation graph,
    # initialize our model saver
    saver = tf.train.Saver(tf.global_variables(),
                           max_to_keep=FLAGS.max_to_keep)

    # Counts are zero-padded to the number of digits in the batch size
    batch_width = int(np.log10(FLAGS.batch_size)) + 1
    count_format = '%0' + str(batch_width) + 'd'
    score_format = '%5.3f'

    # noinspection PyCompatibility
    metric_format = {
        'TP': count_format,
        'TN': count_format,
        'FP': count_format,
        'FN': count_format,
        'Precision': score_format,
        'Recall': score_format,
        'Accuracy': score_format,
        'Specificity': score_format,
        'DOR': '%5.1f',
        'F1': score_format,
        'F2': score_format,
        'F.5': score_format,
        'AUROC': score_format,
        'AUPRC': score_format,
        'Loss': score_format,
        'MCC': score_format,
    }

    # Metric name -> short label used in the per-epoch log line
    log_metrics = {
        "Accuracy": "Acc",
        "AUROC": "AUROC",
        "AUPRC": "AUPRC",
        "Precision": "Prec",
        "Recall": "Sens",
        "Specificity": "Spec",
        "DOR": "OR",
        "F1": "F1",
        "MCC": "MCC",
        "Loss": "Loss"
    }

    def format_(results, metrics=None):
        """Render the metrics named in ``metrics`` as formatted strings."""
        if not metrics:
            metrics = {k: k for k in metric_format.keys()}
        return {
            metrics[metric]: (metric_format[metric] % value)
            for metric, value in results.items() if metric in metrics
        }

    # Metrics of the best model so far (as measured by devel MCC).  All three
    # are bound up front so the final ``return`` cannot hit an unbound name
    # when no epoch beats the initial MCC of zero (or when there are no
    # epochs at all).
    best_train_metrics, best_devel_metrics = {}, {'MCC': 0}
    best_test_metrics = {}

    # Tell TensorFlow to wake up and get ready to rumble
    with tf.Session() as sess:

        # If we specified a TensorBoard debug server, connect to it
        # (you have to manually step through your model's flow, so 99% of the
        # time you shouldn't need it)
        if FLAGS.debug is not None:
            sess = tf_debug.TensorBoardDebugWrapperSession(sess, FLAGS.debug)

        # Create our summary writer (used by TensorBoard)
        summary_writer = tf.summary.FileWriter(model_summaries_dir, sess.graph)

        # Restore model if it exists (and we didn't clear it), otherwise
        # create a shiny new one.
        # NOTE(review): tf.train.get_checkpoint_state expects a checkpoint
        # *directory*, while the directory handling above uses
        # os.path.dirname(model_checkpoint_path) - confirm which is intended.
        checkpoint = tf.train.get_checkpoint_state(model_checkpoint_path)
        if checkpoint and gfile.Exists(checkpoint.model_checkpoint_path + '.index'):
            # print() does not interpolate logging-style arguments, so the
            # path has to be formatted into the message explicitly.
            print("Reading model parameters from '%s'..." %
                  checkpoint.model_checkpoint_path)
            saver.restore(sess, checkpoint.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters...")
            sess.run(tf.global_variables_initializer())

        # Initialize local variables (these are just used for computing
        # average metrics)
        sess.run(tf.local_variables_initializer())

        # Create a progress logger to monitor training (this is a wrapped
        # version of range())
        epoch_width = int(np.log10(FLAGS.num_epochs)) + 1
        with trange(FLAGS.num_epochs, desc='Training') as train_log:
            # Iterate over training epochs
            for i in train_log:
                # Get global step and reset training metrics
                global_step, _ = sess.run(
                    [optimizer.global_step, summarizer.train.reset_op])
                total_loss = 0.

                if FLAGS.correct_imbalance in ("downsample", "upsample"):
                    train_ = train.balance_classes(
                        method=FLAGS.correct_imbalance)
                else:
                    train_ = train

                batches = train_.batched(batch_size=FLAGS.batch_size)
                num_batches = len(batches)
                with tqdm(batches,
                          desc=('Epoch %0' + str(epoch_width) + 'd') %
                          (i + 1)) as batch_log:
                    # Iterate over each batch
                    for batch in batch_log:
                        # We train the model by evaluating the optimizer's
                        # training op. At the same time we update the
                        # training metrics, get metrics/summaries for the
                        # current batch and request the new global step
                        # number (used by TensorBoard to coordinate metrics
                        # across different runs)
                        _, batch_summary, batch_metrics, global_step = sess.run(
                            [
                                # All fetches we aren't going to read
                                [
                                    optimizer.train_op,
                                    summarizer.train.metric_ops
                                ],
                                summarizer.batch_summary,
                                summarizer.batch_metrics,
                                optimizer.global_step
                            ], batch.feed(model, training=True))

                        # Update tqdm progress indicator with current
                        # training metrics on this batch
                        batch_log.set_postfix(format_(batch_metrics))

                        # Save batch-level summaries
                        summary_writer.add_summary(batch_summary,
                                                   global_step=global_step)

                        total_loss += batch_metrics['Loss']

                # Save epoch-level training metrics and summaries
                train_metrics, train_summary = sess.run(
                    [summarizer.train.metrics, summarizer.train.summary])
                train_metrics['Loss'] = total_loss / num_batches
                summary_writer.add_summary(train_summary,
                                           global_step=global_step)

                # Evaluate development performance
                sess.run(summarizer.devel.reset_op)
                # Update local variables used to compute development metrics
                # as we process each batch
                for devel_batch in devel_batches:
                    sess.run([summarizer.devel.metric_ops],
                             devel_batch.feed(model, training=False))
                # Compute the development metrics
                devel_metrics, devel_summary = sess.run(
                    [summarizer.devel.metrics, summarizer.devel.summary])
                # Update training progress bar to indicate current
                # performance on development set
                train_log.set_postfix(format_(devel_metrics))
                # Save TensorBoard summary
                summary_writer.add_summary(devel_summary,
                                           global_step=global_step)

                train_log.write(
                    ('Epoch %0' + str(epoch_width) +
                     'd. Train: %s | Devel: %s') %
                    (i + 1,
                     "; ".join("{}: {}".format(k, v) for k, v in format_(
                         train_metrics, log_metrics).items()),
                     "; ".join("{}: {}".format(k, v) for k, v in format_(
                         devel_metrics, log_metrics).items())))

                # Evaluate test performance
                sess.run(summarizer.test.reset_op)
                for batch in test_batches:
                    sess.run([
                        summarizer.test.metrics, summarizer.test.metric_ops
                    ], batch.feed(model, training=False))
                test_metrics, test_summary = sess.run(
                    [summarizer.test.metrics, summarizer.test.summary])
                summary_writer.add_summary(test_summary,
                                           global_step=global_step)

                # If this run did better on the dev set, save it as the new
                # best model
                if devel_metrics['MCC'] > best_devel_metrics['MCC']:
                    best_devel_metrics = devel_metrics
                    best_train_metrics = train_metrics
                    best_test_metrics = test_metrics
                    # Save the model
                    saver.save(sess, model_checkpoint_path,
                               global_step=global_step)

        print('Training complete!')

    return model, best_train_metrics, best_devel_metrics, best_test_metrics
def start_training(trainParams):
    """Build the training graph and alternate training and evaluation passes.

    Each iteration of the outer loop trains until the training iterator is
    exhausted, evaluates until the test iterator is exhausted, writes a test
    summary and saves a checkpoint.  It stops once the global step reaches
    ``trainParams.num_steps``.

    Args:
        trainParams: run configuration.  The attributes read here are
            ``debug``, ``log_path_dir``, ``restore_from_dir``, ``num_steps``,
            ``trace``, ``summary_interval`` and ``run_name``; the object is
            also handed to the dataset pipeline and the hyper-parameter
            summary helper.

    Returns:
        None.  Exceptions raised while training are printed and end the run
        (best-effort behaviour kept from the original); the session and both
        summary writers are closed on every exit path.
    """
    with tf.Graph().as_default() as graph:
        tf.set_random_seed(2)
        keepp_pl = tf.placeholder(tf.float32)
        train_test_selector = tf.placeholder(tf.int32)
        dataset_handle = tf.placeholder(tf.string, shape=[])
        global_step = tf.Variable(0, name='global_step', trainable=False)

        with tf.device('/cpu:0'):
            examples, train_iterator, test_iterator = dataset_interface.add_defaul_dataset_pipeline(
                trainParams, model, dataset_handle)
            ins = examples[0]
            lbs = examples[1]
            typecombs = examples[2]
            instcombs = examples[3]
            genres = examples[4]
            ids = examples[5]
            audiofiles = examples[6]

        logits, rkhs = model.inference(ins, keepp_pl)
        loss = model.loss(logits, lbs)
        train_op = model.training(loss, global_step)
        eval_top1, eval_top5, correct1, correct5 = model.evaluation(
            logits, lbs)

        with tf.device('/cpu:0'):
            avg_loss_op, avg_top1_op, avg_top5_op, reset_op = stats.add_summaries(
                loss, eval_top1, eval_top5)
            update_comb_stats, reset_comb_stats = stats.add_comb_stats(
                correct1, correct5, typecombs, train_test_selector)
            update_inst_stats, reset_inst_stats = stats.add_inst_stats(
                correct1, correct5, instcombs, train_test_selector)
            update_genre_stats, reset_genre_stats = stats.add_genre_stats(
                correct1, correct5, genres, train_test_selector)
            stats.add_confusion_matrix(logits, lbs)
            stats.collect_wrong_examples(correct1, ins, rkhs, instcombs,
                                         typecombs, ids, audiofiles)
            # NOTE(review): ``reset_all`` is never run (only ``reset_op`` is)
            # and it omits ``reset_inst_stats`` - confirm whether the
            # per-combination statistics are meant to be reset as well.
            reset_all = [reset_op, reset_comb_stats, reset_genre_stats]
            update_stats = [
                update_comb_stats, update_inst_stats, update_genre_stats
            ]

        summary = tf.summary.merge_all()
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        config = tf.ConfigProto()
        # config.gpu_options.allow_growth = True
        # config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
        sess = tf.Session(config=config)
        if trainParams.debug:
            sess = tf_debug.TensorBoardDebugWrapperSession(
                sess, 'localhost:6064')

        train_writer = tf.summary.FileWriter(
            trainParams.log_path_dir + '/train', sess.graph)
        test_writer = tf.summary.FileWriter(trainParams.log_path_dir + '/test')

        training_handle = sess.run(train_iterator.string_handle())
        testing_handle = sess.run(test_iterator.string_handle())

        hparams_op = add_hyperparameters_textsum(trainParams)

        # Initialize or load graph from checkpoint
        if not trainParams.restore_from_dir:
            tf.gfile.MakeDirs(trainParams.log_path_dir)
            _, hp_str = sess.run([init, hparams_op])
            train_writer.add_summary(hp_str, 0)
            train_writer.flush()
        else:
            # NOTE(review): when no checkpoint is found the variables are
            # left uninitialised - confirm that is acceptable.
            ckpt = tf.train.get_checkpoint_state(
                trainParams.restore_from_dir[0])
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('loaded graph from dir %s' %
                      trainParams.restore_from_dir[0])

        graph.finalize()

        # The fetch lists and feed dicts used by every step are built once.
        train_fetches = [
            train_op, avg_loss_op, avg_top1_op, avg_top5_op, update_stats,
            global_step
        ]
        eval_fetches = [
            avg_loss_op, avg_top1_op, avg_top5_op, update_stats, global_step
        ]
        train_feed = {
            dataset_handle: training_handle,
            train_test_selector: 0,
            keepp_pl: model.kp
        }
        test_feed = {
            dataset_handle: testing_handle,
            train_test_selector: 1,
            keepp_pl: 1
        }

        gstep = 0
        try:
            print('running...')
            sess.run(train_iterator.initializer)
            sess.run(test_iterator.initializer)

            # Start the training loop.
            while gstep < trainParams.num_steps:
                try:
                    # ---- Train ----
                    sess.run(reset_op)

                    if trainParams.trace:
                        # One fully traced step whose run metadata is stored
                        # alongside the training summaries.
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        _, loss_value, top1_value, top5_value, __, gstep = sess.run(
                            train_fetches,
                            feed_dict=train_feed,
                            options=run_options,
                            run_metadata=run_metadata)
                        train_writer.add_run_metadata(
                            run_metadata, 'stats_epoch %d' % gstep)
                        train_writer.flush()
                        print(
                            '%s: TRAIN step %d. %0.2f hz loss: %0.04f top1 %0.04f top5 %0.04f'
                            % (trainParams.run_name, gstep, 0.0, loss_value,
                               top1_value, top5_value))

                    duration_mean = 1
                    while True:
                        try:
                            start_time = time.time()
                            # Log training runtime statistics
                            if np.mod(gstep + 1,
                                      trainParams.summary_interval) == 0:
                                summary_str, _, loss_value, top1_value, top5_value, __, gstep = sess.run(
                                    [summary] + train_fetches,
                                    feed_dict=train_feed)
                                train_writer.add_summary(summary_str, gstep)
                                train_writer.flush()
                                print(
                                    '%s: TRAIN step %d. %0.2f hz loss: %0.04f top1 %0.04f top5 %0.04f'
                                    % (trainParams.run_name, gstep,
                                       model.batch_size / duration_mean,
                                       loss_value, top1_value, top5_value))
                                sess.run([reset_op])
                            else:
                                _, loss_value, top1_value, top5_value, __, gstep = sess.run(
                                    train_fetches, feed_dict=train_feed)
                            # Running average of the step duration.
                            duration_mean = (duration_mean +
                                             (time.time() - start_time)) / 2
                        except tf.errors.OutOfRangeError:
                            # Training data exhausted: rewind for next pass.
                            sess.run(train_iterator.initializer)
                            break

                    # ---- Evaluate ----
                    duration_mean = 1
                    sess.run([reset_op])
                    while True:
                        try:
                            start_time = time.time()
                            loss_value, top1_value, top5_value, _, gstep = sess.run(
                                eval_fetches, feed_dict=test_feed)
                            duration_mean = (duration_mean +
                                             (time.time() - start_time)) / 2
                        except tf.errors.OutOfRangeError:
                            sess.run(test_iterator.initializer)
                            break

                    summary_str, loss_value, top1_value, top5_value, _, gstep = sess.run(
                        [summary] + eval_fetches, feed_dict=test_feed)
                    test_writer.add_summary(summary_str, gstep)
                    test_writer.flush()
                    print(
                        '%s: TEST step %d. %0.2f hz. loss: %0.04f. top1 %0.04f. top5 %0.04f'
                        % (trainParams.run_name, gstep,
                           model.batch_size / duration_mean, loss_value,
                           top1_value, top5_value))

                    # Save a checkpoint
                    checkpoint_file = os.path.join(trainParams.log_path_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_file, global_step=gstep)
                except Exception as e:
                    # Best-effort: report the failure and end the run.
                    print('Received expection while training: ' + str(e))
                    return
        except Exception as e:
            print('finishing...' + str(e))
            return
        finally:
            # Release resources on every exit path, including normal
            # completion, which previously leaked the session and writers.
            train_writer.close()
            test_writer.close()
            sess.close()
# In[2]: #creating a session object which creates an environment where we can execute Operations and evaluate Tensors sess = tf.Session() # ## Debugger # # ### Uncomment the below line and execute the code to run the debugger. # # ### Go to the link once you start execution http://localhost:6006/ # In[3]: #Uncomment the below line to run the debugger sess = tf_debug.TensorBoardDebugWrapperSession(sess, "localhost:6064") # In[4]: #Inserting a placeholder for a tensor equal to size of data X = tf.placeholder(tf.float32, shape=[4, 2], name='X') #Inserting a placeholder for a tensor equal to size of labels of the data Y = tf.placeholder(tf.float32, shape=[4, 1], name='Y') # In[5]: #declaring a variable which will retain its state through multiple runs with random values from normal distribution W = tf.Variable(tf.truncated_normal([2, 2]), name="W") #declaring a variable which will retain its state through multiple runs with random values from normal distribution
def main(args):
    """Run training and validation.

    1. Build graphs
        1.1 Training graph to run on multiple GPUs
        1.2 Training-set accuracy graph to run on multiple GPUs
        1.3 Validation graph to run on multiple GPUs (only built when the
            dataset reports a validation split, i.e. dataset_size_val > 0)
    2. Configure sessions
        2.1 Train
        2.2 Training-set accuracy
        2.3 Validate
    3. Main loop
        3.1 Train
        3.2 Write summary
        3.3 Save model
        3.4 Evaluate on the training set and validate model

    Only the variables in the "patch_params" scope are optimised; gradients
    are computed per GPU tower and averaged before being applied.

    The function always ends by closing every session it opened and calling
    sys.exit(); it does not return.

    Args:
      args: not read by this function.

    Author:
      Perry Deng
    """

    # Set reproducible random seed
    tf.set_random_seed(1234)

    # Directories
    train_dir, train_summary_dir = conf.setup_train_directories()

    # Logger
    conf.setup_logger(logger_dir=train_dir, name="logger_train.txt")

    # Hyperparameters
    conf.load_or_save_hyperparams(train_dir)

    # Get dataset hyperparameters
    logger.info('Using dataset: {}'.format(FLAGS.dataset))
    if FLAGS.train_on_test:
        dataset_size_train = conf.get_dataset_size_test(FLAGS.dataset)
    else:
        dataset_size_train = conf.get_dataset_size_train(FLAGS.dataset)
    dataset_size_val = conf.get_dataset_size_validate(FLAGS.dataset)
    build_arch = conf.get_dataset_architecture(FLAGS.dataset)
    num_classes = conf.get_num_classes(FLAGS.dataset)
    if FLAGS.train_on_test:
        create_inputs_train = conf.get_create_inputs(
            FLAGS.dataset, mode="train_on_test")
    else:
        create_inputs_train = conf.get_create_inputs(
            FLAGS.dataset, mode="train_whole")
    create_inputs_train_wholeset = conf.get_create_inputs(
        FLAGS.dataset, mode="train_whole")
    if dataset_size_val > 0:
        create_inputs_val = conf.get_create_inputs(
            FLAGS.dataset, mode="validate")

    # Every session opened below is registered here so that all exit paths
    # release exactly the sessions that exist (there is no validation
    # session when the dataset has no validation split).
    open_sessions = []

    def close_sessions():
        """Close every session that has been opened so far."""
        for open_sess in open_sessions:
            open_sess.close()

    #*************************************************************************
    # 1. BUILD GRAPHS
    #*************************************************************************

    #-------------------------------------------------------------------------
    # GRAPH - TRAIN
    #-------------------------------------------------------------------------
    logger.info('BUILD TRAIN GRAPH')
    g_train = tf.Graph()
    with g_train.as_default(), tf.device('/cpu:0'):

        # Get global_step
        global_step = tf.train.get_or_create_global_step()

        # Get batches per epoch
        num_batches_per_epoch = int(dataset_size_train / FLAGS.batch_size)

        # In response to a question on OpenReview, Hinton et al. wrote the
        # following:
        # "We use an exponential decay with learning rate: 3e-3, decay_steps:
        # 20000, decay rate: 0.96."
        # https://openreview.net/forum?id=HJWLfGWRb&noteId=ryxTPFDe2X
        lrn_rate = tf.train.exponential_decay(learning_rate=FLAGS.lrn_rate,
                                              global_step=global_step,
                                              decay_steps=20000,
                                              decay_rate=0.96)
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer(learning_rate=lrn_rate)

        # Get batch from data queue. Batch size is FLAGS.batch_size, which is
        # then divided across multiple GPUs
        input_dict = create_inputs_train()
        batch_x = input_dict['image']
        batch_labels = input_dict['label']

        # AG 03/10/2018: Split batch for multi gpu implementation
        # Each split is of size FLAGS.batch_size / FLAGS.num_gpus
        # See: https://github.com/naturomics/CapsNet-Tensorflow/blob/master/
        # dist_version/distributed_train.py
        splits_x = tf.split(
            axis=0,
            num_or_size_splits=FLAGS.num_gpus,
            value=batch_x)
        splits_labels = tf.split(
            axis=0,
            num_or_size_splits=FLAGS.num_gpus,
            value=batch_labels)

        #---------------------------------------------------------------------
        # MULTI GPU - TRAIN
        #---------------------------------------------------------------------
        # Calculate the gradients for each model tower
        tower_grads = []
        tower_losses = []
        tower_logits = []
        tower_target_labels = []
        reuse_variables = None
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('tower_%d' % i) as scope:
                    logger.info('TOWER %d' % i)
                    #with slim.arg_scope([slim.model_variable, slim.variable],
                    #                    device='/cpu:0'):
                    with slim.arg_scope([slim.variable], device='/cpu:0'):
                        loss, logits, x, patch, target_labels = tower_fn(
                            build_arch,
                            splits_x[i],
                            splits_labels[i],
                            scope,
                            num_classes,
                            reuse_variables=reuse_variables,
                            is_train=True)

                    # Don't reuse variable for first GPU, but do reuse for
                    # others
                    reuse_variables = True

                    # Compute gradients for one GPU; only the variables
                    # collected under the "patch_params" scope are trained.
                    patch_params = tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES, "patch_params")
                    grads = opt.compute_gradients(loss, var_list=patch_params)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)
                    tower_target_labels.append(target_labels)

                    # Keep track of losses and logits across for each tower
                    tower_logits.append(logits)
                    tower_losses.append(loss)

                    # Loss for each tower
                    tf.summary.scalar("loss", loss)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-
        # gradients-in-tensorflow-when-updating
        # `loss` here is still the loss of the last tower built above.
        grad_check = ([tf.check_numerics(g, message='Gradient NaN Found!')
                       for g, _ in grads if g is not None]
                      + [tf.check_numerics(loss, message='Loss NaN Found')])

        # Apply the gradients to adjust the shared variables
        with tf.control_dependencies(grad_check):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = opt.apply_gradients(grads, global_step=global_step)

        # Calculate mean loss
        loss = tf.reduce_mean(tower_losses)

        # Calculate accuracy
        logits = tf.concat(tower_logits, axis=0)
        target_labels = tf.concat(tower_target_labels, axis=0)
        acc = met.accuracy(logits, target_labels)

        # Prepare predictions and one-hot labels
        probs = tf.nn.softmax(logits=logits)
        labels_oh = tf.one_hot(batch_labels, num_classes)

        # Group metrics together
        # See: https://cs230-stanford.github.io/tensorflow-model.html
        trn_metrics = {'loss': loss,
                       'labels': batch_labels,
                       'labels_oh': labels_oh,
                       'logits': logits,
                       'probs': probs,
                       'acc': acc,
                       }

        # Reset and read operations for streaming metrics go here
        trn_reset = {}
        trn_read = {}

        # Logging
        tf.summary.scalar('batch_loss', loss)
        tf.summary.scalar('batch_success_rate', acc)

        # Set Saver
        # AG 26/09/2018: Save all variables including Adam so that we can
        # continue training from where we left off
        # max_to_keep=None should keep all checkpoints
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

        # Display number of parameters
        train_params = np.sum([np.prod(v.get_shape().as_list())
                               for v in tf.trainable_variables()]
                              ).astype(np.int32)
        logger.info('Trainable Parameters: {}'.format(train_params))

        # Set summary op
        trn_summary = tf.summary.merge_all()

    #-------------------------------------------------------------------------
    # GRAPH - TRAINING SET ACCURACY
    #-------------------------------------------------------------------------
    logger.info('BUILD TRAINING SET ACCURACY GRAPH')
    g_trn_acc = tf.Graph()
    with g_trn_acc.as_default():
        # Get global_step
        global_step = tf.train.get_or_create_global_step()

        # Get data
        input_dict = create_inputs_train_wholeset()
        batch_x = input_dict['image']
        batch_labels = input_dict['label']

        # AG 10/12/2018: Split batch for multi gpu implementation
        # Each split is of size FLAGS.batch_size / FLAGS.num_gpus
        # See: https://github.com/naturomics/CapsNet-
        # Tensorflow/blob/master/dist_version/distributed_train.py
        splits_x = tf.split(
            axis=0,
            num_or_size_splits=FLAGS.num_gpus,
            value=batch_x)
        splits_labels = tf.split(
            axis=0,
            num_or_size_splits=FLAGS.num_gpus,
            value=batch_labels)

        #---------------------------------------------------------------------
        # MULTI GPU - TRAINING SET ACCURACY
        #---------------------------------------------------------------------
        # Calculate the logits for each model tower
        tower_logits = []
        tower_target_labels = []
        reuse_variables = None
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('tower_%d' % i) as scope:
                    with slim.arg_scope([slim.variable], device='/cpu:0'):
                        loss, logits, x, patch, target_labels = tower_fn(
                            build_arch,
                            splits_x[i],
                            splits_labels[i],
                            scope,
                            num_classes,
                            reuse_variables=reuse_variables,
                            is_train=False)

                    # Don't reuse variable for first GPU, but do reuse for
                    # others
                    reuse_variables = True

                    # Keep track of losses and logits across for each tower
                    tower_logits.append(logits)
                    tower_target_labels.append(target_labels)

                    # Loss for each tower
                    tf.summary.histogram("train_set_logits", logits)

        # Combine logits from all towers
        logits = tf.concat(tower_logits, axis=0)
        target_labels = tf.concat(tower_target_labels, axis=0)

        # Calculate metrics
        train_set_loss = mod.spread_loss(logits, target_labels)
        train_set_acc = met.accuracy(logits, target_labels)

        # Prepare predictions and one-hot labels
        train_set_probs = tf.nn.softmax(logits=logits)
        train_set_labels_oh = tf.one_hot(batch_labels, num_classes)

        # Group metrics together
        # See: https://cs230-stanford.github.io/tensorflow-model.html
        train_set_metrics = {'loss': train_set_loss,
                             'labels': batch_labels,
                             'labels_oh': train_set_labels_oh,
                             'logits': logits,
                             'probs': train_set_probs,
                             'acc': train_set_acc,
                             }

        # Reset and read operations for streaming metrics go here
        train_set_reset = {}
        train_set_read = {}

        # NOTE(review): this rebinds `saver`, shadowing the training graph's
        # Saver created above; the save/restore calls further down use
        # whichever Saver was bound last. Left as in the original -- confirm
        # that this sharing (and the resulting max_to_keep) is intended.
        saver = tf.train.Saver(max_to_keep=None)

        tf.summary.scalar("train_set_loss", train_set_loss)
        tf.summary.scalar("train_set_success_rate", train_set_acc)
        trn_acc_summary = tf.summary.merge_all()

    if dataset_size_val > 0:
        #---------------------------------------------------------------------
        # GRAPH - VALIDATION
        #---------------------------------------------------------------------
        logger.info('BUILD VALIDATION GRAPH')
        g_val = tf.Graph()
        with g_val.as_default():
            # Get global_step
            global_step = tf.train.get_or_create_global_step()

            num_batches_val = int(dataset_size_val / FLAGS.batch_size)

            # Get data
            input_dict = create_inputs_val()
            batch_x = input_dict['image']
            batch_labels = input_dict['label']

            # AG 10/12/2018: Split batch for multi gpu implementation
            # Each split is of size FLAGS.batch_size / FLAGS.num_gpus
            # See: https://github.com/naturomics/CapsNet-
            # Tensorflow/blob/master/dist_version/distributed_train.py
            splits_x = tf.split(
                axis=0,
                num_or_size_splits=FLAGS.num_gpus,
                value=batch_x)
            splits_labels = tf.split(
                axis=0,
                num_or_size_splits=FLAGS.num_gpus,
                value=batch_labels)

            #-----------------------------------------------------------------
            # MULTI GPU - VALIDATE
            #-----------------------------------------------------------------
            # Calculate the logits for each model tower
            tower_logits = []
            tower_target_labels = []
            reuse_variables = None
            for i in range(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('tower_%d' % i) as scope:
                        with slim.arg_scope([slim.variable], device='/cpu:0'):
                            loss, logits, x, patch, target_labels = tower_fn(
                                build_arch,
                                splits_x[i],
                                splits_labels[i],
                                scope,
                                num_classes,
                                reuse_variables=reuse_variables,
                                is_train=False)

                        # Don't reuse variable for first GPU, but do reuse
                        # for others
                        reuse_variables = True

                        # Keep track of losses and logits across for each
                        # tower
                        tower_logits.append(logits)
                        tower_target_labels.append(target_labels)

                        # Loss for each tower
                        tf.summary.histogram("val_logits", logits)

            # take patch and patched images from last tower
            val_patch = patch
            val_x = x

            # Combine logits from all towers
            logits = tf.concat(tower_logits, axis=0)
            target_labels = tf.concat(tower_target_labels, axis=0)

            # Calculate metrics
            val_loss = mod.spread_loss(logits, target_labels)
            val_acc = met.accuracy(logits, target_labels)

            # Prepare predictions and one-hot labels
            val_probs = tf.nn.softmax(logits=logits)
            val_labels_oh = tf.one_hot(batch_labels, num_classes)

            # Group metrics together
            # See: https://cs230-stanford.github.io/tensorflow-model.html
            val_metrics = {'loss': val_loss,
                           'labels': batch_labels,
                           'labels_oh': val_labels_oh,
                           'logits': logits,
                           'probs': val_probs,
                           'acc': val_acc,
                           }
            val_images = {'patch': val_patch,
                          'x': val_x}

            # Reset and read operations for streaming metrics go here
            val_reset = {}
            val_read = {}

            tf.summary.scalar("val_loss", val_loss)
            tf.summary.scalar("val_success_rate", val_acc)

            # Saver
            # NOTE(review): rebinds `saver` once more (see the note in the
            # training-set accuracy graph).
            saver = tf.train.Saver(max_to_keep=1)

            # Set summary op
            val_summary = tf.summary.merge_all()

    #*************************************************************************
    # 2. SESSIONS
    #*************************************************************************

    #----- SESSION TRAIN -----#
    # Perry: added in for RTX 2070 incompatibility workaround
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True
    sess_train = tf.Session(config=config, graph=g_train)

    # Debugger
    # AG 05/06/2018: Debugging using either command line or TensorBoard
    if FLAGS.debugger is not None:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        sess_train = tf_debug.TensorBoardDebugWrapperSession(sess_train,
                                                             FLAGS.debugger)
    open_sessions.append(sess_train)

    with g_train.as_default():
        sess_train.run([tf.global_variables_initializer(),
                        tf.local_variables_initializer()])

        # Restore previous checkpoint
        # AG 26/09/2018: where should this go???
        if FLAGS.load_dir is not None:
            prev_step = load_training(saver, sess_train, FLAGS.load_dir, opt)
        else:
            prev_step = 0

    # Create summary writer, and write the train graph
    summary_writer = tf.summary.FileWriter(train_summary_dir,
                                           graph=sess_train.graph)

    #----- SESSION TRAIN SET ACCURACY -----#
    # Perry: added in for RTX 2070 incompatibility workaround
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True
    sess_train_acc = tf.Session(config=config, graph=g_trn_acc)
    open_sessions.append(sess_train_acc)
    with g_trn_acc.as_default():
        sess_train_acc.run([tf.local_variables_initializer(),
                            tf.global_variables_initializer()])

    if dataset_size_val > 0:
        #----- SESSION VALIDATION -----#
        # Perry: added in for RTX 2070 incompatibility workaround
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True
        sess_val = tf.Session(config=config, graph=g_val)
        open_sessions.append(sess_val)
        with g_val.as_default():
            sess_val.run([tf.local_variables_initializer(),
                          tf.global_variables_initializer()])

    #*************************************************************************
    # 3. MAIN LOOP
    #*************************************************************************
    SUMMARY_FREQ = 100
    SAVE_MODEL_FREQ = num_batches_per_epoch  # 500
    VAL_FREQ = num_batches_per_epoch  # 500
    PROFILE_FREQ = 5

    # Bound before the loop because both the save branch and the evaluation
    # branches read it.
    train_checkpoint_dir = train_dir + '/checkpoint'

    for step in range(prev_step, FLAGS.epoch * num_batches_per_epoch + 1):
        # AG 23/05/2018: limit number of iterations for testing
        # for step in range(100):
        epoch_decimal = step / num_batches_per_epoch
        epoch = int(np.floor(epoch_decimal))

        # TF queue would pop batch until no file
        try:
            # TRAIN
            with g_train.as_default():

                # With profiling
                if (FLAGS.profile is True) and ((step % PROFILE_FREQ) == 0):
                    logger.info("Train with Profiling")
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                # Without profiling
                else:
                    run_options = None
                    run_metadata = None

                # Reset streaming metrics
                if step % (num_batches_per_epoch / 4) == 1:
                    logger.info("Reset streaming metrics")
                    sess_train.run([trn_reset])

                # MAIN RUN
                tic = time.time()
                train_op_v, trn_metrics_v, trn_summary_v = sess_train.run(
                    [train_op, trn_metrics, trn_summary],
                    options=run_options,
                    run_metadata=run_metadata)
                toc = time.time()

                # Read streaming metrics
                trn_read_v = sess_train.run(trn_read)

                # Write summary for profiling
                if run_options is not None:
                    summary_writer.add_run_metadata(
                        run_metadata, 'epoch{:f}'.format(epoch_decimal))

                # Logging
                #logger.info('TRN'
                #      + ' e-{:d}'.format(epoch)
                #      + ' stp-{:d}'.format(step)
                #      + ' {:.2f}s'.format(toc - tic)
                #      + ' loss: {:.4f}'.format(trn_metrics_v['loss'])
                #      + ' acc: {:.2f}%'.format(trn_metrics_v['acc']*100)
                #      )

        except KeyboardInterrupt:
            close_sessions()
            sys.exit()

        except tf.errors.InvalidArgumentError as e:
            # Raised by the check_numerics ops guarding the update; skip
            # this step and carry on with the next batch.
            logger.warning(
                '%d iteration contains NaN gradients. Discard.' % step)
            logger.error(str(e))
            continue

        else:
            # WRITE SUMMARY
            if (step % SUMMARY_FREQ) == 0:
                logger.info("Write Train Summary")
                with g_train.as_default():
                    # Summaries from graph
                    summary_writer.add_summary(trn_summary_v, step)

            # SAVE MODEL
            if (step % SAVE_MODEL_FREQ) == 0:
                logger.info("Save Model")
                with g_train.as_default():
                    if not os.path.exists(train_checkpoint_dir):
                        os.makedirs(train_checkpoint_dir)

                    # Save ckpt from train session
                    ckpt_path = os.path.join(train_checkpoint_dir,
                                             'model.ckpt' + str(epoch))
                    saver.save(sess_train, ckpt_path, global_step=step)

            if (step % VAL_FREQ) == 0:
                # calculate metrics every epoch
                with g_trn_acc.as_default():
                    logger.info("Start Train Set Accuracy")
                    # Restore ckpt to the training-set accuracy session
                    latest_ckpt = tf.train.latest_checkpoint(
                        train_checkpoint_dir)
                    saver.restore(sess_train_acc, latest_ckpt)

                    # Get checkpoint number (does not change per batch)
                    ckpt_num = re.split('-', latest_ckpt)[-1]

                    # Reset accumulators
                    accuracy_sum = 0
                    loss_sum = 0
                    sess_train_acc.run(train_set_reset)

                    for i in range(num_batches_per_epoch):
                        train_set_metrics_v, train_set_summary_str_v = (
                            sess_train_acc.run(
                                [train_set_metrics, trn_acc_summary]))

                        # Update
                        accuracy_sum += train_set_metrics_v['acc']
                        loss_sum += train_set_metrics_v['loss']

                        # Read (this graph's own read ops, not the
                        # validation graph's)
                        trn_read_v = sess_train_acc.run(train_set_read)

                    # Average across batches
                    ave_acc = accuracy_sum / num_batches_per_epoch
                    ave_loss = loss_sum / num_batches_per_epoch

                    logger.info('TRN ckpt-{}'.format(ckpt_num)
                                + ' avg_success: {:.2f}%'.format(ave_acc*100)
                                + ' avg_loss: {:.4f}'.format(ave_loss)
                                )

                    logger.info("Write Train Summary")
                    summary_train = tf.Summary()
                    summary_train.value.add(tag="trn_success",
                                            simple_value=ave_acc)
                    summary_train.value.add(tag="trn_loss",
                                            simple_value=ave_loss)
                    summary_writer.add_summary(summary_train, epoch)

                if dataset_size_val > 0:
                    #----- Validation -----#
                    with g_val.as_default():
                        logger.info("Start Validation")

                        # Restore ckpt to val session
                        latest_ckpt = tf.train.latest_checkpoint(
                            train_checkpoint_dir)
                        saver.restore(sess_val, latest_ckpt)

                        # Get checkpoint number (does not change per batch)
                        ckpt_num = re.split('-', latest_ckpt)[-1]

                        # Reset accumulators
                        accuracy_sum = 0
                        loss_sum = 0
                        sess_val.run(val_reset)

                        for i in range(num_batches_val):
                            if i == num_batches_val - 1:
                                # take a sample of patched images on the
                                # last validation batch
                                (val_metrics_v, val_summary_str_v,
                                 val_images_v) = sess_val.run(
                                     [val_metrics, val_summary, val_images])
                                x = val_images_v['x']
                                patch = val_images_v['patch']
                            else:
                                val_metrics_v, val_summary_str_v = (
                                    sess_val.run([val_metrics, val_summary]))

                            # Update
                            accuracy_sum += val_metrics_v['acc']
                            loss_sum += val_metrics_v['loss']

                            # Read
                            val_read_v = sess_val.run(val_read)

                            # Logging
                            #logger.info('VAL ckpt-{}'.format(ckpt_num)
                            #    + ' bch-{:d}'.format(i)
                            #    + ' cum_acc: {:.2f}%'.format(
                            #        accuracy_sum/(i+1)*100)
                            #    + ' cum_loss: {:.4f}'.format(loss_sum/(i+1))
                            #    )

                        # Average across batches
                        ave_acc = accuracy_sum / num_batches_val
                        ave_loss = loss_sum / num_batches_val

                        logger.info(
                            'VAL ckpt-{}'.format(ckpt_num)
                            + ' avg_success: {:.2f}%'.format(ave_acc*100)
                            + ' avg_loss: {:.4f}'.format(ave_loss)
                            )

                        logger.info("Write Val Summary")
                        summary_val = tf.Summary()
                        summary_val.value.add(tag="val_success",
                                              simple_value=ave_acc)
                        summary_val.value.add(tag="val_loss",
                                              simple_value=ave_loss)
                        summary_writer.add_summary(summary_val, epoch)
                        log_images(summary_writer, "patch", [patch], epoch)
                        log_images(summary_writer, "patched_input", x, epoch)

                        # Drop a trailing single-channel axis before
                        # handing the patch to PIL, then store it as PNG.
                        if patch.shape[-1] == 1:
                            patch = np.squeeze(patch, axis=-1)
                        formatted = (patch * 255).astype('uint8')
                        img = Image.fromarray(formatted)
                        img.save(os.path.join(train_dir, "saved_patch.png"))

    # Close (main loop)
    close_sessions()
    sys.exit()
# M = VDEModelDesc(info_params) hps = get_default_hparams() hps.C = 10 hps.T = 28 hps.D = 28 hps.n_z = 28 M = ModelDesc(hps) logger.auto_set_dir(action='d') ds_train, ds_test = get_mnist_data() # sess = SessionCreatorAdapter(NewSessionCreator(), lambda sess: tf_debug.LocalCLIDebugWrapperSession(sess)) # sess = tf_debug.TensorBoardDebugWrapperSession(sess, "nam-pc:7000") creator = SessionCreatorAdapter( NewSessionCreator(), lambda sess: tf_debug.TensorBoardDebugWrapperSession( sess, "nam-pc:7000")) # Trainer(input=QueueInput(ds_train), model=M).train_with_defaults( # callbacks=[ # ModelSaver(), # callbacks.MergeAllSummaries(), # MinSaver('total_loss'), # InferenceRunner(ds_test, [ScalarStats('predict_trend/accuracy_')]) # ], # steps_per_epoch=info_params.steps_per_epoch, # max_epoch=info_params.epochs, # # session_init=SaverRestore(args.load) if args.load else None # ) Trainer(input=QueueInput(ds_train), model=M).train_with_defaults( callbacks=[
def __init__(self, params=None):
    """Build the network graph, open a session and initialise variables.

    The constructor creates a time-stamped model directory, stores the
    parameters there as params.json, builds encoder -> recurrent module ->
    decoder -> loss -> optimizer -> metrics, opens an interactive session
    (wrapped by the TensorBoard debugger when MODE is "DEBUG"), creates the
    summary writers and runs the global and local variable initialisers.

    Args:
        params: nested parameter mapping (the "TRAIN", "DIRS", "VIS" and
            "MODE" entries are read here). When None, the parameters are
            obtained from utils.read_params().

    Raises:
        ValueError: if params["TRAIN"]["LOSS_FCN"] is not one of
            "FOCAL_LOSS", "WEIGHTED_SOFTMAX" or "SOFTMAX".
    """
    # read params
    if params is None:
        self.params = utils.read_params()
    else:
        self.params = params

    if self.params["TRAIN"]["INITIALIZER"] == "XAVIER":
        init = tf.contrib.layers.xavier_initializer()
    else:
        init = tf.random_normal_initializer()

    self.CREATE_TIME = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    self.MODEL_DIR = "{}/model_{}".format(
        self.params["DIRS"]["MODELS_LOCAL"], self.CREATE_TIME)
    utils.make_dir(self.MODEL_DIR)

    # Keep a copy of the parameters next to the model outputs.
    with open(self.MODEL_DIR + '/params.json', 'w') as f:
        json.dump(self.params, f)

    # place holders
    with tf.name_scope("Data"):
        self.X = tf.placeholder(tf.float32, [None, None, None, None, None])
    with tf.name_scope("Labels"):
        # Decoder modes whose name contains "64" use a 64^3 label grid,
        # all others a 32^3 grid.
        if "64" in self.params["TRAIN"]["DECODER_MODE"]:
            self.Y_onehot = tf.placeholder(tf.float32,
                                           [None, 64, 64, 64, 2])
        else:
            self.Y_onehot = tf.placeholder(tf.float32,
                                           [None, 32, 32, 32, 2])
    with tf.name_scope("LearningRate"):
        self.LR = tf.placeholder(tf.float32, [])

    print("Initializing Network")
    #pp = preprocessor.Preprocessor(self.X) # here
    #X_preprocessed = pp.out_tensor # here
    X_preprocessed = self.X  # (n_batch, n_views, 127, 127, 3)
    n_batchsize = tf.shape(X_preprocessed)[0]

    # switch batch <-> nviews
    X_preprocessed = tf.transpose(X_preprocessed, [1, 0, 2, 3, 4])

    # encoder
    print("encoder")
    encoder_mode = self.params["TRAIN"]["ENCODER_MODE"]
    if encoder_mode == "DILATED":
        en = encoder.Dilated_Encoder(X_preprocessed)
    elif encoder_mode == "RESIDUAL":
        en = encoder.Residual_Encoder(X_preprocessed)
    elif encoder_mode == "SERESNET":
        en = encoder.SENet_Encoder(X_preprocessed)
    else:
        en = encoder.Simple_Encoder(X_preprocessed)
    encoded_input = en.out_tensor

    # switch batch <-> nviews
    encoded_input = tf.transpose(encoded_input, [1, 0, 2])
    X_preprocessed = tf.transpose(X_preprocessed, [1, 0, 2, 3, 4])

    # visualize transformation of input state to voxel
    if self.params["VIS"]["ENCODER_PROCESS"]:
        with tf.name_scope("misc"):
            feature_maps = tf.get_collection("feature_maps")
            fm_list = []
            for fm in feature_maps:
                fm_slice = fm[0, 0, :, :, 0]
                #fm_shape = fm_slice.get_shape().as_list()
                fm_shape = tf.shape(fm_slice)
                fm_slice = tf.pad(fm_slice,
                                  [[0, 0], [127 - fm_shape[0], 0]])
                fm_list.append(fm_slice)
            fm_img = tf.concat(fm_list, axis=0)
            tf.summary.image(
                "feature_map_list",
                tf.expand_dims(tf.expand_dims(fm_img, -1), 0))

    # recurrent_module
    print("recurrent_module")
    with tf.name_scope("Recurrent_module"):
        rnn_mode = self.params["TRAIN"]["RNN_MODE"]
        n_cell = self.params["TRAIN"]["RNN_CELL_NUM"]
        n_hidden = self.params["TRAIN"]["RNN_HIDDEN_SIZE"]
        if rnn_mode == "LSTM":
            rnn = recurrent_module.LSTM_Grid(initializer=init)
            # LSTM carries a (hidden, cell) pair of zero states.
            hidden_state = (
                tf.zeros([n_batchsize, n_cell, n_cell, n_cell, n_hidden],
                         name="zero_hidden_state"),
                tf.zeros([n_batchsize, n_cell, n_cell, n_cell, n_hidden],
                         name="zero_cell_state"))
        else:
            rnn = recurrent_module.GRU_Grid(initializer=init)
            hidden_state = tf.zeros(
                [n_batchsize, n_cell, n_cell, n_cell, n_hidden],
                name="zero_hidden_state")

        #n_timesteps = self.params["TRAIN"]["TIME_STEP_COUNT"]
        n_timesteps = np.shape(X_preprocessed)[1]
        # feed a limited sequence of images
        if isinstance(n_timesteps, int) and n_timesteps > 0:
            for t in range(n_timesteps):
                hidden_state = rnn.call(encoded_input[:, t, :],
                                        hidden_state)
        else:  # feed an arbitrary sequence of images
            n_timesteps = tf.shape(X_preprocessed)[1]
            t = tf.constant(0)

            def condition(h, t):
                return tf.less(t, n_timesteps)

            def body(h, t):
                h = rnn.call(encoded_input[:, t, :], h)
                t = tf.add(t, 1)
                return h, t

            hidden_state, t = tf.while_loop(condition, body,
                                            (hidden_state, t))

    # decoder
    print("decoder")
    # Only the first element of a tuple state is decoded.
    if isinstance(hidden_state, tuple):
        hidden_state = hidden_state[0]
    decoder_mode = self.params["TRAIN"]["DECODER_MODE"]
    if decoder_mode == "DILATED":
        de = decoder.Dilated_Decoder(hidden_state)
    elif decoder_mode == "RESIDUAL":
        de = decoder.Residual_Decoder(hidden_state)
    elif decoder_mode == "RESIDUAL64":
        de = decoder.Residual_Decoder64(hidden_state)
    elif decoder_mode == "SERESNET":
        de = decoder.SENet_Decoder(hidden_state)
    elif decoder_mode == "SERESNET64":
        de = decoder.SENet_Decoder64(hidden_state)
    else:
        de = decoder.Simple_Decoder(hidden_state)
    self.logits = de.out_tensor

    # visualize transformation of hidden state to voxel
    if self.params["VIS"]["DECODER_PROCESS"]:
        with tf.name_scope("misc"):
            feature_voxels = tf.get_collection("feature_voxels")
            fv_list = []
            for fv in feature_voxels:
                fv_slice = fv[0, :, :, 0, 0]
                fv_shape = fv_slice.get_shape().as_list()
                if "64" in self.params["TRAIN"]["DECODER_MODE"]:
                    fv_slice = tf.pad(fv_slice,
                                      [[0, 0], [64 - fv_shape[0], 0]])
                else:
                    fv_slice = tf.pad(fv_slice,
                                      [[0, 0], [32 - fv_shape[0], 0]])
                fv_list.append(fv_slice)
            fv_img = tf.concat(fv_list, axis=0)
            tf.summary.image(
                "feature_voxel_list",
                tf.expand_dims(tf.expand_dims(fv_img, -1), 0))

    # loss
    print("loss")
    loss_fcn = self.params["TRAIN"]["LOSS_FCN"]
    if loss_fcn == "FOCAL_LOSS":
        voxel_loss = loss.Focal_Loss(self.Y_onehot, self.logits)
        self.softmax = voxel_loss.pred
    elif loss_fcn == "WEIGHTED_SOFTMAX":
        voxel_loss = loss.Weighted_Voxel_Softmax(self.Y_onehot,
                                                 self.logits)
        self.softmax = voxel_loss.softmax
    elif loss_fcn == "SOFTMAX":
        voxel_loss = loss.Voxel_Softmax(self.Y_onehot, self.logits)
        self.softmax = voxel_loss.softmax
    else:
        # Raise instead of aborting the process so the message is
        # reported reliably and callers can handle the error.
        raise ValueError(
            "WRONG LOSS FUNCTION. CHECK LOSS: {!r}".format(loss_fcn))
    self.loss = voxel_loss.loss
    tf.summary.scalar("loss", self.loss)

    # misc
    print("misc")
    with tf.name_scope("misc"):
        self.step_count = tf.Variable(0, trainable=False,
                                      name="step_count")
        self.print = tf.Print(self.loss,
                              [self.step_count, self.loss, t])

    # optimizer
    print("optimizer")
    if self.params["TRAIN"]["OPTIMIZER"] == "ADAM":
        optimizer = tf.train.AdamOptimizer(
            learning_rate=self.LR,
            epsilon=self.params["TRAIN"]["ADAM_EPSILON"])
        # Summarise the fed learning rate directly rather than reading
        # it back through the optimizer's private attribute.
        tf.summary.scalar("adam_learning_rate", self.LR)
    else:
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=self.LR)
        tf.summary.scalar("learning_rate", self.LR)

    grads_and_vars = optimizer.compute_gradients(self.loss)
    self.apply_grad = optimizer.apply_gradients(
        grads_and_vars, global_step=self.step_count)

    # metric
    print("metrics")
    with tf.name_scope("metrics"):
        Y = tf.argmax(self.Y_onehot, -1)
        predictions = tf.argmax(self.softmax, -1)
        acc, acc_op = tf.metrics.accuracy(Y, predictions)
        rms, rms_op = tf.metrics.root_mean_squared_error(
            self.Y_onehot, self.softmax)
        iou, iou_op = tf.metrics.mean_iou(Y, predictions, 2)
        self.metrics_op = tf.group(acc_op, rms_op, iou_op)

    tf.summary.scalar("accuracy", acc)
    tf.summary.scalar("rmse", rms)
    tf.summary.scalar("iou", iou)

    # initialize
    # config=tf.ConfigProto(log_device_placement=True)
    print("setup")
    self.summary_op = tf.summary.merge_all()
    self.sess = tf.InteractiveSession()
    if self.params["MODE"] == "DEBUG":
        self.sess = tf_debug.TensorBoardDebugWrapperSession(
            self.sess,
            "nat-oitwireless-inside-vapornet100-c-15126.Princeton.EDU:6064"
        )

    # summaries
    print("summaries")
    if self.params["MODE"] == "TEST":
        self.test_writer = tf.summary.FileWriter(
            "{}/test".format(self.MODEL_DIR), self.sess.graph)
    else:
        self.train_writer = tf.summary.FileWriter(
            "{}/train".format(self.MODEL_DIR), self.sess.graph)
        self.val_writer = tf.summary.FileWriter(
            "{}/val".format(self.MODEL_DIR), self.sess.graph)

    # initialize
    print("initialize")
    tf.global_variables_initializer().run()
    tf.local_variables_initializer().run()
    print('trainable vars:', len(tf.trainable_variables()))
    print("...done!")
def run_model(model, raw_cohort, delta_encoder):
    """
    Train and evaluate the given model on the given cohort using the experimental settings in FLAGS.

    This function:
    (1) prints descriptive statistics about the raw cohort
    (2) balances the dataset
    (3) splits the cohort into training:development:testing sets at the patient-level
    (4) trains PRONTO and saves checkpoints/summaries for TensorBoard
    (5) evaluates PRONTO on the development and testing sets after every epoch
    (6) optionally prints/saves the metrics of the best epoch (as measured by development F2)

    :param model: an instantiated PRONTO model
    :type model: modeling.PRONTOModel
    :param raw_cohort: the cohort to use for this experimental run
    :type raw_cohort: preprocess.Cohort
    :param delta_encoder: encoder used to represent elapsed time deltas
    :type delta_encoder: preprocess.DeltaEncoder
    :return: nothing
    :raises NotImplementedError: if FLAGS.optimizer is neither 'PRONTO' nor 'BERT'
    """
    # Import the subpackage explicitly: a bare ``import scipy`` is not guaranteed to load ``scipy.stats``
    import scipy.stats

    seconds_per_day = 60 * 60 * 24

    def make_batches(subset):
        """Prepare one epoch of mini-batches for the given sub-cohort using the batch settings in FLAGS."""
        return subset.make_epoch_batches(batch_size=FLAGS.batch_size,
                                         max_snapshot_size=FLAGS.max_snapshot_size,
                                         max_chrono_length=FLAGS.max_chrono_length,
                                         delta_encoder=delta_encoder)

    def format_metrics(metrics: dict):
        """Render every metric value with four decimals for the progress log."""
        return dict((key, '%6.4f' % value) for key, value in metrics.items())

    # Descriptive statistics of the raw (un-balanced) cohort
    snapshot_sizes = [len(snapshot)
                      for chronology in raw_cohort.chronologies()
                      for snapshot in chronology.snapshots]
    print('Statistics on snapshot sizes:', scipy.stats.describe(snapshot_sizes))

    # Deltas are accumulated in seconds and reported in days
    days_til_onset = [sum(chronology.deltas) / 60 / 60 / 24
                      for chronology in raw_cohort.chronologies()]
    print('Statistics on days until disease onset:', scipy.stats.describe(days_til_onset))

    elapsed_times = [delta / 60 / 60 / 24
                     for chronology in raw_cohort.chronologies()
                     for delta in chronology.deltas]
    print('Statistics on elapsed time:', scipy.stats.describe(elapsed_times))

    lengths = [len(chronology) for chronology in raw_cohort.chronologies()]
    print('Statistics on chronology lengths:', scipy.stats.describe(lengths))

    # Balance the cohort to have an even number of positive/negative chronologies for each patient
    cohort = raw_cohort.balance_chronologies()

    # Split into training:development:testing
    train, devel, test = make_train_devel_test_split(cohort.patients(), FLAGS.tdt_ratio)

    # Save summaries and checkpoints into the directories passed to the script
    model_file = 'ln=%d_delta=%s_d=%.2f_vd=%.2f_lr=%g_bs=%d' % (
        1 if FLAGS.rnn_layer_norm else 0,
        'disc' if FLAGS.use_discrete_deltas else 'tanh',
        FLAGS.dropout,
        FLAGS.vocab_dropout,
        FLAGS.learning_rate,
        FLAGS.batch_size,
    )
    model_summaries_dir = os.path.join(FLAGS.output_dir, FLAGS.optimizer, FLAGS.rnn_cell_type,
                                       FLAGS.snapshot_encoder, model_file)
    model_checkpoint_dir = os.path.join(model_summaries_dir, 'pronto_model')

    # Clear any previous summaries/checkpoints if asked
    if FLAGS.clear_prev:
        nio.delete_dir_quiet(model_summaries_dir)
        nio.delete_dir_quiet(model_checkpoint_dir)
        print('Deleted previous model summaries/checkpoints')

    # Make output directories so we don't blow up when saving
    nio.make_dirs_quiet(model_checkpoint_dir)

    # Instantiate PRONTO optimizer and summarizer classes
    if FLAGS.optimizer == 'PRONTO':
        optimizer = optimization.PRONTOOptimizer(model, learning_rate=FLAGS.learning_rate, sparse=True)
    elif FLAGS.optimizer == 'BERT':
        # The BERT-like schedule needs the total number of steps up front; warm up for three epochs
        epoch_steps = len(make_batches(cohort[train]))
        optimizer = optimization.BERTOptimizer(model,
                                               num_train_steps=epoch_steps * FLAGS.num_epochs,
                                               num_warmup_steps=epoch_steps * 3,
                                               init_lr=FLAGS.learning_rate)
        print('Created BERT-like optimizer with initial learning rate of %f' % FLAGS.learning_rate)
    else:
        raise NotImplementedError('No optimizer available for %s' % FLAGS.optimizer)

    summarizer = summarization.PRONTOSummarizer(model, optimizer)

    # Now that everything has been defined in TensorFlow's computation graph, initialize our model saver
    saver = tf.train.Saver(tf.global_variables())

    # The training cohort is re-balanced every epoch; development/testing always use the first balancing
    first_cohort = cohort

    with tf.Session() as sess:
        # If we specified a TensorBoard debug server, connect to it
        if FLAGS.debug is not None:
            sess = tf_debug.TensorBoardDebugWrapperSession(sess, FLAGS.debug)

        # Create our summary writer (used by TensorBoard)
        summary_writer = tf.summary.FileWriter(model_summaries_dir, sess.graph)

        # Restore model if it exists (and we didn't clear it), otherwise create a new one
        checkpoint = tf.train.get_checkpoint_state(model_checkpoint_dir)
        if checkpoint and gfile.Exists(checkpoint.model_checkpoint_path + '.index'):
            print("Reading model parameters from '%s'..." % checkpoint.model_checkpoint_path)
            saver.restore(sess, checkpoint.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters...")
            sess.run(tf.global_variables_initializer())

        # Initialize local variables (these are just used for computing average metrics)
        sess.run(tf.local_variables_initializer())

        # Create a progress logger to monitor training (this is a wrapped version of range())
        with trange(FLAGS.num_epochs, desc='Training') as train_log:
            # Track the training, development, and testing metrics of the best model (as measured by devel F2).
            # best_devel_metrics starts with a zero F2 so the first epoch can be compared against it.
            best_train_metrics, best_devel_metrics, best_test_metrics = {}, {'F2': 0}, {}

            # Iterate over training epochs
            for i in train_log:
                # Get global step and reset training metrics
                global_step, _ = sess.run([optimizer.global_step, summarizer.train.reset_op])

                total_loss = 0.
                batches = make_batches(cohort[train])
                num_batches = len(batches)
                with tqdm(batches, desc='Epoch %d' % (i + 1)) as batch_log:
                    for batch in batch_log:
                        # Run the training op and update the streaming training metrics in the same call;
                        # also fetch the batch summary/metrics and the new global step (used by TensorBoard
                        # to coordinate metrics across different runs)
                        _, batch_summary, batch_metrics, global_step = sess.run(
                            [[optimizer.train_op, summarizer.train.metric_ops],  # Fetches we aren't going to read
                             summarizer.batch_summary, summarizer.batch_metrics,
                             optimizer.global_step],
                            batch.feed(model, training=True))

                        # Update tqdm progress indicator with current training metrics on this batch
                        batch_log.set_postfix(batch_metrics)

                        # Save batch-level summaries
                        summary_writer.add_summary(batch_summary, global_step=global_step)

                        total_loss += batch_metrics['Loss']

                # Save epoch-level training metrics and summaries
                train_metrics, train_summary = sess.run([summarizer.train.metrics, summarizer.train.summary])
                train_metrics['Loss'] = total_loss / num_batches
                summary_writer.add_summary(train_summary, global_step=global_step)

                # Re-sample chronologies in cohort
                cohort = raw_cohort.balance_chronologies()

                # Evaluate development performance
                sess.run(summarizer.devel.reset_op)
                # Update local variables used to compute development metrics as we process each batch
                for devel_batch in make_batches(first_cohort[devel]):
                    sess.run([summarizer.devel.metric_ops], devel_batch.feed(model, training=False))
                # Compute the development metrics
                devel_metrics, devel_summary = sess.run([summarizer.devel.metrics, summarizer.devel.summary])
                # Update training progress bar to indicate current performance on development set
                train_log.set_postfix(devel_metrics)
                # Save TensorBoard summary
                summary_writer.add_summary(devel_summary, global_step=global_step)

                train_log.write('Epoch %d. Train: %s | Devel: %s' % (i + 1,
                                                                     format_metrics(train_metrics),
                                                                     format_metrics(devel_metrics)))

                # Evaluate testing performance exactly as described above for development
                sess.run(summarizer.test.reset_op)
                for test_batch in make_batches(first_cohort[test]):
                    sess.run([summarizer.test.metrics, summarizer.test.metric_ops],
                             test_batch.feed(model, training=False))
                test_metrics, test_summary = sess.run([summarizer.test.metrics, summarizer.test.summary])
                summary_writer.add_summary(test_summary, global_step=global_step)

                # If this run did better on the dev set, save it as the new best model
                if devel_metrics['F2'] > best_devel_metrics['F2']:
                    best_devel_metrics = devel_metrics
                    best_train_metrics = train_metrics
                    best_test_metrics = test_metrics
                    # Save the model
                    saver.save(sess, model_checkpoint_dir, global_step=global_step)
                elif FLAGS.early_term:
                    # Stop at the first epoch that fails to improve development F2
                    tqdm.write('Early termination!')
                    break

        print('Training complete!')

        if FLAGS.print_performance:
            print('Train: %s' % str(best_train_metrics))
            print('Devel: %s' % str(best_devel_metrics))
            print('Test: %s' % str(best_test_metrics))

        if FLAGS.save_tabbed_results:
            with open(os.path.join(model_summaries_dir, 'results.tsv'), 'w') as outfile:
                print_table_results(best_train_metrics, best_devel_metrics, best_test_metrics, 'simple',
                                    file=outfile)

        if FLAGS.save_latex_results:
            with open(os.path.join(model_summaries_dir, 'results.tex'), 'w') as outfile:
                print_table_results(best_train_metrics, best_devel_metrics, best_test_metrics, 'latex_booktabs',
                                    file=outfile)
def main(_):
    """Train a small MNIST classifier, optionally under a tfdbg wrapper session.

    The cross-entropy in this graph is deliberately computed in a numerically
    unstable way so that the debugger has bad values (inf/nan) to find; see
    the comment at the loss definition.

    Args:
      _: Unused positional argument.

    Raises:
      ValueError: If both FLAGS.debug and FLAGS.tensorboard_debug_address are
        set.
    """
    # Imported locally; only needed to close the descriptor from mkstemp().
    import os

    # Import data
    if FLAGS.fake_data:
        imgs = tf.random.uniform(maxval=256, shape=(10, 28, 28), dtype=tf.int32)
        labels = tf.random.uniform(maxval=10, shape=(10, ), dtype=tf.int32)
        mnist_train = imgs, labels
        mnist_test = imgs, labels
    else:
        mnist_train, mnist_test = tf.keras.datasets.mnist.load_data()

    def format_example(imgs, labels):
        """Flatten images, scale pixels to [0, 1] and one-hot encode labels."""
        imgs = tf.reshape(imgs, [-1, 28 * 28])
        imgs = tf.cast(imgs, tf.float32) / 255.0
        labels = tf.one_hot(labels, depth=10, dtype=tf.float32)
        return imgs, labels

    ds_train = tf.data.Dataset.from_tensor_slices(mnist_train)
    ds_train = ds_train.shuffle(1000, seed=RAND_SEED).repeat().batch(
        FLAGS.train_batch_size)
    ds_train = ds_train.map(format_example)
    it_train = ds_train.make_initializable_iterator()

    # The whole test set is served as a single, endlessly repeated element.
    ds_test = tf.data.Dataset.from_tensors(mnist_test).repeat()
    ds_test = ds_test.map(format_example)
    it_test = ds_test.make_initializable_iterator()

    sess = tf.InteractiveSession()

    # Create the MNIST neural network graph.

    # Input placeholders. A string handle selects, per run() call, whether the
    # train or the test iterator feeds the graph.
    with tf.name_scope("input"):
        handle = tf.placeholder(tf.string, shape=())
        iterator = tf.data.Iterator.from_string_handle(
            handle, (tf.float32, tf.float32),
            ((None, IMAGE_SIZE * IMAGE_SIZE), (None, 10)))
        x, y_ = iterator.get_next()

    def weight_variable(shape):
        """Create a weight variable with appropriate initialization."""
        initial = tf.truncated_normal(shape, stddev=0.1, seed=RAND_SEED)
        return tf.Variable(initial)

    def bias_variable(shape):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def nn_layer(input_tensor, input_dim, output_dim, layer_name,
                 act=tf.nn.relu):
        """Reusable code for making a simple neural net layer."""
        # Adding a name scope ensures logical grouping of the layers in the
        # graph.
        with tf.name_scope(layer_name):
            # This Variable will hold the state of the weights for the layer
            with tf.name_scope("weights"):
                weights = weight_variable([input_dim, output_dim])
            with tf.name_scope("biases"):
                biases = bias_variable([output_dim])
            with tf.name_scope("Wx_plus_b"):
                preactivate = tf.matmul(input_tensor, weights) + biases

            activations = act(preactivate)
            return activations

    hidden = nn_layer(x, IMAGE_SIZE**2, HIDDEN_SIZE, "hidden")
    logits = nn_layer(hidden, HIDDEN_SIZE, NUM_LABELS, "output", tf.identity)
    y = tf.nn.softmax(logits)

    with tf.name_scope("cross_entropy"):
        # The following line is the culprit of the bad numerical values that
        # appear during training of this graph. Log of zero gives inf, which
        # is first seen in the intermediate tensor "cross_entropy/Log:0"
        # during the 4th run() call. A multiplication of the inf values with
        # zeros leads to nans, which is first in "cross_entropy/mul:0".
        #
        # You can use the built-in, numerically-stable implementation to fix
        # this issue:
        #   diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
        #                                                  logits=logits)
        diff = -(y_ * tf.log(y))
        with tf.name_scope("total"):
            cross_entropy = tf.reduce_mean(diff)

    with tf.name_scope("train"):
        train_step = tf.train.AdamOptimizer(
            FLAGS.learning_rate).minimize(cross_entropy)

    with tf.name_scope("accuracy"):
        with tf.name_scope("correct_prediction"):
            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        with tf.name_scope("accuracy"):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    sess.run(tf.global_variables_initializer())
    sess.run(it_train.initializer)
    sess.run(it_test.initializer)
    train_handle = sess.run(it_train.string_handle())
    test_handle = sess.run(it_test.string_handle())

    if FLAGS.debug and FLAGS.tensorboard_debug_address:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")
    if FLAGS.debug:
        if FLAGS.use_random_config_path:
            # mkstemp() hands back an already-open OS-level descriptor; only
            # the path is needed here, so close it instead of leaking it.
            config_fd, config_file_path = tempfile.mkstemp(".tfdbg_config")
            os.close(config_fd)
        else:
            config_file_path = None
        sess = tf_debug.LocalCLIDebugWrapperSession(
            sess, ui_type=FLAGS.ui_type, config_file_path=config_file_path)
    elif FLAGS.tensorboard_debug_address:
        sess = tf_debug.TensorBoardDebugWrapperSession(
            sess, FLAGS.tensorboard_debug_address)

    # At this point, sess is a debug wrapper around the actual Session if
    # FLAGS.debug is true. In that case, calling run() will launch the CLI.
    for i in range(FLAGS.max_steps):
        acc = sess.run(accuracy, feed_dict={handle: test_handle})
        print("Accuracy at step %d: %s" % (i, acc))

        sess.run(train_step, feed_dict={handle: train_handle})
def main(config):
    """Build and train a two-layer MNIST classifier, optionally under tfdbg.

    Reads from ``config``: data_dir, fake_data, batch_size, image_size,
    num_classes, hidden_size, seed, learning_rate, debug, ui_type,
    tensorboard_debug_address and max_steps.

    Raises:
      ValueError: If both ``config.debug`` and
        ``config.tensorboard_debug_address`` are set.
    """
    # Import data
    dataset = input_data.read_data_sets(config.data_dir,
                                        one_hot=True,
                                        fake_data=config.fake_data)

    def build_feed(use_train_split):
        """Return the placeholder feed for one training or evaluation run."""
        if not (use_train_split or config.fake_data):
            # Evaluation on real data uses the complete test split.
            images, targets = dataset.test.images, dataset.test.labels
        else:
            images, targets = dataset.train.next_batch(
                config.batch_size, fake_data=config.fake_data)
        return {inputs_ph: images, labels_ph: targets}

    def truncated_normal_init():
        """Create a fresh seeded initializer for a dense kernel."""
        return tf.initializers.truncated_normal(stddev=0.1, seed=config.seed)

    session = tf.InteractiveSession()

    # Create the MNIST neural network graph.

    # Input placeholders.
    with tf.name_scope("input"):
        inputs_ph = tf.placeholder(tf.float32,
                                   [None, config.image_size**2],
                                   name="x-input")
        labels_ph = tf.placeholder(tf.float32,
                                   [None, config.num_classes],
                                   name="y-input")

    hidden_out = tf.layers.dense(inputs_ph,
                                 config.hidden_size,
                                 activation=tf.nn.relu,
                                 kernel_initializer=truncated_normal_init(),
                                 name="hidden")
    class_logits = tf.layers.dense(hidden_out,
                                   config.num_classes,
                                   kernel_initializer=truncated_normal_init(),
                                   name="logits")
    probabilities = tf.nn.softmax(class_logits)

    with tf.name_scope("cross_entropy"):
        # This hand-rolled cross entropy is what produces the bad numerical
        # values during training: log of zero gives inf (first visible in
        # "cross_entropy/Log:0" during the 4th run() call), and multiplying
        # inf by zero then yields nan (first visible in
        # "cross_entropy/mul:0").
        #
        # The built-in, numerically-stable alternative is:
        #   diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
        #                                                  logits=logits)
        per_class_loss = -(labels_ph * tf.log(probabilities))
        with tf.name_scope("total"):
            mean_loss = tf.reduce_mean(per_class_loss)

    with tf.name_scope("train"):
        adam = tf.train.AdamOptimizer(config.learning_rate)
        train_op = adam.minimize(mean_loss)

    with tf.name_scope("accuracy"):
        with tf.name_scope("correct_prediction"):
            is_correct = tf.equal(tf.argmax(probabilities, 1),
                                  tf.argmax(labels_ph, 1))
        with tf.name_scope("accuracy"):
            accuracy_op = tf.reduce_mean(tf.cast(is_correct, tf.float32))

    session.run(tf.global_variables_initializer())

    if config.debug and config.tensorboard_debug_address:
        raise ValueError(
            "The --debug and --tensorboard_debug_address config are mutually exclusive."
        )
    if config.debug:
        session = tf_debug.LocalCLIDebugWrapperSession(session,
                                                       ui_type=config.ui_type)
    elif config.tensorboard_debug_address:
        session = tf_debug.TensorBoardDebugWrapperSession(
            session, config.tensorboard_debug_address)

    # From here on, session is a debug wrapper around the actual Session if
    # config.debug is true. In that case, calling run() will launch the CLI.
    for step in range(config.max_steps):
        current_accuracy = session.run(accuracy_op,
                                       feed_dict=build_feed(False))
        print("Accuracy at step %d: %s" % (step, current_accuracy))

        session.run(train_op, feed_dict=build_feed(True))
def train():
    """Train an actor-critic agent on Pendulum-v0 and log summaries.

    Steps:
      1. Fill the replay memory with 10000 random-action transitions.
      2. For each episode, roll out the current policy for ``max_time_steps``
         environment steps, storing every transition in the replay memory.
      3. After each episode, run 50 critic/policy updates on sampled
         mini-batches, updating the target networks after every update.
      4. Write the merged summaries once per episode.
    """
    gamma = 0.99
    episodes = 100
    batch_size = 128
    max_time_steps = 200
    episode_reward = 0
    reward_history = []

    env = gym.make('Pendulum-v0')
    obs_old = env.reset()
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    agent_policy = Policy(env, "policy")
    agent_critic = Value(env, "value")

    # Initial rollouts with random actions to gather data
    for _ in range(10000):
        action = env.action_space.sample()
        obs, rew, done, info = env.step(action)
        episode_reward += rew
        memory.append(obs_old, action, rew, obs, done)
        if done:
            episode_reward = 0
            # Continue from the observation of the *new* episode; carrying the
            # terminal observation over would store a transition that never
            # happened in the environment.
            obs_old = env.reset()
        else:
            obs_old = obs

    episode_reward = 0
    time_t = 200
    # NOTE(review): both summaries are built from plain Python values at graph
    # construction time, so they appear to record the constants 200 and 0 on
    # every write rather than per-episode values -- confirm and feed them
    # through placeholders if per-episode curves are wanted.
    tf.summary.scalar("episode_time_steps", time_t)
    tf.summary.scalar("episode_reward", episode_reward)
    merged = tf.summary.merge_all()

    with tf.Session() as sess:
        # Wrapper session used only for the one-off parameter dump below.
        sess1 = tf_debug.TensorBoardDebugWrapperSession(sess, "Vader:6007")

        from datetime import datetime
        now = datetime.now()
        train_writer = tf.summary.FileWriter(
            './train/' + now.strftime("%Y%m%d-%H%M%S") + '/', sess.graph)

        # Target networks are created before initialization so that their
        # variables are covered by the initializers below.
        agent_critic.create_target(0.1)
        agent_policy.create_target(0.1)

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess1.run(agent_policy.get_trainable_parameters())

        for i in range(episodes):
            print('running episode:', i)
            t = 0
            while t < max_time_steps:
                action = agent_policy.predict(obs_old)
                action = action.reshape(-1)
                obs, rew, done, info = env.step(action)
                episode_reward += rew
                memory.append(obs_old, action, rew, obs, done)

                if done or t == max_time_steps - 1:
                    reward_history.append(episode_reward)
                    episode_reward = 0
                    # Start the next rollout from the reset observation.
                    obs_old = env.reset()
                else:
                    obs_old = obs
                t += 1

            for _ in range(50):
                batch = memory.sample(batch_size)

                # NOTE(review): observations are centred and divided by the
                # per-feature variance (not the standard deviation), and only
                # here -- the rollout above predicts on raw observations.
                # Confirm this is intended.
                obs_batch = batch['obs0']
                obs_batch -= np.mean(obs_batch, 0)
                obs_batch = obs_batch / np.var(obs_batch, 0)

                # Evaluate the target policy / target critic on the batch.
                agent_policy.act_as_target = True
                action_batch_predict = agent_policy.predict(obs_batch)[0]
                agent_policy.act_as_target = False

                agent_critic.act_as_target = True
                value_batch = agent_critic.predict(obs_batch,
                                                   action_batch_predict)[0]
                agent_critic.act_as_target = False

                # NOTE(review): the bootstrap value is computed from
                # batch['obs0']; a TD target would normally use the next
                # observation -- verify against the Memory batch layout.
                y = np.array(batch['rewards']) + gamma * np.array(value_batch)

                agent_critic.update_value(obs=obs_batch,
                                          action=action_batch_predict,
                                          target=y)
                q_grad = np.array(
                    agent_critic.get_q_gradient(action_batch_predict,
                                                obs_batch)).reshape(
                                                    -1,
                                                    env.action_space.shape[0])
                agent_policy.optimize_policy(q_grad, obs_batch)

                agent_critic.update_target()
                agent_policy.update_target()

            print(reward_history[-1])
            summary = sess.run(merged)
            train_writer.add_summary(summary, i)

        # Flush pending events to disk before the session goes away.
        train_writer.close()
def __init__(self, policy, args):
    """Build the A2C graph, restore or initialize its variables, and bind the agent API.

    Args:
        policy: callable used to construct the step model and the train
            model; it is called with the session, the observation space,
            batch sizes, a ``reuse`` flag and the data format.
        args: settings object; the attributes read here are ``nhwc``,
            ``value_loss_weight``, ``entropy_weight``, ``lr``,
            ``max_to_keep``, ``envs``, ``steps_per_batch``, ``res``,
            ``ckpt_path``, ``summary_writer``, ``debug`` and
            ``tensorboard_debug_address``.

    Raises:
        ValueError: if both ``args.debug`` and
            ``args.tensorboard_debug_address`` are set.
    """
    network_data_format = 'NHWC' if args.nhwc else 'NCHW'
    value_loss_weight = args.value_loss_weight
    entropy_weight = args.entropy_weight
    learning_rate = args.lr
    max_to_keep = args.max_to_keep
    nenvs = args.envs
    nsteps = args.steps_per_batch
    res = args.res
    checkpoint_path = args.ckpt_path
    summary_writer = args.summary_writer
    debug = args.debug
    debug_tb_adress = args.tensorboard_debug_address

    print('\n### A2C Agent #######')
    print(f'# policy = {policy}')
    print(f'# network_data_format = {network_data_format}')
    print(f'# value_loss_weight = {value_loss_weight}')
    print(f'# entropy_weight = {entropy_weight}')
    print(f'# learning_rate = {learning_rate}')
    print(f'# max_to_keep = {max_to_keep}')
    print(f'# nenvs = {nenvs}')
    print(f'# nsteps = {nsteps}')
    print(f'# res = {res}')
    print(f'# checkpoint_path = {checkpoint_path}')
    print(f'# debug = {debug}')
    print(f'# debug_tb_adress = {debug_tb_adress}')
    print('######################\n')

    max_gradient_norm = 1.0

    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    if debug and debug_tb_adress:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")
    if debug:
        sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
    elif debug_tb_adress:
        sess = tf_debug.TensorBoardDebugWrapperSession(sess, debug_tb_adress)

    nbatch = nenvs * nsteps
    ch = get_input_channels()
    ob_space = {
        'screen': [None, res, res, ch['screen']],
        'minimap': [None, res, res, ch['minimap']],
        'flat': [None, ch['flat']],
        'available_actions': [None, ch['available_actions']]
    }

    # The step model acts one step at a time across the environments; the
    # train model is built with reuse=True over a full batch of
    # nenvs * nsteps samples.
    step_model = policy(sess,
                        ob_space=ob_space,
                        nbatch=nenvs,
                        nsteps=1,
                        reuse=None,
                        data_format=network_data_format)
    train_model = policy(sess,
                         ob_space=ob_space,
                         nbatch=nbatch,
                         nsteps=nsteps,
                         reuse=True,
                         data_format=network_data_format)

    # Define placeholders
    fn_id = tf.placeholder(tf.int32, [None], name='fn_id')
    arg_ids = {
        k: tf.placeholder(tf.int32, [None], name='arg_{}_id'.format(k.id))
        for k in train_model.policy[1].keys()
    }
    ACTIONS = (fn_id, arg_ids)
    ADVS = tf.placeholder(tf.float32, [None], name='adv')
    RETURNS = tf.placeholder(tf.float32, [None], name='returns')

    # Define Loss
    log_probs = compute_policy_log_probs(train_model.AV_ACTS,
                                         train_model.policy, ACTIONS)
    policy_loss = -tf.reduce_mean(ADVS * log_probs)
    value_loss = tf.reduce_mean(tf.square(RETURNS - train_model.value) / 2.)
    entropy = compute_policy_entropy(train_model.AV_ACTS, train_model.policy,
                                     ACTIONS)
    loss = (policy_loss + value_loss * value_loss_weight -
            entropy * entropy_weight)

    # Define Optimizer
    global_step = tf.Variable(0, trainable=False)
    decayed_lr = tf.train.exponential_decay(learning_rate, global_step, 10000,
                                            0.94)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=decayed_lr,
                                          decay=0.99,
                                          epsilon=1e-5)
    # learning_rate=None because the optimizer instance already carries the
    # decayed learning-rate tensor.
    train_op = layers.optimize_loss(loss=loss,
                                    global_step=global_step,
                                    optimizer=optimizer,
                                    clip_gradients=max_gradient_norm,
                                    learning_rate=None,
                                    name="train_op")

    tf.summary.scalar('entropy', entropy)
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('loss/policy', policy_loss)
    tf.summary.scalar('loss/value', value_loss)
    tf.summary.scalar('rl/value', tf.reduce_mean(train_model.value))
    tf.summary.scalar('rl/returns', tf.reduce_mean(RETURNS))
    tf.summary.scalar('rl/advs', tf.reduce_mean(ADVS))
    summary_writer.add_graph(sess.graph)

    variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    saver = tf.train.Saver(variables, max_to_keep=max_to_keep)
    train_summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
    train_summary_op = tf.summary.merge(train_summaries)

    # Load the latest checkpoint if one exists. get_checkpoint_state() returns
    # None when the directory exists but holds no checkpoint yet (e.g. a
    # freshly created output directory), so fall back to fresh initialization
    # in that case instead of failing on a None checkpoint state.
    ckpt = None
    if os.path.exists(checkpoint_path):
        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
    if ckpt and ckpt.model_checkpoint_path:
        # The checkpoint file name ends in "-<step>".
        self.train_step = int(ckpt.model_checkpoint_path.split('-')[-1])
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Loaded agent at episode {} (step {})".format(
            self.train_step // nsteps, self.train_step))
    else:
        self.train_step = 0
        sess.run(tf.variables_initializer(variables))

    def train(obs, states, actions, returns, advs, summary=False):
        """
        Run one optimization step on a batch.

        Args:
            obs: dict of preprocessed observation arrays, with num_batch
                elements in the first dimensions.
            states: recurrent policy states, or None for non-recurrent
                policies.
            actions: see `compute_total_log_probs`.
            returns: array of shape [num_batch].
            advs: array of shape [num_batch].
            summary: Whether to return a summary.

        Returns:
            summary: (global_step, loss, Summary) if `summary` is true,
                otherwise None.
        """
        feed_dict = {
            train_model.SCREEN: obs['screen'],
            train_model.MINIMAP: obs['minimap'],
            train_model.FLAT: obs['flat'],
            train_model.AV_ACTS: obs['available_actions'],
            RETURNS: returns,
            ADVS: advs,
            ACTIONS[0]: actions[0]
        }
        feed_dict.update({v: actions[1][k] for k, v in ACTIONS[1].items()})
        if states is not None:  # For recurrent polices
            feed_dict.update({train_model.STATES: states})

        self.train_step += 1

        if summary:
            _, _step, _loss, _summary = sess.run(
                [train_op, global_step, loss, train_summary_op],
                feed_dict=feed_dict)
            return _step, _loss, _summary
        else:
            sess.run([train_op, loss], feed_dict=feed_dict)

    def save(path, step=None):
        """Write a checkpoint under `path`, tagged with the graph's global step.

        `step` is accepted for interface compatibility but is not used.
        """
        os.makedirs(path, exist_ok=True)
        print("Saving agent to %s, step %d" % (path, sess.run(global_step)))
        ckpt_path = os.path.join(path, 'model.ckpt')
        saver.save(sess, ckpt_path, global_step=global_step)

    def get_global_step():
        """Return the current value of the global step variable."""
        return sess.run(global_step)

    self.train = train
    self.step = step_model.step
    self.get_value = step_model.get_value
    self.save = save
    self.initial_state = step_model.initial_state
    self.get_global_step = get_global_step
def model(data_dict, opt, logfile=None, print_dim=False): """ Creates and executes Tensorflow graph for BERT-based models Arguments: data_dict -- contains all necessary data for model opt -- option log, contains learning_rate, num_epochs, minibatch_size, ... logfile -- path of file to save opt and results print_dim -- print dimensions for debugging purposes Returns: opt -- updated option log parameters -- trained parameters of model """ ##### # Read options, set defaults and update log ##### try: # check input options print(opt) test_opt(opt) if opt.get('git', None) is None: add_git_version(opt) # keep track of git SHA # assign variables opt['model'] = opt.get('model', 'bert') assert 'bert' in opt['model'] learning_rate = opt['learning_rate'] = opt.get( 'learning_rate', 5e-5) # small learning rate for pretrained BERT layers speedup_new_layers = opt['speedup_new_layers'] = opt.get( 'speedup_new_layers', False) freeze_thaw_tune = opt['freeze_thaw_tune'] = opt.get( 'freeze_thaw_tune', False) layer_specific_lr = speedup_new_layers or freeze_thaw_tune num_epochs = opt.get('num_epochs', None) # get num of planned epochs opt['num_epochs'] = 0 # use this to keep track of finished epochs minibatch_size = opt['minibatch_size'] = opt.get('minibatch_size', 64) bert_embd = True bert_update = opt['bert_update'] = opt.get('bert_update', False) bert_large = opt['bert_large'] = opt.get('bert_large', False) cased = opt['bert_cased'] = opt.get('bert_cased', False) starter_seed = opt['seed'] = opt.get('seed', None) if not type(starter_seed) == int: assert starter_seed == None # layers = opt['layers'] = opt.get('layers', 1) hidden_layer = opt['hidden_layer'] = opt.get( 'hidden_layer', 0) # add hidden layer before softmax layer? 
assert hidden_layer in [0, 1, 2] topic_encoder = opt['topic_encoder'] = opt.get('topic_encoder', None) L_R_unk = opt.get('unk_sub', False) assert L_R_unk is False # assert encoder in ['word', 'ffn', 'cnn', 'lstm', 'bilstm', 'word+cnn', 'word+ffn', 'word+lstm', 'word+bilstm'] assert topic_encoder in [None, 'ffn', 'cnn', 'lstm', 'bilstm'] optimizer_choice = opt['optimizer'] = opt.get( 'optimizer', 'Adadelta') # which optimiser to use? assert optimizer_choice in ['Adam', 'Adadelta'] epsilon = opt['epsilon'] = opt.get('epsilon', 1e-08) rho = opt['rho'] = opt.get('rho', 0.95) L2 = opt['L2'] = opt.get('L2', 0) # L2 regularisation dropout = opt['dropout'] = opt.get('dropout', 0) assert not ( L2 > 0 and dropout > 0 ), 'Use dropout or L2 regularisation, not both. Current settings: L2={}, dropout={}.'.format( L2, dropout) sparse = opt['sparse_labels'] = opt.get( 'sparse_labels', True) # are labels encoded as sparse? save_checkpoints = opt.get('checkpoints', False) # save all checkpoints? stopping_criterion = opt['stopping_criterion'] = opt.get( 'stopping_criterion', None) # which metric should be used as early stopping criterion? assert stopping_criterion in [None, 'cost', 'MAP', 'F1', 'Accuracy'] if stopping_criterion is None and num_epochs is None: raise ValueError( 'Invalid parameter combination. Stopping criterion and number of epochs cannot both be None.' 
) early_stopping = stopping_criterion in [ 'F1', 'cost', 'MAP', 'Accuracy' ] predict_every_epoch = opt['predict_every_epoch'] = opt.get( 'predict_every_epoch', False) reduction_factor = opt['hidden_reduce'] = opt.get('hidden_reduce', 2) patience = opt['patience'] = opt.get('patience', 20) # topic models topic_scope = opt['topic'] = opt.get('topic', '') if opt['model'] == 'bert_simple_topic': assert topic_scope in ['word', 'doc'] elif opt['model'] == 'bert': topic_scope = '' else: raise NotImplementedError() module_name = "src.models.forward.{}".format(opt['model']) model = importlib.import_module(module_name) if 'word' in topic_scope: topic_update = opt['topic_update'] = opt.get( 'topic_update', False) # None for backward compatibility num_topics = opt['num_topics'] = opt.get('num_topics', None) topic_type = opt['topic_type'] = opt.get('topic_type', None) if not topic_scope == '': assert 'topic' in opt['model'] assert num_topics > 1 assert topic_type in ['LDA', 'ldamallet', 'gsdmm'] opt['topic_alpha'] = opt.get('topic_alpha', 50) else: assert num_topics is None assert topic_type is None if opt['dataset'] == 'Quora' and opt['subsets'] == [ 'train', 'dev', 'test', 'p_test' ]: extra_test = True else: extra_test = False injection_location = opt['injection_location'] = opt.get( 'injection_location', None) if 'inject' in opt['model']: assert str(injection_location) in [ 'embd', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11' ] else: assert injection_location is None # gpu settings gpu = opt.get('gpu', -1) # general settings session_config = tf.ConfigProto() if not gpu == -1: print('Running on GPU: {}'.format(gpu)) os.environ["CUDA_VISIBLE_DEVICES"] = str( gpu) # specifies which GPU to use (if multiple are available) ops.reset_default_graph( ) # to be able to rerun the model without overwriting tf variables if not starter_seed == None: random.seed(starter_seed ) # use starter seed to set seed for random library seed_list = [random.randint(1, 100000) for i in 
range(100) ] # generate list of seeds to be used in the model np.random.seed(seed_list.pop(0)) tf.set_random_seed( seed_list.pop(0)) # set tensorflow seed to keep results consistent ##### # unpack data and assign to model variables ##### assert data_dict.get('embd', None) is None # (565852, 200) if 'word' in topic_scope: topic_embd = data_dict['word_topics'].get('topic_matrix', None) #topic_emb.shape # assign word ids if extra_test: ID1_train, ID1_dev, ID1_test, ID1_test_extra = data_dict['ID1'] ID2_train, ID2_dev, ID2_test, ID2_test_extra = data_dict['ID2'] else: ID1_train, ID1_dev, ID1_test = data_dict['ID1'] ID2_train, ID2_dev, ID2_test = data_dict['ID2'] train_dict, dev_dict, test_dict, test_dict_extra = extract_data( data_dict, topic_scope, extra_test) ##### # check input dimensions ##### if sparse: classes = 2 else: classes = train_dict['Y'].shape[1] (m, sentence_length_1) = train_dict['E1'].shape # (m, sentence_length_2) = train_dict['E2'].shape ##### # Define Tensorflow graph ##### # Create Placeholders and initialise weights of the correct shape X1, X1_mask, X1_seg, Y = create_placeholders([sentence_length_1, None], classes, bicnn=True, sparse=sparse, bert=bert_embd) # Create topic placeholders print('Topic scope: {}'.format(topic_scope)) if 'doc' in topic_scope: D_T1, D_T2 = create_doc_topic_placeholders(num_topics) else: D_T1, D_T2 = None, None if 'word' in topic_scope: W_T_embedded = None (m, sentence_length_1) = train_dict['W_T1'].shape (m, sentence_length_2) = train_dict['W_T2'].shape W_T1, W_T2 = create_word_topic_placeholders( [sentence_length_1, sentence_length_2]) else: W_T1_embedded, W_T2_embedded, W_T_embedded = None, None, None # tensors for feed_dict bert_inputs = dict(input_ids=X1, input_mask=X1_mask, segment_ids=X1_seg) maybe_print([X1], ['input ids'], True) dropout_prob = tf.placeholder_with_default(0.0, name='dropout_rate', shape=()) # load and lookup BERT BERT_version = get_bert_version(cased, bert_large) BERT_URL = 
'https://tfhub.dev/google/bert_{}/1'.format(BERT_version) print('Loading pretrained model from {}'.format(BERT_URL)) bert_lookup = hub.Module(BERT_URL, name='bert_lookup', trainable=bert_update) X_embedded = bert_lookup( bert_inputs, signature="tokens", as_dict=True ) # important to use tf. 1.11 as tf 1.7 will produce error for sess.run(X_embedded) # X_embedded has 2 keys: # pooled_output is [batch_size, hidden_size] -->output embedding for each token # sequence_output is [batch_size, sequence_length, hidden_size] -->output embedding for the entire sequence # Create topic embedding matrix if 'word' in topic_scope: topic_vocabulary_size, topic_dim = topic_embd.shape # assert(topic_vocabulary_size==embd_vocabulary_size) # currently using the same id to index topic and embd matrix topic_embedding_matrix = initialise_pretrained_embedding( topic_vocabulary_size, topic_dim, topic_embd, name='word_topics', trainable=topic_update) # Lookup topic embedding W_T1_embedded = lookup_embedding(W_T1, topic_embedding_matrix, expand=False, transpose=False, name='topic_lookup_L') W_T2_embedded = lookup_embedding(W_T2, topic_embedding_matrix, expand=False, transpose=False, name='topic_lookup_R') # Forward propagation: Build forward propagation as tensorflow graph input_dict = { 'E1': X_embedded, 'E2': None, 'D_T1': D_T1, 'D_T2': D_T2, 'W_T1': W_T1_embedded, 'W_T2': W_T2_embedded, 'W_T': W_T_embedded } forward_pass = model.forward_propagation(input_dict, classes, hidden_layer, reduction_factor, dropout_prob, seed_list, print_dim) logits = forward_pass['logits'] with tf.name_scope('cost'): # Cost function: Add cost function to tensorflow graph main_cost = compute_cost(logits, Y, loss_fn='bert') cross_entropy_scalar = tf.summary.scalar('cross_entropy', main_cost) cost = main_cost cost_summary = tf.summary.merge([cross_entropy_scalar]) # Backpropagation: choose training regime (creates tensorflow optimizer which minimizes the cost). 
if layer_specific_lr: learning_rate_old_layers = tf.placeholder_with_default( 0.0, name='learning_rate_old', shape=()) learning_rate_new_layers = tf.placeholder_with_default( 0.0, name='learning_rate_new', shape=()) train_step = layer_specific_regime(optimizer_choice, cost, learning_rate_old_layers, learning_rate_new_layers, epsilon, rho) else: learning_rate_old_layers = tf.placeholder_with_default( 0.0, name='learning_rate', shape=()) learning_rate_new_layers = None train_step = standard_training_regime(optimizer_choice, cost, learning_rate_old_layers, epsilon, rho) # Prediction and Evaluation tensors with tf.name_scope('evaluation_metrics'): predicted_label = tf.argmax( logits, 1, name='predict' ) # which column is the one with the highest activation value? if sparse: actual_label = Y else: actual_label = tf.argmax(Y, 1) conf_scores = get_confidence_scores(logits, False) maybe_print( [predicted_label, actual_label, conf_scores], ['Predicted label', 'Actual label', 'Confidence Scores'], False) # create streaming metrics: http://ronny.rest/blog/post_2017_09_11_tf_metrics/ streaming_accuracy, streaming_accuracy_update = tf.metrics.accuracy( labels=actual_label, predictions=predicted_label) label_idx = tf.expand_dims(tf.where(tf.not_equal(Y, 0))[:, 0], 0, name='label_idx') rank_scores = tf.expand_dims(get_confidence_scores(logits), 0, name='rank_scores') maybe_print([label_idx, rank_scores], ['Label index', 'Rank scores'], False) streaming_map, streaming_map_update = tf.metrics.average_precision_at_k( label_idx, rank_scores, 10) # fixed NaN for examples without relevant docs by editing .virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/ops/metrics_impl.py line 2796 # return math_ops.div(precision_sum, num_relevant_items + 1e-11, name=scope) streaming_recall, streaming_recall_update = tf.contrib.metrics.streaming_recall( predictions=predicted_label, labels=actual_label) streaming_precision, streaming_precision_update = 
tf.contrib.metrics.streaming_precision( predictions=predicted_label, labels=actual_label) eps = 1e-11 # prevent division by zero streaming_f1 = 2 * (streaming_precision * streaming_recall) / ( streaming_precision + streaming_recall + eps) # create and merge summaries accuracy_scalar = tf.summary.scalar('Accuracy', streaming_accuracy) recall_scalar = tf.summary.scalar('Recall', streaming_recall) precision_scalar = tf.summary.scalar('Precision', streaming_precision) f1_scalar = tf.summary.scalar('F1', streaming_f1) map_scalar = tf.summary.scalar('MAP', streaming_map) eval_summary = tf.summary.merge([ accuracy_scalar, recall_scalar, precision_scalar, f1_scalar, map_scalar ]) def predict(sess, subset, writer, epoch, ignore_MAP, topic_scope, layer_specific_lr): ''' Predict in minibatch loop to prevent out of memory error (for large datasets or complex models) :param input_X1: document 1 :param input_X2: document 2 :param input_T1: topic distributions for document 1 or None :param input_T2: topic distributions for document 2 or None :param input_Y: labels :param writer: :param epoch: :param ignore_MAP: :return: complete prediction results as list [confidence_scores, predictions, minibatch_cost, eval_metrics] ''' # print(input_T1) predictions = [] confidence_scores = [] minibatch_size = 10 minibatches = create_minibatches(subset, minibatch_size, sparse=sparse, random=False, topic_scope=topic_scope) sess.run(tf.local_variables_initializer()) # for streaming metrics for minibatch in minibatches: feed_dict = { X1: minibatch['E1'], X1_mask: minibatch['E1_mask'], X1_seg: minibatch['E1_seg'], # X2: minibatch['E2'], Y: minibatch['Y'], learning_rate_old_layers: 0, dropout_prob: 0 } # don't use dropout during prediction if layer_specific_lr: feed_dict[learning_rate_new_layers] = 0 if 'doc' in topic_scope: feed_dict[D_T1] = minibatch['D_T1'] feed_dict[D_T2] = minibatch['D_T2'] if 'word' in topic_scope: feed_dict[W_T1] = minibatch['W_T1'] feed_dict[W_T2] = minibatch['W_T2'] # Run 
the session to execute the prediction and evaluation, the feedict should contain a minibatch for (X,Y). pred, conf = sess.run( # evaluating merged_summary will mess up streaming metrics [predicted_label, conf_scores], feed_dict=feed_dict) predictions.extend(pred) confidence_scores.extend(conf) if not ignore_MAP: eval_metrics = [None, None, None, None, None] else: eval_metrics = [None, None, None, None] return confidence_scores, predictions, None, eval_metrics def predict_eval(sess, subset, writer, epoch, ignore_MAP, topic_scope, layer_specific_lr): ''' Predict in minibatch loop to prevent out of memory error (for large datasets or complex models) :param input_X1: document 1 :param input_X2: document 2 :param input_T1: topic distributions for document 1 or None :param input_T2: topic distributions for document 2 or None :param input_Y: labels :param writer: :param epoch: :param ignore_MAP: :return: complete prediction results as list [confidence_scores, predictions, minibatch_cost, eval_metrics] ''' # print(input_T1) predictions = [] confidence_scores = [] minibatch_size = 10 minibatches = create_minibatches(subset, minibatch_size, sparse=sparse, random=False, topic_scope=topic_scope) sess.run(tf.local_variables_initializer()) # for streaming metrics minibatch_cost = 0. 
num_minibatches = int( m / minibatch_size ) # number of minibatches of size minibatch_size in the train set for minibatch in minibatches: feed_dict = { X1: minibatch['E1'], X1_mask: minibatch['E1_mask'], X1_seg: minibatch['E1_seg'], # X2: minibatch['E2'], Y: minibatch['Y'], learning_rate_old_layers: 0, dropout_prob: 0 } # don't use dropout during prediction if layer_specific_lr: feed_dict[learning_rate_new_layers] = 0 if 'doc' in topic_scope: feed_dict[D_T1] = minibatch['D_T1'] feed_dict[D_T2] = minibatch['D_T2'] if 'word' in topic_scope: feed_dict[W_T1] = minibatch['W_T1'] feed_dict[W_T2] = minibatch['W_T2'] # Run the session to execute the prediction and evaluation, the feeddict should contain a minibatch for (X,Y). if not ignore_MAP: # print('with MAP') pred, conf, batch_cost, c, _, _, _, _ = sess.run( # merged_summary will mess up streaming metrics! [ predicted_label, conf_scores, cost, cost_summary, streaming_accuracy_update, streaming_recall_update, streaming_precision_update, streaming_map_update ], feed_dict=feed_dict) else: # print('without MAP') pred, conf, batch_cost, c, _, _, _ = sess.run( # merged_summary will mess up streaming metrics! 
[ predicted_label, conf_scores, cost, cost_summary, streaming_accuracy_update, streaming_recall_update, streaming_precision_update ], feed_dict=feed_dict) predictions.extend(pred) confidence_scores.extend(conf) writer.add_summary(c, epoch) minibatch_cost += batch_cost / num_minibatches if not ignore_MAP: eval, acc, rec, prec, f_1, ma_p = sess.run([ eval_summary, streaming_accuracy, streaming_recall, streaming_precision, streaming_f1, streaming_map ]) eval_metrics = [acc, prec, rec, f_1, ma_p] else: eval, acc, rec, prec, f_1 = sess.run([ eval_summary, streaming_accuracy, streaming_recall, streaming_precision, streaming_f1 ]) eval_metrics = [acc, prec, rec, f_1] writer.add_summary(eval, epoch) return confidence_scores, predictions, minibatch_cost, eval_metrics def training_loop(sess, train_dict, dev_dict, test_dict, train_writer, dev_writer, opt, dropout, seed_list, num_epochs, early_stopping, optimizer, lr_bert, lr_new, layer_specific_lr, stopping_criterion='MAP', patience=patience, topic_scope=None, predict_every_epoch=False): ''' Trains the model :param X1_train: document 1 (train) :param X2_train: document 2 (train) :param D_T1_train: topic 1 (train) :param D_T2_train: topic 2 (train) :param Y_train: labels (train) :param X1_dev: document 1 (dev) :param X2_dev: document 2 (dev) :param D_T1_dev: topic 1 (dev) :param D_T2_dev: topic 2 (dev) :param Y_dev: labels (dev) :param train_writer: :param dev_writer: :param opt: option dict :param dropout: :param seed_list: :param num_epochs: :param early_stopping: :param stopping_criterion: :param patience: :return: [opt, epoch] ''' if predict_every_epoch: epoch = opt['num_epochs'] sess.run(tf.local_variables_initializer()) _, _, train_cost, train_metrics = predict_eval( sess, train_dict, train_writer, epoch, skip_MAP(train_dict['E1']), topic_scope, layer_specific_lr) dev_scores, dev_pred, dev_cost, dev_metrics = predict_eval( sess, dev_dict, dev_writer, epoch, skip_MAP(dev_dict['E1']), topic_scope, layer_specific_lr) 
print('Predicting for epoch {}'.format(epoch)) test_scores, test_pred, _, test_metrics = predict_eval( sess, test_dict, test_writer, epoch, skip_MAP(test_dict['E1']), topic_scope, layer_specific_lr) output_predictions(ID1_dev, ID2_dev, dev_scores, dev_pred, 'dev_{}'.format(epoch), opt) opt = save_eval_metrics( dev_metrics, opt, 'dev', 'score_{}'.format(epoch)) # log dev metrics output_predictions(ID1_test, ID2_test, test_scores, test_pred, 'test_{}'.format(epoch), opt) opt = save_eval_metrics( test_metrics, opt, 'test', 'score_{}'.format(epoch)) # log test metrics write_log_entry(opt, 'data/logs/' + logfile) epoch = opt[ 'num_epochs'] + 1 # continue counting after freeze epochs best_dev_value = None best_dev_round = 0 ep = 'num_epochs' while True: print('Epoch {}'.format(epoch)) minibatch_cost = 0. minibatches = create_minibatches(train_dict, minibatch_size, seed_list.pop(0), sparse=sparse, random=True, topic_scope=topic_scope) for minibatch in minibatches: feed_dict = { X1: minibatch['E1'], X1_mask: minibatch['E1_mask'], X1_seg: minibatch['E1_seg'], # X2: minibatch['E2'], Y: minibatch['Y'], learning_rate_old_layers: lr_bert, dropout_prob: dropout } if layer_specific_lr: feed_dict[learning_rate_new_layers] = lr_new # print(minibatch.keys()) if 'doc' in topic_scope: feed_dict[D_T1] = minibatch['D_T1'] feed_dict[D_T2] = minibatch['D_T2'] if 'word' in topic_scope: feed_dict[W_T1] = minibatch['W_T1'] feed_dict[W_T2] = minibatch['W_T2'] # IMPORTANT: The line that runs the graph on a minibatch. # Run the session to execute the optimizer and the cost, the feedict should contain a minibatch for (X,Y). 
_, temp_cost = sess.run([optimizer, cost], feed_dict=feed_dict) # write summaries and checkpoints every few epochs if not logfile is None: # print("Train cost after epoch %i: %f" % (epoch, minibatch_cost)) sess.run(tf.local_variables_initializer()) _, _, train_cost, train_metrics = predict_eval( sess, train_dict, train_writer, epoch, skip_MAP(train_dict['E1']), topic_scope, layer_specific_lr) dev_scores, dev_pred, dev_cost, dev_metrics = predict_eval( sess, dev_dict, dev_writer, epoch, skip_MAP(dev_dict['E1']), topic_scope, layer_specific_lr) if predict_every_epoch and (not epoch == num_epochs): print('Predicting for epoch {}'.format(epoch)) test_scores, test_pred, _, test_metrics = predict_eval( sess, test_dict, test_writer, epoch, skip_MAP(test_dict['E1']), topic_scope, layer_specific_lr) output_predictions(ID1_dev, ID2_dev, dev_scores, dev_pred, 'dev_{}'.format(epoch), opt) opt = save_eval_metrics( dev_metrics, opt, 'dev', 'score_{}'.format(epoch)) # log dev metrics output_predictions(ID1_test, ID2_test, test_scores, test_pred, 'test_{}'.format(epoch), opt) opt = save_eval_metrics( test_metrics, opt, 'test', 'score_{}'.format(epoch)) # log test metrics write_log_entry(opt, 'data/logs/' + logfile) # dev_metrics = [acc, prec, rec, f_1, ma_p] # use cost or other metric as early stopping criterion if stopping_criterion == 'cost': stopping_metric = dev_cost print("Dev {} after epoch {}: {}".format( stopping_criterion, epoch, stopping_metric)) elif stopping_criterion == 'MAP': assert len(dev_metrics ) == 5 # X1_dev must have 10 * x examples current_result = dev_metrics[-1] # MAP print("Dev {} after epoch {}: {}".format( stopping_criterion, epoch, current_result)) stopping_metric = 1 - current_result # dev error elif stopping_criterion == 'F1': current_result = dev_metrics[3] # F1 print("Dev {} after epoch {}: {}".format( stopping_criterion, epoch, current_result)) stopping_metric = 1 - current_result # dev error elif stopping_criterion == 'Accuracy': current_result = 
dev_metrics[0] # Accuracy print("Dev {} after epoch {}: {}".format( stopping_criterion, epoch, current_result)) stopping_metric = 1 - current_result # dev error if early_stopping: # save checkpoint for first or better models if (best_dev_value is None) or (stopping_metric < best_dev_value): best_dev_value = stopping_metric best_dev_round = epoch save_model(opt, saver, sess, epoch) # save model opt = save_eval_metrics( train_metrics, opt, 'train') # update train metrics in log # check stopping criteria # stop training if predefined number of epochs reached if (not early_stopping) and (epoch == num_epochs): print('Reached predefined number of training epochs.') save_model(opt, saver, sess, epoch) # save model break if early_stopping and (epoch == num_epochs): print( 'Maximum number of epochs reached during early stopping.' ) break # stop training if early stopping criterion reached if early_stopping and epoch >= best_dev_round + patience: print( 'Early stopping criterion reached after training for {} epochs.' 
.format(epoch)) break # stop training if gradient is vanishing if math.isnan(minibatch_cost): print('Cost is Nan at epoch {}!'.format(epoch)) break epoch += 1 print('Finished training.') # restore weights from saved model in best epoch if early_stopping: print('Load best model from epoch {}'.format(best_dev_round)) opt[ep] = best_dev_round epoch = best_dev_round # log final predictions with correct epoch info load_model(opt, saver, sess, best_dev_round) # ToDo: fix Too many open files # clean up previous checkpoints to save space delete_all_checkpoints_but_best(opt, best_dev_round) else: opt[ep] = epoch opt = save_eval_metrics(train_metrics, opt, 'train') # log train metrics return opt, epoch # Initialize all the variables globally init = tf.global_variables_initializer() if (not logfile is None) or (early_stopping): saver = create_saver() start_time, opt = start_timer(opt, logfile) print('Model {}'.format(opt['id'])) ##### # Start session to execute Tensorflow graph ##### with tf.Session( config=session_config ) as sess: #config=tf.ConfigProto(log_device_placement=True) # add debugger (but not for batch experiments) if __name__ == '__main__' and FLAGS.debug: sess = tf_debug.TensorBoardDebugWrapperSession( sess, "NPMacBook.local:7000") # Run the initialization sess.run(init) if logfile is None: train_writer = None dev_writer = None test_writer = None if extra_test: test_writer_extra = None else: print('logfile: {}'.format(logfile)) create_model_folder(opt) model_dir = get_model_dir(opt) train_writer = tf.summary.FileWriter( model_dir + '/train', sess.graph) # save graph first dev_writer = tf.summary.FileWriter(model_dir + '/dev') test_writer = tf.summary.FileWriter(model_dir + '/test') if extra_test: test_writer_extra = tf.summary.FileWriter(model_dir + '/test_extra') # additional input for predict every epoch if predict_every_epoch: td = test_dict else: td = None # set learning rates per layer if speedup_new_layers: lr_bert = learning_rate lr_new = learning_rate * 
100 else: lr_bert = learning_rate lr_new = learning_rate # Freeze BERT and only train new weights if freeze_thaw_tune: print('Freeze BERT and train new layers...') opt, epoch = training_loop(sess, train_dict, dev_dict, td, train_writer, dev_writer, opt, dropout, seed_list, num_epochs, early_stopping, train_step, 0, lr_new, layer_specific_lr, stopping_criterion, patience, topic_scope, predict_every_epoch) num_epochs += epoch lr_new = learning_rate # Normal Finetuning print('Finetune...') opt, epoch = training_loop( sess, train_dict, dev_dict, td, train_writer, dev_writer, opt, dropout, seed_list, num_epochs, early_stopping, train_step, lr_bert, lr_new, layer_specific_lr, stopping_criterion, patience, topic_scope, predict_every_epoch) # Predict + evaluate on dev and test set # train_scores, train_pred, _, train_metrics = predict(X1_train, X2_train, Y_train, train_writer, epoch) dev_scores, dev_pred, _, dev_metrics = predict_eval( sess, dev_dict, dev_writer, epoch, skip_MAP(dev_dict['E1']), topic_scope, layer_specific_lr) opt = save_eval_metrics(dev_metrics, opt, 'dev') if opt['dataset'] == 'GlueQuora': test_scores, test_pred, _, test_metrics = predict( sess, test_dict, test_writer, epoch, skip_MAP(test_dict['E1']), topic_scope, layer_specific_lr) else: test_scores, test_pred, _, test_metrics = predict_eval( sess, test_dict, test_writer, epoch, skip_MAP(test_dict['E1']), topic_scope, layer_specific_lr) opt = save_eval_metrics(test_metrics, opt, 'test') opt = end_timer(opt, start_time, logfile) if extra_test: test_scores_extra, test_pred_extra, _, test_metrics_extra = predict_eval( sess, test_dict_extra, test_writer_extra, epoch, skip_MAP(test_dict['E1']), topic_scope, layer_specific_lr) opt = save_eval_metrics(test_metrics_extra, opt, 'PAWS') if print_dim: if stopping_criterion is None: stopping_criterion = 'Accuracy' print('Dev {}: {}'.format( stopping_criterion, opt['score'][stopping_criterion]['dev'])) print('Test {}: {}'.format( stopping_criterion, 
opt['score'][stopping_criterion]['test'])) if not logfile is None: # save log write_log_entry(opt, 'data/logs/' + logfile) # write predictions to file for scorer # output_predictions(ID1_train, ID2_train, train_scores, train_pred, 'train', opt) output_predictions(ID1_dev, ID2_dev, dev_scores, dev_pred, 'dev', opt) output_predictions(ID1_test, ID2_test, test_scores, test_pred, 'test', opt) if extra_test: output_predictions(ID1_test_extra, ID2_test_extra, test_scores_extra, test_pred_extra, 'PAWS_test', opt) print('Wrote predictions for model_{}.'.format(opt['id'])) # save model if save_checkpoints: save_model(opt, saver, sess, epoch) # save disk space # close all writers to prevent too many open files error train_writer.close() dev_writer.close() test_writer.close() if extra_test: test_writer_extra.close() except Exception as e: print("Error: {0}".format(e.__doc__)) traceback.print_exc(file=sys.stdout) opt['status'] = 'Error' write_log_entry(opt, 'data/logs/' + logfile) # print('==============') return opt
# Script fragment: fit a one-class neural network (OC_NN) on MNIST data and
# print the score it reports for the held-out "ones" vs. "sevens" sets.
# NOTE(review): NUM_NORMAL, createData, keras, tf and tf_debug are not defined
# in this fragment; they must already be in scope when it runs.

# Sample counts requested from the createData loaders below.
TRAIN_NUM_ANOMALIES = 1000
TEST_NUM_ANOMALIES = 50
# Image geometry and class count; forwarded unchanged to ocnn.fit().
IMG_HGT = 28
IMG_WDT = 28
IMG_DEPTH = 1
nClass = 2
# trainX,trainY = createData.get_MNIST_TrainingData(NUM_NORMAL)
trainX, trainY, train_Anomaly_X, train_Anomaly_Y = createData.get_MNIST_TrainingData(
    NUM_NORMAL, TRAIN_NUM_ANOMALIES)
[test_ones, label_ones, test_sevens,
 label_sevens] = createData.get_MNIST_TestingData(NUM_NORMAL,
                                                  TEST_NUM_ANOMALIES)
from src.models.OC_NN import OC_NN
ocnn = OC_NN()
# Second and third positional arguments of ocnn.fit().
# presumably nu is the one-class nu hyperparameter -- confirm against OC_NN.fit
nu = 0.01
NUM_EPOCHS = 100
# keras.backend.set_session(
#     tf_debug.TensorBoardDebugWrapperSession(tf.Session(), "vlan-2663-10-17-5-224.staff.wireless.sydney.edu.au:7000"))
# Install a TensorBoard-debugger-wrapped session as the Keras backend session
# before fitting; the debugger address is hard-coded to localhost:7000.
keras.backend.set_session(
    tf_debug.TensorBoardDebugWrapperSession(tf.Session(), "localhost:7000"))
# Only trainX is passed to fit(); the labels and the training anomalies loaded
# above are not used in this fragment.
ocnn.fit(trainX, nu, NUM_EPOCHS, IMG_HGT, IMG_WDT, IMG_DEPTH, nClass)
res = ocnn.score(test_ones, test_sevens)
# The script prints the score as "AUC"; auc_OCNN keeps it under a second name.
auc_OCNN = res
print("=" * 35)
print("AUC:", res)
print("=" * 35)
def _run_experiment(flags, exp_config):
    """Set up and execute an experiment workflow with the specified options.

    Args:
        flags: Mapping of run settings. Keys read here: 'logging',
            'tb_debug', 'hparams_override', 'hparams_sweep', 'batches',
            'checkpoint', 'checkpoint_load_scope', 'checkpoint_frozen_scope',
            'workflow_opts_sweep', 'seed', 'dataset', 'workflow', 'component',
            'dataset_location', 'summarize' and 'summary_dir'.
            'hparams_override' may be a dict or a string-formatted dict;
            'hparams_sweep' and 'workflow_opts_sweep' are string-formatted
            dicts. This function does not modify `flags`.
        exp_config: Optional mapping loaded from an experiment definition
            file. The keys 'export-options', 'workflow-options',
            'classifier-options' and 'checkpoint-options' override the
            defaults built below when present. Ignored when falsy.

    Returns:
        None. The TensorFlow session created here is always closed before
        returning, including when setup or the run raises.
    """
    util.set_logging(flags['logging'])

    TbDebug.TB_DEBUG = flags['tb_debug']

    # Get the component's default HParams, then override
    # -------------------------------------------------------------------------
    component_hparams_override = {}

    # Override that if defined using flag
    if flags['hparams_override']:
        if isinstance(flags['hparams_override'], dict):
            # Copy so that the sweep update below cannot leak back into the
            # caller's flags (and from there into later runs).
            component_hparams_override = dict(flags['hparams_override'])
        else:
            # Parse the string-formatted dict
            component_hparams_override = ast.literal_eval(
                flags['hparams_override'])

    # Override hparams for this sweep/run
    if flags['hparams_sweep']:
        # Parse the string-formatted dict
        hparams_sweep = ast.literal_eval(flags['hparams_sweep'])

        # Selectively override component hparams
        component_hparams_override.update(hparams_sweep)

    # Export settings
    # -------------------------------------------------------------------------
    export_opts = {
        'export_filters': True,
        'export_checkpoint': True,
        'max_to_keep': 5,
        'interval_batches': flags['batches']
    }

    # Classifier settings
    # -------------------------------------------------------------------------
    classifier_opts = {
        'model': 'logistic',  # Options: logistic, svm
        'unit_range': False,  # Set to True if using SVM
        'interval_batches': flags['batches'],
        'hparams': {
            'logistic': {
                'C': [0.01, 0.1, 1.0, 10.0]  # Regularization
            },
            'svm': {
                'C': [1.0, 10.0, 100.0]  # Regularization
            }
        }
    }

    # Checkpoint Options
    # -------------------------------------------------------------------------
    checkpoint_opts = {
        'checkpoint_path': flags['checkpoint'],
        'checkpoint_load_scope': flags['checkpoint_load_scope'],
        'checkpoint_frozen_scope': flags['checkpoint_frozen_scope']
    }

    # OPTIONAL: Override options from an experiment definition file
    # -------------------------------------------------------------------------
    workflow_opts_override = {}

    if exp_config:
        if 'export-options' in exp_config:
            export_opts.update(exp_config['export-options'])
        if 'workflow-options' in exp_config:
            workflow_opts_override.update(exp_config['workflow-options'])
        if 'classifier-options' in exp_config:
            classifier_opts.update(exp_config['classifier-options'])
        if 'checkpoint-options' in exp_config:
            checkpoint_opts.update(exp_config['checkpoint-options'])

    # Override workflow options for this sweep/run
    if flags['workflow_opts_sweep']:
        # Parse the string-formatted dict
        workflow_opts_sweep = ast.literal_eval(flags['workflow_opts_sweep'])

        # Selectively override workflow options
        workflow_opts_override.update(workflow_opts_sweep)

    # Training with Tensorflow
    # -------------------------------------------------------------------------
    with tf.Graph().as_default():  # pylint: disable=no-context-manager
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True  # pylint: disable=no-member
        session = tf.Session(config=config)

        # Everything after session creation runs under try/finally so the
        # session is released even if setup or the run fails.
        try:
            if TbDebug.TB_DEBUG:
                from tensorflow.python import debug as tf_debug
                session = tf_debug.TensorBoardDebugWrapperSession(
                    session, 'localhost:6064')
                print(
                    "Use the following command to run Tensorboard Debugger:\n'tensorboard --logdir=./ --debugger_port 6064'"
                )

            util.set_seed(flags['seed'])

            # Load relevant dataset, workflow and component modules
            dataset_class = util.get_module_class_ref(flags['dataset'])
            workflow_class = util.get_module_class_ref(flags['workflow'])
            component_class = util.get_module_class_ref(flags['component'])

            # Override workflow options
            workflow_opts = workflow_class.default_opts()
            workflow_opts.override_from_dict(workflow_opts_override)

            # Log experiment settings
            print('Dataset:', flags['dataset'])
            print('Workflow:', flags['workflow'])
            print('Component:', flags['component'], '\n')

            print('Export Options:', json.dumps(export_opts, indent=4))
            print('Workflow Options:',
                  json.dumps(workflow_opts.values(), indent=4))
            print('Classifier Options:', json.dumps(classifier_opts,
                                                    indent=4))
            print('Checkpoint Options:', json.dumps(checkpoint_opts,
                                                    indent=4), '\n')

            # Setup Experiment Workflow
            # -----------------------------------------------------------------
            workflow = workflow_class(session,
                                      dataset_class,
                                      flags['dataset_location'],
                                      component_class,
                                      component_hparams_override,
                                      classifier_opts,
                                      export_opts,
                                      opts=workflow_opts,
                                      summarize=flags['summarize'],
                                      seed=flags['seed'],
                                      summary_dir=flags['summary_dir'],
                                      checkpoint_opts=checkpoint_opts)
            workflow.setup()

            # Start experiment to train the model, evaluating every N batches
            # -----------------------------------------------------------------
            workflow.run(flags['batches'],
                         evaluate=workflow_opts.evaluate,
                         train=workflow_opts.train)
        finally:
            session.close()
# Load the data: images become NHWC tensors with a single channel, labels
# become one-hot vectors over 10 classes.
train_data, train_labels, eval_data, eval_labels = load_fashion_data()
train_data = train_data.reshape(-1, 28, 28, 1)
train_labels = np_utils.to_categorical(train_labels, 10)
print(train_data.shape)

# Keras model: two conv/ReLU/max-pool stages, then a dense head with dropout.
# The layers are listed in order and appended to the existing `model` below.
network_layers = [
    InputLayer(input_shape=(28, 28, 1), name="1_Eingabe"),
    Conv2D(32, (2, 2),
           name="2_Conv2D",
           padding='same',
           kernel_initializer='random_uniform',
           bias_initializer=Constant(0.01)),
    Activation(activation='relu', name="3_ReLu"),
    MaxPool2D(padding='same', name="4_MaxPooling2D"),
    Conv2D(32, (2, 2),
           name="5_Conv2D",
           padding='same',
           kernel_initializer='random_uniform',
           bias_initializer=Constant(0.01)),
    Activation(activation='relu', name="6_ReLu"),
    MaxPool2D(padding='same', name="7_MaxPooling2D"),
    Flatten(),
    Dense(1024,
          name="8_Dense",
          activation='relu',
          kernel_initializer='random_uniform',
          bias_initializer=Constant(0.01)),
    Dropout(0.4, name="9_Dense"),
    Dense(10, activation='softmax', name="10_Ausgabe"),
]
for network_layer in network_layers:
    model.add(network_layer)

model.compile(optimizer=optimizers.Adadelta(),
              loss=losses.categorical_crossentropy,
              metrics=["accuracy", "mse", metrics.categorical_accuracy])

# Route every Keras backend call through the TensorBoard debugger session.
#keras.backend.set_session(tf_debug.TensorBoardDebugWrapperSession(tf.Session(), "localhost:12345"))
K.set_session(
    tf_debug.TensorBoardDebugWrapperSession(tf.Session(), "localhost:12345"))

history = model.fit(train_data,
                    train_labels,
                    epochs=100,
                    batch_size=64,
                    validation_split=0.33,
                    verbose=1)

# Optional output:
#plt.plot(history.history['val_loss'], 'r', history.history['val_acc'], 'b')
#plt.show()
def main(_):
    """Build the Mask R-CNN training graph and run the training loop.

    Creates the input placeholders, builds the model losses, sets up the
    optimizer (with global-norm gradient clipping), moving averages, saver and
    summary writer, then trains for `config.max_steps` steps, periodically
    printing averaged losses, saving checkpoints and writing summaries.

    Args:
        _: Unused positional argument (presumably the argv list supplied by
            the app runner -- confirm against the caller).

    Raises:
        RuntimeError: if loading the pre-trained weights fails.
        ValueError: if both --debug and --tensorboard_debug_address are set.
    """
    # Below, everything except the image itself uses normalized coordinates.
    # Input image [batch, height, width, channels], mean already subtracted.
    input_images = tf.placeholder(dtype=tf.float32,
                                  shape=(1, input_shape[0], input_shape[1], 3),
                                  name='input_images')
    # Ground-truth boxes, [batch, MAX_GT_INSTANCES, 4].
    gt_boxes = tf.placeholder(dtype=tf.float32,
                              shape=(1, None, 4),
                              name='gt_boxes')
    # Class ids, [batch, MAX_GT_INSTANCES].
    class_ids = tf.placeholder(dtype=tf.int32,
                               shape=(1, None),
                               name='class_ids')
    # Masks, [batch, MAX_GT_INSTANCES, height, width]; every ground-truth
    # instance has one label and one mask.
    input_gt_mask = tf.placeholder(dtype=tf.bool,
                                   shape=(1, None, None, None),
                                   name='input_gt_mask')
    # Ground-truth anchor labels, [batch, num_anchors]: 1 is positive,
    # 0 is negative, -1 is ignored.
    rpn_binary_gt = tf.placeholder(dtype=tf.int32,
                                   shape=(1, None),
                                   name='rpn_binary_gt')
    # Regression targets between anchors and ground truth,
    # [batch, num_anchors, (dx, dy, log(h), log(w))].
    anchor_deltas = tf.placeholder(dtype=tf.float32,
                                   shape=(1, None, 4),
                                   name='anchor_deltas')

    # Start from an empty checkpoint directory unless we are restoring.
    if not tf.gfile.Exists(config.checkpoint_path):
        tf.gfile.MakeDirs(config.checkpoint_path)
    else:
        if not config.restore:
            tf.gfile.DeleteRecursively(config.checkpoint_path)
            tf.gfile.MakeDirs(config.checkpoint_path)

    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    learning_rate = tf.train.exponential_decay(config.learning_rate,
                                               global_step,
                                               decay_steps=10000,
                                               decay_rate=0.94,
                                               staircase=True)
    tf.summary.scalar('lr', learning_rate)
    opt = tf.train.AdamOptimizer(learning_rate)

    mask_rcnn = MASK_RCNN()
    # build_model(mode, input_image, gt_boxes=None, class_ids=None,
    #             input_gt_mask=None, anchor_labels=None, anchor_deltas=None)
    rpn_loss, proposal_loss, mask_loss, model_loss = mask_rcnn.build_model(
        'training', input_images, gt_boxes, class_ids, input_gt_mask,
        rpn_binary_gt, anchor_deltas)

    total_loss = model_loss + tf.add_n(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    batch_norm_updates_op = tf.group(
        *tf.get_collection(tf.GraphKeys.UPDATE_OPS))

    with_clip = True
    if with_clip:
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(total_loss, tvars),
                                          10.0)
        gradient_op = opt.apply_gradients(list(zip(grads, tvars)),
                                          global_step=global_step)
    else:
        gradient_op = opt.minimize(loss=total_loss, global_step=global_step)

    summary_op = tf.summary.merge_all()
    # Moving-average object, applied to every trainable variable.
    variable_averages = tf.train.ExponentialMovingAverage(
        config.moving_average_decay, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())
    # Equivalent to tf.group(variables_averages_op, gradient_op,
    # batch_norm_updates_op): train_op is a no-op that depends on all three.
    with tf.control_dependencies(
        [variables_averages_op, gradient_op, batch_norm_updates_op]):
        train_op = tf.no_op(name='train_op')

    saver = tf.train.Saver(tf.global_variables())
    summary_writer = tf.summary.FileWriter(config.summary_path,
                                           tf.get_default_graph())

    init = tf.global_variables_initializer()

    next_batch = producer().make_one_shot_iterator().get_next()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=gpu_options)) as sess:
        # When resuming from a checkpoint the variables must not be
        # re-initialized, so `init` only runs in the other two branches.
        if config.restore:
            print('continue training from previous checkpoint')
            ckpt = tf.train.latest_checkpoint(config.checkpoint_path)
            saver.restore(sess, ckpt)
        elif tf.gfile.Exists(weights_path):
            sess.run(init)
            try:
                print("trying to assign pre-trained model...")
                load_trained_weights(weights_path, sess, ignore_missing=True)
                print("assign pre-trained model done!")
            except Exception as err:
                # Raise a real exception (a bare string cannot be raised in
                # Python 3) and keep the original failure as the cause.
                raise RuntimeError(
                    'loading pre-trained model failed,please check your pretrained '
                    'model {:s}'.format(config.COCO_WEIGHTS_PATH)) from err
        else:
            print("use initial parameters")
            sess.run(init)

        if FLAGS.debug and FLAGS.tensorboard_debug_address:
            raise ValueError(
                "The --debug and --tensorboard_debug_address flags are mutually exclusive"
            )
        if FLAGS.debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess,
                                                        ui_type=FLAGS.ui_type)
        elif FLAGS.tensorboard_debug_address:
            sess = tf_debug.TensorBoardDebugWrapperSession(
                sess, FLAGS.tensorboard_debug_address)

        start = time.time()
        avg_ml = 0.0
        avg_tl = 0.0
        for step in range(config.max_steps):
            image_name, image, gt_box, gt_class, segmentation_mask, anchor_labels, anchor_deltas_in \
                = sess.run(next_batch)
            # Skip samples that contain no ground-truth boxes.
            while gt_box.shape[1] == 0:
                image_name, image, gt_box, gt_class, segmentation_mask, anchor_labels, anchor_deltas_in \
                    = sess.run(next_batch)

            feed = {
                input_images: image,
                gt_boxes: gt_box,
                class_ids: gt_class,
                input_gt_mask: segmentation_mask,
                rpn_binary_gt: anchor_labels,
                anchor_deltas: anchor_deltas_in
            }

            try:
                ml, tl, _, r_loss, p_loss, m_loss = sess.run(
                    [
                        model_loss, total_loss, train_op, rpn_loss,
                        proposal_loss, mask_loss
                    ],
                    feed_dict=feed)
            except ValueError:
                print("maybe no gt in this step")
                # No losses were produced for this step: skip the bookkeeping
                # below instead of reading undefined or stale ml/tl values.
                continue

            if np.isnan(tl):
                print('Loss diverged, stop training')
                break
            else:
                avg_ml += ml
                avg_tl += tl

            if step % 10 == 0:
                # NOTE(review): at step 0 only one step has run, yet the
                # values are still divided by 10 -- confirm this is intended.
                avg_time_per_step = (time.time() - start) / 10
                start = time.time()
                print(
                    'Step {}, model loss {:.4f}, total loss {:.4f}, {:.2f} seconds/step'
                    .format(step, avg_ml / 10, avg_tl / 10,
                            avg_time_per_step))
                avg_ml = 0.0
                avg_tl = 0.0

            if step % config.save_checkpoint_steps == 0:
                filename = os.path.join(config.checkpoint_path, "model.ckpt")
                saver.save(sess, filename, global_step=global_step)

            if step % config.save_summary_steps == 0:
                # NOTE(review): this runs train_op a second time on the same
                # batch; kept as in the original -- confirm it is intended.
                _, tl, summary_str = sess.run(
                    [train_op, total_loss, summary_op], feed_dict=feed)
                summary_writer.add_summary(summary_str, global_step=step)
def train():
    """Train a small convolutional MNIST classifier and log summaries.

    Builds the graph (one 5x5 convolution + 2x2 max-pool, a hidden
    ``nn_layer`` with dropout, and a 10-way logit layer), then runs
    ``FLAGS.max_steps`` iterations:

    * every 10th step: evaluate accuracy on the test feed and write a
      test summary;
    * every step with ``i % 100 == 99``: run a training step with full
      tracing and attach the run metadata to the train summary;
    * all other steps: run a training step and write a train summary.

    Reads ``FLAGS.data_dir``, ``FLAGS.fake_data``,
    ``FLAGS.tensorboard_debug_address``, ``FLAGS.learning_rate``,
    ``FLAGS.log_dir``, ``FLAGS.dropout`` and ``FLAGS.max_steps``.
    Returns nothing; results are printed and written under
    ``FLAGS.log_dir + '/train'`` and ``FLAGS.log_dir + '/test'``.
    """
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir,
                                      fake_data=FLAGS.fake_data)

    sess = tf.InteractiveSession()

    # Optionally route every sess.run() through the TensorBoard debugger.
    if FLAGS.tensorboard_debug_address:
        sess = tf_debug.TensorBoardDebugWrapperSession(
            sess, FLAGS.tensorboard_debug_address)

    # Create a multilayer model.

    # Input placeholders
    with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, 784], name='x-input')
        y_ = tf.placeholder(tf.int64, [None], name='y-input')

    with tf.name_scope('input_reshape'):
        image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
        tf.summary.image('input', image_shaped_input, 10)

    with tf.name_scope('conv_layer'):
        # Convolutional Layer
        W_conv = weight_variable([5, 5, 1, 12])
        b_conv = bias_variable([12])
        conv = tf.nn.conv2d(
            image_shaped_input, W_conv, strides=[1, 1, 1, 1], padding='SAME')
        h = tf.nn.relu(conv + b_conv)
        pool = tf.nn.max_pool(h, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                              padding='SAME')
        # 28x28 input, SAME conv, 2x2 pool -> 14 * 14 * 12 = 2352 features.
        pool_flattened = tf.layers.flatten(pool)

    hidden1 = nn_layer(pool_flattened, 2352, 42, 'layer1')

    with tf.name_scope('dropout'):
        keep_prob = tf.placeholder(tf.float32)
        tf.summary.scalar('dropout_keep_probability', keep_prob)
        dropped = tf.nn.dropout(hidden1, keep_prob)

    # Do not apply softmax activation yet, see below.
    y = nn_layer(dropped, 42, 10, 'layer2', act=tf.identity)

    with tf.name_scope('cross_entropy'):
        # The raw formulation of cross-entropy,
        #
        # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
        #                               reduction_indices=[1]))
        #
        # can be numerically unstable.
        #
        # So here we use tf.losses.sparse_softmax_cross_entropy on the
        # raw logit outputs of the nn_layer above, and then average across
        # the batch.
        with tf.name_scope('total'):
            # `y` holds unnormalised logits; taking tf.log of them directly
            # (without softmax) yields NaN/-inf for non-positive values, so
            # the fused op is used with the integer class labels instead.
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(
                labels=y_, logits=y)
    tf.summary.scalar('cross_entropy', cross_entropy)

    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
            cross_entropy)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(y, 1), y_)
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # Merge all the summaries and write them out to
    # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
    tf.global_variables_initializer().run()

    # Train the model, and also write summaries.
    # Every 10th step, measure test-set accuracy, and write test summaries
    # All other steps, run train_step on training data, & add training summaries

    def feed_dict(train):
        """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
        if train or FLAGS.fake_data:
            xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
            k = FLAGS.dropout
        else:
            # Evaluation uses the whole test set with dropout disabled.
            xs, ys = mnist.test.images, mnist.test.labels
            k = 1.0
        return {x: xs, y_: ys, keep_prob: k}

    for i in range(FLAGS.max_steps):
        if i % 10 == 0:  # Record summaries and test-set accuracy
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict=feed_dict(False))
            test_writer.add_summary(summary, i)
            print('Accuracy at step %s: %s' % (i, acc))
        else:  # Record train set summaries, and train
            if i % 100 == 99:  # Record execution stats
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                summary, _ = sess.run([merged, train_step],
                                      feed_dict=feed_dict(True),
                                      options=run_options,
                                      run_metadata=run_metadata)
                train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
                train_writer.add_summary(summary, i)
                print('Adding run metadata for', i)
            else:  # Record a summary
                summary, _ = sess.run([merged, train_step],
                                      feed_dict=feed_dict(True))
                train_writer.add_summary(summary, i)
    train_writer.close()
    test_writer.close()