def __init__(self,
             folderpath,
             search_name,
             evaluation_id,
             abort_if_exists=False,
             abort_if_notexists=False):
    self.evaluation_folderpath = get_evaluation_folderpath(
        folderpath, search_name, evaluation_id)
    self.evaluation_data_folderpath = get_evaluation_data_folderpath(
        folderpath, search_name, evaluation_id)
    assert (not abort_if_exists) or (not ut.folder_exists(
        self.evaluation_folderpath))
    assert (not abort_if_notexists) or ut.folder_exists(
        self.evaluation_folderpath)
    ut.create_folder(self.evaluation_folderpath,
                     abort_if_exists=abort_if_exists,
                     create_parent_folders=True)
    ut.create_folder(self.evaluation_data_folderpath,
                     abort_if_exists=abort_if_exists,
                     create_parent_folders=True)
    self.config_filepath = ut.join_paths(
        [self.evaluation_folderpath, 'config.json'])
    self.results_filepath = ut.join_paths(
        [self.evaluation_folderpath, 'results.json'])
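# Hedged sketch of the two path helpers used above, assuming the on-disk
# layout <folderpath>/<search_name>/evaluations/<evaluation_id>[/data]. The
# real helpers are defined elsewhere in this module; the names below carry a
# _sketch suffix to avoid clashing with them, and the layout is an assumption
# for illustration only.
import os


def get_evaluation_folderpath_sketch(folderpath, search_name, evaluation_id):
    # one folder per evaluated architecture
    return os.path.join(folderpath, search_name, 'evaluations',
                        str(evaluation_id))


def get_evaluation_data_folderpath_sketch(folderpath, search_name,
                                          evaluation_id):
    # user data (checkpoints, auxiliary files) nested under the evaluation
    return os.path.join(
        get_evaluation_folderpath_sketch(folderpath, search_name,
                                         evaluation_id), 'data')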
def __init__(self,
             train_dataset,
             val_dataset,
             num_classes,
             max_num_training_epochs=200,
             stop_patience=20,
             optimizer_type='sgd_mom',
             batch_size=128,
             whiten=False,
             init_lr=.015,
             lr_decay_value=.97,
             lr_num_epochs_per_decay=2.4,
             lr_warmup_epochs=3.0,
             weight_decay=.001,
             display_step=1,
             log_output_to_terminal=True,
             test_dataset=None,
             base_dir='scratch'):
    self.X_train = train_dataset.X
    self.X_val = val_dataset.X
    self.X_test = test_dataset.X if test_dataset else None
    self.y_train = train_dataset.y
    self.y_val = val_dataset.y
    self.y_test = test_dataset.y if test_dataset else None
    if whiten:
        self.X_train = aug.per_image_whiten(self.X_train)
        self.X_val = aug.per_image_whiten(self.X_val)
        # only whiten the test set if one was provided
        self.X_test = aug.per_image_whiten(
            self.X_test) if self.X_test is not None else None
    self.in_dim = list(train_dataset.next_batch(1)[0].shape[1:])

    self.num_examples = self.X_train.shape[0]
    self.batch_size = batch_size
    # ceiling division so a final partial batch still counts as a step
    self.steps_per_epoch = (self.num_examples + self.batch_size -
                            1) // self.batch_size
    self.max_num_training_epochs = max_num_training_epochs
    self.display_step = display_step
    self.stop_patience = stop_patience
    self.init_lr = init_lr
    self.lr_decay_value = lr_decay_value
    self.lr_num_epochs_per_decay = lr_num_epochs_per_decay
    self.lr_warmup_epochs = lr_warmup_epochs
    self.weight_decay = weight_decay
    self.optimizer_type = optimizer_type
    self.log_output_to_terminal = log_output_to_terminal

    self.base_dir = base_dir
    self.num_archs = 0
    # start each run from a clean scratch directory
    ut.create_folder(base_dir, abort_if_exists=False)
    delete_files_in_folder(base_dir)
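# Hedged usage sketch: constructing the evaluator (assumed to be
# SimpleClassifierEvaluator, per the call site in main() below) from
# in-memory MNIST splits. The keyword values are illustrative, not
# recommended settings.
Xtrain, ytrain, Xval, yval, Xtest, ytest = load_mnist('data/mnist/')
train_dataset = InMemoryDataset(Xtrain, ytrain, True)
val_dataset = InMemoryDataset(Xval, yval, False)
evaluator = SimpleClassifierEvaluator(train_dataset,
                                      val_dataset,
                                      num_classes=10,
                                      max_num_training_epochs=4,
                                      whiten=True,
                                      base_dir='scratch')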
def create_search_folderpath(folderpath,
                             search_name,
                             abort_if_exists=False,
                             delete_if_exists=False,
                             create_parent_folders=False):
    assert not (abort_if_exists and delete_if_exists)

    search_folderpath = get_search_folderpath(folderpath, search_name)
    search_data_folderpath = get_search_data_folderpath(
        folderpath, search_name)
    all_evaluations_folderpath = get_all_evaluations_folderpath(
        folderpath, search_name)

    if delete_if_exists:
        ut.delete_folder(search_folderpath, False, False)
    assert not (abort_if_exists and ut.folder_exists(search_folderpath))
    if not ut.folder_exists(search_folderpath):
        ut.create_folder(search_folderpath,
                         create_parent_folders=create_parent_folders)
        ut.create_folder(search_data_folderpath)
        ut.create_folder(all_evaluations_folderpath)
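# Usage sketch: set up (or reset) the on-disk state for a search named
# 'mnist_search' under 'logs/'; both names are illustrative.
# delete_if_exists wipes any previous run, and the first assert above
# guarantees it is never combined with abort_if_exists.
create_search_folderpath('logs',
                         'mnist_search',
                         delete_if_exists=True,
                         create_parent_folders=True)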
def main():
    configs = ut.read_jsonfile(
        "./examples/tensorflow/full_benchmarks/experiment_config.json")

    parser = argparse.ArgumentParser("MPI Job for architecture search")
    parser.add_argument('--config',
                        '-c',
                        action='store',
                        dest='config_name',
                        default='normal')
    # Other arguments
    parser.add_argument('--display-output',
                        '-o',
                        action='store_true',
                        dest='display_output',
                        default=False)
    parser.add_argument('--resume',
                        '-r',
                        action='store_true',
                        dest='resume',
                        default=False)
    options = parser.parse_args()

    config = configs[options.config_name]
    num_procs = config['num_procs'] if 'num_procs' in config else 0
    comm = get_communicator(config['communicator'], num_procs)

    if len(gpu_utils.get_gpu_information()) != 0:
        # https://github.com/tensorflow/tensorflow/issues/1888
        gpu_utils.set_visible_gpus(
            [comm.get_rank() % gpu_utils.get_total_num_gpus()])

    if 'eager' in config and config['eager']:
        import tensorflow as tf
        tf.logging.set_verbosity(tf.logging.ERROR)
        tf.enable_eager_execution()

    datasets = {
        'cifar10': lambda: (load_cifar10('data/cifar10/', one_hot=False), 10),
        'mnist': lambda: (load_mnist('data/mnist/'), 10),
    }

    (Xtrain, ytrain, Xval, yval, Xtest,
     ytest), num_classes = datasets[config['dataset']]()
    search_space_factory = name_to_search_space_factory_fn[
        config['search_space']](num_classes)

    save_every = 1 if 'save_every' not in config else config['save_every']
    if comm.get_rank() == 0:
        searcher = name_to_searcher_fn[config['searcher']](
            search_space_factory.get_search_space)
        num_samples = -1 if 'samples' not in config else config['samples']
        num_epochs = -1 if 'epochs' not in config else config['epochs']
        start_searcher(comm,
                       searcher,
                       options.resume,
                       config['search_folder'],
                       config['search_name'],
                       config['searcher_file_name'],
                       num_samples=num_samples,
                       num_epochs=num_epochs,
                       save_every=save_every)
    else:
        train_dataset = InMemoryDataset(Xtrain, ytrain, True)
        val_dataset = InMemoryDataset(Xval, yval, False)
        test_dataset = InMemoryDataset(Xtest, ytest, False)

        search_path = sl.get_search_folderpath(config['search_folder'],
                                               config['search_name'])
        ut.create_folder(ut.join_paths([search_path, 'scratch_data']),
                         create_parent_folders=True)
        scratch_folder = ut.join_paths(
            [search_path, 'scratch_data', 'eval_' + str(comm.get_rank())])
        ut.create_folder(scratch_folder)

        # pass the per-rank scratch path as base_dir to match the evaluator
        # signature; passing it positionally would collide with
        # max_num_training_epochs.
        evaluators = {
            'simple_classification': lambda: SimpleClassifierEvaluator(
                train_dataset,
                val_dataset,
                num_classes,
                max_num_training_epochs=config['eval_epochs'],
                log_output_to_terminal=options.display_output,
                test_dataset=test_dataset,
                base_dir='./temp' + str(comm.get_rank())),
        }

        assert not config['evaluator'].startswith('enas') or hasattr(
            search_space_factory, 'weight_sharer')
        evaluator = evaluators[config['evaluator']]()

        start_worker(comm,
                     evaluator,
                     search_space_factory,
                     config['search_folder'],
                     config['search_name'],
                     resume=options.resume,
                     save_every=save_every)
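# Hedged sketch of one entry in experiment_config.json, reconstructed from
# the keys that main() reads ('eager', 'epochs', and 'num_procs' are
# optional); all values below are illustrative, not defaults shipped with
# the repo.
#
# {
#     "normal": {
#         "communicator": "mpi",
#         "num_procs": 2,
#         "dataset": "mnist",
#         "search_space": "dnn",
#         "searcher": "random",
#         "search_folder": "logs",
#         "search_name": "mnist_search",
#         "searcher_file_name": "searcher_state.json",
#         "evaluator": "simple_classification",
#         "eval_epochs": 4,
#         "samples": 128,
#         "save_every": 1
#     }
# }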
def eval(self, inputs, outputs):
    tf.reset_default_graph()

    model_dir = ut.join_paths([self.base_dir, 'eval' + str(self.num_archs)])
    ut.create_folder(model_dir, abort_if_exists=False)

    def model_fn(features, labels, mode, params):
        feature_columns = list(get_feature_columns().values())
        images = tf.feature_column.input_layer(
            features=features, feature_columns=feature_columns)
        images = tf.reshape(
            images, shape=(-1, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_DEPTH))

        # materialize the sampled architecture into the current graph by
        # running the deep_architect forward pass over the module graph
        set_recompile(outputs, True)
        gc.collect()
        htfe.set_is_training(outputs, mode == tf.estimator.ModeKeys.TRAIN)
        co.forward({inputs['in']: images})
        logits = outputs['out'].val
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        predicted_classes = tf.argmax(logits, 1)

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {
                'class_ids': predicted_classes[:, tf.newaxis],
                'probabilities': tf.nn.softmax(logits),
                'logits': logits,
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        # define loss and optimizer
        train_vars = tf.trainable_variables()
        with tf.variable_scope('l2'):
            l2_loss = tf.add_n(
                [tf.nn.l2_loss(v) for v in train_vars
                 if 'kernel' in v.name]) * self.weight_decay
        unreg_loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=labels))
        loss = unreg_loss + l2_loss

        # Compute evaluation metrics.
        accuracy = tf.metrics.accuracy(labels=tf.argmax(labels, 1),
                                       predictions=predicted_classes,
                                       name='acc_op')
        metrics = {'accuracy': accuracy}

        if mode == tf.estimator.ModeKeys.EVAL:
            loss = tf.Print(loss, [
                accuracy, l2_loss, unreg_loss, loss,
                tf.argmax(labels, 1), predicted_classes
            ],
                            summarize=10)
            return tf.estimator.EstimatorSpec(mode,
                                              loss=loss,
                                              eval_metric_ops=metrics)

        # Create training op.
        assert mode == tf.estimator.ModeKeys.TRAIN
        step = tf.train.get_or_create_global_step()
        learning_rate = self.get_learning_rate(step)
        optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                              .9,
                                              momentum=.9,
                                              epsilon=1.0)
        loss = tf.Print(loss, [
            accuracy, l2_loss, unreg_loss, loss, learning_rate,
            tf.argmax(labels, 1), predicted_classes
        ],
                        summarize=10)
        # run the BN moving-average updates together with the train step
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(
                loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    # NUM_GPUS = 2
    # strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=NUM_GPUS)
    # config = tf.estimator.RunConfig(train_distribute=strategy)
    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops)
    run_config = tf.estimator.RunConfig(model_dir=model_dir,
                                        session_config=config)
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params={})

    seqs = ut.SequenceTracker(abort_if_different_lengths=True)
    best_val_acc = -np.inf
    stop_counter = self.stop_patience
    timer_manager = ut.TimerManager()
    timer_manager.create_timer('eval')

    # getting the gpu_id based on the environment.
    if gpu_utils.is_environment_variable_defined('CUDA_VISIBLE_DEVICES'):
        s = gpu_utils.get_environment_variable('CUDA_VISIBLE_DEVICES')
        s_lst = s.split(',')
        if len(s_lst) == 1 and len(s_lst[0]) > 0:
            gpu_id = int(s_lst[0])
        else:
            gpu_id = None
    else:
        gpus = gpu_utils.get_gpu_information()
        if len(gpus) == 1:
            gpu_id = 0
        else:
            gpu_id = None

    for epoch in range(self.max_num_training_epochs):
        train_fn = lambda: input_fn(self.X_train, self.y_train, train=True)
        val_fn = lambda: input_fn(self.X_val, self.y_val, train=False)

        print('\n\nTraining')
        estimator.train(input_fn=train_fn)
        print('\n\nEvaluating')
        eval_results = estimator.evaluate(input_fn=val_fn)

        # early stopping
        val_acc = eval_results['accuracy']

        # Display logs per epoch step
        if self.log_output_to_terminal and epoch % self.display_step == 0:
            print("time:", "%7.1f" %
                  timer_manager.get_time_since_event('eval', 'start'),
                  "epoch:", '%04d' % (epoch + 1),
                  "validation loss:", "{:.9f}".format(eval_results['loss']),
                  "validation_accuracy:", "%.5f" % val_acc)

        d = {
            'validation_accuracy': val_acc,
            'validation_loss': eval_results['loss'],
            'epoch_number': epoch + 1,
            'time_in_minutes':
                timer_manager.get_time_since_event('eval',
                                                   'start',
                                                   units='minutes'),
        }
        seqs.append(d)

        # update the patience counters.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            # reinitialize all the counters.
            stop_counter = self.stop_patience
        else:
            stop_counter -= 1
            if stop_counter == 0:
                break

    print("Optimization Finished!")

    timer_manager.tick_timer('eval')
    eval_results = estimator.evaluate(
        input_fn=lambda: input_fn(self.X_val, self.y_val))
    val_acc = eval_results['accuracy']
    t_infer = (
        timer_manager.get_time_since_last_tick('eval', 'miliseconds') /
        self.X_val.shape[0])

    print("Validation accuracy: %f" % val_acc)
    seqs_dict = seqs.get_dict()
    results = {
        'validation_accuracy': val_acc,
        'num_parameters': float(htf.get_num_trainable_parameters()),
        'inference_time_per_example_in_miliseconds': t_infer,
        'num_training_epochs': seqs_dict['epoch_number'],
        'sequences': seqs_dict
    }
    if 'gpu_utilization_in_percent' in seqs_dict:
        results['average_gpu_utilization_in_percent'] = np.mean(
            seqs_dict['gpu_utilization_in_percent'])
        results['average_gpu_memory_utilization_in_gigabytes'] = np.mean(
            seqs_dict['gpu_memory_utilization_in_gigabytes'])
    # use `is not None`: `!= None` on numpy arrays compares elementwise and
    # raises when coerced to a single boolean
    if self.X_test is not None and self.y_test is not None:
        test_results = estimator.evaluate(
            input_fn=lambda: input_fn(self.X_test, self.y_test))
        test_acc = test_results['accuracy']
        print("Test accuracy: %f" % test_acc)
        results['test_accuracy'] = test_acc
    results['training_time_in_hours'] = timer_manager.get_time_since_event(
        'eval', 'start', units='hours')
    self.num_archs += 1
    return results
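# Hedged sketch of the input_fn referenced in the training loop above; the
# real definition lives elsewhere in this module. Assumptions: the feature
# column produced by get_feature_columns() is keyed 'images', and y is
# one-hot (model_fn applies tf.argmax(labels, 1)). With no .repeat(), each
# estimator.train call consumes exactly one pass over the data, matching the
# one-epoch-per-iteration loop above.
import tensorflow as tf


def input_fn_sketch(X, y, train=False, batch_size=128):
    dataset = tf.data.Dataset.from_tensor_slices(({'images': X}, y))
    if train:
        # reshuffle each epoch so training batches differ between passes
        dataset = dataset.shuffle(buffer_size=X.shape[0])
    return dataset.batch(batch_size)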