def main():
    ####################################################################################################
    title = opts.title
    seed = opts.seed
    mode = opts.mode
    gpu_list = opts.gpu_list
    batch_size = opts.batch_size
    dataset = opts.dataset
    preprocess = opts.preprocess
    network = opts.network
    optimizer = opts.optimizer
    lr_decay = opts.lr_decay
    epoch_step = opts.epoch_step
    learning_step = opts.learning_step
    path_load = opts.path_load
    path_save = opts.path_save
    print_line()
    ####################################################################################################
    time_tag = get_time('%y-%m-%d %X')
    time_tag_short = time_tag[:8]
    seed = set_seed(seed)
    # Find a log file name that is not taken yet; a 'temp' title always overwrites its log.
    num_check_log = 0
    title_temp = title
    while True:
        path_log = '../log/' + time_tag_short + '(' + title_temp + ').txt'
        if os.path.isfile(path_log) and title != 'temp':
            num_check_log += 1
            title_temp = title + '_%d' % num_check_log
        else:
            title = title_temp
            del num_check_log, title_temp
            break
    print('title: ' + title)
    set_log(path_log)
    print_line()
    ####################################################################################################
    print(time_tag)
    print('SEED = %d' % seed)
    print_opts('options/' + OPTION + '.py')
    print_line()
    ####################################################################################################
    model_dir = '../model/'
    if isinstance(path_save, bool):  # if title is 'temp', do not save the model
        path_save = model_dir + time_tag_short + '(' + title + ').tf' if path_save and title != 'temp' else None
    if path_load is not None:
        # keyword search for an existing checkpoint
        matches = glob.glob(model_dir + '*' + path_load + '*.tf.data*')
        if len(matches) == 0:
            raise FileNotFoundError('Could not find any model file matching the keyword ' + path_load)
        elif len(matches) > 1:
            for match in matches:
                print(match)
            raise FileNotFoundError('Found more than one model file matching the keyword ' + path_load)
        path_load = matches[0][:matches[0].find('.tf.') + 3]
        print('Found model in', path_load)
    ####################################################################################################
    os.environ['CUDA_VISIBLE_DEVICES'] = ''.join(str(gpu) + ',' for gpu in gpu_list)
    num_worker = max(len(gpu_list), 1)
    dataset_train = get_dataset(dataset, split='train')
    dataset_test = get_dataset(dataset, split='test')
    num_batch_train = dataset_train.num_sample // batch_size
    num_batch_test = dataset_test.num_sample // 100
    assert batch_size % num_worker == 0, \
        'batch_size %d cannot be divided by the number of workers %d' % (batch_size, num_worker)
    iterator_train = get_batch(dataset_train, preprocess, True, batch_size // num_worker, seed=seed)
    iterator_test = get_batch(dataset_test, preprocess, False, 100, seed=seed)
    ####################################################################################################
    if mode in ['input_train', 'input_test']:
        if mode == 'input_train':
            num_batch = num_batch_train
            batch_input = iterator_train.get_next()
        else:
            num_batch = num_batch_test
            batch_input = iterator_test.get_next()
        print('Testing the speed of the data input pipeline.')
        sess = get_session()
        while True:
            for _ in tqdm(range(num_batch), desc='Input pipeline', leave=False, smoothing=0.1):
                sess.run(batch_input)
    ####################################################################################################
    nets = []
    net = get_net_fn(network)
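    # A summary of the placement strategy chosen by the branches below, following the
    # usual tower pattern:
    #   single GPU -> variables and Ops both live on that GPU (nothing to share);
    #   multi-GPU  -> variables are pinned to the CPU ("CPU parameter server") so every
    #                 tower reads and updates one copy, while each tower's forward and
    #                 backward Ops run on its own GPU;
    #   CPU only   -> everything stays on the CPU.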
    if num_worker == 1:
        if len(gpu_list) == 0:
            print('Multi-CPU training, it might be slow')
            print('All parameters are pinned to CPU, all Ops are pinned to CPU')
            is_cpu_ps = True
        else:
            print('Single-GPU training with gpu', gpu_list[0])
            print('All parameters are pinned to GPU, all Ops are pinned to GPU')
            is_cpu_ps = False
    elif num_worker > 1:
        print('Multi-GPU training tower with gpu list', gpu_list)
        print('All parameters are pinned to CPU, all Ops are pinned to GPU')
        print('Get batchnorm moving average updates from data in the first GPU for speed')
        print('Get L2 decay grads in the second GPU for speed')
        is_cpu_ps = True
    else:
        raise NotImplementedError('Unrecognized device settings')
    tower_grads = []
    tower_losses = []
    tower_errors = []
    # Loop over the workers and create a copy ("tower") of the model on each one.
    for i in range(num_worker):
        worker = '/gpu:%d' % i if gpu_list else '/cpu:0'
        # Create a device setter that decides where each Op is placed.
        if is_cpu_ps:
            # tf.train.replica_device_setter supports placing variables on the CPU, all
            # on one GPU, or on ps_servers defined in a cluster_spec.
            device_setter = tf.train.replica_device_setter(worker_device=worker, ps_device='/cpu:0', ps_tasks=1)
        else:
            device_setter = worker
        # 1. pin Ops to the GPU;
        # 2. pin parameters to the CPU (multi-GPU training) or the GPU (single-GPU training);
        # 3. reuse parameters across towers in multi-GPU training.
        # Variables are created on the first iteration; on subsequent iterations reuse is
        # set to True, so the towers share variables. tf.device calls the device_setter
        # for each Op that is created, and the device_setter returns the device the Op is
        # to be placed on.
        with tf.variable_scope(tf.get_variable_scope(), reuse=bool(i != 0)), \
                tf.device(device_setter):
            print('Training model on GPU %d' % gpu_list[i]) if gpu_list else print('Training model on CPUs')
            if mode == 'speed_net':
                with tf.device('/cpu:0'):
                    print('Testing the speed of the model with synthesized data, '
                          'which is theoretically the maximum speed for training this model')
                    batch_train = iterator_train.get_next()
                    shape_x = [batch_size // num_worker] + batch_train[0].get_shape().as_list()[1:]
                    shape_y = [batch_size // num_worker] + batch_train[1].get_shape().as_list()[1:]
                    batch_train_x = tf.zeros(shape_x, dtype=tf.float32)
                    batch_train_y = tf.zeros(shape_y, dtype=tf.float32)
                    batch_train = [batch_train_x, batch_train_y]
            else:
                batch_train = iterator_train.get_next()
            nets.append(net(batch_train[0], batch_train[1], opts=opts, is_training=True))
            tower_losses.append(nets[i].loss)
            tower_errors.append(nets[i].error)
            if i == 0:
                # Only take batchnorm moving-average updates from the first worker for speed.
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                nets[-1].count_parameters()
                nets[-1].count_MACs()
                nets[-1].count_MEMs()
            loss_worker = nets[i].loss
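            # Why computing the L2 penalty on a single tower works (a sketch, assuming
            # aggregate_gradients averages the per-tower gradients): each tower's
            # gradient enters the average with weight 1/num_worker, so scaling the one
            # tower's L2 term by num_worker restores the full penalty gradient:
            #   mean_i(grad(L2_i)) = (num_worker * grad(L2)) / num_worker = grad(L2)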
            if num_worker == 1:
                # Single-GPU or multi-CPU training: add the L2 decay loss directly.
                loss_worker += nets[i].get_l2_loss()
            elif i == 1:
                # Only compute the L2 decay grads in the second worker for speed.
                # In this case, the L2 grads must be multiplied by num_worker to keep
                # the aggregated gradient equivalent.
                loss_worker += num_worker * nets[i].get_l2_loss()
            tower_grads.append(optimizer.compute_gradients(loss_worker, colocate_gradients_with_ops=True))
            if i == num_worker - 1:
                print('Testing model on GPU %d' % gpu_list[i]) if gpu_list else print('Testing model on CPUs')
                tf.get_variable_scope().reuse_variables()
                batch_test = iterator_test.get_next()
                nets.append(net(batch_test[0], batch_test[1], opts=opts, is_training=False))
                error_batch_test = nets[-1].error
                if mode in ['attack']:
                    print('Attack model on GPU %d' % gpu_list[i - 1]) if gpu_list else print('Attack model on CPUs')
                    tf.get_variable_scope().reuse_variables()
                    batch_attack_x = tf.placeholder(shape=batch_test[0].get_shape(), dtype=batch_test[0].dtype)
                    batch_attack_y = tf.placeholder(shape=batch_test[1].get_shape(), dtype=batch_test[1].dtype)
                    nets.append(net(batch_attack_x, batch_attack_y, opts=opts, is_training=False))
                    error_batch_attack = nets[-1].error
    with tf.device('/cpu:0' if is_cpu_ps else worker):
        grad_batch_train = aggregate_gradients(tower_grads)
        loss_batch_train = aggregate_statistics(tower_losses)
        error_batch_train = aggregate_statistics(tower_errors)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.apply_gradients(grad_batch_train, global_step=learning_step)
    ####################################################################################################
    if hasattr(opts, 'delay'):
        delay4gpus(opts.delay, gpu_list=gpu_list)
    sess = get_session()
    saver = tf.train.Saver(max_to_keep=None)

    def evaluate():
        error_test = 0.
        for _ in tqdm(range(num_batch_test), desc='Test', leave=False, smoothing=0.1):
            error_test += sess.run(error_batch_test)
        return error_test / num_batch_test

    def attack(black=False, num_batch=None):
        error_fgsm = 0.
        delta = 1. / 64
        if num_batch is None:
            num_batch = num_batch_test
        if black is False:
            # white-box: craft adversarial samples from this model's own input gradients
            adversarial_x = []
            adversarial_y = []
            for _ in tqdm(range(num_batch), desc='Attack', leave=False, smoothing=0.1):
                # nets[1] is the test network built above (single-worker layout)
                test_x, test_y, grads = sess.run([nets[1].H[0], nets[1].Y[0], nets[1].grads_H[0]])
                fgsm_x = test_x + delta * np.sign(grads)
                error_fgsm += sess.run(error_batch_attack,
                                       feed_dict={batch_attack_x: fgsm_x, batch_attack_y: test_y})
                adversarial_x.append(fgsm_x)
                adversarial_y.append(test_y)
            np.savez('adversarial_sample.npz', x=np.array(adversarial_x), y=np.array(adversarial_y))
        else:
            # black-box: reuse adversarial samples saved from another model
            adversarial_sample = np.load('adversarial_sample.npz')
            adversarial_x = adversarial_sample['x']
            adversarial_y = adversarial_sample['y']
            num_batch = adversarial_x.shape[0]  # evaluate exactly the saved batches
            for i in tqdm(range(num_batch), desc='Attack', leave=False, smoothing=0.1):
                error_fgsm += sess.run(error_batch_attack,
                                       feed_dict={batch_attack_x: adversarial_x[i, ...],
                                                  batch_attack_y: adversarial_y[i, ...]})
        return error_fgsm / num_batch
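    # A note on attack() above: it is the fast gradient sign method (FGSM), which takes a
    # single step along the sign of the loss gradient with respect to the input,
    #   x_adv = x + delta * sign(dL/dx),
    # here with a fixed step delta = 1/64 in input units, assuming nets[1].grads_H[0]
    # holds the loss gradient with respect to the input batch.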
    def save_model(path):
        saver.save(sess, path)
        print('S', end='')

    def load_model(path):
        print('Loading model from %s ...' % path)
        saver.restore(sess, path)

    if path_load is not None:
        load_model(path_load)
        error_test_best = evaluate()
        print('Test: %.4f' % error_test_best)
        if mode == 'attack':
            print(attack(black=False, num_batch=None))
        if mode == 'export':
            vars_list = get_variable('shift')[:48]
            vars_numpy = sess.run(vars_list)
            export(vars_numpy, 'shift')
        if mode in ['test', 'export', 'attack']:
            exit(0)
    print_line()
    ####################################################################################################
    while True:
        # update the learning rate; a non-positive value signals the end of training
        lr_epoch = sess.run(lr_decay)
        if lr_epoch <= 0:
            break
        epoch = sess.run(epoch_step)
        print('Epoch: %03d' % epoch, end=' ')
        loss_epoch = 0.
        error_epoch = 0.
        t0 = get_time()
        for batch in tqdm(range(num_batch_train), desc='Epoch: %03d' % epoch, leave=False, smoothing=0.1):
            if mode == 'debug':
                print('DEBUG: ')
                _, loss_delta, error_delta, H, W, gradsH, gradsW, label_ = sess.run(
                    [train_op, loss_batch_train, error_batch_train,
                     nets[0].H, nets[0].W, nets[0].grads_H, nets[0].grads_W, nets[0].Y])
            else:
                _, loss_delta, error_delta = sess.run([train_op, loss_batch_train, error_batch_train])
            loss_epoch += loss_delta
            error_epoch += error_delta
        print('Loss: %.6f Train: %.4f' % (loss_epoch / num_batch_train, error_epoch / num_batch_train), end=' ')
        FPS = num_batch_train * batch_size / (get_time() - t0)
        error_test = evaluate()
        assert error_test > 1e-4, 'Invalid test error %f, something went wrong' % error_test
        print('Test: %.4f lr: %.4f FPS: %d' % (error_test, lr_epoch, FPS), end=' ')
        sess.run(epoch_step.assign(epoch + 1))
        if epoch == 1:
            error_test_best = min(error_test, 0.9)
        if error_test < error_test_best:
            print('B', end=' ')  # 'B' marks a new best model, 'S' marks a saved checkpoint
            if path_save is not None:
                save_model(path_save)
            error_test_best = error_test
        print('')
    print_line()
    ####################################################################################################
    sess.close()
    print('Optimization ended at ' + get_time('%y-%m-%d %X'))
    return 0
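
# A minimal entry point, assuming this file is run directly as a script and that `opts`
# and the helpers used above are imported at module level; the original repository may
# invoke main() elsewhere.
if __name__ == '__main__':
    main()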