def setup_bnds_decay_rates(model_name, dataset_name):
  """Setup boundaries & decay rates for the piecewise-constant learning rate schedule.

  NOTE: The bnd_decay_rates here is mgw_size invariant.

  Args:
  * model_name: model's name; 'resnet*' / 'lenet' (cifar_10) or 'resnet*' / 'mobilenet*' (ilsvrc_12)
  * dataset_name: dataset's name; must be one of ['cifar_10', 'ilsvrc_12']

  Returns:
  * init_lr: initial learning rate
  * bnds: iteration indices at which the learning rate decays
  * decay_rates: decay rate applied within each stage
  * finetune_steps: number of fine-tuning iterations

  Raises:
  * NotImplementedError: if the (model, dataset) combination is not supported
  """
  batch_size = FLAGS.batch_size if not FLAGS.enbl_multi_gpu else FLAGS.batch_size * mgw.size()
  nb_batches_per_epoch = int(FLAGS.nb_smpls_train / batch_size)
  mgw_size = int(mgw.size()) if FLAGS.enbl_multi_gpu else 1
  # linearly scale the learning rate with the overall batch size (multi-GPU only)
  init_lr = FLAGS.lrn_rate_init * FLAGS.batch_size * mgw_size / FLAGS.batch_size_norm \
    if FLAGS.enbl_multi_gpu else FLAGS.lrn_rate_init
  bnds, decay_rates = None, None
  if dataset_name == 'cifar_10':
    if model_name.startswith('resnet'):
      bnds = [nb_batches_per_epoch * 15, nb_batches_per_epoch * 40]
      decay_rates = [1e-3, 1e-4, 1e-5]
    elif model_name.startswith('lenet'):
      bnds = [nb_batches_per_epoch * 5, nb_batches_per_epoch * 30]
      decay_rates = [1e-4, 1e-5, 1e-6]
  elif dataset_name == 'ilsvrc_12':
    if model_name.startswith('resnet'):
      bnds = [nb_batches_per_epoch * 5, nb_batches_per_epoch * 20]
      decay_rates = [1e-4, 1e-5, 1e-6]
    elif model_name.startswith('mobilenet'):
      bnds = [nb_batches_per_epoch * 5, nb_batches_per_epoch * 30]
      decay_rates = [1e-4, 1e-5, 1e-6]
  # BUG FIX: previously an unsupported combination fell through and raised a
  # confusing NameError at the return statement; fail fast with a clear message
  if bnds is None:
    raise NotImplementedError('model: {} / dataset: {}'.format(model_name, dataset_name))
  finetune_steps = nb_batches_per_epoch * FLAGS.uql_quant_epochs
  init_lr = init_lr if FLAGS.enbl_warm_start else FLAGS.lrn_rate_init
  return init_lr, bnds, decay_rates, finetune_steps
def __monitor_progress(self, summary, log_rslt, idx_iter, time_step):
  """Monitor the training progress.

  Args:
  * summary: summary protocol buffer
  * log_rslt: logging operations' results
  * idx_iter: index of the training iteration
  * time_step: time step between two summary operations
  """
  # dump summaries so they show up in TensorBoard
  self.sm_writer.add_summary(summary, idx_iter)

  # training throughput in images per second (aggregated over all workers)
  nb_pics_per_sec = FLAGS.batch_size * FLAGS.summ_step / time_step
  if FLAGS.enbl_multi_gpu:
    nb_pics_per_sec *= mgw.size()

  # assemble & display monitored statistics
  stats = []
  for name, value in zip(self.log_op_names, log_rslt):
    stats.append('%s = %.4e' % (name, value))
  tf.logging.info('iter #%d: %s | speed = %.2f pics / sec'
                  % (idx_iter + 1, ' | '.join(stats), nb_pics_per_sec))
def setup_lrn_rate(self, global_step):
  """Setup the learning rate (and number of training iterations)."""
  # overall batch size across all workers
  if FLAGS.enbl_multi_gpu:
    batch_size = FLAGS.batch_size * mgw.size()
  else:
    batch_size = FLAGS.batch_size

  # version-specific learning rate schedule
  if FLAGS.mobilenet_version == 1:
    nb_epochs = 100
    idxs_epoch = [30, 60, 80, 90]
    decay_rates = [1.0, 0.1, 0.01, 0.001, 0.0001]
    lrn_rate = setup_lrn_rate_piecewise_constant(
        global_step, batch_size, idxs_epoch, decay_rates)
  elif FLAGS.mobilenet_version == 2:
    nb_epochs = 412
    epoch_step = 2.5
    decay_rate = 0.98**epoch_step  # which is better, 0.98 OR (0.98 ** epoch_step)?
    lrn_rate = setup_lrn_rate_exponential_decay(
        global_step, batch_size, epoch_step, decay_rate)
  else:
    raise ValueError('invalid MobileNet version: {}'.format(FLAGS.mobilenet_version))

  # identical in both branches, so hoisted out
  nb_iters = int(FLAGS.nb_smpls_train * nb_epochs * FLAGS.nb_epochs_rat / batch_size)
  return lrn_rate, nb_iters
def setup_lrn_rate(self, global_step):
  """Setup the learning rate (and number of training iterations).

  Args:
  * global_step: training iteration counter

  Returns:
  * lrn_rate: learning rate tensor
  * nb_iters: number of training iterations

  Raises:
  * ValueError: if the MobileNet version is neither 1 nor 2
  """
  # overall batch size across all workers
  batch_size = FLAGS.batch_size * (1 if not FLAGS.enbl_multi_gpu else mgw.size())
  if FLAGS.mobilenet_version == 1:
    # CLEANUP: removed dead `nb_epochs = 100` immediately overwritten by `412`;
    # both were unused since nb_iters is hard-coded below
    # epoch step & decay rate are themselves piecewise-constant schedules,
    # fed into an exponential-decay learning rate
    idxs_epoch = [12000, 20000]
    step_rate = [200, 200, 4000]
    epoch_step = setup_lrn_rate_piecewise_constant(
        global_step, batch_size, idxs_epoch, step_rate)
    decay_rates = [0.985, 0.980, 0.505]
    decay_rate = setup_lrn_rate_piecewise_constant(
        global_step, batch_size, idxs_epoch, decay_rates)
    lrn_rate = setup_lrn_rate_exponential_decay(
        global_step, batch_size, epoch_step, decay_rate)
    nb_iters = 30000  # fixed iteration budget
  elif FLAGS.mobilenet_version == 2:
    epoch_step = 500
    decay_rate = 0.9  # which is better, 0.98 OR (0.98 ** epoch_step)?
    lrn_rate = setup_lrn_rate_exponential_decay(
        global_step, batch_size, epoch_step, decay_rate)
    nb_iters = 15000  # fixed iteration budget
  else:
    raise ValueError('invalid MobileNet version: {}'.format(
        FLAGS.mobilenet_version))
  return lrn_rate, nb_iters
def __monitor_progress(self, idx_iter, log_rslt, time_prev):
  """Monitor the training progress (primary worker only).

  Args:
  * idx_iter: index of the training iteration
  * log_rslt: logging operations' results
  * time_prev: timestamp of the previous logging event

  Returns:
  * current timestamp (primary worker), or None (other workers)
  """
  if not self.__is_primary_worker():
    return None

  # training throughput in images per second (aggregated over all workers)
  speed = FLAGS.batch_size * self.tune_global_disp_steps / (timer() - time_prev)
  if FLAGS.enbl_multi_gpu:
    speed *= mgw.size()

  # display monitored statistics
  if self.dataset_name == 'coco2017-pose':
    if FLAGS.enbl_dst:
      lrn_rate, dst_loss, model_loss, loss, total_loss, total_loss_ll_paf, total_loss_ll_heat, total_loss_ll = log_rslt[:8]
      tf.logging.info(
        'iter #%d: lr = %e | dst_loss = %.4f | model_loss = %.4f | loss = %.4f | ll_paf = %.4f | ll_heat = %.4f | ll = %.4f | speed = %.2f pics / sec'
        % (idx_iter + 1, lrn_rate, dst_loss, model_loss, loss, total_loss_ll_paf,
           total_loss_ll_heat, total_loss_ll, speed))
    else:
      lrn_rate, model_loss, loss, total_loss, total_loss_ll_paf, total_loss_ll_heat, total_loss_ll = log_rslt[:7]
      tf.logging.info(
        'iter #%d: lr = %e | model_loss = %.4f | loss = %.4f | ll_paf = %.4f | ll_heat = %.4f | ll = %.4f | speed = %.2f pics / sec'
        % (idx_iter + 1, lrn_rate, model_loss, loss, total_loss_ll_paf,
           total_loss_ll_heat, total_loss_ll, speed))
  else:
    # BUG FIX: this classification branch previously ran unconditionally, so for
    # 'coco2017-pose' the pose statistics were logged and then the classification
    # statistics were mis-unpacked from the same results and logged again
    if FLAGS.enbl_dst:
      lrn_rate, dst_loss, model_loss, loss, acc_top1, acc_top5 = \
        log_rslt[0], log_rslt[1], log_rslt[2], log_rslt[3], log_rslt[4], log_rslt[5]
      tf.logging.info(
        'iter #%d: lr = %e | dst_loss = %e | model_loss = %e | loss = %e | acc_top1 = %e | acc_top5 = %e | speed = %.2f pics / sec '
        % (idx_iter + 1, lrn_rate, dst_loss, model_loss, loss, acc_top1, acc_top5, speed))
    else:
      lrn_rate, model_loss, loss, acc_top1, acc_top5 = \
        log_rslt[0], log_rslt[1], log_rslt[2], log_rslt[3], log_rslt[4]
      tf.logging.info(
        'iter #%d: lr = %e | model_loss = %e | loss = %e | acc_top1 = %e | acc_top5 = %e| speed = %.2f pics / sec'
        % (idx_iter + 1, lrn_rate, model_loss, loss, acc_top1, acc_top5, speed))

  return timer()
def __monitor_progress(self, summary, log_rslt, time_prev, idx_iter):
  """Monitor the training progress (primary worker only).

  Args:
  * summary: summary protocol buffer
  * log_rslt: logging operations' results
  * time_prev: timestamp of the previous logging event
  * idx_iter: index of the training iteration

  Returns:
  * current timestamp (primary worker), or None (other workers)
  """
  # only the primary worker writes summaries & logs
  if not self.is_primary_worker():
    return None

  # write summaries for TensorBoard visualization
  self.sm_writer.add_summary(summary, idx_iter)

  # training throughput in images per second (aggregated over all workers)
  speed = FLAGS.batch_size * FLAGS.summ_step / (timer() - time_prev)
  if FLAGS.enbl_multi_gpu:
    speed *= mgw.size()

  # display monitored statistics
  if FLAGS.enbl_dst:
    lrn_rate, dst_loss, model_loss, loss, acc_top1, acc_top5 = log_rslt[:6]
    tf.logging.info(
      'iter #%d: lr = %e | dst_loss = %.4f | model_loss = %.4f | loss = %.4f | acc_top1 = %.4f | acc_top5 = %.4f | speed = %.2f pics / sec'
      % (idx_iter + 1, lrn_rate, dst_loss, model_loss, loss, acc_top1, acc_top5, speed))
  else:
    lrn_rate, model_loss, loss, acc_top1, acc_top5 = log_rslt[:5]
    tf.logging.info(
      'iter #%d: lr = %e | model_loss = %.4f | loss = %.4f | acc_top1 = %.4f | acc_top5 = %.4f | speed = %.2f pics / sec'
      % (idx_iter + 1, lrn_rate, model_loss, loss, acc_top1, acc_top5, speed))

  return timer()
def setup_lrn_rate(self, global_step):
  """Setup the learning rate (and number of training iterations)."""
  # overall batch size across all workers
  if FLAGS.enbl_multi_gpu:
    batch_size = FLAGS.batch_size * mgw.size()
  else:
    batch_size = FLAGS.batch_size

  # piecewise-constant schedule: decay at epochs 30 / 60 / 80 / 90
  nb_epochs = 100
  idxs_epoch = [30, 60, 80, 90]
  decay_rates = [1.0, 0.1, 0.01, 0.001, 0.0001]
  lrn_rate = setup_lrn_rate_piecewise_constant(
      global_step, batch_size, idxs_epoch, decay_rates)
  nb_iters = int(FLAGS.nb_smpls_train * nb_epochs * FLAGS.nb_epochs_rat / batch_size)
  return lrn_rate, nb_iters
def setup_lrn_rate(self, global_step):
  """Setup the learning rate (and number of training iterations).

  Args:
  * global_step: training iteration counter

  Returns:
  * lrn_rate: learning rate tensor
  * nb_iters: number of training iterations
  """
  # CLEANUP: removed unused local `nb_epochs = 100` and the commented-out
  # dead line `#nb_iters = int(200)`; the iteration budget is fixed below.
  # NOTE(review): idxs_epoch holds fractional epoch boundaries (0.4 / 0.8),
  # unlike the integer epoch indices used by the sibling schedules — confirm intended.
  idxs_epoch = [0.4, 0.8]
  decay_rates = [0.001, 0.0005, 0.0001]
  batch_size = FLAGS.batch_size * (1 if not FLAGS.enbl_multi_gpu else mgw.size())
  lrn_rate = setup_lrn_rate_piecewise_constant(global_step, batch_size, idxs_epoch, decay_rates)
  nb_iters = 12000  # fixed iteration budget (independent of dataset size)
  return lrn_rate, nb_iters
def setup_lrn_rate(global_step, model_name, dataset_name):
  """Setup the learning rate for the given dataset.

  Args:
  * global_step: training iteration counter
  * model_name: model's name; must be one of ['lenet', 'resnet_*', 'mobilenet_v1', 'mobilenet_v2']
  * dataset_name: dataset's name; must be one of ['cifar_10', 'ilsvrc_12']

  Returns:
  * lrn_rate: learning rate
  * nb_batches: number of training mini-batches

  Raises:
  * NotImplementedError: if the model / dataset combination is not supported
  """
  # obtain the overall batch size across all GPUs
  batch_size = FLAGS.batch_size * (mgw.size() if FLAGS.enbl_multi_gpu else 1)

  # choose a learning rate protocol according to the model & dataset combination
  global_step = tf.cast(global_step, tf.int32)
  if dataset_name == 'cifar_10':
    if model_name == 'lenet':
      return setup_lrn_rate_lenet_cifar10(global_step, batch_size)
    if model_name.startswith('resnet'):
      return setup_lrn_rate_resnet_cifar10(global_step, batch_size)
    raise NotImplementedError('model: {} / dataset: {}'.format(model_name, dataset_name))
  if dataset_name == 'ilsvrc_12':
    if model_name.startswith('resnet'):
      return setup_lrn_rate_resnet_ilsvrc12(global_step, batch_size)
    if model_name.startswith('mobilenet_v1'):
      return setup_lrn_rate_mobilenet_v1_ilsvrc12(global_step, batch_size)
    if model_name.startswith('mobilenet_v2'):
      return setup_lrn_rate_mobilenet_v2_ilsvrc12(global_step, batch_size)
    raise NotImplementedError('model: {} / dataset: {}'.format(model_name, dataset_name))
  raise NotImplementedError('dataset: ' + dataset_name)
def __monitor_progress(self, summary, log_rslt, time_prev, idx_iter):
  """Monitor the training progress (primary worker only).

  Args:
  * summary: summary protocol buffer
  * log_rslt: logging operations' results
  * time_prev: timestamp of the previous logging event
  * idx_iter: index of the training iteration

  Returns:
  * current timestamp (primary worker), or None (other workers)
  """
  # early break for non-primary workers
  if not self.is_primary_worker():
    return None

  # write summaries for TensorBoard visualization
  self.sm_writer.add_summary(summary, idx_iter)

  # training throughput in images per second (aggregated over all workers)
  speed = FLAGS.batch_size * FLAGS.summ_step / (timer() - time_prev)
  if FLAGS.enbl_multi_gpu:
    speed *= mgw.size()

  # NOTE: for cifar-10, acc_top5 is 0.
  is_pose = (self.dataset_name == 'coco2017-pose')
  if is_pose and FLAGS.enbl_dst:
    lrn_rate, dst_loss, model_loss, loss, total_loss, total_loss_ll_paf, total_loss_ll_heat, total_loss_ll = log_rslt[:8]
    tf.logging.info(
      'iter #%d: lr = %e | dst_loss = %.4f | model_loss = %.4f | loss = %.4f | ll_paf = %.4f | ll_heat = %.4f | ll = %.4f | speed = %.2f pics / sec'
      % (idx_iter + 1, lrn_rate, dst_loss, model_loss, loss, total_loss_ll_paf,
         total_loss_ll_heat, total_loss_ll, speed))
  elif is_pose:
    lrn_rate, model_loss, loss, total_loss, total_loss_ll_paf, total_loss_ll_heat, total_loss_ll = log_rslt[:7]
    tf.logging.info(
      'iter #%d: lr = %e | model_loss = %.4f | loss = %.4f | ll_paf = %.4f | ll_heat = %.4f | ll = %.4f | speed = %.2f pics / sec'
      % (idx_iter + 1, lrn_rate, model_loss, loss, total_loss_ll_paf,
         total_loss_ll_heat, total_loss_ll, speed))
  elif FLAGS.enbl_dst:
    lrn_rate, dst_loss, model_loss, loss, acc_top1, acc_top5 = log_rslt[:6]
    tf.logging.info(
      'iter #%d: lr = %e | dst_loss = %.4f | model_loss = %.4f | loss = %.4f | acc_top1 = %.4f | acc_top5 = %.4f | speed = %.2f pics / sec'
      % (idx_iter + 1, lrn_rate, dst_loss, model_loss, loss, acc_top1, acc_top5, speed))
  else:
    lrn_rate, model_loss, loss, acc_top1, acc_top5 = log_rslt[:5]
    tf.logging.info(
      'iter #%d: lr = %e | model_loss = %.4f | loss = %.4f | acc_top1 = %.4f | acc_top5 = %.4f | speed = %.2f pics / sec'
      % (idx_iter + 1, lrn_rate, model_loss, loss, acc_top1, acc_top5, speed))

  return timer()
def __monitor_progress(self, summary, log_rslt):
  """Monitor the training progress (primary worker only).

  Args:
  * summary: summary protocol buffer
  * log_rslt: logging operations' results
  """
  # early break for non-primary workers
  if not self.__is_primary_worker():
    return

  # write summaries for TensorBoard visualization
  self.sm_writer.add_summary(summary, self.idx_iter)

  # compute the training speed & display monitored statistics
  lrn_rate, loss, accuracy = log_rslt[:3]
  speed = FLAGS.batch_size * FLAGS.summ_step / (timer() - self.time_prev)
  if FLAGS.enbl_multi_gpu:
    speed *= mgw.size()
  tf.logging.info('iter #%d: lr = %e | loss = %e | speed = %.2f pics / sec'
                  % (self.idx_iter + 1, lrn_rate, loss, speed))
  for key, value in zip(self.accuracy_keys, accuracy):
    tf.logging.info('{} = {}'.format(key, value))
  self.time_prev = timer()
def build(self, enbl_trn_val_split=False):
  '''Build iterator(s) for tf.data.Dataset() object.

  Args:
  * enbl_trn_val_split: whether to split into training & validation subsets

  Returns:
  * iterator_trn: iterator for the training subset
  * iterator_val: iterator for the validation subset
    OR
  * iterator: iterator for the chosen subset (training OR testing)

  Example:
    # build iterator(s)
    dataset = xxxxDataset(is_train=True)  # TF operations are not created
    iterator = dataset.build()  # TF operations are created
      OR
    iterator_trn, iterator_val = dataset.build(enbl_trn_val_split=True)  # for dataset-train only

    # use the iterator to obtain a mini-batch of images & labels
    images, labels = iterator.get_next()
  '''
  # list data files, optionally sharding them across workers
  file_names = tf.data.Dataset.list_files(self.file_pattern, shuffle=True)
  if self.enbl_shard:
    file_names = file_names.shard(mgw.size(), mgw.rank())

  # read & parse records in parallel
  dataset = file_names.apply(tf.contrib.data.parallel_interleave(
    self.dataset_fn, cycle_length=FLAGS.cycle_length))
  dataset = dataset.map(self.parse_fn, num_parallel_calls=FLAGS.nb_threads)

  # single iterator, or separate training / validation iterators
  if not (self.is_train and enbl_trn_val_split):
    return self.__make_iterator(dataset)
  iterator_val = self.__make_iterator(dataset.take(FLAGS.nb_smpls_val))
  iterator_trn = self.__make_iterator(dataset.skip(FLAGS.nb_smpls_val))
  return iterator_trn, iterator_val
def __retrain_network(self):
  """Retrain the network with layerwise regression & network fine-tuning."""
  # split the iteration budget evenly across workers
  nb_workers = mgw.size() if FLAGS.enbl_multi_gpu else 1
  nb_iters_rg = int(math.ceil(FLAGS.ws_nb_iters_rg / nb_workers))
  nb_iters_ft = int(math.ceil(FLAGS.ws_nb_iters_ft / nb_workers))

  # re-train the network with layerwise regression
  time_beg = timer()
  for rg_train_op in self.rg_train_ops:
    for __ in range(nb_iters_rg):
      self.sess_train.run(rg_train_op)
  time_rg = timer() - time_beg

  # re-train the network with global fine-tuning
  time_beg = timer()
  for __ in range(nb_iters_ft):
    self.sess_train.run(self.ft_train_op)
  time_ft = timer() - time_beg

  # display the time consumption
  tf.logging.info('time consumption: %.4f (s) - RG | %.4f (s) - FT' % (time_rg, time_ft))
def __init__(self, dataset_name, weights, statistics, bit_placeholders, ops,
             layerwise_tune_list, sess_train, sess_eval, saver_train, saver_eval,
             barrier_fn):
  """By passing the ops in the learner, we do not need to build the graph again
  for training and testing.

  Args:
  * dataset_name: a string that indicates which dataset to use
  * weights: a list of Tensors, the weights of networks to quantize
  * statistics: a dict, recording the number of weights, activations e.t.c.
  * bit_placeholders: a dict of placeholder Tensors, the input of bits
  * ops: a dict of ops, including train_op, eval_op e.t.c.
  * layerwise_tune_list: a tuple, in which [0] records the layerwise op and
    [1] records the layerwise l2_norm
  * sess_train: a session for train
  * sess_eval: a session for eval
  * saver_train: a Tensorflow Saver for the training graph
  * saver_eval: a Tensorflow Saver for the eval graph
  * barrier_fn: a function that implements barrier
  """
  self.dataset_name = dataset_name
  self.weights = weights
  self.statistics = statistics
  self.bit_placeholders = bit_placeholders
  self.ops = ops
  self.layerwise_tune_ops, self.layerwise_diff = \
    layerwise_tune_list[0], layerwise_tune_list[1]
  self.sess_train = sess_train
  self.sess_eval = sess_eval
  self.saver_train = saver_train
  self.saver_eval = saver_eval
  self.auto_barrier = barrier_fn

  # overall bit budget: total number of weights times the target average bit-width
  self.total_num_weights = sum(self.statistics['num_weights'])
  self.total_bits = self.total_num_weights * FLAGS.uql_equivalent_bits
  self.w_rl_helper = RLHelper(self.sess_train,
                              self.total_bits,
                              self.statistics['num_weights'],
                              self.weights,
                              random_layers=FLAGS.uql_enbl_random_layers)

  # tuning step counts are divided by the worker count so the global work is mgw-invariant
  self.mgw_size = int(mgw.size()) if FLAGS.enbl_multi_gpu else 1
  self.tune_global_steps = int(FLAGS.uql_tune_global_steps / self.mgw_size)
  self.tune_global_disp_steps = int(FLAGS.uql_tune_disp_steps / self.mgw_size)

  # build the RL training graph in a separate tf.Graph with its own session
  with tf.Graph().as_default():
    config = tf.ConfigProto()
    # pin each worker to its own GPU (single-GPU run uses device 0)
    config.gpu_options.visible_device_list = str(mgw.local_rank() \
      if FLAGS.enbl_multi_gpu else 0)
    self.sess_rl = tf.Session(config=config)

    # train an RL agent through multiple roll-outs
    self.s_dims = self.w_rl_helper.s_dims
    self.a_dims = 1
    # replay buffer sized as (#layers) x (a quarter of the roll-out count)
    buff_size = len(self.weights) * int(FLAGS.uql_nb_rlouts // 4)
    # action range is the allowed bit-width span; presumably actions are offsets
    # from uql_w_bit_min — TODO confirm against DdpgAgent's usage
    self.agent = DdpgAgent(self.sess_rl, self.s_dims, self.a_dims,
                           FLAGS.uql_nb_rlouts, buff_size,
                           a_min=0., a_max=FLAGS.uql_w_bit_max-FLAGS.uql_w_bit_min)
def __choose_discr_chns(self):  # pylint: disable=too-many-locals
  """Choose discrimination-aware channels.

  For each block: fine-tune the block, then for every layer in the block greedily
  re-add the most important channels (largest gradient norm) one at a time until
  the pruning ratio drops to FLAGS.dcp_prune_ratio, fine-tuning after each addition.
  """
  # select the most discriminative channels through multiple stages;
  # per-worker iteration counts are scaled so the global work is worker-invariant
  nb_workers = mgw.size() if FLAGS.enbl_multi_gpu else 1
  nb_iters_block = int(FLAGS.dcp_nb_iters_block / nb_workers)
  nb_iters_layer = int(FLAGS.dcp_nb_iters_layer / nb_workers)
  for idx_block in range(self.nb_blocks):
    # fine-tune the current block (summaries written every summ_step iterations)
    for idx_iter in range(nb_iters_block):
      if (idx_iter + 1) % FLAGS.summ_step != 0:
        self.sess_train.run(self.block_train_ops[idx_block])
      else:
        summary, __ = self.sess_train.run([self.summary_op, self.block_train_ops[idx_block]])
        if self.is_primary_worker('global'):
          self.sm_writer.add_summary(summary, nb_iters_block * idx_block + idx_iter)

    # select the most discriminative channels for each layer
    for idx_layer in range(1, self.nb_layers):  # do not prune the first layer
      if self.idxs_layer_to_block[idx_layer] != idx_block:
        continue

      # initialize the mask as all channels are pruned
      # mask axis 2 is assumed to be the input-channel axis — consistent with
      # the (0, 1, 3) reductions below
      mask_shape = self.sess_train.run(tf.shape(self.masks[idx_layer]))
      tf.logging.info('layer #{}: mask\'s shape is {}'.format(idx_layer, mask_shape))
      nb_chns = mask_shape[2]
      grad_norm_mask = np.ones(nb_chns)  # zeroed once a channel has been selected
      mask_vec = np.sum(self.sess_train.run(self.masks[idx_layer]), axis=(0, 1, 3))
      prune_ratio = 1.0 - float(np.count_nonzero(mask_vec)) / mask_vec.size
      tf.logging.info('layer #%d: prune_ratio = %.4f' % (idx_layer, prune_ratio))

      # greedily re-add channels until the target pruning ratio is reached;
      # the first pass always runs (it resets the mask via mask_init_ops)
      is_first_entry = True
      while is_first_entry or prune_ratio > FLAGS.dcp_prune_ratio:
        # choose the most important channel and then update the mask
        grad_norm = self.sess_train.run(self.grad_norms[idx_layer])
        idx_chn_input = np.argmax(grad_norm * grad_norm_mask)
        grad_norm_mask[idx_chn_input] = 0.0  # exclude from future selections
        tf.logging.info('adding channel #%d to the non-pruned set' % idx_chn_input)
        mask_delta = np.zeros(mask_shape)
        mask_delta[:, :, idx_chn_input, :] = 1.0
        if is_first_entry:  # initialize the mask on the first entry only
          is_first_entry = False
          self.sess_train.run(self.mask_init_ops[idx_layer])
        self.sess_train.run(self.mask_updt_ops[idx_layer],
                            feed_dict={self.mask_deltas[idx_layer]: mask_delta})
        self.sess_train.run(self.prune_ops[idx_layer])

        # fine-tune the current layer
        for idx_iter in range(nb_iters_layer):
          self.sess_train.run(self.layer_train_ops[idx_layer])

        # re-compute the pruning ratio
        mask_vec = np.sum(self.sess_train.run(self.masks[idx_layer]), axis=(0, 1, 3))
        prune_ratio = 1.0 - float(np.count_nonzero(mask_vec)) / mask_vec.size
        tf.logging.info('layer #%d: prune_ratio = %.4f' % (idx_layer, prune_ratio))

    # compute overall pruning ratios
    if self.is_primary_worker('global'):
      log_rslt = self.sess_train.run(self.log_op)
      log_str = ' | '.join(['%s = %.4e' % (name, value)
                            for name, value in zip(self.log_op_names, log_rslt)])
      tf.logging.info('block #%d: %s' % (idx_block + 1, log_str))
def __choose_channels(self):  # pylint: disable=too-many-locals
  """Choose channels for all convolutional layers.

  Per-layer pruning ratios come either from a uniform value or from a comma-separated
  file. Each layer is pruned via stochastic proximal gradient descent on a
  sparsity-inducing regularizer, then fine-tuned with the selected channels only.
  """
  # obtain each layer's pruning ratio
  if FLAGS.cpg_prune_ratio_type == 'uniform':
    ratio_list = [FLAGS.cpg_prune_ratio] * self.nb_layers
    if FLAGS.cpg_skip_ht_layers:  # skip the head & tail layers
      ratio_list[0] = 0.0
      ratio_list[-1] = 0.0
  elif FLAGS.cpg_prune_ratio_type == 'list':
    # one comma-separated line of per-layer ratios
    with open(FLAGS.cpg_prune_ratio_file, 'r') as i_file:
      i_line = i_file.readline().strip()
      ratio_list = [float(sub_str) for sub_str in i_line.split(',')]
  else:
    raise ValueError('unrecognized pruning ratio type: ' + FLAGS.cpg_prune_ratio_type)

  # select channels for all convolutional layers;
  # per-worker iteration count is scaled so global work is worker-invariant
  nb_workers = mgw.size() if FLAGS.enbl_multi_gpu else 1
  nb_iters_layer = int(FLAGS.cpg_nb_iters_layer / nb_workers)
  for idx_layer in range(self.nb_layers):
    # skip if no pruning is required
    if ratio_list[idx_layer] == 0.0:
      continue
    if self.is_primary_worker('global'):
      tf.logging.info('layer #%d: pr = %.2f (target)' % (idx_layer, ratio_list[idx_layer]))
      tf.logging.info('mask.shape = {}'.format(self.masks[idx_layer].shape))

    # select channels for the current convolutional layer
    time_prev = timer()
    reg_loss_prev = 0.0
    lrn_rate_pgd = FLAGS.cpg_lrn_rate_pgd_init
    for idx_iter in range(nb_iters_layer):
      # take a stochastic proximal gradient descent step; the pruning percentile
      # ramps linearly up to the layer's target ratio
      prune_perctl = ratio_list[idx_layer] * 100.0 * (idx_iter + 1) / nb_iters_layer
      __, reg_loss = self.sess_train.run(
          [self.layer_ops[idx_layer]['prune'], self.reg_losses[idx_layer]],
          feed_dict={self.lrn_rates_pgd[idx_layer]: lrn_rate_pgd,
                     self.prune_perctls[idx_layer]: prune_perctl})
      mask = self.sess_train.run(self.masks[idx_layer])
      if self.is_primary_worker('global'):
        nb_chns_nnz = np.count_nonzero(np.sum(mask, axis=(0, 1, 3)))
        tf.logging.info(
            'iter %d: nnz-chns = %d | loss = %.2e | lr = %.2e | percentile = %.2f'
            % (idx_iter + 1, nb_chns_nnz, reg_loss, lrn_rate_pgd, prune_perctl))

      # adjust the learning rate: grow while the loss decreases, shrink otherwise
      if reg_loss < reg_loss_prev:
        lrn_rate_pgd *= FLAGS.cpg_lrn_rate_pgd_incr
      else:
        lrn_rate_pgd *= FLAGS.cpg_lrn_rate_pgd_decr
      reg_loss_prev = reg_loss

    # fine-tune with selected channels only
    self.sess_train.run(self.mask_updt_ops[idx_layer])
    for idx_iter in range(nb_iters_layer):
      __, reg_loss = self.sess_train.run(
          [self.layer_ops[idx_layer]['finetune'], self.reg_losses[idx_layer]])
      mask = self.sess_train.run(self.masks[idx_layer])
      if self.is_primary_worker('global'):
        nb_chns_nnz = np.count_nonzero(np.sum(mask, axis=(0, 1, 3)))
        tf.logging.info('iter %d: nnz-chns = %d | loss = %.2e'
                        % (idx_iter + 1, nb_chns_nnz, reg_loss))

    # re-compute the pruning ratio
    mask_vec = np.mean(np.square(self.sess_train.run(self.masks[idx_layer])), axis=(0, 1, 3))
    prune_ratio = 1.0 - float(np.count_nonzero(mask_vec)) / mask_vec.size
    if self.is_primary_worker('global'):
      tf.logging.info('layer #%d: pr = %.2f (actual) | time = %.2f'
                      % (idx_layer, prune_ratio, timer() - time_prev))

  # compute overall pruning ratios
  # NOTE(review): log_str is assembled but never logged here — possibly truncated
  # or intentionally unused; confirm against the original file
  if self.is_primary_worker('global'):
    log_rslt = self.sess_train.run(self.log_op)
    log_str = ' | '.join(['%s = %.4e' % (name, value)
                          for name, value in zip(self.log_op_names, log_rslt)])
def __choose_channels(self):  # pylint: disable=too-many-locals
  """Choose channels for all convolutional layers.

  For each layer, sample (input, output) pairs from the full & pruned networks over
  several mini-batches, then select input channels by solving a sparsity-constrained
  regression problem and write the resulting kernel back into the pruned graph.
  """
  # obtain each layer's pruning ratio
  prune_ratios = [FLAGS.cpr_prune_ratio] * self.nb_conv_layers
  if FLAGS.cpr_skip_frst_layer:
    prune_ratios[0] = 0.0
  if FLAGS.cpr_skip_last_layer:
    prune_ratios[-1] = 0.0

  # select channels for all the convolutional layers
  nb_workers = mgw.size() if FLAGS.enbl_multi_gpu else 1  # NOTE(review): appears unused here
  for idx_layer, (prune_ratio, conv_info) in enumerate(
      zip(prune_ratios, self.conv_info_list)):
    # skip if no pruning is required
    if prune_ratio == 0.0:
      continue
    if self.is_primary_worker('global'):
      tf.logging.info('layer #%d: pr = %.2f (target)' % (idx_layer, prune_ratio))
      tf.logging.info('kernel shape = {}'.format(conv_info['conv_krnl_prnd'].shape))

    # extract the current layer's information
    conv_krnl_full = self.sess_train.run(conv_info['conv_krnl_full'])
    conv_krnl_prnd = self.sess_train.run(conv_info['conv_krnl_prnd'])
    conv_krnl_prnd_ph = conv_info['conv_krnl_prnd_ph']
    update_op = conv_info['update_op']
    input_full_tf = conv_info['input_full']
    input_prnd_tf = conv_info['input_prnd']
    output_full_tf = conv_info['output_full']
    output_prnd_tf = conv_info['output_prnd']
    strides = conv_info['strides']
    padding = conv_info['padding']
    # kernel axis 2 is assumed to be the input-channel axis — TODO confirm
    nb_chns_input = conv_krnl_prnd.shape[2]

    # sample inputs & outputs through multiple mini-batches
    nb_iters_smpl = int(math.ceil(float(FLAGS.cpr_nb_smpl_insts) / FLAGS.batch_size))
    inputs_list = [[] for __ in range(nb_chns_input)]  # one list per input channel
    outputs_list = []
    for idx_iter in range(nb_iters_smpl):
      inputs_full, inputs_prnd, outputs_full, outputs_prnd = \
        self.sess_train.run([input_full_tf, input_prnd_tf, output_full_tf, output_prnd_tf])
      inputs_smpl, outputs_smpl = self.__smpl_inputs_n_outputs(
        conv_krnl_full, conv_krnl_prnd, inputs_full, inputs_prnd,
        outputs_full, outputs_prnd, strides, padding)
      for idx_chn_input in range(nb_chns_input):
        inputs_list[idx_chn_input] += [inputs_smpl[idx_chn_input]]
      outputs_list += [outputs_smpl]
    inputs_np_list = [np.vstack(x) for x in inputs_list]
    outputs_np = np.vstack(outputs_list)

    # choose channels via solving the sparsity-constrained regression problem
    conv_krnl_prnd = self.__solve_sparse_regression(
      inputs_np_list, outputs_np, conv_krnl_prnd, prune_ratio)
    self.sess_train.run(update_op, feed_dict={conv_krnl_prnd_ph: conv_krnl_prnd})

    # evaluate the channel pruned model (per layer, if enabled);
    # the barrier keeps non-primary workers in sync during evaluation
    if FLAGS.cpr_eval_per_layer:
      if self.is_primary_worker('global'):
        self.__save_model(is_train=True)
        self.evaluate()
      self.auto_barrier()

  # evaluate the final channel pruned model
  if not FLAGS.cpr_eval_per_layer:
    if self.is_primary_worker('global'):
      self.__save_model(is_train=True)
      self.evaluate()
    self.auto_barrier()
def __choose_channels(self):  # pylint: disable=too-many-locals
  """Choose channels for all convolutional layers.

  For each (non-skipped) layer: sample (input, output) pairs until the regression
  instance budget is met, sub-sample to exactly FLAGS.cpr_nb_insts_reg instances,
  then select input channels via a sparsity-constrained regression and write the
  resulting kernel back into the pruned graph.
  """
  # obtain each layer's pruning ratio
  prune_ratios = [FLAGS.cpr_prune_ratio] * self.nb_conv_layers
  if FLAGS.cpr_skip_frst_layer:
    prune_ratios[0] = 0.0
  if FLAGS.cpr_skip_last_layer:
    prune_ratios[-1] = 0.0

  # evaluate the model before channel pruning
  # NOTE(review): the `False and` guard deliberately disables this pre-pruning
  # evaluation (only the log line and barrier run) — confirm whether this is a
  # leftover debugging switch
  tf.logging.info('evaluating the model before channel pruning')
  if False and self.is_primary_worker('global'):
    self.__save_model(is_train=True)
    self.evaluate()
  self.auto_barrier()

  # select channels for all the convolutional layers
  nb_workers = mgw.size() if FLAGS.enbl_multi_gpu else 1  # NOTE(review): appears unused here
  skip_names = FLAGS.cpr_skip_op_names.split(
    ',') if FLAGS.cpr_skip_op_names is not None else []
  for idx_layer, (prune_ratio, conv_info) in enumerate(
      zip(prune_ratios, self.conv_info_list)):
    # skip certain layers if no pruning is required (matched by op-name substring)
    enbl_skip = False
    for skip_name in skip_names:
      if skip_name in conv_info['conv_krnl_prnd'].name:
        enbl_skip = True
        break
    if enbl_skip:
      tf.logging.info('skip %s since no pruning is required'
                      % conv_info['conv_krnl_prnd'].name)
      continue

    # display the layer information
    if self.is_primary_worker('global'):
      tf.logging.info('layer #%d: pr = %.2f (target)' % (idx_layer, prune_ratio))
      tf.logging.info('kernel name = {}'.format(conv_info['conv_krnl_prnd'].name))
      tf.logging.info('kernel shape = {}'.format(conv_info['conv_krnl_prnd'].shape))

    # extract the current layer's information
    conv_krnl_full = self.sess_train.run(conv_info['conv_krnl_full'])
    conv_krnl_prnd = self.sess_train.run(conv_info['conv_krnl_prnd'])
    conv_krnl_prnd_ph = conv_info['conv_krnl_prnd_ph']
    update_op = conv_info['update_op']
    input_full_tf = conv_info['input_full']
    input_prnd_tf = conv_info['input_prnd']
    output_full_tf = conv_info['output_full']
    output_prnd_tf = conv_info['output_prnd']
    strides = conv_info['strides']
    padding = conv_info['padding']
    # kernel axis 2 is assumed to be the input-channel axis — TODO confirm
    nb_chns_input = conv_krnl_prnd.shape[2]

    # sample inputs & outputs through multiple mini-batches
    tf.logging.info('sampling inputs & outputs through multiple mini-batches')
    time_beg = timer()
    nb_insts = 0  # number of sampled instances (for regression) collected so far
    inputs_list = [[] for __ in range(nb_chns_input)]  # one list per input channel
    outputs_list = []
    while nb_insts < FLAGS.cpr_nb_insts_reg:
      inputs_full, inputs_prnd, outputs_full, outputs_prnd = \
        self.sess_train.run([input_full_tf, input_prnd_tf, output_full_tf, output_prnd_tf])
      inputs_smpl, outputs_smpl = self.__smpl_inputs_n_outputs(
        conv_krnl_full, conv_krnl_prnd, inputs_full, inputs_prnd,
        outputs_full, outputs_prnd, strides, padding)
      nb_insts += outputs_smpl.shape[0]
      for idx_chn_input in range(nb_chns_input):
        inputs_list[idx_chn_input] += [inputs_smpl[idx_chn_input]]
      outputs_list += [outputs_smpl]
      tf.logging.info('sampled inputs & outputs (%d / %d)'
                      % (nb_insts, FLAGS.cpr_nb_insts_reg))
    # sub-sample (without replacement) down to the exact instance budget
    idxs_inst = np.random.choice(nb_insts, size=(FLAGS.cpr_nb_insts_reg), replace=False)
    inputs_np_list = [np.vstack(x)[idxs_inst] for x in inputs_list]
    outputs_np = np.vstack(outputs_list)[idxs_inst]
    tf.logging.info('time elapsed (sampling): %.4f (s)' % (timer() - time_beg))

    # choose channels via solving the sparsity-constrained regression problem
    tf.logging.info(
      'choosing channels via solving the sparsity-constrained regression problem')
    time_beg = timer()
    conv_krnl_prnd = self.__solve_sparse_regression(
      inputs_np_list, outputs_np, conv_krnl_prnd, prune_ratio)
    self.sess_train.run(update_op, feed_dict={conv_krnl_prnd_ph: conv_krnl_prnd})
    tf.logging.info('time elapsed (selection): %.4f (s)' % (timer() - time_beg))

    # evaluate the channel pruned model (per layer, if enabled);
    # the barrier keeps non-primary workers in sync during evaluation
    tf.logging.info('evaluating the channel pruned model')
    if FLAGS.cpr_eval_per_layer:
      if self.is_primary_worker('global'):
        self.__save_model(is_train=True)
        self.evaluate()
      self.auto_barrier()