# Module-level imports inferred from usage below; repo-local helpers
# (get_weight_placeholders, get_mask_placeholders, reshape_2D_4D, utils_nn,
# FLAGS, imagesc, hist_plot, ...) are assumed to be defined or imported
# elsewhere in the repository.
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from numpy.linalg import norm
from sklearn.cluster import AffinityPropagation


def reg_params_init(sess, hps):
    '''Initializes the regularization parameters.

    Args:
        sess: the predefined computation graph.
        hps: hyperparameters collection.

    Returns:
        layer_owl_params: a list; each element is an array containing the
            regularization weights of the corresponding layer.
        hps: the (unmodified) hyperparameters collection.
    '''
    weight_placeholder = get_weight_placeholders()
    reg_applied_layers = hps.reg_applied_layers
    layer_owl_params = []
    for idx, triple in enumerate(weight_placeholder):
        print('layer {}'.format(idx))

        # If the layer is not regularized, append an empty list.
        if not reg_applied_layers[idx]:
            layer_owl_params.append([])
            continue

        # Regularization parameters
        reg_params = hps.reg_params
        lambda_1 = np.float32(reg_params[idx][0])
        lambda_2 = np.float32(reg_params[idx][1])
        if lambda_1 < 0 or lambda_2 < 0:
            raise ValueError('regularization parameters must be non-negative')

        # GrOWL weights are applied to the rows of the (reshaped) weight matrix.
        param_i, placeholder_i, assign_op_i = triple
        param_shape = sess.run(tf.shape(param_i))
        num_dims = len(param_i.get_shape().as_list())
        if num_dims == 2:
            row_num = param_shape[0]
        elif num_dims == 4:
            # For a conv kernel [height, width, in_ch, out_ch], the rows of
            # the reshaped matrix index the input channels.
            row_num = param_shape[2]

        # Piecewise linear decay (PLD): weights decrease linearly from
        # transition_ind - 1 down to 0, then stay at 0 for the remaining rows.
        transition_ind = int(np.floor(row_num * FLAGS.PLD_transition))
        param_index = np.linspace(start=transition_ind - 1, stop=0,
                                  num=transition_ind)
        print('  row num: {}, transition_ind: {}, largest reg: {}'.format(
            row_num, transition_ind, lambda_1 + lambda_2 * transition_ind))
        if row_num > transition_ind:
            param_index = np.append(
                param_index, np.zeros([1, int(row_num - transition_ind)]))

        layer_owl_params.append(lambda_1 + lambda_2 * param_index)

    print('length of weight_placeholder: {}'.format(len(weight_placeholder)))
    assert len(layer_owl_params) == len(weight_placeholder)
    assert len(layer_owl_params) == len(hps.reg_applied_layers)

    return layer_owl_params, hps
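
# A small worked example of the PLD weight sequence built above. The helper
# and its values are illustrative only (not part of the repository); it just
# replays the arithmetic of reg_params_init for one layer.
def _pld_weights_example():
    """Illustrative: row_num=6, PLD_transition=0.5, lambda_1=0.1, lambda_2=0.01."""
    row_num, transition, lambda_1, lambda_2 = 6, 0.5, 0.1, 0.01
    transition_ind = int(np.floor(row_num * transition))              # 3
    param_index = np.linspace(transition_ind - 1, 0, transition_ind)  # [2, 1, 0]
    param_index = np.append(param_index, np.zeros(row_num - transition_ind))
    # [0.12, 0.11, 0.10, 0.10, 0.10, 0.10]: the largest row norms receive the
    # largest penalties; the flat tail behaves like group lasso with lambda_1.
    return lambda_1 + lambda_2 * param_index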
def apply_reg_prox(sess, learning_rate_val, layer_reg_params, hps):
    '''Updates the weight parameters of each layer via the proximal operator.

    Args:
        sess: the computation graph.
        learning_rate_val: the predefined learning rate.
        layer_reg_params: owl parameters, initially created by reg_params_init.
        hps: hyperparameters collection.

    Returns:
        None
    '''
    # get weights of the network
    weight_placeholders = get_weight_placeholders()

    # prox_lr_val = min(learning_rate_val, 0.001)
    prox_lr_val = learning_rate_val
    for idx, triple in enumerate(weight_placeholders):
        # Don't apply owl/growl if told not to.
        if not hps.reg_applied_layers[idx]:
            continue

        param_i, placeholder_i, assign_op_i = triple
        param_val = sess.run(param_i)
        dim_i = np.size(param_val.shape)
        if dim_i == 2:
            if FLAGS.use_growl:
                prox_param_val = apply_growl(
                    param_val, prox_lr_val * layer_reg_params[idx])
            else:
                prox_param_val = apply_group_lasso(
                    param_val, prox_lr_val * layer_reg_params[idx])
        elif dim_i == 4:
            # For a convolutional layer, first reshape the 4D tensor to a
            # 2D matrix.
            reduced_param_val = reshape_2D_4D(param_val, target_shape=None,
                                              reshape_type=2, reshape_order='F')
            if FLAGS.use_growl:
                reduced_prox_param_val = apply_growl(
                    reduced_param_val, prox_lr_val * layer_reg_params[idx])
            else:
                reduced_prox_param_val = apply_group_lasso(
                    reduced_param_val, prox_lr_val * layer_reg_params[idx])
            # Now reshape the 2D matrix back to a 4D tensor.
            prox_param_val = reshape_2D_4D(reduced_prox_param_val,
                                           target_shape=param_val.shape,
                                           reshape_type=1, reshape_order='F')

        # Assign the new weights to param_i using assign_op_i.
        sess.run(assign_op_i, feed_dict={placeholder_i: prox_param_val})
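
# The proximal operators called above are not defined in this section. The
# sketches below (suffixed _sketch to mark them as hypothetical) give
# plausible NumPy implementations for reference only: the group-lasso prox
# shrinks each row norm by its weight, and the GrOWL prox (following the
# OWL/SLOPE prox) pairs the sorted weights with the sorted row norms via a
# pool-adjacent-violators projection before rescaling the rows.
def apply_group_lasso_sketch(W, reg):
    # reg: scalar or per-row weight; rows with norm <= reg are zeroed out.
    norms = np.linalg.norm(W, axis=1)
    scale = np.maximum(norms - reg, 0.0) / np.maximum(
        norms, np.finfo(np.float32).eps)
    return W * scale[:, None]


def _pav_nonincreasing(y):
    # Project y onto the set of nonincreasing sequences
    # (stack-based pool-adjacent-violators).
    vals, counts = [], []
    for v in y:
        vals.append(float(v))
        counts.append(1)
        while len(vals) > 1 and vals[-2] < vals[-1]:
            v2, c2 = vals.pop(), counts.pop()
            v1, c1 = vals.pop(), counts.pop()
            vals.append((v1 * c1 + v2 * c2) / (c1 + c2))
            counts.append(c1 + c2)
    return np.repeat(vals, counts)


def apply_growl_sketch(W, weights):
    # weights: nonincreasing per-row sequence, as built by reg_params_init.
    norms = np.linalg.norm(W, axis=1)
    order = np.argsort(norms)[::-1]        # sort row norms, largest first
    shrunk = np.maximum(_pav_nonincreasing(norms[order] - weights), 0.0)
    new_norms = np.empty_like(norms)
    new_norms[order] = shrunk              # undo the sort
    scale = new_norms / np.maximum(norms, np.finfo(np.float32).eps)
    return W * scale[:, None]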
def apply_param_share(sess, group_info, hps):
    """Parameter sharing for the retraining phase.

    Args:
        sess: the computation graph.
        group_info: the group information; a list of tuples, each containing
            the indices of the rows that belong to the same group.
        hps: hyperparameters collection.
    """
    weight_placeholders = get_weight_placeholders()

    # Track the index of the regularized layer, for retrieving the group info.
    idx_true_layer = 0
    for idx, triple in enumerate(weight_placeholders):
        # Don't apply parameter sharing to layers without regularization.
        if not hps.reg_applied_layers[idx]:
            continue

        # Only apply parameter sharing to layers that form cluster patterns.
        if not hps.param_shared_layers[idx]:
            # Update group index.
            idx_true_layer = idx_true_layer + 1
            continue

        param_i, param_placeholder_i, param_assign_op_i = triple
        dim_i = param_i.get_shape().as_list()
        param_val = sess.run(param_i)

        if np.size(dim_i) == 4:
            # Reshape the 4D tensor to a 2D matrix.
            param_val_reshaped = reshape_2D_4D(param_val, target_shape=None,
                                               reshape_type=2,
                                               reshape_order='F')
            # Retrain with parameter sharing.
            param_val_reshaped_shared = group_averaging(
                param_val_reshaped, group_info[idx_true_layer])
            # Back to a 4D tensor.
            param_val_shared = reshape_2D_4D(param_val_reshaped_shared,
                                             target_shape=tuple(dim_i),
                                             reshape_type=1,
                                             reshape_order='F')
        elif np.size(dim_i) == 2:
            param_val_shared = group_averaging(param_val,
                                               group_info[idx_true_layer])

        # Update parameter.
        sess.run(param_assign_op_i,
                 feed_dict={param_placeholder_i: param_val_shared})

        # Update group index.
        idx_true_layer = idx_true_layer + 1
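
# Neither group_averaging nor reshape_2D_4D is defined in this section. The
# sketches below (suffixed _sketch to mark them as hypothetical) show the
# behavior the calls above rely on: group_averaging ties the rows of each
# cluster to their common mean, and reshape_2D_4D maps a conv kernel
# [height, width, in_ch, out_ch] to a 2D matrix with one row per input
# channel (reshape_type=2) and back (reshape_type=1), consistent with
# row_num = param_shape[2] in reg_params_init.
def group_averaging_sketch(W, groups):
    # groups: list of tuples of row indices; rows in a group share one value.
    W = W.copy()
    for rows in groups:
        idx = list(rows)
        W[idx, :] = W[idx, :].mean(axis=0)
    return W


def reshape_2D_4D_sketch(param, target_shape, reshape_type, reshape_order='F'):
    if reshape_type == 2:
        # 4D -> 2D: one row per input channel.
        h, w, c_in, c_out = param.shape
        return np.moveaxis(param, 2, 0).reshape((c_in, -1),
                                                order=reshape_order)
    # 2D -> 4D: invert the flattening above.
    h, w, c_in, c_out = target_shape
    return np.moveaxis(
        param.reshape((c_in, h, w, c_out), order=reshape_order), 0, 2)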
def reg_params_init(sess, config):
    '''Initializes the regularization parameters.

    Args:
        sess: the predefined computation graph.
        config: the yaml configuration file.

    Returns:
        layer_owl_params: a list; each element is an array containing the
            regularization weights of the corresponding layer.
    '''
    weight_placeholder = utils_nn.get_weight_placeholders()
    layer_owl_params = []

    min_num_row = float('Inf')
    if config['PLD_transition'] == 0:
        # Read out the minimum number of rows across all layers.
        for idx, triple in enumerate(weight_placeholder):
            param_i, placeholder_i, assign_op_i = triple
            param_shape = sess.run(tf.shape(param_i))
            if param_shape[0] < min_num_row:
                min_num_row = param_shape[0]

    # Iterate through all layers; idx is the layer number.
    for idx, triple in enumerate(weight_placeholder):
        param_i, placeholder_i, assign_op_i = triple

        # OWL weights are applied to the rows of the weight matrix.
        param_shape = sess.run(tf.shape(param_i))
        reg_params = config['growl_params']
        lambda_1 = np.float32(reg_params[idx][0])
        lambda_2 = np.float32(reg_params[idx][1])
        if lambda_1 < 0 or lambda_2 < 0:
            raise ValueError('regularization parameters must be non-negative')

        # Get row_num.
        row_num = int(param_shape[0])
        if config['reg_params_type'] == 'PLD':
            if config['PLD_transition'] != 0:
                transition_ind = int(
                    np.floor(param_shape[0] * config['PLD_transition'])) - 1
            else:
                transition_ind = int(min_num_row)
            param_index = np.linspace(start=row_num - 1, stop=0,
                                      num=transition_ind)
            param_index = np.append(
                param_index,
                np.zeros([1, int(param_shape[0] - transition_ind)]))

        layer_owl_params.append(lambda_1 + lambda_2 * param_index)

    assert len(layer_owl_params) == len(weight_placeholder)

    return layer_owl_params
def apply_owl_prox(sess, learning_rate, layer_reg_params, config):
    '''Updates the weight parameters of each layer via the proximal operator.

    Args:
        sess: the computation graph.
        learning_rate: the predefined learning rate (a graph tensor).
        layer_reg_params: owl parameters, initially created by reg_params_init.
        config: yaml configuration file.

    Returns:
        None
    '''
    # get weights of the network
    weight_placeholders = utils_nn.get_weight_placeholders()
    learning_rate_val = sess.run(learning_rate)

    for idx, triple in enumerate(weight_placeholders):
        # Don't apply owl/growl if told not to.
        if not config['owl_applied_layers'][idx]:
            continue

        param_i, placeholder_i, assign_op_i = triple
        param_val = sess.run(param_i)

        if config['use_growl']:
            prox_param_val = apply_growl(
                param_val, learning_rate_val * layer_reg_params[idx])
        else:
            prox_param_val = apply_group_lasso(
                param_val, learning_rate_val * layer_reg_params[idx])

        # Assign the new weights to param_i using assign_op_i;
        # refer to utils_nn.py for details of assign_op_i.
        sess.run(assign_op_i, feed_dict={placeholder_i: prox_param_val})
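
# apply_owl_prox is designed to interleave with ordinary SGD steps: each
# iteration first takes a gradient step on the data loss, then applies the
# proximal operator of the (Gr)OWL penalty at the same learning rate
# (proximal gradient descent). A hedged sketch of that loop; train_op,
# next_batch, and num_steps are hypothetical placeholders, not this
# repository's API:
#
#   for step in range(num_steps):
#       sess.run(train_op, feed_dict=next_batch())    # gradient step
#       apply_owl_prox(sess, learning_rate,           # proximal step
#                      layer_reg_params, config)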
def measure_compression(sess, res_dict, step, training, hps,
                        num_cluster_arr=None):
    '''Monitors the compression ratio.'''
    if num_cluster_arr is None:
        num_cluster_arr = []
    mask_placeholders = get_mask_placeholders()
    weight_placeholders = get_weight_placeholders()

    num_nonzero_row_arr = []
    num_total_row_arr = []
    num_row_size_arr = []
    num_nonzero_params = 0
    num_unique_params = 0
    num_total_params = 0

    for idx, mask_triple in enumerate(mask_placeholders):
        mask_i, mask_placeholder_i, mask_assign_op_i = mask_triple
        param_i, param_placeholder_i, param_assign_op_i = weight_placeholders[idx]
        dim_i = param_i.get_shape().as_list()
        param_val = sess.run(param_i)
        mask = sess.run(mask_i)
        param_val_masked = param_val * mask

        if np.size(dim_i) == 4:
            param_val_masked_reshaped = reshape_2D_4D(param_val_masked,
                                                      target_shape=None,
                                                      reshape_type=2,
                                                      reshape_order='F')
            row_norm = norm(param_val_masked_reshaped, axis=1)
            num_nonzero_params += np.count_nonzero(row_norm) * np.shape(
                param_val_masked_reshaped)[1]
            num_unique_params += np.size(np.unique(param_val_masked_reshaped))
            num_total_params += np.prod(dim_i)
            num_nonzero_row_arr.append(np.count_nonzero(row_norm))
            num_total_row_arr.append(np.size(row_norm))
            num_row_size_arr.append(np.shape(param_val_masked_reshaped)[1])
        elif np.size(dim_i) == 2:
            row_norm = norm(param_val_masked, axis=1)
            num_nonzero_params += np.count_nonzero(row_norm) * dim_i[1]
            num_unique_params += np.size(np.unique(param_val_masked))
            num_total_params += np.prod(dim_i)
            num_nonzero_row_arr.append(np.count_nonzero(row_norm))
            num_total_row_arr.append(np.size(row_norm))
            num_row_size_arr.append(np.shape(param_val_masked)[1])

        # num_cluster_arr only contains cluster information for regularized
        # layers, so first fill in the number of rows for unregularized layers.
        if (not hps.reg_applied_layers[idx]) and (not training):
            num_cluster_arr = np.insert(num_cluster_arr, idx, np.size(row_norm))

    # Per-layer share of the parameter count: num_param_i / num_total_params.
    weight_ratio_list = np.divide(
        np.multiply(num_total_row_arr, num_row_size_arr),
        float(num_total_params))

    # Per-layer nonzero ratio: num_nonzero_row_i / num_total_row_i.
    num_total_row_arr = np.asarray(num_total_row_arr, dtype=np.float32)
    num_nonzero_row_arr = np.asarray(num_nonzero_row_arr, dtype=np.float32)
    nonzero_ratio_list = np.divide(num_nonzero_row_arr, num_total_row_arr)

    # A zero row in layer i also zeroes the matching input column of layer
    # i + 1, so consecutive nonzero ratios are multiplied.
    compression_ratio_arr = np.append(
        np.multiply(nonzero_ratio_list[0:-1], nonzero_ratio_list[1:]),
        nonzero_ratio_list[-1])
    if not training:
        # After clustering, rows within a cluster share their parameters.
        compression_ratio_arr = np.multiply(
            compression_ratio_arr,
            np.divide(num_cluster_arr, num_nonzero_row_arr))
    compression_ratio = np.inner(compression_ratio_arr, weight_ratio_list)

    print('nonzero_ratio_list: {}'.format(nonzero_ratio_list))
    print('weight_ratio_list: {}'.format(weight_ratio_list))
    print('num_nonzero_row_arr: {}'.format(num_nonzero_row_arr))
    print('num_total_row_arr: {}'.format(num_total_row_arr))
    print('num_row_size_arr: {}'.format(num_row_size_arr))
    print('At step {}, total compression ratio is: {:.4f}%'.format(
        step, compression_ratio * 100))

    res_dict['compression_ratio_arr'].append(compression_ratio)
    np.save(FLAGS.res_dir + 'res_dict.npy', res_dict)

    return compression_ratio
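
# A small numeric illustration of the ratio computed above (made-up numbers).
# Suppose two layers with 4 and 8 rows, row sizes 8 and 4 (32 params each,
# 64 total), and 2 / 4 nonzero rows respectively:
#   weight_ratio_list  = [32/64, 32/64]         = [0.5, 0.5]
#   nonzero_ratio_list = [2/4, 4/8]             = [0.5, 0.5]
#   compression_arr    = [0.5 * 0.5, 0.5]       = [0.25, 0.5]
#   compression_ratio  = 0.25 * 0.5 + 0.5 * 0.5 = 0.375
# i.e. pruning a row of layer i also removes the corresponding input column
# of layer i + 1, which is why consecutive ratios are multiplied.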
def update_mask(sess, threshold, hps, res_dict, step):
    '''Updates the mask during training to keep pruned weights from drifting
    away from zero.

    Args:
        sess: the computation graph.
        threshold: the pruning threshold; this helps avoid floating-point
            errors during the masking process.
        hps: hyperparameters collection.
        res_dict: results dictionary.
        step: current step.

    Returns:
        num_zero_layers: number of zero-valued layers.
        layer_ID: indices of the zero-valued layers.
    '''
    mask_placeholders = get_mask_placeholders()
    weight_placeholders = get_weight_placeholders()

    # Count the zero-valued layers in order to catch degenerate results.
    num_zero_layers = 0
    layer_ID = []
    assert len(mask_placeholders) == len(weight_placeholders)

    for idx, mask_triple in enumerate(mask_placeholders):
        # Don't apply owl/growl if told not to.
        if not hps.reg_applied_layers[idx]:
            continue

        mask_i, mask_placeholder_i, mask_assign_op_i = mask_triple
        param_i, param_placeholder_i, param_assign_op_i = weight_placeholders[idx]
        dim_i = param_i.get_shape().as_list()

        # Reset the masked weights to zero if they drifted.
        param_val = sess.run(param_i)
        mask = sess.run(mask_i)
        param_val_masked = param_val * mask

        # For a convolutional layer, compute the reshaped matrix first.
        if np.size(dim_i) == 4:
            param_val_masked_reshaped = reshape_2D_4D(param_val_masked,
                                                      target_shape=None,
                                                      reshape_type=2,
                                                      reshape_order='F')
            mask_reshaped = reshape_2D_4D(mask, target_shape=None,
                                          reshape_type=2, reshape_order='F')

            # Prune parameters and update the mask.
            row_norm = norm(param_val_masked_reshaped, axis=1)
            row_size = param_val_masked_reshaped.shape[1]
            print('layer:{}, largest row norm: {:.6f}, median row norm: '
                  '{:.6f}, min row norm: {:.6f}'.format(
                      idx, np.max(row_norm), np.median(row_norm),
                      np.min(row_norm)))
            zero_row_idx = np.where(row_norm <= threshold)
            print('  masked neurons: {}; total neurons: {}'.format(
                np.size(zero_row_idx), np.size(row_norm)))
            param_val_masked_reshaped[zero_row_idx[0], :] = 0
            mask_reshaped[zero_row_idx[0], :] = 0

            # Back to 4D.
            param_val_masked = reshape_2D_4D(param_val_masked_reshaped,
                                             target_shape=tuple(dim_i),
                                             reshape_type=1,
                                             reshape_order='F')
            mask = reshape_2D_4D(mask_reshaped, target_shape=tuple(dim_i),
                                 reshape_type=1, reshape_order='F')
        elif np.size(dim_i) == 2:
            row_norm = norm(param_val_masked, axis=1)
            row_size = param_val_masked.shape[1]
            print('layer:{}, largest row norm: {:.6f}, median row norm: '
                  '{:.6f}, min row norm: {:.6f}'.format(
                      idx, np.max(row_norm), np.median(row_norm),
                      np.min(row_norm)))
            zero_row_idx = np.where(row_norm <= threshold)
            print('  masked rows: {}; total rows: {}'.format(
                np.size(zero_row_idx), np.size(row_norm)))
            param_val_masked[zero_row_idx[0], :] = 0
            mask[zero_row_idx[0], :] = 0

        # Update the mask and weight matrix.
        sess.run(mask_assign_op_i, feed_dict={mask_placeholder_i: mask})
        sess.run(param_assign_op_i,
                 feed_dict={param_placeholder_i: param_val_masked})

        nonzero_rows = np.size(row_norm) - np.size(zero_row_idx[0])
        layer_nonzero_params = nonzero_rows * row_size
        print('  total:{0}, nonzeros:{1}'.format(np.size(param_val_masked),
                                                 layer_nonzero_params))

        # Record the zero-valued layers.
        if np.size(row_norm) - np.size(zero_row_idx[0]) <= 3:
            num_zero_layers += 1
            layer_ID += [idx]

    return num_zero_layers, layer_ID
def display_similarity(sess, step, hps, res_dict):
    '''Displays the pairwise similarity of the rows of the weight matrix.

    Args:
        sess: the computation graph.
        step: current step.
        hps: hyperparameters collection.
        res_dict: results dictionary.

    Returns:
        group_info: list of tuples containing the row indices of each group.
        num_clusters_arr: number of clusters for each layer.
    '''
    num_nonzero_rows_tuple, num_clusters_tuple = tuple(), tuple()
    mask_placeholders = get_mask_placeholders()
    weight_placeholders = get_weight_placeholders()
    assert len(mask_placeholders) == len(weight_placeholders)

    group_info = []
    num_clusters_arr = []
    threshold = np.finfo(np.float32).eps

    # Track the nonzero row indices.
    nonzero_row_index = []

    # Only process the layers grOWL is applied to.
    for idx, mask_triple in enumerate(mask_placeholders):
        # Double check that grOWL is turned on for this layer.
        if not hps.reg_applied_layers[idx]:
            continue

        mask_i, mask_placeholder_i, mask_assign_op_i = mask_placeholders[idx]
        param_i, param_placeholder_i, param_assign_op_i = weight_placeholders[idx]
        dim_i = param_i.get_shape().as_list()
        param_masked = tf.multiply(mask_i, param_i)

        if np.size(dim_i) == 2:
            param_masked_val = sess.run(param_masked)
            print('layer:{0}, param_masked_val.shape:{1}'.format(
                idx, param_masked_val.shape))
        elif np.size(dim_i) == 4:
            param_masked_4D = sess.run(param_masked)
            param_masked_val = reshape_2D_4D(param_masked_4D,
                                             target_shape=None,
                                             reshape_type=2,
                                             reshape_order='F')
            print('layer:{0}, param_masked_val.shape:{1}'.format(
                idx, param_masked_val.shape))

        # First retrieve the nonzero rows of the parameter matrix.
        row_norm = norm(param_masked_val, axis=1)
        row_norm[row_norm < threshold] = 0  # remove very small row norms
        num_nonzero_rows = np.count_nonzero(row_norm)
        nonzero_row_idx = np.flatnonzero(row_norm)
        nonzero_rows = param_masked_val[row_norm > 0, :]
        norm_nonzero_rows = norm(nonzero_rows, axis=1)

        # Compute the display similarity matrix without removing the
        # zero-valued rows.
        num_rows = row_norm.size
        display_similarity_val = np.zeros([num_rows, num_rows])
        display_similarity_val_partial = np.zeros(
            [nonzero_row_idx.size, nonzero_row_idx.size])
        zero_row_idx = np.where(row_norm < threshold)[0]
        for k in zero_row_idx:
            display_similarity_val[k, :] = 0
            display_similarity_val[:, k] = 0

        partial_idx = np.arange(nonzero_row_idx.size)
        for idx_newi, i in np.nditer([partial_idx, nonzero_row_idx]):
            for idx_newj, j in np.nditer([partial_idx, nonzero_row_idx]):
                # Normalize the inner product by the larger squared row norm.
                if row_norm[i] > row_norm[j]:
                    sim = np.dot(param_masked_val[i, :],
                                 param_masked_val[j, :]) / row_norm[i]**2
                else:
                    sim = np.dot(param_masked_val[i, :],
                                 param_masked_val[j, :]) / row_norm[j]**2
                display_similarity_val[i, j] = sim
                display_similarity_val_partial[idx_newi, idx_newj] = sim

        # Save the similarity matrix.
        np.save(FLAGS.res_dir + 'similarity_{}.npy'.format(int(step / 390)),
                display_similarity_val)

        # Cluster the parameter matrix with affinity propagation.
        if num_nonzero_rows > 1:
            preference_val = FLAGS.preference
            print('CLUSTERING ROWS WITH PREFERENCE VALUE:{}'.format(
                preference_val))
            af = AffinityPropagation(
                affinity='precomputed',
                preference=preference_val).fit(display_similarity_val_partial)
            cluster_centers_indices = af.cluster_centers_indices_
            num_clusters = np.size(cluster_centers_indices)
            with open(FLAGS.res_dir + 'cluster.txt', 'a') as f:
                print('  idx: {}, Nonzero rows: {}, Number of Clusters: {}'
                      .format(idx, num_nonzero_rows, num_clusters))
                f.write('  idx: {}, Nonzero rows: {}, Number of Clusters: {}\n'
                        .format(idx, num_nonzero_rows, num_clusters))
            num_clusters_arr.append(num_clusters)

            # Get the labels of the nonzero rows and build the tuple list
            # that stores the group information.
            labels = af.labels_
            group_info_i = [tuple()] * num_clusters
            for i in range(len(labels)):
                group_info_i[labels[i]] = group_info_i[labels[i]] + (
                    nonzero_row_idx[i], )
            group_info.append(group_info_i)

            # Put the clustering results in the tuples for return.
            num_nonzero_rows_tuple = num_nonzero_rows_tuple + (
                num_nonzero_rows, )
            num_clusters_tuple = num_clusters_tuple + (num_clusters, )
            nonzero_row_index.append(nonzero_row_idx)

    # Store the intermediate results.
    res_dict['num_cluster_arr'].append(num_clusters_tuple)
    res_dict['num_nonzero_row_arr'].append(num_nonzero_rows_tuple)
    np.save(FLAGS.res_dir + 'res_dict.npy', res_dict)
    np.save(FLAGS.res_dir + 'nonzero_index.npy', nonzero_row_index)
    np.save(FLAGS.train_dir + 'group_info_{}.npy'.format(int(step / 390)),
            group_info)

    return group_info, num_clusters_arr
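
# The nested nditer loops above implement the affinity
#   s(i, j) = <w_i, w_j> / max(||w_i||, ||w_j||)^2,
# which approaches 1 when two rows point the same way with similar norms.
# A hedged, vectorized equivalent for the nonzero rows (illustrative only;
# the repository keeps the loop form):
def similarity_matrix_sketch(rows):
    norms = np.linalg.norm(rows, axis=1)
    gram = rows @ rows.T
    max_sq_norm = np.maximum.outer(norms, norms) ** 2
    return gram / np.maximum(max_sq_norm, np.finfo(np.float32).eps)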
def display_similarity(sess, epoch, get_group, config):
    '''Displays the pairwise similarity of the rows of the weight matrix
    and plots the clustering results.

    Args:
        sess: the computation graph.
        epoch: current epoch.
        get_group: whether to extract and save the group information.
        config: the yaml configuration file.

    Returns:
        num_nonzero_rows_tuple: number of nonzero rows for each layer.
        num_clusters_tuple: number of clusters for each layer.
        group_info: list of tuples containing the row indices of each group.
    '''
    num_nonzero_rows_tuple, num_clusters_tuple = tuple(), tuple()
    mask_placeholders = utils_nn.get_mask_placeholders()
    weight_placeholders = utils_nn.get_weight_placeholders()
    num_layers = 0
    threshold = np.finfo(np.float32).eps
    assert len(mask_placeholders) == len(weight_placeholders)
    group_info = []

    # Only process the layers grOWL is applied to.
    for idx, mask_triple in enumerate(mask_placeholders):
        # Double check that grOWL is turned on for this layer.
        if not config['owl_applied_layers'][idx]:
            continue
        num_layers = num_layers + 1

        mask_i, mask_placeholders_i, mask_assign_op_i = mask_placeholders[idx]
        param_i, param_placeholder_i, param_assign_op_i = weight_placeholders[idx]
        dim_i = param_i.get_shape().as_list()

        # Reset the masked weights to zero if they drifted.
        param_masked = tf.multiply(mask_i, param_i)
        param_masked_val = sess.run(param_masked)

        if config['similarity'] == 'norm_euclidean':
            # First retrieve the nonzero rows of the parameter matrix.
            row_norm = norm(param_masked_val, axis=1)
            row_norm[row_norm < threshold] = 0  # remove very small row norms
            num_nonzero_rows = np.count_nonzero(row_norm)
            nonzero_row_idx = np.flatnonzero(row_norm)
            nonzero_rows = param_masked_val[row_norm > 0, :]
            norm_nonzero_rows = norm(nonzero_rows, axis=1)

            # Then compute the normalized Euclidean similarity matrix; take
            # the negative so that similar rows have a larger affinity value.
            similarity_val = np.zeros([num_nonzero_rows, num_nonzero_rows])
            for i in range(num_nonzero_rows):
                for j in range(num_nonzero_rows):
                    if norm_nonzero_rows[i] > norm_nonzero_rows[j]:
                        similarity_val[i, j] = -norm(
                            nonzero_rows[i, :] - nonzero_rows[j, :]
                        ) / norm_nonzero_rows[j]
                    else:
                        similarity_val[i, j] = -norm(
                            nonzero_rows[i, :] - nonzero_rows[j, :]
                        ) / norm_nonzero_rows[i]

        # Compute the display similarity matrix without removing the
        # zero-valued rows.
        num_rows = row_norm.size
        display_similarity_val = np.zeros([num_rows, num_rows])
        display_similarity_val_partial = np.zeros(
            [nonzero_row_idx.size, nonzero_row_idx.size])
        zero_row_idx = np.where(row_norm < threshold)[0]
        for k in zero_row_idx:
            display_similarity_val[k, :] = 0
            display_similarity_val[:, k] = 0

        partial_idx = np.arange(nonzero_row_idx.size)
        for idx_newi, i in np.nditer([partial_idx, nonzero_row_idx]):
            for idx_newj, j in np.nditer([partial_idx, nonzero_row_idx]):
                if row_norm[i] > row_norm[j]:
                    sim = np.dot(param_masked_val[i, :],
                                 param_masked_val[j, :]) / row_norm[i]**2
                else:
                    sim = np.dot(param_masked_val[i, :],
                                 param_masked_val[j, :]) / row_norm[j]**2
                display_similarity_val[i, j] = sim
                display_similarity_val_partial[idx_newi, idx_newj] = sim

        # Save the similarity values.
        if get_group:
            sim_name = config['plot_dir'] + \
                'epoch_{0}_similarity_{1}.npy'.format(epoch, idx + 1)
            np.save(sim_name, display_similarity_val)

        # Visualize the similarity matrix.
        num_rows = display_similarity_val.shape[0]
        row_idx = np.linspace(1, num_rows, num_rows)
        x, y = np.meshgrid(row_idx, row_idx)
        figName = config['plot_dir'] + \
            'epoch_{0}_similarity_{1}.png'.format(epoch, idx + 1)
        if config['similarity'] == 'norm_euclidean':
            imagesc(x, y, display_similarity_val, 'row_idx', 'row_idx',
                    'similarity matrix of layer {0}'.format(idx + 1), False,
                    figName)

        similarity_val_arr = np.reshape(similarity_val, [-1])
        display_similarity_val_arr = np.reshape(display_similarity_val, [-1])

        # Save the nonzero similarity values.
        nonzero_display_similarity_val_arr = display_similarity_val_arr[
            np.abs(display_similarity_val_arr) > 0]
        print('Layer {}, the median of the similarity_val is {}'.format(
            idx, np.median(nonzero_display_similarity_val_arr)))
        if get_group:
            np.savetxt(
                config['plot_dir'] + 'sim_vec_epoch{}.csv'.format(epoch),
                display_similarity_val_arr)
        plt.hist(nonzero_display_similarity_val_arr)
        plt.title('Similarity histogram at epoch {}'.format(epoch + 1))
        plt.savefig(config['plot_dir'] +
                    'layer{}_train_sim_{}.jpg'.format(idx, epoch))
        plt.close()

        # Cluster the parameter matrix with affinity propagation.
        if num_nonzero_rows > 0:
            if config['similarity'] == 'norm_euclidean':
                af = AffinityPropagation(
                    affinity='precomputed',
                    preference=config['preference']).fit(
                        display_similarity_val_partial)
            cluster_centers_indices = af.cluster_centers_indices_
            num_clusters = np.size(cluster_centers_indices)
            print('number of clusters {}'.format(num_clusters))

            # Get the labels of the nonzero rows and build the tuple list
            # that stores the group information; group_info_i covers the
            # i-th layer only.
            if get_group:
                labels = af.labels_
                group_info_i = [tuple()] * num_clusters
                for i in range(len(labels)):
                    group_info_i[labels[i]] = group_info_i[labels[i]] + (
                        nonzero_row_idx[i], )
                group_info.append(group_info_i)

            # Put the clustering results in the tuples for return.
            num_nonzero_rows_tuple = num_nonzero_rows_tuple + (
                num_nonzero_rows, )
            num_clusters_tuple = num_clusters_tuple + (num_clusters, )
            print('Nonzero rows: {}, Number of Clusters: {}'.format(
                num_nonzero_rows, num_clusters))
        else:
            num_nonzero_rows_tuple = num_nonzero_rows_tuple + (0, )
            num_clusters_tuple = num_clusters_tuple + (0, )
            print('Nonzero rows: {}, Number of Clusters: {}'.format(0, 0))

    return num_nonzero_rows_tuple, num_clusters_tuple, group_info
def update_mask(sess, epoch, learning_rate, threshold, phase, config,
                group_info=None, get_nonzero_idx_flag=False):
    '''Updates the mask.

    Args:
        sess: the computation graph.
        learning_rate: the predefined learning rate.
        threshold: the pruning threshold; this helps avoid floating-point
            errors during the masking process.
        phase: False for training, True for retraining. If True, enforce
            parameter sharing.
        config: the yaml configuration file.
        group_info: the group information; a list of tuples, each containing
            the indices of the rows that belong to the same group.

    Returns:
        compression_ratio: the ratio between nonzero (or unique) parameters
            and total parameters.
        num_zero_layers: number of zero-valued layers.
    '''
    mask_placeholders = utils_nn.get_mask_placeholders()
    weight_placeholders = utils_nn.get_weight_placeholders()

    num_total_params = 0
    num_nonzero_params = 0
    num_unique_params = 0
    compression_ratio = 1

    # Count the zero-valued layers in order to catch degenerate results.
    num_zero_layers = 0
    assert len(mask_placeholders) == len(weight_placeholders)

    # Track the index of the regularized layer, for retrieving the group info.
    idx_true_layer = 0
    for idx, mask_triple in enumerate(mask_placeholders):
        # Don't apply owl/growl if told not to.
        if not config['owl_applied_layers'][idx]:
            continue

        mask_i, mask_placeholder_i, mask_assign_op_i = mask_triple
        param_i, param_placeholder_i, param_assign_op_i = weight_placeholders[idx]
        dim_i = param_i.get_shape().as_list()

        # Reset the masked weights to zero if they drifted.
        param_val = sess.run(param_i)
        mask = sess.run(mask_i)
        param_val_masked = param_val * mask
        learning_rate_val = sess.run(learning_rate)

        if config['use_growl'] or config['use_group_lasso']:
            # The pruning step: zero out rows whose norm falls below threshold.
            row_norm = norm(param_val_masked, axis=1)
            print('min row norm {:.4f}'.format(np.min(row_norm)))
            print('current epoch {}'.format(epoch + 1))
            if epoch == 0 or (epoch + 1) % config['row_norm_freq'] == 0:
                hist_plot(idx, epoch, phase, row_norm[row_norm > 0], config)
            zero_row_idx = np.where(row_norm <= threshold)
            nonzero_row_idx = np.where(row_norm > threshold)
            np.save(config['plot_dir'] + 'nonzero_row_idx.npy',
                    nonzero_row_idx)
            print('masked rows: {}; total rows: {}'.format(
                np.size(zero_row_idx), np.size(row_norm)))
            param_val_masked[zero_row_idx[0], :] = 0

            # In the retraining process, enforce parameter sharing and do
            # not update the mask.
            if phase:
                param_val_masked = group_averaging(param_val_masked,
                                                   group_info[idx_true_layer])

            # Update parameter.
            sess.run(param_assign_op_i,
                     feed_dict={param_placeholder_i: param_val_masked})

            # Update the mask in the training process, and only at locations
            # corresponding to zero-valued rows.
            if not phase:
                mask[zero_row_idx[0], :] = 0
                sess.run(mask_assign_op_i,
                         feed_dict={mask_placeholder_i: mask})

        layer_nonzero_params = np.count_nonzero(param_val_masked)
        print('update mask of layer: {0}, total:{1}, nonzeros:{2}, uniqs:{3}'
              .format(idx, np.size(param_val_masked), layer_nonzero_params,
                      len(np.unique(param_val_masked))))
        num_total_params = num_total_params + np.size(param_val_masked)
        print('num_total_params:{0}, param_val_size:{1}'.format(
            num_total_params, np.size(param_val_masked)))
        num_nonzero_params = num_nonzero_params + layer_nonzero_params
        num_unique_params = num_unique_params + len(
            np.unique(param_val_masked))
        idx_true_layer = idx_true_layer + 1

        # Record the zero-valued layers.
        if np.size(row_norm) - np.size(zero_row_idx[0]) <= 3:
            num_zero_layers += 1

    # In training we care about nonzero parameters; in retraining we care
    # about unique parameters.
    if not phase:
        compression_ratio = num_nonzero_params / num_total_params
    else:
        compression_ratio = num_unique_params / num_total_params

    print('Total compression ratio is: {:.4f}%'.format(
        compression_ratio * 100))

    return compression_ratio, num_zero_layers