Example #1
def reg_params_init(sess, hps):
    '''
    This function initializes the regularization parameters.

    Args:
      sess: the TensorFlow session
      hps: hyperparameters collection

    Returns:
      layer_owl_params: a list; each element is an array containing the
                        regularization weights of the corresponding layer
      hps: the hyperparameters collection, returned unchanged
    '''
    weight_placeholder = get_weight_placeholders()
    reg_applied_layers = hps.reg_applied_layers

    layer_owl_params = []
    for idx, triple in enumerate(weight_placeholder):

        print('layer {}'.format(idx))
        # if the layer is not regularized, then append []
        if not reg_applied_layers[idx]:
            layer_owl_params.append([])
            continue

        #Regularization parameters
        reg_params = hps.reg_params
        lambda_1 = np.float32(reg_params[idx][0])
        lambda_2 = np.float32(reg_params[idx][1])
        if lambda_1 < 0 or lambda_2 < 0:
            raise ValueError('regularization parameters must be non-negative')

        #GrOWL weights should be applied to the rows of the (reshaped) weight matrix
        param_i, placeholder_i, assign_op_i = triple
        param_shape = sess.run(tf.shape(param_i))

        num_dims = np.size(param_i.get_shape().as_list())
        if num_dims == 2:
            row_num = param_shape[0]
        elif num_dims == 4:
            # for conv kernels [h, w, c_in, c_out], rows correspond to input channels
            row_num = param_shape[2]
        else:
            raise ValueError('only 2D and 4D weight tensors are supported')

        # np.linspace requires an integer sample count
        transition_ind = int(np.floor(row_num * FLAGS.PLD_transition))
        param_index = np.linspace(start=transition_ind - 1,
                                  stop=0,
                                  num=transition_ind)
        print('  row num: {}, transition_ind: {}, largest reg: {}'.format(
            row_num, transition_ind, lambda_1 + lambda_2 * transition_ind))
        if row_num > transition_ind:
            param_index = np.append(
                param_index, np.zeros([1, int(row_num - transition_ind)]))

        layer_owl_params.append(lambda_1 + lambda_2 * param_index)

    print("length of weight_placeholder:{0}".format(len(weight_placeholder)))

    assert len(layer_owl_params) == len(weight_placeholder)
    assert len(layer_owl_params) == len(hps.reg_applied_layers)

    return layer_owl_params, hps
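
The loop above turns each layer's (lambda_1, lambda_2) pair into a piecewise-linear-decay (PLD) weight vector: the first transition_ind entries decrease linearly and the remainder sit at the flat floor lambda_1; the largest weight is later paired with the largest row norm. A minimal numeric sketch (values are illustrative only):

import numpy as np

# Illustrative values: 6 rows, transition at 50% of the rows.
row_num, PLD_transition = 6, 0.5
lambda_1, lambda_2 = 0.1, 0.01

transition_ind = int(np.floor(row_num * PLD_transition))        # 3
param_index = np.linspace(start=transition_ind - 1, stop=0,
                          num=transition_ind)                   # [2. 1. 0.]
param_index = np.append(param_index,
                        np.zeros(row_num - transition_ind))     # [2. 1. 0. 0. 0. 0.]
print(lambda_1 + lambda_2 * param_index)                        # [0.12 0.11 0.1 0.1 0.1 0.1]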
Example #2
def apply_reg_prox(sess, learning_rate_val, layer_reg_params, hps):
    '''
    Updates the weight parameters of each layer via the proximal step.

    Args:
      sess: the TensorFlow session
      learning_rate_val: the current learning rate value
      layer_reg_params: owl parameters, initially created by reg_params_init
      hps: hyperparameters collection

    Returns:
      None
    '''
    # get weights of the network
    weight_placeholders = get_weight_placeholders()

    # prox_lr_val = min(learning_rate_val, 0.001)
    prox_lr_val = learning_rate_val
    for idx, triple in enumerate(weight_placeholders):

        #Don't apply owl/growl if told not to
        if not hps.reg_applied_layers[idx]:
            continue

        param_i, placeholder_i, assign_op_i = triple
        param_val = sess.run(param_i)
        dim_i = np.size(param_val.shape)

        if dim_i == 2:
            if FLAGS.use_growl:
                prox_param_val = apply_growl(
                    param_val, prox_lr_val * layer_reg_params[idx])
            else:
                prox_param_val = apply_group_lasso(
                    param_val, prox_lr_val * layer_reg_params[idx])

        elif dim_i == 4:
            # For convolutional layer, we need to first reshape 4D tensor to 2D matrix
            reduced_param_val = reshape_2D_4D(param_val,
                                              target_shape=None,
                                              reshape_type=2,
                                              reshape_order='F')
            if FLAGS.use_growl:
                reduced_prox_param_val = apply_growl(
                    reduced_param_val, prox_lr_val * layer_reg_params[idx])
            else:
                reduced_prox_param_val = apply_group_lasso(
                    reduced_param_val, prox_lr_val * layer_reg_params[idx])

            #Now reshape the 2D matrix back to 4D tensor
            prox_param_val = reshape_2D_4D(reduced_prox_param_val,
                                           target_shape=param_val.shape,
                                           reshape_type=1,
                                           reshape_order='F')

        # assign the new weights to param_i using the assign_op_i
        sess.run(assign_op_i, feed_dict={placeholder_i: prox_param_val})
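
apply_group_lasso and apply_growl are defined elsewhere in the project. For the group-lasso case, the proximal step is row-wise soft-thresholding of the l2 norms; a minimal sketch under that assumption (reg may be a scalar or a per-row array, matching the call above; the real helper may differ):

import numpy as np
from numpy.linalg import norm

def apply_group_lasso(W, reg):
    '''Sketch of the group-lasso proximal operator: shrink each row's
    l2 norm by reg and zero out rows whose norm falls below reg.'''
    row_norm = norm(W, axis=1)
    scale = np.maximum(row_norm - reg, 0.0) / np.maximum(row_norm, 1e-12)
    return W * scale[:, np.newaxis]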
Example #3
def apply_param_share(sess, group_info, hps):
    """Parameter sharing for the retraining phase
  Args:
    sess: the computation graph
    group_info: the group information. A list of tuples, each tuple contains the index of the rows
    which belongs to the same group
    hps: 
  """

    weight_placeholders = get_weight_placeholders()
    # Track the index of the regularized layer, for retrieving the group info
    idx_true_layer = 0
    for idx, triple in enumerate(weight_placeholders):
        #Don't apply parameter sharing to layers that are not regularized
        if not hps.reg_applied_layers[idx]:
            continue

        #Only apply parameter sharing to layers that form clear cluster patterns
        if not hps.param_shared_layers[idx]:
            #Update group idx
            idx_true_layer = idx_true_layer + 1
            continue

        param_i, param_placeholder_i, param_assign_op_i = triple
        dim_i = param_i.get_shape().as_list()
        param_val = sess.run(param_i)

        if np.size(dim_i) == 4:

            #reshape the 4D tensor to a 2D matrix
            param_val_reshaped = reshape_2D_4D(param_val,
                                               target_shape=None,
                                               reshape_type=2,
                                               reshape_order='F')

            #retrain with parameter sharing
            param_val_reshaped_shared = group_averaging(
                param_val_reshaped, group_info[idx_true_layer])

            #back to 4D tensor
            param_val_shared = reshape_2D_4D(param_val_reshaped_shared,
                                             target_shape=tuple(dim_i),
                                             reshape_type=1,
                                             reshape_order='F')

        elif np.size(dim_i) == 2:

            param_val_shared = group_averaging(param_val,
                                               group_info[idx_true_layer])

        #Update parameter
        sess.run(param_assign_op_i,
                 feed_dict={param_placeholder_i: param_val_shared})

        #Update group idx
        idx_true_layer = idx_true_layer + 1
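
group_averaging is the helper that actually ties weights together. Given the group tuples produced by the clustering step (see display_similarity below), the natural implementation replaces every row of a group by the group mean; a sketch under that assumption:

import numpy as np

def group_averaging(W, group_info_i):
    '''Sketch: tie the rows inside each cluster to their mean value.
    group_info_i is a list of tuples of row indices, e.g. [(0, 3), (1, 2, 4)].'''
    W = W.copy()
    for rows in group_info_i:
        idx = list(rows)
        W[idx, :] = W[idx, :].mean(axis=0)
    return W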
Example #4
def reg_params_init(sess, config):
    '''
    This function initializes the regularization parameters.

    Args:
        sess: the TensorFlow session.
        config: the yaml configuration dictionary.

    Returns:
        layer_owl_params: a list; each element is an array containing the
                          regularization weights of the corresponding layer.
    '''

    weight_placeholder = utils_nn.get_weight_placeholders()
    layer_owl_params = []
    min_num_row = float("Inf")
    if config['PLD_transition'] == 0:
        # read out the minimum number of rows
        for idx, triple in enumerate(weight_placeholder):
            param_i, placeholder_i, assign_op_i = triple
            param_shape = sess.run(tf.shape(param_i))
            if param_shape[0] < min_num_row:
                min_num_row = param_shape[0]

    # iterates through all layers, idx is the layer number
    for idx, triple in enumerate(weight_placeholder):

        param_i, placeholder_i, assign_op_i = triple

        # OWL weights should be applied to the rows of the weight matrix
        param_shape = sess.run(tf.shape(param_i))

        reg_params = config['growl_params']

        lambda_1 = np.float32(reg_params[idx][0])
        lambda_2 = np.float32(reg_params[idx][1])
        if lambda_1 < 0 or lambda_2 < 0:
            raise ValueError('regularization parameters must be non-negative')

        # get row_num
        row_num = int(param_shape[0])

        if config['reg_params_type'] == 'PLD':
            # np.linspace requires an integer sample count
            if config['PLD_transition'] != 0:
                transition_ind = int(np.floor(row_num * config['PLD_transition'])) - 1
            else:
                transition_ind = int(min_num_row)
            param_index = np.linspace(start=row_num - 1, stop=0, num=transition_ind)
            param_index = np.append(param_index,
                                    np.zeros([1, int(row_num - transition_ind)]))

            layer_owl_params.append(lambda_1 + lambda_2 * param_index)

    assert len(layer_owl_params) == len(weight_placeholder)

    return layer_owl_params
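
This variant reads its hyperparameters from a loaded yaml configuration. Based solely on the keys accessed in examples #4 and #5, a config might contain entries like the following (all values are placeholders, not recommendations):

# Illustrative only: keys inferred from the config[...] accesses above.
config = {
    'reg_params_type': 'PLD',      # piecewise-linear decay of the GrOWL weights
    'PLD_transition': 0.5,         # fraction of rows before the flat tail; 0 -> use min row count
    'growl_params': [[0.1, 0.01],  # per-layer [lambda_1, lambda_2]
                     [0.1, 0.01]],
    'use_growl': True,
    'owl_applied_layers': [True, True],
}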
Example #5
def apply_owl_prox(sess, learning_rate, layer_reg_params, config):
    '''
    Updates the weight parameters of each layer via the proximal step.

    Args:
        sess: the TensorFlow session
        learning_rate: the learning rate tensor; its current value is read below
        layer_reg_params: owl parameters, initially created by reg_params_init
        config: the yaml configuration dictionary

    Returns:
        None
    '''

    # get weights of the network
    weight_placeholders = utils_nn.get_weight_placeholders()
    learning_rate_val = sess.run(learning_rate)

    for idx, triple in enumerate(weight_placeholders):

        #Don't apply owl/growl if told not to
        if not config['owl_applied_layers'][idx]:
            continue

        param_i, placeholder_i, assign_op_i = triple
        param_val = sess.run(param_i)
        dim_i = np.size(param_val.shape)

        if config['use_growl']:
            prox_param_val = apply_growl(param_val, learning_rate_val * layer_reg_params[idx])
        else:
            prox_param_val = apply_group_lasso(param_val, learning_rate_val * layer_reg_params[idx])

        # assign the new weights to param_i using the assign_op_i
        # refer to utils_nn.py for details of assign_op_i
        sess.run(assign_op_i, feed_dict={placeholder_i: prox_param_val})
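
apply_growl is also external. The standard GrOWL proximal step sorts the row norms, applies the OWL/SLOPE prox to the sorted norms (a monotone projection that isotonic regression can compute), and rescales each row. A minimal sketch under that assumption, using scikit-learn's isotonic_regression for the projection; the project's own implementation may differ:

import numpy as np
from numpy.linalg import norm
from sklearn.isotonic import isotonic_regression

def apply_growl(W, w):
    '''Sketch of the GrOWL proximal operator.
    W: 2D weight matrix; w: non-increasing weight vector, one entry per row
    (as produced by reg_params_init, largest weight first).'''
    row_norm = norm(W, axis=1)
    order = np.argsort(row_norm)[::-1]             # rows sorted by decreasing norm
    z = row_norm[order] - w                        # shrink the sorted norms
    z = np.maximum(isotonic_regression(z, increasing=False), 0)
    new_norm = np.empty_like(row_norm)
    new_norm[order] = z                            # undo the sort
    scale = new_norm / np.maximum(row_norm, 1e-12)
    return W * scale[:, np.newaxis]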
Example #6
def measure_compression(sess,
                        res_dict,
                        step,
                        training,
                        hps,
                        num_cluster_arr=[]):
    '''
  Monitor the compression ratio
  '''
    mask_placeholders = get_mask_placeholders()
    weight_placeholders = get_weight_placeholders()
    num_nonzero_row_arr = []
    num_total_row_arr = []
    num_row_size_arr = []
    num_nonzero_params = 0
    num_unique_params = 0
    num_total_params = 0

    for idx, mask_triple in enumerate(mask_placeholders):
        mask_i, mask_placeholder_i, mask_assign_op_i = mask_triple
        param_i, param_placeholder_i, param_assign_op_i = weight_placeholders[
            idx]
        dim_i = param_i.get_shape().as_list()

        param_val = sess.run(param_i)
        mask = sess.run(mask_i)
        param_val_masked = param_val * mask

        if np.size(dim_i) == 4:
            param_val_masked_reshaped = reshape_2D_4D(param_val_masked,
                                                      target_shape=None,
                                                      reshape_type=2,
                                                      reshape_order='F')
            row_norm = norm(param_val_masked_reshaped, axis=1)
            num_nonzero_params += np.count_nonzero(row_norm) * np.shape(
                param_val_masked_reshaped)[1]
            num_unique_params += np.size(np.unique(param_val_masked_reshaped))
            num_total_params += np.prod(dim_i)
            num_nonzero_row_arr.append(np.count_nonzero(row_norm))
            num_total_row_arr.append(np.size(row_norm))
            num_row_size_arr.append(np.shape(param_val_masked_reshaped)[1])

        elif np.size(dim_i) == 2:
            row_norm = norm(param_val_masked, axis=1)
            num_nonzero_params += np.count_nonzero(row_norm) * dim_i[1]
            num_unique_params += np.size(np.unique(param_val_masked))
            num_total_params += np.prod(dim_i)
            num_nonzero_row_arr.append(np.count_nonzero(row_norm))
            num_total_row_arr.append(np.size(row_norm))
            num_row_size_arr.append(np.shape(param_val_masked)[1])

        # num_cluster_arr only contains cluster information for regularized
        # layers, so first fill in the number of rows for unregularized layers
        if (not hps.reg_applied_layers[idx]) and (not training):
            num_cluster_arr = np.insert(num_cluster_arr, idx,
                                        np.size(row_norm))

    # calculate the list for num_param_i / num_total param
    weight_ratio_list = np.divide(
        np.multiply(num_total_row_arr, num_row_size_arr),
        float(num_total_params))

    # calculate the nonzero ratio list; the nonzero ratio for each layer is
    # defined as num_nonzero_row_i / num_total_row_i
    num_total_row_arr = np.asarray(num_total_row_arr, dtype=np.float32)
    num_nonzero_row_arr = np.asarray(num_nonzero_row_arr, dtype=np.float32)
    nonzero_ratio_list = np.divide(num_nonzero_row_arr, num_total_row_arr)

    if training:
        compression_ratio_arr = np.append(
            np.multiply(nonzero_ratio_list[0:-1], nonzero_ratio_list[1:]),
            nonzero_ratio_list[-1])
        compression_ratio = np.inner(compression_ratio_arr, weight_ratio_list)
    else:
        compression_ratio_arr = np.append(
            np.multiply(nonzero_ratio_list[0:-1], nonzero_ratio_list[1:]),
            nonzero_ratio_list[-1])
        compression_ratio_arr = np.multiply(
            compression_ratio_arr,
            np.divide(num_cluster_arr, num_nonzero_row_arr))
        compression_ratio = np.inner(compression_ratio_arr, weight_ratio_list)

    print('nonzero_ratio_list: {}'.format(nonzero_ratio_list))
    print('weight_ratio_list: {}'.format(weight_ratio_list))
    print('num_nonzero_row_arr: {}'.format(num_nonzero_row_arr))
    print('num_total_row_arr: {}'.format(num_total_row_arr))
    print('num_row_size_arr: {}'.format(num_row_size_arr))

    print("At step {}, total compression ratio is: {:.4f}%".format(
        step, compression_ratio * 100))
    res_dict['compression_ratio_arr'].append(compression_ratio)
    np.save(FLAGS.res_dir + 'res_dict.npy', res_dict)

    return compression_ratio
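
In the training phase this is, in effect, a layer-weighted sum of consecutive nonzero-row fractions: writing r_l for the nonzero-row fraction of layer l and n_l / N for layer l's share of the total parameters,

    compression_ratio = sum_l  r_l * r_(l+1) * (n_l / N),   with r_(L+1) := 1

since a weight survives pruning only if both its own row and the corresponding rows of the next layer are nonzero. In the retraining branch each term is additionally scaled by num_cluster_l / num_nonzero_row_l to account for parameter sharing.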
Example #7
def update_mask(sess, threshold, hps, res_dict, step):
    '''
  Update the mask during the training process to prevent pruned weights from
  drifting away from zero.

  Args:
    sess: the TensorFlow session
    threshold: the pruning threshold; this helps avoid floating-point errors
           that occur during the masking process
    hps: hyperparameters
    res_dict: results dictionary
    step: current step

  Returns:
    num_zero_layers: number of zero-valued layers
    layer_ID: indices of the zero-valued layers
  '''

    mask_placeholders = get_mask_placeholders()
    weight_placeholders = get_weight_placeholders()

    #count the zero-valued layers to avoid nonsensical results
    num_zero_layers = 0
    layer_ID = []

    assert len(mask_placeholders) == len(weight_placeholders)

    for idx, mask_triple in enumerate(mask_placeholders):

        #Don't apply owl/growl if told not to
        if not hps.reg_applied_layers[idx]:
            continue

        mask_i, mask_placeholder_i, mask_assign_op_i = mask_triple
        param_i, param_placeholder_i, param_assign_op_i = weight_placeholders[
            idx]
        dim_i = param_i.get_shape().as_list()

        #Recover the masked weights to zeros if they drifted
        param_val = sess.run(param_i)
        mask = sess.run(mask_i)
        param_val_masked = param_val * mask

        #If apply to convolutional layer, compute the reshaped matrix
        if np.size(dim_i) == 4:
            param_val_masked_reshaped = reshape_2D_4D(param_val_masked,
                                                      target_shape=None,
                                                      reshape_type=2,
                                                      reshape_order='F')
            mask_reshaped = reshape_2D_4D(mask,
                                          target_shape=None,
                                          reshape_type=2,
                                          reshape_order='F')

            #prune params and update the mask
            row_norm = norm(param_val_masked_reshaped, axis=1)
            row_size = param_val_masked_reshaped.shape[1]
            print(
                'layer:{}, largest row norm: {:.6f}, median row norm: {:.6f}, min row norm: {:.6f}'
                .format(idx, np.max(row_norm), np.median(row_norm),
                        np.min(row_norm)))

            zero_row_idx = np.where(row_norm <= threshold)
            print('    masked neurons: {}; total neurons: {}'.format(
                np.size(zero_row_idx), np.size(row_norm)))
            param_val_masked_reshaped[zero_row_idx[0], :] = 0
            mask_reshaped[zero_row_idx[0], :] = 0

            #back to 4D
            param_val_masked = reshape_2D_4D(param_val_masked_reshaped,
                                             target_shape=tuple(dim_i),
                                             reshape_type=1,
                                             reshape_order='F')
            mask = reshape_2D_4D(mask_reshaped,
                                 target_shape=tuple(dim_i),
                                 reshape_type=1,
                                 reshape_order='F')

        elif np.size(dim_i) == 2:
            row_norm = norm(param_val_masked, axis=1)
            row_size = param_val_masked.shape[1]
            print(
                'layer:{}, largest row norm: {:.6f}, median row norm: {:.6f}, min row norm: {:.6f}'
                .format(idx, np.max(row_norm), np.median(row_norm),
                        np.min(row_norm)))

            zero_row_idx = np.where(row_norm <= threshold)
            print('    masked rows: {}; total rows: {}'.format(
                np.size(zero_row_idx), np.size(row_norm)))
            param_val_masked[zero_row_idx[0], :] = 0
            mask[zero_row_idx[0], :] = 0

        #Update the mask and weight matrix
        sess.run(mask_assign_op_i, feed_dict={mask_placeholder_i: mask})
        sess.run(param_assign_op_i,
                 feed_dict={param_placeholder_i: param_val_masked})

        nonzero_rows = np.size(row_norm) - np.size(zero_row_idx[0])
        layer_nonzero_params = nonzero_rows * row_size
        print("    total:{0}, nonzeros:{1}".format(np.size(param_val_masked),
                                                   layer_nonzero_params))

        #Record layers that are (almost) entirely zero
        if nonzero_rows <= 3:
            num_zero_layers += 1
            layer_ID += [idx]

    return num_zero_layers, layer_ID
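
reshape_2D_4D is another external helper. Consistent with how it is called throughout (reshape_type=2 flattens a 4D kernel to 2D, reshape_type=1 inverts it, reshape_order='F', and Example #1 takes the input-channel dimension as the row count), one plausible implementation is the following sketch; the project's real helper may lay the matrix out differently:

import numpy as np

def reshape_2D_4D(t, target_shape, reshape_type, reshape_order):
    '''Sketch only. reshape_type=2: [h, w, c_in, c_out] -> [c_in, h*w*c_out];
    reshape_type=1: invert that mapping, target_shape = (h, w, c_in, c_out).'''
    if reshape_type == 2:
        c_in = t.shape[2]
        # one row per input channel
        return np.moveaxis(t, 2, 0).reshape(c_in, -1, order=reshape_order)
    elif reshape_type == 1:
        h, w, c_in, c_out = target_shape
        return np.moveaxis(
            t.reshape((c_in, h, w, c_out), order=reshape_order), 0, 2)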
Example #8
def display_similarity(sess, step, hps, res_dict):
    '''
  Display the pairwise similarity of the rows of the weight matrix.

  Args:
    sess: the TensorFlow session
    step: current step
    hps: hyperparameters
    res_dict: results dictionary

  Returns:
    group_info: list (one entry per clustered layer) of tuples containing the
                row indices of each group
    num_clusters_arr: number of clusters found in each layer
  '''
    num_nonzero_rows_tuple, num_clusters_tuple = tuple(), tuple()
    mask_placeholders = get_mask_placeholders()
    weight_placeholders = get_weight_placeholders()

    assert len(mask_placeholders) == len(weight_placeholders)

    group_info = []
    num_clusters_arr = []
    threshold = np.finfo(np.float32).eps

    #Track the nonzero row index
    nonzero_row_index = []

    # only process the layers we applied grOWL on
    for idx, mask_triple in enumerate(mask_placeholders):

        # double check we have turned on grOWL for the layer
        if not hps.reg_applied_layers[idx]:
            continue

        mask_i, mask_placeholders_i, mask_assign_op_i = mask_placeholders[idx]
        param_i, param_placeholder_i, param_assign_op_i = weight_placeholders[
            idx]
        dim_i = param_i.get_shape().as_list()

        param_masked = tf.multiply(mask_i, param_i)
        if np.size(dim_i) == 2:
            param_masked_val = sess.run(param_masked)
            print("layer:{0}, param_masked_val.shape:{1}".format(
                idx, param_masked_val.shape))
        elif np.size(dim_i) == 4:
            param_masked_4D = sess.run(param_masked)
            param_masked_val = reshape_2D_4D(param_masked_4D,
                                             target_shape=None,
                                             reshape_type=2,
                                             reshape_order='F')
            print("layer:{0}, param_masked_val.shape:{1}".format(
                idx, param_masked_val.shape))

        # first retrieve nonzero rows from the parameter matrix
        row_norm = norm(param_masked_val, axis=1)
        row_norm[row_norm < threshold] = 0  # remove very small row norms
        num_nonzero_rows = np.count_nonzero(row_norm)
        nonzero_row_idx = np.flatnonzero(row_norm)
        nonzero_rows = param_masked_val[row_norm > 0, :]
        norm_nonzero_rows = norm(nonzero_rows, axis=1)

        #calculate the display similarity matrix without removing the zero valued rows
        num_rows = row_norm.size
        display_similarity_val = np.zeros([num_rows, num_rows])
        display_similarity_val_partial = np.zeros(
            [nonzero_row_idx.size, nonzero_row_idx.size])

        zero_row_idx = np.where(row_norm < threshold)[0]
        for k in zero_row_idx:
            display_similarity_val[k, :] = 0
            display_similarity_val[:, k] = 0

        partial_idx = np.arange(nonzero_row_idx.size)

        for idx_newi, i in np.nditer([partial_idx, nonzero_row_idx]):
            for idx_newj, j in np.nditer([partial_idx, nonzero_row_idx]):
                if row_norm[i] > row_norm[j]:
                    display_similarity_val[
                        i, j] = display_similarity_val_partial[
                            idx_newi, idx_newj] = np.dot(
                                param_masked_val[i, :],
                                param_masked_val[j, :]) / row_norm[i]**2
                else:
                    display_similarity_val[
                        i, j] = display_similarity_val_partial[
                            idx_newi, idx_newj] = np.dot(
                                param_masked_val[i, :],
                                param_masked_val[j, :]) / row_norm[j]**2

        #Save the similarity matrix
        np.save(FLAGS.res_dir + 'similarity_{}.npy'.format(int(step / 390)),
                display_similarity_val)

        # Cluster the parameter matrix with affinity propagation
        if num_nonzero_rows > 1:

            preference_val = FLAGS.preference
            print('CLUSTERING ROWS WITH PREFERENCE VALUE: {}'.format(
                preference_val))
            af = AffinityPropagation(
                affinity='precomputed',
                preference=preference_val).fit(display_similarity_val_partial)

            cluster_centers_indices = af.cluster_centers_indices_
            num_clusters = np.size(cluster_centers_indices)
            with open(FLAGS.res_dir + 'cluster.txt', 'a') as f:
                print('  idx: {}, Nonzero rows: {}, Number of Clusters: {}'.
                      format(idx, num_nonzero_rows, num_clusters))
                f.write(
                    '  idx: {}, Nonzero rows: {}, Number of Clusters: {}\n'.
                    format(idx, num_nonzero_rows, num_clusters))
            num_clusters_arr.append(num_clusters)

            # get the labels for the nonzero rows and construct the tuple list for storing the group
            # information
            labels = af.labels_
            group_info_i = [tuple()] * num_clusters
            for i in range(len(labels)):
                group_info_i[labels[i]] = group_info_i[labels[i]] + (
                    nonzero_row_idx[i], )

            group_info.append(group_info_i)
            # put the clustering results in the tuple for return
            num_nonzero_rows_tuple = num_nonzero_rows_tuple + (
                num_nonzero_rows, )
            num_clusters_tuple = num_clusters_tuple + (num_clusters, )

            nonzero_row_index.append(nonzero_row_idx)

    # store the intermediate results
    res_dict['num_cluster_arr'].append(num_clusters_tuple)
    res_dict['num_nonzero_row_arr'].append(num_nonzero_rows_tuple)
    np.save(FLAGS.res_dir + 'res_dict.npy', res_dict)
    np.save(FLAGS.res_dir + 'nonzero_index.npy', nonzero_row_index)
    np.save(FLAGS.train_dir + 'group_info_{}.npy'.format(int(step / 390)),
            group_info)

    return group_info, num_clusters_arr
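
The double loop above fills the similarity matrix with the inner product of each pair of rows, normalized by the larger of the two squared norms:

    s_ij = <w_i, w_j> / max(||w_i||, ||w_j||)^2

so identical rows get s_ij = 1, the measure is symmetric, and zero rows keep a similarity of 0.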
Example #9
def display_similarity(sess, epoch, get_group, config):
    '''
    Display the pairwise similarity of the rows of the weight matrix and plot
    the clustering results.

    Args:
        sess: the TensorFlow session
        epoch: current epoch
        get_group: if True, save the similarity values and collect group info
        config: the yaml configuration dictionary

    Returns:
        num_nonzero_rows_tuple: # of nonzero rows for each layer
        num_clusters_tuple: # of clusters for each layer
        group_info: list of tuples containing the row indices of each group

    '''

    num_nonzero_rows_tuple, num_clusters_tuple = tuple(), tuple()

    mask_placeholders = utils_nn.get_mask_placeholders()
    weight_placeholders = utils_nn.get_weight_placeholders()

    num_layers = 0

    threshold = np.finfo(np.float32).eps

    assert len(mask_placeholders) == len(weight_placeholders)
    group_info = []

    # only process the layers we applied grOWL on
    for idx, mask_triple in enumerate(mask_placeholders):

        # double check we have turned on grOWL for the layer
        if not config['owl_applied_layers'][idx]:
            continue

        num_layers = num_layers + 1
        mask_i, mask_placeholders_i, mask_assign_op_i = mask_placeholders[idx]
        param_i, param_placeholder_i, param_assign_op_i = weight_placeholders[
            idx]
        dim_i = param_i.get_shape().as_list()

        # recover the masked weights to zeros if they drifted
        param_masked = tf.multiply(mask_i, param_i)
        param_masked_val = sess.run(param_masked)

        if config['similarity'] == 'norm_euclidean':
            # first retrieve nonzero rows from the parameter matrix
            row_norm = norm(param_masked_val, axis=1)
            row_norm[row_norm < threshold] = 0  # remove very small row norms
            num_nonzero_rows = np.count_nonzero(row_norm)
            nonzero_row_idx = np.flatnonzero(row_norm)
            nonzero_rows = param_masked_val[row_norm > 0, :]
            norm_nonzero_rows = norm(nonzero_rows, axis=1)
            # then compute the normalized Euclidean norm similarity matrix, we should take the negative
            # so that similar rows have a larger affinity value
            similarity_val = np.zeros([num_nonzero_rows, num_nonzero_rows])
            for i in range(num_nonzero_rows):
                for j in range(num_nonzero_rows):
                    if norm_nonzero_rows[i] > norm_nonzero_rows[j]:
                        similarity_val[i, j] = -norm(nonzero_rows[
                            i, :] - nonzero_rows[j, :]) / norm_nonzero_rows[j]
                    else:
                        similarity_val[i, j] = -norm(nonzero_rows[
                            i, :] - nonzero_rows[j, :]) / norm_nonzero_rows[i]

            #calculate the display similarity matrix without removing the zero valued rows
            num_rows = row_norm.size
            display_similarity_val = np.zeros([num_rows, num_rows])
            display_similarity_val_partial = np.zeros(
                [nonzero_row_idx.size, nonzero_row_idx.size])

            zero_row_idx = np.where(row_norm < threshold)[0]
            for k in zero_row_idx:
                display_similarity_val[k, :] = 0
                display_similarity_val[:, k] = 0

            partial_idx = np.arange(nonzero_row_idx.size)

            for idx_newi, i in np.nditer([partial_idx, nonzero_row_idx]):
                for idx_newj, j in np.nditer([partial_idx, nonzero_row_idx]):
                    if row_norm[i] > row_norm[j]:
                        display_similarity_val[
                            i, j] = display_similarity_val_partial[
                                idx_newi, idx_newj] = np.dot(
                                    param_masked_val[i, :],
                                    param_masked_val[j, :]) / row_norm[i]**2
                    else:
                        display_similarity_val[
                            i, j] = display_similarity_val_partial[
                                idx_newi, idx_newj] = np.dot(
                                    param_masked_val[i, :],
                                    param_masked_val[j, :]) / row_norm[j]**2

        # save the similarity val
        if get_group:
            sim_name = config[
                'plot_dir'] + 'epoch_{0}_similarity_{1}.npy'.format(
                    epoch, idx + 1)
            np.save(sim_name, display_similarity_val)

        #Visualize the similarity matrix
        num_rows = display_similarity_val.shape[0]
        row_idx = np.linspace(1, num_rows, num_rows)

        x, y = np.meshgrid(row_idx, row_idx)

        # plot the grid search results for accuracy
        figName = config['plot_dir'] + 'epoch_{0}_similarity_{1}.png'.format(
            epoch, idx + 1)
        if config['similarity'] == 'norm_euclidean':
            imagesc(x, y, display_similarity_val, 'row_idx', 'row_idx',
                    'similarity matrix of layer {0}'.format(idx + 1), False,
                    figName)

        similarity_val_arr = np.reshape(similarity_val, [-1])
        display_similarity_val_arr = np.reshape(display_similarity_val, [-1])

        # save the nonzero similarity value
        nonzero_display_similarity_val_arr = display_similarity_val_arr[
            np.abs(display_similarity_val_arr) > 0]
        print('Layer {}, the median of the similarity_val is {}'.format(
            idx, np.median(nonzero_display_similarity_val_arr)))
        if get_group:
            np.savetxt(
                config['plot_dir'] + 'sim_vec_epoch{}.csv'.format(epoch),
                display_similarity_val_arr)
        plt.hist(nonzero_display_similarity_val_arr)
        plt.title('Similarity histogram at epoch {}'.format(epoch + 1))
        plt.savefig(config['plot_dir'] +
                    'layer{}_train_sim_{}.jpg'.format(idx, epoch))
        plt.close()

        # Cluster the parameter matrix with affinity propagation
        if num_nonzero_rows > 0:
            if config['similarity'] == 'norm_euclidean':
                af = AffinityPropagation(affinity='precomputed',
                                         preference=config['preference']).fit(
                                             display_similarity_val_partial)

            cluster_centers_indices = af.cluster_centers_indices_
            num_clusters = np.size(cluster_centers_indices)

            print("number of clusters {}".format(num_clusters))

            # get the labels for the nonzero rows and construct the tuple list
            # for storing the group information; group_info_i is for layer i only
            if get_group:
                labels = af.labels_
                group_info_i = [tuple()] * num_clusters
                for i in range(len(labels)):
                    group_info_i[labels[i]] = group_info_i[labels[i]] + (
                        nonzero_row_idx[i], )

                group_info.append(group_info_i)

            # put the clustering results in the tuple for return
            num_nonzero_rows_tuple = num_nonzero_rows_tuple + (
                num_nonzero_rows, )
            num_clusters_tuple = num_clusters_tuple + (num_clusters, )

            print('Nonzero rows: {}, Number of Clusters: {}'.format(
                num_nonzero_rows, num_clusters))
        else:
            num_nonzero_rows_tuple = num_nonzero_rows_tuple + (0, )
            num_clusters_tuple = num_clusters_tuple + (0, )

            print('Nonzero rows: {}, Number of Clusters: {}'.format(0, 0))

    return num_nonzero_rows_tuple, num_clusters_tuple, group_info
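
For the 'norm_euclidean' option, similarity_val holds the negative Euclidean distance between rows, scaled by the smaller of the two row norms:

    s_ij = -||w_i - w_j|| / min(||w_i||, ||w_j||)

so that more similar rows receive larger (closer to zero) affinity values, which is what AffinityPropagation with affinity='precomputed' expects.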
Example #10
def update_mask(sess, epoch, learning_rate, threshold, phase, config, group_info=None, get_nonzero_idx_flag=False):
    '''
    update the mask

    Args:
        sess: the TensorFlow session
        epoch: current epoch
        learning_rate: the learning rate tensor
        threshold: the pruning threshold; this helps avoid floating-point
                   errors that occur during the masking process
        phase: False for training, True for retraining. If True, enforce
               parameter sharing
        config: the yaml configuration dictionary
        group_info: the group information; a list of tuples, each containing
                    the indices of the rows that belong to the same group
        get_nonzero_idx_flag: unused in this function

    Returns:
        compression_ratio: the ratio between nonzero (training) or unique
                           (retraining) parameters and total parameters
        num_zero_layers: number of zero-valued layers
    '''

    mask_placeholders = utils_nn.get_mask_placeholders()
    weight_placeholders = utils_nn.get_weight_placeholders()
    num_total_params = 0
    num_nonzero_params = 0
    num_unique_params = 0
    compression_ratio = 1

    #count the zero-valued layers to avoid nonsensical results
    num_zero_layers = 0

    assert len(mask_placeholders) == len(weight_placeholders)

    # track the index of the regularized layer, for retrieving the group info
    idx_true_layer = 0

    for idx, mask_triple in enumerate(mask_placeholders):

        #Don't apply owl/growl if told not to
        if not config['owl_applied_layers'][idx]:
            continue

        mask_i, mask_placeholder_i, mask_assign_op_i = mask_triple
        param_i, param_placeholder_i, param_assign_op_i = weight_placeholders[idx]
        dim_i = param_i.get_shape().as_list()

        # recover the masked weights to zeros if they drifted
        param_val = sess.run(param_i)
        mask = sess.run(mask_i)
        param_val_masked = param_val * mask

        learning_rate_val = sess.run(learning_rate)

        # prune rows whose norm falls below the threshold
        if config['use_growl'] or config['use_group_lasso']:

            # This is the pruning process
            row_norm = norm(param_val_masked, axis=1)
            print('min row norm {:.4f}'.format(np.min(row_norm)))
            print('current epoch {}'.format(epoch + 1))
            if epoch == 0 or (epoch + 1) % config['row_norm_freq'] == 0:
                hist_plot(idx, epoch, phase, row_norm[row_norm > 0], config)

            zero_row_idx = np.where(row_norm <= threshold)

            nonzero_row_idx = np.where(row_norm > threshold)
            np.save(config['plot_dir'] + 'nonzero_row_idx.npy', nonzero_row_idx)

            print('masked rows: {}; total rows: {}'.format(np.size(zero_row_idx), np.size(row_norm)))
            param_val_masked[zero_row_idx[0], :] = 0

            # in retraining process, enforce parameter sharing, do not update the mask
            if phase:
                param_val_masked = group_averaging(param_val_masked, group_info[idx_true_layer])

            # update parameter
            sess.run(param_assign_op_i, feed_dict={param_placeholder_i: param_val_masked})

            # update the mask in training process
            # Only update mask at the locations that corresponding to zero valued rows
            if not phase:
                mask[zero_row_idx[0], :] = 0
                sess.run(mask_assign_op_i, feed_dict={mask_placeholder_i: mask})

        layer_nonzero_params = np.count_nonzero(param_val_masked)
        print("update mask of layer: {0}, total:{1}, nonzeros:{2}, uniqs:{3}".format(idx,
            np.size(param_val_masked),
            layer_nonzero_params,
            len(np.unique(param_val_masked))))

        num_total_params = num_total_params + np.size(param_val_masked)
        print("num_total_params:{0}, param_val_size:{1}".format(num_total_params, np.size(param_val_masked)))
        num_nonzero_params = num_nonzero_params + layer_nonzero_params
        num_unique_params = num_unique_params + len(np.unique(param_val_masked))
        idx_true_layer = idx_true_layer + 1

        #record the zero valued layers
        if np.size(row_norm) - np.size(zero_row_idx[0]) <= 3:
            num_zero_layers += 1


    # in training, we care about nonzero parameters
    if not phase:
        compression_ratio = num_nonzero_params/num_total_params
    # in retraining, we care about unique parameters
    else:
        compression_ratio = num_unique_params/num_total_params

    print("Total compression ratio is: {:.4f}%".format(compression_ratio * 100))

    return compression_ratio, num_zero_layers
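
A hypothetical outer loop showing how the yaml-config variants fit together. This is only a sketch: train_op, num_batches, the 'num_epochs' key, and the threshold value are assumptions, not part of the examples above.

# Hypothetical glue code; the actual training script is not shown in these examples.
layer_reg_params = reg_params_init(sess, config)                        # Example #4

for epoch in range(config['num_epochs']):                               # 'num_epochs' is assumed
    for _ in range(num_batches):
        sess.run(train_op)                                              # ordinary gradient step (assumed)
        apply_owl_prox(sess, learning_rate, layer_reg_params, config)   # Example #5

    # prune drifted rows and report compression once per epoch (Example #10)
    compression_ratio, num_zero_layers = update_mask(
        sess, epoch, learning_rate, threshold=1e-6,
        phase=False, config=config)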