Exemplo n.º 1
0
def train_teacher(FLAGS, dataset, nb_teachers, teacher_id):
  """
  Train one teacher of the ensemble on its partition of the training data and
  print its accuracy on the test data.

  :param FLAGS: flag container; this function reads data_dir, train_dir,
                cov_shift, data, dataset, deeper and max_steps from it
  :param dataset: string naming the dataset ('svhn', 'cifar10' or 'mnist')
  :param nb_teachers: total number of teachers in the ensemble
  :param teacher_id: id of the teacher being trained
  :return: True once training and evaluation finished, False if ``dataset``
           is not one of the supported names
  :raises AssertionError: if a working directory cannot be created or
                          ``deep_cnn.train`` reports failure
  """
  # If working directories do not exist, create them. The calls are made
  # outside of an ``assert`` statement so they still run under ``python -O``.
  for directory in (FLAGS.data_dir, FLAGS.train_dir):
    if not input.create_dir_if_needed(directory):
      raise AssertionError('Could not create directory: ' + str(directory))

  # Load the dataset
  if dataset == 'svhn':
    train_data, train_labels, test_data, test_labels = input.ld_svhn(extended=True)
  elif dataset == 'cifar10':
    train_data, train_labels, test_data, test_labels = input.ld_cifar10()
  elif dataset == 'mnist':
    train_data, train_labels, test_data, test_labels = input.ld_mnist()
  else:
    print("Check value of dataset flag")
    return False

  if FLAGS.cov_shift == True:
    # Under covariate shift the inputs come from pre-computed pickle files;
    # the labels loaded above are kept.
    teacher_file_name = FLAGS.data + 'PCA_teacher' + FLAGS.dataset + '.pkl'
    student_file_name = FLAGS.data + 'PCA_student' + FLAGS.dataset + '.pkl'
    # NOTE(review): pickle.load can execute code embedded in the file; only
    # point FLAGS.data at trusted files.
    with open(teacher_file_name, 'rb') as f:
      train_data = pickle.load(f)
    with open(student_file_name, 'rb') as f:
      test_data = pickle.load(f)

  # Retrieve subset of data for this teacher
  data, labels = input.partition_dataset(train_data,
                                         train_labels,
                                         nb_teachers,
                                         teacher_id)

  print("Length of training data: " + str(len(labels)))

  # Define teacher checkpoint filename and full path
  suffix = '_deep.ckpt' if FLAGS.deeper else '.ckpt'
  filename = str(nb_teachers) + 'pca_teachers_' + str(teacher_id) + suffix
  ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + filename

  # Perform teacher training (must run even when asserts are disabled)
  if not deep_cnn.train(data, labels, ckpt_path):
    raise AssertionError('Teacher training failed for ' + ckpt_path)

  # Append final step value to checkpoint for evaluation
  ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

  # Retrieve teacher probability estimates on the test data
  teacher_preds = deep_cnn.softmax_preds(test_data, ckpt_path_final)

  # Compute teacher accuracy
  precision = metrics.accuracy(teacher_preds, test_labels)
  print('Precision of teacher after training: ' + str(precision))

  return True
def train_teacher(dataset, nb_teachers, teacher_id):
  """
  Train one teacher of the ensemble on its partition of the training data and
  print its accuracy on the test data.

  :param dataset: string naming the dataset ('svhn', 'cifar10' or 'mnist')
  :param nb_teachers: total number of teachers in the ensemble
  :param teacher_id: id of the teacher being trained
  :return: True once training and evaluation finished, False if ``dataset``
           is not one of the supported names
  :raises AssertionError: if a working directory cannot be created or
                          ``deep_cnn.train`` reports failure
  """
  # If working directories do not exist, create them. The calls are made
  # outside of an ``assert`` statement so they still run under ``python -O``.
  for directory in (FLAGS.data_dir, FLAGS.train_dir):
    if not input.create_dir_if_needed(directory):
      raise AssertionError('Could not create directory: ' + str(directory))

  # Load the dataset
  if dataset == 'svhn':
    train_data, train_labels, test_data, test_labels = input.ld_svhn(extended=True)
  elif dataset == 'cifar10':
    train_data, train_labels, test_data, test_labels = input.ld_cifar10()
  elif dataset == 'mnist':
    train_data, train_labels, test_data, test_labels = input.ld_mnist()
  else:
    print("Check value of dataset flag")
    return False

  # Retrieve subset of data for this teacher
  data, labels = input.partition_dataset(train_data,
                                         train_labels,
                                         nb_teachers,
                                         teacher_id)

  print("Length of training data: " + str(len(labels)))

  # Define teacher checkpoint filename and full path
  suffix = '_deep.ckpt' if FLAGS.deeper else '.ckpt'
  filename = str(nb_teachers) + '_teachers_' + str(teacher_id) + suffix
  ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + filename

  # Perform teacher training (must run even when asserts are disabled)
  if not deep_cnn.train(data, labels, ckpt_path):
    raise AssertionError('Teacher training failed for ' + ckpt_path)

  # Append final step value to checkpoint for evaluation
  ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

  # Retrieve teacher probability estimates on the test data
  teacher_preds = deep_cnn.softmax_preds(test_data, ckpt_path_final)

  # Compute teacher accuracy
  precision = metrics.accuracy(teacher_preds, test_labels)
  print('Precision of teacher after training: ' + str(precision))

  return True
Exemplo n.º 3
0
def ensemble_preds(dataset, nb_teachers, stdnt_data):
    """
    Query every teacher checkpoint for softmax predictions on ``stdnt_data``
    and return all predictions in a single array. As a side effect the
    student data is saved to ``FLAGS.data_dir + '/STU/'``.

    :param dataset: not read by this function; the checkpoint names are built
                    from ``FLAGS.dataset_teacher``
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param stdnt_data: unlabeled student training data
    :return: float32 array of shape
             (nb_teachers, len(stdnt_data), FLAGS.nb_labels)
    """
    # Array that will hold the probabilities produced by each teacher, for
    # each training point, and each output class
    result_shape = (nb_teachers, len(stdnt_data), FLAGS.nb_labels)
    result = np.zeros(result_shape, dtype=np.float32)

    # Save student training data. The data does not change between teachers,
    # so it is written once here and not once per loop iteration.
    student_dir = FLAGS.data_dir + '/STU/'
    input.create_dir_if_needed(student_dir)
    np.save(student_dir + FLAGS.test_name + '.npy', stdnt_data)

    # Parts of the checkpoint path shared by every teacher
    ckpt_prefix = (FLAGS.teachers_dir + '/' + str(FLAGS.dataset_teacher) +
                   '_' + str(nb_teachers) + '_teachers_')
    ckpt_suffix = (('_deep.ckpt-' if FLAGS.deeper else '.ckpt-') +
                   str(FLAGS.teachers_max_steps - 1))

    # Get predictions from each teacher (``range`` works on Python 2 and 3)
    for teacher_id in range(nb_teachers):
        # Compute path of checkpoint file for teacher model with ID teacher_id
        ckpt_path = ckpt_prefix + str(teacher_id) + ckpt_suffix

        # Get predictions on our training data and store in result array
        result[teacher_id] = deep_cnn.softmax_preds(stdnt_data, ckpt_path)

        # This can take a while when there are a lot of teachers so output status
        print("Computed Teacher " + str(teacher_id) + " softmax predictions")

    return result
Exemplo n.º 4
0
def train_student(dataset, nb_teachers, shift_dataset, inverse_w=None, weight=True):
  """
  Train a student on (possibly shifted) data labelled by the teacher
  ensemble, optionally with importance weights, then print and return the
  accuracy of the trained model.

  :param dataset: string naming the dataset ('adult', 'mnist' or 'svhn')
  :param nb_teachers: number of teachers (in the ensemble) to learn from
  :param shift_dataset: mapping read through the keys 'data' (student
                        inputs), 'pred' (labels used for training) and
                        'label' (labels used for the "true precision")
  :param inverse_w: when FLAGS.cov_shift is true, passed to training as the
                    weights unchanged; otherwise indexed by each training
                    label to build one weight per sample
  :param weight: whether training uses the importance weights
  :return: tuple (precision_t, precision_s): accuracy of the trained model on
           the private training split and on the student data
  :raises ValueError: if ``dataset`` is not supported
  :raises AssertionError: if the training directory cannot be created or the
                          weighted training reports failure
  """
  # Reject unsupported datasets up front; previously this only surfaced after
  # training, as an unbound ``private_data`` variable.
  if dataset not in ('adult', 'mnist', 'svhn'):
    raise ValueError('Unsupported dataset: ' + str(dataset))

  # Called outside of ``assert`` so it still runs under ``python -O``.
  if not input.create_dir_if_needed(FLAGS.train_dir):
    raise AssertionError('Could not create directory: ' + str(FLAGS.train_dir))

  stdnt_data = shift_dataset['data']
  stdnt_labels = shift_dataset['pred']

  print('number for deep is {}'.format(len(stdnt_labels)))

  suffix = '_student_deeper.ckpt' if FLAGS.deeper else '_student.ckpt'
  ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + str(nb_teachers) + suffix

  if FLAGS.cov_shift == True:
    # Covariate shift: the caller supplies the weights directly.
    weights = inverse_w
  else:
    # Otherwise look up one weight per sample from its training label.
    # The format string now has a placeholder so the length is printed.
    print('len of shift data {}'.format(len(shift_dataset['data'])))
    weights = np.zeros(len(stdnt_data))
    print('len of weight={} len of labels= {} '.format(len(weights), len(stdnt_labels)))
    for i, _ in enumerate(weights):
      weights[i] = np.float32(inverse_w[stdnt_labels[i]])

  if weight == True:
    if not deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path, weights=weights):
      raise AssertionError('Student training failed for ' + ckpt_path)
  else:
    # Unweighted training; its return value was never checked.
    deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path)

  # Compute final checkpoint name for student (with max number of steps)
  ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

  # Load the private training split of the chosen dataset
  if dataset == 'adult':
    private_data, private_labels = input.ld_adult(test_only=False, train_only=True)
  elif dataset == 'mnist':
    private_data, private_labels = input.ld_mnist(test_only=False, train_only=True)
  else:  # 'svhn', guaranteed by the check at the top
    private_data, private_labels = input.ld_svhn(test_only=False, train_only=True)

  # Predictions of the trained student checkpoint on both data sets
  teacher_preds = deep_cnn.softmax_preds(private_data, ckpt_path_final)
  student_preds = deep_cnn.softmax_preds(stdnt_data, ckpt_path_final)

  # Accuracy on the private data, on the training labels, and on the labels
  # stored under shift_dataset['label']
  precision_t = metrics.accuracy(teacher_preds, private_labels)
  precision_s = metrics.accuracy(student_preds, stdnt_labels)

  precision_true = metrics.accuracy(student_preds, shift_dataset['label'])
  print('Precision of teacher after training:{} student={} true precision for student {}'.format(precision_t, precision_s,precision_true))

  return precision_t, precision_s
Exemplo n.º 5
0
def obtain_weight(knock, student_data, nb_teacher):
  """
  Build a shifted student dataset and estimate per-class importance weights
  from the teachers' predictions.

  The dataset name and number of teachers are taken from FLAGS.dataset and
  FLAGS.nb_teachers.

  :param knock: if False the shift is produced by ``dir_shift``, otherwise by
                ``shift_student``
  :param student_data: not read by this function
  :param nb_teacher: not read by this function
  :return: tuple (shift_dataset, inverse_w) where shift_dataset holds the
           keys 'pred', 'index', 'label' and 'data' for the shifted student
           samples and inverse_w has one entry per class
  :raises AssertionError: if the training directory cannot be created
  """
  # Called outside of ``assert`` so it still runs under ``python -O``.
  if not input.create_dir_if_needed(FLAGS.train_dir):
    raise AssertionError('Could not create directory: ' + str(FLAGS.train_dir))

  # Load the student data and labels (loader is called with test_only=True)
  stdnt_data, stdnt_labels = utils.load_dataset(FLAGS.dataset, test_only=True)

  if knock == False:
      _, shift_dataset = dir_shift(stdnt_data, stdnt_labels, 0.1)
  else:
      shift_dataset = shift_student(stdnt_data, stdnt_labels)
  shift_idx = shift_dataset['index']

  # Teacher predictions for the selected student samples
  shift_idx, _, stdnt_pred = prepare_student_data(FLAGS.dataset, FLAGS.nb_teachers, shift_idx)

  shift_dataset['pred'] = stdnt_pred
  shift_dataset['index'] = shift_idx
  # This used to be ``shift_dataset['label'] - ...``, a subtraction whose
  # result was discarded, so the labels were never updated.
  shift_dataset['label'] = stdnt_labels[shift_idx]
  shift_dataset['data'] = stdnt_data[shift_idx]

  teacher_pred, teacher_test = predict_teacher(FLAGS.dataset, FLAGS.nb_teachers)

  num_class = FLAGS.nb_labels

  # Empirical label distributions, printed for inspection only
  dis_t = np.array([np.sum(teacher_test == i) for i in range(num_class)],
                   dtype=float)
  dis_s = np.array([np.sum(shift_dataset['label'] == i) for i in range(num_class)],
                   dtype=float)
  dis_t = dis_t / len(teacher_test)
  dis_s = dis_s / len(shift_dataset['label'])
  print('teacher distribution = {}'.format(dis_t))
  print('shift student distribution= {}'.format(dis_s))

  # mu is the fraction of student samples predicted as each class
  mu = np.array([np.sum(stdnt_pred == ind) for ind in range(num_class)],
                dtype=float)
  mu = mu / len(stdnt_pred)

  # cov[p, t] is the fraction of teacher samples predicted p with label t
  cov = np.zeros([num_class, num_class])
  for pred, true in zip(teacher_pred, teacher_test):
      cov[pred, true] += 1
  cov = cov / len(teacher_test)

  # NOTE(review): np.reciprocal(cov, cov) replaces every entry by 1 / entry
  # in place; it is not a matrix inverse, and zero entries become inf.
  # Confirm this is the intended estimator.
  np.reciprocal(cov, cov)
  w = np.dot(cov, mu)
  inverse_w = np.reciprocal(w)
  return shift_dataset, inverse_w
Exemplo n.º 6
0
def create_path(FLAGS, dataset, nb_teachers):
  """
  Build the file paths of the numpy dumps stored under FLAGS.data_dir.

  :param FLAGS: flag container; this function reads train_dir, data_dir,
                sigma1, sigma2 and lap_scale from it
  :param dataset: dataset name, used as a filename prefix
  :param nb_teachers: number of teachers, used as part of the filename
  :return: tuple (gau_filepath, filepath, filepath_labels): the path of the
           votes file tagged with sigma1/sigma2, the path of the clean votes
           file, and the path of the teachers' labels file
  :raises AssertionError: if the training directory cannot be created
  """
  # Called outside of ``assert`` so it still runs under ``python -O``.
  if not input.create_dir_if_needed(FLAGS.train_dir):
    raise AssertionError('Could not create directory: ' + str(FLAGS.train_dir))

  # Prefix shared by the three files
  prefix = FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers)

  gau_filepath = (prefix + '_student_votes_sigma1:' + str(FLAGS.sigma1) +
                  '_sigma2:' + str(FLAGS.sigma2) + '.npy')

  # Prepare filepath for numpy dump of clean votes
  filepath = (prefix + '_student_clean_votes_label_shift' +
              str(FLAGS.lap_scale) + '.npy')

  # Prepare filepath for numpy dump of clean labels
  filepath_labels = (prefix + '_teachers_labels_' +
                     str(FLAGS.lap_scale) + '.npy')

  return gau_filepath, filepath, filepath_labels
Exemplo n.º 7
0
def obtain_weight(knock, student_data, nb_teacher):
    """
    Build a shifted student dataset and compute per-class weights from the
    teacher ensemble's predictions.

    :param knock: if False the shift is produced by ``dir_shift``, otherwise
                  by ``shift_student``
    :param student_data: forwarded to ``predict_data`` as its ``dataset``
                         argument
    :param nb_teacher: number of teachers, forwarded to ``predict_data``
    :return: tuple (shift_dataset, inverse_w); inverse_w has one entry per
             class
    :raises AssertionError: if the training directory cannot be created
    """
    # Called outside of ``assert`` so it still runs under ``python -O``.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('Could not create directory: ' +
                             str(FLAGS.train_dir))

    # Ensemble predictions and labels on the teacher split
    _, teacher_pred, teacher_test = predict_data(student_data,
                                                 nb_teacher,
                                                 teacher=True)
    # Data, ensemble predictions and labels on the student split
    stdnt_data, stdnt_pred, stdnt_test = predict_data(student_data,
                                                      nb_teacher,
                                                      teacher=False)
    if knock == False:
        shift_dataset = dir_shift(stdnt_data, stdnt_pred, stdnt_test, 0.1)
    else:
        shift_dataset = shift_student(stdnt_data, stdnt_pred, stdnt_test)

    # Students' predictions after the shift
    stdnt_pred = shift_dataset['pred']

    # Classes are taken to be 0 .. max label of the unshifted student split
    num_class = np.max(stdnt_test) + 1

    # mu is the fraction of shifted student samples predicted as each class
    mu = np.array([np.sum(stdnt_pred == ind) for ind in range(num_class)],
                  dtype=float)
    mu = mu / len(stdnt_pred)

    # cov[p, t] is the fraction of teacher samples predicted p with label t
    cov = np.zeros([num_class, num_class])
    for pred, true in zip(teacher_pred, teacher_test):
        cov[pred, true] += 1
    cov = cov / len(teacher_test)

    # Element-wise 1 / mu; a class that is never predicted yields inf here.
    inverse_mu = np.reciprocal(mu)
    inverse_w = np.dot(cov, inverse_mu)
    return shift_dataset, inverse_w
Exemplo n.º 8
0
def train_student(dataset, nb_teachers):
    """
    Train a student on data labelled by the teacher ensemble and print its
    accuracy on the held-out part of the student data.

    :param dataset: string corresponding to mnist, cifar10, or svhn
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :return: True if student training went well
    :raises AssertionError: if the training directory cannot be created or
                            ``deep_cnn.train`` reports failure
    """
    # Called outside of ``assert`` so it still runs under ``python -O``.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('Could not create directory: ' +
                             str(FLAGS.train_dir))

    # Call helper function to prepare student data using teacher predictions
    stdnt_dataset = prepare_student_data(dataset, nb_teachers, save=True)

    # Unpack the student dataset
    stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels = stdnt_dataset
    print('stdnt_test_data.shape', stdnt_test_data.shape)

    # Reshape the samples to the image layout used for each dataset; other
    # datasets are left untouched.
    image_shapes = {
        'cifar10': [-1, 32, 32, 3],
        'mnist': [-1, 28, 28, 1],
        'svhn': [-1, 32, 32, 3],
    }
    image_shape = image_shapes.get(dataset)
    if image_shape is not None:
        stdnt_data = stdnt_data.reshape(image_shape)
        stdnt_test_data = stdnt_test_data.reshape(image_shape)

    # Prepare checkpoint filename and path
    suffix = '_student_deeper.ckpt' if FLAGS.deeper else '_student.ckpt'
    ckpt_path = (FLAGS.train_dir + '/' + str(dataset) + '_' +
                 str(nb_teachers) + suffix)

    # Start student training (must run even when asserts are disabled)
    if not deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path):
        raise AssertionError('Student training failed for ' + ckpt_path)

    # Compute final checkpoint name for student (with max number of steps)
    ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

    # Compute student label predictions on remaining chunk of test set
    student_preds = deep_cnn.softmax_preds(stdnt_test_data, ckpt_path_final)

    # Compute student accuracy
    precision = metrics.accuracy(student_preds, stdnt_test_labels)
    print('Precision of student after training: ' + str(precision))

    return True
Exemplo n.º 9
0
def logistic(FLAGS):
    r"""
    Use logistic regression to learn the covariate shift between teacher and
    student data and return an importance weight for every student sample.

    Teacher samples are labelled 1 and student samples -1, so the classifier
    models p(z=1|x) = \frac{1}{1+e^{-f(x)}} with f(x) = coef . x + intercept.

    :param FLAGS: flag container; this function reads data, dataset and
                  train_dir from it
    :return: 1-D array with exp(f(x)) for every student sample
    :raises AssertionError: if the training directory cannot be created
    """
    teacher_file_name = FLAGS.data + 'PCA_teacher' + FLAGS.dataset + '.pkl'
    student_file_name = FLAGS.data + 'PCA_student' + FLAGS.dataset + '.pkl'
    # NOTE(review): pickle.load can execute code embedded in the file; only
    # point FLAGS.data at trusted files.
    with open(teacher_file_name, 'rb') as f:
        teacher = pickle.load(f)
    with open(student_file_name, 'rb') as f:
        student = pickle.load(f)

    # Called outside of ``assert`` so it still runs under ``python -O``.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('Could not create directory: ' +
                             str(FLAGS.train_dir))

    # Flatten every sample to a vector of 784 values
    student = student.reshape(-1, 784)
    teacher = teacher.reshape(-1, 784)

    # Append the domain label as a last column so that rows and labels are
    # shuffled together
    y_t = np.expand_dims(np.ones(teacher.shape[0]), axis=1)
    y_s = np.expand_dims(-np.ones(student.shape[0]), axis=1)
    dataset = np.concatenate((np.append(teacher, y_t, axis=1),
                              np.append(student, y_s, axis=1)), axis=0)
    np.random.shuffle(dataset)
    label = dataset[:, -1]
    dataset = dataset[:, :-1]

    clf = LogisticRegression(penalty='l2',
                             C=2,
                             solver='sag',
                             multi_class='ovr').fit(dataset, label)

    # importance weight = p(x)/q(x) = np.exp(f(x)).
    # The previous version multiplied the intercept by the student label
    # column (-1), which computed coef . x - intercept instead of f(x).
    logits = np.dot(student, clf.coef_.ravel()) + clf.intercept_[0]
    weight = np.exp(logits)
    return weight
Exemplo n.º 10
0
def predict_teacher(dataset, nb_teachers):
    """
    Return the aggregated teacher predictions for the teachers' own split of
    ``dataset`` together with the matching labels.

    The aggregated predictions are cached in a ``.npy`` file under
    FLAGS.data_dir and reloaded from there when the file already exists.

    :param dataset: string corresponding to svhn, cifar10, mnist or adult
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :return: tuple (pred_labels, test_labels), or False if ``dataset`` is
             not supported
    :raises AssertionError: if the training directory cannot be created
    """
    # Called outside of ``assert`` so it still runs under ``python -O``.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('Could not create directory: ' +
                             str(FLAGS.train_dir))

    # Every loader below is called with test_only=False, train_only=True
    test_only, train_only = False, True

    # Path of the file that caches the aggregated teacher predictions
    filepath = (FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers) +
                '_teacher_clean_votes_label_shift' + str(FLAGS.lap_scale) +
                '.npy')

    # Load the dataset
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only, train_only)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only, train_only)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only, train_only)
    elif dataset == 'adult':
        test_data, test_labels = input.ld_adult(test_only, train_only)
    else:
        print("Check value of dataset flag")
        return False

    # Reuse cached predictions when available
    if os.path.exists(filepath):
        pred_labels = np.load(filepath)
        return pred_labels, test_labels

    teachers_preds = ensemble_preds(dataset, nb_teachers, test_data)

    # Aggregate the teacher predictions into one label per sample
    pred_labels = aggregation.noisy_max(FLAGS.nb_teachers, teachers_preds, 0)
    utils.save_file(filepath, pred_labels)

    # Print accuracy of aggregated labels
    ac_ag_labels = metrics.accuracy(pred_labels, test_labels)
    print("obtain_weight Accuracy of the aggregated labels: " +
          str(ac_ag_labels))
    return pred_labels, test_labels
Exemplo n.º 11
0
def obtain_weight(knock, student_data, nb_teacher):
    """
    Build a shifted student dataset and compute per-class weights from the
    teacher ensemble's predictions.

    :param knock: if False the shift is produced by ``dir_shift``, otherwise
                  by ``shift_student``
    :param student_data: forwarded to ``predict_data`` as its ``dataset``
                         argument
    :param nb_teacher: number of teachers, forwarded to ``predict_data``
    :return: tuple (shift_dataset, inverse_w); inverse_w has one entry per
             class
    :raises AssertionError: if the training directory cannot be created
    """
    # Called outside of ``assert`` so it still runs under ``python -O``.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('Could not create directory: ' +
                             str(FLAGS.train_dir))

    # Data, ensemble predictions and labels on the student split
    stdnt_data, stdnt_pred, stdnt_test = predict_data(student_data,
                                                      nb_teacher,
                                                      teacher=False)
    if knock == False:
        shift_dataset = dir_shift(stdnt_data, stdnt_pred, stdnt_test, 0.1)
    else:
        shift_dataset = shift_student(stdnt_data, stdnt_pred, stdnt_test)

    # Ensemble predictions and labels on the teacher split
    _, teacher_pred, teacher_test = predict_data(student_data,
                                                 nb_teacher,
                                                 teacher=True)

    # Students' predictions after the shift
    stdnt_pred = shift_dataset['pred']

    # Classes are taken to be 0 .. max label of the unshifted student split
    num_class = np.max(stdnt_test) + 1

    # mu is the fraction of shifted student samples predicted as each class
    mu = np.array([np.sum(stdnt_pred == ind) for ind in range(num_class)],
                  dtype=float)
    mu = mu / len(stdnt_pred)

    # cov[p, t] is the fraction of teacher samples predicted p with label t
    cov = np.zeros([num_class, num_class])
    for pred, true in zip(teacher_pred, teacher_test):
        cov[pred, true] += 1
    cov = cov / len(teacher_test)

    # Element-wise 1 / mu; a class that is never predicted yields inf here.
    inverse_mu = np.reciprocal(mu)
    inverse_w = np.dot(cov, inverse_mu)
    return shift_dataset, inverse_w
Exemplo n.º 12
0
def predict_data(dataset, nb_teachers, teacher=False):
    """
    Load one split of ``dataset`` and label it with the aggregated
    predictions of the teacher ensemble.

    :param dataset: string corresponding to svhn, cifar10, mnist or adult
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param teacher: if true the loaders are called with train_only=True,
                    otherwise with test_only=True
    :return: tuple (test_data, pred_labels, test_labels), or False if
             ``dataset`` is not supported
    :raises AssertionError: if the training directory cannot be created
    """
    # Called outside of ``assert`` so it still runs under ``python -O``.
    if not input.create_dir_if_needed(FLAGS.train_dir):
        raise AssertionError('Could not create directory: ' +
                             str(FLAGS.train_dir))

    # Exactly one of the two loader switches is set
    train_only = bool(teacher)
    test_only = not train_only

    # Load the dataset
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only, train_only)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only, train_only)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only, train_only)
    elif dataset == 'adult':
        test_data, test_labels = input.ld_adult(test_only, train_only)
    else:
        print("Check value of dataset flag")
        return False

    teachers_preds = ensemble_preds(dataset, nb_teachers, test_data)

    # Aggregate the teacher predictions into one label per sample
    pred_labels = aggregation.noisy_max(FLAGS.nb_teachers, teachers_preds, 0)

    # Print accuracy of aggregated labels
    ac_ag_labels = metrics.accuracy(pred_labels, test_labels)
    print("obtain_weight Accuracy of the aggregated labels: " +
          str(ac_ag_labels))
    return test_data, pred_labels, test_labels
def train_student(dataset, nb_teachers):
  """
  Train a student on data labelled by the teacher ensemble and print its
  accuracy on the held-out part of the student data.

  :param dataset: string corresponding to mnist, cifar10, or svhn
  :param nb_teachers: number of teachers (in the ensemble) to learn from
  :return: True if student training went well
  :raises AssertionError: if the training directory cannot be created or
                          ``deep_cnn.train`` reports failure
  """
  # Called outside of ``assert`` so it still runs under ``python -O``.
  if not input.create_dir_if_needed(FLAGS.train_dir):
    raise AssertionError('Could not create directory: ' + str(FLAGS.train_dir))

  # Call helper function to prepare student data using teacher predictions
  stdnt_dataset = prepare_student_data(dataset, nb_teachers, save=True)

  # Unpack the student dataset
  stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels = stdnt_dataset

  # Prepare checkpoint filename and path
  suffix = '_student_deeper.ckpt' if FLAGS.deeper else '_student.ckpt'
  ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + str(nb_teachers) + suffix

  # Start student training (must run even when asserts are disabled)
  if not deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path):
    raise AssertionError('Student training failed for ' + ckpt_path)

  # Compute final checkpoint name for student (with max number of steps)
  ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

  # Compute student label predictions on remaining chunk of test set
  student_preds = deep_cnn.softmax_preds(stdnt_test_data, ckpt_path_final)

  # Compute student accuracy
  precision = metrics.accuracy(student_preds, stdnt_test_labels)
  print('Precision of student after training: ' + str(precision))

  return True
Exemplo n.º 14
0
def train_teacher(dataset, nb_teachers, teacher_id):
    """
    Train one teacher of the ensemble after passing the labels through
    ``disturb``, print statistics comparing the labels before and after, and
    print the teacher's accuracy and training losses.

    :param dataset: string naming the dataset ('svhn', 'cifar10' or 'mnist')
    :param nb_teachers: total number of teachers in the ensemble
    :param teacher_id: id of the teacher being trained
    :return: True once training and evaluation finished, False if ``dataset``
             is not one of the supported names
    :raises AssertionError: if a working directory cannot be created
    """
    import copy
    import operator

    # If working directories do not exist, create them. The calls are made
    # outside of an ``assert`` statement so they still run under ``python -O``.
    for directory in (FLAGS.data_dir, FLAGS.train_dir):
        if not input.create_dir_if_needed(directory):
            raise AssertionError('Could not create directory: ' +
                                 str(directory))
    print("teacher {}:".format(teacher_id))

    # Load the dataset
    if dataset == 'svhn':
        train_data, train_labels, test_data, test_labels = input.ld_svhn(
            extended=True)
    elif dataset == 'cifar10':
        train_data, train_labels, test_data, test_labels = input.ld_cifar10()
    elif dataset == 'mnist':
        train_data, train_labels, test_data, test_labels = input.ld_mnist()
    else:
        print("Check value of dataset flag")
        return False

    # Disturb the labels, keeping a copy of the original training labels for
    # the comparison printed below.
    train_labels1 = copy.copy(train_labels)
    train_labels2 = disturb(train_labels, 0.1)
    # NOTE(review): the return value is discarded, so this only affects the
    # evaluation if disturb modifies its argument in place - confirm.
    disturb(test_labels, 0.1)

    # Retrieve subset of data for this teacher
    data, labels = input.partition_dataset(train_data, train_labels,
                                           nb_teachers, teacher_id)

    # Compare the labels before and after the disturbance
    from pca import K_S
    print(operator.eq(train_labels1, train_labels2))
    print("干扰前: ", K_S.tst_norm(train_labels1))
    print("干扰后: ", K_S.tst_norm(train_labels2))
    print(K_S.tst_samp(train_labels1, train_labels2))

    print("Length of training data: " + str(len(labels)))

    # Define teacher checkpoint filename and full path
    suffix = '_deep.ckpt' if FLAGS.deeper else '.ckpt'
    filename = str(nb_teachers) + '_teachers_' + str(teacher_id) + suffix
    ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + filename

    # Perform teacher training; the returned value is printed as the losses
    losses = deep_cnn.train(data, labels, ckpt_path)

    # Append final step value to checkpoint for evaluation
    ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

    # Retrieve teacher probability estimates on the test data
    teacher_preds = deep_cnn.softmax_preds(test_data, ckpt_path_final)

    # Compute teacher accuracy
    precision = metrics.accuracy(teacher_preds, test_labels)
    print('Precision of teacher after training: ' + str(precision))
    print("each n step loss: ", losses)

    return True
def main(argv=None):  # pylint: disable=unused-argument
    """
    Train a baseline model once, then, for each of the first 50 test samples
    whose label differs from FLAGS.target_class, retrain on modified training
    data and hand the outcome to ``show_result``.

    :param argv: unused
    :return: True when the loop completes
    :raises AssertionError: if a working directory cannot be created or
                            ``dividing_line`` reports failure
    """
    # Create the directories used in this project. The calls are made outside
    # of ``assert`` so they still run under ``python -O``.
    for dir_path in (FLAGS.data_dir, FLAGS.train_dir, FLAGS.image_dir):
        if not input.create_dir_if_needed(dir_path):
            raise AssertionError('Could not create directory: ' +
                                 str(dir_path))

    # create log files and add dividing line
    if not dividing_line():
        raise AssertionError('dividing_line() reported failure')

    train_data, train_labels, test_data, test_labels = utils.ld_dataset(
        FLAGS.dataset, whitening=True)

    ckpt_dir = FLAGS.train_dir + '/' + str(FLAGS.dataset) + '/'

    # Baseline checkpoint. The original path used the loop variable
    # ``number`` before the loop existed; the baseline is trained only once,
    # so the path no longer depends on a sample index.
    # NOTE(review): confirm the intended baseline file name.
    ckpt_path = ckpt_dir + 'model.ckpt'
    ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

    # Train once on the unmodified data to record the baseline accuracy
    train_tuple = start_train_data(train_data, train_labels, test_data,
                                   test_labels, ckpt_path, ckpt_path_final)
    precision_tr, precision_ts, ppc_train, ppc_test, preds_tr = train_tuple

    nb_success, nb_fail = 0, 0

    for number in range(50):
        print('================current num: ', number)

        # Samples that already carry the target class are skipped
        if test_labels[number] == FLAGS.target_class:
            continue

        # The original chained assignment ``new_ckpt_path = ckpt_dir = ...``
        # overwrote ckpt_dir with a bare file name.
        new_ckpt_path = ckpt_dir + 'model_new.ckpt'
        new_ckpt_path_final = new_ckpt_path + '-' + str(FLAGS.max_steps - 1)

        perfect_path = ckpt_dir + str(number) + 'model_perfect.ckpt'
        perfect_path_final = perfect_path + '-' + str(FLAGS.max_steps - 1)

        x = test_data[number]

        directly_add_x0 = False
        if directly_add_x0:  # directly add x0 to training data
            # The original passed x_train/y_train before they were defined;
            # the unmodified training set is used as the starting point.
            x_train, y_train = get_tr_data_by_add_x_directly(
                nb_repeat=128,
                x=x,
                y=FLAGS.target_class,
                x_train=train_data,
                y_train=train_labels)
            train_tuple = start_train_data(x_train, y_train, test_data,
                                           test_labels, perfect_path,
                                           perfect_path_final)
            # NOTE(review): this branch produces no changed samples for
            # show_result; decide what should be passed here.
            changed_data = None

        else:  # add watermark
            watermark = x

            # watermark_x_grads is not defined in this function; it is
            # presumably a module-level switch - confirm.
            if watermark_x_grads:  # gradients as watermark from perfect_path_final
                grads_tuple = deep_cnn.get_gradient_of_x0(x,
                                                          perfect_path_final,
                                                          number,
                                                          test_labels[number],
                                                          new=True)
                grads_mat, grads_mat_plus, grads_mat_show = grads_tuple
                watermark = grads_mat

            # Get new training data. The original call was a SyntaxError
            # (positional argument after ``sml=False``) and used an undefined
            # ``target_label``.
            # NOTE(review): assumes ``sml`` is the fifth positional parameter
            # of get_tr_data_watermark and that the target label is
            # FLAGS.target_class - confirm against its definition.
            new_data_tuple = get_tr_data_watermark(
                train_data,
                train_labels,
                watermark,
                FLAGS.target_class,
                False,  # sml
                ckpt_path_final,
                cgd_ratio=FLAGS.changed_ratio,
                power=FLAGS.water_power)
            train_data_new, changed_data = new_data_tuple

            # Train with the new data into the new checkpoint. The original
            # retrained on the unmodified data and overwrote the baseline.
            train_tuple = start_train_data(train_data_new, train_labels,
                                           test_data, test_labels,
                                           new_ckpt_path,
                                           new_ckpt_path_final)

        precision_tr, precision_ts, ppc_train, ppc_test, preds_tr = train_tuple

        # show result (fixes the undefined names x0, ckpt_path_final_new and
        # Flags of the original call)
        nb_success, nb_fail = show_result(x,
                                          changed_data,
                                          ckpt_path_final,
                                          new_ckpt_path_final,
                                          nb_success,
                                          nb_fail,
                                          target_class=FLAGS.target_class)

    return True
Exemplo n.º 16
0
def cov_logistic(FLAGS):
    r"""
    Use logistic regression to learn the covariate shift between the teacher
    and the student data.

    Teacher rows are labelled 1 and student rows -1, modelling
    p(z=1|x) = \frac{1}{1+e^{-f(x)}}.  The importance weight returned is
    ``np.exp(np.dot(student, coeff.T))`` with ``coeff`` taken from
    ``cvx_objpert``.

    :param FLAGS: flag object; this function reads ``data``, ``dataset``,
                  ``train_dir``, ``eps_shift`` and ``delta_shift`` from it
    :return: the importance weights computed for the student rows
    """
    teacher_file_name = FLAGS.data + 'PCA_teacher' + FLAGS.dataset + '.pkl'
    student_file_name = FLAGS.data + 'PCA_student' + FLAGS.dataset + '.pkl'
    # NOTE: pickle executes arbitrary code on load; only point these paths at
    # trusted, locally generated files.
    with open(teacher_file_name, 'rb') as f:
        teacher = pickle.load(f)
    with open(student_file_name, 'rb') as f:
        student = pickle.load(f)
    assert input.create_dir_if_needed(FLAGS.train_dir)

    # dataset name -> (flattened feature size, number of PCA components)
    pca_settings = {'mnist': (784, 60), 'svhn': (3072, 70)}
    if FLAGS.dataset in pca_settings:
        flat_dim, n_components = pca_settings[FLAGS.dataset]
        teacher = teacher.reshape(-1, flat_dim)
        student = student['data'].reshape(-1, flat_dim)
        # Reduce dimension: each set is projected onto its OWN principal
        # components (plain dot product, i.e. without the mean-centering that
        # pca.transform would apply).
        pca = PCA(n_components=n_components)
        pca.fit(teacher)
        teacher = np.dot(teacher, pca.components_.T)
        pca.fit(student)
        student = np.dot(student, pca.components_.T)

    # Append the domain label as the last column so that features and labels
    # are shuffled together, then split them apart again.
    y_t = np.ones((teacher.shape[0], 1))
    y_s = -np.ones((student.shape[0], 1))
    teacher = np.append(teacher, y_t, axis=1)
    student = np.append(student, y_s, axis=1)
    dataset = np.concatenate((teacher, student), axis=0)
    np.random.shuffle(dataset)
    label = dataset[:, -1]
    dataset = dataset[:, :-1]

    coeff = cvx_objpert(dataset, label, FLAGS.eps_shift, FLAGS.delta_shift)
    ac = evaluation(dataset, coeff, label)
    print('accuracy of objpert={} eps ={} delta={}'.format(
        ac, FLAGS.eps_shift, FLAGS.delta_shift))

    # Non-private baseline, fitted only to report its score for comparison.
    clf = LogisticRegression(penalty='l2',
                             C=2,
                             solver='sag',
                             multi_class='ovr').fit(dataset, label)
    print('non private  predict score for covshift = {}'.format(
        clf.score(dataset, label)))

    # importance weight = p(x)/q(x) = np.exp(f(x))
    # NOTE(review): `student` still carries the appended -1 label column at
    # this point, so `coeff` must have one entry more than the feature count
    # (presumably a bias term) - confirm against cvx_objpert.
    weight = np.exp(np.dot(student, coeff.T))
    return weight
Exemplo n.º 17
0
def train_student(dataset,
                  nb_teachers,
                  weight=True,
                  inverse_w=None,
                  shift_dataset=None):
    """
    Train a student on labels produced by the teacher ensemble, then report
    the trained model's accuracy.

    :param dataset: dataset name; 'adult', 'mnist' and 'svhn' are supported
                    for the final evaluation on the private training data
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param weight: whether to train with importance weights
    :param inverse_w: source of the weights; used as-is when FLAGS.cov_shift
                      is set, otherwise indexed by each student label
    :param shift_dataset: optional shifted dataset, forwarded to
                          prepare_student_data as ``shift_data``
    :return: tuple ``(n, precision_t, precision_s)`` where ``n`` is the number
             of ground-truth labels used (or of student labels when none are
             available), ``precision_t`` the accuracy of the trained model on
             the private training data and ``precision_s`` its accuracy
             against the labels it was trained on
    :raises ValueError: if weights are requested without ``inverse_w`` in the
                        non-cov_shift mode, or if ``dataset`` has no private
                        data loader
    """
    assert input.create_dir_if_needed(FLAGS.train_dir)

    cov_shift = FLAGS.cov_shift == True
    use_pate2 = FLAGS.PATE2 == True

    # Call helper function to prepare student data using teacher predictions.
    # Only the PATE2 call without shift data also returns keep_idx.
    keep_idx = None
    if shift_dataset is not None:
        stdnt_data, stdnt_labels = prepare_student_data(
            dataset, nb_teachers, save=True, shift_data=shift_dataset)
    elif use_pate2:
        keep_idx, stdnt_data, stdnt_labels = prepare_student_data(
            dataset, nb_teachers, save=True)
    else:
        stdnt_data, stdnt_labels = prepare_student_data(dataset,
                                                        nb_teachers,
                                                        save=True)

    # Shuffle data and labels with the same seeded permutation.
    rng = np.random.RandomState(FLAGS.dataset_seed)
    rand_ix = rng.permutation(len(stdnt_labels))
    stdnt_data = stdnt_data[rand_ix]
    stdnt_labels = stdnt_labels[rand_ix]
    print('number for deep is {}'.format(len(stdnt_labels)))

    # Prepare checkpoint filename and path
    suffix = '_student_deeper.ckpt' if FLAGS.deeper else '_student.ckpt'
    ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + str(
        nb_teachers) + suffix

    if not cov_shift and shift_dataset is not None:
        print('len of shift data {}'.format(len(shift_dataset['data'])))

    # Start student training
    if weight == True:
        if cov_shift:
            weights = inverse_w
        else:
            if inverse_w is None:
                raise ValueError(
                    'inverse_w is required for weighted student training')
            # One weight per sample, looked up from the sample's label.
            weights = np.array(
                [np.float32(inverse_w[lbl]) for lbl in stdnt_labels],
                dtype=np.float64)
            print('len of weight={} len of labels= {} '.format(
                len(weights), len(stdnt_labels)))
        # NOTE(review): stdnt_data/stdnt_labels were permuted by rand_ix
        # above, whereas inverse_w and keep_idx are not - confirm that the
        # weights stay aligned with the shuffled samples.
        if use_pate2 and keep_idx is not None:
            weights = weights[keep_idx]
        assert deep_cnn.train(stdnt_data,
                              stdnt_labels,
                              ckpt_path,
                              weights=weights)
    else:
        deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path)

    # Compute final checkpoint name for student (with max number of steps)
    ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)
    if dataset == 'adult':
        private_data, private_labels = input.ld_adult(test_only=False,
                                                      train_only=True)
    elif dataset == 'mnist':
        private_data, private_labels = input.ld_mnist(test_only=False,
                                                      train_only=True)
    elif dataset == "svhn":
        private_data, private_labels = input.ld_svhn(test_only=False,
                                                     train_only=True)
    else:
        raise ValueError(
            'no private data loader for dataset {!r}'.format(dataset))

    # Predictions of the trained student on the private data and on its own
    # training data
    teacher_preds = deep_cnn.softmax_preds(private_data, ckpt_path_final)
    student_preds = deep_cnn.softmax_preds(stdnt_data, ckpt_path_final)
    precision_t = metrics.accuracy(teacher_preds, private_labels)
    precision_s = metrics.accuracy(student_preds, stdnt_labels)

    # Ground-truth student labels are only read in cov_shift mode.
    test_labels = None
    if cov_shift:
        student_file_name = FLAGS.data + 'PCA_student' + FLAGS.dataset + '.pkl'
        # NOTE: pickle executes arbitrary code on load; trusted files only.
        with open(student_file_name, 'rb') as f:
            test = pickle.load(f)
        if use_pate2 and keep_idx is not None:
            test_labels = test['label'][keep_idx]
        else:
            test_labels = test['label']

    if test_labels is not None:
        # NOTE(review): student_preds follow the shuffled order while
        # test_labels keep the file order - confirm alignment.
        precision_true = metrics.accuracy(student_preds, test_labels)
        nb_labels = len(test_labels)
    else:
        precision_true = None
        nb_labels = len(stdnt_labels)
    print(
        'Precision of teacher after training:{} student={} true precision for student {}'
        .format(precision_t, precision_s, precision_true))

    return nb_labels, precision_t, precision_s
Exemplo n.º 18
0
def train_student(dataset,
                  nb_teachers,
                  knock,
                  weight=True,
                  inverse_w=None,
                  shift_dataset=None):
    """
    Train a student on labels produced by the teacher ensemble, optionally
    weighting every sample by ``inverse_w[label]``, and print its accuracy.

    :param dataset: string corresponding to mnist, cifar10, or svhn
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param knock: selects which shift parameter is printed at the end:
                  ``shift_dataset['shift_ratio']`` when true, otherwise
                  ``shift_dataset['alpha']``
    :param weight: whether to train with per-sample weights
    :param inverse_w: per-label weights, indexed by student label; required
                      when ``weight`` is true
    :param shift_dataset: shifted dataset forwarded to prepare_student_data
                          as ``shift_data``
    :return: True if student training went well
    :raises ValueError: if ``weight`` is true and ``inverse_w`` is None
    """
    assert input.create_dir_if_needed(FLAGS.train_dir)
    if shift_dataset is not None:
        print('len of shift data {}'.format(len(shift_dataset['data'])))

    # Call helper function to prepare student data using teacher predictions;
    # stdnt_labels are already the ensemble's noisy labels.
    stdnt_data, stdnt_labels = prepare_student_data(dataset,
                                                    nb_teachers,
                                                    save=True,
                                                    shift_data=shift_dataset)

    # Prepare checkpoint filename and path
    suffix = '_student_deeper.ckpt' if FLAGS.deeper else '_student.ckpt'
    ckpt_path = FLAGS.train_dir + '/' + str(dataset) + '_' + str(
        nb_teachers) + suffix

    # Start student training
    if weight == True:
        if inverse_w is None:
            raise ValueError(
                'inverse_w is required for weighted student training')
        # One weight per sample, looked up from the sample's label.
        weights = np.array(
            [np.float32(inverse_w[stdnt_labels[i]])
             for i in range(len(stdnt_data))],
            dtype=np.float64)
        print('len of weight={} len of labels= {} '.format(
            len(weights), len(stdnt_labels)))
        assert deep_cnn.train(stdnt_data,
                              stdnt_labels,
                              ckpt_path,
                              weights=weights)
    else:
        deep_cnn.train(stdnt_data, stdnt_labels, ckpt_path)

    # Compute final checkpoint name for student (with max number of steps)
    ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)
    private_data, private_labels = input.ld_mnist(test_only=False,
                                                  train_only=True)

    # Predictions of the trained student on the private MNIST training data
    # and on its own training data
    teacher_preds = deep_cnn.softmax_preds(private_data, ckpt_path_final)
    student_preds = deep_cnn.softmax_preds(stdnt_data, ckpt_path_final)
    precision_t = metrics.accuracy(teacher_preds, private_labels)
    precision_s = metrics.accuracy(student_preds, stdnt_labels)

    # Both modes print the same message; only the reported key differs.
    shift_key = 'shift_ratio' if knock == True else 'alpha'
    shift_value = None if shift_dataset is None else shift_dataset[shift_key]
    print(
        'weight is {} shift_ratio={} Precision of teacher after training:{} student={}'
        .format(weight, shift_value, precision_t, precision_s))

    return True
Exemplo n.º 19
0
def prepare_student_data(dataset, nb_teachers, save=False):
    """
    Build the student's training set from the first FLAGS.stdnt_share test
    points of `dataset`, labelled by the noisy teacher ensemble, together with
    a held-out test set taken from the MNIST test split.

    :param dataset: one of 'svhn', 'cifar10', 'mnist' or 'digit'
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param save: when True, also dump the clean votes, the teachers' labels
                 and the noisy student labels as npy files in FLAGS.data_dir
    :return: (student data, student labels, student test data, student test
             labels), or False when `dataset` is not recognised
    """
    assert input.create_dir_if_needed(FLAGS.train_dir)

    def _dump_npy(tag, array):
        # Write `array` to <data_dir>/<dataset>_<nb_teachers><tag><lap>.npy
        target = FLAGS.data_dir + "/" + str(dataset) + '_' + str(
            nb_teachers) + tag + str(FLAGS.lap_scale) + '.npy'
        with tf.gfile.Open(target, mode='w') as file_obj:
            np.save(file_obj, array)

    # Load the test split of the requested dataset
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only=True)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only=True)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only=True)
    elif dataset == 'digit':
        test_data, test_labels = input.ld_digit_test(test_name=FLAGS.test_name,
                                                     num=2000)
    else:
        print("Check value of dataset flag")
        return False

    # Some data must remain after the student's share is taken
    assert FLAGS.stdnt_share < len(test_data)

    # Unlabeled student training data (prefix of the test set)
    if (FLAGS.d_stu > -1):
        # Crop the centre 28x28 window, then collapse to a single channel
        cropped = test_data[:FLAGS.stdnt_share, 2:30, 2:30, :]
        if (FLAGS.d_stu == 3):
            # weighted grey-scale mix of the three colour channels
            chan0 = cropped[:, :, :, 0]
            chan1 = cropped[:, :, :, 1]
            chan2 = cropped[:, :, :, 2]
            stdnt_data = 0.2125 * chan0 + 0.7154 * chan1 + 0.0721 * chan2
        else:
            stdnt_data = cropped[:, :, :, FLAGS.d_stu]
        stdnt_data = stdnt_data.reshape((-1, 28, 28, 1))
    else:
        stdnt_data = test_data[:FLAGS.stdnt_share]

    # Ask every teacher for its prediction on the student data
    teachers_preds = ensemble_preds(dataset, nb_teachers, stdnt_data)

    # Noisy aggregation of the votes yields the student labels
    if save:
        # Also request the clean votes and the teachers' labels, and dump them
        aggregated = aggregation.noisy_max(teachers_preds,
                                           FLAGS.lap_scale,
                                           return_clean_votes=True)
        stdnt_labels, clean_votes, labels_for_dump = aggregated
        _dump_npy('_student_clean_votes_lap_', clean_votes)
        _dump_npy('_teachers_labels_lap_', labels_for_dump)
    else:
        stdnt_labels = aggregation.noisy_max(teachers_preds, FLAGS.lap_scale)

    # Report how well the aggregated labels match the true ones
    ac_ag_labels = metrics.accuracy(stdnt_labels,
                                    test_labels[:FLAGS.stdnt_share])
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))

    # The held-out test set always comes from the MNIST test split
    if FLAGS.dataset_teacher == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only=True)
    else:
        assert 0 == 1, "Non implemented error: dataset_teacher not equals to mnist"

    stdnt_test_data = test_data[FLAGS.stdnt_share:]
    stdnt_test_labels = test_labels[FLAGS.stdnt_share:]

    if save:
        # Dump the noisy labels the student will be trained on
        _dump_npy('_student_labels_lap_', stdnt_labels)

    return stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels
Exemplo n.º 20
0
def prepare_student_data(dataset, nb_teachers, save=False):
    """
    Label the student's queries with a subsampled noisy k-nearest-neighbour
    vote over the private training data and split off a test set.

    :param dataset: string corresponding to mnist, cifar10, or svhn
    :param nb_teachers: number of teachers; only used in the dump filenames
    :param save: if set to True, dump the clean votes, the teachers' labels
                 and the noisy student labels as npy files in FLAGS.data_dir
    :return: (student data, student labels, student test data, student test
             labels), or False when `dataset` is not recognised
    """
    assert input.create_dir_if_needed(FLAGS.train_dir)

    def _dump_npy(tag, array):
        # Write `array` to <data_dir>/<dataset>_<nb_teachers><tag><gau>.npy
        target = FLAGS.data_dir + "/" + str(dataset) + '_' + str(
            nb_teachers) + tag + str(FLAGS.gau_scale) + '.npy'
        with tf.gfile.Open(target, mode='w') as file_obj:
            np.save(file_obj, array)

    # Load the dataset and remember the flattened feature size
    if dataset == 'svhn':
        loaded = input.ld_svhn(extended=True)
        flat_dim = 32 * 32 * 3
    elif dataset == 'cifar10':
        loaded = input.ld_cifar10()
        flat_dim = 32 * 32 * 3
    elif dataset == 'mnist':
        loaded = input.ld_mnist()
        flat_dim = 28 * 28
    else:
        print("Check value of dataset flag")
        return False
    train_data, train_labels, test_data, test_labels = loaded
    train_data = np.reshape(train_data, [-1, flat_dim])
    test_data = test_data.reshape([-1, flat_dim])

    # If FLAGS.extra > 0, the first FLAGS.extra points are moved from the
    # private (training) set to the student (test) set.
    if FLAGS.extra > 0:
        moved_data = train_data[:FLAGS.extra]
        moved_labels = train_labels[:FLAGS.extra]
        test_data = np.vstack((test_data, moved_data))
        test_labels = np.concatenate((test_labels, moved_labels))
        train_data = train_data[FLAGS.extra:]
        train_labels = train_labels[FLAGS.extra:]

    # Keep the original test features, since PCA may replace test_data below.
    ori_test_data = test_data

    # With FLAGS.vat set, a saved '...ckpt-2000.npy' file holds VAT's
    # predictions for the student queries; reuse it instead of querying.
    vat_record = 'record/svhn_model.ckpt-2000.npy'
    if FLAGS.vat == True and os.path.exists(vat_record):
        vat_labels = np.array(np.load(vat_record), dtype=np.int32)
        print('vat_label.shape', vat_labels.shape)
        return (ori_test_data[:-1000], vat_labels, ori_test_data[-1000:],
                test_labels[-1000:])

    if FLAGS.pca == True:
        train_data, test_data = pca(train_data, test_data)

    stdnt_data = test_data[:FLAGS.stdnt_share]
    assert FLAGS.stdnt_share < len(test_data)

    # Answer every student query: subsample prob * num_train private points,
    # take the FLAGS.nb_teachers nearest ones (Euclidean distance) and record
    # their labels.  Each query is composed into the privacy accountant.
    num_train = train_data.shape[0]
    teachers_preds = np.zeros([stdnt_data.shape[0], FLAGS.nb_teachers])

    for idx, query_data in enumerate(stdnt_data):
        if idx % 100 == 0:
            print('idx=', idx)
        select_teacher = np.random.choice(num_train, int(prob * num_train))
        dis = np.linalg.norm(train_data[select_teacher] - query_data, axis=1)
        nearest = np.argsort(dis)[:FLAGS.nb_teachers]
        teachers_preds[idx] = train_labels[select_teacher[nearest]]
        acct.compose_poisson_subsampled_mechanisms(gaussian, prob, coeff=1)

    # Report the accumulated privacy loss
    print("Composition of student  subsampled Gaussian mechanisms gives ",
          (acct.get_eps(delta), delta))
    teachers_preds = np.asarray(teachers_preds, dtype=np.int32)

    if save:
        # Request the clean votes and the teachers' labels too, and dump them
        aggregated = aggregation.aggregation_knn(teachers_preds,
                                                 sigma,
                                                 return_clean_votes=True)
        stdnt_labels, clean_votes, labels_for_dump = aggregated
        _dump_npy('_student_clean_votes_gau_', clean_votes)
        _dump_npy('_teachers_labels_gau_', labels_for_dump)
    else:
        stdnt_labels = aggregation.aggregation_knn(teachers_preds, sigma)

    ac_ag_labels = metrics.accuracy(stdnt_labels,
                                    test_labels[:FLAGS.stdnt_share])
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))

    # Split for semi-supervised training (VAT): part A is the first
    # FLAGS.stdnt_share queries answered above, part B is
    # test_data[FLAGS.stdnt_share:-1000] used as unlabeled features, part C
    # is the last 1k points kept for testing.  Ignore convert_vat when VAT is
    # not used.
    convert_vat(ori_test_data, test_labels, stdnt_labels)

    stdnt_test_data = ori_test_data[-1000:]
    stdnt_test_labels = test_labels[-1000:]

    if save:
        # Dump the noisy labels the student will be trained on
        _dump_npy('_student_labels_lap_', stdnt_labels)

    return (ori_test_data[:FLAGS.stdnt_share], stdnt_labels, stdnt_test_data,
            stdnt_test_labels)
Exemplo n.º 21
0
def train_student(nb_teachers):
    """
    Train a student autoencoder on the noisy aggregate of the teacher
    ensemble's predictions and save its weights.

    The model, optimizer and training settings are read from ``dr.config``;
    the state dict is written to ``<train_dir>/model_<nb_teachers>_student.last``.

    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :return: True if student training went well
    :raises ValueError: if ``dr.config['optimizer']`` is not one of
                        'adam', 'adagrad', 'momentum' or 'rmsprop'
    """
    assert input.create_dir_if_needed(train_dir)

    dr.load_maps()
    dr.load_train_data_layer()
    # Author's note: because of memory problems it is recommended to run the
    # prediction and the aggregation steps one at a time, persisting each
    # result (e.g. to a memmap file) and reloading it in the next run.
    predictions = ensemble_preds(nb_teachers)
    labels = nagg.noisy_max(predictions, lap_scale)

    # Prepare checkpoint filename and path
    model_path = train_dir + '/' 'model_' + str(
        nb_teachers) + '_student.last'  # NOLINT(long-line)

    rencoder = model.AutoEncoder(
        layer_sizes=[dr.stud_train_data_layer._vector_dim] +
        [int(l) for l in dr.config['hidden_layers'].split(',')],
        nl_type=dr.config['non_linearity_type'],
        is_constrained=dr.config['constrained'],
        dp_drop_prob=dr.config['drop_prob'],
        last_layer_activations=dr.config['skip_last_layer_nl'])

    gpu_ids = [int(g) for g in dr.config['gpu_ids'].split(',')]
    print('Using GPUs: {}'.format(gpu_ids))
    if len(gpu_ids) > 1:
        rencoder = nn.DataParallel(rencoder, device_ids=gpu_ids)

    if dr.use_gpu:
        rencoder = rencoder.cuda()

    # Only the momentum (SGD) optimizer is paired with an LR scheduler.
    scheduler = None
    optimizer_kind = dr.config['optimizer']
    if optimizer_kind == "adam":
        optimizer = optim.Adam(rencoder.parameters(),
                               lr=dr.config['lr'],
                               weight_decay=dr.config['weight_decay'])
    elif optimizer_kind == "adagrad":
        optimizer = optim.Adagrad(rencoder.parameters(),
                                  lr=dr.config['lr'],
                                  weight_decay=dr.config['weight_decay'])
    elif optimizer_kind == "momentum":
        optimizer = optim.SGD(rencoder.parameters(),
                              lr=dr.config['lr'],
                              momentum=0.9,
                              weight_decay=dr.config['weight_decay'])
        scheduler = MultiStepLR(optimizer,
                                milestones=[24, 36, 48, 66, 72],
                                gamma=0.5)
    elif optimizer_kind == "rmsprop":
        optimizer = optim.RMSprop(rencoder.parameters(),
                                  lr=dr.config['lr'],
                                  momentum=0.9,
                                  weight_decay=dr.config['weight_decay'])
    else:
        raise ValueError('Unknown optimizer kind')

    # Dropout applied to the re-fed outputs during the augmentation steps.
    dp = None
    if dr.config['noise_prob'] > 0.0:
        dp = nn.Dropout(p=dr.config['noise_prob'])

    # Start student training
    for epoch in range(dr.config['num_epochs']):
        print('Doing epoch {} of {}'.format(epoch, dr.config['num_epochs']))
        e_start_time = time.time()
        rencoder.train()
        total_epoch_loss = 0.0
        denom = 0.0
        # NOTE(review): the scheduler is stepped at the START of the epoch.
        # Recent PyTorch releases expect scheduler.step() after
        # optimizer.step(); confirm against the PyTorch version in use before
        # moving it, since that shifts the milestone epochs.
        if scheduler is not None:
            scheduler.step()

        num_batches = int(len(labels) / dr.config['batch_size'])
        for i, (mb, new_labels) in enumerate(
                iterate_one_epoch(dr.stud_train_data_layer, labels)):
            if i % 100 == 0:
                print("batch %s out of %s" % (i, num_batches))
            inputs = Variable(
                mb.cuda().to_dense() if dr.use_gpu else mb.to_dense())
            # The aggregated teacher labels are the regression target.
            consensus = Variable(
                new_labels.cuda() if dr.use_gpu else new_labels)
            optimizer.zero_grad()
            outputs = rencoder(inputs)
            loss, num_ratings = model.MSEloss(outputs, consensus)
            loss = loss / num_ratings
            loss.backward()
            optimizer.step()

            total_epoch_loss += loss.item()
            denom += 1

            if dr.config['aug_step'] > 0:
                # Data augmentation: feed the model's own (optionally
                # noised) output back in as both input and target.
                for _ in range(dr.config['aug_step']):
                    inputs = Variable(outputs.data)
                    if dp is not None:
                        inputs = dp(inputs)
                    optimizer.zero_grad()
                    outputs = rencoder(inputs)
                    loss, num_ratings = model.MSEloss(outputs, inputs)
                    loss = loss / num_ratings
                    loss.backward()
                    optimizer.step()
        e_end_time = time.time()
        # An epoch without any batch has no defined loss; report NaN instead
        # of dividing by zero.
        epoch_rmse = sqrt(total_epoch_loss / denom) if denom else float('nan')
        print(
            'Total epoch {} finished in {} seconds with TRAINING RMSE loss: {}'
            .format(epoch, e_end_time - e_start_time, epoch_rmse))

    torch.save(rencoder.state_dict(), model_path)
    print("STUDENT TRAINED")

    return True
Exemplo n.º 22
0
def main(argv=None):  # pylint: disable=unused-argument
    """
    Train a baseline model on the clean data, then for each of the first 50
    test points (skipping those already labelled FLAGS.target_class) build a
    modified training set targeting FLAGS.target_class and train a new model
    on it under a per-point checkpoint name.

    :param argv: unused
    """
    # create dir used in this project
    for dir_path in (FLAGS.data_dir, FLAGS.train_dir, FLAGS.image_dir):
        assert input.create_dir_if_needed(dir_path)

    # create log files and add dividing line
    assert dividing_line()

    train_data, train_labels, test_data, test_labels = utils.ld_dataset(
        FLAGS.dataset, whitening=True)

    ckpt_path = FLAGS.train_dir + '/' + str(
        FLAGS.dataset) + '_' + 'train_data.ckpt'
    ckpt_path_final = ckpt_path + '-' + str(FLAGS.max_steps - 1)

    # Before any watermark is added, train once on the clean data and store
    # the model so its accuracy is known (only needs to be trained once).
    train_tuple = start_train_data(train_data, train_labels, test_data,
                                   test_labels, ckpt_path, ckpt_path_final)
    precision_tr, precision_ts, ppc_train, ppc_test, preds_tr = train_tuple

    # Never index past the end of the test set.
    for number in range(min(50, len(test_labels))):
        print('================current num: ', number)

        if test_labels[number] == FLAGS.target_class:
            continue

        # The test point this iteration works on.
        x0 = test_data[number]

        # Each iteration starts from the clean training set; the original
        # code read x_train / y_train before they were ever assigned.
        directly_add_x0 = False
        if directly_add_x0:  # directly add x0 to training data
            x_train, y_train = get_tr_data_by_add_x_directly(
                nb_repeat=128,
                x=x0,
                y=FLAGS.target_class,
                x_train=train_data,
                y_train=train_labels)
        else:
            # Default watermark is x0 itself, replaced by the gradient when
            # watermark_x_grads is set.
            x = x0
            if watermark_x_grads:
                # saliency map of old model wrt x0
                x = deep_cnn.get_gradient_of_x0(x0,
                                                ckpt_path_final,
                                                number,
                                                test_labels[number],
                                                new=False)

            x_train, y_train = get_tr_data_by_watermark(train_data,
                                                        train_labels,
                                                        x,
                                                        y=FLAGS.target_class,
                                                        sml=sml)

        # NOTE(review): this retrains the baseline on the clean data in every
        # iteration although the note above says once is enough - confirm
        # whether it can be removed.
        train_tuple = start_train_data(train_data, train_labels, test_data,
                                       test_labels, ckpt_path, ckpt_path_final)
        precision_tr, precision_ts, ppc_train, ppc_test, preds_tr = train_tuple

        # NOTE(review): called without arguments here - confirm against
        # show_result's signature.
        show_result()

        # save model to NEW path
        new_ckpt_path = FLAGS.train_dir + '/' + str(
            FLAGS.dataset) + '_' + str(number) + 'train_new_data.ckpt'
        new_ckpt_path_final = new_ckpt_path + '-' + str(FLAGS.max_steps - 1)

        # Train on the modified training set built above.
        train_tuple = start_train_data(x_train, y_train, test_data,
                                       test_labels, new_ckpt_path,
                                       new_ckpt_path_final)
Exemplo n.º 23
0
def prepare_student_data(dataset, nb_teachers, save=False):
  """
  Build the student's training and evaluation sets from a dataset's test split.

  The first FLAGS.stdnt_share test samples are labeled by aggregating the
  teacher ensemble's predictions with aggregation.noisy_max (scale
  FLAGS.lap_scale). The remaining test samples keep their true labels and are
  returned so the student can be evaluated on them afterwards.
  :param dataset: string, one of 'svhn', 'cifar10' or 'mnist'
  :param nb_teachers: number of teachers in the ensemble that is queried
  :param save: if True, the clean votes, the teachers' labels and the noisy
               student labels are each dumped as .npy files in FLAGS.data_dir
  :return: (stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels), or
           False when the dataset name is not recognized
  """
  assert input.create_dir_if_needed(FLAGS.train_dir)

  # Reject unknown dataset names before touching any loader
  if dataset not in ('svhn', 'cifar10', 'mnist'):
    print("Check value of dataset flag")
    return False

  # Only the test split is needed to prepare student data
  if dataset == 'svhn':
    test_split = input.ld_svhn(test_only=True)
  elif dataset == 'cifar10':
    test_split = input.ld_cifar10(test_only=True)
  else:
    test_split = input.ld_mnist(test_only=True)
  test_data, test_labels = test_split

  # Part of the test set must remain available for evaluating the student
  share = FLAGS.stdnt_share
  assert share < len(test_data)

  def npy_path(tag):
    # <data_dir>/<dataset>_<nb_teachers><tag><lap_scale>.npy
    return FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers) + tag + str(FLAGS.lap_scale) + '.npy'  # NOLINT(long-line)

  def dump_array(path, array):
    with gfile.Open(path, mode='w') as file_obj:
      np.save(file_obj, array)

  # Unlabeled student training samples and the teachers' predictions on them
  stdnt_data = test_data[:share]
  teachers_preds = ensemble_preds(dataset, nb_teachers, stdnt_data)

  # Turn the teachers' predictions into student training labels
  if save:
    # The clean votes and clean labels are requested too so they can be dumped
    stdnt_labels, clean_votes, labels_for_dump = aggregation.noisy_max(teachers_preds, FLAGS.lap_scale, return_clean_votes=True)  # NOLINT(long-line)

    votes_path = npy_path('_student_clean_votes_lap_')
    teachers_labels_path = npy_path('_teachers_labels_lap_')
    dump_array(votes_path, clean_votes)
    dump_array(teachers_labels_path, labels_for_dump)
  else:
    stdnt_labels = aggregation.noisy_max(teachers_preds, FLAGS.lap_scale)

  # Report how well the aggregated labels match the true test labels
  ac_ag_labels = metrics.accuracy(stdnt_labels, test_labels[:share])
  print("Accuracy of the aggregated labels: " + str(ac_ag_labels))

  # Everything past the student share becomes the student's test set
  stdnt_test_data = test_data[share:]
  stdnt_test_labels = test_labels[share:]

  if save:
    # Labels produced by the noisy aggregation
    dump_array(npy_path('_student_labels_lap_'), stdnt_labels)

  return stdnt_data, stdnt_labels, stdnt_test_data, stdnt_test_labels
Exemplo n.º 24
0
def prepare_student_data(dataset, nb_teachers, save=False, shift_data=None):
    """
    Label student data with the teacher ensemble and return it for training.

    The student samples are the dataset's test split, replaced by the pickled
    'PCA_student' data when FLAGS.cov_shift is set, and in turn replaced by
    `shift_data` when that is given. The samples are passed to ensemble_preds
    and the teachers' predictions are aggregated with aggregation.noisy_max.
    When FLAGS.PATE2 is set, the clean votes are additionally passed to
    gaussian(), which yields the indices of the samples to keep and their
    labels.

    :param dataset: string corresponding to mnist, cifar10, svhn or adult
    :param nb_teachers: number of teachers (in the ensemble) to learn from
    :param save: if set to True, dumps the clean votes and the labels assigned
                 by teachers as npy files under FLAGS.data_dir; outside the
                 PATE2 path it also dumps the noisy student labels
    :param shift_data: optional mapping with 'data' and 'label' entries; when
                       given, it replaces the student samples and the labels
                       used for the accuracy report
    :return: False if the dataset name is not recognized;
             (keep_idx, stdnt_data[keep_idx], result) when FLAGS.PATE2 is set;
             (stdnt_data, stdnt_labels) otherwise
    """
    # Load the test split of the requested dataset
    if dataset == 'svhn':
        test_data, test_labels = input.ld_svhn(test_only=True)
    elif dataset == 'cifar10':
        test_data, test_labels = input.ld_cifar10(test_only=True)
    elif dataset == 'mnist':
        test_data, test_labels = input.ld_mnist(test_only=True)
    elif dataset == 'adult':
        test_data, test_labels = input.ld_adult(test_only=True)
    else:
        print("Check value of dataset flag")
        return False

    if FLAGS.cov_shift == True:
        student_file_name = FLAGS.data + 'PCA_student' + FLAGS.dataset + '.pkl'
        # NOTE(security): pickle.load can execute arbitrary code; this file
        # must come from a trusted source.
        with open(student_file_name, 'rb') as f:
            test = pickle.load(f)
        test_data = test['data']
        test_labels = test['label']

    # Prepare [unlabeled] student training data (the whole test set here)
    stdnt_data = test_data

    assert input.create_dir_if_needed(FLAGS.train_dir)

    # Sanity check kept from the original: the configured student share must
    # be smaller than the amount of available data
    assert FLAGS.stdnt_share < len(test_data)

    if shift_data is not None:
        # Replace original student data (and reference labels) with shift data
        stdnt_data = shift_data['data']
        test_labels = shift_data['label']
        print('*** length of shift_data {} lable length={}********'.format(
            len(stdnt_data), len(test_labels)))

    # Compute teacher predictions for student training data
    teachers_preds = ensemble_preds(dataset, nb_teachers, stdnt_data)

    use_gaussian = FLAGS.PATE2 == True
    path_prefix = FLAGS.data_dir + "/" + str(dataset) + '_' + str(nb_teachers)

    # Aggregate teacher predictions to get student training labels. The clean
    # votes are needed both for dumping (save) and as input to gaussian()
    # (PATE2), so they are requested in either case; previously PATE2 without
    # save crashed on unbound keep_idx/result.
    if save or use_gaussian:
        stdnt_labels, clean_votes, labels_for_dump = aggregation.noisy_max(
            FLAGS.nb_labels,
            teachers_preds,
            FLAGS.lap_scale,
            return_clean_votes=True)
    else:
        # NOTE(review): this call omits the leading FLAGS.nb_labels argument
        # that the call above passes; confirm against the signature of
        # aggregation.noisy_max which of the two forms is intended.
        stdnt_labels = aggregation.noisy_max(teachers_preds, FLAGS.lap_scale)

    if save:
        # Dump clean_votes array
        filepath = path_prefix + '_student_clean_votes' + str(
            FLAGS.lap_scale) + '.npy'
        with tf.gfile.Open(filepath, mode='w') as file_obj:
            np.save(file_obj, clean_votes)

        # Dump labels_for_dump array (labels assigned by teachers)
        filepath_labels = path_prefix + '_teachers_labels_lap_' + str(
            FLAGS.lap_scale) + '.npy'
        with tf.gfile.Open(filepath_labels, mode='w') as file_obj:
            np.save(file_obj, labels_for_dump)

    if use_gaussian:
        keep_idx, result = gaussian(FLAGS.nb_labels, clean_votes)

        gau_filepath = path_prefix + '_student_votes_sigma1:' + str(
            FLAGS.sigma1) + '_sigma2:' + str(FLAGS.sigma2) + '.npy'
        with tf.gfile.Open(gau_filepath, mode='w') as file_obj:
            np.save(file_obj, result)

        # Accuracy is measured only on the samples kept by gaussian()
        ac_ag_labels = metrics.accuracy(result, test_labels[keep_idx])
        print(
            "number of gaussian student {}  Accuracy of the aggregated labels:{} "
            .format(len(result), ac_ag_labels))
        return keep_idx, stdnt_data[keep_idx], result

    # Print accuracy of aggregated labels
    ac_ag_labels = metrics.accuracy(stdnt_labels, test_labels)
    print("Accuracy of the aggregated labels: " + str(ac_ag_labels))

    if save:
        # Dump labels produced by noisy aggregation
        filepath = path_prefix + '_student_labels_lap_' + str(
            FLAGS.lap_scale) + '.npy'
        with tf.gfile.Open(filepath, mode='w') as file_obj:
            np.save(file_obj, stdnt_labels)

    return stdnt_data, stdnt_labels