def get_dataset(self):
    """Load the MovieLens-1M ratings, split them into train/test dicts,
    and build the inverted movie-user table."""
    dataset = DataSet()
    feature, train, test = dataset.create_explicit_ml_1m_dataset()
    self.user_count = feature[0]['feat_num']
    self.movie_count = feature[1]['feat_num']
    # trainSet/testSet: {user_id: {movie_id: rating}}
    for user, movie, rating in train:
        self.trainSet.setdefault(int(user), {})
        self.trainSet[int(user)][int(movie)] = rating
    for user, movie, rating in test:
        self.testSet.setdefault(int(user), {})
        self.testSet[int(user)][int(movie)] = rating
    print('Split trainingSet and testSet success!')
    print('TrainSet = %s' % len(train))
    print('TestSet = %s' % len(test))
    print('Building movie-user table ...')
    # movie_user: {movie_id: set of users who rated that movie}
    for user, movies in self.trainSet.items():
        for movie in movies:
            if movie not in self.movie_user:
                self.movie_user[movie] = set()
            self.movie_user[movie].add(user)
    print('Build movie-user table success!')
    # per-user rating mean/std, keyed by UserId
    self.user_avg_std = dataset.get_mean_std().set_index('UserId').to_dict('index')
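# A minimal usage sketch (assumption: `Recommender` is a hypothetical host
# class that initializes trainSet/testSet/movie_user as empty dicts before
# get_dataset is called). It only shows how the built structures are queried.
rec = Recommender()                     # hypothetical container defining get_dataset()
rec.get_dataset()
some_user = next(iter(rec.trainSet))
print('ratings of user %d:' % some_user, rec.trainSet[some_user])
some_movie = next(iter(rec.movie_user))
print('users who rated movie %d:' % some_movie, rec.movie_user[some_movie])
print('avg/std for user %d:' % some_user, rec.user_avg_std.get(some_user))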
class Ringnorm(DataSource):
    """
    Handle the ringnorm data set.
    """
    def __init__(self, filename):
        """
        Create the data set from the given file.
        :param filename: the filename
        :return: None
        """
        super(Ringnorm, self).__init__()
        self._folder_path = self._path + 'ringnorm/'
        self._filename = self._folder_path + filename
        self._dataset = DataSet()
        # read the data from the given file
        self._read_data_from_file()
        # the accuracy files
        self._acc_filenames = [self._folder_path + 'acc_rnd.txt',
                               self._folder_path + 'acc_rep.txt',
                               self._folder_path + 'acc_iet.txt',
                               self._folder_path + 'acc_wei.txt']

    def _read_data_from_file(self):
        """
        Create the data set from the given file so that self._dataset is a list of
        Items (DataSet), where each item in the data set is of the form
        [[f1, f2, ..., fn], label]
        """
        with open(self._filename, 'r') as data_source:
            for line in data_source:
                all_data = line.rstrip('\n').split(',')
                features = [float(all_data[i]) for i in range(1, len(all_data))]
                label = int(float(all_data[0]))
                self._dataset.add_inst(Item(features, label))

    def record_accuracy(self, acc_rnd, acc_rep, acc_iet, acc_wei):
        """
        Record the accuracy of the experiment for Random (acc_rnd), Repeated (acc_rep),
        IEThresh (acc_iet), and Weighted (acc_wei) on this data set.
        :return: None
        """
        accuracies_ = [acc_rnd, acc_rep, acc_iet, acc_wei]
        # output
        super(Ringnorm, self)._record_accuracy(self._acc_filenames, accuracies_)

    def plot_accuracy(self):
        # `method` is assumed to be a module-level list of the method names
        accuracies_ = [[] for _ in range(len(method))]
        for i in range(len(method)):
            with open(self._acc_filenames[i], 'r') as in_:
                acc_raw = in_.readline().rstrip('\n').split(',')
                for acc_ in acc_raw:
                    accuracies_[i].append(float(acc_))
        super(Ringnorm, self)._plot_accuracy(accuracies_)
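# A minimal usage sketch, assuming a 'ringnorm.data' file under the expected
# folder; the accuracy lists here are dummy placeholders for curves that an
# Experiment run would produce.
ring = Ringnorm('ringnorm.data')
acc_rnd, acc_rep, acc_iet, acc_wei = ([0.6, 0.7] for _ in range(4))  # placeholder curves
ring.record_accuracy(acc_rnd, acc_rep, acc_iet, acc_wei)
ring.plot_accuracy()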
def read_data_from_file(filename):
    """
    Obtain the ringnorm data set.
    :param filename: the filename of the data file
    :return: Return a DataSet instance which contains the ringnorm data.
             Each item in the data set is of the form [[f1, f2, ..., fn], label]
    """
    # the ringnorm data set
    ringnorm = DataSet()
    fname = path_suffix + source_path + filename
    with open(fname, 'r') as data_source:
        for line in data_source:
            all_data = line.rstrip('\n').split(',')
            features = [float(all_data[i]) for i in range(1, len(all_data))]
            label = int(float(all_data[0]))
            ringnorm.add_inst(Item(features, label))
    return ringnorm
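# Usage sketch (assumption: path_suffix and source_path are module-level
# globals pointing at the data folder, and 'ringnorm.data' is the file name).
ringnorm = read_data_from_file('ringnorm.data')
first = ringnorm.get_inst(0)
print(first.features()[:5], first.label())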
X_seat_train, X_seat_test, X_seat_img_train, X_seat_img_test, \
    X_agent_within_train, X_agent_within_test, X_prop_train, X_prop_test, \
    Y_train, Y_test, M_train, M_test, Z_train, Z_test, \
    factual_id_train, factual_id_test, \
    y_train, z_train = load_data_conv(args, args.expid)

# use the seat images as the covariate
x_train, x_test = X_seat_img_train, X_seat_img_test
# scale the outcome to [0, 1]; keep the scaler to invert predictions later
y_train, _, y_scaler = minmax_scaler(y_train, y_train)

logger.debug('# of samples = %d, # of features = [%d, %d]' % x_train.shape)

x_train = torch.FloatTensor(x_train)
y_train = torch.FloatTensor(y_train)
z_train = torch.FloatTensor(z_train)
dataset = DataSet(x_train, y_train, z_train)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=50, shuffle=True)

din = x_train.shape[1]
dtreat = z_train.shape[1]
model = TARConv(din, dtreat, y_scaler, args).to(device=args.device)
logger.debug(model)

within_pm, outof_pm, train_mse = model.fit(dataloader, x_train, M_train, Z_train,
                                           x_test, M_test, Z_test,
                                           args.outcome, logger)
del model
result = {
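# minmax_scaler is not shown in this snippet; a minimal sketch of what it
# plausibly does, built on sklearn's MinMaxScaler (an assumption -- the real
# helper may differ):
from sklearn.preprocessing import MinMaxScaler

def minmax_scaler(fit_on, transform_on):
    """Fit a [0, 1] scaler on `fit_on`, apply it to both arrays,
    and return the fitted scaler for inverse-transforming predictions."""
    scaler = MinMaxScaler()
    fitted = scaler.fit_transform(fit_on.reshape(-1, 1))
    transformed = scaler.transform(transform_on.reshape(-1, 1))
    return fitted, transformed, scaler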
import random

import sklearn.linear_model as lm
# Item and DataSet are the project's own instance/data-set containers


class Experiment:
    """
    The experiment for comparing Random, Repeated, and IEThresh methods for noisy oracles.
    """
    def __init__(self, dataset, p, oracles, n_rounds, eps, alpha):
        """
        Initialize the experiment for the given data set
        :param dataset: the given data set (of type DataSet)
        :param p: the proportion of the dataset used as training
                  ((1-p) is the proportion for test)
        :param oracles: the group (list) of noisy oracles
        :param n_rounds: number of rounds to query the oracles
        :param eps: the cut-off value for the iethresh method
        :param alpha: confidence level for iethresh
        :return: None
        """
        # divide the given data set into training and test
        train_, test_ = dataset.divide(p)
        # the initial positive and negative examples
        positive_ = train_.pop_random_positive_inst()
        negative_ = train_.pop_random_negative_inst()
        # the unlabeled data set
        self._unlabeled = train_
        # the training set -- initially, one positive and one negative
        self._train = DataSet()
        self._train.add_inst(positive_)
        self._train.add_inst(negative_)
        # the test set
        self._test = test_
        # separate the features and labels of the test set in order to test conveniently
        self._test_features, self._test_labels = self._test.feature_label()
        # the model -- use logistic regression
        self._model = lm.LogisticRegression()
        # number of queries made
        self._n_query = 0
        # performance sequence on the test set as labels are acquired
        self._accuracy = []
        # the oracles
        self._oracles = oracles
        # number of query rounds
        self._rounds = n_rounds
        # cut-off value for iethresh
        self._eps = eps
        # confidence level for iethresh
        self._alpha = alpha

    def _one_train(self):
        """
        Train the model with the current training set once
        :return: None
        """
        features_, labels_ = self._train.feature_label()
        self._model.fit(features_, labels_)

    def _one_test(self):
        """
        Test the current model on the test set once
        :return: None
        """
        accuracy_ = self._model.score(self._test_features, self._test_labels)
        self._accuracy.append(accuracy_)

    def _acquire_label(self, instance, method='iethresh'):
        """
        Acquire a new label from the oracles, and update the training set correspondingly.
        :param instance: the given instance whose label is to be acquired (in this
               simulation, we actually know the label), which is an Item (feature and label)
        :param method: the given method to acquire a label from the oracles:
               'random', 'repeated', 'weighted', or 'iethresh'
        :return: None
        """
        if method == 'random':
            label_ = self._acquire_label_random()
        elif method == 'repeated':
            label_ = self._acquire_label_repeated()
        elif method == 'weighted':
            label_ = self._acquire_label_weighted()
        else:
            label_ = self._acquire_label_iethresh(self._eps)
        # update the training set with this new labeled instance
        # (labels are +/-1, so label_ = 1 keeps the true label and -1 flips it)
        new_labeled = Item(instance.features(), instance.label() * label_)
        self._train.add_inst(new_labeled)

    def _acquire_label_random(self):
        """
        Acquire a new label from the noisy oracles via the Random method.
        :return: the label
        """
        # number of oracles
        n_oracles = len(self._oracles)
        # randomly pick an oracle
        o_picked = self._oracles[random.randint(0, n_oracles - 1)]
        # ask this oracle to give the label
        return o_picked.assert_label()

    def _acquire_label_repeated(self):
        """
        Acquire a new label from the noisy oracles via the Repeated method.
        The label is determined by majority vote
        :return: the label
        """
        # prediction by all oracles
        labels_ = [o.assert_label() for o in self._oracles]
        # take the majority -- labels are either 1 or -1
        label_ = sum(labels_)
        if label_ > 0:
            return 1
        elif label_ < 0:
            return -1
        else:
            # tie -- return -1 or 1 randomly
            return 2 * random.randint(0, 1) - 1

    def _acquire_label_weighted(self):
        """
        Acquire a new label from the noisy oracles via the Weighted method.
        The label is determined by weighted vote
        :return: the label
        """
        # prediction by all oracles
        labels_ = [o.vote() for o in self._oracles]
        # take the weighted majority -- labels are either 1 or -1
        label_ = sum(labels_)
        if label_ > 0:
            l_ = 1
        elif label_ < 0:
            l_ = -1
        else:
            # tie -- return -1 or 1 randomly
            l_ = 2 * random.randint(0, 1) - 1
        # update the weights
        for i in range(len(labels_)):
            if labels_[i] * l_ > 0:
                self._oracles[i].update_weight(1)
            else:
                self._oracles[i].update_weight(0)
        return l_

    def _acquire_label_iethresh(self, eps):
        """
        Acquire a new label from the noisy oracles via the IEThresh method.
        :param eps: the cut-off value
        :return: the label
        """
        # prediction by all oracles
        labels_ = [o.assert_label() for o in self._oracles]
        # upper bound of confidence interval on current mean performance for all oracles
        scores_ = [o.upper_ie(self._alpha) for o in self._oracles]
        cutoff_ = max(scores_) * eps
        label_ = 0
        # query oracles
        for i in range(len(scores_)):
            # only take oracles with high performance
            if scores_[i] >= cutoff_:
                label_ += labels_[i]
        # the label
        if label_ > 0:
            l_ = 1
        elif label_ < 0:
            l_ = -1
        else:
            # tie -- return -1 or 1 randomly
            l_ = 2 * random.randint(0, 1) - 1
        # update the histories of the queried oracles
        for i in range(len(scores_)):
            # only take oracles queried
            if scores_[i] >= cutoff_:
                if labels_[i] == l_:
                    # correct
                    self._oracles[i].update_history(1)
                else:
                    # incorrect
                    self._oracles[i].update_history(0)
        return l_

    def _uncertain_inst(self):
        """
        Find the most uncertain instance by the current model (for the logistic
        regression model on binary labels, the most uncertain instance is the one
        with the smallest probability difference).
        :return: the most uncertain instance
        """
        # all unlabeled instances
        unlabeled_ = self._unlabeled
        # the index of the most uncertain instance
        uncertain_index = -1
        # minimum probability difference
        min_prob_diff = 2
        for i in range(unlabeled_.size()):
            # the ith instance in the unlabeled data
            inst_ = unlabeled_.get_inst(i)
            # predict_proba expects a 2-D array, hence the extra brackets
            probs_ = self._model.predict_proba([inst_.features()])
            prob_diff = abs(probs_[0][0] - probs_[0][1])
            if prob_diff < min_prob_diff:
                min_prob_diff = prob_diff
                uncertain_index = i
        # the most uncertain instance
        uncertain_inst = unlabeled_.get_inst(uncertain_index)
        # remove it from the unlabeled list -- since it will be labeled
        unlabeled_.remove_inst(uncertain_index)
        return uncertain_inst

    def run_exp(self, method):
        """
        Run n_rounds queries and record the performance
        :param method: the given method to acquire a label from the oracles:
               'random', 'repeated', 'weighted', or 'iethresh'
        :return: the accuracy vector (accuracy over n_rounds)
        """
        for i in range(self._rounds):
            self._one_train()
            self._one_test()
            self._acquire_label(self._uncertain_inst(), method)
        return self._accuracy
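# A minimal driver sketch (assumptions: read_data_from_file from above, and a
# hypothetical Oracle(accuracy) class exposing the assert_label / vote /
# upper_ie / update_* interface that Experiment relies on):
oracles = [Oracle(acc) for acc in (0.9, 0.8, 0.7, 0.6, 0.55)]
data = read_data_from_file('ringnorm.data')
exp = Experiment(data, p=0.7, oracles=oracles, n_rounds=100, eps=0.8, alpha=0.05)
acc_iet = exp.run_exp('iethresh')   # test accuracy after each of the 100 rounds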
mnist = DataSets()

# balanced training dataset: equal numbers of the two classes
i1 = np.where(ori_mnist.train.labels[:, 1] == 1)[0]
i2 = np.where(ori_mnist.train.labels[:, 0] == 1)[0]
np.random.shuffle(i1)
np.random.shuffle(i2)
mylen = 5000
i1 = i1[:mylen]
i2 = i2[:mylen]
itrain = np.append(i1, i2)
np.random.shuffle(itrain)
train_img = np.array([ori_mnist.train.images[j] for j in itrain])
train_lab = np.array([ori_mnist.train.labels[j] for j in itrain])
mnist.train = DataSet(train_img, train_lab)

## ----------------------------------------------------
## Specify the directory to save checkpoint and figures
## ----------------------------------------------------
save_dir = 'save_exp_mismatch'

## -------------------------------------------------------------------------------
## Generate predetermined random weights so the networks are similarly initialized
## -------------------------------------------------------------------------------
w1_initial = tf.truncated_normal([784, 100], stddev=np.sqrt(2 / 784), seed=5566)
w2_initial = tf.truncated_normal([100, 100], stddev=np.sqrt(2 / 100), seed=5566)
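## Sketch of how the predetermined initials would typically be consumed
## (an assumption -- the networks built later in this script are not shown):
w1 = tf.Variable(w1_initial)   # first layer weights, 784 -> 100
w2 = tf.Variable(w2_initial)   # second layer weights, 100 -> 100
## The He-style stddev sqrt(2 / fan_in) keeps activation variance roughly
## constant across layers, and the fixed seed makes both networks start alike.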