def get_dataset(self):
    """Load the MovieLens-1M ratings, split them into train/test dicts,
    and build the inverted movie-user table."""
    dataset = DataSet()
    feature, train, test = dataset.create_explicit_ml_1m_dataset()
    self.user_count = feature[0]['feat_num']
    self.movie_count = feature[1]['feat_num']
    # trainSet/testSet: {user_id: {movie_id: rating}}
    for user, movie, rating in train:
        self.trainSet.setdefault(int(user), {})
        self.trainSet[int(user)][int(movie)] = rating
    for user, movie, rating in test:
        self.testSet.setdefault(int(user), {})
        self.testSet[int(user)][int(movie)] = rating
    print('Split trainingSet and testSet success!')
    print('TrainSet = %s' % len(train))
    print('TestSet = %s' % len(test))
    print('Building movie-user table ...')
    # movie_user: {movie_id: set of users who rated that movie}
    for user, movies in self.trainSet.items():
        for movie in movies:
            if movie not in self.movie_user:
                self.movie_user[movie] = set()
            self.movie_user[movie].add(user)
    print('Build movie-user table success!')
    # per-user rating mean/std, keyed by UserId
    self.user_avg_std = dataset.get_mean_std().set_index('UserId').to_dict('index')
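# A minimal usage sketch (assumption: `Recommender` is a hypothetical host
# class that initializes trainSet/testSet/movie_user as empty dicts before
# get_dataset is called). It only shows how the built structures are queried.
rec = Recommender()                     # hypothetical container defining get_dataset()
rec.get_dataset()
some_user = next(iter(rec.trainSet))
print('ratings of user %d:' % some_user, rec.trainSet[some_user])
some_movie = next(iter(rec.movie_user))
print('users who rated movie %d:' % some_movie, rec.movie_user[some_movie])
print('avg/std for user %d:' % some_user, rec.user_avg_std.get(some_user))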
class Ringnorm(DataSource):
    """
    Handle the ringnorm data set.
    """
    def __init__(self, filename):
        """
        Create the data set from the given file.
        :param filename: the filename
        :return: None
        """
        super(Ringnorm, self).__init__()
        self._folder_path = self._path + 'ringnorm/'
        self._filename = self._folder_path + filename
        self._dataset = DataSet()
        # read the data from the given file
        self._read_data_from_file()
        # the accuracy files
        self._acc_filenames = [self._folder_path + 'acc_rnd.txt',
                               self._folder_path + 'acc_rep.txt',
                               self._folder_path + 'acc_iet.txt',
                               self._folder_path + 'acc_wei.txt']

    def _read_data_from_file(self):
        """
        Create the data set from the given file so that self._dataset is a list of
        Items (DataSet), where each item in the data set is of the form
        [[f1, f2, ..., fn], label]
        """
        with open(self._filename, 'r') as data_source:
            for line in data_source:
                all_data = line.rstrip('\n').split(',')
                features = [float(all_data[i]) for i in range(1, len(all_data))]
                label = int(float(all_data[0]))
                self._dataset.add_inst(Item(features, label))

    def record_accuracy(self, acc_rnd, acc_rep, acc_iet, acc_wei):
        """
        Record the accuracy of the experiment for Random (acc_rnd), Repeated (acc_rep),
        IEThresh (acc_iet), and Weighted (acc_wei) on this data set.
        :return: None
        """
        accuracies_ = [acc_rnd, acc_rep, acc_iet, acc_wei]
        # output
        super(Ringnorm, self)._record_accuracy(self._acc_filenames, accuracies_)

    def plot_accuracy(self):
        # `method` is assumed to be a module-level list of the method names
        accuracies_ = [[] for _ in range(len(method))]
        for i in range(len(method)):
            with open(self._acc_filenames[i], 'r') as in_:
                acc_raw = in_.readline().rstrip('\n').split(',')
                for acc_ in acc_raw:
                    accuracies_[i].append(float(acc_))
        super(Ringnorm, self)._plot_accuracy(accuracies_)
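# A minimal usage sketch, assuming a 'ringnorm.data' file under the expected
# folder; the accuracy lists here are dummy placeholders for curves that an
# Experiment run would produce.
ring = Ringnorm('ringnorm.data')
acc_rnd, acc_rep, acc_iet, acc_wei = ([0.6, 0.7] for _ in range(4))  # placeholder curves
ring.record_accuracy(acc_rnd, acc_rep, acc_iet, acc_wei)
ring.plot_accuracy()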
def read_data_from_file(filename):
    """
    Obtain the ringnorm data set.
    :param filename: the filename of the data file
    :return: Return a DataSet instance which contains the ringnorm data.
             Each item in the data set is of the form [[f1, f2, ..., fn], label]
    """
    # the ringnorm data set
    ringnorm = DataSet()
    fname = path_suffix + source_path + filename
    with open(fname, 'r') as data_source:
        for line in data_source:
            all_data = line.rstrip('\n').split(',')
            features = [float(all_data[i]) for i in range(1, len(all_data))]
            label = int(float(all_data[0]))
            ringnorm.add_inst(Item(features, label))
    return ringnorm
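# Usage sketch (assumption: path_suffix and source_path are module-level
# globals pointing at the data folder, and 'ringnorm.data' is the file name).
ringnorm = read_data_from_file('ringnorm.data')
first = ringnorm.get_inst(0)
print(first.features()[:5], first.label())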
X_seat_train, X_seat_test, X_seat_img_train, X_seat_img_test, \
    X_agent_within_train, X_agent_within_test, X_prop_train, X_prop_test, \
    Y_train, Y_test, M_train, M_test, Z_train, Z_test, \
    factual_id_train, factual_id_test, \
    y_train, z_train = load_data_conv(args, args.expid)

# use the seat images as the covariate
x_train, x_test = X_seat_img_train, X_seat_img_test
# scale the outcome to [0, 1]; keep the scaler to invert predictions later
y_train, _, y_scaler = minmax_scaler(y_train, y_train)

logger.debug('# of samples = %d, # of features = [%d, %d]' % x_train.shape)

x_train = torch.FloatTensor(x_train)
y_train = torch.FloatTensor(y_train)
z_train = torch.FloatTensor(z_train)
dataset = DataSet(x_train, y_train, z_train)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=50, shuffle=True)

din = x_train.shape[1]
dtreat = z_train.shape[1]
model = TARConv(din, dtreat, y_scaler, args).to(device=args.device)
logger.debug(model)

within_pm, outof_pm, train_mse = model.fit(dataloader, x_train, M_train, Z_train,
                                           x_test, M_test, Z_test,
                                           args.outcome, logger)
del model
result = {
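# minmax_scaler is not shown in this snippet; a minimal sketch of what it
# plausibly does, built on sklearn's MinMaxScaler (an assumption -- the real
# helper may differ):
from sklearn.preprocessing import MinMaxScaler

def minmax_scaler(fit_on, transform_on):
    """Fit a [0, 1] scaler on `fit_on`, apply it to both arrays,
    and return the fitted scaler for inverse-transforming predictions."""
    scaler = MinMaxScaler()
    fitted = scaler.fit_transform(fit_on.reshape(-1, 1))
    transformed = scaler.transform(transform_on.reshape(-1, 1))
    return fitted, transformed, scaler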
import random

import sklearn.linear_model as lm
# Item and DataSet are the project's own instance/data-set containers


class Experiment:
    """
    The experiment for comparing Random, Repeated, and IEThresh methods for noisy oracles.
    """
    def __init__(self, dataset, p, oracles, n_rounds, eps, alpha):
        """
        Initialize the experiment for the given data set
        :param dataset: the given data set (of type DataSet)
        :param p: the proportion of the dataset used as training
                  ((1-p) is the proportion for test)
        :param oracles: the group (list) of noisy oracles
        :param n_rounds: number of rounds to query the oracles
        :param eps: the cut-off value for the iethresh method
        :param alpha: confidence level for iethresh
        :return: None
        """
        # divide the given data set into training and test
        train_, test_ = dataset.divide(p)
        # the initial positive and negative examples
        positive_ = train_.pop_random_positive_inst()
        negative_ = train_.pop_random_negative_inst()
        # the unlabeled data set
        self._unlabeled = train_
        # the training set -- initially, one positive and one negative
        self._train = DataSet()
        self._train.add_inst(positive_)
        self._train.add_inst(negative_)
        # the test set
        self._test = test_
        # separate the features and labels of the test set in order to test conveniently
        self._test_features, self._test_labels = self._test.feature_label()
        # the model -- use logistic regression
        self._model = lm.LogisticRegression()
        # number of queries made
        self._n_query = 0
        # performance sequence on the test set as labels are acquired
        self._accuracy = []
        # the oracles
        self._oracles = oracles
        # number of query rounds
        self._rounds = n_rounds
        # cut-off value for iethresh
        self._eps = eps
        # confidence level for iethresh
        self._alpha = alpha

    def _one_train(self):
        """
        Train the model with the current training set once
        :return: None
        """
        features_, labels_ = self._train.feature_label()
        self._model.fit(features_, labels_)

    def _one_test(self):
        """
        Test the current model on the test set once
        :return: None
        """
        accuracy_ = self._model.score(self._test_features, self._test_labels)
        self._accuracy.append(accuracy_)

    def _acquire_label(self, instance, method='iethresh'):
        """
        Acquire a new label from the oracles, and update the training set correspondingly.
        :param instance: the given instance whose label is to be acquired (in this
               simulation, we actually know the label), which is an Item (feature and label)
        :param method: the given method to acquire a label from the oracles:
               'random', 'repeated', 'weighted', or 'iethresh'
        :return: None
        """
        if method == 'random':
            label_ = self._acquire_label_random()
        elif method == 'repeated':
            label_ = self._acquire_label_repeated()
        elif method == 'weighted':
            label_ = self._acquire_label_weighted()
        else:
            label_ = self._acquire_label_iethresh(self._eps)
        # update the training set with this new labeled instance
        # (labels are +/-1, so label_ = 1 keeps the true label and -1 flips it)
        new_labeled = Item(instance.features(), instance.label() * label_)
        self._train.add_inst(new_labeled)

    def _acquire_label_random(self):
        """
        Acquire a new label from the noisy oracles via the Random method.
        :return: the label
        """
        # number of oracles
        n_oracles = len(self._oracles)
        # randomly pick an oracle
        o_picked = self._oracles[random.randint(0, n_oracles - 1)]
        # ask this oracle to give the label
        return o_picked.assert_label()

    def _acquire_label_repeated(self):
        """
        Acquire a new label from the noisy oracles via the Repeated method.
        The label is determined by majority vote
        :return: the label
        """
        # prediction by all oracles
        labels_ = [o.assert_label() for o in self._oracles]
        # take the majority -- labels are either 1 or -1
        label_ = sum(labels_)
        if label_ > 0:
            return 1
        elif label_ < 0:
            return -1
        else:
            # tie -- return -1 or 1 randomly
            return 2 * random.randint(0, 1) - 1

    def _acquire_label_weighted(self):
        """
        Acquire a new label from the noisy oracles via the Weighted method.
        The label is determined by weighted vote
        :return: the label
        """
        # prediction by all oracles
        labels_ = [o.vote() for o in self._oracles]
        # take the weighted majority -- labels are either 1 or -1
        label_ = sum(labels_)
        if label_ > 0:
            l_ = 1
        elif label_ < 0:
            l_ = -1
        else:
            # tie -- return -1 or 1 randomly
            l_ = 2 * random.randint(0, 1) - 1
        # update the weights
        for i in range(len(labels_)):
            if labels_[i] * l_ > 0:
                self._oracles[i].update_weight(1)
            else:
                self._oracles[i].update_weight(0)
        return l_

    def _acquire_label_iethresh(self, eps):
        """
        Acquire a new label from the noisy oracles via the IEThresh method.
        :param eps: the cut-off value
        :return: the label
        """
        # prediction by all oracles
        labels_ = [o.assert_label() for o in self._oracles]
        # upper bound of confidence interval on current mean performance for all oracles
        scores_ = [o.upper_ie(self._alpha) for o in self._oracles]
        cutoff_ = max(scores_) * eps
        label_ = 0
        # query oracles
        for i in range(len(scores_)):
            # only take oracles with high performance
            if scores_[i] >= cutoff_:
                label_ += labels_[i]
        # the label
        if label_ > 0:
            l_ = 1
        elif label_ < 0:
            l_ = -1
        else:
            # tie -- return -1 or 1 randomly
            l_ = 2 * random.randint(0, 1) - 1
        # update the histories of the queried oracles
        for i in range(len(scores_)):
            # only take oracles queried
            if scores_[i] >= cutoff_:
                if labels_[i] == l_:
                    # correct
                    self._oracles[i].update_history(1)
                else:
                    # incorrect
                    self._oracles[i].update_history(0)
        return l_

    def _uncertain_inst(self):
        """
        Find the most uncertain instance by the current model (for the logistic
        regression model on binary labels, the most uncertain instance is the one
        with the smallest probability difference).
        :return: the most uncertain instance
        """
        # all unlabeled instances
        unlabeled_ = self._unlabeled
        # the index of the most uncertain instance
        uncertain_index = -1
        # minimum probability difference
        min_prob_diff = 2
        for i in range(unlabeled_.size()):
            # the ith instance in the unlabeled data
            inst_ = unlabeled_.get_inst(i)
            # predict_proba expects a 2-D array, hence the extra brackets
            probs_ = self._model.predict_proba([inst_.features()])
            prob_diff = abs(probs_[0][0] - probs_[0][1])
            if prob_diff < min_prob_diff:
                min_prob_diff = prob_diff
                uncertain_index = i
        # the most uncertain instance
        uncertain_inst = unlabeled_.get_inst(uncertain_index)
        # remove it from the unlabeled list -- since it will be labeled
        unlabeled_.remove_inst(uncertain_index)
        return uncertain_inst

    def run_exp(self, method):
        """
        Run n_rounds queries and record the performance
        :param method: the given method to acquire a label from the oracles:
               'random', 'repeated', 'weighted', or 'iethresh'
        :return: the accuracy vector (accuracy over n_rounds)
        """
        for i in range(self._rounds):
            self._one_train()
            self._one_test()
            self._acquire_label(self._uncertain_inst(), method)
        return self._accuracy
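# A minimal driver sketch (assumptions: read_data_from_file from above, and a
# hypothetical Oracle(accuracy) class exposing the assert_label / vote /
# upper_ie / update_* interface that Experiment relies on):
oracles = [Oracle(acc) for acc in (0.9, 0.8, 0.7, 0.6, 0.55)]
data = read_data_from_file('ringnorm.data')
exp = Experiment(data, p=0.7, oracles=oracles, n_rounds=100, eps=0.8, alpha=0.05)
acc_iet = exp.run_exp('iethresh')   # test accuracy after each of the 100 rounds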
mnist = DataSets()

# balanced training dataset: equal numbers of the two classes
i1 = np.where(ori_mnist.train.labels[:, 1] == 1)[0]
i2 = np.where(ori_mnist.train.labels[:, 0] == 1)[0]
np.random.shuffle(i1)
np.random.shuffle(i2)
mylen = 5000
i1 = i1[:mylen]
i2 = i2[:mylen]
itrain = np.append(i1, i2)
np.random.shuffle(itrain)
train_img = np.array([ori_mnist.train.images[j] for j in itrain])
train_lab = np.array([ori_mnist.train.labels[j] for j in itrain])
mnist.train = DataSet(train_img, train_lab)

## ----------------------------------------------------
## Specify the directory to save checkpoint and figures
## ----------------------------------------------------
save_dir = 'save_exp_mismatch'

## -------------------------------------------------------------------------------
## Generate predetermined random weights so the networks are similarly initialized
## -------------------------------------------------------------------------------
w1_initial = tf.truncated_normal([784, 100], stddev=np.sqrt(2 / 784), seed=5566)
w2_initial = tf.truncated_normal([100, 100], stddev=np.sqrt(2 / 100), seed=5566)
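## Sketch of how the predetermined initials would typically be consumed
## (an assumption -- the networks built later in this script are not shown):
w1 = tf.Variable(w1_initial)   # first layer weights, 784 -> 100
w2 = tf.Variable(w2_initial)   # second layer weights, 100 -> 100
## The He-style stddev sqrt(2 / fan_in) keeps activation variance roughly
## constant across layers, and the fixed seed makes both networks start alike.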