def get_mini_batch(self, batch_size):
        """
        Helper function for sampling mini-batches from the training
        set. Note: random_state must be None here, otherwise the exact
        same mini batch will be sampled on every call!

        Parameters
        ----------
        batch_size: int
            Number of elements to return in the mini batch

        Returns
        -------
        X: np.ndarray
            A feature matrix subsampled from self.train

        y: np.ndarray
            A one-hot matrix of class labels subsampled from self.train
        """
        # Pass None so a fresh RandomState is drawn on each call; seeding
        # with self.random_state would reproduce the same mini batch forever.
        random_state = check_random_state(None)
        n_training_samples = self.train.X.shape[0]
        # RandomState.randint samples from [low, high), so the upper bound
        # must be n_training_samples, not n_training_samples - 1.
        minibatch_indices = random_state.randint(0, n_training_samples,
                                                 batch_size)

        return self.train.X[minibatch_indices, :], self.train.y[
            minibatch_indices, :]
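
A minimal standalone sketch of the same pattern (names and shapes are illustrative), showing why a seed re-created on every call returns the identical batch:

import numpy as np
from sklearn.utils import check_random_state

X_train = np.arange(20).reshape(10, 2)  # placeholder feature matrix

def sample_batch(seed, batch_size=3):
    rng = check_random_state(seed)                       # fresh RandomState per call
    idx = rng.randint(0, X_train.shape[0], batch_size)   # high bound is exclusive
    return X_train[idx, :]

print(np.array_equal(sample_batch(42), sample_batch(42)))      # True: same seed, same batch
print(np.array_equal(sample_batch(None), sample_batch(None)))  # almost surely False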
Example #2
def __init__(self, y, n_folds=5, shuffle=False, random_state=None):
    super(KFoldStratified, self).__init__(len(y), n_folds, shuffle, random_state)
    self.y = y
    self.idxs = np.arange(len(y))
    self.sort_indx = self.y.argsort()
    if shuffle:
        rng = check_random_state(self.random_state)
        rng.shuffle(self.idxs)
Example #3
def __init__(self, n, labels, n_folds=3, shuffle=False, random_state=None):
    super(KFoldSubject, self).__init__(n, n_folds, shuffle, random_state)
    self.idxs = np.arange(n)
    self.labels = np.array(labels, copy=True)
    self.n_subs = len(np.unique(self.labels))
    if shuffle:
        rng = check_random_state(self.random_state)
        rng.shuffle(self.idxs)
Example #4
def __init__(self, n, labels, shuffle=False, random_state=None):
    super(LeaveOneSubjectOut, self).__init__(n, len(np.unique(labels)), shuffle, random_state)
    self.idxs = np.arange(len(labels))
    self.labels = np.array(labels, copy=True)
    self.n_subs = len(np.unique(self.labels))
    if shuffle:
        rng = check_random_state(self.random_state)
        rng.shuffle(self.idxs)
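
All three constructors rely on the same seeding idiom. A short sketch of how check_random_state treats each accepted input (toy values):

import numpy as np
from sklearn.utils import check_random_state

idxs = np.arange(6)
rng = check_random_state(0)   # int seed -> a new, reproducibly seeded RandomState
rng.shuffle(idxs)             # deterministic permutation for a given seed
print(idxs)

assert check_random_state(rng) is rng   # RandomState -> returned unchanged
check_random_state(None)                # None -> the global numpy RandomState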
Example #5
def _iter_indices(self):
    rng = check_random_state(self.random_state)

    # Randomly partition the unique dates, then map each date fold back to
    # row indices so that rows sharing a date stay in the same fold.
    permutation = rng.permutation(self.n)
    for i in range(self.n_iter):
        ind_test = permutation[i * self.n_test:(i + 1) * self.n_test]
        dates_test = [self.unique_dates[j] for j in ind_test]
        dates_train = [self.unique_dates[j] for j in range(self.n)
                       if j not in ind_test]
        yield [k for k, d in enumerate(self.dates) if d in dates_train], \
              [k for k, d in enumerate(self.dates) if d in dates_test]
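
For comparison, scikit-learn's GroupKFold gives the same keep-groups-together guarantee out of the box; a sketch on toy data (values are illustrative):

import numpy as np
from sklearn.model_selection import GroupKFold

dates = np.array(["2020-01-01", "2020-01-01", "2020-01-02",
                  "2020-01-02", "2020-01-03", "2020-01-03"])
X = np.arange(12).reshape(6, 2)
for train_idx, test_idx in GroupKFold(n_splits=3).split(X, groups=dates):
    # every row sharing a date lands on the same side of the split
    print(train_idx, test_idx)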
Example #6
def _get_folds_column(self, length):
    """
    Return special column with indices of folds for all events.
    """
    if self._random_number is None:
        self._random_number = check_random_state(
            self.random_state).randint(0, 100000)
    folds_column = numpy.zeros(length)
    for fold_number, (_, folds_indices) in enumerate(
            KFold(length,
                  self.n_folds,
                  shuffle=True,
                  random_state=self._random_number)):
        folds_column[folds_indices] = fold_number
    return folds_column
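
The snippet above uses the pre-0.18 scikit-learn signature KFold(n, n_folds, ...). Under the current sklearn.model_selection API, the same fold column can be built as follows (a sketch, not the original author's code):

import numpy as np
from sklearn.model_selection import KFold

def folds_column(length, n_folds, seed):
    column = np.zeros(length)
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fold_number, (_, test_indices) in enumerate(
            splitter.split(np.zeros(length))):
        column[test_indices] = fold_number
    return column

print(folds_column(10, n_folds=3, seed=0))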
Example #7
def _resample_partition(self, partition):
    rng = check_random_state(self.random_state)
    y = self.y[partition]
    unique_labels, y_inversed = np.unique(y, return_inverse=True)
    label_counts = np.bincount(y_inversed)
    class_share = max(label_counts)  # every class is brought up to this size
    resampled_partition = np.empty(class_share * len(unique_labels),
                                   dtype=np.int_)
    for i, label in enumerate(unique_labels):
        indices = partition[y == label]
        class_size = len(indices)
        offset = class_share * i
        added = 0
        # Shuffle and draw repeatedly until the class fills its share, so
        # minority classes are oversampled (with repetition) to balance.
        while added < class_share:
            rng.shuffle(indices)
            to_add = min(class_share - added, class_size)
            resampled_partition[offset + added:offset + added + to_add] = \
                indices[:to_add]
            added += to_add
    return resampled_partition
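
A toy run of the same balancing idea, simplified to a single draw with replacement rather than the shuffle loop above (labels and sizes are illustrative):

import numpy as np
from sklearn.utils import check_random_state

rng = check_random_state(0)
y = np.array([0, 0, 0, 0, 1])           # class 1 is the minority
partition = np.arange(len(y))
share = max(np.bincount(y))             # target size per class: 4
balanced = [rng.choice(partition[y == label], size=share, replace=True)
            for label in np.unique(y)]
print(np.concatenate(balanced))         # four indices per class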
Example #9
def apply_nmf(data, random_state):
    '''
    Applies non-negative matrix factorization (NMF) to the data.

    Returns
    -------
    A tuple of (nmf, transformed_data)
    nmf: An sklearn.NMF instance
    transformed_data: A numpy.ndarray
    '''
    # Apply non-negative matrix factorization to data with the specified parameters
    nmf = NMF(n_components=60, max_iter=200,
              random_state=random_state).fit(data)
    transformed_data = nmf.transform(data)
    # Normalize each row of the transformed data to unit L1 norm
    transformed_data = normalize(transformed_data, norm='l1', axis=1)
    return nmf, transformed_data


# In[7]:

nmf, td_norm = apply_nmf(train_data, random_state=check_random_state(0))

# In[8]:

assert_is_instance(nmf, NMF)
assert_is_instance(td_norm, np.ndarray)
assert_equal(nmf.n_components, 60)
assert_equal(nmf.max_iter, 200)
assert_equal(td_norm.shape, (7769, 60))
assert_array_almost_equal(td_norm[0, :5],
                          [0., 0.08515023, 0.01682892, 0., 0.02451052])
assert_array_almost_equal(td_norm[-1, -5:], [0., 0., 0., 0.00342309, 0.])

# ## Topic-based Classification
#
# - Train a LinearSVC classifier on the topics in the training data sample of the reuters data set.
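#
# A minimal sketch of this step, assuming `td_norm` holds the topic weights
# computed above; `train_labels` is a placeholder, faked here only so the
# cell runs standalone.

# In[9]:

from sklearn.svm import LinearSVC

train_labels = np.random.randint(0, 2, size=td_norm.shape[0])  # placeholder labels
clf = LinearSVC(random_state=0)
clf.fit(td_norm, train_labels)           # topics as features, labels as targets
print(clf.score(td_norm, train_labels))  # training accuracy on placeholder labels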