Example #1
	def self_tune(self, X, y, verbose=False):
		# fix random seed for reproducibility
		seed = 5
		np.random.seed(seed)

		# define k-fold cross validation test harness
		kfold = StratifiedKFold(n_splits=self.tuning_csp_num_folds, shuffle=True, random_state=seed)

		# init scores
		cvscores = {}
		for i in range(2, self.num_spatial_filters + 1):
			cvscores[i] = 0

		for i, (train, test) in enumerate(kfold.split(X, y)):
			# calculate CSP spatial filters
			csp = CSP(n_components=self.num_spatial_filters)
			csp.fit(X[train], y[train])

			# try all filters, from the given num down to 2
			# (1 is too often found to be overfitting)
			for j in range(2, self.num_spatial_filters):
				num_filters_to_try = j

				# calculate spatial filters
				csp.n_components = num_filters_to_try
				# apply CSP filters to train data
				tuning_train_LDA_features = csp.transform(X[train])
				tuning_train_LDA_features = np.nan_to_num(tuning_train_LDA_features)
				check_X_y(tuning_train_LDA_features, y[train])

				# apply CSP filters to test data
				tuning_test_LDA_features = csp.transform(X[test])
				tuning_test_LDA_features = np.nan_to_num(tuning_test_LDA_features)
				check_X_y(tuning_test_LDA_features, y[test])


				# train LDA
				lda = LinearDiscriminantAnalysis()
				prediction_score = lda.fit(tuning_train_LDA_features, y[train]).score(tuning_test_LDA_features, y[test])

				cvscores[num_filters_to_try] += prediction_score

				if verbose:
					print "prediction score", prediction_score, "with",num_filters_to_try,"spatial filters"

		best_num = max(cvscores, key=cvscores.get)
		best_score = cvscores[best_num] / (i + 1)
		if verbose:
			print "best num filters:", best_num, "(average accuracy ",best_score,")"
			print "average scores per filter num:"
			for k in cvscores:
				print k,":", cvscores[k]/i+1

		return [best_num, best_score]
Example #2
  def fit(self, X, y):
    """
    X: data matrix, (n x d)
    y: scalar labels, (n)
    """
    X, labels = check_X_y(X, y)
    n, d = X.shape
    num_dims = self.num_dims
    if num_dims is None:
        num_dims = d
    # Initialize A to a scaling matrix
    A = np.zeros((num_dims, d))
    np.fill_diagonal(A, 1./(np.maximum(X.max(axis=0)-X.min(axis=0), EPS)))

    # Run NCA
    dX = X[:,None] - X[None]  # shape (n, n, d)
    tmp = np.einsum('...i,...j->...ij', dX, dX)  # shape (n, n, d, d)
    masks = labels[:,None] == labels[None]
    for it in range(self.max_iter):
      for i, label in enumerate(labels):
        mask = masks[i]
        Ax = A.dot(X.T).T  # shape (n, num_dims)

        softmax = np.exp(-((Ax[i] - Ax)**2).sum(axis=1))  # shape (n)
        softmax[i] = 0
        softmax /= softmax.sum()
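        # softmax[j] is now p_ij: the probability that point j is picked as the neighbour of i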

        t = softmax[:, None, None] * tmp[i]  # shape (n, d, d)
        grad = softmax[mask].sum() * t.sum(axis=0) - t[mask].sum(axis=0)
        A += self.learning_rate * A.dot(grad)

    self.X_ = X
    self.A_ = A
    self.n_iter_ = it
    return self
Example #3
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'pima.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))

        except TypeError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # generate synthetic data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # generate synthetic data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.clf = XGBOD(random_state=42)
        self.clf.fit(self.X_train, self.y_train)

        self.roc_floor = 0.8
Example #4
    def reduce_data(self, X, y):
        
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        for cur_class in classes:
            mask = y == cur_class
            insts = X[mask]
            prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
            labels_s = labels_s + [cur_class]


        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            if self.classifier.predict(sample.reshape(1, -1)) != [label]:
                prots_s = prots_s + [sample]
                labels_s = labels_s + [label]
                self.classifier.fit(prots_s, labels_s)
       
        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
Example #5
    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        minority_class = self.pos_class
        if self.pos_class is None:
            minority_class = min(set(y), key = list(y).count)

        # load initial groups
        self.groups = []
        for label in classes:
            mask = y == label
            self.groups = self.groups + [_Group(X[mask], label)]

        self._main_loop()
        self._generalization_step()
        min_groups = list(filter(lambda g: g.label == minority_class, self.groups))
        self._merge()
        self._pruning()
        max_groups = list(filter(lambda g: g.label != minority_class, self.groups))
        self.groups = min_groups + max_groups
        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
Example #6
    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        # load initial groups
        self.groups = []
        for label in classes:
            mask = y == label
            self.groups = self.groups + [_Group(X[mask], label)]

        self._main_loop()
        self._generalization_step()
        self._merge()
        self._pruning()
        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
Example #7
    def reduce_data(self, X, y):
        if self.classifier is None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        mask = np.zeros(y.size, dtype=bool)

        tmp_m = np.ones(y.size, dtype=bool)
        for i in range(y.size):
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            sample, label = X[i], y[i]

            if self.classifier.predict(sample.reshape(1, -1)) == [label]:
                mask[i] = not mask[i]

            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Example #8
    def fit(self, X, y):
        '''Fit the model.

        Parameters
        ----------
        X : (n, d) array-like
            Input data.

        y : (n,) array-like
            Class labels, one per point of data.
        '''
        X, y = check_X_y(X, y)

        # Inject parameters into all fitness functions
        for f in self._fitness:
            f.inject_params(
                random_state=self.random_state,
            )

        # Inject parameters into Strategy
        self._strategy.inject_params(
            n_dim=self._transformer.individual_size(X.shape[1]),
            fitness=self._fitness,
            transformer=self._transformer,
            random_state=self.random_state,
            verbose=self.verbose,
        )

        # Learn transformer weights with the strategy by optimising the fitness functions
        self._strategy.fit(X, y)

        # Fit (fill) transformer with the weights from the best individual
        self._transformer.fit(X, y, self._strategy.best_individual())
        return self
Example #9
  def fit(self, X, y, random_state=np.random):
    """Create constraints from labels and learn the LSML model.

    Parameters
    ----------
    X : (n x d) matrix
        Input data, where each row corresponds to a single instance.

    y : (n) array-like
        Data labels.

    random_state : numpy.random.RandomState, optional
        If provided, controls random number generation.
    """
    X, y = check_X_y(X, y)
    num_constraints = self.num_constraints
    if num_constraints is None:
      num_classes = len(np.unique(y))
      num_constraints = 20 * num_classes**2

    c = Constraints.random_subset(y, self.num_labeled,
                                  random_state=random_state)
    pairs = c.positive_negative_pairs(num_constraints, same_length=True,
                                      random_state=random_state)
    return LSML.fit(self, X, pairs, weights=self.weights)
Example #10
 def fit(self, X, y):
     # Convert data
     X, y = check_X_y(X, y,
                      accept_sparse=("csr", "csc"),
                      multi_output=True,
                      y_numeric=True)
     return self
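
A minimal standalone sketch (not part of the example above) of what check_X_y does for these fit methods: it converts array-likes to NumPy arrays, checks that X and y have consistent lengths, and rejects NaN/inf by default.

import numpy as np
from sklearn.utils.validation import check_X_y

X, y = check_X_y([[1.0, 2.0], [3.0, 4.0]], [0, 1])
print(type(X), X.shape, y.shape)   # <class 'numpy.ndarray'> (2, 2) (2,)

try:
    check_X_y([[1.0, np.nan], [2.0, 3.0]], [0, 1])
except ValueError as exc:
    print("rejected:", exc)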
Example #11
def threshold_fit(X, y, alpha, n_class, mode='AE',
                  max_iter=1000, verbose=False, tol=1e-12):
    """
    Solve the general threshold-based ordinal regression model
    using the logistic loss as surrogate of the 0-1 loss

    Parameters
    ----------
    mode : string, one of {'AE', '0-1', 'SE'}

    """

    X, y = check_X_y(X, y, accept_sparse='csr')
    unique_y = np.sort(np.unique(y))
    if not np.all(unique_y == np.arange(unique_y.size)):
        raise ValueError(
            'Values in y must be %s, got instead %s'
            % (np.arange(unique_y.size), unique_y))
    y = np.asarray(y)  # XXX check its made of integers
    n_samples, n_features = X.shape

    # convert from c to theta
    L = np.zeros((n_class - 1, n_class - 1))
    L[np.tril_indices(n_class-1)] = 1.

    if mode == 'AE':
        # loss forward difference
        loss_fd = np.ones((n_class, n_class - 1))
    elif mode == '0-1':
        loss_fd = np.diag(np.ones(n_class - 1)) + \
            np.diag(np.ones(n_class - 2), k=-1)
        loss_fd = np.vstack((loss_fd, np.zeros(n_class - 1)))
        loss_fd[-1, -1] = 1  # border case
    elif mode == 'SE':
        a = np.arange(n_class-1)
        b = np.arange(n_class)
        loss_fd = np.abs((a - b[:, None])**2 - (a - b[:, None]+1)**2)
    else:
        raise NotImplementedError

    x0 = np.zeros(n_features + n_class - 1)
    x0[X.shape[1]:] = np.arange(n_class - 1)
    options = {'maxiter' : max_iter, 'disp': verbose}
    if n_class > 2:
        bounds = [(None, None)] * (n_features + 1) + \
                 [(0, None)] * (n_class - 2)
    else:
        bounds = None
    sol = optimize.minimize(obj_margin, x0, method='L-BFGS-B',
        jac=grad_margin, args=(X, y, alpha, n_class, loss_fd, L),
        bounds=bounds, options=options, tol=tol)
    if not sol.success:
        print(sol.message)
    w, c = sol.x[:X.shape[1]], sol.x[X.shape[1]:]
    theta = L.dot(c)
    return w, theta
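
A small side check (not from the source) of the c-to-theta conversion above: because L is lower-triangular ones, L.dot(c) is simply the cumulative sum of c, so the thresholds theta are non-decreasing whenever the entries of c after the first are non-negative.

import numpy as np

n_class = 4
L = np.zeros((n_class - 1, n_class - 1))
L[np.tril_indices(n_class - 1)] = 1.
c = np.array([0.5, 1.0, 2.0])
print(L.dot(c))       # [0.5 1.5 3.5]
print(np.cumsum(c))   # identical thresholds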
Example #12
    def check_X_y(self, X, y):
        from sklearn.utils.validation import check_X_y

        if X.shape[0] > self.max_train_size_:
            raise Exception("X_train size cannot exceed {} ({})"
                            .format(self.max_train_size_, X.shape[0]))
        return check_X_y(X, y, multi_output=True,
                         allow_nd=True, y_numeric=True,
                         estimator="GPRNP")
Example #13
    def fit(self, X, y, sample_weight=None):
        # Convert data
        X, y = check_X_y(X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True)
        # Function is only called after we verify that pandas is installed
        from pandas import Series

        if isinstance(sample_weight, Series):
            raise ValueError("Estimator does not accept 'sample_weight'" "of type pandas.Series")
        return self
Example #14
    def fit(self, X, y, sample_weight=None):
        """
        Build a classifier from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples]
            The target values (class labels in classification).

        sample_weight : array-like, shape = [n_samples] or None
            Individual weights for each sample.

        Returns
        -------
        self : object
            Returns self.
        """
        self._validate_params(**self.get_params())

        X, y = check_X_y(X, y, accept_sparse=True)
        if sp.isspmatrix(X):
            self._is_sparse_train_X = True
        else:
            self._is_sparse_train_X = False
        self._n_samples, self._n_features = X.shape
        sample_weight = self._get_sample_weight(sample_weight)
        check_consistent_length(X, y, sample_weight)
        check_classification_targets(y)
        self._classes = sorted(np.unique(y))
        self._n_classes = len(self._classes)
        self._classes_map = {}

        self._set_params_with_dependencies()
        params = self._get_params()

        if self._n_classes == 2:
            self._classes_map[0] = self._classes[0]
            self._classes_map[1] = self._classes[1]
            self._estimators = [None]
            y = (y == self._classes[0]).astype(int)
            self._fit_binary_task(X, y, sample_weight, params)
        elif self._n_classes > 2:
            if sp.isspmatrix_dok(X):
                X = X.tocsr().tocoo()  # Fix to avoid scipy 7699 issue
            self._estimators = [None] * self._n_classes
            self._fit_multiclass_task(X, y, sample_weight, params)
        else:
            raise ValueError("Classifier can't predict when only one class is present.")

        self._fitted = True

        return self
Example #15
    def fit(self, X, y):
        # Check data
        X, y = np.array(X), np.array(y)
        X, y = check_X_y(X, y)
        # Split to grow cascade and validate
        mask = np.random.random(y.shape[0]) < self.validation_fraction
        X_tr, X_vl = X[~mask], X[mask]
        y_tr, y_vl = y[~mask], y[mask]

        self.classes_ = unique_labels(y)
        self.layers_, inp_tr, inp_vl = [], X_tr, X_vl
        self.scores_ = []

        # First layer
        forests = [RandomForestClassifier(max_features=1, n_estimators=self.n_estimators, min_samples_split=10, criterion='gini', n_jobs=-1),  # Complete random
                    RandomForestClassifier(max_features=1, n_estimators=self.n_estimators, min_samples_split=10, criterion='gini', n_jobs=-1),  # Complete random
                    RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=-1),
                    RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=-1)]
        _ = [f.fit(inp_tr, y_tr) for f in forests]
        p_vl = [f.predict_proba(inp_vl) for f in forests]
        labels = [self.classes_[i] for i in np.argmax(np.array(p_vl).mean(axis=0), axis=1)]
        score = self.scoring(y_vl, labels)
        self.layers_.append(forests)
        self.scores_.append(score)
        p_tr = [cross_val_predict(f, inp_tr, y_tr, cv=self.cv, method='predict_proba') for f in forests]

        # Fit other layers
        last_score = score
        inp_tr, inp_vl = np.concatenate([X_tr]+p_tr, axis=1), np.concatenate([X_vl]+p_vl, axis=1)
        while True:  # Grow cascade
            forests = [RandomForestClassifier(max_features=1, n_estimators=self.n_estimators, min_samples_split=10, criterion='gini', n_jobs=-1),  # Complete random
                    RandomForestClassifier(max_features=1, n_estimators=self.n_estimators, min_samples_split=10, criterion='gini', n_jobs=-1),  # Complete random
                    RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=-1),
                    RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=-1)]
            _ = [forest.fit(inp_tr, y_tr) for forest in forests] # Fit the forest
            p_vl = [forest.predict_proba(inp_vl) for forest in forests]
            labels = [self.classes_[i] for i in np.argmax(np.array(p_vl).mean(axis=0), axis=1)]
            score = self.scoring(y_vl, labels)

            if score - last_score > self.tolerance:
                self.layers_.append(forests)
                p_tr = [cross_val_predict(f, inp_tr, y_tr, cv=self.cv, method='predict_proba') for f in forests]
                inp_tr, inp_vl = np.concatenate([X_tr]+p_tr, axis=1), np.concatenate([X_vl]+p_vl, axis=1)
                self.scores_.append(score)
                last_score = score
                print(self.scores_)
            else:
                break
        # Retrain on entire dataset
        inp_ = X
        for forests in self.layers_:
            _ = [f.fit(inp_, y) for f in forests]
            p = [cross_val_predict(f, inp_, y, cv=self.cv, method='predict_proba') for f in forests]
            inp_ = np.concatenate([X]+p, axis=1)
        return self
Example #16
    def fit(self, X, y):
        check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo', 'dok',
                        'bsr', 'lil', 'dia'])
        check_array(X, accept_sparse=['csc', 'csr', 'coo', 'dok',
                        'bsr', 'lil', 'dia'])
        self.X_ = X
        check_classification_targets(y)
        classes = np.unique(y[np.nonzero(y)])

        n_samples, n_classes = len(y), len(classes)
        # create diagonal matrix of degree of nodes
        if sparse.isspmatrix(self.X_):
            B_ = self.X_.copy().astype(float)
            D = np.array(csr_matrix.sum(self.X_, axis=1), dtype=float).T[0]
        else:
            B_ = np.copy(self.X_).astype(float)
            D = np.array(np.sum(self.X_, axis=1), dtype=float)

        # if -sigma and (sigma - 1) are not equal, the left and right diagonal matrices differ
        if (- self.sigma) == (self.sigma - 1):
            D_left = D_right = np.power(D, - self.sigma)
        else:
            D_left = np.power(D, - self.sigma)
            D_right = np.power(D, self.sigma - 1)

        # M_ = D_left.dot(B_)
        for i, d in enumerate(D_left):
            B_[i, :] *= d
        # B_ = M_.dot(D_right)
        for i, d in enumerate(D_right):
            B_[:, i] *= d
        # create labeled data Z
        dimension = (n_samples, n_classes)
        labels = np.nonzero(y)
        ans_y = np.zeros(dimension)
        for l in labels[0]:
            ans_y[l][y[l] - 1] = 1

        Z_ = (self.sigma / (1 + self.sigma)) * ans_y
        self.initial_vector_ = np.ones(dimension) / n_classes
        self._get_method_(B_, Z_)
        return self
Example #17
def path_calc(X, y, X_holdout, y_holdout, alphas, paramgrid, colname = 'CV', yname = '', method = 'Elastic Net'):
    #make a copy of the parameters before popping things off
    copy_params = copy.deepcopy(paramgrid)
    fit_intercept = copy_params.pop('fit_intercept')
    precompute = copy_params.pop('precompute')
    copy_X = copy_params.pop('copy_X')
    normalize = False

    # this code adapted from sklearn ElasticNet fit function, which unfortunately doesn't accept multiple alphas at once
    X, y = check_X_y(X, y, accept_sparse='csc',
                     order='F', dtype=[np.float64, np.float32],
                     copy=copy_X and fit_intercept,
                     multi_output=True, y_numeric=True)
    y = check_array(y, order='F', copy=False, dtype=X.dtype.type,
                    ensure_2d=False)

    #this is the step that gives the data to find intercept if fit_intercept is true.
    X, y, X_offset, y_offset, X_scale, precompute, Xy = _pre_fit(X, y, None, precompute, normalize,
                                                                 fit_intercept, copy=False)
    y = np.squeeze(y)

    #do the path calculation, and tell how long it took
    print('Calculating path...')
    start_t = time.time()
    if method == 'Elastic Net':
        path_alphas, path_coefs, path_gaps, path_iters = enet_path(X, y, alphas=alphas, return_n_iter = True,
                                                   **copy_params)
    if method == 'LASSO':
        path_alphas, path_coefs, path_gaps, path_iters = lasso_path(X, y, alphas=alphas, return_n_iter=True,
                                                                   **copy_params)
    dt = time.time() - start_t
    print('Took ' + str(dt) + ' seconds')

    #create some empty arrays to store the result
    y_pred_holdouts = np.empty(shape=(len(alphas),len(y_holdout)))
    intercepts = np.empty(shape=(len(alphas)))
    rmses = np.empty(shape=(len(alphas)))
    cvcols = []
    for j in list(range(len(path_alphas))):

        coef_temp = path_coefs[:, j]

        if fit_intercept:
            coef_temp = coef_temp / X_scale
            intercept = y_offset - np.dot(X_offset, coef_temp.T)
        else:
            intercept = 0.

        y_pred_holdouts[j,:] = np.dot(X_holdout, path_coefs[:, j]) + intercept
        intercepts[j] = intercept
        rmses[j] = RMSE(y_pred_holdouts[j,:], y_holdout)
        cvcols.append(('predict','"'+ method + ' - ' + yname + ' - ' + colname + ' - Alpha:' + str(path_alphas[j]) + ' - ' + str(paramgrid) + '"'))

    return path_alphas, path_coefs, intercepts, path_iters, y_pred_holdouts, rmses, cvcols
Example #18
    def fit(self, X, y):
        """Fit the RVR to the training data."""
        X, y = check_X_y(X, y)

        n_samples, n_features = X.shape

        self.phi = self._apply_kernel(X, X)

        n_basis_functions = self.phi.shape[1]

        self.relevance_ = X
        self.y = y

        self.alpha_ = self.alpha * np.ones(n_basis_functions)
        self.beta_ = self.beta

        self.m_ = np.zeros(n_basis_functions)

        self.alpha_old = self.alpha_

        for i in range(self.n_iter):
            self._posterior()

            self.gamma = 1 - self.alpha_*np.diag(self.sigma_)
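            # gamma_i = 1 - alpha_i * Sigma_ii measures how well-determined weight m_i is;
            # the next line is the standard RVM alpha re-estimation (Tipping 2001)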
            self.alpha_ = self.gamma/(self.m_ ** 2)

            if not self.beta_fixed:
                self.beta_ = (n_samples - np.sum(self.gamma))/(
                    np.sum((y - np.dot(self.phi, self.m_)) ** 2))

            self._prune()

            if self.verbose:
                print("Iteration: {}".format(i))
                print("Alpha: {}".format(self.alpha_))
                print("Beta: {}".format(self.beta_))
                print("Gamma: {}".format(self.gamma))
                print("m: {}".format(self.m_))
                print("Relevance Vectors: {}".format(self.relevance_.shape[0]))
                print()

            delta = np.amax(np.absolute(self.alpha_ - self.alpha_old))

            if delta < self.tol and i > 1:
                break

            self.alpha_old = self.alpha_

        if self.bias_used:
            self.bias = self.m_[-1]
        else:
            self.bias = None

        return self
Example #19
 def fit(self, X, y):
     X, y = check_X_y(X, y)
     self.classes_ = unique_labels(y)
     self.X_ = DynamicBayesianClassifier._first_col(X)
     self.y_ = y
     self.size_ = self.X_.size
     for i in range(self.X_.size):
         if y[i] not in self.dbayesmode_major_.keys():
             self.dbayesmode_major_[y[i]] = scalgoutil.DBayesMode(y[i])
         self.dbayesmode_major_[y[i]].update(self.X_[i])
         self.update_priors()
     return self
Example #20
    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values, must be binary

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes; got %d class (only class %s "
                             "is present)"
                             % (self.n_classes_, self.classes_[0]))
        if self.n_classes_ > 2:
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean(
                [estimator.log_marginal_likelihood()
                 for estimator in self.base_estimator_.estimators_])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self
Example #21
    def fit(self, x, y):
        # y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self
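
A minimal sketch (illustrative data only; the wrapping class with C, dual and n_jobs is omitted) of the naive-Bayes log-count ratio r computed above, on a tiny sparse bag-of-words matrix.

import numpy as np
from scipy import sparse

x = sparse.csr_matrix(np.array([[1, 0, 2],
                                [0, 1, 0],
                                [3, 0, 1],
                                [0, 2, 0]], dtype=float))
y = np.array([1, 0, 1, 0])

def pr(x, y_i, y):
    p = x[y == y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)

r = np.log(pr(x, 1, y) / pr(x, 0, y))   # positive entries mark features indicative of class 1
print(np.asarray(r))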
Example #22
    def fit(self, X, y):
        """
        Train the Logistic model, X and y are numpy arrays.
        """
        X, y = check_X_y(X, y) 
        #, accept_sparse=['csr', 'csc']) # not sure how to handle sparse
        self.classes_, y = np.unique(y, return_inverse=True)

        if self.fit_intercept:
            X = np.insert(X, 0, 1, axis=1)

        w0 = np.zeros(X.shape[1])

        if self.bounds is None:
            self.bounds_ = [(None, None) for v in w0]
        elif isinstance(self.bounds, tuple) and len(self.bounds) == 2:
            self.bounds_ = [self.bounds for v in w0]
        elif self.fit_intercept and len(self.bounds) == len(w0) - 1:
            self.bounds_ = np.concatenate(([(None, None)], self.bounds))
        else:
            self.bounds_ = self.bounds
        if len(self.bounds_) != len(w0):
            raise ValueError("Bounds must be the same length as the coef")

        if isinstance(self.l2, Number):
            self.l2_ = [self.l2 for v in w0]
        elif self.fit_intercept and len(self.l2) == len(w0) - 1:
            self.l2_ = np.insert(self.l2, 0, 0)
        else:
            self.l2_ = self.l2
        if len(self.l2_) != len(w0):
            raise ValueError("L2 penalty must be the same length as the coef, be sure the intercept is accounted for.")

        # the intercept should never be regularized.
        if self.fit_intercept:
            self.l2_[0] = 0.0

        w = minimize(_ll, w0, args=(X, y, self.l2_),
                               jac=_ll_grad, 
                               method=self.method, bounds=self.bounds_,
                               options={'maxiter': self.max_iter, 
                                        #'disp': True
                               })['x']

        if self.fit_intercept:
            self.intercept_ = w[0:1]
            self.coef_ = w[1:]
        else:
            self.intercept_ = np.array([])
            self.coef_ = w
        return self
Example #23
 def fit(self, X, y):
   self.X_, y = check_X_y(X, y, dtype=float)
   labels = MulticlassLabels(y)
   self._lmnn = shogun_LMNN(RealFeatures(self.X_.T), labels, self.k)
   self._lmnn.set_maxiter(self.max_iter)
   self._lmnn.set_obj_threshold(self.convergence_tol)
   self._lmnn.set_regularization(self.regularization)
   self._lmnn.set_stepsize(self.learn_rate)
   if self.use_pca:
     self._lmnn.train()
   else:
     self._lmnn.train(np.eye(X.shape[1]))
   self.L_ = self._lmnn.get_linear_transform()
   return self
Example #24
    def fit(self,X,y):
        X, y = check_X_y(X,y,multi_output=True)
        self.reshape(X)  # compute self.XX_
        self.y_ = y
        self.nn_ = define_model_all(shape = self.shape_,\
                                    n_feat=self.n_feat_, filter_size= self.filter_size_,\
                                    nhid1=self.nhid1_, nhid2 = self.nhid2_,\
                                    pool_size = self.pool_size_,lr=self.lr_)

  
        self.history_ = self.nn_.fit(self.XX_,self.y_,batch_size=self.batch_size_,\
                                  nb_epoch=self.nb_epoch_,\
                                  validation_split=self.validation_split_,verbose=0)
    
        return self
Example #25
    def reduce_data(self, X, y):
        X, y = check_X_y(X, y, accept_sparse="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)

        return self.X_, self.y_
Example #26
    def fit(self,X,y):
        X, y = check_X_y(X, y, multi_output=True, y_numeric=True, force_all_finite=False)
        if self.use_mcmc:
            self.mcmc = pymc.MCMC(self.lasso_model(X, y, self.sigma2))
            self.mcmc.sample(self.mcmc_trials, self.mcmc_burn, 2)
            self.num_betas = X.shape[1]

            traces = []
            for i in range(self.num_betas):
                traces.append(self.mcmc.trace('beta_{}'.format(i))[:])

            self.coef_ = np.array([np.mean(trace) for trace in traces])
        else:
            self._map = pymc.MAP(self.lasso_model(X, y, self.sigma2))
            self._map.fit()
            self.coef_ = np.array([beta.value for beta in self._map.betas])
        return self
Example #27
    def predict_logpdf(self, X, y, nsamples=200, likelihood_args=()):
        r"""
        Predictive log-probability density function of a Bayesian GLM.

        Parameters
        ----------
        X : ndarray
            (N*,d) array query input dataset (N* samples, D dimensions).
        y : float or ndarray
            The test observations of shape (N*,) to evaluate under,
            :math:`\log p(y^* |\mathbf{x}^*, \mathbf{X}, y)`.
        nsamples : int, optional
            Number of samples for sampling the log predictive distribution.
        likelihood_args : sequence, optional
            sequence of arguments to pass to the likelihood function. These are
            non-learnable parameters. They can be scalars or arrays of length
            N*.

        Returns
        -------
        logp : ndarray
           The log probability of y* given X* of shape (N*,).
        logp_min : ndarray
            The minimum sampled values of the predicted log probability (same
            shape as p)
        logp_max : ndarray
            The maximum sampled values of the predicted log probability (same
            shape as p)
        """
        X, y = check_X_y(X, y)

        # Get latent function samples
        N = X.shape[0]
        ps = np.empty((N, nsamples))
        fsamples = self._sample_func(X, nsamples)

        # Push samples though likelihood pdf
        llargs = tuple(chain(atleast_list(self.like_hypers_), likelihood_args))
        for i, f in enumerate(fsamples):
            ps[:, i] = self.likelihood.loglike(y, f, *llargs)

        # Average transformed samples (MC integration)
        logp = ps.mean(axis=1)
        logp_min = ps.min(axis=1)
        logp_max = ps.max(axis=1)

        return logp, logp_min, logp_max
Example #28
    def fit(self, X, y):
        X, y = check_X_y(X, y,
                         accept_sparse=("csr", "csc", "coo"),
                         accept_large_sparse=True,
                         multi_output=True,
                         y_numeric=True)
        if sp.issparse(X):
            if X.getformat() == "coo":
                if X.row.dtype == "int64" or X.col.dtype == "int64":
                    raise ValueError(
                        "Estimator doesn't support 64-bit indices")
            elif X.getformat() in ["csc", "csr"]:
                if X.indices.dtype == "int64" or X.indptr.dtype == "int64":
                    raise ValueError(
                        "Estimator doesn't support 64-bit indices")

        return self
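
A small sketch (not from the source) of how one might make sparse input acceptable to an estimator like the one above that rejects 64-bit indices: downcast the index arrays to int32 before calling fit.

import numpy as np
from scipy import sparse

X = sparse.random(10, 5, density=0.3, format="csr", random_state=0)
X.indices = X.indices.astype(np.int32)
X.indptr = X.indptr.astype(np.int32)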
Example #29
    def fit(self, X, y):
        """A reference implementation of a fitting function

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.
        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (class labels in classification, real numbers in
            regression).
        Returns
        -------
        self : object
            Returns self.
        """
        X, y = check_X_y(X, y)
        return self
Example #30
    def fit(self, X, y):
        """Fit the model.

        Args:
            X (ndarray): Training data of shape ``(n_samples, n_features)``.
            y (ndarray): Target values of shape ``(n_samples,)``.

        Returns:
            self

        Raises:
            FitError: If the fitting failed.
        """
        X, y = check_X_y(X, y, y_numeric=True)
        C = self.C
        cost_func, cost_opts = self._check_cost_func(
            self.cost_func, self.cost_opts)
        reg_cost_func, reg_cost_opts = self._check_cost_func(
            self.reg_cost_func, self.reg_cost_opts)

        # add a column of ones to X (for intercept coefficient)
        X = np.hstack((np.ones((X.shape[0], 1), dtype=float), X))

        def objective(W):
            # compute training cost/grad
            cost, outer_grad = cost_func(np.dot(X, W) - y, **cost_opts)
            grad = np.dot(outer_grad, X)  # chain rule

            # add regularization cost/grad (but don't regularize intercept)
            reg_cost, reg_grad = reg_cost_func(W[1:], **reg_cost_opts)
            cost += C * reg_cost
            grad[1:] += C * reg_grad

            return cost, grad

        initial_coef_ = np.zeros(X.shape[1])
        res = scipy.optimize.minimize(
            objective, initial_coef_, jac=True, method='L-BFGS-B')
        if res.success:
            self.coef_ = res.x
        else:
            raise FitError("Fit failed: {}".format(res.message), res=res)

        return self
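
A minimal sketch of a cost function matching the interface the objective above assumes: cost_func receives the residual vector X @ W - y and returns a (cost, gradient-with-respect-to-the-residuals) pair. The quadratic form below is illustrative only, not part of the source.

import numpy as np

def squared_error(residuals):
    # 0.5 * sum of squared residuals; its gradient w.r.t. the residuals is the residual vector itself
    cost = 0.5 * np.sum(residuals ** 2)
    grad = residuals
    return cost, grad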
Example #31
 def fit(self, X, y):
     """Calculates the hash of X_train"""
     check_X_y(X, y, estimator=self)
     self.X_hash_ = self._hash(X)
     self.dim_ = X.shape[1]
     return self
Example #32
    def fit(self, X, y):
        """Fit the model using X and y as training data.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            Training data.

        y : numpy array of shape (n_samples,)
            The ground truth (binary label)

            - 0 : inliers
            - 1 : outliers

        Returns
        -------
        self : object
        """

        # Validate inputs X and y
        X, y = check_X_y(X, y)
        X = check_array(X)
        self._set_n_classes(y)
        self.n_detector_ = self._validate_estimator(X)
        self.X_train_add_ = np.zeros([X.shape[0], self.n_detector_])
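        # one column per base detector; filled below with each detector's outlier scores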

        # keep the standardization scalar for test conversion
        X_norm, self._scalar = standardizer(X, keep_scalar=True)

        for ind, estimator in enumerate(self.estimator_list):
            if self.standardization_flag_list[ind]:
                estimator.fit(X_norm)
                self.X_train_add_[:, ind] = estimator.decision_scores_

            else:
                estimator.fit(X)
                self.X_train_add_[:, ind] = estimator.decision_scores_

        # construct the new feature space
        self.X_train_new_ = np.concatenate((X, self.X_train_add_), axis=1)

        # initialize, train, and predict on XGBoost
        self.clf_ = clf = XGBClassifier(
            max_depth=self.max_depth,
            learning_rate=self.learning_rate,
            n_estimators=self.n_estimators,
            silent=self.silent,
            objective=self.objective,
            booster=self.booster,
            n_jobs=self.n_jobs,
            nthread=self.nthread,
            gamma=self.gamma,
            min_child_weight=self.min_child_weight,
            max_delta_step=self.max_delta_step,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            colsample_bylevel=self.colsample_bylevel,
            reg_alpha=self.reg_alpha,
            reg_lambda=self.reg_lambda,
            scale_pos_weight=self.scale_pos_weight,
            base_score=self.base_score,
            random_state=self.random_state,
            missing=self.missing,
            **self.kwargs)
        self.clf_.fit(self.X_train_new_, y)
        self.decision_scores_ = self.clf_.predict_proba(self.X_train_new_)[:,
                                                                           1]
        self.labels_ = self.clf_.predict(self.X_train_new_).ravel()

        return self
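
A minimal usage sketch consistent with the test setup in Example #3 (pyod's XGBOD with its synthetic-data helper); it is not taken from the source and assumes xgboost is installed.

from pyod.models.xgbod import XGBOD
from pyod.utils.data import generate_data

X_train, y_train = generate_data(train_only=True)   # synthetic outlier data, as in Example #3
clf = XGBOD(random_state=42)
clf.fit(X_train, y_train)
print(clf.decision_scores_[:5], clf.labels_[:5])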
Example #33
    def fit(self, X, y):
        """Fit a semi-supervised label propagation model based

        All the input data is provided matrix X (labeled and unlabeled)
        and corresponding label matrix y with a dedicated marker value for
        unlabeled samples.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            A {n_samples by n_samples} size matrix will be created from this

        y : array_like, shape = [n_samples]
            n_labeled_samples (unlabeled points are marked as -1)
            All unlabeled samples will be transductively assigned labels

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y)
        self.X_ = X
        check_classification_targets(y)

        # actual graph construction (implementations should override this)
        graph_matrix = self._build_graph()

        # label construction
        # construct a categorical distribution for classification only
        classes = np.unique(y)
        classes = (classes[classes != -1])
        self.classes_ = classes

        n_samples, n_classes = len(y), len(classes)

        alpha = self.alpha
        if self._variant == 'spreading' and \
                (alpha is None or alpha <= 0.0 or alpha >= 1.0):
            raise ValueError('alpha=%s is invalid: it must be inside '
                             'the open interval (0, 1)' % alpha)
        y = np.asarray(y)
        unlabeled = y == -1
        # initialize distributions
        self.label_distributions_ = np.zeros((n_samples, n_classes))

        V = self.X_.shape[0]
        for label in classes:
            self.label_distributions_[y == label, classes == label] = 1
        y_static = np.copy(self.label_distributions_)
        if self._variant == 'propagation':
            # LabelPropagation
            y_static[unlabeled] = 0
        else:
            # LabelSpreading
            y_static *= 1 - alpha

        l_previous = np.zeros((self.X_.shape[0], n_classes))

        unlabeled = unlabeled[:, np.newaxis]
        if sparse.isspmatrix(graph_matrix):
            graph_matrix = graph_matrix.tocsr()

        import queue

        def D_theta(p_uv):
            # sample edge transmission: length 1 with probability p_uv, otherwise blocked
            draw = np.random.multinomial(1, [p_uv, 1 - p_uv], size=1)
            return 1 if draw[0][0] == 0 else np.inf

        def q_u(y_v):
            return 1.0 / y_v

        for self.n_iter_ in range(self.max_iter):
            if np.abs(self.label_distributions_ - l_previous).sum() < self.tol:
                break
            l_previous = self.label_distributions_
            pq = queue.PriorityQueue()
            dist = np.full(V, np.inf)  # shortest infection distance per node
            for j in np.argwhere(unlabeled == False)[:, 0]:
                dist[j] = 0
                pq.put((dist[j], j))
            while not pq.empty():
                dist_v, v = pq.get()
                for u in range(V):
                    delta_uv = D_theta(graph_matrix[u, v])
                    if delta_uv == np.inf:  # not infected
                        continue
                    alt = dist[v] + graph_matrix[u, v] + q_u(y[v])
                    if alt < dist[u]:
                        dist[u] = alt
                        y[u] = y[v]  # u inherits label from parent v
                        self.label_distributions_[u, y[v]] += 1
                        pq.put((dist[u], u))
        else:
            warnings.warn(
                'max_iter=%d was reached without convergence.' % self.max_iter,
                category=ConvergenceWarning
            )
            self.n_iter_ += 1

        normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
        self.label_distributions_ /= normalizer

        # set the transduction item
        transduction = self.classes_[np.argmax(self.label_distributions_,
                                               axis=1)]
        self.transduction_ = transduction.ravel()
        return self
Example #34
    def fit(self, X, y):
        """
        Kernelizes passed data and then fits data according to passed
        model.  Function inherits all attributes and features of
        SKLearn's base esimator class as well as passed model.  As part
        of fit process, feature data is kernelized (based on instance
        kernel parameter) and normalized -- should parameterize
        normalization or functionalize both together outside `fit`.

        __Parameters__

        > __X__ : ndarray of shape (n_samples, n_features)
        >- Training data
        >
        > __y__ : ndarray of shape (n_samples, spatial dimensions)
        >- Response data (location of Tx for each sample set
        >  of measurements)

        __Returns__

        > Self, sets self.X_, self.Y_

        """

        # Check that X and y have correct shape
        X, y = check_X_y(X, y, multi_output=True)
        # Check that number of kernels and number of kernel scales is same
        if self.n_kernels != len(self.n_meas_array):
            raise ValueError("n_kernels is not same as number of n_meas_array")
        # Check that number of each measurement types is correct
        if sum(self.n_meas_array) != X.shape[1]:
            raise ValueError(
                "Sum of n_meas_array is not same as number of features in X")

        #put lambdau into ndarray
        self.lambdau = np.array([self.lambdau])
        #put kernel scales together (reset in case called multiple times)
        kernel_scales = np.array([self.kernel_s0])
        for i in range(1, self.n_kernels):
            kernel_scales = np.append(kernel_scales,
                                      self.get_params()["kernel_s" + str(i)])

        # Generate kernelized matrix for fit input
        X_kernel = HFF_k_matrix(fml=X,
                                kernel=self.skl_kernel,
                                num_meas_array=self.n_meas_array,
                                varMs=kernel_scales)
        #normalize
        X_kernel = Normalizer().fit_transform(X_kernel)

        # Fit
        self.glmnet_model = glmnet(x=X_kernel,
                                   y=y.copy(),
                                   alpha=self.glm_alpha,
                                   lambdau=self.lambdau,
                                   **self.glmnet_args)

        # Store X,y seen during fit
        self.X_ = X
        self.y_ = y

        # Return the regressor
        return self
Example #35
def partial_fit(self, X, y=None, **fit_params):
    """ A wrapper around the partial_fit function.

    Parameters
    ----------
    X : xarray DataArray, Dataset or other array-like
        The input samples.

    y : xarray DataArray, Dataset or other array-like
        The target values.
    """

    if self.estimator is None:
        raise ValueError("You must specify an estimator instance to wrap.")

    if is_target(y):
        y = y(X)

    if is_dataarray(X):

        if not hasattr(self, "type_"):
            self.type_ = "DataArray"
            self.estimator_ = self._fit(X, y, **fit_params)
        elif self.type_ == "DataArray":
            self.estimator_ = self._partial_fit(self.estimator_, X, y,
                                                **fit_params)
        else:
            raise ValueError(
                "This wrapper was not fitted for DataArray inputs.")

        # TODO: check if this needs to be removed for compat wrappers
        for v in vars(self.estimator_):
            if v.endswith("_") and not v.startswith("_"):
                setattr(self, v, getattr(self.estimator_, v))

    elif is_dataset(X):

        if not hasattr(self, "type_"):
            self.type_ = "Dataset"
            self.estimator_dict_ = {
                v: self._fit(X[v], y, **fit_params)
                for v in X.data_vars
            }
        elif self.type_ == "Dataset":
            self.estimator_dict_ = {
                v: self._partial_fit(self.estimator_dict_[v], X[v], y,
                                     **fit_params)
                for v in X.data_vars
            }
        else:
            raise ValueError("This wrapper was not fitted for Dataset inputs.")

        # TODO: check if this needs to be removed for compat wrappers
        for e_name, e in self.estimator_dict_.items():
            for v in vars(e):
                if v.endswith("_") and not v.startswith("_"):
                    if hasattr(self, v):
                        getattr(self, v).update({e_name: getattr(e, v)})
                    else:
                        setattr(self, v, {e_name: getattr(e, v)})

    else:

        if not hasattr(self, "type_"):
            self.type_ = "other"
            if y is None:
                X = check_array(X)
            else:
                X, y = check_X_y(X, y)
            self.estimator_ = clone(self.estimator).fit(X, y, **fit_params)
        elif self.type_ == "other":
            self.estimator_ = self.estimator_.partial_fit(X, y, **fit_params)
        else:
            raise ValueError("This wrapper was not fitted for other inputs.")

        # TODO: check if this needs to be removed for compat wrappers
        for v in vars(self.estimator_):
            if v.endswith("_") and not v.startswith("_"):
                setattr(self, v, getattr(self.estimator_, v))

    return self
Example #36
    def fit(self, training_data, training_labels):

        X, y = check_X_y(training_data, training_labels)
        training_data = training_data.T

        self.training_data = training_data
        self.training_labels = training_labels
        self.fstar = np.zeros(
            training_data.shape
        )  # selected features for each representative point; if fstar(i, j) = 1, ith feature is selected for jth representative point.
        self.fstar_lin = np.zeros(
            training_data.shape
        )  # fstar before applying randomized rounding process

        m_features, n_observations = training_data.shape  # (M, N) M number of candidate features, N observations
        n_total_cls = [np.sum(training_labels ^ 1),
                       np.sum(training_labels)
                       ]  # Total number of each class in our training data

        overall_feasibility = np.zeros((n_observations, self.n_beta))
        overall_radious = np.zeros((n_observations, self.n_beta))
        overall_b_ratio = np.zeros((n_observations, self.n_beta))
        tb_temp = np.zeros((m_features, n_observations, self.n_beta))
        tr_temp = np.zeros((m_features, n_observations, self.n_beta))

        for z in range(0, self.tau):

            # For each feature across all observations in our training set we calculate a [0,1] fstar value
            for i_observation in range(0, n_observations):

                current_observation = training_data[:, i_observation][...,
                                                                      None]
                selected_label = training_labels[i_observation]
                matching_label = selected_label
                nonmatching_label = selected_label ^ 1

                excluding_selected = np.ones(
                    (1, n_observations),
                    dtype=bool)[0]  # mask for all observations not at i
                excluding_selected[
                    i_observation] = False  # deselect the i'th observations

                excluded_labels = training_labels[
                    excluding_selected]  # get the labels not at i, (convert to bool so we can use it as a mask)
                n_excluded_class = [
                    np.sum(excluded_labels ^ 1),
                    np.sum(excluded_labels)
                ]

                fstar_mask = self.fstar.astype(
                    bool
                )  # fstar mask to select all active features for this observation

                # Calculate the difference between this observation and all other observations
                observation_distances = (training_data -
                                         current_observation)**2

                # adjust weighting for each observation based on if we've previously found fstar values for the features
                observation_weight = np.zeros((n_observations, n_observations))

                for i in range(n_observations):
                    fstar_feature_dist_0 = (
                        np.sqrt(
                            np.sum(
                                observation_distances[:, excluding_selected &
                                                      (training_labels == 0)] *
                                fstar_mask[:, i][..., None],
                                axis=0))
                    )  # total distance along features selected by fstar
                    fstar_feature_dist_1 = (
                        np.sqrt(
                            np.sum(
                                observation_distances[:, excluding_selected &
                                                      (training_labels == 1)] *
                                fstar_mask[:, i][..., None],
                                axis=0))
                    )  # total distance along features selected by fstar
                    w11 = np.exp(
                        (-(fstar_feature_dist_1 - np.min(fstar_feature_dist_1))
                         **2) / self.sigma)
                    w22 = np.exp(
                        (-(fstar_feature_dist_0 - np.min(fstar_feature_dist_0))
                         **2) / self.sigma)
                    observation_weight[i, excluding_selected &
                                       (training_labels == 0)] = w22
                    observation_weight[i, excluding_selected &
                                       (training_labels == 1)] = w11
                    # observation_weight[i, excluding_selected] = np.concatenate((w22, w11))

                average_observation_weight = np.mean(
                    observation_weight,
                    axis=0)  # mean weight of all observations or features
                normalized_weight = np.zeros(
                    (1, n_observations)
                )  # normalized weight for all observations not including the current observation
                normalized_weight[:, training_labels ==
                                  0] = average_observation_weight[
                                      training_labels == 0] / np.sum(
                                          average_observation_weight[
                                              training_labels == 0])
                normalized_weight[:, training_labels ==
                                  1] = average_observation_weight[
                                      training_labels == 1] / np.sum(
                                          average_observation_weight[
                                              training_labels == 1])

                # Find the average weighted difference for each feature
                average_feature_distances = [0, 0]
                average_feature_distances[matching_label] = np.sum(
                    normalized_weight[0, excluding_selected &
                                      (training_labels == matching_label)] *
                    observation_distances[:, excluding_selected &
                                          (training_labels == matching_label)],
                    axis=1) / (n_total_cls[matching_label] - 1)
                average_feature_distances[nonmatching_label] = np.sum(
                    normalized_weight[0, excluding_selected &
                                      (training_labels == nonmatching_label)] *
                    observation_distances[:, excluding_selected & (
                        training_labels == nonmatching_label)],
                    axis=1) / n_total_cls[nonmatching_label]

                A_ub_0 = np.concatenate(
                    (np.ones((1, m_features)), -np.ones((1, m_features))),
                    axis=0
                )  # The inequality constraint matrix. Each row of A_ub specifies the coefficients of a linear inequality constraint on x.
                b_ub_0 = np.array(
                    [[self.alpha], [-1]]
                )  # The inequality constraint vector. Each element represents an upper bound on the corresponding value of A_ub @ x.

                linprog_res_0 = linprog(
                    -average_feature_distances[nonmatching_label],
                    A_ub=A_ub_0,
                    b_ub=b_ub_0,
                    bounds=(0, 1),
                    method='interior-point',
                    options={
                        'tol': 0.000001,
                        'maxiter': 200
                    })  # This is secretly a maximization function

                if linprog_res_0.success:
                    epsilon_max = -linprog_res_0.fun

                    for i_beta in range(
                            0, self.n_beta
                    ):  # beta is kind of the granularity or resolution, higher = better estimation?
                        beta = np.round(1 / self.n_beta * (i_beta + 1),
                                        decimals=15)
                        epsilon = beta * epsilon_max

                        A_ub_1 = np.vstack(
                            (np.ones((1, m_features)), -np.ones(
                                (1, m_features)),
                             -average_feature_distances[nonmatching_label]))
                        b_ub_1 = np.vstack(
                            (self.alpha, -1, -epsilon))  # b1 TODO: Rename

                        linprog_res_1 = linprog(
                            average_feature_distances[matching_label],
                            A_ub=A_ub_1,
                            b_ub=b_ub_1,
                            bounds=(0.0, 1.0),
                            method='interior-point',
                            options={
                                'tol': 1e-6,
                                'maxiter': 200
                            })

                        if linprog_res_1.success:
                            # Only read the LP solution once the solver has converged
                            class_estimations = linprog_res_1.x[..., None]

                            # Random rounding: turn the fractional LP solution into
                            # binary feature selections.
                            if self.rr_seed is not None:
                                np.random.seed(seed=self.rr_seed)

                            random_numbers = np.random.rand(m_features, self.nrrp)

                            # Each column is one randomized rounding: feature j is selected
                            # with probability class_estimations[j].
                            requires_adjustment = random_numbers <= class_estimations
                            # Collapse duplicate roundings; every unique column is one
                            # candidate binary feature subset ("option").
                            unique_options = np.unique(requires_adjustment, axis=1)
                            n_options = unique_options.shape[1]

                            # Not every candidate subset is feasible; these are filled in
                            # as feasible options are found.
                            option_feasabilities = np.zeros((1, n_options))[0]
                            option_radiuses = np.zeros((1, n_options))[0]
                            option_distance_within = np.inf * np.ones((1, n_options))[0]

                            dr = np.zeros((1, n_options))
                            far = np.zeros((1, n_options))

                            # Each option is one candidate binary feature subset.
                            # Try every option and keep the one that best fits.
                            for i_option, option in enumerate(unique_options.T):
                                # The option must satisfy all LP constraints
                                # (at least one and at most alpha features selected)
                                if np.sum(A_ub_1 @ option > b_ub_1[:, 0]) == 0:
                                    # ... and must select at least one feature
                                    if np.sum(option) > 0:

                                        # Feasible option: both criteria are met
                                        option_feasabilities[i_option] = 1
                                        # Training data restricted to the features
                                        # selected by this option (features are rows)
                                        representative_points = training_data[option, :]
                                        # Weighted feature distance of the selected features
                                        option_distance_within[i_option] = (
                                            average_feature_distances[matching_label] @ option)
                                        active_point = representative_points[:, i_observation][..., None]
                                        # Distance between the active point and every
                                        # representative point in the selected subspace
                                        rep_distances = np.abs(
                                            np.sqrt(np.sum((-representative_points + active_point)**2, 0)))
                                        # Unique distances in ascending order
                                        # (np.sort replaces the deprecated np.msort)
                                        unique_distances = np.sort(np.unique(rep_distances))

                                        # Grow the distance threshold until, proportionally, more
                                        # dissimilar than similar observations fall inside it
                                        for i_distance, distance in enumerate(unique_distances):
                                            radious = distance
                                            # Representative points no farther away than the threshold
                                            observations_within_distance = rep_distances <= distance
                                            n_cls_within = [
                                                # class-0 observations within the threshold
                                                np.sum((training_labels ^ 1)[observations_within_distance]),
                                                # class-1 observations within the threshold
                                                np.sum(training_labels[observations_within_distance]),
                                            ]
                                            # Subtract 1: the selected point itself is not counted
                                            n_cls_within[selected_label] -= 1

                                            # Require that, proportionally, fewer same-class points
                                            # than opposite-class points lie within this distance
                                            if self.gamma * (n_cls_within[selected_label] /
                                                             (n_total_cls[selected_label] - 1)) < (
                                                    n_cls_within[selected_label ^ 1] /
                                                    n_total_cls[selected_label ^ 1]):

                                                if i_distance > 0:
                                                    # Use the midpoint between this and the previous
                                                    # distance as the radius ...
                                                    radious = 0.5 * (radious + unique_distances[i_distance - 1])
                                                    # ... and count the boundary point back in
                                                    n_cls_within[selected_label] += 1

                                                # A radius of 0 means the nearest point coincides with
                                                # the selected point, so pad it slightly
                                                if radious == 0:
                                                    radious = 0.000001

                                                option_radiuses[i_option] = radious
                                                # Points that fall inside the final radius
                                                observations_within_radious = rep_distances <= radious
                                                rep_points_within_radious = representative_points[
                                                    :, observations_within_radious == 1]
                                                classes_within_radious = training_labels[
                                                    observations_within_radious == 1]
                                                dr[0, i_option] = 0
                                                far[0, i_option] = 0

                                                for i_point, rep_point_within in enumerate(
                                                        rep_points_within_radious.T):
                                                    rep_point_within = rep_point_within[..., None]
                                                    # Distance between this point and every representative point
                                                    dist_quasi_test = np.absolute(
                                                        np.sqrt(np.sum(
                                                            (representative_points - rep_point_within)**2,
                                                            axis=0)))
                                                    dist_quasi_test_cls = classes_within_radious[i_point]
                                                    # Unique distances, sorted in ascending order
                                                    min_uniq = np.sort(np.unique(dist_quasi_test))
                                                    total_nearest_neighbours = 0

                                                    # Grow the neighbourhood until more than knn
                                                    # nearest neighbours are found
                                                    for i_distance_within, distance_within in enumerate(min_uniq):
                                                        nearest_neighbours = dist_quasi_test <= min_uniq[i_distance_within]
                                                        total_nearest_neighbours = np.sum(nearest_neighbours)
                                                        if total_nearest_neighbours > self.knn:
                                                            break

                                                    # Number of nearest neighbours of each class
                                                    n_nearest_neighbours = [
                                                        np.sum(nearest_neighbours & (training_labels ^ 1)),
                                                        np.sum(nearest_neighbours & training_labels),
                                                    ]

                                                    # This point's class is in the majority among its
                                                    # neighbours (the point itself is not counted)
                                                    if dist_quasi_test_cls == selected_label and (
                                                            n_nearest_neighbours[selected_label] - 1
                                                    ) > n_nearest_neighbours[selected_label ^ 1]:
                                                        dr[0, i_option] += 1

                                                    # This point's class is in the minority among its
                                                    # neighbours, i.e. dissimilar points dominate
                                                    if dist_quasi_test_cls == (selected_label ^ 1) and (
                                                            n_nearest_neighbours[selected_label] >
                                                            n_nearest_neighbours[selected_label ^ 1] - 1):
                                                        far[0, i_option] += 1
                                                break

                            eval_criteria = [
                                # difference between the proportions of similar and
                                # dissimilar neighbouring points inside the radius
                                dr / n_total_cls[0] - far / n_total_cls[1],
                                dr / n_total_cls[1] - far / n_total_cls[0],
                            ]
                            # Keep the option with the shortest within-class distance
                            i_lowest_distance_within = np.argmin(option_distance_within)
                            TT_binary = unique_options[:, i_lowest_distance_within]
                            overall_feasibility[i_observation, i_beta] = (
                                option_feasabilities[i_lowest_distance_within])
                            overall_radious[i_observation, i_beta] = (
                                option_radiuses[i_lowest_distance_within])
                            overall_b_ratio[i_observation, i_beta] = eval_criteria[
                                training_labels[i_observation]][0, i_lowest_distance_within]

                            if overall_feasibility[i_observation, i_beta] == 1:
                                tb_temp[:, i_observation, i_beta] = TT_binary
                                tr_temp[:, i_observation, i_beta] = linprog_res_1.x

            # Infeasible (observation, beta) pairs must never be selected
            overall_b_ratio[overall_feasibility == 0] = -np.inf
            # For every observation, the beta index with the largest criterion value
            I1 = np.argmax(overall_b_ratio, axis=1)

            for j in range(n_observations):
                # Binary feature selection chosen by the best beta for this observation
                self.fstar[:, j] = tb_temp[:, j, I1[j]]
                # Corresponding fractional LP solution (indexed by j, not by the
                # stale loop variable i_observation left over from the loop above)
                self.fstar_lin[:, j] = tr_temp[:, j, I1[j]]
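The core of the example above is a small linear program: maximize the average weighted distance to the opposite class, subject to selecting at least 1 and at most alpha features (fractionally). Below is a minimal, self-contained sketch of that LP step with scipy.optimize.linprog; the feature-distance vector, alpha, and the 'highs' solver are illustrative assumptions, not values taken from the original class.

import numpy as np
from scipy.optimize import linprog

# Hypothetical per-feature average distances to the opposite class
# (average_feature_distances[nonmatching_label] in the example above).
rng = np.random.default_rng(0)
feature_distances = rng.random(10)
alpha = 4  # at most 4 features may be (fractionally) selected

m = feature_distances.shape[0]
# Constraints: sum(x) <= alpha and sum(x) >= 1 (written as -sum(x) <= -1)
A_ub = np.vstack((np.ones((1, m)), -np.ones((1, m))))
b_ub = np.array([alpha, -1.0])

# linprog minimizes, so negate the objective to maximize the weighted distance
res = linprog(-feature_distances, A_ub=A_ub, b_ub=b_ub,
              bounds=(0, 1), method="highs")

if res.success:
    epsilon_max = -res.fun  # best achievable opposite-class distance
    print("fractional selection:", np.round(res.x, 3))
    print("epsilon_max:", round(epsilon_max, 3))

The example then re-solves the LP with an added constraint -feature_distances @ x <= -epsilon for several values epsilon = beta * epsilon_max and randomly rounds the fractional solution, which is what the randomized-rounding loop above implements.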
Example #37
0
    def _validate_params(self, X, y):
        """Validate parameters as soon as :meth:`fit` is called.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training samples.
        y : array-like, shape (n_samples,)
            The corresponding training labels.
        Returns
        -------
        X : array, shape (n_samples, n_features)
            The validated training samples.
        y : array, shape (n_samples,)
            The validated training labels, encoded to be integers in
            the range(0, n_classes).
        init : string or numpy array of shape (n_features_a, n_features_b)
            The validated initialization of the linear transformation.
        Raises
        -------
        TypeError
            If a parameter is not an instance of the desired type.
        ValueError
            If a parameter's value violates its legal value range or if the
            combination of two or more given parameters is incompatible.
        """

        # Validate the inputs X and y, and converts y to numerical classes.
        X, y = check_X_y(X, y, ensure_min_samples=2)
        check_classification_targets(y)
        y = LabelEncoder().fit_transform(y)

        # Check the preferred dimensionality of the projected space
        if self.n_components is not None:
            check_scalar(self.n_components, 'n_components', int, 1)

            if self.n_components > X.shape[1]:
                raise ValueError('The preferred dimensionality of the '
                                 'projected space `n_components` ({}) cannot '
                                 'be greater than the given data '
                                 'dimensionality ({})!'.format(
                                     self.n_components, X.shape[1]))

        # If warm_start is enabled, check that the inputs are consistent
        check_scalar(self.warm_start, 'warm_start', bool)
        if self.warm_start and hasattr(self, 'components_'):
            if self.components_.shape[1] != X.shape[1]:
                raise ValueError(
                    'The new inputs dimensionality ({}) does not '
                    'match the input dimensionality of the '
                    'previously learned transformation ({}).'.format(
                        X.shape[1], self.components_.shape[1]))

        check_scalar(self.max_iter, 'max_iter', int, 1)
        check_scalar(self.tol, 'tol', float, 0.)
        check_scalar(self.verbose, 'verbose', int, 0)

        if self.callback is not None:
            if not callable(self.callback):
                raise ValueError('`callback` is not callable.')

        # Check how the linear transformation should be initialized
        init = self.init

        if isinstance(init, np.ndarray):
            init = check_array(init)

            # Assert that init.shape[1] = X.shape[1]
            if init.shape[1] != X.shape[1]:
                raise ValueError(
                    'The input dimensionality ({}) of the given '
                    'linear transformation `init` must match the '
                    'dimensionality of the given inputs `X` ({}).'.format(
                        init.shape[1], X.shape[1]))

            # Assert that init.shape[0] <= init.shape[1]
            if init.shape[0] > init.shape[1]:
                raise ValueError(
                    'The output dimensionality ({}) of the given '
                    'linear transformation `init` cannot be '
                    'greater than its input dimensionality ({}).'.format(
                        init.shape[0], init.shape[1]))

            if self.n_components is not None:
                # Assert that self.n_components = init.shape[0]
                if self.n_components != init.shape[0]:
                    raise ValueError('The preferred dimensionality of the '
                                     'projected space `n_components` ({}) does'
                                     ' not match the output dimensionality of '
                                     'the given linear transformation '
                                     '`init` ({})!'.format(
                                         self.n_components, init.shape[0]))
        elif init in ['auto', 'pca', 'lda', 'identity', 'random']:
            pass
        else:
            raise ValueError(
                "`init` must be 'auto', 'pca', 'lda', 'identity', 'random' "
                "or a numpy array of shape (n_components, n_features).")

        return X, y, init
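For context, here is a minimal sketch of how the validation helpers used above behave on their own; the toy arrays and the keyword min_val (the example above passes it positionally, as older scikit-learn versions allowed) are assumptions for illustration.

import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_X_y, check_scalar
from sklearn.utils.multiclass import check_classification_targets

X = np.array([[0.0, 1.0], [1.0, 0.0], [2.0, 2.0]])
y = np.array(["spam", "ham", "spam"])

X, y = check_X_y(X, y, ensure_min_samples=2)   # validates shapes and dtypes
check_classification_targets(y)                # rejects continuous targets
y = LabelEncoder().fit_transform(y)            # -> array([1, 0, 1])

check_scalar(3, "n_components", int, min_val=1)    # passes silently
try:
    check_scalar(0, "max_iter", int, min_val=1)    # raises ValueError
except ValueError as err:
    print(err)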
Example #38
0
    def partial_fit(self, X, y, classes=None):
        """Partial fitting."""
        if not hasattr(self, "_base_clf"):
            self.set_base_clf()
        X, y = check_X_y(X, y)

        if _check_partial_fit_first_call(self, classes):
            self.classes_ = classes
            self.ensemble_ = []

        self.X_, self.y_ = X, y

        train_X, train_y = X, y

        unique, counts = np.unique(train_y, return_counts=True)

        # SMOTE-style oversamplers need at least k_neighbors + 1 samples in a
        # class; shrink k when the first class holds fewer than 6 samples.
        k_neighbors = 5
        if counts[0] - 1 < 5:
            k_neighbors = counts[0] - 1

        if self.oversampler == "SMOTE" and k_neighbors > 0:
            smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
            train_X, train_y = smote.fit_resample(train_X, train_y)
        elif self.oversampler == "svmSMOTE" and k_neighbors > 0:
            try:
                svmSmote = SVMSMOTE(random_state=42, k_neighbors=k_neighbors)
                train_X, train_y = svmSmote.fit_resample(train_X, train_y)
            except ValueError:
                pass
        elif self.oversampler == "borderline1" and k_neighbors > 0:
            borderlineSmote1 = BorderlineSMOTE(random_state=42,
                                               k_neighbors=k_neighbors,
                                               kind='borderline-1')
            train_X, train_y = borderlineSmote1.fit_resample(train_X, train_y)
        elif self.oversampler == "borderline2" and k_neighbors > 0:
            borderlineSmote2 = BorderlineSMOTE(random_state=42,
                                               k_neighbors=k_neighbors,
                                               kind='borderline-2')
            train_X, train_y = borderlineSmote2.fit_resample(train_X, train_y)
        elif self.oversampler == "ADASYN" and k_neighbors > 0:
            try:
                adasyn = ADASYN(random_state=42, n_neighbors=k_neighbors)
                train_X, train_y = adasyn.fit_resample(train_X, train_y)
            except RuntimeError:
                pass
        elif self.oversampler == "SLS" and k_neighbors > 0:
            sls = Safe_Level_SMOTE(n_neighbors=k_neighbors)
            train_X, train_y = sls.sample(train_X, train_y)

        # Testing all models
        scores = np.array([ba(y, clf.predict(X)) for clf in self.ensemble_])

        # Pruning
        if len(self.ensemble_) > 1:
            alpha_good = scores > (0.5 + self.alpha)
            self.ensemble_ = [
                self.ensemble_[i] for i in np.where(alpha_good)[0]
            ]

        if len(self.ensemble_) > self.ensemble_size - 1:
            worst = np.argmin(scores)
            del self.ensemble_[worst]

        # Preparing and training new candidate
        self.ensemble_.append(base.clone(self._base_clf).fit(train_X, train_y))
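A rough sketch of the oversampling guard used above: SMOTE-like samplers need more minority samples than k_neighbors, so k is shrunk on small data chunks. The dataset below is invented, and the guard keys off the minority-class count rather than counts[0].

import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

# A small, imbalanced chunk: roughly 6 minority vs. 44 majority samples
X, y = make_classification(n_samples=50, weights=[0.88, 0.12], random_state=42)

_, counts = np.unique(y, return_counts=True)
k_neighbors = min(5, counts.min() - 1)  # SMOTE needs k_neighbors < minority size

if k_neighbors > 0:
    X_res, y_res = SMOTE(random_state=42, k_neighbors=k_neighbors).fit_resample(X, y)
    print(np.unique(y_res, return_counts=True))  # classes are now balanced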
Example #39
0
 def fit(self, X, y=None):
     self.wrong_attribute = 0
     X, y = check_X_y(X, y)
     return self
Example #40
0
 def fit(self, X, y):
     X, y = check_X_y(X, y)
     self.coef_ = np.ones(X.shape[1])
     return self
Example #41
0
 def fit(self, X, y):
     X, y = check_X_y(X, y, dtype=None)
     return self
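These one-line fit methods exist mainly to exercise check_X_y inside estimator checks. A quick sketch of what the helper returns under the default numeric conversion and under dtype=None; the arrays are toy values.

import numpy as np
from sklearn.utils import check_X_y

X = [[1, 2], [3, 4], [5, 6]]   # plain Python lists are accepted
y = [0, 1, 0]

X_num, y_num = check_X_y(X, y)
print(X_num.dtype)             # float64 -- numeric conversion by default

X_str = np.array([["a", "b"], ["c", "d"], ["e", "f"]])
X_kept, y_kept = check_X_y(X_str, y, dtype=None)
print(X_kept.dtype)            # <U1 -- the input dtype is preserved with dtype=None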
Example #42
0
    def fit(self, X, y, sample_weight=None):
        """Fit all base estimators.

        Parameters
        ----------
        X : 2d numpy array or sparse matrix of shape [n_samples, n_features]
            Training data
        y : 1d numpy array of shape [n_samples]
            Target values.
        sample_weight : 1d numpy array of shape [n_samples]
            Individual weights for each sample.
            Passed to fit method of each estimator.
            Note: will be split automatically for each fold.

        Returns
        -------
        self : object
            Fitted StackingTransformer instance.
        """
        # ---------------------------------------------------------------------
        # Validation
        # ---------------------------------------------------------------------

        # ---------------------------------------------------------------------
        # Check input data
        # ---------------------------------------------------------------------
        # Check X and y
        # ``check_estimator`` does not allow ``force_all_finite=False``
        X, y = check_X_y(X, y,
                         accept_sparse=['csr'],  # allow csr, cast all others to csr
                         force_all_finite=True,  # do not allow nan and inf
                         multi_output=False)  # allow only one column in y_train

        # Check X and sample_weight
        # X is already checked, but we need it to compare the length of sample_weight
        if sample_weight is not None:
            X, sample_weight = check_X_y(X, sample_weight,
                                         accept_sparse=['csr'],
                                         force_all_finite=True,
                                         multi_output=False)

        # ---------------------------------------------------------------------
        # Check ``estimators``
        # ---------------------------------------------------------------------
        if self.estimators is None:
            if self.regression:
                self.estimators_ = [('dumregr', DummyRegressor(strategy='constant', constant=5.5))]
            else:
                self.estimators_ = [('dumclf', DummyClassifier(strategy='constant', constant=1))]
            # warnings.warn('No estimators were specified. '
            #               'Using single dummy estimator as demo.', UserWarning)
        else:
            if 0 == len(self.estimators):
                raise ValueError('List of estimators is empty')
            else:
                # Clone
                self.estimators_ = [(name, clone(estim)) for name, estim in self.estimators]
                # Check names of estimators
                names, estims = zip(*self.estimators_)
                self._validate_names(names)
                # Check if all estimators support ``sample_weight``
                if sample_weight is not None:
                    for name, estim in self.estimators_:
                        if not has_fit_parameter(estim, 'sample_weight'):
                            raise ValueError('Underlying estimator [%s] does not '
                                             'support sample weights.' % name)

        # ---------------------------------------------------------------------
        # Check other StackingTransformer parameters
        # ---------------------------------------------------------------------

        # ``variant``
        if self.variant not in ['A', 'B']:
            raise ValueError('Parameter ``variant`` must be set properly')

        # ``n_folds``
        if not isinstance(self.n_folds, int):
            raise ValueError('Parameter ``n_folds`` must be integer')
        if not self.n_folds > 1:
            raise ValueError('Parameter ``n_folds`` must be not less than 2')

        # ``verbose``
        if self.verbose not in [0, 1, 2]:
            raise ValueError('Parameter ``verbose`` must be 0, 1, or 2')

        # Additional check for inapplicable parameter combinations
        # If ``regression=True`` we ignore classification-specific
        # parameters and issue user warning
        if self.regression and (self.needs_proba or self.stratified):
            warn_str = ('This is a regression task, hence classification-specific '
                        'parameters set to ``True`` were ignored:')
            if self.needs_proba:
                self.needs_proba = False
                warn_str += ' ``needs_proba``'
            if self.stratified:
                self.stratified = False
                warn_str += ' ``stratified``'
            warnings.warn(warn_str, UserWarning)

        # ---------------------------------------------------------------------
        # Compute attributes (basic properties of data, number of estimators, etc.)
        # ---------------------------------------------------------------------
        self.train_shape_ = X.shape
        self.n_train_examples_ = X.shape[0]
        self.n_features_ = X.shape[1]
        if not self.regression:
            self.n_classes_ = len(np.unique(y))
        else:
            self.n_classes_ = None
        self.n_estimators_ = len(self.estimators_)
        self.train_footprint_ = self._get_footprint(X)

        # ---------------------------------------------------------------------
        # Specify default metric
        # ---------------------------------------------------------------------
        if self.metric is None and self.regression:
            self.metric_ = mean_absolute_error
        elif self.metric is None and not self.regression:
            if self.needs_proba:
                self.metric_ = log_loss
            else:
                self.metric_ = accuracy_score
        else:
            self.metric_ = self.metric
        # ---------------------------------------------------------------------
        # Create report header strings and print report header
        # ---------------------------------------------------------------------
        if self.verbose > 0:
            if self.regression:
                task_str = 'task:         [regression]'
            else:
                task_str = 'task:         [classification]'
                n_classes_str = 'n_classes:    [%d]' % self.n_classes_
            metric_str = 'metric:       [%s]' % self.metric_.__name__
            variant_str = 'variant:      [%s]' % self.variant
            n_estimators_str = 'n_estimators: [%d]' % self.n_estimators_

            print(task_str)
            if not self.regression:
                print(n_classes_str)
            print(metric_str)
            print(variant_str)
            print(n_estimators_str + '\n')
        # ---------------------------------------------------------------------
        # Initialize cross-validation split
        # Stratified can be used only for classification
        # ---------------------------------------------------------------------
        if not self.regression and self.stratified:
            self.kf_ = StratifiedKFold(n_splits=self.n_folds,
                                       shuffle=self.shuffle,
                                       random_state=self.random_state)
            # Save target to be able to create stratified split in ``transform`` method.
            # This is more efficient than saving the split indices.
            self._y_ = y.copy()
        else:
            self.kf_ = KFold(n_splits=self.n_folds,
                             shuffle=self.shuffle,
                             random_state=self.random_state)
            self._y_ = None

        # ---------------------------------------------------------------------
        # Compute implicit number of classes to create appropriate empty arrays.
        # !!! Important. In order to unify array creation, the variable
        # ``n_classes_implicit_`` is always equal to 1, except in the case
        # where we perform a classification task with ``needs_proba=True``
        # ---------------------------------------------------------------------
        if not self.regression and self.needs_proba:
            self.n_classes_implicit_ = len(np.unique(y))
            self.action_ = 'predict_proba'
        else:
            self.n_classes_implicit_ = 1
            self.action_ = 'predict'

        # ---------------------------------------------------------------------
        # Create empty numpy array for train predictions (OOF)
        # !!! Important. We have to predict implicitly during fit
        # in order to compute CV scores, because the most reasonable
        # place to print out CV scores is the fit method
        # ---------------------------------------------------------------------
        S_train = np.zeros((X.shape[0], self.n_estimators_ * self.n_classes_implicit_))

        # ---------------------------------------------------------------------
        # Prepare (clone) estimators for fitting and storing
        # We need models_A_ for both variant A and variant B
        # We need models_B_ for variant B only (in variant A attribute models_B_ is None)
        # ---------------------------------------------------------------------

        self.models_A_ = []
        self.models_B_ = None

        for n, est in self.estimators_:
            self.models_A_.append([clone(est) for _ in range(self.n_folds)])

        if self.variant in ['B']:
            self.models_B_ = [clone(est) for n, est in self.estimators_]

        # ---------------------------------------------------------------------
        # Create empty numpy array to store scores for each estimator and each fold
        # ---------------------------------------------------------------------
        self.scores_ = np.zeros((self.n_estimators_, self.n_folds))

        # ---------------------------------------------------------------------
        # Create empty list to store name, mean and std for each estimator
        # ---------------------------------------------------------------------
        self.mean_std_ = []

        # ---------------------------------------------------------------------
        # MAIN FIT PROCEDURE
        # ---------------------------------------------------------------------
        # Loop across estimators
        # ---------------------------------------------------------------------
        for estimator_counter, (name, estimator) in enumerate(self.estimators_):
            if self.verbose > 0:
                estimator_str = 'estimator %2d: [%s: %s]' % (estimator_counter, name, estimator.__class__.__name__)
                print(estimator_str)

            # -----------------------------------------------------------------
            # Loop across folds
            # -----------------------------------------------------------------
            for fold_counter, (tr_index, te_index) in enumerate(self.kf_.split(X, y)):
                # Split data and target
                X_tr = X[tr_index]
                y_tr = y[tr_index]
                X_te = X[te_index]
                y_te = y[te_index]

                # Split sample weights accordingly (if passed)
                if sample_weight is not None:
                    sample_weight_tr = sample_weight[tr_index]
                    # sample_weight_te = sample_weight[te_index]
                else:
                    sample_weight_tr = None
                    # sample_weight_te = None

                # Fit estimator
                _ = self._estimator_action(self.models_A_[estimator_counter][fold_counter],
                                           X_tr, y_tr, None,
                                           sample_weight=sample_weight_tr,
                                           action='fit',
                                           transform=self.transform_target)

                # Predict out-of-fold part of train set
                if 'predict_proba' == self.action_:
                    col_slice_estimator = slice(estimator_counter * self.n_classes_implicit_,
                                                estimator_counter * self.n_classes_implicit_ + self.n_classes_implicit_)
                else:
                    col_slice_estimator = estimator_counter
                S_train[te_index, col_slice_estimator] = self._estimator_action(self.models_A_[estimator_counter][fold_counter],
                                                                                None, None,
                                                                                X_te, action=self.action_,
                                                                                transform=self.transform_pred)
                # Compute score
                score = self.metric_(y_te, S_train[te_index, col_slice_estimator])
                self.scores_[estimator_counter, fold_counter] = score

                # Print fold score
                if self.verbose > 1:
                    fold_str = '    fold %2d:  [%.8f]' % (fold_counter, score)
                    print(fold_str)

            # Compute mean and std and save in dict
            estim_name = self.estimators_[estimator_counter][0]
            estim_mean = np.mean(self.scores_[estimator_counter])
            estim_std = np.std(self.scores_[estimator_counter])
            self.mean_std_.append((estim_name, estim_mean, estim_std))

            if self.verbose > 1:
                sep_str = '    ----'
                print(sep_str)

            # Compute mean + std (and full)
            if self.verbose > 0:
                mean_str = '    MEAN:     [%.8f] + [%.8f]\n' % (estim_mean, estim_std)
                print(mean_str)

            # Fit estimator on full train set
            if self.variant in ['B']:
                if self.verbose > 0:
                    print('    Fitting on full train set...\n')
                _ = self._estimator_action(self.models_B_[estimator_counter],
                                           X, y, None,
                                           sample_weight=sample_weight,
                                           action='fit',
                                           transform=self.transform_target)

        # ---------------------------------------------------------------------
        # ---------------------------------------------------------------------

        # Return fitted StackingTransformer instance
        return self
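The heart of the fit above is building out-of-fold (OOF) predictions: every training row is predicted by a model that never saw it, so the resulting columns can feed a second-level model without leakage. A stripped-down sketch of that loop for a single estimator, leaving out the sample weights, probability columns, and variant-B refit; the estimator and data below are placeholders.

import numpy as np
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

X, y = make_classification(n_samples=200, random_state=0)
estimator = LogisticRegression(max_iter=1000)

kf = KFold(n_splits=4, shuffle=True, random_state=0)
S_train = np.zeros(X.shape[0])  # OOF predictions for this single estimator

for fold, (tr_index, te_index) in enumerate(kf.split(X, y)):
    model = clone(estimator).fit(X[tr_index], y[tr_index])
    S_train[te_index] = model.predict(X[te_index])  # out-of-fold predictions
    print("fold %d: [%.8f]" % (fold, accuracy_score(y[te_index], S_train[te_index])))

# S_train now serves as a meta-feature for the next stacking level.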
Example #43
0
    def fit(self, X_l, y_l, X_h, y_h):
        """Fit Gaussian process regression model.

        Parameters
        ----------
        X_l : array-like, shape = (n_l_samples, n_features)
            Training data

        y_l : array-like, shape = (n_l_samples, [n_output_dims])
            Target values

        X_h : array-like, shape = (n_h_samples, n_features)
            Training data

        y_h : array-like, shape = (n_h_samples, [n_output_dims])
            Target values

        Returns
        -------
        self : returns an instance of self.
        """
        if self.kernel is None:  # Use an RBF kernel as default
            self.kernel_l_ = C(1.0, constant_value_bounds="fixed") \
                * RBF(1.0, length_scale_bounds="fixed")
        else:
            self.kernel_l_ = clone(self.kernel)
        self.kernel_d_ = clone(self.kernel_l_)

        self.rng = check_random_state(self.random_state)

        X_l, y_l = check_X_y(X_l, y_l, multi_output=True, y_numeric=True)
        X_h, y_h = check_X_y(X_h, y_h, multi_output=True, y_numeric=True)
        self.n_l_ = len(X_l)

        # Normalize target value
        if self.normalize_y:
            self._y_l_train_mean = np.mean(y_l, axis=0)
            self._y_h_train_mean = np.mean(y_h, axis=0)
            # demean y
            y_l = y_l - self._y_l_train_mean
            y_h = y_h - self._y_h_train_mean
        else:
            self._y_l_train_mean = np.zeros(1)
            self._y_h_train_mean = np.zeros(1)

        self.X_train_ = np.vstack((X_l, X_h))
        self.y_train_ = np.hstack((y_l, y_h))

        theta_initial = np.hstack(
            (np.array([self.rho]), self.kernel_l_.theta, self.kernel_d_.theta))
        if self.optimizer is not None and self.kernel_l_.n_dims > 0:
            # Choose hyperparameters based on maximizing the log-marginal
            # likelihood (potentially starting from several initial values)
            def obj_func(theta, eval_gradient=self.eval_gradient):
                if eval_gradient:
                    # Gradient-based optimization is not supported yet; the two
                    # statements after the raise are unreachable until it is.
                    raise Warning(
                        "eval_gradient = True mode is not implemented yet!")
                    lml, grad = self.log_marginal_likelihood(
                        theta, eval_gradient=True)
                    return -lml, -grad
                else:
                    return -self.log_marginal_likelihood(theta)

            theta_bounds = np.r_[np.array(self.rho_bounds)[np.newaxis],
                                 self.kernel_l_.bounds, self.kernel_d_.bounds]
            # First optimize starting from theta specified in kernel
            optima = [(self._constrained_optimization(obj_func, theta_initial,
                                                      theta_bounds,
                                                      self.eval_gradient))]

            # Additional runs are performed from log-uniform chosen initial
            # theta
            if self.n_restarts_optimizer > 0:
                flag = np.isfinite(self.kernel_l_.bounds).all() and \
                    np.isfinite(self.kernel_d_.bounds).all() and \
                    np.isfinite(self.rho_bounds).all()
                if not flag:
                    raise ValueError(
                        "Multiple optimizer restarts (n_restarts_optimizer>0) "
                        "requires that all bounds are finite.")
                bounds = np.vstack(
                    (np.array(self.rho_bounds).reshape(1, -1),
                     self.kernel_l_.bounds, self.kernel_d_.bounds))
                for iteration in range(self.n_restarts_optimizer):
                    theta_initial = np.hstack(
                        (self.rng.uniform(bounds[0, 0], bounds[0, 1]),
                         np.exp(self.rng.uniform(bounds[1:, 0], bounds[1:,
                                                                       1]))))
                    optima.append(
                        self._constrained_optimization(obj_func, theta_initial,
                                                       bounds,
                                                       self.eval_gradient))
            # Select result from run with minimal (negative) log-marginal
            # likelihood
            lml_values = list(map(itemgetter(1), optima))
            best_hyperparams = optima[np.argmin(lml_values)][0]
            self.rho = best_hyperparams[0]
            self.kernel_l_.theta = best_hyperparams[1:1 +
                                                    len(self.kernel_l_.theta)]
            self.kernel_d_.theta = best_hyperparams[1 +
                                                    len(self.kernel_l_.theta):]
            self.log_marginal_likelihood_value_ = -np.min(lml_values)
        else:
            self.log_marginal_likelihood_value_ = \
                self.log_marginal_likelihood(theta_initial)

        # Precompute quantities required for predictions which are independent
        # of actual query points
        K_lf = self.kernel_l_(self.X_train_[:self.n_l_])
        K = np.vstack((
            np.hstack((
                self.kernel_l_(self.X_train_[:self.n_l_]),
                self.rho * self.kernel_l_(self.X_train_[:self.n_l_],
                                          self.X_train_[self.n_l_:]))),
            np.hstack((
                self.rho * self.kernel_l_(self.X_train_[self.n_l_:],
                                          self.X_train_[:self.n_l_]),
                self.rho**2 * self.kernel_l_(self.X_train_[self.n_l_:])
                + self.kernel_d_(self.X_train_[self.n_l_:])))))
        K_lf[np.diag_indices_from(K_lf)] += self.alpha
        K[np.diag_indices_from(K)] += self.alpha
        try:
            self.L_lf_ = cholesky(K_lf, lower=True)  # Line 2 (lf)
            self.L_ = cholesky(K, lower=True)  # Line 2
            # self.L_ changed, self._K_inv needs to be recomputed
            self._K_inv = None
            self._K_lf_inv = None
        except np.linalg.LinAlgError as exc:
            exc.args = ("The kernel is not returning a "
                        "positive definite matrix. Try gradually "
                        "increasing the 'alpha' parameter of your "
                        "GaussianProcessRegressor estimator.", ) + exc.args
            raise
        self.alpha_lf_ = cho_solve((self.L_lf_, True),
                                   self.y_train_[:self.n_l_])  # Line 3 (Lf)
        self.alpha_ = cho_solve((self.L_, True), self.y_train_)  # Line 3
        return self
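The joint covariance assembled just before the Cholesky factorisation has the usual two-fidelity (co-kriging) block structure: [[K_ll, rho*K_lh], [rho*K_hl, rho^2*K_hh + K_d]]. A small numeric sketch of that block matrix with RBF kernels; the sample sizes, length scales, and rho are made up.

import numpy as np
from sklearn.gaussian_process.kernels import RBF

rng = np.random.default_rng(0)
X_l = rng.random((5, 2))   # low-fidelity inputs
X_h = rng.random((3, 2))   # high-fidelity inputs
rho = 0.8
k_l, k_d = RBF(1.0), RBF(0.5)   # low-fidelity and discrepancy kernels

K = np.vstack((
    np.hstack((k_l(X_l),             rho * k_l(X_l, X_h))),
    np.hstack((rho * k_l(X_h, X_l),  rho**2 * k_l(X_h) + k_d(X_h))),
))
print(K.shape)              # (8, 8)
print(np.allclose(K, K.T))  # the joint covariance is symmetric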
Example #44
0
    def fit(self, X, y):
        """Fit Gaussian process regression model.
        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data
        y : array-like, shape = (n_samples, [n_output_dims])
            Target values
        Returns
        -------
        self : returns an instance of self.
        """
        if self.kernel is None:  # Use an RBF kernel as default
            self.kernel_ = C(1.0, constant_value_bounds="fixed") \
                           * RBF(1.0, length_scale_bounds="fixed")
        else:
            self.kernel_ = clone(self.kernel)

        self._rng = check_random_state(self.random_state)

        X, y = check_X_y(X, y, multi_output=True, y_numeric=True)

        # Normalize target value
        if self.normalize_y:
            self._y_train_mean = np.mean(y, axis=0)
            # demean y
            y = y - self._y_train_mean
        else:
            self._y_train_mean = np.zeros(1)

        if np.iterable(self.alpha) \
                and self.alpha.shape[0] != y.shape[0]:
            if self.alpha.shape[0] == 1:
                self.alpha = self.alpha[0]
            else:
                raise ValueError(
                    "alpha must be a scalar or an array"
                    " with same number of entries as y.(%d != %d)" %
                    (self.alpha.shape[0], y.shape[0]))

        self.X_train_ = np.copy(X) if self.copy_X_train else X
        self.y_train_ = np.copy(y) if self.copy_X_train else y

        if self.optimizer is not None and self.kernel_.n_dims > 0:
            # Choose hyperparameters based on maximizing the log-marginal
            # likelihood (potentially starting from several initial values)
            def obj_func(theta, eval_gradient=True):
                if eval_gradient:
                    lml, grad = self.log_marginal_likelihood(
                        theta, eval_gradient=True)
                    return -lml, -grad
                else:
                    return -self.log_marginal_likelihood(theta)

            # First optimize starting from theta specified in kernel
            optima = [(self._constrained_optimization(obj_func,
                                                      self.kernel_.theta,
                                                      self.kernel_.bounds))]

            # Additional runs are performed from log-uniform chosen initial
            # theta

            if self.n_restarts_optimizer > 0:
                if not np.isfinite(self.kernel_.bounds).all():
                    raise ValueError(
                        "Multiple optimizer restarts (n_restarts_optimizer>0) "
                        "requires that all bounds are finite.")
                bounds = self.kernel_.bounds
                for iteration in range(self.n_restarts_optimizer):
                    theta_initial = \
                        self._rng.uniform(bounds[:, 0], bounds[:, 1])
                    optima.append(
                        self._constrained_optimization(obj_func, theta_initial,
                                                       bounds))
            # Select result from run with minimal (negative) log-marginal
            # likelihood
            lml_values = list(map(itemgetter(1), optima))
            self.kernel_.theta = optima[np.argmin(lml_values)][0]
            self.log_marginal_likelihood_value_ = -np.min(lml_values)
        else:
            self.log_marginal_likelihood_value_ = \
                self.log_marginal_likelihood(self.kernel_.theta)

        # Precompute quantities required for predictions which are independent
        # of actual query points
        K = self.kernel_(self.X_train_)
        K[np.diag_indices_from(K)] += self.alpha

        try:
            self.L_ = cholesky(K, lower=True)  # Line 2
        except np.linalg.LinAlgError as exc:
            exc.args = ("The kernel, %s, is not returning a "
                        "positive definite matrix. Try gradually "
                        "increasing the 'alpha' parameter of your "
                        "GaussianProcessRegressor estimator." %
                        self.kernel_, ) + exc.args
            raise
        self.alpha_ = cho_solve((self.L_, True), self.y_train_)  # Line 3
        L_inv = solve_triangular(self.L_.T, np.eye(self.L_.shape[0]))
        self.K_inv = L_inv.dot(L_inv.T)
        return self
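Once L_ (the Cholesky factor of K + alpha*I) and alpha_ = K^-1 y are cached, prediction at new points is two short linear-algebra steps. A hedged sketch of that prediction path, mirroring the quantities computed at the end of fit; the kernel, noise level, and data are invented.

import numpy as np
from scipy.linalg import cho_solve, cholesky, solve_triangular
from sklearn.gaussian_process.kernels import RBF

rng = np.random.default_rng(1)
X_train = rng.random((20, 1))
y_train = np.sin(6 * X_train).ravel()
X_test = np.linspace(0, 1, 5)[:, None]

kernel, noise = RBF(0.3), 1e-10
K = kernel(X_train) + noise * np.eye(len(X_train))

L = cholesky(K, lower=True)            # "Line 2" in the snippet above
alpha = cho_solve((L, True), y_train)  # "Line 3": alpha = K^-1 y

K_star = kernel(X_test, X_train)
y_mean = K_star @ alpha                # predictive mean
V = solve_triangular(L, K_star.T, lower=True)
y_var = kernel.diag(X_test) - np.sum(V**2, axis=0)  # predictive variance
print(np.round(y_mean, 3), np.round(y_var, 5))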
Example #45
0
    def _fit(self, x, y):

        estimator = clone(self.estimator)

        def score_pri(slices, x0, y0):
            slices = list(slices)
            if len(slices) < 1:
                score0 = -np.inf
            else:
                slices = self.feature_unfold(slices)
                data_x0 = x0[:, slices]

                if hasattr(estimator, "best_score_"):
                    estimator.fit(data_x0, y0)
                    score0 = np.mean(estimator.best_score_)  # score_test

                else:
                    score0 = cross_val_score(estimator,
                                             data_x0,
                                             y0,
                                             cv=self.cv)
                    score0 = np.mean(score0)
                # print(slices, score0)
            return score0

        score = partial(score_pri, x0=x, y0=y)

        self.score_ = []
        x, y = check_X_y(x, y, "csc")
        # Note: ``all((a, b)) in [True, False]`` always evaluates to True, so this
        # assertion never fails; it only documents that ``check_must`` and
        # ``check_muti`` are expected to be set consistently.
        assert all((self.check_must, self.check_muti)) in [True, False]

        feature_list = list(range(x.shape[1]))
        fold_feature_list = self.feature_fold(feature_list)
        if self.check_must:
            fold_feature_list = [
                i for i in fold_feature_list if i not in self.check_must
            ]

        slice_all = [combinations(fold_feature_list, i) for i in self.n_select]
        slice_all = [
            list(self.feature_must_fold(_)) for i in slice_all for _ in i
        ]

        scores = parallelize(n_jobs=self.n_jobs,
                             func=score,
                             iterable=slice_all)

        feature_combination = [self.feature_unfold(_) for _ in slice_all]
        index = np.argmax(scores)
        select_feature = feature_combination[int(index)]
        su = np.zeros(x.shape[1], dtype=bool)  # np.bool was removed in NumPy 1.24+
        su[select_feature] = 1
        self.best_score_ = max(scores)
        self.score_ = scores
        self.support_ = su
        self.estimator_ = clone(self.estimator)
        if self.refit:
            refit_warning = ("The self.estimator_ :{} used all the X, y data; "
                             "please be careful with the later 'score' and "
                             "'predict'.".format(self.estimator_.__class__.__name__))
            if not hasattr(self.estimator_, 'best_score_'):
                warnings.warn(UserWarning(refit_warning))
            if hasattr(self.estimator_, 'best_score_') and hasattr(self.estimator_, "refit") \
                    and self.estimator_.refit is True:
                warnings.warn(UserWarning(refit_warning))
            self.estimator_.fit(x[:, select_feature], y)
        self.n_feature_ = len(select_feature)
        self.score_ex = list(zip(feature_combination, scores))
        self.scatter = list(zip([len(i) for i in slice_all], scores))
        self.score_ex.sort(key=lambda _: _[1], reverse=True)

        return self
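The search above scores every feature combination of the requested sizes with cross-validation and keeps the best-scoring subset. A compact sketch of the same exhaustive strategy without the feature-folding and parallel machinery; the estimator, dataset, and subset sizes are placeholders.

import numpy as np
from itertools import combinations
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
estimator = LogisticRegression(max_iter=1000)

# Every subset of 1 or 2 features
subsets = [c for k in (1, 2) for c in combinations(range(X.shape[1]), k)]
scores = [cross_val_score(estimator, X[:, list(s)], y, cv=5).mean() for s in subsets]

best = subsets[int(np.argmax(scores))]
support = np.zeros(X.shape[1], dtype=bool)
support[list(best)] = True
print("best subset:", best, "score: %.3f" % max(scores), "support:", support)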
Example #46
0
    def _scrub(self, X, y, sample_weight, output_weight, missing, **kwargs):
        '''
        Sanitize input data.
        '''
        # Check for sparseness
        if sparse.issparse(y):
            raise TypeError(
                'A sparse matrix was passed, but dense data '
                'is required. Use y.toarray() to convert to dense.')
        if sparse.issparse(sample_weight):
            raise TypeError('A sparse matrix was passed, but dense data '
                            'is required. Use sample_weight.toarray() '
                            'to convert to dense.')
        if sparse.issparse(output_weight):
            raise TypeError('A sparse matrix was passed, but dense data '
                            'is required. Use output_weight.toarray() '
                            'to convert to dense.')

        # Check whether X is the output of patsy.dmatrices
        if y is None and isinstance(X, tuple):
            y, X = X

        # Handle X separately
        X, missing = self._scrub_x(X, missing, **kwargs)

        # Convert y to internally used data type
        y = np.asarray(y, dtype=np.float64)
        assert_all_finite(y)

        if len(y.shape) == 1:
            y = y[:, np.newaxis]

        # Deal with sample_weight
        if sample_weight is None:
            sample_weight = np.ones(y.shape[0], dtype=y.dtype)
        else:
            sample_weight = np.asarray(sample_weight)
            assert_all_finite(sample_weight)
        # Deal with output_weight
        if output_weight is None:
            output_weight = np.ones(y.shape[1], dtype=y.dtype)
        else:
            output_weight = np.asarray(output_weight)
            assert_all_finite(output_weight)

        # Make sure dimensions match
        if y.shape[0] != X.shape[0]:
            raise ValueError('X and y do not have compatible dimensions.')
        if y.shape[0] != sample_weight.shape[0]:
            raise ValueError(
                'y and sample_weight do not have compatible dimensions.')
        if y.shape[1] != output_weight.shape[0]:
            raise ValueError(
                'y and output_weight do not have compatible dimensions.')

        # Make sure everything is finite (except X, which is allowed to have
        # missing values)
        assert_all_finite(missing)
        assert_all_finite(y)
        assert_all_finite(sample_weight)
        assert_all_finite(output_weight)

        # Make sure everything is consistent
        check_X_y(X,
                  y,
                  accept_sparse=None,
                  multi_output=True,
                  force_all_finite=False)

        return X, y, sample_weight, output_weight, missing
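The final consistency check above is the one place where missing values in X are tolerated. A tiny sketch of that call: multi_output=True accepts a 2-D y, while force_all_finite=False (renamed ensure_all_finite in newer scikit-learn releases) lets NaN through in X. The arrays are toy values.

import numpy as np
from sklearn.utils import check_X_y

X = np.array([[1.0, np.nan], [2.0, 3.0], [4.0, 5.0]])
y = np.array([[0.5, 1.0], [1.5, 2.0], [2.5, 3.0]])  # two output columns

X_checked, y_checked = check_X_y(X, y, multi_output=True,
                                 force_all_finite=False)
print(np.isnan(X_checked).any(), y_checked.shape)   # True (3, 2)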
Example #47
0
    def fit(self,
            X,
            y,
            estimator,
            cutting_rule,
            test_size=0.3,
            delta=0.5,
            feature_names=None,
            points=None):
        """

        :param X:
        :param y:
        :param estimator:
        :param cutting_rule:
        :param test_size:
        :param delta:
        :param feature_names:
        :param points:
        :return:
        """
        if self.verbose:
            print('Running basic MeLiF\nEnsemble of :{}'.format(self.ensemble))
        feature_names = generate_features(X, feature_names)
        check_shapes(X, y)
        # check_features(features_names)
        self.__X, self.__y = check_X_y(X,
                                       y,
                                       dtype=np.float64,
                                       order='C',
                                       accept_sparse='csr',
                                       accept_large_sparse=False)
        self.__feature_names = feature_names
        self.__filter_weights = np.ones(len(self.ensemble)) / len(
            self.ensemble)
        self.__points = points
        self.__estimator = estimator
        self.__cutting_rule = cutting_rule

        self.__delta = delta
        if self.verbose:
            print('Estimator: {}'.format(estimator))
            print("Optimizer greedy search, optimizing measure is {}".format(
                self.__score))
            time = dt.datetime.now()
            print("time:{}".format(time))

        check_cutting_rule(cutting_rule)
        self._train_x, self._test_x, self._train_y, self._test_y = train_test_split(
            self.__X, self.__y, test_size=test_size)
        nu = self.ensemble.score(self.__X, self.__y, self.__feature_names)

        if self.__points is None:
            self.__points = [self.__filter_weights]
            for i in range(len(self.ensemble)):
                a = np.zeros(len(self.ensemble))
                a[i] = 1
                self.__points.append(a)
        best_point = self.__points[0]
        mapping = dict(zip(range(len(nu.keys())), nu.keys()))
        n = dict(
            zip(nu.keys(),
                self.__measure(np.array(list(nu.values())), best_point)))

        self.selected_features = self.__cutting_rule(n)
        self.best_f = {i: nu[i] for i in self.selected_features}
        for k, v in mapping.items():
            nu[k] = nu.pop(v)
        self.__search(self.__points, nu)
        self.selected_features = [mapping[i] for i in self.selected_features]
        for k in list(self.best_f.keys()):
            self.best_f[mapping[k]] = self.best_f.pop(k)
        if self.verbose:
            print('Footer')
            print("Best point:{}".format(self.best_point))
            print("Best Score:{}".format(self.best_score))
            print('Top features:')
            for key, value in sorted(self.best_f.items(),
                                     key=lambda x: x[1],
                                     reverse=True):
                print("Feature: {}, value: {}".format(key, value))
Example #48
0
    def fit(self, X, y):
        """Prepare the DS model by setting the KNN algorithm and
        pre-processing the information required to apply the DS
        methods

        Parameters
        ----------
        X : array of shape = [n_samples, n_features]
            The input data.

        y : array of shape = [n_samples]
            class labels of each example in X.

        Returns
        -------
        self
        """
        self.random_state_ = check_random_state(self.random_state)

        # Check if the length of X and y are consistent.
        X, y = check_X_y(X, y)

        # Check if the pool of classifiers is None.
        # If yes, use a BaggingClassifier for the pool.
        if self.pool_classifiers is None:
            if len(X) < 2:
                raise ValueError('More than one sample is needed '
                                 'if the pool of classifiers is not informed.')

            # Split the dataset into training (for the base classifier) and
            # DSEL (for DS)
            X_train, X_dsel, y_train, y_dsel = train_test_split(
                X,
                y,
                test_size=self.DSEL_perc,
                random_state=self.random_state_)
            # The default pool in the original implementation is a
            # BaggingClassifier; a RandomForestClassifier is used here instead.
            self.pool_classifiers_ = RandomForestClassifier(n_estimators=200)
            self.pool_classifiers_.fit(X_train, y_train)
        else:
            self._check_base_classifier_fitted()
            self.pool_classifiers_ = self.pool_classifiers
            X_dsel = X
            y_dsel = y

        self.n_classifiers_ = len(self.pool_classifiers_)

        # check if the input parameters are correct. Raise an error if the
        # generated_pool is not fitted or k < 1
        self._validate_parameters()
        # Check label encoder on the pool of classifiers
        self.check_label_encoder()

        self._setup_label_encoder(y)  # None

        y_dsel = self.enc_.transform(y_dsel)

        self._set_dsel(X_dsel, y_dsel)
        # validate the value of k
        self._validate_k()
        self._set_region_of_competence_algorithm()
        self._fit_region_competence(X_dsel, y_dsel)

        # validate the IH
        if self.with_IH:
            self._validate_ih()
        return self
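When no pool is supplied, the method above splits the data into a training part for the base classifiers and a DSEL part for dynamic selection. A rough, self-contained sketch of that split-and-fit step using only scikit-learn (the 0.5 DSEL fraction is an assumption; the snippet takes it from `self.DSEL_perc`):

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = rng.rand(100, 4)
y = rng.randint(0, 2, size=100)

# Hold out a dynamic-selection (DSEL) set, then fit the pool on the rest.
X_train, X_dsel, y_train, y_dsel = train_test_split(
    X, y, test_size=0.5, random_state=0)
pool = RandomForestClassifier(n_estimators=200, random_state=0)
pool.fit(X_train, y_train)
print(len(pool), "classifiers in the pool")  # ensembles support len()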
Example #49
0
def fit_transform(self, X, y=None, **fit_params):
    """ A wrapper around the fit_transform function.

    Parameters
    ----------
    X : xarray DataArray, Dataset or other array-like
        The input samples.

    y : xarray DataArray, Dataset or other array-like
        The target values.

    Returns
    -------
    Xt : xarray DataArray, Dataset or other array-like
        The transformed output.
    """

    if self.estimator is None:
        raise ValueError("You must specify an estimator instance to wrap.")

    if is_target(y):
        y = y(X)

    if is_dataarray(X):

        self.type_ = "DataArray"
        self.estimator_ = clone(self.estimator)

        if self.reshapes is not None:
            data, dims = self._fit_transform(self.estimator_, X, y,
                                             **fit_params)
            coords = self._update_coords(X)
            return xr.DataArray(data, coords=coords, dims=dims)
        else:
            return xr.DataArray(
                self.estimator_.fit_transform(X.data, y, **fit_params),
                coords=X.coords,
                dims=X.dims,
            )

    elif is_dataset(X):

        self.type_ = "Dataset"
        self.estimator_dict_ = {v: clone(self.estimator) for v in X.data_vars}

        if self.reshapes is not None:
            data_vars = dict()
            for v, e in self.estimator_dict_.items():
                yp_v, dims = self._fit_transform(e, X[v], y, **fit_params)
                data_vars[v] = (dims, yp_v)
            coords = self._update_coords(X)
            return xr.Dataset(data_vars, coords=coords)
        else:
            data_vars = {
                v: (X[v].dims, e.fit_transform(X[v].data, y, **fit_params))
                for v, e in self.estimator_dict_.items()
            }
            return xr.Dataset(data_vars, coords=X.coords)

    else:

        self.type_ = "other"
        if y is None:
            X = check_array(X)
        else:
            X, y = check_X_y(X, y)

        self.estimator_ = clone(self.estimator)
        Xt = self.estimator_.fit_transform(X, y, **fit_params)

        for v in vars(self.estimator_):
            if v.endswith("_") and not v.startswith("_"):
                setattr(self, v, getattr(self.estimator_, v))

    return Xt
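A minimal usage sketch of the DataArray branch of the idea above: apply an ordinary scikit-learn transformer to the underlying ``.data`` and rebuild a DataArray with the original coords and dims. This assumes ``xarray`` is installed and uses ``StandardScaler`` purely as an example; it does not call the wrapper class itself.

import numpy as np
import xarray as xr
from sklearn.preprocessing import StandardScaler

da = xr.DataArray(np.random.rand(10, 3),
                  dims=("sample", "feature"),
                  coords={"sample": np.arange(10), "feature": ["a", "b", "c"]})

# Fit/transform on the raw ndarray, then wrap the result back into xarray.
scaled = StandardScaler().fit_transform(da.data)
da_scaled = xr.DataArray(scaled, coords=da.coords, dims=da.dims)
print(da_scaled.dims, da_scaled.shape)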
Example #50
0
    def fit(self,
            X,
            y,
            column_names=None,
            save_human_readable=False,
            remove_files=True):
        """A reference implementation of a fitting function for a classifier.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples. No numerical inputs are allowed.
        y : array-like, shape (n_samples,)
            The target values. An array of int.
        column_names : list, default=None
            A list containing the names to assign to columns in the dataset.
            They will be used when printing the human readable format of the
            rules.
        save_human_readable : bool, default=False
            If True, also write the extracted rule sets to disk in a human
            readable format.
        remove_files : bool, default=True
            Use this parameter to remove all the files generated by the original
            L3 implementation at training time.
        Returns
        -------
        self : object
            Returns self.
        """
        self._train_bin_path = join(self.l3_root, BIN_DIR, TRAIN_BIN)
        self._classify_bin_path = join(self.l3_root, BIN_DIR, CLASSIFY_BIN)
        self._logger = logging.getLogger(__name__)

        X = check_dtype(X)
        check_classification_targets(y)

        # Check that X and y have correct shape
        X, y = check_X_y(X, y, dtype=np.unicode_)

        # Check that y has correct values according to sklearn's policy
        unique = unique_labels(y)

        # Check that the rule sets modifier is valid
        valid_modifiers = ['standard', 'level1']
        if self.rule_sets_modifier not in valid_modifiers:
            raise NotImplementedError(
                f"The rule sets modifier specified is not" \
                f"supported. Use one of {valid_modifiers}."
            )

        # create mappings letting L3 binaries to work on strings only
        self._yorig_to_str, self._ystr_to_orig = build_y_mappings(unique)
        y = np.array([self._yorig_to_str[label] for label in y])

        # Store the classes seen during fit
        self.classes_ = [label for label in self._ystr_to_orig.keys()]

        # Define the label when no rule matches
        if self.assign_unlabeled == 'majority_class':
            self.unlabeled_class_ = _get_majority_class(y)
        else:
            self.unlabeled_class_ = self.assign_unlabeled

        self.X_ = X
        self.y_ = y

        token = secrets.token_hex(4)
        filestem = f"{token}"
        train_dir = token
        if exists(train_dir):
            raise RuntimeError(
                f"The training dir with token {token} already exists")
        else:
            mkdir(train_dir)
        old_dir = getcwd()
        chdir(train_dir)

        # Create column names if not provided
        if column_names is None:
            column_names = _create_column_names(X)
        check_column_names(X, column_names)
        self._column_id_to_name = build_columns_dictionary(column_names)

        # Dump X and y in a single .data (csv) file. "y" target labels are inserted as the last column
        X_todump = np.hstack([X, y.reshape(-1, 1)])
        _dump_array_to_file(X_todump, filestem, "data")

        # Invoke the training module of L3.
        if self.specialistic_rules:
            specialistic_flag = "0"
        else:
            specialistic_flag = "1"

        with open(f"{filestem}_stdout.txt", "w") as stdout:
            subprocess.run(
                [
                    self._train_bin_path,
                    filestem,  # training file filestem
                    f"{self.min_sup * 100:.2f}",  # min sup
                    f"{self.min_conf * 100:.2f}",  # min conf
                    "nofiltro",  # filtering measure for items (DEPRECATED)
                    "0",  # filtering threshold (DEPRECATED)
                    specialistic_flag,  # specialistic/general rules (TO VERIFY)
                    f"{self.max_length}",  # max length allowed for rules
                    self.l3_root  # L3 root containing the 'bin' directory with binaries
                ],
                stdout=stdout)

        # rename useful (lvl1) and sparse (lvl2) rule files
        rename(LEVEL1_FILE, f"{token}_{LEVEL1_FILE}")
        rename(LEVEL2_FILE, f"{token}_{LEVEL2_FILE}")

        # read the mappings of classification labels
        self._class_dict = build_class_dict(filestem)

        # read the mappings item->"column_name","value"
        self._item_id_to_item, self._item_to_item_id = build_item_dictionaries(
            filestem)
        self.n_items_used_ = len(self._item_id_to_item)

        # apply the rule set modifier
        if self.rule_sets_modifier == 'level1':
            with open(f"{token}_{LEVEL2_FILE}", "w") as fp:
                self._logger.debug("Empty the level 2 rule set.")

        # parse the two rule sets and store them
        self.lvl1_rules_ = parse_raw_rules(f"{token}_{LEVEL1_FILE}")
        self.lvl2_rules_ = parse_raw_rules(f"{token}_{LEVEL2_FILE}")
        self.n_lvl1_rules_ = len(self.lvl1_rules_)
        self.n_lvl2_rules_ = len(self.lvl2_rules_)

        # translate the model to human readable format
        if save_human_readable:
            write_human_readable(f"{token}_{LEVEL1_FILE_READABLE}",
                                 self.lvl1_rules_, self._item_id_to_item,
                                 self._column_id_to_name, self._class_dict)
            write_human_readable(f"{token}_{LEVEL2_FILE_READABLE}",
                                 self.lvl2_rules_, self._item_id_to_item,
                                 self._column_id_to_name, self._class_dict)

        if remove_files:
            _remove_fit_files(token)

        chdir(old_dir)
        if remove_files and not save_human_readable:
            shutil.rmtree(train_dir)
        self.current_token_ = token  # keep track of the latest token generated by the fit method

        return self
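The L3 binaries themselves cannot be reproduced here, but the label-mapping and unlabeled-default logic at the top of the method can be sketched on its own. Names such as ``build_y_mappings`` below are hypothetical stand-ins for the helpers used above:

import numpy as np
from collections import Counter

def build_y_mappings(unique_labels_arr):
    """Map original labels to string tokens and back (illustrative only)."""
    y_to_str = {label: str(i) for i, label in enumerate(unique_labels_arr)}
    str_to_y = {s: label for label, s in y_to_str.items()}
    return y_to_str, str_to_y

y = np.array(["cat", "dog", "dog", "cat", "dog"])
y_to_str, str_to_y = build_y_mappings(np.unique(y))
y_str = np.array([y_to_str[label] for label in y])

# 'majority_class' default for samples matched by no rule
majority = Counter(y_str).most_common(1)[0][0]
print(y_str, "majority:", str_to_y[majority])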
Example #51
0
    def fit(self, X, y, sample_weight=None, check_input=True,
            X_idx_sorted=None):

        random_state = check_random_state(self.random_state)
        if check_input:
            X, y = check_X_y(X, y, dtype=DTYPE, multi_output=False)

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)
        expanded_class_weight = None

        if y.ndim == 1:
            # np.reshape is used instead of [:, np.newaxis] because it
            # preserves the data contiguity of the input.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            check_classification_targets(y)
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            if self.class_weight is not None:
                y_original = np.copy(y)

            y_encoded = np.zeros(y.shape, dtype=int)  # np.int was removed from NumPy
            for k in range(self.n_outputs_):
                classes_k, y_encoded[:, k] = np.unique(y[:, k],
                                                       return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])
            y = y_encoded

            if self.class_weight is not None:
                expanded_class_weight = compute_sample_weight(
                    self.class_weight, y_original)

        else:
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = ((2 ** 31) - 1 if self.max_depth is None
                     else self.max_depth)

        if isinstance(self.min_samples_split, (numbers.Integral, np.integer)):
            if not 2 <= self.min_samples_split:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the integer %s"
                                 % self.min_samples_split)
            min_samples_split = self.min_samples_split
        else:  # float
            if not 0. < self.min_samples_split <= 1.:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the float %s"
                                 % self.min_samples_split)
            min_samples_split = int(ceil(self.min_samples_split * n_samples))
            min_samples_split = max(2, min_samples_split)

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")

        if sample_weight is not None:
            if (getattr(sample_weight, "dtype", None) != DOUBLE or
                    not sample_weight.flags.contiguous):
                sample_weight = np.ascontiguousarray(
                    sample_weight, dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more "
                                 "than one dimension: %d" %
                                 len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError("Number of weights=%d does not match "
                                 "number of samples=%d" %
                                 (len(sample_weight), n_samples))

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
                                                         n_samples)
        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion,
                                                random_state)

        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)

        builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                        max_depth)
        builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)

        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self
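The per-output class encoding in the classification branch above can be illustrated in isolation; ``np.unique(..., return_inverse=True)`` both collects the class labels and re-encodes y as integer indices:

import numpy as np

y = np.array([["b"], ["a"], ["b"], ["c"]])  # one output, string classes
n_outputs = y.shape[1]

classes_, n_classes_ = [], []
y_encoded = np.zeros(y.shape, dtype=int)
for k in range(n_outputs):
    classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True)
    classes_.append(classes_k)
    n_classes_.append(classes_k.shape[0])

print(classes_)           # [array(['a', 'b', 'c'], dtype='<U1')]
print(y_encoded.ravel())  # [1 0 1 2]
print(n_classes_)         # [3]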
Example #52
0
def check_input(input_data,
                y=None,
                preprocessor=None,
                type_of_inputs='classic',
                tuple_size=None,
                accept_sparse=False,
                dtype='numeric',
                order=None,
                copy=False,
                force_all_finite=True,
                multi_output=False,
                ensure_min_samples=1,
                ensure_min_features=1,
                y_numeric=False,
                estimator=None):
    """Checks that the input format is valid, and converts it if specified
  (this is the equivalent of scikit-learn's `check_array` or `check_X_y`).
  All arguments following tuple_size are scikit-learn's `check_X_y`
  arguments that will be enforced on the data and labels array. If
  indicators are given as an input data array, the returned data array
  will be the formed points/tuples, using the given preprocessor.

  Parameters
  ----------
  input_data : array-like
    The input data array to check.

  y : array-like
    The input labels array to check.

  preprocessor : callable (default=`None`)
    The preprocessor to use. If None, no preprocessor is used.

  type_of_inputs : `str` {'classic', 'tuples'}
    The type of inputs to check. If 'classic', the input should be
    a 2D array-like of points or a 1D array like of indicators of points. If
    'tuples', the input should be a 3D array-like of tuples or a 2D
    array-like of indicators of tuples.

  accept_sparse : `bool`
    Set to true to allow sparse inputs (only works for sparse inputs with
    dim < 3).

  tuple_size : int
    The number of elements in a tuple (e.g. 2 for pairs).

  dtype : string, type, list of types or None (default='numeric')
    Data type of result. If None, the dtype of the input is preserved.
    If 'numeric', dtype is preserved unless array.dtype is object.
    If dtype is a list of types, conversion on the first type is only
    performed if the dtype of the input is not in the list.

  order : 'F', 'C' or None (default=`None`)
    Whether an array will be forced to be fortran or c-style.

  copy : boolean (default=False)
    Whether a forced copy will be triggered. If copy=False, a copy might
    be triggered by a conversion.

  force_all_finite : boolean or 'allow-nan', (default=True)
    Whether to raise an error on np.inf and np.nan in X. This parameter
    does not influence whether y can have np.inf or np.nan values.
    The possibilities are:
     - True: Force all values of X to be finite.
     - False: accept both np.inf and np.nan in X.
     - 'allow-nan': accept only np.nan values in X. Values cannot be
       infinite.

  ensure_min_samples : int (default=1)
    Make sure that X has a minimum number of samples in its first
    axis (rows for a 2D array).

  ensure_min_features : int (default=1)
    Make sure that the 2D array has some minimum number of features
    (columns). The default value of 1 rejects empty datasets.
    This check is only enforced when X has effectively 2 dimensions or
    is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
    this check.

  estimator : str or estimator instance (default=`None`)
    If passed, include the name of the estimator in warning messages.

  Returns
  -------
  X : `numpy.ndarray`
    The checked input data array.

  y: `numpy.ndarray` (optional)
    The checked input labels array.
  """

    context = make_context(estimator)

    args_for_sk_checks = dict(accept_sparse=accept_sparse,
                              dtype=dtype,
                              order=order,
                              copy=copy,
                              force_all_finite=force_all_finite,
                              ensure_min_samples=ensure_min_samples,
                              ensure_min_features=ensure_min_features,
                              estimator=estimator)

    # We need to convert input_data into a numpy.ndarray if possible, before
    # any further checks or conversions, and deal with y if needed. Therefore
    # we use check_array/check_X_y with fixed permissive arguments.
    if y is None:
        input_data = check_array(input_data,
                                 ensure_2d=False,
                                 allow_nd=True,
                                 copy=False,
                                 force_all_finite=False,
                                 accept_sparse=True,
                                 dtype=None,
                                 ensure_min_features=0,
                                 ensure_min_samples=0)
    else:
        input_data, y = check_X_y(input_data,
                                  y,
                                  ensure_2d=False,
                                  allow_nd=True,
                                  copy=False,
                                  force_all_finite=False,
                                  accept_sparse=True,
                                  dtype=None,
                                  ensure_min_features=0,
                                  ensure_min_samples=0,
                                  multi_output=multi_output,
                                  y_numeric=y_numeric)

    if type_of_inputs == 'classic':
        input_data = check_input_classic(input_data, context, preprocessor,
                                         args_for_sk_checks)

    elif type_of_inputs == 'tuples':
        input_data = check_input_tuples(input_data, context, preprocessor,
                                        args_for_sk_checks, tuple_size)

        # if we have y and the input data are pairs, we need to ensure
        # the labels are in [-1, 1]:
        if y is not None and input_data.shape[1] == 2:
            check_y_valid_values_for_pairs(y)

    else:
        raise ValueError(
            "Unknown value {} for type_of_inputs. Valid values are "
            "'classic' or 'tuples'.".format(type_of_inputs))

    return input_data if y is None else (input_data, y)
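For the 'tuples' case, the permissive ``check_X_y`` call above relies on ``ensure_2d=False, allow_nd=True`` so that a 3D array of pairs passes through unchanged. A small sketch of that call alone, with pair labels in {-1, 1} as the final check expects:

import numpy as np
from sklearn.utils import check_X_y

pairs = np.random.rand(6, 2, 4)          # 6 pairs of 4-dimensional points
y = np.array([1, -1, 1, 1, -1, -1])      # similarity labels, one per pair

pairs_checked, y_checked = check_X_y(pairs, y,
                                     ensure_2d=False,
                                     allow_nd=True,
                                     dtype=None)
print(pairs_checked.shape, y_checked.shape)  # (6, 2, 4) (6,)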
Example #53
0
 def fit(self, X, y=None):
     self._good_attribute = 1
     X, y = check_X_y(X, y)
     return self
Example #54
0
    def fit(self, X, y):
        """Fit a semi-supervised label propagation model based

        All the input data is provided matrix X (labeled and unlabeled)
        and corresponding label matrix y with a dedicated marker value for
        unlabeled samples.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            A matrix of shape (n_samples, n_samples) will be built from this data.

        y : array_like, shape = [n_samples]
            n_labeled_samples (unlabeled points are marked as -1)
            All unlabeled samples will be transductively assigned labels

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y)
        self.X_ = X
        check_classification_targets(y)

        # actual graph construction (implementations should override this)
        graph_matrix = self._build_graph()

        # label construction
        # construct a categorical distribution for classification only
        classes = np.unique(y)
        classes = (classes[classes != -1])
        self.classes_ = classes

        n_samples, n_classes = len(y), len(classes)

        alpha = self.alpha
        if self._variant == 'spreading' and \
                (alpha is None or alpha <= 0.0 or alpha >= 1.0):
            raise ValueError('alpha=%s is invalid: it must be inside '
                             'the open interval (0, 1)' % alpha)
        y = np.asarray(y)
        unlabeled = y == -1

        # initialize distributions
        self.label_distributions_ = np.zeros((n_samples, n_classes))
        for label in classes:
            self.label_distributions_[y == label, classes == label] = 1

        y_static = np.copy(self.label_distributions_)
        if self._variant == 'propagation':
            # LabelPropagation
            y_static[unlabeled] = 0
        else:
            # LabelSpreading
            y_static *= 1 - alpha

        l_previous = np.zeros((self.X_.shape[0], n_classes))

        unlabeled = unlabeled[:, np.newaxis]
        if sparse.isspmatrix(graph_matrix):
            graph_matrix = graph_matrix.tocsr()

        for self.n_iter_ in range(self.max_iter):
            if np.abs(self.label_distributions_ - l_previous).sum() < self.tol:
                break

            l_previous = self.label_distributions_
            self.label_distributions_ = safe_sparse_dot(
                graph_matrix, self.label_distributions_)

            if self._variant == 'propagation':
                normalizer = np.sum(
                    self.label_distributions_, axis=1)[:, np.newaxis]
                self.label_distributions_ /= normalizer
                self.label_distributions_ = np.where(unlabeled,
                                                     self.label_distributions_,
                                                     y_static)
            else:
                # clamp
                self.label_distributions_ = np.multiply(
                    alpha, self.label_distributions_) + y_static
        else:
            warnings.warn(
                'max_iter=%d was reached without convergence.' % self.max_iter,
                category=ConvergenceWarning
            )
            self.n_iter_ += 1

        normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
        self.label_distributions_ /= normalizer

        # set the transduction item
        transduction = self.classes_[np.argmax(self.label_distributions_,
                                               axis=1)]
        self.transduction_ = transduction.ravel()
        return self
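This mirrors the convention used by scikit-learn's built-in semi-supervised estimators: unlabeled samples are marked with -1 in y and receive labels transductively. A short usage sketch with the stock ``LabelPropagation`` on synthetic data, purely for illustration:

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.semi_supervised import LabelPropagation

X, y_true = make_blobs(n_samples=60, centers=2, random_state=0)
y = y_true.copy()
y[10:] = -1                      # keep only the first 10 labels

model = LabelPropagation().fit(X, y)
print(model.classes_)            # [0 1]
print(model.transduction_[:15])  # labels inferred for (un)labeled points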
Example #55
0
    def fit(self, X, y, **kwargs):

        check_X_y(X, y)
        self.random_state_ = check_random_state(self.random_state)

        if 'warm_start' in kwargs and kwargs['warm_start']:
            check_is_fitted(self, ['chis_', 'estimated_membership_'])
            self.solve_strategy[1]['init'] = self.chis_

        self.chis_ = solve_optimization(X, y, self.c, self.k,
                                        self.solve_strategy[0],
                                        self.solve_strategy[1])

        if type(self.k) is PrecomputedKernel:
            self.gram_ = self.k.kernel_computations
        else:
            self.gram_ = np.array([[self.k.compute(x1, x2) for x1 in X]
                                   for x2 in X])
        self.fixed_term_ = np.array(self.chis_).dot(self.gram_.dot(self.chis_))

        def estimated_square_distance_from_center(x_new):
            ret = self.k.compute(x_new, x_new) \
                  - 2 * np.array([self.k.compute(x_i, x_new)
                                  for x_i in X]).dot(self.chis_) \
                  + self.fixed_term_
            return ret
        self.estimated_square_distance_from_center_ = \
                estimated_square_distance_from_center

        self.chi_SV_index_ = [
            i for i, (chi, mu) in enumerate(zip(self.chis_, y))
            if -self.c * (1 - mu) < chi < self.c * mu
        ]

        chi_SV_square_distance = map(estimated_square_distance_from_center,
                                     X[self.chi_SV_index_])
        chi_SV_square_distance = list(chi_SV_square_distance)

        if len(chi_SV_square_distance) == 0:
            self.estimated_membership_ = None
            self.train_error_ = np.inf
            self.chis_ = None
            self.profile = None
            warn('No support vectors found')
            return self
            #raise ValueError('No support vectors found')

        self.SV_square_distance_ = np.mean(chi_SV_square_distance)
        num_samples = 500

        if self.sample_generator is None:
            self.sample_generator = lambda x: x

        #sample = map(self.sample_generator,
        #             self.random_state_.random_sample(num_samples))
        sample = self.sample_generator(num_samples)

        fuzzifier = self.fuzzifier(X, y)
        result = fuzzifier.get_fuzzified_membership(
            self.SV_square_distance_,
            sample,
            self.estimated_square_distance_from_center_,
            return_profile=self.return_profile)

        if self.return_profile:
            self.estimated_membership_, self.profile_ = result
        else:
            self.estimated_membership_ = result[0]

        self.train_error_ = np.mean([(self.estimated_membership_(x) - mu)**2
                                     for x, mu in zip(X, y)])

        return self
Example #56
0
 def fit(self, X, y):
     X, y = check_X_y(X, y)
     return self
Example #57
0
 def fit(self, X, y):
     X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
     if sp.issparse(X):
         raise ValueError("Nonsensical Error")
     return self
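A quick stand-alone illustration of the behaviour this toy estimator relies on: with ``accept_sparse=['csr', 'csc']`` a SciPy sparse matrix is passed through (converted to one of the listed formats if necessary) rather than densified, so ``sp.issparse`` still reports it as sparse:

import numpy as np
import scipy.sparse as sp
from sklearn.utils import check_X_y

X = sp.random(5, 3, density=0.4, format="coo", random_state=0)
y = np.arange(5)

X_checked, y_checked = check_X_y(X, y, accept_sparse=["csr", "csc"])
print(sp.issparse(X_checked), X_checked.format)  # True csr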
Example #58
0
    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.classes_ = unique_labels(y)
        self.classifiers_pool_ = self.estimators

        return self
Example #59
0
    def partial_fit(self, X, y, classes=None):
        """
        Incremental building of Mondrian Forests.

        Parameters
        ----------
        X : array_like, shape = [n_samples, n_features]
            The input samples. Internally, it will be converted to
            ``dtype=np.float32``

        y: array_like, shape = [n_samples]
            Input targets.

        classes: array_like, shape = [n_classes]
            Ignored for a regression problem. For a classification
            problem, if not provided this is inferred from y.
            This is taken into account for only the first call to
            partial_fit and ignored for subsequent calls.

        Returns
        -------
        self: instance of MondrianForest
        """
        X, y = check_X_y(X, y, dtype=np.float32, multi_output=False)
        random_state = check_random_state(self.random_state)

        # Wipe out estimators if partial_fit is called after fit.
        first_call = not hasattr(self, "first_")
        if first_call:
            self.first_ = True

        if isinstance(self, ClassifierMixin):
            if first_call:
                if classes is None:
                    classes = LabelEncoder().fit(y).classes_

                self.classes_ = classes
                self.n_classes_ = len(self.classes_)

        # Remap output
        n_samples, self.n_features_ = X.shape

        y = np.atleast_1d(y)
        if y.ndim == 2 and y.shape[1] == 1:
            warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning,
                stacklevel=2)

        self.n_outputs_ = 1

        # Initialize estimators at first call to partial_fit.
        if first_call:
            # Check estimators
            self._validate_estimator()
            self.estimators_ = []

            for _ in range(self.n_estimators):
                tree = self._make_estimator(append=False,
                                            random_state=random_state)
                self.estimators_.append(tree)

        # XXX: Switch to threading backend when GIL is released.
        if isinstance(self, ClassifierMixin):
            self.estimators_ = Parallel(
                n_jobs=self.n_jobs,
                backend="multiprocessing",
                verbose=self.verbose)(
                    delayed(_single_tree_pfit)(t, X, y, classes)
                    for t in self.estimators_)
        else:
            self.estimators_ = Parallel(n_jobs=self.n_jobs,
                                        backend="multiprocessing",
                                        verbose=self.verbose)(
                                            delayed(_single_tree_pfit)(t, X, y)
                                            for t in self.estimators_)

        return self
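The incremental interface above follows scikit-learn's general ``partial_fit`` contract: the full set of classes must be known at (or inferable from) the first call, and later mini-batches simply update the model. A generic sketch of that contract using a stock incremental estimator (``SGDClassifier`` stands in here; it is not a Mondrian forest):

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
classes = np.array([0, 1])
clf = SGDClassifier(random_state=0)

for batch in range(5):
    X_batch = rng.rand(20, 3)
    y_batch = rng.randint(0, 2, size=20)
    if batch == 0:
        # classes must be declared on the first partial_fit call
        clf.partial_fit(X_batch, y_batch, classes=classes)
    else:
        clf.partial_fit(X_batch, y_batch)

print(clf.predict(rng.rand(3, 3)))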
Example #60
0
def _validate_and_reformat_input(X,
                                 y=None,
                                 expect_y=True,
                                 enforce_binary_labels=False,
                                 **kwargs):
    """Validate input data and return the data in an appropriate format.

    :param X: The feature matrix
    :type X: numpy.ndarray or pandas.DataFrame
    :param y: The label vector
    :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list
    :param expect_y: if True y needs to be provided, otherwise ignores the argument; default True
    :type expect_y: bool
    :param enforce_binary_labels: if True raise exception if there are more than two distinct
        values in the `y` data; default False
    :type enforce_binary_labels: bool
    :return: the validated and reformatted X, y, and sensitive_features; note that certain
        estimators rely on metadata encoded in X which may be stripped during the reformatting
        process, so mitigation methods should ideally use the input X instead of the returned X
        for training estimators and leave potential reformatting of X to the estimator.
    :rtype: (pandas.DataFrame, pandas.Series, pandas.Series)
    """
    if y is not None:
        # calling check_X_y with a 2-dimensional y causes a warning, so ensure it is 1-dimensional
        if isinstance(y, np.ndarray) and len(y.shape) == 2 and y.shape[1] == 1:
            y = y.squeeze()
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            y = y.to_numpy().squeeze()

        X, y = check_X_y(X, y)
        y = check_array(y, ensure_2d=False, dtype='numeric')
        if enforce_binary_labels and not set(np.unique(y)).issubset({0, 1}):
            raise ValueError(_LABELS_NOT_0_1_ERROR_MESSAGE)
    elif expect_y:
        raise ValueError(_MESSAGE_Y_NONE)
    else:
        X = check_array(X)

    sensitive_features = kwargs.get(_KW_SENSITIVE_FEATURES)
    if sensitive_features is None:
        raise ValueError(_MESSAGE_SENSITIVE_FEATURES_NONE)

    check_consistent_length(X, sensitive_features)
    sensitive_features = check_array(sensitive_features,
                                     ensure_2d=False,
                                     dtype=None)

    # compress multiple sensitive features into a single column
    if len(sensitive_features.shape) > 1 and sensitive_features.shape[1] > 1:
        sensitive_features = \
            _compress_multiple_sensitive_features_into_single_column(sensitive_features)

    # If we don't have a y, then need to fiddle with return type to
    # avoid a warning from pandas
    if y is not None:
        result_y = pd.Series(y)
    else:
        result_y = pd.Series(dtype="float64")

    return pd.DataFrame(X), result_y, pd.Series(sensitive_features.squeeze())
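The y-handling at the top of this helper can be exercised on its own: a column-vector y is squeezed to 1D before ``check_X_y`` (to avoid the warning mentioned above), and the optional binary-label check reduces to a set inclusion test. A minimal sketch of just that part; it does not call the helper itself and omits the sensitive-features handling:

import numpy as np
from sklearn.utils import check_X_y, check_array

X = np.random.rand(8, 2)
y = np.array([[0], [1], [1], [0], [1], [0], [0], [1]])  # 2D column vector

if isinstance(y, np.ndarray) and y.ndim == 2 and y.shape[1] == 1:
    y = y.squeeze()

X, y = check_X_y(X, y)
y = check_array(y, ensure_2d=False, dtype='numeric')

if not set(np.unique(y)).issubset({0, 1}):
    raise ValueError("y must contain only 0/1 labels")
print(y.shape)  # (8,)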