def self_tune(self, X, y, verbose=False):
    # fix random seed for reproducibility
    seed = 5
    np.random.seed(seed)

    # define k-fold cross-validation test harness
    kfold = StratifiedKFold(n_splits=self.tuning_csp_num_folds,
                            shuffle=True, random_state=seed)

    # init scores: one accumulator per candidate number of filters
    cvscores = {}
    for n in range(2, self.num_spatial_filters + 1):
        cvscores[n] = 0

    for i, (train, test) in enumerate(kfold.split(X, y)):
        # calculate CSP spatial filters
        csp = CSP(n_components=self.num_spatial_filters)
        csp.fit(X[train], y[train])

        # try all filter counts, from the given num down to 2
        # (1 is too often found to be overfitting)
        for num_filters_to_try in range(2, self.num_spatial_filters):
            csp.n_components = num_filters_to_try

            # apply CSP filters to train data
            tuning_train_LDA_features = np.nan_to_num(csp.transform(X[train]))
            check_X_y(tuning_train_LDA_features, y[train])

            # apply CSP filters to test data
            tuning_test_LDA_features = np.nan_to_num(csp.transform(X[test]))
            check_X_y(tuning_test_LDA_features, y[test])

            # train LDA and score it on the held-out fold
            lda = LinearDiscriminantAnalysis()
            prediction_score = lda.fit(tuning_train_LDA_features, y[train]).score(
                tuning_test_LDA_features, y[test])

            cvscores[num_filters_to_try] += prediction_score
            if verbose:
                print("prediction score", prediction_score,
                      "with", num_filters_to_try, "spatial filters")

    n_folds = i + 1
    best_num = max(cvscores, key=cvscores.get)
    best_score = cvscores[best_num] / n_folds
    if verbose:
        print("best num filters:", best_num, "(average accuracy", best_score, ")")
        print("average scores per filter num:")
        for k in cvscores:
            print(k, ":", cvscores[k] / n_folds)
    return [best_num, best_score]
def fit(self, X, y):
    """
    X: data matrix, (n x d)
    y: scalar labels, (n)
    """
    X, labels = check_X_y(X, y)
    n, d = X.shape
    num_dims = self.num_dims
    if num_dims is None:
        num_dims = d

    # Initialize A to a scaling matrix
    A = np.zeros((num_dims, d))
    np.fill_diagonal(A, 1. / (np.maximum(X.max(axis=0) - X.min(axis=0), EPS)))

    # Run NCA
    dX = X[:, None] - X[None]  # shape (n, n, d)
    tmp = np.einsum('...i,...j->...ij', dX, dX)  # shape (n, n, d, d)
    masks = labels[:, None] == labels[None]
    for it in range(self.max_iter):
        for i, label in enumerate(labels):
            mask = masks[i]
            Ax = A.dot(X.T).T  # shape (n, num_dims)

            softmax = np.exp(-((Ax[i] - Ax) ** 2).sum(axis=1))  # shape (n)
            softmax[i] = 0
            softmax /= softmax.sum()

            t = softmax[:, None, None] * tmp[i]  # shape (n, d, d)
            # gradient of the NCA objective (renamed from `d` to avoid
            # shadowing the feature dimension above)
            grad = softmax[mask].sum() * t.sum(axis=0) - t[mask].sum(axis=0)
            A += self.learning_rate * A.dot(grad)

    self.X_ = X
    self.A_ = A
    self.n_iter_ = it
    return self
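# Usage sketch for the NCA fit above -- a minimal illustration, assuming the
# enclosing class is constructed as NCA(num_dims=..., max_iter=..., learning_rate=...)
# and that the learned linear map is applied as X.dot(A_.T); the class name and
# constructor signature are assumptions, not taken from the source.
import numpy as np
from sklearn.datasets import load_iris

X_demo, y_demo = load_iris(return_X_y=True)
nca = NCA(num_dims=2, max_iter=50, learning_rate=0.01)  # hypothetical constructor
nca.fit(X_demo, y_demo)
X_embedded = X_demo.dot(nca.A_.T)  # project data into the learned metric space
print(X_embedded.shape)            # (150, 2)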
def setUp(self):
    # Define data file and read X and y
    # Generate some data if the source data is missing
    this_directory = path.abspath(path.dirname(__file__))
    mat_file = 'pima.mat'
    try:
        mat = loadmat(path.join(*[this_directory, 'data', mat_file]))
    except (TypeError, IOError):
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # fall back to synthetic data
    else:
        X = mat['X']
        y = mat['y'].ravel()
        X, y = check_X_y(X, y)

    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(X, y, test_size=0.4, random_state=42)

    self.clf = XGBOD(random_state=42)
    self.clf.fit(self.X_train, self.y_train)

    self.roc_floor = 0.8
def reduce_data(self, X, y):
    X, y = check_X_y(X, y, accept_sparse="csr")

    if self.classifier is None:
        self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

    prots_s = []
    labels_s = []

    classes = np.unique(y)
    self.classes_ = classes

    # seed the prototype set with one random instance per class
    for cur_class in classes:
        mask = y == cur_class
        insts = X[mask]
        prots_s.append(insts[np.random.randint(0, insts.shape[0])])
        labels_s.append(cur_class)

    self.classifier.fit(prots_s, labels_s)
    # absorb every sample that the current prototype set misclassifies
    for sample, label in zip(X, y):
        if self.classifier.predict(sample.reshape(1, -1))[0] != label:
            prots_s.append(sample)
            labels_s.append(label)
            self.classifier.fit(prots_s, labels_s)

    self.X_ = np.asarray(prots_s)
    self.y_ = np.asarray(labels_s)
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
def reduce_data(self, X, y):
    X, y = check_X_y(X, y, accept_sparse="csr")

    if self.classifier is None:
        self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
    if self.classifier.n_neighbors != self.n_neighbors:
        self.classifier.n_neighbors = self.n_neighbors

    classes = np.unique(y)
    self.classes_ = classes

    minority_class = self.pos_class
    if self.pos_class is None:
        minority_class = min(set(y), key=list(y).count)

    # load initial groups
    self.groups = []
    for label in classes:
        mask = y == label
        self.groups.append(_Group(X[mask], label))

    self._main_loop()
    self._generalization_step()
    # snapshot the minority-class groups before merge/pruning so they are preserved
    min_groups = [g for g in self.groups if g.label == minority_class]
    self._merge()
    self._pruning()
    max_groups = [g for g in self.groups if g.label != minority_class]
    self.groups = min_groups + max_groups

    self.X_ = np.asarray([g.rep_x for g in self.groups])
    self.y_ = np.asarray([g.label for g in self.groups])
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
def reduce_data(self, X, y):
    X, y = check_X_y(X, y, accept_sparse="csr")

    if self.classifier is None:
        self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
    if self.classifier.n_neighbors != self.n_neighbors:
        self.classifier.n_neighbors = self.n_neighbors

    classes = np.unique(y)
    self.classes_ = classes

    # load initial groups
    self.groups = []
    for label in classes:
        mask = y == label
        self.groups.append(_Group(X[mask], label))

    self._main_loop()
    self._generalization_step()
    self._merge()
    self._pruning()

    self.X_ = np.asarray([g.rep_x for g in self.groups])
    self.y_ = np.asarray([g.label for g in self.groups])
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
def reduce_data(self, X, y):
    if self.classifier is None:
        self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
    if self.classifier.n_neighbors != self.n_neighbors:
        self.classifier.n_neighbors = self.n_neighbors

    X, y = check_X_y(X, y, accept_sparse="csr")

    classes = np.unique(y)
    self.classes_ = classes

    if self.n_neighbors >= len(X):
        self.X_ = np.array(X)
        self.y_ = np.array(y)
        self.reduction_ = 0.0
        return self.X_, self.y_

    mask = np.zeros(y.size, dtype=bool)
    tmp_m = np.ones(y.size, dtype=bool)
    for i in range(y.size):
        # leave sample i out, fit on the rest, and keep i only if its
        # neighbours classify it correctly
        tmp_m[i] = not tmp_m[i]
        self.classifier.fit(X[tmp_m], y[tmp_m])
        sample, label = X[i], y[i]
        if self.classifier.predict(sample.reshape(1, -1))[0] == label:
            mask[i] = not mask[i]
        tmp_m[i] = not tmp_m[i]

    self.X_ = np.asarray(X[mask])
    self.y_ = np.asarray(y[mask])
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
def fit(self, X, y): '''Fit the model. Parameters ---------- X : (n, d) array-like Input data. y : (n,) array-like Class labels, one per point of data. ''' X, y = check_X_y(X, y) # Inject parameters into all fitness functions for f in self._fitness: f.inject_params( random_state=self.random_state, ) # Inject parameters into Strategy self._strategy.inject_params( n_dim=self._transformer.individual_size(X.shape[1]), fitness=self._fitness, transformer=self._transformer, random_state=self.random_state, verbose=self.verbose, ) # transformer functions using strategy by optimising _fitnesses self._strategy.fit(X, y) # Fit (fill) transformer with the weights from the best individual self._transformer.fit(X, y, self._strategy.best_individual()) return self
def fit(self, X, y, random_state=np.random):
    """Create constraints from labels and learn the LSML model.

    Parameters
    ----------
    X : (n x d) matrix
        Input data, where each row corresponds to a single instance.
    y : (n) array-like
        Data labels.
    random_state : numpy.random.RandomState, optional
        If provided, controls random number generation.
    """
    X, y = check_X_y(X, y)
    num_constraints = self.num_constraints
    if num_constraints is None:
        num_classes = len(np.unique(y))
        num_constraints = 20 * num_classes**2

    c = Constraints.random_subset(y, self.num_labeled,
                                  random_state=random_state)
    pairs = c.positive_negative_pairs(num_constraints, same_length=True,
                                      random_state=random_state)
    return LSML.fit(self, X, pairs, weights=self.weights)
def fit(self, X, y):
    # Convert data
    X, y = check_X_y(X, y, accept_sparse=("csr", "csc"),
                     multi_output=True, y_numeric=True)
    return self
def threshold_fit(X, y, alpha, n_class, mode='AE',
                  max_iter=1000, verbose=False, tol=1e-12):
    """
    Solve the general threshold-based ordinal regression model
    using the logistic loss as surrogate of the 0-1 loss.

    Parameters
    ----------
    mode : string, one of {'AE', '0-1', 'SE'}
    """
    X, y = check_X_y(X, y, accept_sparse='csr')
    unique_y = np.sort(np.unique(y))
    if not np.all(unique_y == np.arange(unique_y.size)):
        raise ValueError(
            'Values in y must be %s, got instead %s'
            % (np.arange(unique_y.size), unique_y))
    y = np.asarray(y)  # XXX check it's made of integers
    n_samples, n_features = X.shape

    # convert from c to theta
    L = np.zeros((n_class - 1, n_class - 1))
    L[np.tril_indices(n_class - 1)] = 1.

    if mode == 'AE':
        # loss forward difference
        loss_fd = np.ones((n_class, n_class - 1))
    elif mode == '0-1':
        loss_fd = np.diag(np.ones(n_class - 1)) + \
            np.diag(np.ones(n_class - 2), k=-1)
        loss_fd = np.vstack((loss_fd, np.zeros(n_class - 1)))
        loss_fd[-1, -1] = 1  # border case
    elif mode == 'SE':
        a = np.arange(n_class - 1)
        b = np.arange(n_class)
        loss_fd = np.abs((a - b[:, None])**2 - (a - b[:, None] + 1)**2)
    else:
        raise NotImplementedError

    x0 = np.zeros(n_features + n_class - 1)
    x0[X.shape[1]:] = np.arange(n_class - 1)
    options = {'maxiter': max_iter, 'disp': verbose}
    if n_class > 2:
        bounds = [(None, None)] * (n_features + 1) + \
            [(0, None)] * (n_class - 2)
    else:
        bounds = None

    sol = optimize.minimize(obj_margin, x0, method='L-BFGS-B',
                            jac=grad_margin,
                            args=(X, y, alpha, n_class, loss_fd, L),
                            bounds=bounds, options=options, tol=tol)
    if not sol.success:
        print(sol.message)
    w, c = sol.x[:X.shape[1]], sol.x[X.shape[1]:]
    theta = L.dot(c)
    return w, theta
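# Call sketch for threshold_fit -- synthetic data only; assumes obj_margin and
# grad_margin from the same module are importable, and uses the usual thresholded
# decision rule (count how many cut points theta lie below X @ w), which is an
# assumption about the companion predict step rather than part of this function.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randn(100, 5)
# ordinal labels must be consecutive integers 0..n_class-1, as the check requires
y_demo = np.digitize(X_demo[:, 0], bins=[-0.5, 0.5])  # values in {0, 1, 2}

w, theta = threshold_fit(X_demo, y_demo, alpha=1.0, n_class=3, mode='AE')
pred = np.sum(X_demo.dot(w)[:, None] > theta[None, :], axis=1)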
def check_X_y(self, X, y):
    from sklearn.utils.validation import check_X_y
    if X.shape[0] > self.max_train_size_:
        raise Exception("X_train size cannot exceed {} ({})"
                        .format(self.max_train_size_, X.shape[0]))
    return check_X_y(X, y, multi_output=True,
                     allow_nd=True, y_numeric=True,
                     estimator="GPRNP")
def fit(self, X, y, sample_weight=None):
    # Convert data
    X, y = check_X_y(X, y, accept_sparse=("csr", "csc"),
                     multi_output=True, y_numeric=True)
    # Function is only called after we verify that pandas is installed
    from pandas import Series
    if isinstance(sample_weight, Series):
        raise ValueError("Estimator does not accept 'sample_weight' "
                         "of type pandas.Series")
    return self
def fit(self, X, y, sample_weight=None): """ Build a classifier from the training set (X, y). Parameters ---------- X : array-like or sparse matrix of shape = [n_samples, n_features] The training input samples. y : array-like, shape = [n_samples] The target values (class labels in classification). sample_weight : array-like, shape = [n_samples] or None Individual weights for each sample. Returns ------- self : object Returns self. """ self._validate_params(**self.get_params()) X, y = check_X_y(X, y, accept_sparse=True) if sp.isspmatrix(X): self._is_sparse_train_X = True else: self._is_sparse_train_X = False self._n_samples, self._n_features = X.shape sample_weight = self._get_sample_weight(sample_weight) check_consistent_length(X, y, sample_weight) check_classification_targets(y) self._classes = sorted(np.unique(y)) self._n_classes = len(self._classes) self._classes_map = {} self._set_params_with_dependencies() params = self._get_params() if self._n_classes == 2: self._classes_map[0] = self._classes[0] self._classes_map[1] = self._classes[1] self._estimators = [None] y = (y == self._classes[0]).astype(int) self._fit_binary_task(X, y, sample_weight, params) elif self._n_classes > 2: if sp.isspmatrix_dok(X): X = X.tocsr().tocoo() # Fix to avoid scipy 7699 issue self._estimators = [None] * self._n_classes self._fit_multiclass_task(X, y, sample_weight, params) else: raise ValueError("Classifier can't predict when only one class is present.") self._fitted = True return self
def fit(self, X, y): # Check data X, y = np.array(X), np.array(y) X, y = check_X_y(X, y) # Split to grow cascade and validate mask = np.random.random(y.shape[0]) < self.validation_fraction X_tr, X_vl = X[mask], X[~mask] y_tr, y_vl = y[mask], y[~mask] self.classes_ = unique_labels(y) self.layers_, inp_tr, inp_vl = [], X_tr, X_vl self.scores_ = [] # First layer forests = [RandomForestClassifier(max_features=1, n_estimators=self.n_estimators, min_samples_split=10, criterion='gini', n_jobs=-1), # Complete random RandomForestClassifier(max_features=1, n_estimators=self.n_estimators, min_samples_split=10, criterion='gini', n_jobs=-1), # Complete random RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=-1), RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=-1)] _ = [f.fit(inp_tr, y_tr) for f in forests] p_vl = [f.predict_proba(inp_vl) for f in forests] labels = [self.classes_[i] for i in np.argmax(np.array(p_vl).mean(axis=0), axis=1)] score = self.scoring(y_vl, labels) self.layers_.append(forests) self.scores_.append(score) p_tr = [cross_val_predict(f, inp_tr, y_tr, cv=self.cv, method='predict_proba') for f in forests] # Fit other layers last_score = score inp_tr, inp_vl = np.concatenate([X_tr]+p_tr, axis=1), np.concatenate([X_vl]+p_vl, axis=1) while True: # Grow cascade forests = [RandomForestClassifier(max_features=1, n_estimators=self.n_estimators, min_samples_split=10, criterion='gini', n_jobs=-1), # Complete random RandomForestClassifier(max_features=1, n_estimators=self.n_estimators, min_samples_split=10, criterion='gini', n_jobs=-1), # Complete random RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=-1), RandomForestClassifier(n_estimators=self.n_estimators, n_jobs=-1)] _ = [forest.fit(inp_tr, y_tr) for forest in forests] # Fit the forest p_vl = [forest.predict_proba(inp_vl) for forest in forests] labels = [self.classes_[i] for i in np.argmax(np.array(p_vl).mean(axis=0), axis=1)] score = self.scoring(y_vl, labels) if score - last_score > self.tolerance: self.layers_.append(forests) p_tr = [cross_val_predict(f, inp_tr, y_tr, cv=self.cv, method='predict_proba') for f in forests] inp_tr, inp_vl = np.concatenate([X_tr]+p_tr, axis=1), np.concatenate([X_vl]+p_vl, axis=1) self.scores_.append(score) last_score = score print(self.scores_) else: break # Retrain on entire dataset inp_ = X for forests in self.layers_: _ = [f.fit(inp_, y) for f in forests] p = [cross_val_predict(f, inp_, y, cv=self.cv, method='predict_proba') for f in forests] inp_ = np.concatenate([X]+p, axis=1) return self
def fit(self, X, y):
    check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo', 'dok',
                                   'bsr', 'lil', 'dia'])
    check_array(X, accept_sparse=['csc', 'csr', 'coo', 'dok',
                                  'bsr', 'lil', 'dia'])
    self.X_ = X
    check_classification_targets(y)

    n_samples = len(y)
    # labels are expected to be 1..K, with 0 marking unlabeled nodes
    n_classes = len(np.unique(y[y != 0]))

    # create diagonal matrix of degree of nodes
    if sparse.isspmatrix(self.X_):
        B_ = self.X_.copy().astype(float)
        D = np.array(csr_matrix.sum(self.X_, axis=1), dtype=float).T[0]
    else:
        B_ = np.copy(self.X_).astype(float)
        D = np.array(np.sum(self.X_, axis=1), dtype=float)

    # if -sigma and sigma-1 are not equal, the diagonal matrices applied on
    # the left and on the right differ
    if (- self.sigma) == (self.sigma - 1):
        D_left = D_right = np.power(D, - self.sigma)
    else:
        D_left = np.power(D, - self.sigma)
        D_right = np.power(D, self.sigma - 1)

    # M_ = D_left.dot(B_)
    for i, d in enumerate(D_left):
        B_[i, :] *= d
    # B_ = M_.dot(D_right)
    for i, d in enumerate(D_right):
        B_[:, i] *= d

    # create labeled data Z
    dimension = (n_samples, n_classes)
    labels = np.nonzero(y)
    ans_y = np.zeros(dimension)
    for l in labels[0]:
        ans_y[l][y[l] - 1] = 1

    Z_ = (self.sigma / (1 + self.sigma)) * ans_y
    self.initial_vector_ = np.ones(dimension) / n_classes
    self._get_method_(B_, Z_)
    return self
def path_calc(X, y, X_holdout, y_holdout, alphas, paramgrid, colname = 'CV', yname = '', method = 'Elastic Net'): #make a copy of the parameters before popping things off copy_params = copy.deepcopy(paramgrid) fit_intercept = copy_params.pop('fit_intercept') precompute = copy_params.pop('precompute') copy_X = copy_params.pop('copy_X') normalize = False # this code adapted from sklearn ElasticNet fit function, which unfortunately doesn't accept multiple alphas at once X, y = check_X_y(X, y, accept_sparse='csc', order='F', dtype=[np.float64, np.float32], copy=copy_X and fit_intercept, multi_output=True, y_numeric=True) y = check_array(y, order='F', copy=False, dtype=X.dtype.type, ensure_2d=False) #this is the step that gives the data to find intercept if fit_intercept is true. X, y, X_offset, y_offset, X_scale, precompute, Xy = _pre_fit(X, y, None, precompute, normalize, fit_intercept, copy=False) y = np.squeeze(y) #do the path calculation, and tell how long it took print('Calculating path...') start_t = time.time() if method == 'Elastic Net': path_alphas, path_coefs, path_gaps, path_iters = enet_path(X, y, alphas=alphas, return_n_iter = True, **copy_params) if method == 'LASSO': path_alphas, path_coefs, path_gaps, path_iters = lasso_path(X, y, alphas=alphas, return_n_iter=True, **copy_params) dt = time.time() - start_t print('Took ' + str(dt) + ' seconds') #create some empty arrays to store the result y_pred_holdouts = np.empty(shape=(len(alphas),len(y_holdout))) intercepts = np.empty(shape=(len(alphas))) rmses = np.empty(shape=(len(alphas))) cvcols = [] for j in list(range(len(path_alphas))): coef_temp = path_coefs[:, j] if fit_intercept: coef_temp = coef_temp / X_scale intercept = y_offset - np.dot(X_offset, coef_temp.T) else: intercept = 0. y_pred_holdouts[j,:] = np.dot(X_holdout, path_coefs[:, j]) + intercept intercepts[j] = intercept rmses[j] = RMSE(y_pred_holdouts[j,:], y_holdout) cvcols.append(('predict','"'+ method + ' - ' + yname + ' - ' + colname + ' - Alpha:' + str(path_alphas[j]) + ' - ' + str(paramgrid) + '"')) return path_alphas, path_coefs, intercepts, path_iters, y_pred_holdouts, rmses, cvcols
def fit(self, X, y): """Fit the RVR to the training data.""" X, y = check_X_y(X, y) n_samples, n_features = X.shape self.phi = self._apply_kernel(X, X) n_basis_functions = self.phi.shape[1] self.relevance_ = X self.y = y self.alpha_ = self.alpha * np.ones(n_basis_functions) self.beta_ = self.beta self.m_ = np.zeros(n_basis_functions) self.alpha_old = self.alpha_ for i in range(self.n_iter): self._posterior() self.gamma = 1 - self.alpha_*np.diag(self.sigma_) self.alpha_ = self.gamma/(self.m_ ** 2) if not self.beta_fixed: self.beta_ = (n_samples - np.sum(self.gamma))/( np.sum((y - np.dot(self.phi, self.m_)) ** 2)) self._prune() if self.verbose: print("Iteration: {}".format(i)) print("Alpha: {}".format(self.alpha_)) print("Beta: {}".format(self.beta_)) print("Gamma: {}".format(self.gamma)) print("m: {}".format(self.m_)) print("Relevance Vectors: {}".format(self.relevance_.shape[0])) print() delta = np.amax(np.absolute(self.alpha_ - self.alpha_old)) if delta < self.tol and i > 1: break self.alpha_old = self.alpha_ if self.bias_used: self.bias = self.m_[-1] else: self.bias = None return self
def fit(self, X, y):
    X, y = check_X_y(X, y)
    self.classes_ = unique_labels(y)
    self.X_ = DynamicBayesianClassifier._first_col(X)
    self.y_ = y
    self.size_ = self.X_.size
    for i in range(self.X_.size):
        if y[i] not in self.dbayesmode_major_:
            self.dbayesmode_major_[y[i]] = scalgoutil.DBayesMode(y[i])
        self.dbayesmode_major_[y[i]].update(self.X_[i])
        self.update_priors()
    return self
def fit(self, X, y): """Fit Gaussian process classification model Parameters ---------- X : array-like, shape = (n_samples, n_features) Training data y : array-like, shape = (n_samples,) Target values, must be binary Returns ------- self : returns an instance of self. """ X, y = check_X_y(X, y, multi_output=False) self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( self.kernel, self.optimizer, self.n_restarts_optimizer, self.max_iter_predict, self.warm_start, self.copy_X_train, self.random_state) self.classes_ = np.unique(y) self.n_classes_ = self.classes_.size if self.n_classes_ == 1: raise ValueError("GaussianProcessClassifier requires 2 or more " "distinct classes; got %d class (only class %s " "is present)" % (self.n_classes_, self.classes_[0])) if self.n_classes_ > 2: if self.multi_class == "one_vs_rest": self.base_estimator_ = \ OneVsRestClassifier(self.base_estimator_, n_jobs=self.n_jobs) elif self.multi_class == "one_vs_one": self.base_estimator_ = \ OneVsOneClassifier(self.base_estimator_, n_jobs=self.n_jobs) else: raise ValueError("Unknown multi-class mode %s" % self.multi_class) self.base_estimator_.fit(X, y) if self.n_classes_ > 2: self.log_marginal_likelihood_value_ = np.mean( [estimator.log_marginal_likelihood() for estimator in self.base_estimator_.estimators_]) else: self.log_marginal_likelihood_value_ = \ self.base_estimator_.log_marginal_likelihood() return self
def fit(self, x, y):
    # y = y.values
    x, y = check_X_y(x, y, accept_sparse=True)

    def pr(x, y_i, y):
        p = x[y == y_i].sum(0)
        return (p + 1) / ((y == y_i).sum() + 1)

    self._r = sparse.csr_matrix(np.log(pr(x, 1, y) / pr(x, 0, y)))
    x_nb = x.multiply(self._r)
    self._clf = LogisticRegression(C=self.C, dual=self.dual,
                                   n_jobs=self.n_jobs).fit(x_nb, y)
    return self
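# Usage sketch for the NB-SVM style fit above -- the naive-Bayes log-count ratio
# _r re-weights sparse TF-IDF features before the logistic regression is trained.
# The wrapper class name and its constructor arguments (C, dual, n_jobs) are
# assumptions for illustration; only the fit body itself comes from the source.
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ["good movie", "terrible plot", "great acting", "awful film"]
labels = [1, 0, 1, 0]

x_tfidf = TfidfVectorizer().fit_transform(texts)    # sparse CSR matrix
clf = NbSvmClassifier(C=4.0, dual=True, n_jobs=1)   # hypothetical wrapper class
clf.fit(x_tfidf, labels)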
def fit(self, X, y): """ Train the Logistic model, X and y are numpy arrays. """ X, y = check_X_y(X, y) #, accept_sparse=['csr', 'csc']) # not sure how to handle sparse self.classes_, y = np.unique(y, return_inverse=True) if self.fit_intercept: X = np.insert(X, 0, 1, axis=1) w0 = np.zeros(X.shape[1]) if self.bounds is None: self.bounds_ = [(None, None) for v in w0] elif isinstance(self.bounds, tuple) and len(self.bounds) == 2: self.bounds_ = [self.bounds for v in w0] elif self.fit_intercept and len(self.bounds) == len(w0) - 1: self.bounds_ = np.concatenate(([(None, None)], self.bounds)) else: self.bounds_ = self.bounds if len(self.bounds_) != len(w0): raise ValueError("Bounds must be the same length as the coef") if isinstance(self.l2, Number): self.l2_ = [self.l2 for v in w0] elif self.fit_intercept and len(self.l2) == len(w0) - 1: self.l2_ = np.insert(self.l2, 0, 0) else: self.l2_ = self.l2 if len(self.l2_) != len(w0): raise ValueError("L2 penalty must be the same length as the coef, be sure the intercept is accounted for.") # the intercept should never be regularized. if self.fit_intercept: self.l2_[0] = 0.0 w = minimize(_ll, w0, args=(X, y, self.l2_), jac=_ll_grad, method=self.method, bounds=self.bounds_, options={'maxiter': self.max_iter, #'disp': True })['x'] if self.fit_intercept: self.intercept_ = w[0:1] self.coef_ = w[1:] else: self.intercept_ = np.array([]) self.coef_ = w return self
def fit(self, X, y):
    self.X_, y = check_X_y(X, y, dtype=float)
    labels = MulticlassLabels(y)
    self._lmnn = shogun_LMNN(RealFeatures(self.X_.T), labels, self.k)
    self._lmnn.set_maxiter(self.max_iter)
    self._lmnn.set_obj_threshold(self.convergence_tol)
    self._lmnn.set_regularization(self.regularization)
    self._lmnn.set_stepsize(self.learn_rate)
    if self.use_pca:
        self._lmnn.train()
    else:
        self._lmnn.train(np.eye(X.shape[1]))
    self.L_ = self._lmnn.get_linear_transform()
    return self
def fit(self, X, y):
    X, y = check_X_y(X, y, multi_output=True)
    self.reshape(X)  # computes self.XX_
    self.y_ = y
    self.nn_ = define_model_all(shape=self.shape_,
                                n_feat=self.n_feat_,
                                filter_size=self.filter_size_,
                                nhid1=self.nhid1_, nhid2=self.nhid2_,
                                pool_size=self.pool_size_, lr=self.lr_)
    self.history_ = self.nn_.fit(self.XX_, self.y_,
                                 batch_size=self.batch_size_,
                                 nb_epoch=self.nb_epoch_,
                                 validation_split=self.validation_split_,
                                 verbose=0)
    return self
def reduce_data(self, X, y):
    X, y = check_X_y(X, y, accept_sparse="csr")

    classes = np.unique(y)
    self.classes_ = classes

    self.main_loop(X, y)

    best_index = np.argmax(self.evaluations)
    mask = np.asarray(self.chromosomes[best_index], dtype=bool)
    self.X_ = X[mask]
    self.y_ = y[mask]
    self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
    return self.X_, self.y_
def fit(self, X, y):
    X, y = check_X_y(X, y, multi_output=True, y_numeric=True,
                     force_all_finite=False)
    if self.use_mcmc:
        self.mcmc = pymc.MCMC(self.lasso_model(X, y, self.sigma2))
        self.mcmc.sample(self.mcmc_trials, self.mcmc_burn, 2)
        self.num_betas = X.shape[1]
        traces = []
        for i in range(self.num_betas):
            traces.append(self.mcmc.trace('beta_{}'.format(i))[:])
        self.coef_ = np.array([np.mean(trace) for trace in traces])
    else:
        self._map = pymc.MAP(self.lasso_model(X, y, self.sigma2))
        self._map.fit()
        self.coef_ = np.array([beta.value for beta in self._map.betas])
    return self
def predict_logpdf(self, X, y, nsamples=200, likelihood_args=()): r""" Predictive log-probability density function of a Bayesian GLM. Parameters ---------- X : ndarray (N*,d) array query input dataset (N* samples, D dimensions). y : float or ndarray The test observations of shape (N*,) to evaluate under, :math:`\log p(y^* |\mathbf{x}^*, \mathbf{X}, y)`. nsamples : int, optional Number of samples for sampling the log predictive distribution. likelihood_args : sequence, optional sequence of arguments to pass to the likelihood function. These are non-learnable parameters. They can be scalars or arrays of length N*. Returns ------- logp : ndarray The log probability of y* given X* of shape (N*,). logp_min : ndarray The minimum sampled values of the predicted log probability (same shape as p) logp_max : ndarray The maximum sampled values of the predicted log probability (same shape as p) """ X, y = check_X_y(X, y) # Get latent function samples N = X.shape[0] ps = np.empty((N, nsamples)) fsamples = self._sample_func(X, nsamples) # Push samples though likelihood pdf llargs = tuple(chain(atleast_list(self.like_hypers_), likelihood_args)) for i, f in enumerate(fsamples): ps[:, i] = self.likelihood.loglike(y, f, *llargs) # Average transformed samples (MC integration) logp = ps.mean(axis=1) logp_min = ps.min(axis=1) logp_max = ps.max(axis=1) return logp, logp_min, logp_max
def fit(self, X, y):
    X, y = check_X_y(X, y, accept_sparse=("csr", "csc", "coo"),
                     accept_large_sparse=True, multi_output=True,
                     y_numeric=True)
    if sp.issparse(X):
        if X.getformat() == "coo":
            if X.row.dtype == "int64" or X.col.dtype == "int64":
                raise ValueError("Estimator doesn't support 64-bit indices")
        elif X.getformat() in ["csc", "csr"]:
            if X.indices.dtype == "int64" or X.indptr.dtype == "int64":
                raise ValueError("Estimator doesn't support 64-bit indices")
    return self
def fit(self, X, y):
    """A reference implementation of a fitting function.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_samples, n_features]
        The training input samples.
    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (class labels in classification, real numbers in
        regression).

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_X_y(X, y)
    return self
def fit(self, X, y):
    """Fit the model.

    Args:
        X (ndarray): Training data of shape ``(n_samples, n_features)``.
        y (ndarray): Target values of shape ``(n_samples,)``.

    Returns:
        self

    Raises:
        FitError: If the fitting failed.
    """
    X, y = check_X_y(X, y, y_numeric=True)
    C = self.C
    cost_func, cost_opts = self._check_cost_func(
        self.cost_func, self.cost_opts)
    reg_cost_func, reg_cost_opts = self._check_cost_func(
        self.reg_cost_func, self.reg_cost_opts)

    # add a column of ones to X (for intercept coefficient)
    X = np.hstack((np.ones((X.shape[0], 1), dtype=float), X))

    def objective(W):
        # compute training cost/grad
        cost, outer_grad = cost_func(np.dot(X, W) - y, **cost_opts)
        grad = np.dot(outer_grad, X)  # chain rule
        # add regularization cost/grad (but don't regularize intercept)
        reg_cost, reg_grad = reg_cost_func(W[1:], **reg_cost_opts)
        cost += C * reg_cost
        grad[1:] += C * reg_grad
        return cost, grad

    initial_coef_ = np.zeros(X.shape[1])
    res = scipy.optimize.minimize(
        objective, initial_coef_, jac=True, method='L-BFGS-B')
    if res.success:
        self.coef_ = res.x
    else:
        raise FitError("Fit failed: {}".format(res.message), res=res)
    return self
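# Sketch of cost functions matching the (value, gradient) contract that the
# objective() closure above relies on: each callable receives a residual or
# weight vector plus keyword options and returns (scalar cost, elementwise
# gradient). These particular implementations are illustrative assumptions,
# not the library's own cost functions.
import numpy as np

def squared_error_cost(residuals, **opts):
    # 0.5 * sum(r^2); gradient with respect to the residuals is r itself
    return 0.5 * np.sum(residuals ** 2), residuals

def l2_reg_cost(weights, **opts):
    # ridge penalty applied to the non-intercept coefficients
    return 0.5 * np.sum(weights ** 2), weights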
def fit(self, X, y):
    """Calculates the hash of X_train."""
    check_X_y(X, y, estimator=self)
    self.X_hash_ = self._hash(X)
    self.dim_ = X.shape[1]
    return self
def fit(self, X, y): """Fit the model using X and y as training data. Parameters ---------- X : numpy array of shape (n_samples, n_features) Training data. y : numpy array of shape (n_samples,) The ground truth (binary label) - 0 : inliers - 1 : outliers Returns ------- self : object """ # Validate inputs X and y X, y = check_X_y(X, y) X = check_array(X) self._set_n_classes(y) self.n_detector_ = self._validate_estimator(X) self.X_train_add_ = np.zeros([X.shape[0], self.n_detector_]) # keep the standardization scalar for test conversion X_norm, self._scalar = standardizer(X, keep_scalar=True) for ind, estimator in enumerate(self.estimator_list): if self.standardization_flag_list[ind]: estimator.fit(X_norm) self.X_train_add_[:, ind] = estimator.decision_scores_ else: estimator.fit(X) self.X_train_add_[:, ind] = estimator.decision_scores_ # construct the new feature space self.X_train_new_ = np.concatenate((X, self.X_train_add_), axis=1) # initialize, train, and predict on XGBoost self.clf_ = clf = XGBClassifier( max_depth=self.max_depth, learning_rate=self.learning_rate, n_estimators=self.n_estimators, silent=self.silent, objective=self.objective, booster=self.booster, n_jobs=self.n_jobs, nthread=self.nthread, gamma=self.gamma, min_child_weight=self.min_child_weight, max_delta_step=self.max_delta_step, subsample=self.subsample, colsample_bytree=self.colsample_bytree, colsample_bylevel=self.colsample_bylevel, reg_alpha=self.reg_alpha, reg_lambda=self.reg_lambda, scale_pos_weight=self.scale_pos_weight, base_score=self.base_score, random_state=self.random_state, missing=self.missing, **self.kwargs) self.clf_.fit(self.X_train_new_, y) self.decision_scores_ = self.clf_.predict_proba(self.X_train_new_)[:, 1] self.labels_ = self.clf_.predict(self.X_train_new_).ravel() return self
def fit(self, X, y): """Fit a semi-supervised label propagation model based All the input data is provided matrix X (labeled and unlabeled) and corresponding label matrix y with a dedicated marker value for unlabeled samples. Parameters ---------- X : array-like, shape = [n_samples, n_features] A {n_samples by n_samples} size matrix will be created from this y : array_like, shape = [n_samples] n_labeled_samples (unlabeled points are marked as -1) All unlabeled samples will be transductively assigned labels Returns ------- self : returns an instance of self. """ X, y = check_X_y(X, y) self.X_ = X check_classification_targets(y) # actual graph construction (implementations should override this) graph_matrix = self._build_graph() # label construction # construct a categorical distribution for classification only classes = np.unique(y) classes = (classes[classes != -1]) self.classes_ = classes n_samples, n_classes = len(y), len(classes) alpha = self.alpha if self._variant == 'spreading' and \ (alpha is None or alpha <= 0.0 or alpha >= 1.0): raise ValueError('alpha=%s is invalid: it must be inside ' 'the open interval (0, 1)' % alpha) y = np.asarray(y) unlabeled = y == -1 # initialize distributions self.label_distributions_ = np.zeros((n_samples, n_classes)) V = self.X_.shape[0] for label in classes: self.label_distributions_[y == label, classes == label] = 1 y_static = np.copy(self.label_distributions_) if self._variant == 'propagation': # LabelPropagation y_static[unlabeled] = 0 else: # LabelSpreading y_static *= 1 - alpha l_previous = np.zeros((self.X_.shape[0], n_classes)) unlabeled = unlabeled[:, np.newaxis] if sparse.isspmatrix(graph_matrix): graph_matrix = graph_matrix.tocsr() import Queue as Q def D_theta(p_uv): X = np.random.multinomial(1, [p_uv,1-p_uv], size=1) return 1 if X[0][0] == 0 else np.inf def q_u(y_v): return 1.0/y_v for self.n_iter_ in range(self.max_iter): if np.abs(self.label_distributions_ - l_previous).sum() < self.tol: break l_previous = self.label_distributions_ q = Q.PriorityQueue()#.put(np.where(unlabeled==False) dist = np.full(V, np.inf) # distance for j in np.argwhere(unlabeled==False)[:,0]: dist[j]=0 q.put((dist[j],j)) while not q.empty(): dist_v, v= Q.get() for u in range(V): delta_uv = D_theta(graph_matrix[u][v]) if delta_uv == np.inf: # not infected continue alt = dist [v] + graph_matrix[u][v] + q_u(y[v]) if alt < dist[u]: dist[u] = alt y[u] = y[v] # u inherits label from parent v self.label_distributions_[u,y[v]] +=1 Q.put(dist[u],u) else: warnings.warn( 'max_iter=%d was reached without convergence.' % self.max_iter, category=ConvergenceWarning ) self.n_iter_ += 1 for i in range(self.max_iter): normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] self.label_distributions_ /= normalizer # set the transduction item transduction = self.classes_[np.argmax(self.label_distributions_, axis=1)] self.transduction_ = transduction.ravel() return self
def fit(self, X, y): """ Kernelizes passed data and then fits data according to passed model. Function inherits all attributes and features of SKLearn's base esimator class as well as passed model. As part of fit process, feature data is kernelized (based on instance kernel parameter) and normalized -- should parameterize normalization or functionalize both together outside `fit`. __Parameters__ > __X__ : ndarray of shape (n_samples, n_features) >- Training data > > __y__ : ndarray of shape (n_samples, spatial dimensions) >- Response data (location of Tx for each sample set > of measurements) __Returns__ > Self, sets self.X_, self.Y_ """ # Check that X and y have correct shape X, y = check_X_y(X, y, multi_output=True) # Check that number of kernels and number of kernel scales is same if self.n_kernels != len(self.n_meas_array): raise ValueError("n_kernels is not same as number of n_meas_array") # Check that number of each measurement types is correct if sum(self.n_meas_array) != X.shape[1]: raise ValueError( "Sum of n_meas_array is not same as number of features in X") #put lambdau into ndarray self.lambdau = np.array([self.lambdau]) #put kernel scales together (reset in case called multiple times) kernel_scales = np.array([self.kernel_s0]) for i in range(1, self.n_kernels): kernel_scales = np.append(kernel_scales, self.get_params()["kernel_s" + str(i)]) # Generate kernelized matrix for fit input X_kernel = HFF_k_matrix(fml=X, kernel=self.skl_kernel, num_meas_array=self.n_meas_array, varMs=kernel_scales) #normalize X_kernel = Normalizer().fit_transform(X_kernel) # Fit self.glmnet_model = glmnet(x=X_kernel, y=y.copy(), alpha=self.glm_alpha, lambdau=self.lambdau, **self.glmnet_args) # Store X,y seen during fit self.X_ = X self.y_ = y # Return the regressor return self
def partial_fit(self, X, y=None, **fit_params): """ A wrapper around the partial_fit function. Parameters ---------- X : xarray DataArray, Dataset or other array-like The input samples. y : xarray DataArray, Dataset or other array-like The target values. """ if self.estimator is None: raise ValueError("You must specify an estimator instance to wrap.") if is_target(y): y = y(X) if is_dataarray(X): if not hasattr(self, "type_"): self.type_ = "DataArray" self.estimator_ = self._fit(X, y, **fit_params) elif self.type_ == "DataArray": self.estimator_ = self._partial_fit(self.estimator_, X, y, **fit_params) else: raise ValueError( "This wrapper was not fitted for DataArray inputs.") # TODO: check if this needs to be removed for compat wrappers for v in vars(self.estimator_): if v.endswith("_") and not v.startswith("_"): setattr(self, v, getattr(self.estimator_, v)) elif is_dataset(X): if not hasattr(self, "type_"): self.type_ = "Dataset" self.estimator_dict_ = { v: self._fit(X[v], y, **fit_params) for v in X.data_vars } elif self.type_ == "Dataset": self.estimator_dict_ = { v: self._partial_fit(self.estimator_dict_[v], X[v], y, **fit_params) for v in X.data_vars } else: raise ValueError("This wrapper was not fitted for Dataset inputs.") # TODO: check if this needs to be removed for compat wrappers for e_name, e in self.estimator_dict_.items(): for v in vars(e): if v.endswith("_") and not v.startswith("_"): if hasattr(self, v): getattr(self, v).update({e_name: getattr(e, v)}) else: setattr(self, v, {e_name: getattr(e, v)}) else: if not hasattr(self, "type_"): self.type_ = "other" if y is None: X = check_array(X) else: X, y = check_X_y(X, y) self.estimator_ = clone(self.estimator).fit(X, y, **fit_params) elif self.type_ == "other": self.estimator_ = self.estimator_.partial_fit(X, y, **fit_params) else: raise ValueError("This wrapper was not fitted for other inputs.") # TODO: check if this needs to be removed for compat wrappers for v in vars(self.estimator_): if v.endswith("_") and not v.startswith("_"): setattr(self, v, getattr(self.estimator_, v)) return self
def fit(self, training_data, training_labels): X, y = check_X_y(training_data, training_labels) training_data = training_data.T self.training_data = training_data self.training_labels = training_labels self.fstar = np.zeros( training_data.shape ) # selected features for each representative point; if fstar(i, j) = 1, ith feature is selected for jth representative point. self.fstar_lin = np.zeros( training_data.shape ) # fstar before applying randomized rounding process m_features, n_observations = training_data.shape # (M, N) M number of candidate features, N observations n_total_cls = [np.sum(training_labels ^ 1), np.sum(training_labels) ] # Total number of each class in our training data overall_feasibility = np.zeros((n_observations, self.n_beta)) overall_radious = np.zeros((n_observations, self.n_beta)) overall_b_ratio = np.zeros((n_observations, self.n_beta)) tb_temp = np.zeros((m_features, n_observations, self.n_beta)) tr_temp = np.zeros((m_features, n_observations, self.n_beta)) for z in range(0, self.tau): # For each feature across all observations in our training set we calculate a [0,1] fstar value for i_observation in range(0, n_observations): current_observation = training_data[:, i_observation][..., None] selected_label = training_labels[i_observation] matching_label = selected_label nonmatching_label = selected_label ^ 1 excluding_selected = np.ones( (1, n_observations), dtype=bool)[0] # mask for all observations not at i excluding_selected[ i_observation] = False # deselect the i'th observations excluded_labels = training_labels[ excluding_selected] # get the labels not at i, (convert to bool so we can use it as a mask) n_excluded_class = [ np.sum(excluded_labels ^ 1), np.sum(excluded_labels) ] fstar_mask = self.fstar.astype( bool ) # fstar mask to select all active features for this observation # Calculate the difference between this obseration and all other observations observation_distances = (training_data - current_observation)**2 # adjust weighting for each observation based on if we've previously found fstar values for the features observation_weight = np.zeros((n_observations, n_observations)) for i in range(n_observations): fstar_feature_dist_0 = ( np.sqrt( np.sum( observation_distances[:, excluding_selected & (training_labels == 0)] * fstar_mask[:, i][..., None], axis=0)) ) # total distance along features selected by fstar fstar_feature_dist_1 = ( np.sqrt( np.sum( observation_distances[:, excluding_selected & (training_labels == 1)] * fstar_mask[:, i][..., None], axis=0)) ) # total distance along features selected by fstar w11 = np.exp( (-(fstar_feature_dist_1 - np.min(fstar_feature_dist_1)) **2) / self.sigma) w22 = np.exp( (-(fstar_feature_dist_0 - np.min(fstar_feature_dist_0)) **2) / self.sigma) observation_weight[i, excluding_selected & (training_labels == 0)] = w22 observation_weight[i, excluding_selected & (training_labels == 1)] = w11 # observation_weight[i, excluding_selected] = np.concatenate((w22, w11)) average_observation_weight = np.mean( observation_weight, axis=0) # mean weight of all observations or features normalized_weight = np.zeros( (1, n_observations) ) # normalized weight for all observations not including the current observation normalized_weight[:, training_labels == 0] = average_observation_weight[ training_labels == 0] / np.sum( average_observation_weight[ training_labels == 0]) normalized_weight[:, training_labels == 1] = average_observation_weight[ training_labels == 1] / np.sum( average_observation_weight[ training_labels == 1]) # Find 
the average weighted difference for each feature average_feature_distances = [0, 0] average_feature_distances[matching_label] = np.sum( normalized_weight[0, excluding_selected & (training_labels == matching_label)] * observation_distances[:, excluding_selected & (training_labels == matching_label)], axis=1) / (n_total_cls[matching_label] - 1) average_feature_distances[nonmatching_label] = np.sum( normalized_weight[0, excluding_selected & (training_labels == nonmatching_label)] * observation_distances[:, excluding_selected & ( training_labels == nonmatching_label)], axis=1) / n_total_cls[nonmatching_label] A_ub_0 = np.concatenate( (np.ones((1, m_features)), -np.ones((1, m_features))), axis=0 ) # The inequality constraint matrix. Each row of A_ub specifies the coefficients of a linear inequality constraint on x. b_ub_0 = np.array( [[self.alpha], [-1]] ) # The inequality constraint vector. Each element represents an upper bound on the corresponding value of A_ub @ x. linprog_res_0 = linprog( -average_feature_distances[nonmatching_label], A_ub=A_ub_0, b_ub=b_ub_0, bounds=(0, 1), method='interior-point', options={ 'tol': 0.000001, 'maxiter': 200 }) # This is secretly a maximization function if linprog_res_0.success: epsilon_max = -linprog_res_0.fun for i_beta in range( 0, self.n_beta ): # beta is kind of the granularity or resolution, higher = better estimation? beta = np.round(1 / self.n_beta * (i_beta + 1), decimals=15) epsilon = beta * epsilon_max A_ub_1 = np.vstack( (np.ones((1, m_features)), -np.ones( (1, m_features)), -average_feature_distances[nonmatching_label])) b_ub_1 = np.vstack( (self.alpha, -1, -epsilon)) # b1 TODO: Rename linprog_res_1 = linprog( average_feature_distances[matching_label], A_ub=A_ub_1, b_ub=b_ub_1, bounds=(0.0, 1.0), method='interior-point', options={ 'tol': 1e-6, 'maxiter': 200 }) class_estimations = linprog_res_1.x[..., None] if linprog_res_1.success: # Random rounding, for each of our estimates that are close to 0.5 (in the middle of what class it should be) if self.rr_seed is not None: np.random.seed(seed=self.rr_seed) random_numbers = np.random.rand( m_features, self.nrrp) requires_adjustment = random_numbers <= class_estimations # compare our class estimations against random numbers, where our results are close to 0.5 we will get true, otherwise False unique_options = np.unique( requires_adjustment, axis=1 ) # this will result in all probable options for what our less certain features can be n_options = unique_options.shape[1] option_feasabilities = np.zeros( (1, n_options) )[0] # not all options are feasible, this is adjusted as feasible options are found option_radiuses = np.zeros((1, n_options))[0] option_distance_within = np.inf * np.ones( (1, n_options))[0] dr = np.zeros((1, n_options)) far = np.zeros((1, n_options)) # Each option is a probable case for which features could be further classified. 
We try each option, and find the one that best fits for i_option, option in enumerate( unique_options.T ): # For each probable option if np.sum( A_ub_1 @ option > b_ub_1[:, 0] ) == 0: # if atleast one feature is active and no more than maxNoFeatures if np.sum( option ) > 0: # If there is atleast one relevant feature in this option option_feasabilities[ i_option] = 1 # this is a feasible option if the above criteria is fulfilled representative_points = training_data[ option, :] # Each feature that has been selected in this option option_distance_within[ i_option] = average_feature_distances[ matching_label] @ option # get our previously calculated feature distance for the selected features active_point = representative_points[:, i_observation][ ..., None] rep_distances = np.abs( np.sqrt( np.sum( (-representative_points + active_point)**2, 0)) ) # We get the difference between this active point and all other rep points unique_distances = np.msort( np.unique(rep_distances) ) # we filter out all duplicate distances and sort in ascending order in order to find the smallest distance # Increase the difference threshold until we have more dissimilar observations than similar observations for i_distance, distance in enumerate( unique_distances, start=0): radious = distance observations_within_distance = ( rep_distances <= distance ) # find all representative points that are atleast distance different n_cls_within = [ np. sum(( training_labels ^ 1 )[observations_within_distance] ), # the number of 0's that fall within this difference threshold np. sum(training_labels[ observations_within_distance] ) # the number of 1's within the zone ] n_cls_within[ selected_label] = n_cls_within[ selected_label] - 1 # we subtract 1 since we arnt including our selected point # We want there to be less of our similar class proportionally at this distance than our dissimilar class if self.gamma * ( n_cls_within[ selected_label] / (n_total_cls[selected_label] - 1)) < ( n_cls_within[ selected_label ^ 1] / n_total_cls[selected_label ^ 1]): if i_distance > 0: radious = 0.5 * ( radious + unique_distances[ i_distance - 1] ) # this is the radious of how far we need to go for this n_cls_within[ selected_label] = n_cls_within[ selected_label] + 1 # increment the cound of how many classes are within this radious if radious == 0: # if the radious is 0, that is probably if the point is right on top of it, then pad it a bit radious = 0.000001 option_radiuses[ i_option] = radious observations_within_radious = ( rep_distances <= radious ) # how many are within the zone rep_points_within_radious = representative_points[:, ( observations_within_radious == 1 )] # these are which points are within the zone classes_within_radious = training_labels[ observations_within_radious == 1] dr[0, i_option] = 0 far[0, i_option] = 0 for i_point, rep_point_within in enumerate( rep_points_within_radious .T): rep_point_within = rep_point_within[ ..., None] dist_quasi_test = np.absolute( np.sqrt( np.sum(( representative_points - rep_point_within )**2, axis=0)) ) # distance between this point and all other points dist_quasi_test_cls = classes_within_radious[ i_point] min_uniq = np.sort( np.unique( dist_quasi_test) ) # we once again sort the features by ascending difference total_nearest_neighbours = 0 # Searches until it finds k nearest neighbours for i_distance_within, distance_within in enumerate( min_uniq): nearest_neighbours = dist_quasi_test <= min_uniq[ i_distance_within] # from smallest to largest, tries to find k nearest neighbours 
total_nearest_neighbours = np.sum( nearest_neighbours) if (total_nearest_neighbours > self.knn): break n_nearest_neighbours = [ # number of nearest neighbours of each class np.sum( nearest_neighbours & (training_labels ^ 1)), np.sum( nearest_neighbours & training_labels) ] # The case where the this point's class is in the majority amongst neighbouring points in the localized radious if dist_quasi_test_cls == selected_label and ( n_nearest_neighbours[ selected_label] - 1 ) > n_nearest_neighbours[ selected_label ^ 1]: dr[0, i_option] = dr[ 0, i_option] + 1 # Count the number of points that are in the majority # The case where the this point's class is in the minority amongst neighbouring points in the localized radious, that is there are more dissimilar points within the radious if dist_quasi_test_cls == ( selected_label ^ 1 ) and n_nearest_neighbours[ selected_label] > ( n_nearest_neighbours[ selected_label ^ 1] - 1): far[0, i_option] = far[ 0, i_option] + 1 # count the number of times points are in the minority break eval_criteria = [ dr / n_total_cls[0] - far / n_total_cls[ 1], # we get the difference between the proportion of similar to dissimilar neighbouring points within the radious dr / n_total_cls[1] - far / n_total_cls[0] ] i_lowest_distance_within = np.argmin( option_distance_within ) # find the shortest within distance TT_binary = unique_options[:, i_lowest_distance_within] overall_feasibility[i_observation, i_beta] = option_feasabilities[ i_lowest_distance_within] overall_radious[i_observation, i_beta] = option_radiuses[ i_lowest_distance_within] overall_b_ratio[ i_observation, i_beta] = eval_criteria[ training_labels[i_observation]][ 0, i_lowest_distance_within] if overall_feasibility[i_observation, i_beta] == 1: tb_temp[:, i_observation, i_beta] = TT_binary tr_temp[:, i_observation, i_beta] = linprog_res_1.x overall_b_ratio[overall_feasibility == 0] = -np.inf I1 = np.argmax( overall_b_ratio, axis=1 ) # what column (observation) contains the largest value for each row (feature) for j in range(n_observations): self.fstar[:, j] = tb_temp[:, j, I1[ j]] # what class is associated with the observation that has the largest value for this feature? self.fstar_lin[:, j] = tr_temp[:, i_observation, I1[j]]
def _validate_params(self, X, y): """Validate parameters as soon as :meth:`fit` is called. Parameters ---------- X : array-like, shape (n_samples, n_features) The training samples. y : array-like, shape (n_samples,) The corresponding training labels. Returns ------- X : array, shape (n_samples, n_features) The validated training samples. y : array, shape (n_samples,) The validated training labels, encoded to be integers in the range(0, n_classes). init : string or numpy array of shape (n_features_a, n_features_b) The validated initialization of the linear transformation. Raises ------- TypeError If a parameter is not an instance of the desired type. ValueError If a parameter's value violates its legal value range or if the combination of two or more given parameters is incompatible. """ # Validate the inputs X and y, and converts y to numerical classes. X, y = check_X_y(X, y, ensure_min_samples=2) check_classification_targets(y) y = LabelEncoder().fit_transform(y) # Check the preferred dimensionality of the projected space if self.n_components is not None: check_scalar(self.n_components, 'n_components', int, 1) if self.n_components > X.shape[1]: raise ValueError('The preferred dimensionality of the ' 'projected space `n_components` ({}) cannot ' 'be greater than the given data ' 'dimensionality ({})!'.format( self.n_components, X.shape[1])) # If warm_start is enabled, check that the inputs are consistent check_scalar(self.warm_start, 'warm_start', bool) if self.warm_start and hasattr(self, 'components_'): if self.components_.shape[1] != X.shape[1]: raise ValueError( 'The new inputs dimensionality ({}) does not ' 'match the input dimensionality of the ' 'previously learned transformation ({}).'.format( X.shape[1], self.components_.shape[1])) check_scalar(self.max_iter, 'max_iter', int, 1) check_scalar(self.tol, 'tol', float, 0.) check_scalar(self.verbose, 'verbose', int, 0) if self.callback is not None: if not callable(self.callback): raise ValueError('`callback` is not callable.') # Check how the linear transformation should be initialized init = self.init if isinstance(init, np.ndarray): init = check_array(init) # Assert that init.shape[1] = X.shape[1] if init.shape[1] != X.shape[1]: raise ValueError( 'The input dimensionality ({}) of the given ' 'linear transformation `init` must match the ' 'dimensionality of the given inputs `X` ({}).'.format( init.shape[1], X.shape[1])) # Assert that init.shape[0] <= init.shape[1] if init.shape[0] > init.shape[1]: raise ValueError( 'The output dimensionality ({}) of the given ' 'linear transformation `init` cannot be ' 'greater than its input dimensionality ({}).'.format( init.shape[0], init.shape[1])) if self.n_components is not None: # Assert that self.n_components = init.shape[0] if self.n_components != init.shape[0]: raise ValueError('The preferred dimensionality of the ' 'projected space `n_components` ({}) does' ' not match the output dimensionality of ' 'the given linear transformation ' '`init` ({})!'.format( self.n_components, init.shape[0])) elif init in ['auto', 'pca', 'lda', 'identity', 'random']: pass else: raise ValueError( "`init` must be 'auto', 'pca', 'lda', 'identity', 'random' " "or a numpy array of shape (n_components, n_features).") return X, y, init
def partial_fit(self, X, y, classes=None): """Partial fitting.""" if not hasattr(self, "_base_clf"): self.set_base_clf() X, y = check_X_y(X, y) if _check_partial_fit_first_call(self, classes): self.classes_ = classes self.ensemble_ = [] self.X_, self.y_ = X, y train_X, train_y = X, y unique, counts = np.unique(train_y, return_counts=True) k_neighbors = 5 if counts[0] - 1 < 5: k_neighbors = counts[0] - 1 if self.oversampler == "SMOTE" and k_neighbors > 0: smote = SMOTE(random_state=42, k_neighbors=k_neighbors) train_X, train_y = smote.fit_resample(train_X, train_y) elif self.oversampler == "svmSMOTE" and k_neighbors > 0: try: svmSmote = SVMSMOTE(random_state=42, k_neighbors=k_neighbors) train_X, train_y = svmSmote.fit_resample(train_X, train_y) except ValueError: pass elif self.oversampler == "borderline1" and k_neighbors > 0: borderlineSmote1 = BorderlineSMOTE(random_state=42, k_neighbors=k_neighbors, kind='borderline-1') train_X, train_y = borderlineSmote1.fit_resample(train_X, train_y) elif self.oversampler == "borderline2" and k_neighbors > 0: borderlineSmote2 = BorderlineSMOTE(random_state=42, k_neighbors=k_neighbors, kind='borderline-2') train_X, train_y = borderlineSmote2.fit_resample(train_X, train_y) elif self.oversampler == "ADASYN" and k_neighbors > 0: try: adasyn = ADASYN(random_state=42, n_neighbors=k_neighbors) train_X, train_y = adasyn.fit_resample(train_X, train_y) except RuntimeError: pass elif self.oversampler == "SLS" and k_neighbors > 0: sls = Safe_Level_SMOTE(n_neighbors=k_neighbors) train_X, train_y = sls.sample(train_X, train_y) # Testing all models scores = np.array([ba(y, clf.predict(X)) for clf in self.ensemble_]) # Pruning if len(self.ensemble_) > 1: alpha_good = scores > (0.5 + self.alpha) self.ensemble_ = [ self.ensemble_[i] for i in np.where(alpha_good)[0] ] if len(self.ensemble_) > self.ensemble_size - 1: worst = np.argmin(scores) del self.ensemble_[worst] # Preparing and training new candidate self.ensemble_.append(base.clone(self._base_clf).fit(train_X, train_y))
def fit(self, X, y=None):
    self.wrong_attribute = 0
    X, y = check_X_y(X, y)
    return self
def fit(self, X, y):
    X, y = check_X_y(X, y)
    self.coef_ = np.ones(X.shape[1])
    return self
def fit(self, X, y):
    X, y = check_X_y(X, y, dtype=None)
    return self
def fit(self, X, y, sample_weight=None): """Fit all base estimators. Parameters ---------- X : 2d numpy array or sparse matrix of shape [n_samples, n_features] Training data y : 1d numpy array of shape [n_samples] Target values. sample_weight : 1d numpy array of shape [n_samples] Individual weights for each sample. Passed to fit method of each estimator. Note: will be split automatically for each fold. Returns ------- self : object Fitted StackingTransformer instance. """ # --------------------------------------------------------------------- # Validation # --------------------------------------------------------------------- # --------------------------------------------------------------------- # Check input data # --------------------------------------------------------------------- # Check X and y # ``check_estimator`` does not allow ``force_all_finite=False`` X, y = check_X_y(X, y, accept_sparse=['csr'], # allow csr, cast all others to csr force_all_finite=True, # do not allow nan and inf multi_output=False) # allow only one column in y_train # Check X and sample_weight # X is alredy checked, but we need it to compare length of sample_weight if sample_weight is not None: X, sample_weight = check_X_y(X, sample_weight, accept_sparse=['csr'], force_all_finite=True, multi_output=False) # --------------------------------------------------------------------- # Check ``estimators`` # --------------------------------------------------------------------- if self.estimators is None: if self.regression: self.estimators_ = [('dumregr', DummyRegressor(strategy='constant', constant=5.5))] else: self.estimators_ = [('dumclf', DummyClassifier(strategy='constant', constant=1))] # warnings.warn('No estimators were specified. ' # 'Using single dummy estimator as demo.', UserWarning) else: if 0 == len(self.estimators): raise ValueError('List of estimators is empty') else: # Clone self.estimators_ = [(name, clone(estim)) for name, estim in self.estimators] # Check names of estimators names, estims = zip(*self.estimators_) self._validate_names(names) # Check if all estimators support ``sample_weight`` if sample_weight is not None: for name, estim in self.estimators_: if not has_fit_parameter(estim, 'sample_weight'): raise ValueError('Underlying estimator [%s] does not ' 'support sample weights.' 
% name) # --------------------------------------------------------------------- # Check other StackingTransformer parameters # --------------------------------------------------------------------- # ``variant`` if self.variant not in ['A', 'B']: raise ValueError('Parameter ``variant`` must be set properly') # ``n_folds`` if not isinstance(self.n_folds, int): raise ValueError('Parameter ``n_folds`` must be integer') if not self.n_folds > 1: raise ValueError('Parameter ``n_folds`` must be not less than 2') # ``verbose`` if self.verbose not in [0, 1, 2]: raise ValueError('Parameter ``verbose`` must be 0, 1, or 2') # Additional check for inapplicable parameter combinations # If ``regression=True`` we ignore classification-specific # parameters and issue user warning if self.regression and (self.needs_proba or self.stratified): warn_str = ('This is regression task hence classification-specific' 'parameters set to ``True`` were ignored:') if self.needs_proba: self.needs_proba = False warn_str += ' ``needs_proba``' if self.stratified: self.stratified = False warn_str += ' ``stratified``' warnings.warn(warn_str, UserWarning) # --------------------------------------------------------------------- # Compute attributes (basic properties of data, number of estimators, etc.) # --------------------------------------------------------------------- self.train_shape_ = X.shape self.n_train_examples_ = X.shape[0] self.n_features_ = X.shape[1] if not self.regression: self.n_classes_ = len(np.unique(y)) else: self.n_classes_ = None self.n_estimators_ = len(self.estimators_) self.train_footprint_ = self._get_footprint(X) # --------------------------------------------------------------------- # Specify default metric # --------------------------------------------------------------------- if self.metric is None and self.regression: self.metric_ = mean_absolute_error elif self.metric is None and not self.regression: if self.needs_proba: self.metric_ = log_loss else: self.metric_ = accuracy_score else: self.metric_ = self.metric # --------------------------------------------------------------------- # Create report header strings and print report header # --------------------------------------------------------------------- if self.verbose > 0: if self.regression: task_str = 'task: [regression]' else: task_str = 'task: [classification]' n_classes_str = 'n_classes: [%d]' % self.n_classes_ metric_str = 'metric: [%s]' % self.metric_.__name__ variant_str = 'variant: [%s]' % self.variant n_estimators_str = 'n_estimators: [%d]' % self.n_estimators_ print(task_str) if not self.regression: print(n_classes_str) print(metric_str) print(variant_str) print(n_estimators_str + '\n') # --------------------------------------------------------------------- # Initialize cross-validation split # Stratified can be used only for classification # --------------------------------------------------------------------- if not self.regression and self.stratified: self.kf_ = StratifiedKFold(n_splits=self.n_folds, shuffle=self.shuffle, random_state=self.random_state) # Save target to be able to create stratified split in ``transform`` method # This is more efficient than to save split indices self._y_ = y.copy() else: self.kf_ = KFold(n_splits=self.n_folds, shuffle=self.shuffle, random_state=self.random_state) self._y_ = None # --------------------------------------------------------------------- # Compute implicit number of classes to create appropriate empty arrays. # !!! Important. 
In order to unify array creation # variable ``n_classes_implicit_`` is always equal to 1, except the case # when we performing classification task with ``needs_proba=True`` # --------------------------------------------------------------------- if not self.regression and self.needs_proba: self.n_classes_implicit_ = len(np.unique(y)) self.action_ = 'predict_proba' else: self.n_classes_implicit_ = 1 self.action_ = 'predict' # --------------------------------------------------------------------- # Create empty numpy array for train predictions (OOF) # !!! Important. We have to implicitly predict during fit # in order to compute CV scores, because # the most reasonable place to print out CV scores is fit method # --------------------------------------------------------------------- S_train = np.zeros((X.shape[0], self.n_estimators_ * self.n_classes_implicit_)) # --------------------------------------------------------------------- # Prepare (clone) estmators for fitting and storing # We need models_A_ for both variant A and varian B # We need models_B_ for varian B only (in variant A attribute models_B_ is None) # --------------------------------------------------------------------- self.models_A_ = [] self.models_B_ = None for n, est in self.estimators_: self.models_A_.append([clone(est) for _ in range(self.n_folds)]) if self.variant in ['B']: self.models_B_ = [clone(est) for n, est in self.estimators_] # --------------------------------------------------------------------- # Create empty numpy array to store scores for each estimator and each fold # --------------------------------------------------------------------- self.scores_ = np.zeros((self.n_estimators_, self.n_folds)) # --------------------------------------------------------------------- # Create empty list to store name, mean and std for each estimator # --------------------------------------------------------------------- self.mean_std_ = [] # --------------------------------------------------------------------- # MAIN FIT PROCEDURE # --------------------------------------------------------------------- # Loop across estimators # --------------------------------------------------------------------- for estimator_counter, (name, estimator) in enumerate(self.estimators_): if self.verbose > 0: estimator_str = 'estimator %2d: [%s: %s]' % (estimator_counter, name, estimator.__class__.__name__) print(estimator_str) # ----------------------------------------------------------------- # Loop across folds # ----------------------------------------------------------------- for fold_counter, (tr_index, te_index) in enumerate(self.kf_.split(X, y)): # Split data and target X_tr = X[tr_index] y_tr = y[tr_index] X_te = X[te_index] y_te = y[te_index] # Split sample weights accordingly (if passed) if sample_weight is not None: sample_weight_tr = sample_weight[tr_index] # sample_weight_te = sample_weight[te_index] else: sample_weight_tr = None # sample_weight_te = None # Fit estimator _ = self._estimator_action(self.models_A_[estimator_counter][fold_counter], X_tr, y_tr, None, sample_weight=sample_weight_tr, action='fit', transform=self.transform_target) # Predict out-of-fold part of train set if 'predict_proba' == self.action_: col_slice_estimator = slice(estimator_counter * self.n_classes_implicit_, estimator_counter * self.n_classes_implicit_ + self.n_classes_implicit_) else: col_slice_estimator = estimator_counter S_train[te_index, col_slice_estimator] = self._estimator_action(self.models_A_[estimator_counter][fold_counter], None, None, X_te, 
action=self.action_, transform=self.transform_pred) # Compute score score = self.metric_(y_te, S_train[te_index, col_slice_estimator]) self.scores_[estimator_counter, fold_counter] = score # Print fold score if self.verbose > 1: fold_str = ' fold %2d: [%.8f]' % (fold_counter, score) print(fold_str) # Compute mean and std and save in dict estim_name = self.estimators_[estimator_counter][0] estim_mean = np.mean(self.scores_[estimator_counter]) estim_std = np.std(self.scores_[estimator_counter]) self.mean_std_.append((estim_name, estim_mean, estim_std)) if self.verbose > 1: sep_str = ' ----' print(sep_str) # Compute mean + std (and full) if self.verbose > 0: mean_str = ' MEAN: [%.8f] + [%.8f]\n' % (estim_mean, estim_std) print(mean_str) # Fit estimator on full train set if self.variant in ['B']: if self.verbose > 0: print(' Fitting on full train set...\n') _ = self._estimator_action(self.models_B_[estimator_counter], X, y, None, sample_weight=sample_weight, action='fit', transform=self.transform_target) # --------------------------------------------------------------------- # --------------------------------------------------------------------- # Return fitted StackingTransformer instance return self
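# Minimal sketch of the out-of-fold (OOF) scheme the fit method above
# implements: each estimator is fit on the training part of a fold and
# predicts the held-out part, so S_train holds predictions the estimator
# never saw at fit time.  A hypothetical standalone example using plain
# scikit-learn objects rather than the StackingTransformer API.
import numpy as np
from sklearn.base import clone
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
estimator = Ridge()
S_train = np.zeros(len(y))                        # one OOF meta-feature column
kf = KFold(n_splits=4, shuffle=True, random_state=0)
for tr_index, te_index in kf.split(X, y):
    model = clone(estimator).fit(X[tr_index], y[tr_index])
    S_train[te_index] = model.predict(X[te_index])
# S_train is now the out-of-fold meta-feature produced by ``estimator``.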
def fit(self, X_l, y_l, X_h, y_h): """Fit Gaussian process regression model. Parameters ---------- X_l : array-like, shape = (n_l_samples, n_features) Training data y_l : array-like, shape = (n_l_samples, [n_output_dims]) Target values X_h : array-like, shape = (n_h_samples, n_features) Training data y_h : array-like, shape = (n_h_samples, [n_output_dims]) Target values Returns ------- self : returns an instance of self. """ if self.kernel is None: # Use an RBF kernel as default self.kernel_l_ = C(1.0, constant_value_bounds="fixed") \ * RBF(1.0, length_scale_bounds="fixed") else: self.kernel_l_ = clone(self.kernel) self.kernel_d_ = clone(self.kernel_l_) self.rng = check_random_state(self.random_state) X_l, y_l = check_X_y(X_l, y_l, multi_output=True, y_numeric=True) X_h, y_h = check_X_y(X_h, y_h, multi_output=True, y_numeric=True) self.n_l_ = len(X_l) # Normalize target value if self.normalize_y: self._y_l_train_mean = np.mean(y_l, axis=0) self._y_h_train_mean = np.mean(y_h, axis=0) # demean y y_l = y_l - self._y_l_train_mean y_h = y_h - self._y_h_train_mean else: self._y_l_train_mean = np.zeros(1) self._y_h_train_mean = np.zeros(1) self.X_train_ = np.vstack((X_l, X_h)) self.y_train_ = np.hstack((y_l, y_h)) theta_initial = np.hstack( (np.array([self.rho]), self.kernel_l_.theta, self.kernel_d_.theta)) if self.optimizer is not None and self.kernel_l_.n_dims > 0: # Choose hyperparameters based on maximizing the log-marginal # likelihood (potentially starting from several initial values) def obj_func(theta, eval_gradient=self.eval_gradient): if eval_gradient: raise Warning( "eval_gradient = True mode is not implemented yet!") lml, grad = self.log_marginal_likelihood( theta, eval_gradient=True) return -lml, -grad else: return -self.log_marginal_likelihood(theta) theta_bounds = np.r_[np.array(self.rho_bounds)[np.newaxis], self.kernel_l_.bounds, self.kernel_d_.bounds] # First optimize starting from theta specified in kernel optima = [(self._constrained_optimization(obj_func, theta_initial, theta_bounds, self.eval_gradient))] # Additional runs are performed from log-uniform chosen initial # theta if self.n_restarts_optimizer > 0: flag = np.isfinite(self.kernel_l_.bounds).all() and \ np.isfinite(self.kernel_d_.bounds).all() and \ np.isfinite(self.rho_bounds).all() if not flag: raise ValueError( "Multiple optimizer restarts (n_restarts_optimizer>0) " "requires that all bounds are finite.") bounds = np.vstack( (np.array(self.rho_bounds).reshape(1, -1), self.kernel_l_.bounds, self.kernel_d_.bounds)) for iteration in range(self.n_restarts_optimizer): theta_initial = np.hstack( (self.rng.uniform(bounds[0, 0], bounds[0, 1]), np.exp(self.rng.uniform(bounds[1:, 0], bounds[1:, 1])))) optima.append( self._constrained_optimization(obj_func, theta_initial, bounds, self.eval_gradient)) # Select result from run with minimal (negative) log-marginal # likelihood lml_values = list(map(itemgetter(1), optima)) best_hyperparams = optima[np.argmin(lml_values)][0] self.rho = best_hyperparams[0] self.kernel_l_.theta = best_hyperparams[1:1 + len(self.kernel_l_.theta)] self.kernel_d_.theta = best_hyperparams[1 + len(self.kernel_l_.theta):] self.log_marginal_likelihood_value_ = -np.min(lml_values) else: self.log_marginal_likelihood_value_ = \ self.log_marginal_likelihood(theta_initial) # Precompute quantities required for predictions which are independent # of actual query points K_lf = self.kernel_l_(self.X_train_[:self.n_l_]) K = np.vstack(( np.hstack((self.kernel_l_(self.X_train_[:self.n_l_]), self.rho * 
self.kernel_l_(self.X_train_[:self.n_l_], self.X_train_[self.n_l_:]))), np.hstack(( self.rho * self.kernel_l_(self.X_train_[self.n_l_:], self.X_train_[:self.n_l_]), self.rho**2 * self.kernel_l_(self.X_train_[self.n_l_:]) + # noqa W504 self.kernel_d_(self.X_train_[self.n_l_:]))))) K_lf[np.diag_indices_from(K_lf)] += self.alpha K[np.diag_indices_from(K)] += self.alpha try: self.L_lf_ = cholesky(K_lf, lower=True) # Line 2 (lf) self.L_ = cholesky(K, lower=True) # Line 2 # self.L_ changed, self._K_inv needs to be recomputed self._K_inv = None self._K_lf_inv = None except np.linalg.LinAlgError as exc: exc.args = ("The kernel is not returning a " "positive definite matrix. Try gradually " "increasing the 'alpha' parameter of your " "GaussianProcessRegressor estimator.", ) + exc.args raise self.alpha_lf_ = cho_solve((self.L_lf_, True), self.y_train_[:self.n_l_]) # Line 3 (Lf) self.alpha_ = cho_solve((self.L_, True), self.y_train_) # Line 3 return self
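# Sketch of the block covariance assembled at the end of the multi-fidelity
# fit above: K_ll = k_l(X_l, X_l), the cross block rho * k_l(X_l, X_h), and
# the high-fidelity block rho**2 * k_l(X_h, X_h) + k_d(X_h, X_h).
# Hypothetical inputs; the kernels come from scikit-learn.
import numpy as np
from sklearn.gaussian_process.kernels import RBF

rng = np.random.RandomState(0)
X_l, X_h = rng.rand(8, 2), rng.rand(4, 2)
k_l, k_d, rho = RBF(1.0), RBF(1.0), 0.7

K = np.vstack((
    np.hstack((k_l(X_l),            rho * k_l(X_l, X_h))),
    np.hstack((rho * k_l(X_h, X_l), rho**2 * k_l(X_h) + k_d(X_h))),
))
assert K.shape == (12, 12)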
def fit(self, X, y): """Fit Gaussian process regression model. Parameters ---------- X : array-like, shape = (n_samples, n_features) Training data y : array-like, shape = (n_samples, [n_output_dims]) Target values Returns ------- self : returns an instance of self. """ if self.kernel is None: # Use an RBF kernel as default self.kernel_ = C(1.0, constant_value_bounds="fixed") \ * RBF(1.0, length_scale_bounds="fixed") else: self.kernel_ = clone(self.kernel) self._rng = check_random_state(self.random_state) X, y = check_X_y(X, y, multi_output=True, y_numeric=True) # Normalize target value if self.normalize_y: self._y_train_mean = np.mean(y, axis=0) # demean y y = y - self._y_train_mean else: self._y_train_mean = np.zeros(1) if np.iterable(self.alpha) \ and self.alpha.shape[0] != y.shape[0]: if self.alpha.shape[0] == 1: self.alpha = self.alpha[0] else: raise ValueError( "alpha must be a scalar or an array" " with same number of entries as y.(%d != %d)" % (self.alpha.shape[0], y.shape[0])) self.X_train_ = np.copy(X) if self.copy_X_train else X self.y_train_ = np.copy(y) if self.copy_X_train else y if self.optimizer is not None and self.kernel_.n_dims > 0: # Choose hyperparameters based on maximizing the log-marginal # likelihood (potentially starting from several initial values) def obj_func(theta, eval_gradient=True): if eval_gradient: lml, grad = self.log_marginal_likelihood( theta, eval_gradient=True) return -lml, -grad else: return -self.log_marginal_likelihood(theta) # First optimize starting from theta specified in kernel optima = [(self._constrained_optimization(obj_func, self.kernel_.theta, self.kernel_.bounds))] # Additional runs are performed from log-uniform chosen initial # theta if self.n_restarts_optimizer > 0: if not np.isfinite(self.kernel_.bounds).all(): raise ValueError( "Multiple optimizer restarts (n_restarts_optimizer>0) " "requires that all bounds are finite.") bounds = self.kernel_.bounds for iteration in range(self.n_restarts_optimizer): theta_initial = \ self._rng.uniform(bounds[:, 0], bounds[:, 1]) optima.append( self._constrained_optimization(obj_func, theta_initial, bounds)) # Select result from run with minimal (negative) log-marginal # likelihood lml_values = list(map(itemgetter(1), optima)) self.kernel_.theta = optima[np.argmin(lml_values)][0] self.log_marginal_likelihood_value_ = -np.min(lml_values) else: self.log_marginal_likelihood_value_ = \ self.log_marginal_likelihood(self.kernel_.theta) # Precompute quantities required for predictions which are independent # of actual query points K = self.kernel_(self.X_train_) K[np.diag_indices_from(K)] += self.alpha try: self.L_ = cholesky(K, lower=True) # Line 2 except np.linalg.LinAlgError as exc: exc.args = ("The kernel, %s, is not returning a " "positive definite matrix. Try gradually " "increasing the 'alpha' parameter of your " "GaussianProcessRegressor estimator." % self.kernel_, ) + exc.args raise self.alpha_ = cho_solve((self.L_, True), self.y_train_) # Line 3 L_inv = solve_triangular(self.L_.T, np.eye(self.L_.shape[0])) self.K_inv = L_inv.dot(L_inv.T) return self
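# Sketch of the Cholesky precomputation at the end of the GP fit above and of
# the prediction it enables: with L = cholesky(K + alpha * I) and
# alpha_ = K^{-1} y, the posterior mean at query points X_star is
# K(X_star, X_train) @ alpha_.  Hypothetical standalone example.
import numpy as np
from scipy.linalg import cho_solve, cholesky
from sklearn.gaussian_process.kernels import RBF

rng = np.random.RandomState(0)
X_train = rng.rand(10, 1)
y_train = np.sin(X_train).ravel()
kernel, alpha = RBF(0.5), 1e-10

K = kernel(X_train)
K[np.diag_indices_from(K)] += alpha
L = cholesky(K, lower=True)                       # Line 2 of GPML Algorithm 2.1
alpha_ = cho_solve((L, True), y_train)            # Line 3
X_star = rng.rand(3, 1)
y_mean = kernel(X_star, X_train).dot(alpha_)      # posterior mean at X_star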
def _fit(self, x, y): estimator = clone(self.estimator) def score_pri(slices, x0, y0): slices = list(slices) if len(slices) < 1: score0 = -np.inf else: slices = self.feature_unfold(slices) data_x0 = x0[:, slices] if hasattr(estimator, "best_score_"): estimator.fit(data_x0, y0) score0 = np.mean(estimator.best_score_) # score_test else: score0 = cross_val_score(estimator, data_x0, y0, cv=self.cv) score0 = np.mean(score0) # print(slices, score0) return score0 score = partial(score_pri, x0=x, y0=y) self.score_ = [] x, y = check_X_y(x, y, "csc") assert all((self.check_must, self.check_muti)) in [True, False] feature_list = list(range(x.shape[1])) fold_feature_list = self.feature_fold(feature_list) if self.check_must: fold_feature_list = [ i for i in fold_feature_list if i not in self.check_must ] slice_all = [combinations(fold_feature_list, i) for i in self.n_select] slice_all = [ list(self.feature_must_fold(_)) for i in slice_all for _ in i ] scores = parallelize(n_jobs=self.n_jobs, func=score, iterable=slice_all) feature_combination = [self.feature_unfold(_) for _ in slice_all] index = np.argmax(scores) select_feature = feature_combination[int(index)] su = np.zeros(x.shape[1], dtype=np.bool) su[select_feature] = 1 self.best_score_ = max(scores) self.score_ = scores self.support_ = su self.estimator_ = clone(self.estimator) if self.refit: if not hasattr(self.estimator_, 'best_score_'): warnings.warn( UserWarning( "The self.estimator_ :{} used all the X,y data.". format(self.estimator_.__class__.__name__), "please be careful with the later 'score' and 'predict'." )) if hasattr(self.estimator_, 'best_score_') and hasattr(self.estimator_, "refit") \ and self.estimator_.refit is True: warnings.warn( UserWarning( "The self.estimator_ :{} used all the X,y data.". format(self.estimator_.__class__.__name__), "please be careful with the later 'score' and 'predict'." )) self.estimator_.fit(x[:, select_feature], y) self.n_feature_ = len(select_feature) self.score_ex = list(zip(feature_combination, scores)) self.scatter = list(zip([len(i) for i in slice_all], scores)) self.score_ex.sort(key=lambda _: _[1], reverse=True) return self
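# Sketch of the exhaustive search carried out by _fit above: every feature
# subset of the requested sizes is scored by cross-validation and the best
# one is kept.  Hypothetical standalone example with a tiny search space.
import numpy as np
from itertools import combinations
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
estimator = LogisticRegression(max_iter=1000)
subsets = [list(c) for k in (1, 2) for c in combinations(range(X.shape[1]), k)]
scores = [cross_val_score(estimator, X[:, s], y, cv=3).mean() for s in subsets]
best_subset = subsets[int(np.argmax(scores))]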
def _scrub(self, X, y, sample_weight, output_weight, missing, **kwargs): ''' Sanitize input data. ''' # Check for sparseness if sparse.issparse(y): raise TypeError( 'A sparse matrix was passed, but dense data ' 'is required. Use y.toarray() to convert to dense.') if sparse.issparse(sample_weight): raise TypeError('A sparse matrix was passed, but dense data ' 'is required. Use sample_weight.toarray()' 'to convert to dense.') if sparse.issparse(output_weight): raise TypeError('A sparse matrix was passed, but dense data ' 'is required. Use output_weight.toarray()' 'to convert to dense.') # Check whether X is the output of patsy.dmatrices if y is None and isinstance(X, tuple): y, X = X # Handle X separately X, missing = self._scrub_x(X, missing, **kwargs) # Convert y to internally used data type y = np.asarray(y, dtype=np.float64) assert_all_finite(y) if len(y.shape) == 1: y = y[:, np.newaxis] # Deal with sample_weight if sample_weight is None: sample_weight = np.ones(y.shape[0], dtype=y.dtype) else: sample_weight = np.asarray(sample_weight) assert_all_finite(sample_weight) # Deal with output_weight if output_weight is None: output_weight = np.ones(y.shape[1], dtype=y.dtype) else: output_weight = np.asarray(output_weight) assert_all_finite(output_weight) # Make sure dimensions match if y.shape[0] != X.shape[0]: raise ValueError('X and y do not have compatible dimensions.') if y.shape[0] != sample_weight.shape[0]: raise ValueError( 'y and sample_weight do not have compatible dimensions.') if y.shape[1] != output_weight.shape[0]: raise ValueError( 'y and output_weight do not have compatible dimensions.') # Make sure everything is finite (except X, which is allowed to have # missing values) assert_all_finite(missing) assert_all_finite(y) assert_all_finite(sample_weight) assert_all_finite(output_weight) # Make sure everything is consistent check_X_y(X, y, accept_sparse=None, multi_output=True, force_all_finite=False) return X, y, sample_weight, output_weight, missing
def fit(self, X, y, estimator, cutting_rule, test_size=0.3, delta=0.5, feature_names=None, points=None): """ :param X: :param y: :param estimator: :param cutting_rule: :param test_size: :param delta: :param feature_names: :param points: :return: """ if self.verbose: print('Running basic MeLiF\nEnsemble of :{}'.format(self.ensemble)) feature_names = generate_features(X, feature_names) check_shapes(X, y) # check_features(features_names) self.__X, self.__y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr', accept_large_sparse=False) self.__feature_names = feature_names self.__filter_weights = np.ones(len(self.ensemble)) / len( self.ensemble) self.__points = points self.__estimator = estimator self.__cutting_rule = cutting_rule self.__delta = delta if self.verbose: print('Estimator: {}'.format(estimator)) print("Optimizer greedy search, optimizing measure is {}".format( self.__score)) time = dt.datetime.now() print("time:{}".format(time)) check_cutting_rule(cutting_rule) self._train_x, self._test_x, self._train_y, self._test_y = train_test_split( self.__X, self.__y, test_size=test_size) nu = self.ensemble.score(self.__X, self.__y, self.__feature_names) if self.__points is None: self.__points = [self.__filter_weights] for i in range(len(self.ensemble)): a = np.zeros(len(self.ensemble)) a[i] = 1 self.__points.append(a) best_point = self.__points[0] mapping = dict(zip(range(len(nu.keys())), nu.keys())) n = dict( zip(nu.keys(), self.__measure(np.array(list(nu.values())), best_point))) self.selected_features = self.__cutting_rule(n) self.best_f = {i: nu[i] for i in self.selected_features} for k, v in mapping.items(): nu[k] = nu.pop(v) self.__search(self.__points, nu) self.selected_features = [mapping[i] for i in self.selected_features] for k in list(self.best_f.keys()): self.best_f[mapping[k]] = self.best_f.pop(k) if self.verbose: print('Footer') print("Best point:{}".format(self.best_point)) print("Best Score:{}".format(self.best_score)) print('Top features:') for key, value in sorted(self.best_f.items(), key=lambda x: x[1], reverse=True): print("Feature: {}, value: {}".format(key, value))
def fit(self, X, y): """Prepare the DS model by setting the KNN algorithm and pre-processing the information required to apply the DS methods Parameters ---------- X : array of shape = [n_samples, n_features] The input data. y : array of shape = [n_samples] class labels of each example in X. Returns ------- self """ self.random_state_ = check_random_state(self.random_state) # Check if the length of X and y are consistent. X, y = check_X_y(X, y) # Check if the pool of classifiers is None. # If yes, use a BaggingClassifier for the pool. if self.pool_classifiers is None: if len(X) < 2: raise ValueError('More than one sample is needed ' 'if the pool of classifiers is not informed.') # Split the dataset into training (for the base classifier) and # DSEL (for DS) X_train, X_dsel, y_train, y_dsel = train_test_split( X, y, test_size=self.DSEL_perc, random_state=self.random_state_) # self.pool_classifiers_ = BaggingClassifier( # random_state=self.random_state_) self.pool_classifiers_ = RandomForestClassifier(n_estimators=200) # print(self.pool_classifiers_) self.pool_classifiers_.fit(X_train, y_train) else: self._check_base_classifier_fitted() self.pool_classifiers_ = self.pool_classifiers # print(self.pool_classifiers_) X_dsel = X y_dsel = y self.n_classifiers_ = len(self.pool_classifiers_) # check if the input parameters are correct. Raise an error if the # generated_pool is not fitted or k < 1 self._validate_parameters() # None # print(self.s_validate_parameters()) # Check label encoder on the pool of classifiers self.check_label_encoder() self._setup_label_encoder(y) # None y_dsel = self.enc_.transform(y_dsel) self._set_dsel(X_dsel, y_dsel) # validate the value of k self._validate_k() self._set_region_of_competence_algorithm() self._fit_region_competence(X_dsel, y_dsel) # validate the IH if self.with_IH: self._validate_ih() return self
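# Sketch of the pool-construction branch above: when no pool of classifiers
# is supplied, the data is split into a training part (for the pool) and a
# DSEL part (for dynamic selection), and an ensemble is fit on the training
# part.  The original code currently substitutes a RandomForestClassifier for
# the commented-out BaggingClassifier; either works for this sketch, and all
# names below are hypothetical.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
X_train, X_dsel, y_train, y_dsel = train_test_split(
    X, y, test_size=0.5, random_state=0)          # test_size plays the DSEL_perc role
pool_classifiers = RandomForestClassifier(n_estimators=200, random_state=0)
pool_classifiers.fit(X_train, y_train)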
def fit_transform(self, X, y=None, **fit_params): """ A wrapper around the fit_transform function. Parameters ---------- X : xarray DataArray, Dataset or other array-like The input samples. y : xarray DataArray, Dataset or other array-like The target values. Returns ------- Xt : xarray DataArray, Dataset or other array-like The transformed output. """ if self.estimator is None: raise ValueError("You must specify an estimator instance to wrap.") if is_target(y): y = y(X) if is_dataarray(X): self.type_ = "DataArray" self.estimator_ = clone(self.estimator) if self.reshapes is not None: data, dims = self._fit_transform(self.estimator_, X, y, **fit_params) coords = self._update_coords(X) return xr.DataArray(data, coords=coords, dims=dims) else: return xr.DataArray( self.estimator_.fit_transform(X.data, y, **fit_params), coords=X.coords, dims=X.dims, ) elif is_dataset(X): self.type_ = "Dataset" self.estimator_dict_ = {v: clone(self.estimator) for v in X.data_vars} if self.reshapes is not None: data_vars = dict() for v, e in self.estimator_dict_.items(): yp_v, dims = self._fit_transform(e, X[v], y, **fit_params) data_vars[v] = (dims, yp_v) coords = self._update_coords(X) return xr.Dataset(data_vars, coords=coords) else: data_vars = { v: (X[v].dims, e.fit_transform(X[v].data, y, **fit_params)) for v, e in self.estimator_dict_.items() } return xr.Dataset(data_vars, coords=X.coords) else: self.type_ = "other" if y is None: X = check_array(X) else: X, y = check_X_y(X, y) self.estimator_ = clone(self.estimator) Xt = self.estimator_.fit_transform(X, y, **fit_params) for v in vars(self.estimator_): if v.endswith("_") and not v.startswith("_"): setattr(self, v, getattr(self.estimator_, v)) return Xt
def fit(self, X, y, column_names=None, save_human_readable=False, remove_files=True): """A reference implementation of a fitting function for a classifier. Parameters ---------- X : array-like, shape (n_samples, n_features) The training input samples. No numerical inputs are allowed it. y : array-like, shape (n_samples,) The target values. An array of int. column_names : list, default=None A list containing the names to assign to columns in the dataset. They will be used when printing the human readable format of the rules. remove_files : bool, default=True Use this parameter to remove all the file generated by the original L3 implementation at training time. Returns ------- self : object Returns self. """ self._train_bin_path = join(self.l3_root, BIN_DIR, TRAIN_BIN) self._classify_bin_path = join(self.l3_root, BIN_DIR, CLASSIFY_BIN) self._logger = logging.getLogger(__name__) X = check_dtype(X) check_classification_targets(y) # Check that X and y have correct shape X, y = check_X_y(X, y, dtype=np.unicode_) # Check that y has correct values according to sklearn's policy unique = unique_labels(y) # Check that the rule sets modifier is valid valid_modifiers = ['standard', 'level1'] if self.rule_sets_modifier not in valid_modifiers: raise NotImplementedError( f"The rule sets modifier specified is not" \ f"supported. Use one of {valid_modifiers}." ) # create mappings letting L3 binaries to work on strings only self._yorig_to_str, self._ystr_to_orig = build_y_mappings(unique) y = np.array([self._yorig_to_str[label] for label in y]) # Store the classes seen during fit self.classes_ = [label for label in self._ystr_to_orig.keys()] # Define the label when no rule matches if self.assign_unlabeled == 'majority_class': self.unlabeled_class_ = _get_majority_class(y) else: self.unlabeled_class_ = self.assign_unlabeled self.X_ = X self.y_ = y token = secrets.token_hex(4) filestem = f"{token}" train_dir = token if exists(train_dir): raise RuntimeError( f"The training dir with token {token} already exists") else: mkdir(train_dir) old_dir = getcwd() chdir(train_dir) # Create column names if not provided if column_names is None: column_names = _create_column_names(X) check_column_names(X, column_names) self._column_id_to_name = build_columns_dictionary(column_names) # Dump X and y in a single .data (csv) file. "y" target labels are inserted as the last column X_todump = np.hstack([X, y.reshape(-1, 1)]) _dump_array_to_file(X_todump, filestem, "data") # Invoke the training module of L3. if self.specialistic_rules: specialistic_flag = "0" else: specialistic_flag = "1" with open(f"{filestem}_stdout.txt", "w") as stdout: subprocess.run( [ self._train_bin_path, filestem, # training file filestem f"{self.min_sup * 100:.2f}", # min sup f"{self.min_conf * 100:.2f}", # min conf "nofiltro", # filtering measure for items (DEPRECATED) "0", # filtering threshold (DEPRECATED) specialistic_flag, # specialistic/general rules (TO VERIFY) f"{self.max_length}", # max length allowed for rules self. 
l3_root # L3 root containing the 'bin' directory with binaries ], stdout=stdout) # rename useful (lvl1) and sparse (lvl2) rule files rename(LEVEL1_FILE, f"{token}_{LEVEL1_FILE}") rename(LEVEL2_FILE, f"{token}_{LEVEL2_FILE}") # read the mappings of classification labels self._class_dict = build_class_dict(filestem) # read the mappings item->"column_name","value" self._item_id_to_item, self._item_to_item_id = build_item_dictionaries( filestem) self.n_items_used_ = len(self._item_id_to_item) # apply the rule set modifier if self.rule_sets_modifier == 'level1': with open(f"{token}_{LEVEL2_FILE}", "w") as fp: self._logger.debug("Empty the level 2 rule set.") # parse the two rule sets and store them self.lvl1_rules_ = parse_raw_rules(f"{token}_{LEVEL1_FILE}") self.lvl2_rules_ = parse_raw_rules(f"{token}_{LEVEL2_FILE}") self.n_lvl1_rules_ = len(self.lvl1_rules_) self.n_lvl2_rules_ = len(self.lvl2_rules_) # translate the model to human readable format if save_human_readable: write_human_readable(f"{token}_{LEVEL1_FILE_READABLE}", self.lvl1_rules_, self._item_id_to_item, self._column_id_to_name, self._class_dict) write_human_readable(f"{token}_{LEVEL2_FILE_READABLE}", self.lvl2_rules_, self._item_id_to_item, self._column_id_to_name, self._class_dict) if remove_files: _remove_fit_files(token) chdir(old_dir) if remove_files and not save_human_readable: shutil.rmtree(train_dir) self.current_token_ = token # keep track of the latest token generated by the fit method return self
def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None): random_state = check_random_state(self.random_state) if check_input: X, y = check_X_y(X, y, dtype=DTYPE, multi_output=False) # Determine output settings n_samples, self.n_features_ = X.shape is_classification = isinstance(self, ClassifierMixin) y = np.atleast_1d(y) expanded_class_weight = None if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs # [:, np.newaxis] that does not. y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] if is_classification: check_classification_targets(y) y = np.copy(y) self.classes_ = [] self.n_classes_ = [] if self.class_weight is not None: y_original = np.copy(y) y_encoded = np.zeros(y.shape, dtype=np.int) for k in range(self.n_outputs_): classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) y = y_encoded if self.class_weight is not None: expanded_class_weight = compute_sample_weight( self.class_weight, y_original) else: self.classes_ = [None] * self.n_outputs_ self.n_classes_ = [1] * self.n_outputs_ self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters max_depth = ((2 ** 31) - 1 if self.max_depth is None else self.max_depth) if isinstance(self.min_samples_split, (numbers.Integral, np.integer)): if not 2 <= self.min_samples_split: raise ValueError("min_samples_split must be an integer " "greater than 1 or a float in (0.0, 1.0]; " "got the integer %s" % self.min_samples_split) min_samples_split = self.min_samples_split else: # float if not 0. < self.min_samples_split <= 1.: raise ValueError("min_samples_split must be an integer " "greater than 1 or a float in (0.0, 1.0]; " "got the float %s" % self.min_samples_split) min_samples_split = int(ceil(self.min_samples_split * n_samples)) min_samples_split = max(2, min_samples_split) if len(y) != n_samples: raise ValueError("Number of labels=%d does not match " "number of samples=%d" % (len(y), n_samples)) if max_depth <= 0: raise ValueError("max_depth must be greater than zero. ") if sample_weight is not None: if (getattr(sample_weight, "dtype", None) != DOUBLE or not sample_weight.flags.contiguous): sample_weight = np.ascontiguousarray( sample_weight, dtype=DOUBLE) if len(sample_weight.shape) > 1: raise ValueError("Sample weights array has more " "than one dimension: %d" % len(sample_weight.shape)) if len(sample_weight) != n_samples: raise ValueError("Number of weights=%d does not match " "number of samples=%d" % (len(sample_weight), n_samples)) if expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: sample_weight = expanded_class_weight # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): if is_classification: criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) else: criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) splitter = self.splitter if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter](criterion, random_state) self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) builder = DepthFirstTreeBuilder(splitter, min_samples_split, max_depth) builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] return self
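# Sketch of the min_samples_split handling in the tree fit above: an integer
# is used as-is (and must be >= 2), while a float in (0, 1] is interpreted as
# a fraction of n_samples, rounded up, and never allowed to drop below 2.
# Hypothetical helper, not part of the original class.
import numbers
from math import ceil

def resolve_min_samples_split(min_samples_split, n_samples):
    if isinstance(min_samples_split, numbers.Integral):
        if min_samples_split < 2:
            raise ValueError("min_samples_split must be >= 2 when an integer")
        return min_samples_split
    if not 0.0 < min_samples_split <= 1.0:
        raise ValueError("min_samples_split must be in (0.0, 1.0] when a float")
    return max(2, int(ceil(min_samples_split * n_samples)))

assert resolve_min_samples_split(0.1, 25) == 3
assert resolve_min_samples_split(2, 25) == 2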
def check_input(input_data, y=None, preprocessor=None, type_of_inputs='classic', tuple_size=None, accept_sparse=False, dtype='numeric', order=None, copy=False, force_all_finite=True, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, estimator=None): """Checks that the input format is valid, and converts it if specified (this is the equivalent of scikit-learn's `check_array` or `check_X_y`). All arguments following tuple_size are scikit-learn's `check_X_y` arguments that will be enforced on the data and labels array. If indicators are given as an input data array, the returned data array will be the formed points/tuples, using the given preprocessor. Parameters ---------- input : array-like The input data array to check. y : array-like The input labels array to check. preprocessor : callable (default=`None`) The preprocessor to use. If None, no preprocessor is used. type_of_inputs : `str` {'classic', 'tuples'} The type of inputs to check. If 'classic', the input should be a 2D array-like of points or a 1D array like of indicators of points. If 'tuples', the input should be a 3D array-like of tuples or a 2D array-like of indicators of tuples. accept_sparse : `bool` Set to true to allow sparse inputs (only works for sparse inputs with dim < 3). tuple_size : int The number of elements in a tuple (e.g. 2 for pairs). dtype : string, type, list of types or None (default='numeric') Data type of result. If None, the dtype of the input is preserved. If 'numeric', dtype is preserved unless array.dtype is object. If dtype is a list of types, conversion on the first type is only performed if the dtype of the input is not in the list. order : 'F', 'C' or None (default=`None`) Whether an array will be forced to be fortran or c-style. copy : boolean (default=False) Whether a forced copy will be triggered. If copy=False, a copy might be triggered by a conversion. force_all_finite : boolean or 'allow-nan', (default=True) Whether to raise an error on np.inf and np.nan in X. This parameter does not influence whether y can have np.inf or np.nan values. The possibilities are: - True: Force all values of X to be finite. - False: accept both np.inf and np.nan in X. - 'allow-nan': accept only np.nan values in X. Values cannot be infinite. ensure_min_samples : int (default=1) Make sure that X has a minimum number of samples in its first axis (rows for a 2D array). ensure_min_features : int (default=1) Make sure that the 2D array has some minimum number of features (columns). The default value of 1 rejects empty datasets. This check is only enforced when X has effectively 2 dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0 disables this check. estimator : str or estimator instance (default=`None`) If passed, include the name of the estimator in warning messages. Returns ------- X : `numpy.ndarray` The checked input data array. y: `numpy.ndarray` (optional) The checked input labels array. """ context = make_context(estimator) args_for_sk_checks = dict(accept_sparse=accept_sparse, dtype=dtype, order=order, copy=copy, force_all_finite=force_all_finite, ensure_min_samples=ensure_min_samples, ensure_min_features=ensure_min_features, estimator=estimator) # We need to convert input_data into a numpy.ndarray if possible, before # any further checks or conversions, and deal with y if needed. Therefore # we use check_array/check_X_y with fixed permissive arguments. 
if y is None: input_data = check_array(input_data, ensure_2d=False, allow_nd=True, copy=False, force_all_finite=False, accept_sparse=True, dtype=None, ensure_min_features=0, ensure_min_samples=0) else: input_data, y = check_X_y(input_data, y, ensure_2d=False, allow_nd=True, copy=False, force_all_finite=False, accept_sparse=True, dtype=None, ensure_min_features=0, ensure_min_samples=0, multi_output=multi_output, y_numeric=y_numeric) if type_of_inputs == 'classic': input_data = check_input_classic(input_data, context, preprocessor, args_for_sk_checks) elif type_of_inputs == 'tuples': input_data = check_input_tuples(input_data, context, preprocessor, args_for_sk_checks, tuple_size) # if we have y and the input data are pairs, we need to ensure # the labels are in [-1, 1]: if y is not None and input_data.shape[1] == 2: check_y_valid_values_for_pairs(y) else: raise ValueError( "Unknown value {} for type_of_inputs. Valid values are " "'classic' or 'tuples'.".format(type_of_inputs)) return input_data if y is None else (input_data, y)
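# Sketch of why the permissive pre-check above passes allow_nd=True: tuple
# inputs ('tuples' mode) arrive as a 3D array of shape
# (n_tuples, tuple_size, n_features), which the default check_array call
# would reject.  Hypothetical standalone example.
import numpy as np
from sklearn.utils import check_array

pairs = np.random.rand(6, 2, 3)                   # 6 pairs of 3-D points
checked = check_array(pairs, ensure_2d=False, allow_nd=True,
                      force_all_finite=False, dtype=None)
assert checked.shape == (6, 2, 3)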
def fit(self, X, y=None):
    self._good_attribute = 1
    X, y = check_X_y(X, y)
    return self
def fit(self, X, y): """Fit a semi-supervised label propagation model based All the input data is provided matrix X (labeled and unlabeled) and corresponding label matrix y with a dedicated marker value for unlabeled samples. Parameters ---------- X : array-like, shape = [n_samples, n_features] A {n_samples by n_samples} size matrix will be created from this y : array_like, shape = [n_samples] n_labeled_samples (unlabeled points are marked as -1) All unlabeled samples will be transductively assigned labels Returns ------- self : returns an instance of self. """ X, y = check_X_y(X, y) self.X_ = X check_classification_targets(y) # actual graph construction (implementations should override this) graph_matrix = self._build_graph() # label construction # construct a categorical distribution for classification only classes = np.unique(y) classes = (classes[classes != -1]) self.classes_ = classes n_samples, n_classes = len(y), len(classes) alpha = self.alpha if self._variant == 'spreading' and \ (alpha is None or alpha <= 0.0 or alpha >= 1.0): raise ValueError('alpha=%s is invalid: it must be inside ' 'the open interval (0, 1)' % alpha) y = np.asarray(y) unlabeled = y == -1 # initialize distributions self.label_distributions_ = np.zeros((n_samples, n_classes)) for label in classes: self.label_distributions_[y == label, classes == label] = 1 y_static = np.copy(self.label_distributions_) if self._variant == 'propagation': # LabelPropagation y_static[unlabeled] = 0 else: # LabelSpreading y_static *= 1 - alpha l_previous = np.zeros((self.X_.shape[0], n_classes)) unlabeled = unlabeled[:, np.newaxis] if sparse.isspmatrix(graph_matrix): graph_matrix = graph_matrix.tocsr() for self.n_iter_ in range(self.max_iter): if np.abs(self.label_distributions_ - l_previous).sum() < self.tol: break l_previous = self.label_distributions_ self.label_distributions_ = safe_sparse_dot( graph_matrix, self.label_distributions_) if self._variant == 'propagation': normalizer = np.sum( self.label_distributions_, axis=1)[:, np.newaxis] self.label_distributions_ /= normalizer self.label_distributions_ = np.where(unlabeled, self.label_distributions_, y_static) else: # clamp self.label_distributions_ = np.multiply( alpha, self.label_distributions_) + y_static else: warnings.warn( 'max_iter=%d was reached without convergence.' % self.max_iter, category=ConvergenceWarning ) self.n_iter_ += 1 normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] self.label_distributions_ /= normalizer # set the transduction item transduction = self.classes_[np.argmax(self.label_distributions_, axis=1)] self.transduction_ = transduction.ravel() return self
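# Sketch of the propagation loop above for the LabelPropagation variant:
# label distributions are diffused through the graph, row-normalized, and the
# labeled rows are clamped back to their one-hot values on every iteration.
# Hypothetical standalone example on a tiny dense graph.
import numpy as np

graph_matrix = np.array([[0., 1., 0.],
                         [1., 0., 1.],
                         [0., 1., 0.]])
label_distributions = np.array([[1., 0.],         # labeled as class 0
                                [0., 0.],         # unlabeled
                                [0., 1.]])        # labeled as class 1
y_static = label_distributions.copy()
unlabeled = np.array([[False], [True], [False]])

for _ in range(5):
    label_distributions = graph_matrix.dot(label_distributions)
    normalizer = label_distributions.sum(axis=1, keepdims=True)
    normalizer[normalizer == 0.0] = 1.0           # guard against empty rows
    label_distributions = label_distributions / normalizer
    label_distributions = np.where(unlabeled, label_distributions, y_static)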
def fit(self, X, y, **kwargs): check_X_y(X, y) self.random_state_ = check_random_state(self.random_state) if 'warm_start' in kwargs and kwargs['warm_start']: check_is_fitted(self, ['chis_', 'estimated_membership_']) self.solve_strategy[1]['init'] = self.chis_ self.chis_ = solve_optimization(X, y, self.c, self.k, self.solve_strategy[0], self.solve_strategy[1]) if type(self.k) is PrecomputedKernel: self.gram_ = self.k.kernel_computations else: self.gram_ = np.array([[self.k.compute(x1, x2) for x1 in X] for x2 in X]) self.fixed_term_ = np.array(self.chis_).dot(self.gram_.dot(self.chis_)) def estimated_square_distance_from_center(x_new): ret = self.k.compute(x_new, x_new) \ - 2 * np.array([self.k.compute(x_i, x_new) for x_i in X]).dot(self.chis_) \ + self.fixed_term_ return ret self.estimated_square_distance_from_center_ = \ estimated_square_distance_from_center self.chi_SV_index_ = [ i for i, (chi, mu) in enumerate(zip(self.chis_, y)) if -self.c * (1 - mu) < chi < self.c * mu ] #self.chi_SV_index_ = [i for i in range(len(self.chis)_) \ # if -self.c*(1-self.mu[i]) < self.chis_[i] < self.c*self.mu[i]] chi_SV_square_distance = map(estimated_square_distance_from_center, X[self.chi_SV_index_]) chi_SV_square_distance = list(chi_SV_square_distance) #chi_SV_square_distance = [estimated_square_distance_from_center(x[i]) # for i in chi_SV_index] if len(chi_SV_square_distance) == 0: self.estimated_membership_ = None self.train_error_ = np.inf self.chis_ = None self.profile = None warn('No support vectors found') return self #raise ValueError('No support vectors found') self.SV_square_distance_ = np.mean(chi_SV_square_distance) num_samples = 500 if self.sample_generator is None: self.sample_generator = lambda x: x #sample = map(self.sample_generator, # self.random_state_.random_sample(num_samples)) sample = self.sample_generator(num_samples) fuzzifier = self.fuzzifier(X, y) result = fuzzifier.get_fuzzified_membership( self.SV_square_distance_, sample, self.estimated_square_distance_from_center_, return_profile=self.return_profile) if self.return_profile: self.estimated_membership_, self.profile_ = result else: self.estimated_membership_ = result[0] self.train_error_ = np.mean([(self.estimated_membership_(x) - mu)**2 for x, mu in zip(X, y)]) return self
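# Sketch of the Gram-matrix precomputation in the fit above: when the kernel
# is not precomputed, every pairwise value k(x_i, x_j) is evaluated and the
# fixed term chi^T K chi is cached for the distance-from-center formula.
# Hypothetical standalone example with a Gaussian kernel.
import numpy as np

def gaussian_kernel(x1, x2, sigma=1.0):
    return np.exp(-np.linalg.norm(np.asarray(x1) - np.asarray(x2)) ** 2
                  / (2 * sigma ** 2))

X = np.random.rand(5, 2)
chis = np.random.rand(5)
gram = np.array([[gaussian_kernel(x1, x2) for x1 in X] for x2 in X])
fixed_term = chis.dot(gram.dot(chis))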
def fit(self, X, y):
    X, y = check_X_y(X, y)
    return self
def fit(self, X, y):
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
    if sp.issparse(X):
        raise ValueError("Nonsensical Error")
    return self
def fit(self, X, y):
    X, y = check_X_y(X, y)
    self.classes_ = unique_labels(y)
    self.classifiers_pool_ = self.estimators
    return self
def partial_fit(self, X, y, classes=None): """ Incremental building of Mondrian Forests. Parameters ---------- X : array_like, shape = [n_samples, n_features] The input samples. Internally, it will be converted to ``dtype=np.float32`` y: array_like, shape = [n_samples] Input targets. classes: array_like, shape = [n_classes] Ignored for a regression problem. For a classification problem, if not provided this is inferred from y. This is taken into account for only the first call to partial_fit and ignored for subsequent calls. Returns ------- self: instance of MondrianForest """ X, y = check_X_y(X, y, dtype=np.float32, multi_output=False) random_state = check_random_state(self.random_state) # Wipe out estimators if partial_fit is called after fit. first_call = not hasattr(self, "first_") if first_call: self.first_ = True if isinstance(self, ClassifierMixin): if first_call: if classes is None: classes = LabelEncoder().fit(y).classes_ self.classes_ = classes self.n_classes_ = len(self.classes_) # Remap output n_samples, self.n_features_ = X.shape y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: warn( "A column-vector y was passed when a 1d array was" " expected. Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, stacklevel=2) self.n_outputs_ = 1 # Initialize estimators at first call to partial_fit. if first_call: # Check estimators self._validate_estimator() self.estimators_ = [] for _ in range(self.n_estimators): tree = self._make_estimator(append=False, random_state=random_state) self.estimators_.append(tree) # XXX: Switch to threading backend when GIL is released. if isinstance(self, ClassifierMixin): self.estimators_ = Parallel( n_jobs=self.n_jobs, backend="multiprocessing", verbose=self.verbose)( delayed(_single_tree_pfit)(t, X, y, classes) for t in self.estimators_) else: self.estimators_ = Parallel(n_jobs=self.n_jobs, backend="multiprocessing", verbose=self.verbose)( delayed(_single_tree_pfit)(t, X, y) for t in self.estimators_) return self
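# Sketch of the incremental protocol the Mondrian forest partial_fit above
# follows: the full class list is passed on the first call and the model is
# then updated chunk by chunk.  Shown here with scikit-learn's SGDClassifier,
# which exposes the same partial_fit(X, y, classes=...) contract; the
# batching below is purely illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

X, y = make_classification(n_samples=300, random_state=0)
classes = np.unique(y)
clf = SGDClassifier(random_state=0)
for X_batch, y_batch in zip(np.array_split(X, 3), np.array_split(y, 3)):
    clf.partial_fit(X_batch, y_batch, classes=classes)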
def _validate_and_reformat_input(X, y=None, expect_y=True, enforce_binary_labels=False, **kwargs): """Validate input data and return the data in an appropriate format. :param X: The feature matrix :type X: numpy.ndarray or pandas.DataFrame :param y: The label vector :type y: numpy.ndarray, pandas.DataFrame, pandas.Series, or list :param expect_y: if True y needs to be provided, otherwise ignores the argument; default True :type expect_y: bool :param enforce_binary_labels: if True raise exception if there are more than two distinct values in the `y` data; default False :type enforce_binary_labels: bool :return: the validated and reformatted X, y, and sensitive_features; note that certain estimators rely on metadata encoded in X which may be stripped during the reformatting process, so mitigation methods should ideally use the input X instead of the returned X for training estimators and leave potential reformatting of X to the estimator. :rtype: (pandas.DataFrame, pandas.Series, pandas.Series) """ if y is not None: # calling check_X_y with a 2-dimensional y causes a warning, so ensure it is 1-dimensional if isinstance(y, np.ndarray) and len(y.shape) == 2 and y.shape[1] == 1: y = y.squeeze() elif isinstance(y, pd.DataFrame) and y.shape[1] == 1: y = y.to_numpy().squeeze() X, y = check_X_y(X, y) y = check_array(y, ensure_2d=False, dtype='numeric') if enforce_binary_labels and not set(np.unique(y)).issubset(set([0, 1 ])): raise ValueError(_LABELS_NOT_0_1_ERROR_MESSAGE) elif expect_y: raise ValueError(_MESSAGE_Y_NONE) else: X = check_array(X) sensitive_features = kwargs.get(_KW_SENSITIVE_FEATURES) if sensitive_features is None: raise ValueError(_MESSAGE_SENSITIVE_FEATURES_NONE) check_consistent_length(X, sensitive_features) sensitive_features = check_array(sensitive_features, ensure_2d=False, dtype=None) # compress multiple sensitive features into a single column if len(sensitive_features.shape) > 1 and sensitive_features.shape[1] > 1: sensitive_features = \ _compress_multiple_sensitive_features_into_single_column(sensitive_features) # If we don't have a y, then need to fiddle with return type to # avoid a warning from pandas if y is not None: result_y = pd.Series(y) else: result_y = pd.Series(dtype="float64") return pd.DataFrame(X), result_y, pd.Series(sensitive_features.squeeze())
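# Sketch of two reformatting steps performed above: a single-column 2-D y is
# squeezed to 1-D before check_X_y (avoiding the warning mentioned in the
# comment), and several sensitive-feature columns are compressed into one
# column so downstream code can group on a single key.  The compression shown
# here is a hypothetical string join, not necessarily the library's own
# encoding.
import numpy as np
import pandas as pd
from sklearn.utils import check_X_y

X = np.random.rand(4, 2)
y = np.array([[0], [1], [1], [0]])                # column vector
if y.ndim == 2 and y.shape[1] == 1:
    y = y.squeeze()
X, y = check_X_y(X, y)

sensitive = np.array([["F", "A"], ["M", "A"], ["F", "B"], ["M", "B"]])
compressed = pd.Series(["_".join(row) for row in sensitive])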