def predict_proba(self, X): """Predict label probabilities with the fitted estimator on predictor(s) X. Returns ------- proba : array of shape = [n_samples] The predicted label probabilities of the input samples. """ proba = [] X_subs = self._get_subdata(X) for i in range(self.n_classes_): e = self.estimators_[i] X_i = X_subs[i] pred = e.predict(X_i).reshape(-1, 1) proba.append(pred) proba = np.hstack(proba) normalizer = proba.sum(axis=1)[:, np.newaxis] normalizer[normalizer == 0.0] = 1.0 proba /= normalizer assert_all_finite(proba) return proba
def predict(self, X): """ Perform regression on an array of test vectors X. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- p : array, shape = [n_samples] Predicted target values for X """ try: assert_all_finite(self.coef_) pred = safe_sparse_dot(X, self.coef_.T) except ValueError: n_samples = X.shape[0] n_vectors = self.coef_.shape[0] pred = np.zeros((n_samples, n_vectors)) if not self.outputs_2d_: pred = pred.ravel() return pred
def _check_X_y(X, y, dtype="numeric", accept_sparse=False, order=None,
               copy=False, force_all_finite=True, ensure_2d=True):
    if y is None:
        raise ValueError("y cannot be None")

    X = _check_array(X, accept_sparse=accept_sparse, dtype=dtype,
                     order=order, copy=copy,
                     force_all_finite=force_all_finite, ensure_2d=ensure_2d)

    y = _column_or_1d(y)
    if y.dtype.kind == 'O':
        y = y.astype(np.float64)

    # TODO: replace with daal4py
    from sklearn.utils.validation import assert_all_finite
    assert_all_finite(y)

    lengths = [len(X), len(y)]
    uniques = np.unique(lengths)
    if len(uniques) > 1:
        raise ValueError("Found input variables with inconsistent numbers of"
                         " samples: %r" % [int(length) for length in lengths])

    return X, y
def _fit_diag(self, pairs, y):
    """Learn a diagonal metric using MMC.

    Parameters
    ----------
    pairs : array-like, shape = (n_constraints, 2, n_features)
        Paired instances; pairs with y == 1 are similar, pairs with
        y == -1 are dissimilar.
    y : array-like, shape = (n_constraints,)
        Labels of the constraints.
    """
    num_dim = pairs.shape[2]
    pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1]
    s_sum = np.sum((pos_pairs[:, 0, :] - pos_pairs[:, 1, :]) ** 2, axis=0)

    it = 0
    error = 1.0
    eps = 1e-6
    reduction = 2.0
    w = np.diag(self.A_).copy()

    while error > self.convergence_threshold and it < self.max_iter:
        fD0, fD_1st_d, fD_2nd_d = self._D_constraint(neg_pairs, w)
        obj_initial = np.dot(s_sum, w) + self.diagonal_c * fD0
        fS_1st_d = s_sum  # first derivative of the similarity constraints
        # gradient of the objective
        gradient = fS_1st_d - self.diagonal_c * fD_1st_d
        # Hessian of the objective
        hessian = -self.diagonal_c * fD_2nd_d + eps * np.eye(num_dim)
        # Newton-Raphson update
        step = np.dot(np.linalg.inv(hessian), gradient)

        # search over optimal lambda
        lambd = 1  # initial step size
        w_tmp = np.maximum(0, w - lambd * step)
        obj = (np.dot(s_sum, w_tmp) +
               self.diagonal_c * self._D_objective(neg_pairs, w_tmp))
        assert_all_finite(obj)
        obj_previous = obj + 1  # just to get the while loop started

        inner_it = 0
        while obj < obj_previous:
            obj_previous = obj
            w_previous = w_tmp.copy()
            lambd /= reduction
            w_tmp = np.maximum(0, w - lambd * step)
            obj = (np.dot(s_sum, w_tmp) +
                   self.diagonal_c * self._D_objective(neg_pairs, w_tmp))
            inner_it += 1
            assert_all_finite(obj)

        w[:] = w_previous
        error = np.abs((obj_previous - obj_initial) / obj_previous)
        if self.verbose:
            print('mmc iter: %d, conv = %f' % (it, error))
        it += 1

    self.A_ = np.diag(w)
    self.transformer_ = transformer_from_metric(self.A_)
    return self
def _scrub_x(self, X, missing, **kwargs):
    '''
    Sanitize input predictors and extract column names if appropriate.
    '''
    # Check for sparseness
    if sparse.issparse(X):
        raise TypeError('A sparse matrix was passed, but dense data '
                        'is required. Use X.toarray() to convert to dense.')

    # Figure out missingness
    if missing is None:
        # Infer missingness
        missing = np.isnan(X)

    # Convert to internally used data type
    missing = np.asarray(missing, dtype=BOOL, order='F')
    assert_all_finite(missing)
    if missing.ndim == 1:
        missing = missing[:, np.newaxis]
    X = np.asarray(X, dtype=np.float64, order='F')
    if not self.allow_missing:
        try:
            assert_all_finite(X)
        except ValueError:
            raise ValueError(
                "Input contains NaN, infinity or a value that's too large. "
                "Did you mean to set allow_missing=True?")
    if X.ndim == 1:
        X = X[:, np.newaxis]

    # Ensure correct number of columns
    if hasattr(self, 'basis_') and self.basis_ is not None:
        if X.shape[1] != self.basis_.num_variables:
            raise ValueError('Wrong number of columns in X')

    return X, missing
def predict_proba(self, X): """ Predict label probabilities with the fitted estimator on predictor(s) X. Returns ------- proba : array of shape = [n_samples] The predicted label probabilities of the input samples. """ proba = [] X_subs = self._get_subdata(X) for i in range(self.n_classes_): e = self.estimators_[i] X_i = X_subs[i] pred = e.predict(X_i).reshape(-1, 1) proba.append(pred) proba = np.hstack(proba) normalizer = proba.sum(axis=1)[:, np.newaxis] normalizer[normalizer == 0.0] = 1.0 proba /= normalizer assert_all_finite(proba) return proba
def _make_meta(self, X):
    rows = []
    for e in self.estimators_:
        proba = e.predict_proba(X)
        assert_all_finite(proba)
        rows.append(proba)
    return np.hstack(rows)
def _svd(self, array, n_components, n_discard):
    """Returns first `n_components` left and right singular
    vectors u and v, discarding the first `n_discard`.
    """
    if self.svd_method == 'randomized':
        kwargs = {}
        if self.n_svd_vecs is not None:
            kwargs['n_oversamples'] = self.n_svd_vecs
        u, _, vt = randomized_svd(array, n_components,
                                  random_state=self.random_state,
                                  **kwargs)
    elif self.svd_method == 'arpack':
        u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
        if np.any(np.isnan(vt)):
            # some eigenvalues of A * A.T are negative, causing
            # sqrt() to be np.nan. This causes some vectors in vt
            # to be np.nan.
            _, v = eigsh(safe_sparse_dot(array.T, array),
                         ncv=self.n_svd_vecs)
            vt = v.T
        if np.any(np.isnan(u)):
            _, u = eigsh(safe_sparse_dot(array, array.T),
                         ncv=self.n_svd_vecs)

    assert_all_finite(u)
    assert_all_finite(vt)
    u = u[:, n_discard:]
    vt = vt[n_discard:]
    return u, vt.T
def predict(self, X): """ Perform regression on an array of test vectors X. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- p : array, shape = [n_samples] Predicted target values for X """ try: assert_all_finite(self.coef_) pred = safe_sparse_dot(X, self.coef_.T) pred += self.intercept_ except ValueError: n_samples = X.shape[0] n_vectors = self.coef_.shape[0] pred = np.zeros((n_samples, n_vectors)) if not self.outputs_2d_: pred = pred.ravel() return pred
def test_suppress_validation():
    X = np.array([0, np.inf])
    assert_raises(ValueError, assert_all_finite, X)
    sklearn.set_config(assume_finite=True)
    assert_all_finite(X)
    sklearn.set_config(assume_finite=False)
    assert_raises(ValueError, assert_all_finite, X)
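# --- Hedged sketch building on the test above: sklearn.config_context offers
# the same assume_finite switch as set_config, but scoped to a with-block, so
# the global configuration is restored automatically afterwards.
import numpy as np
import sklearn
from sklearn.utils.validation import assert_all_finite

X = np.array([0, np.inf])
with sklearn.config_context(assume_finite=True):
    assert_all_finite(X)  # no-op: validation is suppressed inside the block
# outside the block, assert_all_finite(X) would raise ValueError again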
def fit(self, X, y): """Fit model according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : classifier Returns self. """ rs = check_random_state(self.random_state) reencode = self.multiclass y, n_classes, n_vectors = self._set_label_transformers(y, reencode) ds = get_dataset(X) n_samples = ds.get_n_samples() n_features = ds.get_n_features() self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64) self.intercept_ = np.zeros(n_vectors, dtype=np.float64) loss = self._get_loss() penalty = self._get_penalty() if n_vectors == 1 or not self.multiclass: Y = np.asfortranarray(self.label_binarizer_.fit_transform(y), dtype=np.float64) for i in xrange(n_vectors): _binary_sgd(self, self.coef_, self.intercept_, i, ds, Y[:, i], loss, penalty, self.alpha, self._get_learning_rate(), self.eta0, self.power_t, self.fit_intercept, self.intercept_decay, int(self.max_iter * n_samples), self.shuffle, rs, self.callback, self.n_calls, self.verbose) elif self.multiclass: _multiclass_sgd(self, self.coef_, self.intercept_, ds, y.astype(np.int32), loss, penalty, self.alpha, self._get_learning_rate(), self.eta0, self.power_t, self.fit_intercept, self.intercept_decay, int(self.max_iter * n_samples), self.shuffle, rs, self.callback, self.n_calls, self.verbose) else: raise ValueError("Wrong value for multiclass.") try: assert_all_finite(self.coef_) except ValueError: warnings.warn("coef_ contains infinite values") return self
def test_gibbs_smoke(): """Check if we don't get NaNs sampling the full digits dataset.""" rng = np.random.RandomState(42) X = Xdigits.astype(np.float32) rbm1 = BernoulliRBM(X.shape[1], n_hidden=42, batch_size=40, n_iter=20, random_state=rng) rbm1.fit(X) X_sampled = rbm1.gibbs(X) assert_all_finite(X_sampled)
def test_gibbs_smoke(): """ just seek if we don't get NaNs sampling the full digits dataset """ rng = np.random.RandomState(42) X = Xdigits rbm1 = BernoulliRBM(n_components=42, batch_size=10, n_iter=20, random_state=rng) rbm1.fit(X) X_sampled = rbm1.gibbs(X) assert_all_finite(X_sampled)
def test_gibbs_smoke(): """Check if we don't get NaNs sampling the full digits dataset. Also check that sampling again will yield different results.""" X = Xdigits rbm1 = BernoulliRBM(n_components=42, batch_size=40, n_iter=20, random_state=42) rbm1.fit(X) X_sampled = rbm1.gibbs(X) assert_all_finite(X_sampled) X_sampled2 = rbm1.gibbs(X) assert_true(np.all((X_sampled != X_sampled2).max(axis=1)))
def test_gibbs_smoke():
    # Check that we don't get NaNs sampling the full digits dataset.
    # Also check that sampling again will yield different results.
    X = Xdigits
    rbm1 = BernoulliRBM(n_components=42, batch_size=40,
                        n_iter=20, random_state=42)
    rbm1.fit(X)
    X_sampled = rbm1.gibbs(X)
    assert_all_finite(X_sampled)
    X_sampled2 = rbm1.gibbs(X)
    assert np.all((X_sampled != X_sampled2).max(axis=1))
def bad_rows(X, is_X=True):
    bad = []
    from sklearn.utils.validation import assert_all_finite
    for i in range(X.shape[0]):
        try:
            assert_all_finite(X[i])
        except ValueError:
            print("Index %s was not finite" % i)
            bad.append(i)
            print_bad(X[i], i, is_X)
    return bad
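# --- Hedged alternative to the per-row loop above: NumPy can locate the
# offending rows directly, without catching ValueError row by row.
# bad_row_indices is a hypothetical name.
import numpy as np

def bad_row_indices(X):
    # True for rows containing NaN or +/-inf
    mask = ~np.isfinite(X).all(axis=1)
    return np.flatnonzero(mask)

# e.g. bad_row_indices(np.array([[1.0, 2.0], [np.nan, 0.0]])) -> array([1])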
def custom_svd(array, n_components, n_discard, n_svd_vecs):
    u, _, vt = svds(array, k=n_components, ncv=n_svd_vecs)
    if np.any(np.isnan(vt)):
        _, v = eigsh(safe_sparse_dot(array.T, array), ncv=n_svd_vecs)
        vt = v.T
    if np.any(np.isnan(u)):
        _, u = eigsh(safe_sparse_dot(array, array.T), ncv=n_svd_vecs)
    assert_all_finite(u)
    assert_all_finite(vt)
    u = u[:, n_discard:]
    vt = vt[n_discard:]
    return u, vt.T
def fit(self, X, y): """Fit model according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] or [n_samples, n_targets] Target values. Returns ------- self : regressor Returns self. """ rs = check_random_state(self.random_state) ds = get_dataset(X) n_samples = ds.get_n_samples() n_features = ds.get_n_features() self.outputs_2d_ = len(y.shape) == 2 if self.outputs_2d_: Y = y else: Y = y.reshape(-1, 1) Y = np.asfortranarray(Y) n_vectors = Y.shape[1] self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64) self.intercept_ = np.zeros(n_vectors, dtype=np.float64) loss = self._get_loss() penalty = self._get_penalty() for k in range(n_vectors): _binary_sgd(self, self.coef_, self.intercept_, k, ds, Y[:, k], loss, penalty, self.alpha, self._get_learning_rate(), self.eta0, self.power_t, self.fit_intercept, self.intercept_decay, int(self.max_iter * n_samples), self.shuffle, rs, self.callback, self.n_calls, self.verbose) try: assert_all_finite(self.coef_) except ValueError: warnings.warn("coef_ contains infinite values") return self
def fit(self, X, y): """Fit model according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] or [n_samples, n_targets] Target values. Returns ------- self : regressor Returns self. """ rs = check_random_state(self.random_state) ds = get_dataset(X) n_samples = ds.get_n_samples() n_features = ds.get_n_features() self.outputs_2d_ = len(y.shape) == 2 if self.outputs_2d_: Y = y else: Y = y.reshape(-1, 1) Y = np.asfortranarray(Y) n_vectors = Y.shape[1] self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64) self.intercept_ = np.zeros(n_vectors, dtype=np.float64) loss = self._get_loss() penalty = self._get_penalty() for k in xrange(n_vectors): _binary_sgd(self, self.coef_, self.intercept_, k, ds, Y[:, k], loss, penalty, self.alpha, self._get_learning_rate(), self.eta0, self.power_t, self.fit_intercept, self.intercept_decay, int(self.max_iter * n_samples), self.shuffle, rs, self.callback, self.n_calls, self.verbose) try: assert_all_finite(self.coef_) except ValueError: warnings.warn("coef_ contains infinite values") return self
def main(name, num, useSpecial=False):
    labels = []
    with open("C:/MissingWord/corrScoring/" + name + "Labels.txt", "r") as f:
        for line in f:
            labels.append(float(line))
    features = []
    with open("C:/MissingWord/corrScoring/1000features.txt", "r") as f:
        for line in f:
            features.append([float(elem) for elem in line.split(",")])
    specialFeatures = getSpecialFeatures(len(features))
    if useSpecial:
        for i in range(min(len(specialFeatures), len(features))):
            features[i].extend(specialFeatures[i])

    features = features[:num]
    labels = labels[:num]

    for i in range(len(features)):
        if len(features[i]) != len(features[0]):
            print(i)
        try:
            assert_all_finite(features[i])
        except ValueError:
            print(i)

    cutoff = int(len(features) * 7 / 10)
    trainFeatures = features[:cutoff]
    testFeatures = features[cutoff:]
    trainLabels = labels[:cutoff]
    testLabels = labels[cutoff:]

    # regr = svm.SVR(C=1)
    regr = RandomForestRegressor(n_estimators=300, n_jobs=7)
    # regr = linear_model.LinearRegression()
    regr.fit(trainFeatures, trainLabels)
    print("Train Residual sum of squares: %.2f"
          % np.mean((regr.predict(trainFeatures) - trainLabels) ** 2))
    print("Test Residual sum of squares: %.2f"
          % np.mean((regr.predict(testFeatures) - testLabels) ** 2))
    print('Variance score: %.2f' % regr.score(testFeatures, testLabels))

    with open("C:/MissingWord/corrScoring/" + name + ".regr", "wb") as f:
        pickle.dump(regr, f)
def check_input_arrays(*args, validate_len=True, force_all_finite=True):
    """Cast input sequences into numpy arrays.

    Only inputs that are sequence-like will be converted, all other inputs
    will be left as is. When `validate_len` is True, the sequences will be
    checked for equal length.

    Parameters
    ----------
    args : scalar or array_like
        Inputs to be checked.

    validate_len : bool (default=True)
        Whether to check if the input arrays have the same length.

    force_all_finite : bool (default=True)
        Whether to raise an error when input arrays contain inf or nan.

    Returns
    -------
    args : list
        Inputs where sequence-like objects have been cast to numpy arrays.
    """
    n = None
    args = list(args)
    for i, arg in enumerate(args):
        if np.ndim(arg) > 0:
            new_arg = check_array(arg, dtype=None, ensure_2d=False,
                                  accept_sparse=True,
                                  force_all_finite=force_all_finite)
            if not force_all_finite:
                # For when checking input values is disabled
                try:
                    assert_all_finite(new_arg)
                except ValueError:
                    warnings.warn(
                        "Input contains NaN, infinity or a value too "
                        "large for dtype('float64') but input check is "
                        "disabled. Check the inputs before proceeding.")
            if validate_len:
                m = new_arg.shape[0]
                if n is None:
                    n = m
                else:
                    assert m == n, (
                        "Input arrays have incompatible lengths: "
                        "{} and {}".format(n, m))
            args[i] = new_arg
    return args
def test_cd_linear_trivial():
    # trivial example that failed due to gh#4
    loss = Squared()
    alpha = 1e-5
    n_features = 100
    x = np.zeros((1, n_features))
    x[0, 1] = 1
    y = np.ones(1)
    cb = Callback(x, y, alpha)
    w = _fit_linear(x, y, alpha, n_iter=20, loss=loss, callback=cb)
    assert_all_finite(w)
    assert_all_finite(cb.losses_)
def _check_alphas(self):
    create_path = self.alphas is None
    if create_path:
        if self.n_alphas <= 0:
            raise ValueError("n_alphas must be a positive integer")
        alphas = numpy.empty(int(self.n_alphas), dtype=numpy.float64)
    else:
        alphas = column_or_1d(self.alphas, warn=True)
        assert_all_finite(alphas)
        check_non_negative(alphas, "alphas")
        assert_all_finite(alphas)
    return alphas, create_path
def _check_params(self, n_features):
    if not 0 < self.l1_ratio <= 1:
        raise ValueError("l1_ratio must be in interval ]0;1], but was %f"
                         % self.l1_ratio)

    if self.tol <= 0:
        raise ValueError("tolerance must be positive, but was %f"
                         % self.tol)

    if self.penalty_factor is None:
        penalty_factor = numpy.ones(n_features, dtype=numpy.float64)
    else:
        pf = column_or_1d(self.penalty_factor, warn=True)
        if pf.shape[0] != n_features:
            raise ValueError("penalty_factor must be array of length "
                             "n_features (%d), but got %d"
                             % (n_features, pf.shape[0]))
        assert_all_finite(pf)
        check_non_negative(pf, "penalty_factor")
        penalty_factor = pf * n_features / pf.sum()
        assert_all_finite(penalty_factor)

    create_path = self.alphas is None
    if create_path:
        if self.n_alphas <= 0:
            raise ValueError("n_alphas must be a positive integer")
        alphas = numpy.empty(int(self.n_alphas), dtype=numpy.float64)
    else:
        alphas = column_or_1d(self.alphas, warn=True)
        assert_all_finite(alphas)
        check_non_negative(alphas, "alphas")
        assert_all_finite(alphas)

    if self.max_iter <= 0:
        raise ValueError("max_iter must be a positive integer")

    return (create_path, alphas.astype(numpy.float64),
            penalty_factor.astype(numpy.float64))
def _check_penalty_factor(self, n_features):
    if self.penalty_factor is None:
        penalty_factor = numpy.ones(n_features, dtype=numpy.float64)
    else:
        pf = column_or_1d(self.penalty_factor, warn=True)
        if pf.shape[0] != n_features:
            raise ValueError(
                "penalty_factor must be array of length n_features (%d), "
                "but got %d" % (n_features, pf.shape[0]))
        assert_all_finite(pf)
        check_non_negative(pf, "penalty_factor")
        penalty_factor = pf * n_features / pf.sum()
        assert_all_finite(penalty_factor)
    return penalty_factor
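# --- Quick illustration of the rescaling above, with hypothetical values:
# factors are validated as finite and non-negative, then scaled so they
# sum to n_features.
import numpy as np
pf = np.array([1.0, 2.0, 1.0])
scaled = pf * pf.shape[0] / pf.sum()  # -> array([0.75, 1.5, 0.75])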
def fit(self, X, X_error=None):
    """Implements the standard fitting function for a DL8.5 classifier.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The training input samples. If X_error is provided, X represents
        the explanation input.
    X_error : array-like, shape (n_samples, n_features_1)
        The training input used to calculate error. If it is not provided,
        X is used to calculate the error.

    Returns
    -------
    self : object
        Returns self.
    """
    # Check that X_error has correct shape and raise ValueError if not
    if X_error is not None:
        assert_all_finite(X_error)
        X_error = check_array(X_error, dtype='int32')

    if self.error_function is None:
        if X_error is None:
            self.error_function = lambda tids: self.default_error(tids, X)
        else:
            if X_error.shape[0] == X.shape[0]:
                self.error_function = lambda tids: self.default_error(
                    tids, X_error)
            else:
                raise ValueError(
                    "X_error does not have the same number of rows as X")

    if self.leaf_value_function is None:
        if X_error is None:
            self.leaf_value_function = lambda tids: self.default_leaf_value(
                tids, X)
        else:
            if X_error.shape[0] == X.shape[0]:
                self.leaf_value_function = lambda tids: self.default_leaf_value(
                    tids, X_error)
            else:
                raise ValueError(
                    "X_error does not have the same number of rows as X")

    # call fit method of the predictor
    DL85Predictor.fit(self, X)
    # print(self.tree_)

    # Return the classifier
    return self
def _scrub(self, X, y, sample_weight, **kwargs):
    '''
    Sanitize input data.
    '''
    # Check for sparseness
    if sparse.issparse(y):
        raise TypeError('A sparse matrix was passed, but dense data '
                        'is required. Use y.toarray() to convert to dense.')
    if sparse.issparse(sample_weight):
        raise TypeError('A sparse matrix was passed, but dense data '
                        'is required. Use sample_weight.toarray() to '
                        'convert to dense.')

    # Check whether X is the output of patsy.dmatrices
    if y is None and isinstance(X, tuple):
        y, X = X

    # Handle X separately
    X = self._scrub_x(X, **kwargs)

    # Convert y to internally used data type
    y = np.asarray(y, dtype=np.float64)
    assert_all_finite(y)
    y = y.reshape(y.shape[0])

    # Deal with sample_weight
    if sample_weight is None:
        sample_weight = np.ones(y.shape[0], dtype=y.dtype)
    else:
        sample_weight = np.asarray(sample_weight)
        assert_all_finite(sample_weight)
        sample_weight = sample_weight.reshape(sample_weight.shape[0])

    # Make sure dimensions match
    if y.shape[0] != X.shape[0]:
        raise ValueError('X and y do not have compatible dimensions.')
    if y.shape != sample_weight.shape:
        raise ValueError(
            'y and sample_weight do not have compatible dimensions.')

    # Make sure everything is finite
    assert_all_finite(X)
    assert_all_finite(y)
    assert_all_finite(sample_weight)

    return X, y, sample_weight
def _validate_inputs(self, X, y):
    X, y = check_X_y(X, y, accept_sparse=False)
    # check X and y separately: assert_all_finite validates one array at a
    # time (its second argument is not a second array to check)
    assert_all_finite(X)
    assert_all_finite(y)
    if np.any(np.iscomplex(X)) or np.any(np.iscomplex(y)):
        raise ValueError("Complex data not supported")
    if np.issubdtype(X.dtype, np.object_) or np.issubdtype(y.dtype,
                                                           np.object_):
        try:
            X = X.astype(float)
            y = y.astype(int)
        except TypeError:
            raise TypeError("argument must be a string.* number")
    return X, y
def predict(self, X):
    assert_all_finite(X)
    check_is_fitted(self, 'is_fitted_')
    X = check_array(X, accept_sparse=True)
    # number of chosen classifiers in the ensemble
    n_iteration = len(self.clf)
    self.pred1 = np.zeros((X.shape[0], n_iteration))
    self.ensemble_pred1 = np.zeros((X.shape[0],))
    for i in range(n_iteration):
        pred1 = self.clf[i].predict(X)
        self.pred1[:, i] = pred1
    self.ensemble_pred1 = (self.pred1 *
                           self.alpha[:, :n_iteration]).sum(axis=1)
    result = sgn(self.ensemble_pred1)
    return np.where(result == -1, 0, result)
def _base_estimator_predict(self, e, X):
    """Predict label values with the specified estimator on predictor(s) X.

    Parameters
    ----------
    e : estimator object
        The base estimator to predict with.
    X : np.ndarray, shape=(n, m)
        The feature data for which to compute the predicted outputs.

    Returns
    -------
    pred : np.ndarray, shape=(len(X), 1)
        The label values predicted by the specified estimator for each
        instance in X.
    """
    # Generate array for the base-level testing set, which is n x n_folds.
    pred = e.predict(X)
    assert_all_finite(pred)
    return pred
def _scrub_x(self, X, **kwargs):
    '''
    Sanitize input predictors and extract column names if appropriate.
    '''
    # Check for sparseness
    if sparse.issparse(X):
        raise TypeError('A sparse matrix was passed, but dense data '
                        'is required. Use X.toarray() to convert to dense.')

    # Convert to internally used data type
    X = np.asarray(X, dtype=np.float64, order='F')
    assert_all_finite(X)
    if X.ndim == 1:
        X = X[:, np.newaxis]

    # Ensure correct number of columns
    if hasattr(self, 'basis_') and self.basis_ is not None:
        if X.shape[1] != self.basis_.num_variables:
            raise ValueError('Wrong number of columns in X')

    return X
def _scrub_x(self, X, **kwargs):
    '''
    Sanitize input predictors and extract column names if appropriate.
    '''
    # Check for sparseness
    if sparse.issparse(X):
        raise TypeError('A sparse matrix was passed, but dense data '
                        'is required. Use X.toarray() to convert to dense.')

    # Convert to internally used data type
    X = np.asarray(X, dtype=np.float64)
    assert_all_finite(X)
    if len(X.shape) == 1:
        X = X.reshape((X.shape[0], 1))

    # Ensure correct number of columns
    if hasattr(self, 'basis_') and self.basis_ is not None:
        if X.shape[1] != self.basis_.num_variables:
            raise ValueError('Wrong number of columns in X')

    return X
def _validate_inputs(self, X):
    # Things we don't want to allow until we've tested them:
    # - Sparse inputs
    # - Multiclass outputs (e.g., more than 2 classes in `y`)
    # - Non-finite inputs
    # - Complex inputs
    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()
    X = check_array(X, accept_sparse=False, allow_nd=False)
    assert_all_finite(X)
    if np.any(np.iscomplex(X)):
        raise ValueError("Complex data not supported")
    if np.issubdtype(X.dtype, np.object_):
        try:
            X = X.astype(float)
        except (TypeError, ValueError):
            raise ValueError("argument must be a string.* number")
    return X
def _scrub_x(self, X, missing, **kwargs):
    '''
    Sanitize input predictors and extract column names if appropriate.
    '''
    # Check for sparseness
    if sparse.issparse(X):
        raise TypeError('A sparse matrix was passed, but dense data '
                        'is required. Use X.toarray() to convert to dense.')
    X = np.asarray(X, dtype=np.float64, order='F')

    # Figure out missingness
    if missing is None:
        # Infer missingness
        missing = np.isnan(X)

    # Convert to internally used data type
    missing = np.asarray(missing, dtype=BOOL, order='F')
    assert_all_finite(missing)
    if missing.ndim == 1:
        missing = missing[:, np.newaxis]

    if not self.allow_missing:
        try:
            assert_all_finite(X)
        except ValueError:
            raise ValueError(
                "Input contains NaN, infinity or a value that's too large. "
                "Did you mean to set allow_missing=True?")
    if X.ndim == 1:
        X = X[:, np.newaxis]

    # Ensure correct number of columns
    if hasattr(self, 'basis_') and self.basis_ is not None:
        if X.shape[1] != self.basis_.num_variables:
            raise ValueError('Wrong number of columns in X')

    return X, missing
def test_make_biclusters():
    X, rows, cols = make_biclusters(
        shape=(100, 100), n_clusters=4, shuffle=True, random_state=0)
    assert_equal(X.shape, (100, 100), "X shape mismatch")
    assert_equal(rows.shape, (4, 100), "rows shape mismatch")
    assert_equal(cols.shape, (4, 100), "columns shape mismatch")
    assert_all_finite(X)
    assert_all_finite(rows)
    assert_all_finite(cols)

    X2, _, _ = make_biclusters(shape=(100, 100), n_clusters=4,
                               shuffle=True, random_state=0)
    assert_array_equal(X, X2)
def test_make_checkerboard():
    X, rows, cols = make_checkerboard(
        shape=(100, 100), n_clusters=(20, 5), shuffle=True, random_state=0)
    assert_equal(X.shape, (100, 100), "X shape mismatch")
    assert_equal(rows.shape, (100, 100), "rows shape mismatch")
    assert_equal(cols.shape, (100, 100), "columns shape mismatch")

    X, rows, cols = make_checkerboard(
        shape=(100, 100), n_clusters=2, shuffle=True, random_state=0)
    assert_all_finite(X)
    assert_all_finite(rows)
    assert_all_finite(cols)

    X1, _, _ = make_checkerboard(shape=(100, 100), n_clusters=2,
                                 shuffle=True, random_state=0)
    X2, _, _ = make_checkerboard(shape=(100, 100), n_clusters=2,
                                 shuffle=True, random_state=0)
    assert_array_equal(X1, X2)
def fit(self, X, y): """Fit model according to X and y. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape = [n_samples] Target values. Returns ------- self : classifier Returns self. """ rs = check_random_state(self.random_state) reencode = self.multiclass y, n_classes, n_vectors = self._set_label_transformers(y, reencode) self.train_x = get_dataset(X) n_samples = self.train_x.get_n_samples() n_features = self.train_x.get_n_features() #self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64) self.upd_ = np.zeros(int(self.max_iter * n_samples)+1, dtype=np.float64) self.seq_ = np.zeros(int(self.max_iter * n_samples)+1, dtype=np.int32) self.intercept_ = np.zeros(n_vectors, dtype=np.float64) loss = self._get_loss() penalty = self._get_penalty() if n_vectors == 1 or not self.multiclass: Y = np.asfortranarray(self.label_binarizer_.fit_transform(y), dtype=np.float64) for i in xrange(n_vectors): (self.upd_, self.tr_err) = _karma_sgd(self, #self.coef_, self.upd_, self.seq_, self.intercept_, i, self.train_x, Y[:, i], loss, penalty, self.alpha, self._get_learning_rate(), self.eta0, self.power_t, self.fit_intercept, self.intercept_decay, int(self.max_iter * n_samples), self.shuffle, rs, self.callback, self.n_calls, self.gamma, self.verbose) # elif self.multiclass: # _multiclass_sgd(self, self.coef_, self.intercept_, # ds, y.astype(np.int32), loss, penalty, # self.alpha, self._get_learning_rate(), # self.eta0, self.power_t, self.fit_intercept, # self.intercept_decay, # int(self.max_iter * n_samples), # self.shuffle, rs, self.callback, self.n_calls, # self.verbose) else: raise ValueError("Wrong value for multiclass.") try: assert_all_finite(self.upd_) assert_all_finite(self.seq_) except ValueError: warnings.warn("coef_ contains infinite values") return self
def fit(self, X, X_test, y, y_test):
    """Fit model according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : classifier
        Returns self.
    """
    rs = check_random_state(self.random_state)

    reencode = self.multiclass
    y, _, n_vectors = self._set_label_transformers(y, reencode)
    y_test, _, n_vectors_test = self._set_label_transformers(y_test,
                                                             reencode)
    # assert n_vectors == n_vectors_test

    ds = get_dataset(X)
    ds_test = get_dataset(X_test)
    n_samples = ds.get_n_samples()
    n_features = ds.get_n_features()
    self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
    self.intercept_ = np.zeros(n_vectors, dtype=np.float64)

    loss = self._get_loss()
    penalty = self._get_penalty()

    if n_vectors == 1 or not self.multiclass:
        Y = np.asfortranarray(self.label_binarizer_.fit_transform(y),
                              dtype=np.float64)
        Y_test = np.asfortranarray(
            self.label_binarizer_.fit_transform(y_test),
            dtype=np.float64)
        for i in range(n_vectors):
            _binary_sgd_test(self, self.coef_, self.intercept_, i,
                             ds, Y[:, i], ds_test, Y_test[:, i],
                             loss, penalty, self.alpha,
                             self._get_learning_rate(), self.eta0,
                             self.power_t, self.fit_intercept,
                             self.intercept_decay,
                             int(self.max_iter * n_samples),
                             self.shuffle, rs, self.callback,
                             self.n_calls, self.verbose, self.black_out,
                             (self.disp_freq * n_samples),
                             (self.test_freq * n_samples))
    elif self.multiclass:
        _multiclass_sgd(self, self.coef_, self.intercept_, ds,
                        y.astype(np.int32), loss, penalty, self.alpha,
                        self._get_learning_rate(), self.eta0, self.power_t,
                        self.fit_intercept, self.intercept_decay,
                        int(self.max_iter * n_samples), self.shuffle, rs,
                        self.callback, self.n_calls, self.verbose)
    else:
        raise ValueError("Wrong value for multiclass.")

    try:
        assert_all_finite(self.coef_)
    except ValueError:
        warnings.warn("coef_ contains infinite values")

    return self
def _scrub(self, X, y, sample_weight, output_weight, missing, **kwargs):
    '''
    Sanitize input data.
    '''
    # Check for sparseness
    if sparse.issparse(y):
        raise TypeError('A sparse matrix was passed, but dense data '
                        'is required. Use y.toarray() to convert to dense.')
    if sparse.issparse(sample_weight):
        raise TypeError('A sparse matrix was passed, but dense data '
                        'is required. Use sample_weight.toarray() to '
                        'convert to dense.')
    if sparse.issparse(output_weight):
        raise TypeError('A sparse matrix was passed, but dense data '
                        'is required. Use output_weight.toarray() to '
                        'convert to dense.')

    # Check whether X is the output of patsy.dmatrices
    if y is None and isinstance(X, tuple):
        y, X = X

    # Handle X separately
    X, missing = self._scrub_x(X, missing, **kwargs)

    # Convert y to internally used data type
    y = np.asarray(y, dtype=np.float64)
    assert_all_finite(y)
    if len(y.shape) == 1:
        y = y[:, np.newaxis]

    # Deal with sample_weight
    if sample_weight is None:
        sample_weight = np.ones(y.shape[0], dtype=y.dtype)
    else:
        sample_weight = np.asarray(sample_weight)
        assert_all_finite(sample_weight)

    # Deal with output_weight
    if output_weight is None:
        output_weight = np.ones(y.shape[1], dtype=y.dtype)
    else:
        output_weight = np.asarray(output_weight)
        assert_all_finite(output_weight)

    # Make sure dimensions match
    if y.shape[0] != X.shape[0]:
        raise ValueError('X and y do not have compatible dimensions.')
    if y.shape[0] != sample_weight.shape[0]:
        raise ValueError(
            'y and sample_weight do not have compatible dimensions.')
    if y.shape[1] != output_weight.shape[0]:
        raise ValueError(
            'y and output_weight do not have compatible dimensions.')

    # Make sure everything is finite (except X, which is allowed to have
    # missing values)
    assert_all_finite(missing)
    assert_all_finite(y)
    assert_all_finite(sample_weight)
    assert_all_finite(output_weight)

    # Make sure everything is consistent
    check_X_y(X, y, accept_sparse=None, multi_output=True,
              force_all_finite=False)

    return X, y, sample_weight, output_weight, missing
def _base_estimator_predict_proba(self, e, X):
    pred = e.predict_proba(X)
    assert_all_finite(pred)
    return pred
__author__ = 'SEOKHO'

from sklearn.utils.validation import assert_all_finite
import numpy as np

# assert_all_finite returns None; printing it just confirms that no
# ValueError was raised, i.e. every value below is finite.
print(assert_all_finite(np.array([
    1.5465978588774336, 13.173744467937684, 0.7164582594283925,
    6.073044534100405, 0.563888990932914, 9.253016646256619,
    1.5479898935732566, 13.172805509656142, 1.76135884564872,
    15.4882753202587, 0.6621080655920463, 7.368970402194793,
    0.5638305796422928, 9.249753374333018, 0.6931471805599453,
    3.4011973816621555, 1.7616927591930054, 15.488775300844194,
    0.5637344396624081, 9.249657234353133, 1.791759469228055,
    1.791759469228055, 4.240504214996096, 9.587611745713565,
    0.8472978603872037, 4.584967478670572, 1.78673244403277,
    15.47979284224048, 1.7552292288982276, 15.488951473171578,
    1.5461643256320923, 13.173248488496412, 1.7612086097539006,
    15.48839602164465, 1.6084169832999933, 7.579678823090456,
    1.5475324485137967, 13.172303356473655, 1.761260622260961,
    15.488375937043429, 6.163314804034641, 3.258096538021482])))