def _svd(self, array, n_components, n_discard):
    """Returns first `n_components` left and right singular
    vectors u and v, discarding the first `n_discard`.
    """
    if self.svd_method == "randomized":
        kwargs = {}
        if self.n_svd_vecs is not None:
            kwargs["n_oversamples"] = self.n_svd_vecs
        u, _, vt = randomized_svd(array, n_components,
                                  random_state=self.random_state,
                                  **kwargs)
    elif self.svd_method == "arpack":
        u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
        if np.any(np.isnan(vt)):
            # some eigenvalues of A * A.T are negative, causing
            # sqrt() to be np.nan. This causes some vectors in vt
            # to be np.nan.
            _, v = eigsh(safe_sparse_dot(array.T, array),
                         ncv=self.n_svd_vecs)
            vt = v.T
        if np.any(np.isnan(u)):
            _, u = eigsh(safe_sparse_dot(array, array.T),
                         ncv=self.n_svd_vecs)

    assert_all_finite(u)
    assert_all_finite(vt)
    u = u[:, n_discard:]
    vt = vt[n_discard:]
    return u, vt.T
def compute(function, x, A, b, args, coordinate=None):
    L2 = args["L2"]

    if function == "loss":
        # Compute the squared error plus L2 regularization.
        reg = 0.5 * L2 * np.sum(x ** 2)
        if "b_pred" not in args:
            b_pred = safe_sparse_dot(A, x)
        else:
            b_pred = args["b_pred"]
        return ((b - b_pred) ** 2).sum() / 2 + reg

    elif function == "gradient":
        if "b_pred" not in args:
            b_pred = safe_sparse_dot(A, x)
        else:
            b_pred = args["b_pred"]
        residual = b_pred - b
        if coordinate is None:
            grad = safe_sparse_dot(residual, A)
            grad += L2 * x
        else:
            grad = safe_sparse_dot(residual, A[:, coordinate])
            grad += L2 * x[coordinate]
        return grad

    elif function == "lipschitz":
        lipschitz_values = np.sum(A ** 2, axis=0) + L2
        return lipschitz_values
def _bilinear_cd(U, V, X_left, X_right, y, alpha):
    n_samples, n_features_left = X_left.shape
    n_components = V.shape[1]

    XrV = safe_sparse_dot(X_right, V)

    viol = 0
    for j in range(n_features_left):
        for s in range(n_components):
            XlU = safe_sparse_dot(X_left, U)
            y_pred = np.sum(XlU * XrV, axis=1)

            # grad_loss = loss.dloss(y_pred, y)
            grad_loss = y_pred - y
            grad = np.dot(grad_loss * X_left[:, j], XrV[:, s])
            # grad /= n_samples
            grad += alpha * U[j, s]

            inv_step_size = np.dot(X_left[:, j] ** 2, XrV[:, s] ** 2)
            # inv_step_size /= np.sqrt(n_samples)
            inv_step_size += alpha

            update = grad / inv_step_size
            viol += np.abs(update)
            U[j, s] -= update

    XlU = safe_sparse_dot(X_left, U)
    y_pred = np.sum(XlU * XrV, axis=1)
    lv = 0.5 * np.sum((y_pred - y) ** 2)
    lv += 0.5 * alpha * (np.sum(U ** 2) + np.sum(V ** 2))

    return viol, lv
def _decision_scores(self, X):
    """Predict using the ELM model.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data.

    Returns
    -------
    y_pred : array-like, shape (n_samples,) or (n_samples, n_outputs)
        The predicted values.
    """
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

    if self.batch_size is None:
        hidden_activations = self._compute_hidden_activations(X)
        y_pred = safe_sparse_dot(hidden_activations, self.coef_output_)
    else:
        n_samples = X.shape[0]
        batches = gen_batches(n_samples, self.batch_size)

        y_pred = np.zeros((n_samples, self.n_outputs_))
        for batch in batches:
            h_batch = self._compute_hidden_activations(X[batch])
            y_pred[batch] = safe_sparse_dot(h_batch, self.coef_output_)

    return y_pred
def _backprop(self, X, y, n_samples, a_hidden, a_output, delta_o):
    """Computes the MLP cost function and its corresponding derivatives
    with respect to the different parameters given in the initialization.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    y : numpy array of shape (n_samples,)
        Subset of the target values.

    n_samples : int
        Number of samples.

    Returns
    -------
    cost : float

    grad : array-like, shape (size(W1) * size(W2) * size(b1) * size(b2))
    """
    # Forward propagate
    a_hidden[:] = self.activation_func(safe_sparse_dot(X, self.coef_hidden_) +
                                       self.intercept_hidden_)
    a_output[:] = self.output_func(safe_sparse_dot(a_hidden, self.coef_output_) +
                                   self.intercept_output_)

    # Get cost
    cost = self.loss_functions[self.loss](y, a_output)
    # Add regularization term to cost
    cost += (0.5 * self.alpha) * (np.sum(self.coef_hidden_ ** 2) +
                                  np.sum(self.coef_output_ ** 2)) / n_samples

    # Backward propagate
    diff = y - a_output
    delta_o[:] = -diff
    delta_h = np.dot(delta_o, self.coef_output_.T) * \
        self.derivative_func(a_hidden)

    # Get regularized gradients
    W1grad = (safe_sparse_dot(X.T, delta_h) +
              (self.alpha * self.coef_hidden_)) / n_samples
    W2grad = (safe_sparse_dot(a_hidden.T, delta_o) +
              (self.alpha * self.coef_output_)) / n_samples
    b1grad = np.mean(delta_h, 0)
    b2grad = np.mean(delta_o, 0)

    return cost, W1grad, W2grad, b1grad, b2grad
def complement_joint_log_likelihood(self, X, i):
    """Calculate the posterior log probability of the samples X

    1 - (|c| - 1) * ((P(¬c) Π P(w_i|¬c)) / (Σ P(¬c) Π P(w_i|¬c)))
    """
    check_is_fitted(self, "classes_")
    X = check_array(X, accept_sparse='csr')

    jll = safe_sparse_dot(X, self.complement_feature_log_prob_.T)
    return (1 - (len(self.classes_) - 1)) * np.array(
        jll - np.sum(self.class_log_prior_[i] + jll))
def transform(self, X):
    # compute hidden layer activation
    if hasattr(self, 'weights_u_') and hasattr(self, 'weights_v_'):
        projected = safe_sparse_dot(X, self.weights_u_, dense_output=True)
        projected = safe_sparse_dot(projected, self.weights_v_)
    else:
        projected = safe_sparse_dot(X, self.weights_, dense_output=True)

    return self._activate(projected + self.biases_)
def _neg_free_energy(self, V):
    '''
    Compute -1 * free energy (i.e. log p(V) * Z, where Z is the normalizer)
    '''
    # sum_{j=1..M} b_j * V_j
    fe = safe_sparse_dot(V, self.bias_visible_, dense_output=True)
    # sum_{j=1..M} log(1 + exp(sum_{i=1..N} W_ij * V_j))
    fe += np.log(1 + np.exp(self.bias_hidden_ +
                            safe_sparse_dot(V, self.weights_.T))).sum(1)
    return fe
def _joint_log_likelihood(self, X, i):
    """Calculate the posterior log probability of the samples X

    P(c) * Π P(w_i|c) / Σ P(c) * Π P(w_i|c)
    """
    check_is_fitted(self, "classes_")
    X = check_array(X, accept_sparse='csr')

    numerator = (self.class_log_prior_[i] +
                 safe_sparse_dot(X, self.feature_log_prob_.T))
    denominator = np.sum(self.class_log_prior_[i] +
                         safe_sparse_dot(X, self.feature_log_prob_.T))
    return np.array(numerator - denominator)
def compute(function, x, A, b, args, coordinate=None):
    L1 = args["L1"]

    if function == "loss":
        reg = L1 * np.sum(np.abs(x))
        if "b_pred" not in args:
            b_pred = safe_sparse_dot(A, x)
        else:
            b_pred = args["b_pred"]
        loss = np.sum((b - b_pred) ** 2) / 2 + reg
        return loss

    elif function == "gradient":
        if "b_pred" not in args:
            b_pred = safe_sparse_dot(A, x)
        else:
            b_pred = args["b_pred"]
        loss = b_pred - b
        if coordinate is None:
            grad = safe_sparse_dot(A.T, loss)
        else:
            grad = safe_sparse_dot(A[:, coordinate], loss)
        return grad

    elif function == "proximal_step":
        L = args["prox_lipschitz"]
        g_func = args["g_func"]
        L1 = args["L1"]
        g = g_func(x, A, b, args, coordinate)
        if coordinate is None:
            x_half = x - g / L
            # soft thresholding
            x = np.sign(x_half) * np.maximum(0, np.abs(x_half) - L1 / L)
        else:
            L = args["prox_lipschitz"][coordinate]
            x_half = x[coordinate] - g / L
            # soft thresholding
            x[coordinate] = np.sign(x_half) * np.maximum(0, np.abs(x_half) - L1 / L)
        return x

    elif function == "lipschitz":
        lipschitz_values = np.sum(A ** 2, axis=0)
        return lipschitz_values
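# A minimal, self-contained sketch (not part of the original module) of the
# soft-thresholding proximal operator used in the "proximal_step" branch above:
# prox_{t * L1 * ||.||_1}(x) = sign(x) * max(0, |x| - t * L1).
import numpy as np

def soft_threshold(x_half, threshold):
    """Elementwise soft thresholding; `x_half` and `threshold` are hypothetical
    names standing in for the gradient step and L1 / L in the function above."""
    return np.sign(x_half) * np.maximum(0, np.abs(x_half) - threshold)

# Entries whose magnitude is below the threshold are zeroed out,
# the remaining ones are shrunk toward zero.
print(soft_threshold(np.array([-1.5, -0.2, 0.0, 0.3, 2.0]), 0.5))
# -> [-1., -0., 0., 0., 1.5] (up to numpy formatting)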
def compute(function, x, A, b, args, coordinate=None):
    np.testing.assert_equal(np.unique(b), np.array([-1, 1]))
    L2 = args["L2"]

    if function == "loss":
        reg = 0.5 * L2 * np.sum(x ** 2)
        if "b_pred" not in args:
            b_pred = safe_sparse_dot(A, x)
        else:
            b_pred = args["b_pred"]
        loss = np.sum(np.log(1 + np.exp(-b * b_pred))) + reg
        return loss

    elif function == "gradient":
        if "b_pred" not in args:
            b_pred = safe_sparse_dot(A, x)
        else:
            b_pred = args["b_pred"]
        residual = -b / (1. + np.exp(b * b_pred))
        if coordinate is None:
            grad = safe_sparse_dot(A.T, residual)
            grad += L2 * x
        else:
            grad = safe_sparse_dot(A[:, coordinate].T, residual)
            grad += L2 * x[coordinate]
        return grad

    elif function == "hessian":
        if "b_pred" not in args:
            b_pred = safe_sparse_dot(A, x)
        else:
            b_pred = args["b_pred"]
        sig = 1. / (1. + np.exp(-b * b_pred))
        if coordinate is None:
            hessian = A.T.dot(np.diag(sig * (1 - sig)).dot(A))
            hessian += L2
        else:
            hessian = A[:, coordinate].T.dot(
                np.diag(sig * (1 - sig)).dot(A[:, coordinate]))
            hessian += L2
        return hessian

    elif function == "lipschitz":
        lipschitz_values = 0.25 * np.sum(A ** 2, axis=0) + L2
        return lipschitz_values
def _emission_log_probs_params(self, emission_params, X):
    '''
    Computes log of emission probabilities
    '''
    success = emission_params['success_prob']
    fail = emission_params['fail_prob']
    log_total = psi(success + fail)
    log_success = psi(success) - log_total
    log_fail = psi(fail) - log_total
    return (safe_sparse_dot(X, log_success.T) +
            safe_sparse_dot(np.ones(X.shape) - X, log_fail.T))
def fit(self, X, y):
    """
    Learn the idf vector (global term weights).

    :param X: sparse matrix, [n_samples, n_features]
              X must be a matrix of term counts
    :param y: class labels, [n_samples]
    :return: self; the learned idf matrix has shape [n_classes, n_features]
    """
    if self.use_idf:
        labelbin = LabelBinarizer()
        # Binarize the class membership of each sample: [n_samples, n_classes]
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_

        # Number of documents per class: [n_classes]
        class_count_ = np.sum(Y, axis=0)
        class_size = class_count_.shape[0]
        # Number of samples containing each feature, per class: [n_classes, n_features]
        class_df_ = vectorize.class_df(X, Y)
        # Total term count per class: [n_classes]
        self.class_freq_ = np.sum(safe_sparse_dot(Y.T, X), axis=1)
        # Number of classes in which each feature occurs: [n_features]
        feature_count_ = np.sum(vectorize.tobool(class_df_), axis=0)

        # When the class of a feature is uncertain or unknown, fall back to
        # the total number of samples in which the feature occurs.
        unknow_class_count_ = np.array([np.sum(class_count_, axis=0)])
        class_count_ = np.concatenate((class_count_, unknow_class_count_))
        unknow_class_df_ = np.sum(class_df_, axis=0).reshape(1, -1)
        class_df_ = np.concatenate((class_df_, unknow_class_df_), axis=0)
        unknow_class_freq_ = np.array([np.sum(self.class_freq_, axis=0)])
        self.class_freq_ = np.concatenate((self.class_freq_, unknow_class_freq_))
        self.classes_ = np.concatenate((self.classes_, np.array(["unknow"])), axis=0)

        # Smooth class_count_, class_df_ and feature_count_
        class_count_ += int(self.smooth_idf)
        class_df_ += int(self.smooth_idf)
        feature_count_ += int(self.smooth_idf)

        _, n_features = X.shape
        # [n_classes, n_features]
        first_part = np.log(np.divide(class_count_.reshape(-1, 1), class_df_)) + 1.0
        # [n_features]
        second_part = np.log(class_size / feature_count_) + 1.0
        second_part_diag = sp.spdiags(second_part, diags=0,
                                      m=n_features, n=n_features)
        self._idf = safe_sparse_dot(first_part, second_part_diag)

    return self
def predict(self, X_left, X_right):
    y_pred = _bilinear_forward(self.U_, self.V_, X_left, X_right)

    if self.fit_linear:
        y_pred += safe_sparse_dot(X_left, self.w_left_)
        y_pred += safe_sparse_dot(X_right, self.w_right_)

    if self.fit_diag:
        y_pred += safe_sparse_dot(safe_sparse_mul(X_left, X_right), self.diag_)

    return y_pred
def _free_energy(self, v):
    """Computes the free energy F(v) = - log sum_h exp(-E(v,h)).

    Parameters
    ----------
    v : array-like, shape (n_samples, n_features)
        Values of the visible layer.

    Returns
    -------
    free_energy : array-like, shape (n_samples,)
        The value of the free energy.
    """
    return (- safe_sparse_dot(v, self.intercept_visible_)
            - np.logaddexp(0, safe_sparse_dot(v, self.components_.T)
                           + self.intercept_hidden_).sum(axis=1))
def add_fit(self, X):
    n_samples = X.shape[0]

    # old
    first = safe_sparse_dot(self.hidden_activations_.T, self.hidden_activations_)
    M = pinv2(first + 1 * np.identity(first.shape[0]))
    beta = self.coef_output_

    # new
    H = self._get_hidden_activations(X)

    # update
    first = pinv2(1 * np.identity(n_samples) +
                  safe_sparse_dot(safe_sparse_dot(H, M), H.T))
    second = safe_sparse_dot(
        safe_sparse_dot(safe_sparse_dot(safe_sparse_dot(M, H.T), first), H), M)
    M = M - second
    self.coef_output_ = beta + safe_sparse_dot(
        safe_sparse_dot(M, H.T), (X - safe_sparse_dot(H, beta)))
def least_square_gradient(X, y, theta, alpha=0, y_pred=None, coordinate=None):
    """Compute the gradient for each feature."""
    if y_pred is None:
        y_pred = safe_sparse_dot(X, theta)
    loss = y_pred - y
    if coordinate is None:
        grad = safe_sparse_dot(X.T, loss)
        grad += alpha * theta
    else:
        grad = safe_sparse_dot(X[:, coordinate], loss)
        grad += alpha * theta[coordinate]
    return grad
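# A small, self-contained sanity check (an illustration, not part of the original
# code) comparing the gradient formula above against a central finite difference
# of the ridge-regularized loss 0.5 * ||X @ theta - y||^2 + 0.5 * alpha * ||theta||^2.
import numpy as np
from sklearn.utils.extmath import safe_sparse_dot

def _ls_loss(X, y, theta, alpha):
    r = X @ theta - y
    return 0.5 * r @ r + 0.5 * alpha * theta @ theta

rng = np.random.RandomState(0)
X = rng.randn(20, 5)
y = rng.randn(20)
theta = rng.randn(5)
alpha, eps = 0.1, 1e-6

grad = safe_sparse_dot(X.T, X @ theta - y) + alpha * theta  # same formula as above
num_grad = np.array([
    (_ls_loss(X, y, theta + eps * e, alpha) -
     _ls_loss(X, y, theta - eps * e, alpha)) / (2 * eps)
    for e in np.eye(5)
])
assert np.allclose(grad, num_grad, atol=1e-4)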
def instance_proba(self, X):
    """Calculates the probability of each instance in X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]

    Returns
    -------
    array-like, shape = [n_samples]
    """
    feat_prob = safe_sparse_dot(np.exp(self.class_log_prior_),
                                np.exp(self.feature_log_prob_)).T
    instance_log_prob = safe_sparse_dot(X, np.log(feat_prob))
    return np.exp(instance_log_prob)
def decision_function(self, X):
    """Predict confidence scores for samples.

    The confidence score for a sample is the signed distance of that
    sample to the hyperplane.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = (n_samples, n_features)
        Samples.

    Returns
    -------
    array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
        Confidence scores per (sample, class) combination. In the binary
        case, confidence score for self.classes_[1] where >0 means this
        class would be predicted.
    """
    # handle regression (least-squares loss)
    if not self.is_classif:
        return LinearModel.decision_function(self, X)

    X = atleast2d_or_csr(X)
    n_features = self.coef_.shape[1]
    if X.shape[1] != n_features:
        raise ValueError("X has %d features per sample; expecting %d"
                         % (X.shape[1], n_features))

    scores = safe_sparse_dot(X, self.coef_.T,
                             dense_output=True) + self.intercept_
    return scores.ravel() if scores.shape[1] == 1 else scores
def chi2_contingency_matrix(X_train, y_train):
    X = X_train.copy()
    X.data = np.ones_like(X.data)
    X = check_array(X, accept_sparse='csr')
    if np.any((X.data if issparse(X) else X) < 0):
        raise ValueError("Input X must be non-negative.")

    Y = LabelBinarizer().fit_transform(y_train)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    observed = safe_sparse_dot(Y.T, X)  # n_classes * n_features

    # feature_count = check_array(X.sum(axis=0))
    # class_prob = check_array(Y.mean(axis=0))
    feature_count = X.sum(axis=0).reshape(1, -1)
    class_prob = Y.mean(axis=0).reshape(1, -1)
    expected = np.dot(class_prob.T, feature_count)

    observed = np.asarray(observed, dtype=np.float64)
    k = len(observed)

    # Reuse observed for chi-squared statistics
    contingency_matrix = observed
    contingency_matrix -= expected
    contingency_matrix **= 2
    expected[expected == 0.0] = 1.0
    contingency_matrix /= expected
    # weights = contingency_matrix.max(axis=0)
    return contingency_matrix
def compute_distances(self, x1, x2):
    """
    The method imputes the missing values as means and calls
    safe_sparse_dot. Imputation simplifies computation at the cost of a
    (theoretically) slightly wrong distance between pairs of missing values.
    """
    def prepare_data(x):
        if self.discrete.any():
            data = Cosine.discrete_to_indicators(x, self.discrete)
        else:
            data = x.copy()
        for col, mean in enumerate(self.means):
            column = data[:, col]
            column[np.isnan(column)] = mean
        if self.axis == 0:
            data = data.T
        data /= row_norms(data)[:, np.newaxis]
        return data

    data1 = prepare_data(x1)
    data2 = data1 if x2 is None else prepare_data(x2)
    dist = safe_sparse_dot(data1, data2.T)
    np.clip(dist, 0, 1, out=dist)
    if x2 is None:
        diag = np.diag_indices_from(dist)
        dist[diag] = np.where(np.isnan(dist[diag]), np.nan, 1.0)
    return 1 - dist
def inverse_transform(self, X, y=None):
    """Transform data back to its original space.

    Returns an array X_original whose transform would be X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_components)
        New data, where n_samples is the number of samples
        and n_components is the number of components.

    Returns
    -------
    X_original : array-like, shape (n_samples, n_features)

    Notes
    -----
    If whitening is enabled, inverse_transform does not compute the
    exact inverse operation of transform.
    """
    # XXX remove scipy.sparse support here in 0.16
    X_original = safe_sparse_dot(X, self.components_)
    if self.mean_ is not None:
        X_original = X_original + self.mean_
    return X_original
def get_bmu(normalized_Kohonen, y):
    """Returns the ID of the best matching unit.

    Best is determined from the cosine similarity of the sample with the
    normalized Kohonen network.
    See https://en.wikipedia.org/wiki/Cosine_similarity for cosine
    similarity documentation.

    TODO: make it possible to find the second best matching unit

    Parameters
    ----------
    normalized_Kohonen : sparse matrix
        Shape = [n_nodes, n_features]; must be normalized according to the
        l2 norm as used in the sklearn Normalizer()

    y : vector of dimension 1 x n_features
        Target sample.

    Returns
    -------
    tuple : (loc, cosine_distance)
        Index of the matching unit, with the corresponding cosine distance.
    """
    # The dot product of the vector with each node is computed
    sampleN = Normalizer().fit_transform(y)
    # similarity = normalized_Kohonen.dot(sampleN.T).toarray()
    similarity = safe_sparse_dot(normalized_Kohonen, sampleN.T)
    loc = np.argmax(similarity)
    return loc, similarity[loc]
def predict(self, X):
    """Perform regression on an array of test vectors X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]

    Returns
    -------
    p : array, shape = [n_samples]
        Predicted target values for X.
    """
    try:
        assert_all_finite(self.coef_)
        pred = safe_sparse_dot(X, self.coef_.T)
    except ValueError:
        n_samples = X.shape[0]
        n_vectors = self.coef_.shape[0]
        pred = np.zeros((n_samples, n_vectors))

    if not self.outputs_2d_:
        pred = pred.ravel()

    return pred
def _forward_pass(self, activations, with_output_activation=True):
    """Perform a forward pass on the network by computing the values of
    the neurons in the hidden layers and the output layer.

    Parameters
    ----------
    activations : list, length = n_layers - 1
        The ith element of the list holds the values of the ith layer.

    with_output_activation : bool, default True
        If True, the output passes through the output activation
        function, which is either the softmax function or the
        logistic function.
    """
    hidden_activation = ACTIVATIONS[self.activation]
    # Iterate over the hidden layers
    for i in range(self.n_layers_ - 1):
        activations[i + 1] = safe_sparse_dot(activations[i], self.coefs_[i])
        activations[i + 1] += self.intercepts_[i]

        # For the hidden layers
        if (i + 1) != (self.n_layers_ - 1):
            activations[i + 1] = hidden_activation(activations[i + 1])

    # For the last layer
    if with_output_activation:
        output_activation = ACTIVATIONS[self.out_activation_]
        activations[i + 1] = output_activation(activations[i + 1])

    return activations
def calculate_AW(ds, y, n_samples, n_classes, coefs_):
    AW = np.ones((n_samples, n_classes))
    for i in range(n_samples):
        for r in range(n_classes):
            Xi = ds.get_row(i)
            AW[i, r] -= safe_sparse_dot(Xi, coefs_[:, y[i]] - coefs_[:, r])
    return AW
def _joint_log_likelihood(self, X):
    """Calculate the posterior log probability of the samples X."""
    X = atleast2d_or_csr(X)

    neg_prob = np.log(1 - np.exp(self.feature_log_prob_))
    jll = safe_sparse_dot(X, (self.feature_log_prob_ - neg_prob).T)
    jll += self.class_log_prior_ + neg_prob.sum(axis=1)

    return jll
def test_epoch():
    U = rng.randn(*true_U.shape)
    U2 = U.copy()

    viol, lv = _bilinear_cd(U, true_V, X_left, X_right, y, 1.0)

    dataset = get_dataset(X_left, 'fortran')
    # precomputing for cython
    y_pred = _bilinear_forward(U2, true_V, X_left, X_right)
    XrV = safe_sparse_dot(X_right, true_V)
    VtGsq = safe_sparse_dot(XrV.T ** 2, X_left ** 2)
    v2 = _cd_bilinear_epoch(U2, dataset, XrV, y, y_pred, VtGsq, 1.0)

    assert_almost_equal(viol, v2)
    assert_array_almost_equal(U, U2)
def compute_distances(self, x1, x2=None):
    """
    The method
    - extracts normalized continuous attributes and then uses `row_norms`
      and `safe_sparse_dot` to compute the distance as
      x^2 - 2xy + y^2 (the trick from sklearn);
    - calls a function in Cython that adds the contributions of discrete
      columns
    """
    if self.normalize:
        x1 = x1 - self.means
        x1 /= np.sqrt(2 * self.vars)

    # adapted from sklearn.metric.euclidean_distances
    xx = row_norms(x1.T, squared=True)[:, np.newaxis]
    distances = safe_sparse_dot(x1.T, x1, dense_output=True)
    distances *= -2
    distances += xx
    distances += xx.T
    with np.errstate(invalid="ignore"):  # Nans are fixed below
        np.maximum(distances, 0, out=distances)
    distances.flat[::distances.shape[0] + 1] = 0.0

    fixer = _distance.fix_euclidean_cols_normalized if self.normalize \
        else _distance.fix_euclidean_cols
    fixer(distances, x1, self.means, self.vars)
    return np.sqrt(distances)
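# A self-contained illustration (not part of the original class) of the squared
# distance expansion used above, ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2,
# computed with row_norms and safe_sparse_dot and checked against scipy's cdist.
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.utils.extmath import row_norms, safe_sparse_dot

rng = np.random.RandomState(0)
A = rng.randn(6, 4)

sq_norms = row_norms(A, squared=True)[:, np.newaxis]
distances = safe_sparse_dot(A, A.T, dense_output=True)
distances *= -2
distances += sq_norms
distances += sq_norms.T
np.maximum(distances, 0, out=distances)          # guard against tiny negatives
distances.flat[::distances.shape[0] + 1] = 0.0   # exact zeros on the diagonal

assert np.allclose(np.sqrt(distances), cdist(A, A), atol=1e-8)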
def _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle,
                               random_state):
    """Helper function for _fit_coordinate_descent.

    Update W to minimize the objective function, iterating once over all
    coordinates. By symmetry, to update H, one can call
    _update_coordinate_descent(X.T, Ht, W, ...)
    """
    n_components = Ht.shape[1]

    HHt = fast_dot(Ht.T, Ht)
    XHt = safe_sparse_dot(X, Ht)

    # L2 regularization corresponds to increase of the diagonal of HHt
    if l2_reg != 0.:
        # adds l2_reg only on the diagonal
        HHt.flat[::n_components + 1] += l2_reg
    # L1 regularization corresponds to decrease of each element of XHt
    if l1_reg != 0.:
        XHt -= l1_reg

    if shuffle:
        permutation = random_state.permutation(n_components)
    else:
        permutation = np.arange(n_components)
    # The following seems to be required on 64-bit Windows w/ Python 3.5.
    permutation = np.asarray(permutation, dtype=np.intp)
    return _update_cdnmf_fast(W, HHt, XHt, permutation)
def _count(self, X, Y):
    """Count and smooth feature occurrences."""
    if self.binarize is not None:
        X = binarize(X, threshold=self.binarize)
    # Y is n_samples by n_classes; each row is a one-hot encoded label.
    # self.feature_count_[i][j] = count of feature j in class i
    self.feature_count_ += safe_sparse_dot(Y.T, X)  # n_classes by n_features
    self.class_count_ += Y.sum(axis=0)  # 1 by n_classes
def _update_coordinate_descent(X, W, Ht, shuffle, random_state):
    n_components = Ht.shape[1]

    HHt = np.dot(Ht.T, Ht)
    XHt = safe_sparse_dot(X, Ht)

    if shuffle:
        permutation = random_state.permutation(n_components)
    else:
        permutation = np.arange(n_components)
    # The following seems to be required on 64-bit Windows w/ Python 3.5.
    permutation = np.asarray(permutation, dtype=np.intp)
    return cdnmf_fast._update_cdnmf_fast2(W, HHt, XHt, permutation)
def transform(self, X):
    """Computes the extracted features.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    Returns
    -------
    h : array-like, shape (n_samples, n_components)
    """
    return self.activation_func(safe_sparse_dot(X, self.coef_hidden_) +
                                self.intercept_hidden_)
def _fit_regression(self, y):
    """
    Fit regression using internal linear regression
    or supplied regressor
    """
    if self.regressor is None:
        # pinv2 computes the generalized (Moore-Penrose) inverse and
        # safe_sparse_dot takes the dot product: beta = pinv(H) . y
        self.coefs_ = safe_sparse_dot(pinv2(self.hidden_activations_), y)
    else:
        self.regressor.fit(self.hidden_activations_, y)

    self.fitted_ = True
def tf_to_cooccurrence(X, min_count=1, batch_size=10000):
    """
    Arguments
    ---------
    X : scipy.sparse
        Shape = (n_docs, n_terms)
    min_count : int
        Minimum co-occurrence count. Default is 1
    batch_size : int
        The number of words in a batch. Default is 10000

    Returns
    -------
    C : scipy.sparse.csr_matrix
        Co-occurrence matrix
    """
    XT = X.T
    n_terms = X.shape[1]

    if batch_size == -1:
        C = safe_sparse_dot(XT, X)
        if min_count > 1:
            C = larger_than(C, min_count)
    else:
        stacks = []
        n_batch = math.ceil(n_terms / batch_size)
        for i in range(n_batch):
            b = i * batch_size
            e = min(n_terms, (i + 1) * batch_size)
            C = safe_sparse_dot(XT[b:e], X)
            if min_count > 1:
                C = larger_than(C, min_count)
            stacks.append(C)
        C = vstack(stacks)

    if not isinstance(C, csr_matrix):
        C = C.tocsr()
    return C
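# A minimal usage sketch (hypothetical toy data, not from the original project):
# for a document-term count matrix X, safe_sparse_dot(X.T, X) yields the
# term-term co-occurrence counts that tf_to_cooccurrence assembles in batches.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.utils.extmath import safe_sparse_dot

X = csr_matrix(np.array([[1, 1, 0],
                         [0, 2, 1],
                         [1, 0, 1]]))      # 3 documents, 3 terms
C = safe_sparse_dot(X.T, X)                # shape (n_terms, n_terms), stays sparse
print(C.toarray())
# Diagonal entries are per-term squared counts; off-diagonal entries measure
# how strongly two terms co-occur across documents.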
def _mean_hiddens(self, v):
    """Computes the conditional probabilities P(h=1|v).

    Parameters
    ----------
    v : array-like, shape (n_samples, n_features)
        Values of the visible layer.

    Returns
    -------
    h : array-like, shape (n_samples, n_components)
        Corresponding mean field values for the hidden layer.
    """
    p = safe_sparse_dot(v, self.W.T) + self.h_bias
    return expit(p, out=p)
def predict_proba(self, X, y):
    features, A = X
    Y_multi = self.lbin.transform(y)

    betas = self.coef_[:, :-1]
    eta = self.coef_[:, -1]

    p = safe_sparse_dot(features, betas.T, dense_output=True)
    p += self.intercept_

    p_nonspatial = np.hstack((p, np.zeros((features.shape[0], 1))))
    p_nonspatial -= logsumexp(p_nonspatial, axis=1)[:, np.newaxis]
    p_nonspatial = np.exp(p_nonspatial, p_nonspatial)

    spatial = safe_sparse_dot(A, (Y_multi - p_nonspatial))[:, :-1]
    p += eta.T * np.array(spatial / A.sum(axis=1))

    p = np.hstack((p, np.zeros((features.shape[0], 1))))
    return softmax(p)
def decision_function(self, X):
    """Compute the decision function for samples in X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)

    Returns
    -------
    array, shape (n_samples,)
        Predicted target values per element in X.
    """
    X = atleast2d_or_csr(X)

    a_hidden = self.activation_func(safe_sparse_dot(X, self.coef_hidden_) +
                                    self.intercept_hidden_)
    output = safe_sparse_dot(a_hidden, self.coef_output_) + \
        self.intercept_output_
    if output.shape[1] == 1:
        output = output.ravel()

    return output
def hessian_trace(self, x):
    """Return a callable that returns matrix-vector products with the Hessian."""
    n_samples, n_features = self.A.shape
    if self.intercept:
        x_, c = x[:-1], x[-1]
    else:
        x_, c = x, 0.0

    z = special.expit(safe_sparse_dot(self.A, x_, dense_output=True).ravel() + c)

    # The mat-vec product of the Hessian
    d = z * (1 - z)
    if sparse.issparse(self.A):
        dX = safe_sparse_dot(
            sparse.dia_matrix((d, 0), shape=(n_samples, n_samples)), self.A
        )
    else:
        # Precompute as much as possible
        dX = d[:, np.newaxis] * self.A

    if self.intercept:
        # Calculate the double derivative with respect to intercept.
        # In the case of sparse matrices this returns a matrix object.
        dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))

    def _Hs(s):
        ret = np.empty_like(s)
        ret[:n_features] = self.A.T.dot(dX.dot(s[:n_features]))
        ret[:n_features] += self.alpha * s[:n_features]

        # For the fit intercept case.
        if self.intercept:
            ret[:n_features] += s[-1] * dd_intercept
            ret[-1] = dd_intercept.dot(s[:n_features])
            ret[-1] += d.sum() * s[-1]
        return ret / n_samples

    return _Hs
def grad_loss(X, Y, w, alpha=0):
    n_classes = Y.shape[1]
    n_features = X.shape[1]
    fit_intercept = (w.size == n_classes * (n_features + 1))
    grad = np.zeros((n_classes, n_features + bool(fit_intercept)),
                    dtype=X.dtype)
    loss, p, w = loss_function(X, Y, w, alpha)
    diff = (p - Y)
    grad[:, :n_features] = safe_sparse_dot(diff.T, X)
    grad[:, :n_features] += alpha * w
    if fit_intercept:
        grad[:, -1] = diff.sum(axis=0)
    return loss, grad.ravel(), p
def similar_documents(self, tfidf):
    if self.corpus_info is None:
        return None
    similarities = safe_sparse_dot(tfidf, self.corpus_info["tfidf"].T,
                                   dense_output=True).ravel()
    order = np.argsort(similarities)[::-1]
    order = order[similarities[order] > 0][:_MAX_SIMILAR_DOCS_RETURNED]
    ordered_simil = similarities[order]
    similar_docs = (self.corpus_info["metadata"].iloc[order]
                    .reset_index(drop=True))
    similar_docs["similarity"] = ordered_simil
    return similar_docs
def logistic_gradient(w, X, y_, l2, normalize=True):
    """
    Gradient of the logistic loss at point w with features X, labels y
    and l2 regularization. If labels are from {-1, 1}, they will be
    changed to {0, 1} internally.
    """
    y = (y_ + 1) / 2 if -1 in y_ else y_
    activation = scipy.special.expit(
        safe_sparse_dot(X, w, dense_output=True).ravel())
    grad = safe_sparse_add(X.T.dot(activation - y) / X.shape[0], l2 * w)
    grad = np.asarray(grad).ravel()
    if normalize:
        return grad
    return grad * len(y)
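# A hedged, self-contained check (illustration only) that the formula above is
# the gradient of the averaged logistic loss
#   (1/n) * sum_i [log(1 + exp(x_i.w)) - y_i * x_i.w] + 0.5 * l2 * ||w||^2
# for labels y in {0, 1}, compared against a central finite difference.
import numpy as np
import scipy.special

def logistic_loss(w, X, y, l2):
    z = X @ w
    return np.mean(np.logaddexp(0, z) - y * z) + 0.5 * l2 * w @ w

rng = np.random.RandomState(0)
X = rng.randn(50, 4)
y = (rng.rand(50) > 0.5).astype(float)
w = rng.randn(4)
l2, eps = 0.1, 1e-6

grad = X.T.dot(scipy.special.expit(X @ w) - y) / X.shape[0] + l2 * w
num_grad = np.array([
    (logistic_loss(w + eps * e, X, y, l2) -
     logistic_loss(w - eps * e, X, y, l2)) / (2 * eps)
    for e in np.eye(4)
])
assert np.allclose(grad, num_grad, atol=1e-5)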
def f_grad(self, x, return_gradient=True):
    if self.intercept:
        x_, c = x[:-1], x[-1]
    else:
        x_, c = x, 0.0
    z = safe_sparse_dot(self.A, x_, dense_output=True).ravel() + c
    loss = np.mean((1 - self.b) * z - self.logsig(z))
    penalty = safe_sparse_dot(x_.T, x_, dense_output=True).ravel()[0]
    loss += 0.5 * self.alpha * penalty

    if not return_gradient:
        return loss

    z0_b = self.expit_b(z, self.b)

    grad = safe_sparse_add(self.A.T.dot(z0_b) / self.A.shape[0],
                           self.alpha * x_)
    grad = np.asarray(grad).ravel()
    grad_c = z0_b.mean()
    if self.intercept:
        return loss, np.concatenate((grad, [grad_c]))

    return loss, grad
def joint_feature(self, x, y):
    if isinstance(y, DocLabel):
        Y_prop, Y_link, compat, second_order = self._marg_rounded(x, y)
    else:
        Y_prop, Y_link, compat, second_order = self._marg_fractional(x, y)

    prop_acc = safe_sparse_dot(Y_prop.T, x.X_prop)  # node_cls * node_feats
    link_acc = safe_sparse_dot(Y_link.T, x.X_link)  # link_cls * link_feats

    f_sec_ord = []
    if len(second_order):
        second_order = second_order.reshape(-1, len(x.second_order))
        if self.coparents:
            f_sec_ord.append(safe_sparse_dot(second_order[0], x.X_sec_ord))
            second_order = second_order[1:]
        if self.grandparents:
            f_sec_ord.append(safe_sparse_dot(second_order[0], x.X_sec_ord))
            second_order = second_order[1:]
        if self.siblings:
            f_sec_ord.append(safe_sparse_dot(second_order[0], x.X_sec_ord))
    elif self.n_second_order_factors_:
        # document has no second order factors so the joint feature
        # must be filled with zeros manually
        f_sec_ord = [
            np.zeros(self.n_second_order_features_)
            for _ in range(self.n_second_order_factors_)
        ]

    jf = np.concatenate(
        [prop_acc.ravel(), link_acc.ravel(), compat.ravel()] + f_sec_ord)

    return jf
def predict(self, X):
    """Predicts output y according to input X.

    Parameters
    ----------
    X : {ndarray, sparse matrix} of shape (n_samples, n_features)

    Returns
    -------
    Y : ndarray of shape (n_samples,) or (n_samples, n_targets)
    """
    if self._output_weights is None:
        raise NotFittedError(self)
    return safe_sparse_dot(self._preprocessing(X, partial_normalize=False),
                           self._output_weights)
def decision_function(self, X):
    """Predict confidence scores for samples."""
    n_features = self.coef_.shape[1]
    if X.shape[1] != n_features:
        raise ValueError("X has %d features per sample; expecting %d"
                         % (X.shape[1], n_features))

    scores = safe_sparse_dot(X, self.coef_.T,
                             dense_output=True) + self.intercept_
    return scores.ravel() if scores.shape[1] == 1 else scores
def _multinomial_loss_grad(w, X, Y, alpha, sample_weight):
    """Computes the multinomial loss, gradient and class probabilities.

    Parameters
    ----------
    w : ndarray, shape (n_classes * n_features,) or
        (n_classes * (n_features + 1),)
        Coefficient vector.

    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    Y : ndarray, shape (n_samples, n_classes)
        Transformed labels according to the output of LabelBinarizer.

    alpha : float
        Regularization parameter. alpha is equal to 1 / C.

    sample_weight : array-like, shape (n_samples,), optional
        Array of weights that are assigned to individual samples.

    Returns
    -------
    loss : float
        Multinomial loss.

    grad : ndarray, shape (n_classes * n_features,) or
        (n_classes * (n_features + 1),)
        Ravelled gradient of the multinomial loss.

    p : ndarray, shape (n_samples, n_classes)
        Estimated class probabilities

    Reference
    ---------
    Bishop, C. M. (2006). Pattern recognition and machine learning.
    Springer. (Chapter 4.3.4)
    """
    n_classes = Y.shape[1]
    n_features = X.shape[1]
    fit_intercept = (w.size == n_classes * (n_features + 1))
    grad = np.zeros((n_classes, n_features + bool(fit_intercept)))
    alpha = alpha.reshape(n_classes, -1)
    loss, p, w = _multinomial_loss(w, X, Y, alpha, sample_weight)
    sample_weight = sample_weight[:, np.newaxis]
    diff = sample_weight * (p - Y)
    grad[:, :n_features] = safe_sparse_dot(diff.T, X)
    grad[:, :n_features] += alpha * w
    if fit_intercept:
        grad[:, -1] = diff.sum(axis=0)
    return loss, grad.ravel(), p
def _logistic_loss_and_grad(w, X, y, alpha, mask, sample_weight=None):
    """Computes the logistic loss and gradient.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.

    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    y : ndarray, shape (n_samples,)
        Array of labels.

    alpha : float
        Regularization parameter. alpha is equal to 1 / C.

    mask : array-like, shape (n_features,) or (n_classes, n_features), optional
        Masking array for coef.

    sample_weight : array-like, shape (n_samples,), optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    out : float
        Logistic loss.

    grad : ndarray, shape (n_features,) or (n_features + 1,)
        Logistic gradient.
    """
    n_samples, n_features = X.shape
    if mask is not None:
        w[:n_features] *= mask
    grad = np.empty_like(w)

    w, c, yz = _intercept_dot(w, X, y)

    if sample_weight is None:
        sample_weight = np.ones(n_samples)

    # Logistic loss is the negative of the log of the logistic function.
    out = -np.sum(sample_weight * log_logistic(yz)) / n_samples
    out += .5 * alpha * np.dot(w, w)

    z = expit(yz)
    z0 = sample_weight * (z - 1) * y

    grad[:n_features] = (safe_sparse_dot(X.T, z0) / n_samples) + alpha * w
    if mask is not None:
        grad[:n_features] *= mask

    # Case where we fit the intercept.
    if grad.shape[0] > n_features:
        grad[-1] = z0.sum() / n_samples
    return out, grad
def uncertainty_selection(ent, label, modified_matrix, label_distributions_,
                          alpha, y_static, train_y, build_laplacian_graph,
                          origin_graph, neighbors):
    graph_matrix = build_laplacian_graph(modified_matrix)
    P = safe_sparse_dot(graph_matrix, label_distributions_)
    P = np.multiply(alpha, P) + y_static
    pre_ent = entropy(P.T + 1e-20)

    ind = ent > np.log(label.shape[1]) * 0.1
    ind[ent > (np.log(label.shape[1]) - 0.001)] = False
    modified_matrix[:, ind] = modified_matrix[:, ind] * 0

    graph_matrix = build_laplacian_graph(modified_matrix)
    P = safe_sparse_dot(graph_matrix, label_distributions_)
    P = np.multiply(alpha, P) + y_static
    next_ent = entropy(P.T + 1e-20)

    # postprocess for the case where some instances are not propagated to
    # num_point_to = modified_matrix.sum(axis=1).reshape(-1)
    # num_point_to = np.array(num_point_to).reshape(-1)
    # ids = np.array(range(len(num_point_to)))[num_point_to == 0]
    # for id in ids:
    #     point_to_idxs = origin_graph[:, id].nonzero()[0]
    #     dists = label_distributions_[point_to_idxs, :]
    #     labels = dists.argmax(axis=1)
    #     bins = np.bincount(labels)
    #     max_labels = bins.argmax()
    #     point_to_idxs = point_to_idxs[labels == max_labels]
    #     for p in point_to_idxs:
    #         modified_matrix[id, p] = 1
    #         a = 1
    modified_matrix = correct_unconnected_nodes(modified_matrix, train_y,
                                                neighbors)

    print("removed_num: {}, ent1: {}, ent2: {}, ent_gain: {}".format(
        ind.sum(), pre_ent.sum(), next_ent.sum(),
        pre_ent.sum() - next_ent.sum()))
    return modified_matrix
def fit(self, X, Y):
    """
    Fit the model with the training data
    :param X: Inputs
    :param Y: Labels associated to inputs
    :return: None
    """
    super().fit(X, Y)
    # https://github.com/vaquierm/RedditCommentTextClassification/issues/1

    subreddits = np.unique(Y)

    # fit the model
    self.parameters = {}
    total_per_class = []
    # parameter theta_k = number of comments of class k / total number of comments
    thetak = []
    alpha = 1

    # compute theta_k for each class
    for i in range(len(subreddits)):
        feature = subreddits[i]
        numbExamples = 0
        # loop through all the comments
        for j in range(len(Y)):
            if Y[j] == feature:
                numbExamples += 1
        total_per_class.append(float(numbExamples))
        thetak_i = float(numbExamples) / float(X.shape[0])
        thetak.append(thetak_i)

    binarizer = LabelBinarizer()
    Y = binarizer.fit_transform(Y)

    # parameter theta_kj using sparse matrices
    # add 1 for Laplace smoothing
    kj_numerator = safe_sparse_dot(Y.T, X) + alpha
    # kj_denominator == number of comments from that class
    total_per_class = np.array(total_per_class)
    # add 2 for Laplace smoothing
    kj_denominator = total_per_class.reshape(-1, 1) + 2 * alpha
    log_thetakj = np.log(kj_numerator) - np.log(kj_denominator)

    self.parameters.update({'parameter_k': thetak})
    self.parameters.update({'parameter_log_kj': log_thetakj})
def _multinomial_loss(w, X, Y, alpha, sample_weight):
    """Computes multinomial loss and class probabilities.

    Parameters
    ----------
    w : ndarray, shape (n_classes * n_features,) or
        (n_classes * (n_features + 1),)
        Coefficient vector.

    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    Y : ndarray, shape (n_samples, n_classes)
        Transformed labels according to the output of LabelBinarizer.

    alpha : float
        Regularization parameter. alpha is equal to 1 / C.

    sample_weight : ndarray, shape (n_samples,), optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    loss : float
        Multinomial loss.

    p : ndarray, shape (n_samples, n_classes)
        Estimated class probabilities.

    w : ndarray, shape (n_classes, n_features)
        Reshaped param vector excluding intercept terms.
    """
    n_classes = Y.shape[1]
    n_features = X.shape[1]
    fit_intercept = w.size == (n_classes * (n_features + 1))
    w = w.reshape(n_classes, -1)
    sample_weight = sample_weight[:, np.newaxis]
    if fit_intercept:
        intercept = w[:, -1]
        w = w[:, :-1]
    else:
        intercept = 0
    p = safe_sparse_dot(X, w.T)
    p += intercept
    p -= logsumexp(p, axis=1)[:, np.newaxis]
    loss = -(sample_weight * Y * p).sum()
    loss += 0.5 * alpha * squared_norm(w)
    p = np.exp(p, p)
    return loss, p, w
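# A small, self-contained sketch (illustrative data only) of the log-sum-exp
# normalization used above to turn linear scores into class probabilities:
# p = exp(scores - logsumexp(scores, axis=1)).
import numpy as np
from scipy.special import logsumexp

scores = np.array([[2.0, 1.0, 0.1],
                   [0.5, 0.5, 0.5]])          # (n_samples, n_classes) decision values
log_p = scores - logsumexp(scores, axis=1)[:, np.newaxis]
p = np.exp(log_p)
assert np.allclose(p.sum(axis=1), 1.0)        # each row is a valid probability vector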
def test_safe_sparse_dot_dense_output(dense_output):
    rng = np.random.RandomState(0)

    A = sparse.random(30, 10, density=0.1, random_state=rng)
    B = sparse.random(10, 20, density=0.1, random_state=rng)

    expected = A.dot(B)
    actual = safe_sparse_dot(A, B, dense_output=dense_output)

    assert sparse.issparse(actual) == (not dense_output)

    if dense_output:
        expected = expected.toarray()
    assert_allclose_dense_sparse(actual, expected)
def linear_kernel(X, Y=None):
    """
    Compute the linear kernel between X and Y.

    Read more in the :ref:`User Guide <linear_kernel>`.

    Parameters
    ----------
    X : array of shape (n_samples_1, n_features)

    Y : array of shape (n_samples_2, n_features)

    Returns
    -------
    Gram matrix : array of shape (n_samples_1, n_samples_2)
    """
    X, Y = check_pairwise_arrays(X, Y)
    return safe_sparse_dot(X, Y.T, dense_output=True)
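# A short usage sketch (toy data, not from the original module): the linear
# kernel is simply the Gram matrix of pairwise dot products, so it matches X @ Y.T.
import numpy as np
from sklearn.metrics.pairwise import linear_kernel

X = np.array([[1.0, 0.0], [0.0, 2.0]])
Y = np.array([[1.0, 1.0], [3.0, 0.0], [0.0, 1.0]])
K = linear_kernel(X, Y)            # shape (2, 3)
assert np.allclose(K, X @ Y.T)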
def predict(self, X):
    # complete implementation:
    # https://learnai1.home.blog/2019/11/16/perceptron-delta-rule-python-implementation/
    predicted_labels = []  # create empty list
    for x in X:
        activation = safe_sparse_dot(self.weights, x.transpose()) + self.bias  # w.x + b
        # predict labels from the sign of the activation
        if activation > 0:
            y_hat = 1
        else:
            y_hat = -1
        predicted_labels.append(y_hat)
    return predicted_labels
def _mean_visible(self, h):
    """Computes the conditional probabilities P(v=1|h).

    Parameters
    ----------
    h : array-like, shape (n_samples, n_components)
        Corresponding mean field values for the hidden layer.

    Returns
    -------
    v : array-like, shape (n_samples, n_features)
        Values of the visible layer.
    """
    # p = np.dot(h, self.W) + self.v_bias
    p = safe_sparse_dot(h, self.W) + self.v_bias
    return expit(p, out=p)
def _pass_through_recurrent_weights(self, X, y=None):
    hidden_layer_state = np.zeros(shape=(X.shape[0] + 1,
                                         self.hidden_layer_size))
    for sample in range(X.shape[0]):
        a = X[sample, :]
        b = safe_sparse_dot(hidden_layer_state[sample, :],
                            self._recurrent_weights) * self.spectral_radius
        c = self._bias_weights * self.bias_scaling
        pre_activation = a + b + c
        ACTIVATIONS[self.activation](pre_activation)
        hidden_layer_state[sample + 1, :] = pre_activation
        hidden_layer_state[sample + 1, :] = \
            (1 - self.leakage) * hidden_layer_state[sample, :] \
            + self.leakage * hidden_layer_state[sample + 1, :]
    return hidden_layer_state[1:, :]
def transform(self, X):
    check_is_fitted(self, ['mean_', 'components_'])

    check_array(X, accept_sparse=['csr', 'csc'])
    if hasattr(self, 'scaler_'):
        X = self.scaler_.transform(X, copy=self.copy)
    X_t = safe_sparse_dot(X, self.components_.T)
    if self.whiten:
        X_t /= np.sqrt(self.explained_variance_)
    return X_t
def forward(clf, X):
    '''
    @clf - pretrained MLPClassifier from sklearn
    @X - input vector
    @returns - list of layer activations
    '''
    hidden_layer_sizes = list(clf.hidden_layer_sizes)
    hidden_activation = ACTIVATIONS[clf.activation]
    activations = [X]
    for i in range(clf.n_layers_ - 1):
        activations.append(safe_sparse_dot(activations[i], clf.coefs_[i]))
        activations[i + 1] += clf.intercepts_[i]
        if (i + 1) != (clf.n_layers_ - 1):
            # sklearn's ACTIVATIONS functions modify the array in place
            hidden_activation(activations[i + 1])
    return activations
def transform(x_original):
    # x_original = preprocessing.scale(x_original)
    # min_max_scaler = preprocessing.MinMaxScaler()
    # x_original = np.sqrt(x_original)
    # w_feature = RBFSampler(gamma=0.0001, n_components=1000, random_state=1)
    # w_feature = Nystroem(kernel='rbf', gamma=1.0, n_components=400, random_state=1)
    # x_original = preprocessing.normalize(x_original, norm='l1')
    # w_feature = AdditiveChi2Sampler(sample_steps=6, sample_interval=0.29)
    projection = safe_sparse_dot(x_original, random_weights)
    projection += random_offset
    np.cos(projection, projection)
    projection *= np.sqrt(2.) / np.sqrt(n_components)
    # x = w_feature.fit_transform(x_original)
    return projection
def _fit_neumann(self, X, y=None):
    super().fit(X, y=None)

    s = np.sort(BatchIntrinsicPlasticity._node_inputs(
        X, self._input_weights, self.input_scaling,
        self._bias_weights, self.bias_scaling), axis=0)
    phi = np.transpose(np.stack((s, np.ones(s.shape)), axis=2),
                       axes=(1, 0, 2))

    if callable(BatchIntrinsicPlasticity.OUT_DISTRIBUTION[self.distribution]):
        t = BatchIntrinsicPlasticity.OUT_DISTRIBUTION[self.distribution](
            size=X.shape[0])
        t_min, t_max = np.min(t), np.max(t)

        if self.distribution in {'uniform'} and self.input_activation in {
                'tanh', 'logistic'}:
            bound_low = ACTIVATIONS_INVERSE_BOUNDS[self.input_activation][0] * .5
            bound_high = ACTIVATIONS_INVERSE_BOUNDS[self.input_activation][1] * .5
        else:
            bound_low = ACTIVATIONS_INVERSE_BOUNDS[self.input_activation][0]
            bound_high = ACTIVATIONS_INVERSE_BOUNDS[self.input_activation][1]

        if bound_low == -np.inf:
            bound_low = t_min
        if bound_high == np.inf:
            bound_high = t_max

        t = (t - t_min) * (bound_high - bound_low) / (t_max - t_min) + bound_low
        t.sort()
        ACTIVATIONS_INVERSE[self.input_activation](t)
    else:
        raise ValueError('Not a valid activation inverse, got {0}'.format(
            self.distribution))

    v = safe_sparse_dot(np.linalg.pinv(phi), t)

    np.multiply(self._input_weights, v[:, 0], out=self._input_weights)
    self._bias_weights += v[:, 1]
    return self