def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose): """Compute joint probabilities p_ij from distances using just nearest neighbors. This method is approximately equal to _joint_probabilities. The latter is O(N), but limiting the joint probability to nearest neighbors improves this substantially to O(uN). Parameters ---------- distances : array, shape (n_samples * (n_samples-1) / 2,) Distances of samples are stored as condensed matrices, i.e. we omit the diagonal and duplicate entries and store everything in a one-dimensional array. desired_perplexity : float Desired perplexity of the joint probability distributions. verbose : int Verbosity level. Returns ------- P : array, shape (n_samples * (n_samples-1) / 2,) Condensed joint probability matrix. """ # Compute conditional probabilities such that they approximately match # the desired perplexity distances = astype(distances, np.float32, copy=False) neighbors = astype(neighbors, np.int64, copy=False) conditional_P = _utils._binary_search_perplexity( distances, neighbors, desired_perplexity, verbose) m = "All probabilities should be finite" assert np.all(np.isfinite(conditional_P)), m P = conditional_P + conditional_P.T sum_P = np.maximum(np.sum(P), MACHINE_EPSILON) P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON) assert np.all(np.abs(P) <= 1.0) return P
def _kl_divergence_bh(params, P, neighbors, degrees_of_freedom, n_samples, n_components, angle=0.5, skip_num_points=0, verbose=False): """t-SNE objective function: KL divergence of p_ijs and q_ijs. Uses Barnes-Hut tree methods to calculate the gradient that runs in O(NlogN) instead of O(N^2) Parameters ---------- params : array, shape (n_params,) Unraveled embedding. P : array, shape (n_samples * (n_samples-1) / 2,) Condensed joint probability matrix. neighbors: int64 array, shape (n_samples, K) Array with element [i, j] giving the index for the jth closest neighbor to point i. degrees_of_freedom : float Degrees of freedom of the Student's-t distribution. n_samples : int Number of samples. n_components : int Dimension of the embedded space. angle : float (default: 0.5) This is the trade-off between speed and accuracy for Barnes-Hut T-SNE. 'angle' is the angular size (referred to as theta in [3]) of a distant node as measured from a point. If this size is below 'angle' then it is used as a summary node of all points contained within it. This method is not very sensitive to changes in this parameter in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing computation time and angle greater 0.8 has quickly increasing error. skip_num_points : int (optional, default:0) This does not compute the gradient for points with indices below `skip_num_points`. This is useful when computing transforms of new data where you'd like to keep the old data fixed. verbose : int Verbosity level. Returns ------- kl_divergence : float Kullback-Leibler divergence of p_ij and q_ij. grad : array, shape (n_params,) Unraveled gradient of the Kullback-Leibler divergence with respect to the embedding. """ params = astype(params, np.float32, copy=False) X_embedded = params.reshape(n_samples, n_components) neighbors = astype(neighbors, np.int64, copy=False) if len(P.shape) == 1: sP = squareform(P).astype(np.float32) else: sP = P.astype(np.float32) grad = np.zeros(X_embedded.shape, dtype=np.float32) error = _barnes_hut_tsne.gradient(sP, X_embedded, neighbors, grad, angle, n_components, verbose, dof=degrees_of_freedom) c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom grad = grad.ravel() grad *= c return error, grad
def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose): """Compute joint probabilities p_ij from distances using just nearest neighbors. This method is approximately equal to _joint_probabilities. The latter is O(N), but limiting the joint probability to nearest neighbors improves this substantially to O(uN). Parameters ---------- distances : array, shape (n_samples * (n_samples-1) / 2,) Distances of samples are stored as condensed matrices, i.e. we omit the diagonal and duplicate entries and store everything in a one-dimensional array. desired_perplexity : float Desired perplexity of the joint probability distributions. verbose : int Verbosity level. Returns ------- P : array, shape (n_samples * (n_samples-1) / 2,) Condensed joint probability matrix. """ # Compute conditional probabilities such that they approximately match # the desired perplexity distances = astype(distances, np.float32, copy=False) neighbors = astype(neighbors, np.int64, copy=False) conditional_P = _utils._binary_search_perplexity(distances, neighbors, desired_perplexity, verbose) m = "All probabilities should be finite" assert np.all(np.isfinite(conditional_P)), m P = conditional_P + conditional_P.T sum_P = np.maximum(np.sum(P), MACHINE_EPSILON) P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON) assert np.all(np.abs(P) <= 1.0) return P
def _joint_probabilities_nn(distances, neighbors, labels, label_importance, rep_sample, desired_perplexity, verbose): """Compute joint probabilities p_ij from distances using just nearest neighbors. This method is approximately equal to _joint_probabilities. The latter is O(N), but limiting the joint probability to nearest neighbors improves this substantially to O(uN). Parameters ---------- distances : array, shape (n_samples * (n_samples-1) / 2,) Distances of samples are stored as condensed matrices, i.e. we omit the diagonal and duplicate entries and store everything in a one-dimensional array. labels : array, shape (n_samples,) An integer labelling of each sample, with unknown samples given the label -1. label_importance: float How much to deviate from a uniform prior via the label classes. rep_samples: boolean Whether the partial labelling is a representative sample of the full (and unknown) labelling. desired_perplexity : float Desired perplexity of the joint probability distributions. verbose : int Verbosity level. Returns ------- P : array, shape (n_samples * (n_samples-1) / 2,) Condensed joint probability matrix. """ # Compute conditional probabilities such that they approximately match # the desired perplexity distances = astype(distances, np.float32, copy=False) labels = astype(labels, np.int64, copy=False) neighbors = astype(neighbors, np.int64, copy=False) conditional_P = _utils._binary_search_perplexity(distances, neighbors, labels, label_importance, rep_sample, desired_perplexity, verbose) m = "All probabilities should be finite" assert np.all(np.isfinite(conditional_P)), m P = conditional_P + conditional_P.T sum_P = np.maximum(np.sum(P), MACHINE_EPSILON) P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON) assert np.all(np.abs(P) <= 1.0) return P
def _joint_probabilities_nn(distances, neighbors, labels, label_importance, rep_sample, desired_perplexity, verbose): """Compute joint probabilities p_ij from distances using just nearest neighbors. This method is approximately equal to _joint_probabilities. The latter is O(N), but limiting the joint probability to nearest neighbors improves this substantially to O(uN). Parameters ---------- distances : array, shape (n_samples * (n_samples-1) / 2,) Distances of samples are stored as condensed matrices, i.e. we omit the diagonal and duplicate entries and store everything in a one-dimensional array. labels : array, shape (n_samples,) An integer labelling of each sample, with unknown samples given the label -1. label_importance: float How much to deviate from a uniform prior via the label classes. rep_samples: boolean Whether the partial labelling is a representative sample of the full (and unknown) labelling. desired_perplexity : float Desired perplexity of the joint probability distributions. verbose : int Verbosity level. Returns ------- P : array, shape (n_samples * (n_samples-1) / 2,) Condensed joint probability matrix. """ # Compute conditional probabilities such that they approximately match # the desired perplexity distances = astype(distances, np.float32, copy=False) labels = astype(labels, np.int64, copy=False) neighbors = astype(neighbors, np.int64, copy=False) conditional_P = _utils._binary_search_perplexity( distances, neighbors, labels, label_importance, rep_sample, desired_perplexity, verbose) m = "All probabilities should be finite" assert np.all(np.isfinite(conditional_P)), m P = conditional_P + conditional_P.T sum_P = np.maximum(np.sum(P), MACHINE_EPSILON) P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON) assert np.all(np.abs(P) <= 1.0) return P
def _joint_probabilities(distances, desired_perplexity, verbose): """Compute joint probabilities p_ij from distances. Parameters ---------- distances : array, shape (n_samples * (n_samples-1) / 2,) Distances of samples are stored as condensed matrices, i.e. we omit the diagonal and duplicate entries and store everything in a one-dimensional array. desired_perplexity : float Desired perplexity of the joint probability distributions. verbose : int Verbosity level. Returns ------- P : array, shape (n_samples * (n_samples-1) / 2,) Condensed joint probability matrix. """ # Compute conditional probabilities such that they approximately match # the desired perplexity distances = astype(distances, np.float32, copy=False) conditional_P = _utils._binary_search_perplexity(distances, None, desired_perplexity, verbose) P = conditional_P + conditional_P.T sum_P = np.maximum(np.sum(P), MACHINE_EPSILON) P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON) return P
def _joint_probabilities(distances, desired_perplexity, verbose): """Compute joint probabilities p_ij from distances. Parameters ---------- distances : array, shape (n_samples * (n_samples-1) / 2,) Distances of samples are stored as condensed matrices, i.e. we omit the diagonal and duplicate entries and store everything in a one-dimensional array. desired_perplexity : float Desired perplexity of the joint probability distributions. verbose : int Verbosity level. Returns ------- P : array, shape (n_samples * (n_samples-1) / 2,) Condensed joint probability matrix. """ # Compute conditional probabilities such that they approximately match # the desired perplexity distances = astype(distances, np.float32, copy=False) conditional_P = _utils._binary_search_perplexity( distances, None, desired_perplexity, verbose) P = conditional_P + conditional_P.T sum_P = np.maximum(np.sum(P), MACHINE_EPSILON) P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON) return P
def _translate_train_sizes(train_sizes, n_max_training_samples): """Determine absolute sizes of training subsets and validate 'train_sizes'. Examples: _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] _translate_train_sizes([5, 10], 10) -> [5, 10] Parameters ---------- train_sizes : array-like, shape (n_ticks,), dtype float or int Numbers of training examples that will be used to generate the learning curve. If the dtype is float, it is regarded as a fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. n_max_training_samples : int Maximum number of training samples (upper bound of 'train_sizes'). Returns ------- train_sizes_abs : array, shape (n_unique_ticks,), dtype int Numbers of training examples that will be used to generate the learning curve. Note that the number of ticks might be less than n_ticks because duplicate entries will be removed. """ train_sizes_abs = np.asarray(train_sizes) n_ticks = train_sizes_abs.shape[0] n_min_required_samples = np.min(train_sizes_abs) n_max_required_samples = np.max(train_sizes_abs) if np.issubdtype(train_sizes_abs.dtype, np.float): if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: raise ValueError("train_sizes has been interpreted as fractions " "of the maximum number of training samples and " "must be within (0, 1], but is within [%f, %f]." % (n_min_required_samples, n_max_required_samples)) train_sizes_abs = astype(train_sizes_abs * n_max_training_samples, dtype=np.int, copy=False) train_sizes_abs = np.clip(train_sizes_abs, 1, n_max_training_samples) else: if (n_min_required_samples <= 0 or n_max_required_samples > n_max_training_samples): raise ValueError("train_sizes has been interpreted as absolute " "numbers of training samples and must be within " "(0, %d], but is within [%d, %d]." % (n_max_training_samples, n_min_required_samples, n_max_required_samples)) train_sizes_abs = np.unique(train_sizes_abs) if n_ticks > train_sizes_abs.shape[0]: warnings.warn("Removed duplicate entries from 'train_sizes'. Number " "of ticks will be less than than the size of " "'train_sizes' %d instead of %d)." % (train_sizes_abs.shape[0], n_ticks), RuntimeWarning) return train_sizes_abs
def _translate_train_sizes(train_sizes, n_max_training_samples): """Determine absolute sizes of training subsets and validate 'train_sizes'. Examples: _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] _translate_train_sizes([5, 10], 10) -> [5, 10] Parameters ---------- train_sizes : array-like, shape (n_ticks,), dtype float or int Numbers of training examples that will be used to generate the learning curve. If the dtype is float, it is regarded as a fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. n_max_training_samples : int Maximum number of training samples (upper bound of 'train_sizes'). Returns ------- train_sizes_abs : array, shape (n_unique_ticks,), dtype int Numbers of training examples that will be used to generate the learning curve. Note that the number of ticks might be less than n_ticks because duplicate entries will be removed. """ train_sizes_abs = np.asarray(train_sizes) n_ticks = train_sizes_abs.shape[0] n_min_required_samples = np.min(train_sizes_abs) n_max_required_samples = np.max(train_sizes_abs) if np.issubdtype(train_sizes_abs.dtype, np.float): if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: raise ValueError("train_sizes has been interpreted as fractions " "of the maximum number of training samples and " "must be within (0, 1], but is within [%f, %f]." % (n_min_required_samples, n_max_required_samples)) train_sizes_abs = astype(train_sizes_abs * n_max_training_samples, dtype=np.int, copy=False) train_sizes_abs = np.clip(train_sizes_abs, 1, n_max_training_samples) else: if (n_min_required_samples <= 0 or n_max_required_samples > n_max_training_samples): raise ValueError("train_sizes has been interpreted as absolute " "numbers of training samples and must be within " "(0, %d], but is within [%d, %d]." % (n_max_training_samples, n_min_required_samples, n_max_required_samples)) train_sizes_abs = np.unique(train_sizes_abs) if n_ticks > train_sizes_abs.shape[0]: warnings.warn( "Removed duplicate entries from 'train_sizes'. Number " "of ticks will be less than than the size of " "'train_sizes' %d instead of %d)." % (train_sizes_abs.shape[0], n_ticks), RuntimeWarning) return train_sizes_abs
def test_astype_copy_memory(): a_int32 = np.ones(3, np.int32) # Check that dtype conversion works b_float32 = astype(a_int32, dtype=np.float32, copy=False) assert_equal(b_float32.dtype, np.float32) # Changing dtype forces a copy even if copy=False assert_false(np.may_share_memory(b_float32, a_int32)) # Check that copy can be skipped if requested dtype match c_int32 = astype(a_int32, dtype=np.int32, copy=False) assert_true(c_int32 is a_int32) # Check that copy can be forced, and is the case by default: d_int32 = astype(a_int32, dtype=np.int32, copy=True) assert_false(np.may_share_memory(d_int32, a_int32)) e_int32 = astype(a_int32, dtype=np.int32) assert_false(np.may_share_memory(e_int32, a_int32))
def dbscan(X, eps=0.5, minpts=5, metric='minkowski', algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=1): X = check_array(X, accept_sparse='csr') if sample_weight is not None: sample_weight = np.asarray(sample_weight) check_consistent_length(X, sample_weight) if metric == 'precomputed' and sparse.issparse(X): neighborhoods = np.empty(X.shape[0], dtype=object) X.sum_duplicates() # XXX: modifies X's internals in-place X_mask = X.data <= eps masked_indices = astype(X.indices, np.intp, copy=False)[X_mask] masked_indptr = np.cumsum(X_mask)[X.indptr[1:] - 1] # insert the diagonal: a point is its own neighbor, but 0 distance # means absence from sparse matrix data masked_indices = np.insert(masked_indices, masked_indptr, np.arange(X.shape[0])) masked_indptr = masked_indptr[:-1] + np.arange(1, X.shape[0]) # split into rows neighborhoods[:] = np.split(masked_indices, masked_indptr) else: neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, n_jobs=n_jobs) neighbors_model.fit(X) # This has worst case O(n^2) memory complexity neighborhoods = neighbors_model.radius_neighbors(X, eps, return_distance=False) if sample_weight is None: n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods]) else: n_neighbors = np.array( [np.sum(sample_weight[neighbors]) for neighbors in neighborhoods]) # Initially, all samples are noise. labels = -np.ones(X.shape[0], dtype=np.intp) # A list of all core samples found. core_samples = np.asarray(n_neighbors >= minpts, dtype=np.uint8) dbscan_inner(core_samples, neighborhoods, labels) return np.where(core_samples)[0], labels
def _generate_hypercube(samples, dimensions, rng): """Returns distinct binary samples of length dimensions """ if dimensions > 30: return np.hstack([ _generate_hypercube(samples, dimensions - 30, rng), _generate_hypercube(samples, 30, rng) ]) out = astype(sample_without_replacement(2**dimensions, samples, random_state=rng), dtype='>u4', copy=False) out = np.unpackbits(out.view('>u1')).reshape((-1, 32))[:, -dimensions:] return out
def MB_step(X, x_squared_norms, centers, counts, curr_iter, old_center_buffer, compute_squared_diff, distances, random_reassign=False, random_state=None, reassignment_ratio=.01, verbose=False, learn_rate=0.0): """Incremental update of the centers for the Minibatch K-Means algorithm. Parameters ---------- X : array, shape (n_samples, n_features) The original data array. x_squared_norms : array, shape (n_samples,) Squared euclidean norm of each data point. centers : array, shape (k, n_features) The cluster centers. This array is MODIFIED IN PLACE counts : array, shape (k,) The vector in which we keep track of the numbers of elements in a cluster. This array is MODIFIED IN PLACE distances : array, dtype float64, shape (n_samples), optional If not None, should be a pre-allocated array that will be used to store the distances of each sample to its closest center. May not be None when random_reassign is True. random_state : integer or numpy.RandomState, optional The generator used to initialize the centers. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. random_reassign : boolean, optional If True, centers with very low counts are randomly reassigned to observations. reassignment_ratio : float, optional Control the fraction of the maximum number of counts for a center to be reassigned. A higher value means that low count centers are more likely to be reassigned, which means that the model will take longer to converge, but should converge in a better clustering. verbose : bool, optional, default False Controls the verbosity. compute_squared_diff : bool If set to False, the squared diff computation is skipped. old_center_buffer : int Copy of old centers for monitoring convergence. learn_rate: learning rate Returns ------- centers: Updated centers inertia : float Sum of distances of samples to their closest cluster center. squared_diff : numpy array, shape (n_clusters,) Squared distances between previous and updated cluster centers. """ # Perform label assignment to nearest centers nearest_center, inertia = k_means_._labels_inertia(X, x_squared_norms, centers, distances=distances) if random_reassign and reassignment_ratio > 0: random_state = check_random_state(random_state) # Reassign clusters that have very low counts to_reassign = counts < reassignment_ratio * counts.max() # pick at most .5 * batch_size samples as new centers if to_reassign.sum() > .5 * X.shape[0]: indices_dont_reassign = np.argsort(counts)[int(.5 * X.shape[0]):] to_reassign[indices_dont_reassign] = False n_reassigns = to_reassign.sum() if n_reassigns: # Pick new clusters amongst observations with uniform probability new_centers = choice(X.shape[0], replace=False, size=n_reassigns, random_state=random_state) if verbose: print("[MiniBatchKMeans] Reassigning %i cluster centers." % n_reassigns) if sp.issparse(X) and not sp.issparse(centers): assign_rows_csr(X, astype(new_centers, np.intp), astype(np.where(to_reassign)[0], np.intp), centers) else: centers[to_reassign] = X[new_centers] # reset counts of reassigned centers, but don't reset them too small # to avoid instant reassignment. This is a pretty dirty hack as it # also modifies the learning rates. counts[to_reassign] = np.min(counts[~to_reassign]) squared_diff = 0.0 ## implementation for the sparse CSR representation completely written in # cython if sp.issparse(X): if compute_squared_diff: old_center_buffer = centers #rand_vec = make_rand_vector(X.shape[1]) #learn_rate = 0.0 centers = _MB_step._mini_batch_update_csr(X, x_squared_norms, centers, counts, nearest_center, old_center_buffer, compute_squared_diff, curr_iter, learn_rate) if compute_squared_diff: diff = centers - old_center_buffer squared_diff = row_norms(diff, squared=True).sum() return centers, squared_diff, inertia ## dense variant in mostly numpy (not as memory efficient though) k = centers.shape[0] for center_idx in range(k): # find points from minibatch that are assigned to this center center_mask = nearest_center == center_idx old_count = counts[center_idx] this_count = center_mask.sum() counts[center_idx] += this_count # update counts if this_count > 0: new_count = counts[center_idx] if compute_squared_diff: old_center_buffer[:] = centers[center_idx] # inplace remove previous count scaling #centers[center_idx] *= counts[center_idx] # inplace sum with new points members of this cluster #centers[center_idx] += np.sum(X[center_mask], axis=0) # update the count statistics for this center #counts[center_idx] += count # inplace rescale to compute mean of all points (old and new) #centers[center_idx] /= counts[center_idx] new_center = np.sum(X[center_mask], axis=0) if learn_rate == 0.0: learn_rate = (new_count - old_count) / float(new_count) centers[center_idx] = centers[center_idx] + learn_rate * ( new_center / (new_count - old_count) - centers[center_idx]) # update the squared diff if necessary if compute_squared_diff: diff = centers[center_idx].ravel() - old_center_buffer.ravel() squared_diff += np.dot(diff, diff) return centers, squared_diff, inertia
def _sparse_fit(self, X, strategy, missing_values, axis): """Fit the transformer on sparse data.""" # Imputation is done "by column", so if we want to do it # by row we only need to convert the matrix to csr format. if axis == 1: X = X.tocsr() else: X = X.tocsc() # Count the zeros if missing_values == 0: n_zeros_axis = np.zeros(X.shape[not axis], dtype=int) else: n_zeros_axis = X.shape[axis] - np.diff(X.indptr) # Mean if strategy == "mean": if missing_values != 0: n_non_missing = n_zeros_axis # Mask the missing elements mask_missing_values = _get_mask(X.data, missing_values) mask_valids = np.logical_not(mask_missing_values) # Sum only the valid elements new_data = X.data.copy() new_data[mask_missing_values] = 0 X = sparse.csc_matrix((new_data, X.indices, X.indptr), copy=False) sums = X.sum(axis=0) # Count the elements != 0 mask_non_zeros = sparse.csc_matrix( (mask_valids.astype(np.float64), X.indices, X.indptr), copy=False) s = mask_non_zeros.sum(axis=0) n_non_missing = np.add(n_non_missing, s) else: sums = X.sum(axis=axis) n_non_missing = np.diff(X.indptr) # Ignore the error, columns with a np.nan statistics_ # are not an error at this point. These columns will # be removed in transform with np.errstate(all="ignore"): return np.ravel(sums) / np.ravel(n_non_missing) # Median + Most frequent else: # Remove the missing values, for each column columns_all = np.hsplit(X.data, X.indptr[1:-1]) mask_missing_values = _get_mask(X.data, missing_values) mask_valids = np.hsplit(np.logical_not(mask_missing_values), X.indptr[1:-1]) # astype necessary for bug in numpy.hsplit before v1.9 columns = [ col[astype(mask, bool, copy=False)] for col, mask in zip(columns_all, mask_valids) ] # Median if strategy == "median": median = np.empty(len(columns)) for i, column in enumerate(columns): median[i] = _get_median(column, n_zeros_axis[i]) return median # Most frequent elif strategy == "most_frequent": most_frequent = np.empty(len(columns)) for i, column in enumerate(columns): most_frequent[i] = _most_frequent(column, 0, n_zeros_axis[i]) return most_frequent
def random_choice_csc(n_samples, classes, class_probability=None, random_state=None): """Generate a sparse random matrix given column class distributions Parameters ---------- n_samples : int, Number of samples to draw in each column. classes : list of size n_outputs of arrays of size (n_classes,) List of classes for each column. class_probability : list of size n_outputs of arrays of size (n_classes,) Optional (default=None). Class distribution of each column. If None the uniform distribution is assumed. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Returns ------- random_matrix : sparse csc matrix of size (n_samples, n_outputs) """ data = array.array('i') indices = array.array('i') indptr = array.array('i', [0]) for j in range(len(classes)): classes[j] = np.asarray(classes[j]) if classes[j].dtype.kind != 'i': raise ValueError("class dtype %s is not supported" % classes[j].dtype) classes[j] = astype(classes[j], np.int64, copy=False) # use uniform distribution if no class_probability is given if class_probability is None: class_prob_j = np.empty(shape=classes[j].shape[0]) class_prob_j.fill(1 / classes[j].shape[0]) else: class_prob_j = np.asarray(class_probability[j]) if np.sum(class_prob_j) != 1.0: raise ValueError("Probability array at index {0} does not sum to " "one".format(j)) if class_prob_j.shape[0] != classes[j].shape[0]: raise ValueError("classes[{0}] (length {1}) and " "class_probability[{0}] (length {2}) have " "different length.".format( j, classes[j].shape[0], class_prob_j.shape[0])) # If 0 is not present in the classes insert it with a probability 0.0 if 0 not in classes[j]: classes[j] = np.insert(classes[j], 0, 0) class_prob_j = np.insert(class_prob_j, 0, 0.0) # If there are nonzero classes choose randomly using class_probability rng = check_random_state(random_state) if classes[j].shape[0] > 1: p_nonzero = 1 - class_prob_j[classes[j] == 0] nnz = int(n_samples * p_nonzero) ind_sample = sample_without_replacement(n_population=n_samples, n_samples=nnz, random_state=random_state) indices.extend(ind_sample) # Normalize probabilites for the nonzero elements classes_j_nonzero = classes[j] != 0 class_probability_nz = class_prob_j[classes_j_nonzero] class_probability_nz_norm = (class_probability_nz / np.sum(class_probability_nz)) classes_ind = np.searchsorted(class_probability_nz_norm.cumsum(), rng.rand(nnz)) data.extend(classes[j][classes_j_nonzero][classes_ind]) indptr.append(len(indices)) return sp.csc_matrix((data, indices, indptr), (n_samples, len(classes)), dtype=int)
def transform(self, X): """Impute all missing values in X. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] The input data to complete. """ if self.axis == 0: check_is_fitted(self, 'statistics_') X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, force_all_finite=False, copy=self.copy) statistics = self.statistics_.copy() if X.shape[1] != statistics.shape[0]: raise ValueError("X has %d features per sample, expected %d" % (X.shape[1], self.statistics_.shape[0])) # Since two different arrays can be provided in fit(X) and # transform(X), the imputation data need to be recomputed # when the imputation is done per sample else: X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, force_all_finite=False, copy=self.copy) if sparse.issparse(X): statistics = self._sparse_fit(X, self.strategy, self.missing_values, self.axis) else: statistics = self._dense_fit(X, self.strategy, self.missing_values, self.axis) # Delete the invalid rows/columns invalid_mask = np.isnan(statistics) valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.where(valid_mask)[0] missing = np.arange(X.shape[not self.axis])[invalid_mask] if self.axis == 0 and invalid_mask.any(): if self.verbose: warnings.warn("Deleting features without " "observed values: %s" % missing) X = X[:, valid_statistics_indexes] elif self.axis == 1 and invalid_mask.any(): raise ValueError("Some rows only contain " "missing values: %s" % missing) # Do actual imputation if sparse.issparse(X) and self.missing_values != 0: mask = _get_mask(X.data, self.missing_values) indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), np.diff(X.indptr))[mask] X.data[mask] = astype(valid_statistics[indexes], X.dtype, copy=False) else: if sparse.issparse(X): X = X.toarray() mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=self.axis) values = np.repeat(valid_statistics, n_missing) if self.axis == 0: coordinates = np.where(mask.transpose())[::-1] else: coordinates = mask X[coordinates] = values return X
def random_choice_csc(n_samples, classes, class_probability=None, random_state=None): """Generate a sparse random matrix given column class distributions Parameters ---------- n_samples : int, Number of samples to draw in each column. classes : list of size n_outputs of arrays of size (n_classes,) List of classes for each column. class_probability : list of size n_outputs of arrays of size (n_classes,) Optional (default=None). Class distribution of each column. If None the uniform distribution is assumed. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Returns ------- random_matrix : sparse csc matrix of size (n_samples, n_outputs) """ data = array.array('i') indices = array.array('i') indptr = array.array('i', [0]) for j in range(len(classes)): classes[j] = np.asarray(classes[j]) if classes[j].dtype.kind != 'i': raise ValueError("class dtype %s is not supported" % classes[j].dtype) classes[j] = astype(classes[j], np.int64, copy=False) # use uniform distribution if no class_probability is given if class_probability is None: class_prob_j = np.empty(shape=classes[j].shape[0]) class_prob_j.fill(1 / classes[j].shape[0]) else: class_prob_j = np.asarray(class_probability[j]) if np.sum(class_prob_j) != 1.0: raise ValueError("Probability array at index {0} does not sum to " "one".format(j)) if class_prob_j.shape[0] != classes[j].shape[0]: raise ValueError("classes[{0}] (length {1}) and " "class_probability[{0}] (length {2}) have " "different length.".format(j, classes[j].shape[0], class_prob_j.shape[0])) # If 0 is not present in the classes insert it with a probability 0.0 if 0 not in classes[j]: classes[j] = np.insert(classes[j], 0, 0) class_prob_j = np.insert(class_prob_j, 0, 0.0) # If there are nonzero classes choose randomly using class_probability rng = check_random_state(random_state) if classes[j].shape[0] > 1: p_nonzero = 1 - class_prob_j[classes[j] == 0] nnz = int(n_samples * p_nonzero) ind_sample = sample_without_replacement(n_population=n_samples, n_samples=nnz, random_state=random_state) indices.extend(ind_sample) # Normalize probabilites for the nonzero elements classes_j_nonzero = classes[j] != 0 class_probability_nz = class_prob_j[classes_j_nonzero] class_probability_nz_norm = (class_probability_nz / np.sum(class_probability_nz)) classes_ind = np.searchsorted(class_probability_nz_norm.cumsum(), rng.rand(nnz)) data.extend(classes[j][classes_j_nonzero][classes_ind]) indptr.append(len(indices)) return sp.csc_matrix((data, indices, indptr), (n_samples, len(classes)), dtype=int)
def _sparse_fit(self, X, strategy, missing_values, axis): """Fit the transformer on sparse data.""" # Imputation is done "by column", so if we want to do it # by row we only need to convert the matrix to csr format. if axis == 1: X = X.tocsr() else: X = X.tocsc() # Count the zeros if missing_values == 0: n_zeros_axis = np.zeros(X.shape[not axis], dtype=int) else: n_zeros_axis = X.shape[axis] - np.diff(X.indptr) # Mean if strategy == "mean": if missing_values != 0: n_non_missing = n_zeros_axis # Mask the missing elements mask_missing_values = _get_mask(X.data, missing_values) mask_valids = np.logical_not(mask_missing_values) # Sum only the valid elements new_data = X.data.copy() new_data[mask_missing_values] = 0 X = sparse.csc_matrix((new_data, X.indices, X.indptr), copy=False) sums = X.sum(axis=0) # Count the elements != 0 mask_non_zeros = sparse.csc_matrix( (mask_valids.astype(np.float64), X.indices, X.indptr), copy=False) s = mask_non_zeros.sum(axis=0) n_non_missing = np.add(n_non_missing, s) else: sums = X.sum(axis=axis) n_non_missing = np.diff(X.indptr) # Ignore the error, columns with a np.nan statistics_ # are not an error at this point. These columns will # be removed in transform with np.errstate(all="ignore"): return np.ravel(sums) / np.ravel(n_non_missing) # Median + Most frequent else: # Remove the missing values, for each column columns_all = np.hsplit(X.data, X.indptr[1:-1]) mask_missing_values = _get_mask(X.data, missing_values) mask_valids = np.hsplit(np.logical_not(mask_missing_values), X.indptr[1:-1]) # astype necessary for bug in numpy.hsplit before v1.9 columns = [col[astype(mask, bool, copy=False)] for col, mask in zip(columns_all, mask_valids)] # Median if strategy == "median": median = np.empty(len(columns)) for i, column in enumerate(columns): median[i] = _get_median(column, n_zeros_axis[i]) return median # Most frequent elif strategy == "most_frequent": most_frequent = np.empty(len(columns)) for i, column in enumerate(columns): most_frequent[i] = _most_frequent(column, 0, n_zeros_axis[i]) return most_frequent