def sparse_correlation(ind1, data1, ind2, data2, n_features):
    """Correlation distance between two sparse vectors of length ``n_features``.

    Each vector is given as a pair of parallel arrays: the positions of its
    stored entries and the values at those positions. Positions that are not
    listed are treated as zeros, and the means are taken over all
    ``n_features`` coordinates.

    Parameters
    ----------
    ind1: array
        Positions of the stored entries of the first vector.

    data1: array
        Values of the stored entries of the first vector.

    ind2: array
        Positions of the stored entries of the second vector.

    data2: array
        Values of the stored entries of the second vector.

    n_features: int
        Full (dense) length of both vectors.

    Returns
    -------
    distance: float
        ``0.0`` if both vectors have no stored entries or both centred
        norms are zero; ``1.0`` if exactly one vector has no stored entries
        or the centred dot product is zero; otherwise one minus the centred
        dot product divided by the product of the centred norms.
    """
    nnz1 = ind1.shape[0]
    nnz2 = ind2.shape[0]

    # Degenerate inputs are settled before any arithmetic is attempted.
    if nnz1 == 0 and nnz2 == 0:
        return 0.0
    if nnz1 == 0 or nnz2 == 0:
        return 1.0

    # Means over the full dense length: implicit zeros add nothing to the
    # sum but do count in the denominator.
    mean1 = 0.0
    for k in range(data1.shape[0]):
        mean1 += data1[k]
    mean2 = 0.0
    for k in range(data2.shape[0]):
        mean2 += data2[k]
    mean1 /= n_features
    mean2 /= n_features

    # Centre only the stored entries; every implicit zero becomes -mean and
    # is accounted for analytically below.
    centred1 = np.empty(data1.shape[0], dtype=np.float32)
    for k in range(data1.shape[0]):
        centred1[k] = data1[k] - mean1
    centred2 = np.empty(data2.shape[0], dtype=np.float32)
    for k in range(data2.shape[0]):
        centred2[k] = data2[k] - mean2

    # Each implicit zero contributes mean ** 2 to the squared norm.
    implicit_sq1 = (n_features - nnz1) * (mean1 ** 2)
    norm1 = np.sqrt((norm(centred1) ** 2) + implicit_sq1)
    implicit_sq2 = (n_features - nnz2) * (mean2 ** 2)
    norm2 = np.sqrt((norm(centred2) ** 2) + implicit_sq2)

    # NOTE(review): sparse_mul is defined elsewhere; it is used here as the
    # element-wise product over positions stored in both vectors -- confirm.
    shared_inds, shared_products = sparse_mul(ind1, centred1, ind2, centred2)
    shared = set(shared_inds)

    total = 0.0
    for k in range(shared_products.shape[0]):
        total += shared_products[k]

    # Positions stored in only one vector pair a centred value with the
    # other vector's implicit zero, i.e. with -mean of the other vector.
    for k in range(nnz1):
        if ind1[k] in shared:
            continue
        total -= centred1[k] * (mean2)
    for k in range(nnz2):
        if ind2[k] in shared:
            continue
        total -= centred2[k] * (mean1)

    # Positions stored in neither vector contribute (-mean1) * (-mean2).
    # NOTE(review): arr_union is defined elsewhere; presumably it returns
    # the distinct positions present in either index array -- confirm.
    either = arr_union(ind1, ind2)
    total += mean1 * mean2 * (n_features - either.shape[0])

    if norm1 == 0.0 and norm2 == 0.0:
        return 0.0
    if total == 0.0:
        return 1.0
    return 1.0 - (total / (norm1 * norm2))
def sparse_cosine(ind1, data1, ind2, data2):
    """Cosine distance between two sparse vectors.

    Each vector is given as a pair of parallel arrays: the positions of its
    stored entries and the values at those positions.

    Parameters
    ----------
    ind1: array
        Positions of the stored entries of the first vector.

    data1: array
        Values of the stored entries of the first vector.

    ind2: array
        Positions of the stored entries of the second vector.

    data2: array
        Values of the stored entries of the second vector.

    Returns
    -------
    distance: float
        One minus the dot product divided by the product of the norms.
        When both vectors are zero vectors the distance is ``0.0``; when
        exactly one is a zero vector the distance is ``1.0``. This is the
        same convention ``sparse_correlation`` uses for empty inputs and
        avoids dividing by a zero norm.
    """
    # Cheap short-circuit: a vector with no stored entries is the zero
    # vector, so the helpers need not be called at all.
    if data1.shape[0] == 0 and data2.shape[0] == 0:
        return 0.0
    if data1.shape[0] == 0 or data2.shape[0] == 0:
        return 1.0

    norm1 = norm(data1)
    norm2 = norm(data2)

    # Stored entries may all be explicit zeros, so the norms still have to
    # be checked before they are used as a divisor.
    if norm1 == 0.0 and norm2 == 0.0:
        return 0.0
    if norm1 == 0.0 or norm2 == 0.0:
        return 1.0

    # Sum of the element-wise products is the dot product of the vectors.
    _, aux_data = sparse_mul(ind1, data1, ind2, data2)
    result = 0.0
    for i in range(aux_data.shape[0]):
        result += aux_data[i]

    return 1.0 - (result / (norm1 * norm2))
def sparse_correlation(ind1, data1, ind2, data2, n_features):
    """Correlation distance between two sparse vectors of length ``n_features``.

    Each vector is given as a pair of parallel arrays: the positions of its
    stored entries and the values at those positions. Positions that are not
    listed are zeros. The means are taken over all ``n_features``
    coordinates, so the implicit zeros must also take part in the centred
    norms and the centred dot product; this implementation includes them.

    Parameters
    ----------
    ind1: array
        Positions of the stored entries of the first vector.

    data1: array
        Values of the stored entries of the first vector.

    ind2: array
        Positions of the stored entries of the second vector.

    data2: array
        Values of the stored entries of the second vector.

    n_features: int
        Full (dense) length of both vectors.

    Returns
    -------
    distance: float
        One minus the Pearson correlation of the two dense vectors. When
        both vectors are empty or both are constant the distance is
        ``0.0``; when exactly one is empty or constant it is ``1.0``
        (the correlation is undefined there and a zero norm must not be
        used as a divisor).
    """
    nnz1 = ind1.shape[0]
    nnz2 = ind2.shape[0]

    # Degenerate inputs are settled before any arithmetic is attempted.
    if nnz1 == 0 and nnz2 == 0:
        return 0.0
    if nnz1 == 0 or nnz2 == 0:
        return 1.0

    # Means over the full dense length: implicit zeros add nothing to the
    # sum but do count in the denominator.
    mu_x = 0.0
    mu_y = 0.0
    for i in range(data1.shape[0]):
        mu_x += data1[i]
    for i in range(data2.shape[0]):
        mu_y += data2[i]
    mu_x /= n_features
    mu_y /= n_features

    # Squared norm of the centred dense vector: every implicit zero becomes
    # -mu and contributes mu ** 2; stored entries contribute (x - mu) ** 2.
    sq_norm1 = (n_features - nnz1) * mu_x * mu_x
    for i in range(data1.shape[0]):
        delta = data1[i] - mu_x
        sq_norm1 += delta * delta
    sq_norm2 = (n_features - nnz2) * mu_y * mu_y
    for i in range(data2.shape[0]):
        delta = data2[i] - mu_y
        sq_norm2 += delta * delta

    # A constant vector has zero centred norm; never divide by it.
    if sq_norm1 == 0.0 and sq_norm2 == 0.0:
        return 0.0
    if sq_norm1 == 0.0 or sq_norm2 == 0.0:
        return 1.0

    # Centred dot product through the identity
    #   sum_i (x_i - mu_x)(y_i - mu_y) = sum_i x_i * y_i - n * mu_x * mu_y,
    # which holds because sum_i x_i = n * mu_x and sum_i y_i = n * mu_y.
    # Only positions stored in both vectors can have x_i * y_i != 0.
    _, dot_prod_data = sparse_mul(ind1, data1, ind2, data2)
    dot_product = 0.0
    for i in range(dot_prod_data.shape[0]):
        dot_product += dot_prod_data[i]
    dot_product -= n_features * mu_x * mu_y

    return 1.0 - (dot_product / (np.sqrt(sq_norm1) * np.sqrt(sq_norm2)))
def angular_random_projection_split(data, indices, rng_state):
    """Split a set of points in two with a random angular hyperplane.

    Two points are drawn at random from ``indices``; the hyperplane normal
    is the normalised difference of their unit-length directions. Every
    point listed in ``indices`` is then assigned to a side according to the
    sign of its projection onto that normal. This is the basic step of a
    random projection tree, which applies the split recursively.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The original data to be split

    indices: array of shape (tree_node_size,)
        The indices of the elements in the ``data`` array that are to
        be split in the current operation.

    rng_state: array of int64, shape (3,)
        The internal state of the rng

    Returns
    -------
    indices_left: array
        The elements of ``indices`` that fall on the "left" side of the
        random hyperplane.

    indices_right: array
        The elements of ``indices`` that fall on the "right" side of the
        random hyperplane.

    hyperplane_vector: array of shape (n_features,)
        The float32 normal vector of the hyperplane.

    offset: None
        Always ``None`` for this split.
    """
    n_dims = data.shape[1]
    n_points = indices.shape[0]

    # Draw two anchor points; if the same slot is drawn twice, step the
    # second one forward (wrapping around) so the anchors differ.
    first_pick = tau_rand_int(rng_state) % n_points
    second_pick = tau_rand_int(rng_state) % n_points
    if first_pick == second_pick:
        second_pick += 1
    second_pick = second_pick % n_points
    left = indices[first_pick]
    right = indices[second_pick]

    # Guard against (near-)zero vectors so the divisions below are safe.
    left_norm = norm(data[left])
    right_norm = norm(data[right])
    if abs(left_norm) < EPS:
        left_norm = 1.0
    if abs(right_norm) < EPS:
        right_norm = 1.0

    # Hyperplane normal: difference of the two unit-length anchors.
    hyperplane_vector = np.empty(n_dims, dtype=np.float32)
    for d in range(n_dims):
        hyperplane_vector[d] = (data[left, d] / left_norm) - (
            data[right, d] / right_norm
        )

    scale = norm(hyperplane_vector)
    if abs(scale) < EPS:
        scale = 1.0
    for d in range(n_dims):
        hyperplane_vector[d] = hyperplane_vector[d] / scale

    # First pass: project every point onto the normal and record its side.
    # Positive margins go left (0), other margins go right (1), and points
    # that lie on the hyperplane are assigned by a coin flip.
    side = np.empty(n_points, np.int8)
    n_left = 0
    n_right = 0
    for i in range(n_points):
        point = indices[i]
        margin = 0.0
        for d in range(n_dims):
            margin += hyperplane_vector[d] * data[point, d]

        if abs(margin) < EPS:
            side[i] = tau_rand_int(rng_state) % 2
        elif margin > 0:
            side[i] = 0
        else:
            side[i] = 1

        if side[i] == 0:
            n_left += 1
        else:
            n_right += 1

    # Second pass: with the counts known, allocate exactly and fill.
    indices_left = np.empty(n_left, dtype=np.int64)
    indices_right = np.empty(n_right, dtype=np.int64)
    left_cursor = 0
    right_cursor = 0
    for i in range(side.shape[0]):
        if side[i] == 0:
            indices_left[left_cursor] = indices[i]
            left_cursor += 1
        else:
            indices_right[right_cursor] = indices[i]
            right_cursor += 1

    return indices_left, indices_right, hyperplane_vector, None
def sparse_angular_random_projection_split(inds, indptr, data, indices, rng_state):
    """Split a set of sparse points in two with a random angular hyperplane.

    The data set is given in CSR form as ``inds``, ``indptr`` and ``data``.
    Two rows are drawn at random from ``indices``; the hyperplane normal is
    the normalised sparse difference of their unit-length directions. Every
    row listed in ``indices`` is then assigned to a side according to the
    sign of its projection onto that normal. This is the basic step of a
    random projection tree, which applies the split recursively.

    Parameters
    ----------
    inds: array
        CSR format index array of the matrix

    indptr: array
        CSR format index pointer array of the matrix

    data: array
        CSR format data array of the matrix

    indices: array of shape (tree_node_size,)
        The indices of the rows of the matrix that are to be split in the
        current operation.

    rng_state: array of int64, shape (3,)
        The internal state of the rng

    Returns
    -------
    indices_left: array
        The elements of ``indices`` that fall on the "left" side of the
        random hyperplane.

    indices_right: array
        The elements of ``indices`` that fall on the "right" side of the
        random hyperplane.

    hyperplane: array
        ``np.vstack`` of the hyperplane's index array and value array
        (first row positions, second row values).

    offset: None
        Always ``None`` for this split.
    """
    n_points = indices.shape[0]

    # Draw two anchor rows; if the same slot is drawn twice, step the
    # second one forward (wrapping around) so the anchors differ.
    first_pick = tau_rand_int(rng_state) % n_points
    second_pick = tau_rand_int(rng_state) % n_points
    if first_pick == second_pick:
        second_pick += 1
    second_pick = second_pick % n_points
    left = indices[first_pick]
    right = indices[second_pick]

    # Slice the two anchor rows out of the CSR arrays.
    left_start = indptr[left]
    left_stop = indptr[left + 1]
    left_inds = inds[left_start:left_stop]
    left_data = data[left_start:left_stop]

    right_start = indptr[right]
    right_stop = indptr[right + 1]
    right_inds = inds[right_start:right_stop]
    right_data = data[right_start:right_stop]

    # Guard against (near-)zero rows so the divisions below are safe.
    left_norm = norm(left_data)
    right_norm = norm(right_data)
    if abs(left_norm) < EPS:
        left_norm = 1.0
    if abs(right_norm) < EPS:
        right_norm = 1.0

    # Hyperplane normal: sparse difference of the two unit-length anchors.
    unit_left = left_data / left_norm
    unit_right = right_data / right_norm
    hyperplane_inds, hyperplane_data = sparse_diff(
        left_inds, unit_left, right_inds, unit_right
    )

    scale = norm(hyperplane_data)
    if abs(scale) < EPS:
        scale = 1.0
    for d in range(hyperplane_data.shape[0]):
        hyperplane_data[d] = hyperplane_data[d] / scale

    # First pass: project every row onto the normal and record its side.
    # Positive margins go left (0), other margins go right (1), and rows
    # that lie on the hyperplane are assigned by a coin flip.
    side = np.empty(n_points, np.int8)
    n_left = 0
    n_right = 0
    for i in range(n_points):
        row = indices[i]
        row_inds = inds[indptr[row]:indptr[row + 1]]
        row_data = data[indptr[row]:indptr[row + 1]]

        # The margin is the sum of the element-wise products returned by
        # sparse_mul for the hyperplane and this row.
        mul_inds, mul_data = sparse_mul(
            hyperplane_inds, hyperplane_data, row_inds, row_data
        )
        margin = 0.0
        for d in range(mul_data.shape[0]):
            margin += mul_data[d]

        if abs(margin) < EPS:
            side[i] = tau_rand_int(rng_state) % 2
        elif margin > 0:
            side[i] = 0
        else:
            side[i] = 1

        if side[i] == 0:
            n_left += 1
        else:
            n_right += 1

    # Second pass: with the counts known, allocate exactly and fill.
    indices_left = np.empty(n_left, dtype=np.int64)
    indices_right = np.empty(n_right, dtype=np.int64)
    left_cursor = 0
    right_cursor = 0
    for i in range(side.shape[0]):
        if side[i] == 0:
            indices_left[left_cursor] = indices[i]
            left_cursor += 1
        else:
            indices_right[right_cursor] = indices[i]
            right_cursor += 1

    hyperplane = np.vstack((hyperplane_inds, hyperplane_data))

    return indices_left, indices_right, hyperplane, None