def make_angular_hyperplane(data, indices, rng_state):
    """Build a hyperplane through the origin from two randomly drawn points.

    Two positions in ``indices`` are drawn with ``tau_rand_int``. The rows of
    ``data`` they refer to are each divided by their norm (a norm of exactly
    zero is replaced by 1.0) and the hyperplane normal is the difference of
    the two scaled rows.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The points the hyperplane is built from.

    indices: array
        Row numbers of ``data`` from which the two points are drawn.

    rng_state: array
        Random state handed to ``tau_rand_int``.

    Returns
    -------
    hyperplane_vector: array of float32, shape (n_features,)
        The (unnormalised) hyperplane normal.

    hyperplane_offset: float
        Always 0.0.
    """
    pool_size = indices.shape[0]
    n_features = data.shape[1]

    first_pick = tau_rand_int(rng_state) % pool_size
    second_pick = tau_rand_int(rng_state) % pool_size
    # On a collision, step the second pick forward by one, wrapping around.
    second_pick += first_pick == second_pick
    second_pick = second_pick % pool_size

    left = indices[first_pick]
    right = indices[second_pick]

    left_scale = norm(data[left])
    right_scale = norm(data[right])
    # A row with zero norm cannot be rescaled; leave it as it is.
    if left_scale == 0.0:
        left_scale = 1.0
    if right_scale == 0.0:
        right_scale = 1.0

    # The normal is the vector between the two rescaled points.
    normal = np.empty(n_features, dtype=np.float32)
    for axis in range(n_features):
        normal[axis] = (data[left, axis] / left_scale) - (
            data[right, axis] / right_scale
        )

    return normal, 0.0
def sparse_select_side(hyperplane, offset, point_inds, point_data, rng_state):
    """Decide which side of a sparse hyperplane a sparse point falls on.

    Parameters
    ----------
    hyperplane: array with two rows
        Row 0 holds the feature indices of the hyperplane and row 1 the
        matching values. Trailing columns whose index entry is negative are
        ignored (presumably padding -- confirm against the code that builds
        the hyperplane).

    offset: float
        Starting value of the margin.

    point_inds: array
        Feature indices of the point.

    point_data: array
        Values of the point at ``point_inds``.

    rng_state: array
        Random state handed to ``tau_rand_int`` when the margin is within
        ``EPS`` of zero.

    Returns
    -------
    side: int
        0 when the margin is positive, 1 when it is negative; when the
        margin is within ``EPS`` of zero the side is picked at random.
    """
    margin = offset

    # Trim trailing columns with a negative index entry. The size check stops
    # the scan at zero: without it a hyperplane whose index row is entirely
    # negative would be read with a negative position (which wraps to the end
    # of the row) and the scan would run past the start of the array.
    hyperplane_size = hyperplane.shape[1]
    while hyperplane_size > 0 and hyperplane[0, hyperplane_size - 1] < 0.0:
        hyperplane_size -= 1

    hyperplane_inds = hyperplane[0, :hyperplane_size].astype(np.int32)
    hyperplane_data = hyperplane[1, :hyperplane_size]

    _, aux_data = sparse_mul(hyperplane_inds, hyperplane_data, point_inds, point_data)

    # Accumulate the products on top of the offset to get the margin.
    for val in aux_data:
        margin += val

    if abs(margin) < EPS:
        # Too close to call: flip a coin.
        return 0 if tau_rand_int(rng_state) % 2 == 0 else 1
    if margin > 0:
        return 0
    return 1
def sparse_current_graph_map_jit(
    heap,
    rows,
    n_neighbors,
    inds,
    indptr,
    data,
    rng_state,
    seed_per_row,
    sparse_dist,
):
    """Top up the heap rows in ``rows`` with randomly drawn neighbours.

    For every ``i`` in ``rows`` whose first entry ``heap[0, i, 0]`` is
    negative, one random row is drawn for each entry of ``heap[0, i]`` that
    is still negative (relative to ``n_neighbors``), its distance to row
    ``i`` is computed with ``sparse_dist`` and the pair is pushed with
    ``heap_push``.

    Parameters
    ----------
    heap: array
        Neighbour heap, updated in place through ``heap_push``.

    rows: iterable of int
        The heap rows to process.

    n_neighbors: int
        Target number of neighbours per row.

    inds: array
        CSR format index array of the matrix

    indptr: array
        CSR format index pointer array of the matrix

    data: array
        CSR format data array of the matrix

    rng_state: array
        Random state; a copy is used so the caller's state is not advanced.

    seed_per_row: bool
        When true the local random state is re-seeded with ``seed`` for
        each row before drawing.

    sparse_dist: callable
        Called as ``sparse_dist(inds_a, data_a, inds_b, data_b)``.

    Returns
    -------
    bool
        Always True.
    """
    rng_state_local = rng_state.copy()
    # Number of rows of the CSR matrix. ``data.shape[0]`` is the number of
    # stored values, so drawing modulo it can yield a row number for which
    # ``indptr[idx + 1]`` does not exist.
    n_samples = indptr.shape[0] - 1

    for i in rows:
        if seed_per_row:
            seed(rng_state_local, i)
        if heap[0, i, 0] < 0.0:
            # Row i does not change between draws; slice it once.
            from_inds = inds[indptr[i]:indptr[i + 1]]
            from_data = data[indptr[i]:indptr[i + 1]]

            for j in range(n_neighbors - np.sum(heap[0, i] >= 0.0)):
                idx = np.abs(tau_rand_int(rng_state_local)) % n_samples

                to_inds = inds[indptr[idx]:indptr[idx + 1]]
                to_data = data[indptr[idx]:indptr[idx + 1]]

                d = sparse_dist(from_inds, from_data, to_inds, to_data)

                heap_push(heap, i, d, idx, 1)

    return True
def apply_hyperplane(
    data,
    hyperplane_vector,
    hyperplane_offset,
    hyperplane_node_num,
    current_num_nodes,
    data_node_loc,
    rng_state,
):
    """Move the points of one node into two new nodes, split by a hyperplane.

    Every row ``i`` with ``data_node_loc[i] == hyperplane_node_num`` is
    projected onto ``hyperplane_vector`` (plus ``hyperplane_offset``). A
    positive margin sends the row to node ``current_num_nodes``, a negative
    one to node ``current_num_nodes + 1``, and a margin of exactly zero is
    settled by ``tau_rand_int``. ``data_node_loc`` is updated in place.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The points being assigned.

    hyperplane_vector: array of shape (n_features,)
        Normal vector of the hyperplane.

    hyperplane_offset: float
        Offset added to the projection.

    hyperplane_node_num: int
        The node whose points are to be split.

    current_num_nodes: int
        Number used for the new "left" node; the "right" node gets the
        next number.

    data_node_loc: array of shape (n_samples,)
        Current node of each point; rewritten for the split points.

    rng_state: array
        Random state handed to ``tau_rand_int`` for ties.

    Returns
    -------
    None
    """
    node_if_positive = current_num_nodes
    node_if_negative = current_num_nodes + 1
    n_features = hyperplane_vector.shape[0]

    for row in range(data_node_loc.shape[0]):
        # Only points currently sitting in the node being split are touched.
        if data_node_loc[row] != hyperplane_node_num:
            continue

        projection = hyperplane_offset
        for axis in range(n_features):
            projection += hyperplane_vector[axis] * data[row, axis]

        if projection > 0:
            goes_left = True
        elif projection == 0:
            # Exactly on the hyperplane: flip a coin.
            goes_left = abs(tau_rand_int(rng_state)) % 2 == 0
        else:
            goes_left = False

        if goes_left:
            data_node_loc[row] = node_if_positive
        else:
            data_node_loc[row] = node_if_negative

    return
def search_init(
    query_inds,
    query_data,
    k,
    inds,
    indptr,
    data,
    forest,
    n_neighbors,
    tried,
    sparse_dist,
    rng_state,
):
    """Seed the result heap for one sparse query.

    Each tree in ``forest`` is searched with ``search_sparse_flat_tree`` and
    the returned points are scored against the query and pushed onto the
    heap. If the last tree returned fewer than ``min(k, n_neighbors)``
    points (or the forest is empty), random points are drawn to make up the
    difference.

    Parameters
    ----------
    query_inds: array
        Feature indices of the query point.

    query_data: array
        Values of the query point at ``query_inds``.

    k: int
        Size of the result heap.

    inds: array
        CSR format index array of the matrix

    indptr: array
        CSR format index pointer array of the matrix

    data: array
        CSR format data array of the matrix

    forest: iterable of trees
        Each tree exposes ``hyperplanes``, ``offsets``, ``children`` and
        ``indices``.

    n_neighbors: int
        Together with ``k`` bounds the number of random samples.

    tried: array
        Visited table used with ``mark_visited`` / ``has_been_visited``;
        updated in place.

    sparse_dist: callable
        Called as ``sparse_dist(inds_a, data_a, inds_b, data_b)``.

    rng_state: array
        Random state handed to ``search_sparse_flat_tree`` and
        ``tau_rand_int``.

    Returns
    -------
    heap_priorities: array of float32, shape (k,)
        Distances, initialised to infinity.

    heap_indices: array of int32, shape (k,)
        Point numbers, initialised to -1.
    """
    heap_priorities = np.float32(np.inf) + np.zeros(k, dtype=np.float32)
    heap_indices = np.int32(-1) + np.zeros(k, dtype=np.int32)

    n_samples = indptr.shape[0] - 1
    n_random_samples = min(k, n_neighbors)

    for tree in forest:
        indices = search_sparse_flat_tree(
            query_inds,
            query_data,
            tree.hyperplanes,
            tree.offsets,
            tree.children,
            tree.indices,
            rng_state,
        )

        n_initial_points = indices.shape[0]
        n_random_samples = min(k, n_neighbors) - n_initial_points

        for j in range(n_initial_points):
            candidate = indices[j]
            # Points are distinct within one leaf, but leaves found in
            # different trees can share points; skip anything already scored
            # so the heap never holds the same point twice.
            if has_been_visited(tried, candidate) != 0:
                continue

            from_inds = inds[indptr[candidate] : indptr[candidate + 1]]
            from_data = data[indptr[candidate] : indptr[candidate + 1]]

            d = sparse_dist(from_inds, from_data, query_inds, query_data)
            simple_heap_push(heap_priorities, heap_indices, d, candidate)
            mark_visited(tried, candidate)

    if n_random_samples > 0:
        for i in range(n_random_samples):
            candidate = np.abs(tau_rand_int(rng_state)) % n_samples
            if has_been_visited(tried, candidate) == 0:
                from_inds = inds[indptr[candidate] : indptr[candidate + 1]]
                from_data = data[indptr[candidate] : indptr[candidate + 1]]

                d = sparse_dist(from_inds, from_data, query_inds, query_data)

                simple_heap_push(heap_priorities, heap_indices, d, candidate)
                mark_visited(tried, candidate)

    return heap_priorities, heap_indices
def make_euclidean_hyperplane(data, indices, rng_state):
    """Build the hyperplane lying midway between two randomly drawn points.

    Two positions in ``indices`` are drawn with ``tau_rand_int``. The
    hyperplane normal is the difference of the two rows of ``data`` they
    refer to, and the offset is minus the dot product of that normal with
    the midpoint of the two rows.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The points the hyperplane is built from.

    indices: array
        Row numbers of ``data`` from which the two points are drawn.

    rng_state: array
        Random state handed to ``tau_rand_int``.

    Returns
    -------
    hyperplane_vector: array of float32, shape (n_features,)
        The hyperplane normal.

    hyperplane_offset: float
        The offset of the hyperplane from the origin.
    """
    pool_size = indices.shape[0]
    n_features = data.shape[1]

    first_pick = tau_rand_int(rng_state) % pool_size
    second_pick = tau_rand_int(rng_state) % pool_size
    # On a collision, step the second pick forward by one, wrapping around.
    second_pick += first_pick == second_pick
    second_pick = second_pick % pool_size

    left = indices[first_pick]
    right = indices[second_pick]

    normal = np.empty(n_features, dtype=np.float32)
    shift = 0.0
    for axis in range(n_features):
        normal[axis] = data[left, axis] - data[right, axis]
        # Accumulate -(normal . midpoint) one coordinate at a time.
        shift -= (
            normal[axis] * (data[left, axis] + data[right, axis]) / 2.0
        )

    return normal, shift
def current_graph_map_jit(
    heap, rows, n_neighbors, data, rng_state, seed_per_row, dist, dist_args
):
    """Top up the heap rows in ``rows`` with randomly drawn neighbours.

    For every ``i`` in ``rows`` whose first entry ``heap[0, i, 0]`` is
    negative, one random row of ``data`` is drawn for each entry of
    ``heap[0, i]`` that is still negative (relative to ``n_neighbors``), its
    distance to row ``i`` is computed with ``dist`` and the pair is pushed
    with ``heap_push``.

    Parameters
    ----------
    heap: array
        Neighbour heap, updated in place through ``heap_push``.

    rows: iterable of int
        The heap rows to process.

    n_neighbors: int
        Target number of neighbours per row.

    data: array of shape (n_samples, n_features)
        The points.

    rng_state: array
        Random state; a copy is used so the caller's state is not advanced.

    seed_per_row: bool
        When true the local random state is re-seeded with ``seed`` for
        each row before drawing.

    dist: callable
        Called as ``dist(a, b, *dist_args)``.

    dist_args: tuple
        Extra arguments for ``dist``.

    Returns
    -------
    bool
        Always True.
    """
    local_rng = rng_state.copy()
    n_points = data.shape[0]

    for row in rows:
        if seed_per_row:
            seed(local_rng, row)

        if not (heap[0, row, 0] < 0.0):
            continue

        n_missing = n_neighbors - np.sum(heap[0, row] >= 0.0)
        for _ in range(n_missing):
            neighbor = np.abs(tau_rand_int(local_rng)) % n_points
            distance = dist(data[row], data[neighbor], *dist_args)
            heap_push(heap, row, distance, neighbor, 1)

    return True
def select_side(hyperplane, offset, point, rng_state):
    """Decide which side of a hyperplane a point falls on.

    Parameters
    ----------
    hyperplane: array of shape (n_features,)
        Normal vector of the hyperplane.

    offset: float
        Offset added to the projection of the point.

    point: array of shape (n_features,)
        The point to classify.

    rng_state: array
        Random state handed to ``tau_rand_int`` when the margin is within
        ``EPS`` of zero.

    Returns
    -------
    side: int
        0 when the margin is positive, 1 when it is negative; when the
        margin is within ``EPS`` of zero the side is picked at random.
    """
    projection = offset
    for axis in range(point.shape[0]):
        projection += hyperplane[axis] * point[axis]

    if abs(projection) < EPS:
        # Too close to call: flip a coin.
        return 0 if tau_rand_int(rng_state) % 2 == 0 else 1
    if projection > 0:
        return 0
    return 1
def init_random(n_neighbors, inds, indptr, data, heap, dist, rng_state):
    """Top up every under-filled heap row with randomly drawn neighbours.

    For every row ``i`` of the CSR matrix whose first entry ``heap[0][i, 0]``
    is negative, one random row is drawn for each entry of ``heap[0][i]``
    that is still negative (relative to ``n_neighbors``), its distance to
    row ``i`` is computed with ``dist`` and the pair is pushed with
    ``heap_push``.

    Parameters
    ----------
    n_neighbors: int
        Target number of neighbours per row.

    inds: array
        CSR format index array of the matrix

    indptr: array
        CSR format index pointer array of the matrix

    data: array
        CSR format data array of the matrix

    heap: array
        Neighbour heap, updated in place through ``heap_push``.

    dist: callable
        Called as ``dist(inds_a, data_a, inds_b, data_b)``.

    rng_state: array
        Random state handed to ``tau_rand_int``.

    Returns
    -------
    None
    """
    n_samples = indptr.shape[0] - 1
    for i in range(n_samples):
        if heap[0][i, 0] < 0.0:
            # Row i does not change between draws; slice it once instead of
            # once per random neighbour.
            to_inds = inds[indptr[i] : indptr[i + 1]]
            to_data = data[indptr[i] : indptr[i + 1]]

            for j in range(n_neighbors - np.sum(heap[0][i] >= 0.0)):
                idx = np.abs(tau_rand_int(rng_state)) % n_samples

                from_inds = inds[indptr[idx] : indptr[idx + 1]]
                from_data = data[indptr[idx] : indptr[idx + 1]]

                d = dist(from_inds, from_data, to_inds, to_data)

                heap_push(heap, i, d, idx, 1)

    return
def sparse_select_side(hyperplane, offset, point_inds, point_data, rng_state):
    """Decide which side of a sparse hyperplane a sparse point falls on.

    Parameters
    ----------
    hyperplane: array with two rows
        Row 0 holds the feature indices of the hyperplane and row 1 the
        matching values. The indices are passed through ``arr_unique`` and
        as many leading values as there are resulting indices are used.

    offset: float
        Starting value of the margin.

    point_inds: array
        Feature indices of the point.

    point_data: array
        Values of the point at ``point_inds``.

    rng_state: array
        Random state handed to ``tau_rand_int`` when the margin is within
        ``EPS`` of zero.

    Returns
    -------
    side: int
        0 when the margin is positive, 1 when it is negative; when the
        margin is within ``EPS`` of zero the side is picked at random.
    """
    plane_inds = arr_unique(hyperplane[0])
    plane_values = hyperplane[1, : plane_inds.shape[0]]

    _, products = sparse_mul(plane_inds, plane_values, point_inds, point_data)

    # Accumulate the products on top of the offset to get the margin.
    projection = offset
    for pos in range(products.shape[0]):
        projection += products[pos]

    if abs(projection) < EPS:
        # Too close to call: flip a coin.
        return 0 if tau_rand_int(rng_state) % 2 == 0 else 1
    if projection > 0:
        return 0
    return 1
def angular_random_projection_split(data, indices, rng_state):
    """Split a set of points in two with a random hyperplane through the origin.

    Two of the points named by ``indices`` are picked at random, scaled to
    unit norm, and the hyperplane normal is set to their (normalised)
    difference. Each point is then sent left or right according to the sign
    of its projection onto that normal. This is the basic step of a random
    projection tree, which applies it recursively. This particular split
    uses cosine distance to determine the hyperplane and which side each
    sample falls on.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The original data to be split

    indices: array of shape (tree_node_size,)
        The indices of the rows of ``data`` that are to be split in the
        current operation.

    rng_state: array of int64, shape (3,)
        The internal state of the rng

    Returns
    -------
    indices_left: array of int32
        The elements of ``indices`` that fall on the "left" side of the
        random hyperplane.

    indices_right: array of int32
        The elements of ``indices`` that fall on the "right" side of the
        random hyperplane.

    hyperplane_vector: array of float32, shape (n_features,)
        The normal vector of the hyperplane.

    offset: float
        Always 0.0.
    """
    dim = data.shape[1]
    n_points = indices.shape[0]

    # Pick two random points; on a collision step the second pick forward by
    # one, wrapping around.
    first_pick = tau_rand_int(rng_state) % n_points
    second_pick = tau_rand_int(rng_state) % n_points
    second_pick += first_pick == second_pick
    second_pick = second_pick % n_points
    left = indices[first_pick]
    right = indices[second_pick]

    left_norm = norm(data[left])
    right_norm = norm(data[right])
    # A (near-)zero row cannot be rescaled; leave it as it is.
    if abs(left_norm) < EPS:
        left_norm = 1.0
    if abs(right_norm) < EPS:
        right_norm = 1.0

    # The normal is the vector between the two rescaled points ...
    normal = np.empty(dim, dtype=np.float32)
    for d in range(dim):
        normal[d] = (data[left, d] / left_norm) - (data[right, d] / right_norm)

    # ... itself scaled to unit length where possible.
    normal_length = norm(normal)
    if abs(normal_length) < EPS:
        normal_length = 1.0
    for d in range(dim):
        normal[d] = normal[d] / normal_length

    # First pass: label every point with its side and count both sides.
    # Points that land on the hyperplane (within EPS) are assigned by coin flip.
    n_left = 0
    n_right = 0
    side = np.empty(n_points, np.int8)
    for i in range(n_points):
        point = indices[i]
        margin = 0.0
        for d in range(dim):
            margin += normal[d] * data[point, d]

        if abs(margin) < EPS:
            side[i] = tau_rand_int(rng_state) % 2
        elif margin > 0:
            side[i] = 0
        else:
            side[i] = 1

        if side[i] == 0:
            n_left += 1
        else:
            n_right += 1

    # Second pass: the counts are known, so allocate and fill the outputs.
    indices_left = np.empty(n_left, dtype=np.int32)
    indices_right = np.empty(n_right, dtype=np.int32)
    left_cursor = 0
    right_cursor = 0
    for i in range(side.shape[0]):
        if side[i] == 0:
            indices_left[left_cursor] = indices[i]
            left_cursor += 1
        else:
            indices_right[right_cursor] = indices[i]
            right_cursor += 1

    return indices_left, indices_right, normal, 0.0
def sparse_euclidean_random_projection_split(inds, indptr, data, indices, rng_state):
    """Split a set of sparse points in two with a random hyperplane.

    The points are rows of a sparse matrix given in CSR format as ``inds``,
    ``indptr`` and ``data``. Two of the rows named by ``indices`` are picked
    at random; the hyperplane normal is their difference and the offset
    places the hyperplane at their midpoint. Each point is then sent left
    or right according to the sign of its margin. This is the basic step of
    a random projection tree, which applies it recursively. This particular
    split uses the plain (unnormalised) difference of the two points, i.e.
    it is the Euclidean variant.

    Parameters
    ----------
    inds: array
        CSR format index array of the matrix

    indptr: array
        CSR format index pointer array of the matrix

    data: array
        CSR format data array of the matrix

    indices: array of shape (tree_node_size,)
        The row numbers of the matrix that are to be split in the current
        operation.

    rng_state: array of int64, shape (3,)
        The internal state of the rng

    Returns
    -------
    indices_left: array of int32
        The elements of ``indices`` that fall on the "left" side of the
        random hyperplane.

    indices_right: array of int32
        The elements of ``indices`` that fall on the "right" side of the
        random hyperplane.

    hyperplane: array with two rows
        The hyperplane's feature indices stacked on top of its values.

    hyperplane_offset: float
        The offset of the hyperplane.
    """
    n_points = indices.shape[0]

    # Pick two random points; on a collision step the second pick forward by
    # one, wrapping around.
    first_pick = np.abs(tau_rand_int(rng_state)) % n_points
    second_pick = np.abs(tau_rand_int(rng_state)) % n_points
    second_pick += first_pick == second_pick
    second_pick = second_pick % n_points
    left = indices[first_pick]
    right = indices[second_pick]

    left_inds = inds[indptr[left]:indptr[left + 1]]
    left_data = data[indptr[left]:indptr[left + 1]]
    right_inds = inds[indptr[right]:indptr[right + 1]]
    right_data = data[indptr[right]:indptr[right + 1]]

    # Normal vector: the difference of the two points.
    hyperplane_inds, hyperplane_data = sparse_diff(
        left_inds, left_data, right_inds, right_data
    )

    # Offset: minus the dot product of the normal with the midpoint.
    midpoint_inds, midpoint_data = sparse_sum(
        left_inds, left_data, right_inds, right_data
    )
    midpoint_data = midpoint_data / 2.0
    midpoint_inds, midpoint_data = sparse_mul(
        hyperplane_inds,
        hyperplane_data,
        midpoint_inds,
        midpoint_data.astype(np.float32),
    )
    hyperplane_offset = 0.0
    for val in midpoint_data:
        hyperplane_offset -= val

    # First pass: label every point with its side and count both sides.
    # Points that land on the hyperplane (within EPS) are assigned by coin flip.
    n_left = 0
    n_right = 0
    side = np.empty(n_points, np.int8)
    for i in range(n_points):
        row_start = indptr[indices[i]]
        row_end = indptr[indices[i] + 1]
        i_inds = inds[row_start:row_end]
        i_data = data[row_start:row_end]

        _, mul_data = sparse_mul(hyperplane_inds, hyperplane_data, i_inds, i_data)
        margin = hyperplane_offset
        for val in mul_data:
            margin += val

        if abs(margin) < EPS:
            side[i] = abs(tau_rand_int(rng_state)) % 2
        elif margin > 0:
            side[i] = 0
        else:
            side[i] = 1

        if side[i] == 0:
            n_left += 1
        else:
            n_right += 1

    # Second pass: the counts are known, so allocate and fill the outputs.
    indices_left = np.empty(n_left, dtype=np.int32)
    indices_right = np.empty(n_right, dtype=np.int32)
    left_cursor = 0
    right_cursor = 0
    for i in range(side.shape[0]):
        if side[i] == 0:
            indices_left[left_cursor] = indices[i]
            left_cursor += 1
        else:
            indices_right[right_cursor] = indices[i]
            right_cursor += 1

    hyperplane = np.vstack((hyperplane_inds, hyperplane_data))

    return indices_left, indices_right, hyperplane, hyperplane_offset
def sparse_angular_random_projection_split(inds, indptr, data, indices, rng_state):
    """Split a set of sparse points in two with a random hyperplane through the origin.

    The points are rows of a sparse matrix given in CSR format as ``inds``,
    ``indptr`` and ``data``. Two of the rows named by ``indices`` are picked
    at random, scaled to unit norm, and the hyperplane normal is set to
    their (normalised) difference. Each point is then sent left or right
    according to the sign of its projection onto that normal. This is the
    basic step of a random projection tree, which applies it recursively.
    This particular split uses cosine distance to determine the hyperplane
    and which side each data sample falls on.

    Parameters
    ----------
    inds: array
        CSR format index array of the matrix

    indptr: array
        CSR format index pointer array of the matrix

    data: array
        CSR format data array of the matrix

    indices: array of shape (tree_node_size,)
        The row numbers of the matrix that are to be split in the current
        operation.

    rng_state: array of int64, shape (3,)
        The internal state of the rng

    Returns
    -------
    indices_left: array of int64
        The elements of ``indices`` that fall on the "left" side of the
        random hyperplane.

    indices_right: array of int64
        The elements of ``indices`` that fall on the "right" side of the
        random hyperplane.

    hyperplane: array with two rows
        The hyperplane's feature indices stacked on top of its values.

    offset: None
        This split has no offset; ``None`` is returned in its place.
    """
    n_points = indices.shape[0]

    # Pick two random points; on a collision step the second pick forward by
    # one, wrapping around.
    first_pick = tau_rand_int(rng_state) % n_points
    second_pick = tau_rand_int(rng_state) % n_points
    second_pick += first_pick == second_pick
    second_pick = second_pick % n_points
    left = indices[first_pick]
    right = indices[second_pick]

    left_inds = inds[indptr[left]:indptr[left + 1]]
    left_data = data[indptr[left]:indptr[left + 1]]
    right_inds = inds[indptr[right]:indptr[right + 1]]
    right_data = data[indptr[right]:indptr[right + 1]]

    left_norm = norm(left_data)
    right_norm = norm(right_data)
    # A (near-)zero row cannot be rescaled; leave it as it is.
    if abs(left_norm) < EPS:
        left_norm = 1.0
    if abs(right_norm) < EPS:
        right_norm = 1.0

    # The normal is the vector between the two rescaled points ...
    unit_left = left_data / left_norm
    unit_right = right_data / right_norm
    hyperplane_inds, hyperplane_data = sparse_diff(
        left_inds, unit_left, right_inds, unit_right
    )

    # ... itself scaled to unit length where possible.
    hyperplane_norm = norm(hyperplane_data)
    if abs(hyperplane_norm) < EPS:
        hyperplane_norm = 1.0
    for d in range(hyperplane_data.shape[0]):
        hyperplane_data[d] = hyperplane_data[d] / hyperplane_norm

    # First pass: label every point with its side and count both sides.
    # Points that land on the hyperplane (within EPS) are assigned by coin flip.
    n_left = 0
    n_right = 0
    side = np.empty(n_points, np.int8)
    for i in range(n_points):
        row_start = indptr[indices[i]]
        row_end = indptr[indices[i] + 1]
        i_inds = inds[row_start:row_end]
        i_data = data[row_start:row_end]

        _, mul_data = sparse_mul(hyperplane_inds, hyperplane_data, i_inds, i_data)
        margin = 0.0
        for d in range(mul_data.shape[0]):
            margin += mul_data[d]

        if abs(margin) < EPS:
            side[i] = tau_rand_int(rng_state) % 2
        elif margin > 0:
            side[i] = 0
        else:
            side[i] = 1

        if side[i] == 0:
            n_left += 1
        else:
            n_right += 1

    # Second pass: the counts are known, so allocate and fill the outputs.
    indices_left = np.empty(n_left, dtype=np.int64)
    indices_right = np.empty(n_right, dtype=np.int64)
    left_cursor = 0
    right_cursor = 0
    for i in range(side.shape[0]):
        if side[i] == 0:
            indices_left[left_cursor] = indices[i]
            left_cursor += 1
        else:
            indices_right[right_cursor] = indices[i]
            right_cursor += 1

    hyperplane = np.vstack((hyperplane_inds, hyperplane_data))

    return indices_left, indices_right, hyperplane, None