def hnswlibTok(X, eps, min_Pts):
    """Find the eps-neighbourhood of every sample and the set of core samples.

    The name is kept for existing callers; the lookup itself is done with a
    ``KDTree`` radius query, not with hnswlib.

    Args:
        X: 2-D array of samples, one row per sample.
        eps: Neighbourhood radius. A sample ``q`` belongs to the neighbourhood
            of ``p`` when the Euclidean distance between them is ``<= eps``.
        min_Pts: Minimum neighbourhood size (the sample itself included) for a
            sample to count as a core sample.

    Returns:
        A tuple ``(neighbor_list, omega_list)``:
        ``neighbor_list[i]`` is the set of row indices within ``eps`` of row
        ``i`` (always containing ``i``), and ``omega_list`` is the set of row
        indices whose neighbourhood has at least ``min_Pts`` members.
    """
    tree = KDTree(X, leaf_size=50)

    # A radius query returns exactly the points with distance <= eps, so there
    # is no need to fetch all len(X) neighbours of every point and re-measure
    # the distances by hand.
    within_eps = tree.query_radius(X, r=eps)

    neighbor_list = []
    omega_list = set()  # core samples
    for row, found in enumerate(within_eps):
        neighbours = set(found)
        # A sample is always part of its own neighbourhood. This is a no-op
        # for eps >= 0 and keeps the previous guarantee otherwise.
        neighbours.add(row)
        neighbor_list.append(neighbours)
        # Use the row index itself as the sample id: with duplicated points
        # the first hit of a nearest-neighbour query may be another row.
        if len(neighbours) >= min_Pts:
            omega_list.add(row)
    return neighbor_list, omega_list
def metric(self, X, Y, n_features=None, dist_func=euclidean):
    """Nearest-neighbour distance between two point sets, reduced by ``self.linkage``.

    Both inputs are reshaped to ``(-1, n_features)``. A KDTree is built on the
    input with fewer (or equally many) elements, and it is queried with the
    other input; the resulting nearest-neighbour distances are passed to
    ``self.linkage``.

    ``dist_func`` is accepted but not used by this implementation.
    """
    # The longer input supplies the query points, the other one the tree.
    if len(X) > len(Y):
        query_points, tree_points = X, Y
    else:
        query_points, tree_points = Y, X

    query_points = query_points.reshape(-1, n_features)
    tree_points = tree_points.reshape(-1, n_features)

    nearest_distances, _ = KDTree(tree_points).query(query_points)
    return self.linkage(nearest_distances)
def test_kd_tree_two_point(dualtree):
    """Compare ``two_point_correlation`` with counts from a full distance matrix."""
    n_samples, n_features = 100, 3
    shape = (n_samples, n_features)

    rng = check_random_state(0)
    X = rng.random_sample(shape)
    Y = rng.random_sample(shape)
    radii = np.linspace(0, 1, 10)

    tree = KDTree(X, leaf_size=10)

    # Reference: count the pairs within each radius by brute force.
    all_distances = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    expected_counts = [(all_distances <= radius).sum() for radius in radii]

    actual_counts = tree.two_point_correlation(Y, r=radii, dualtree=dualtree)
    assert_array_almost_equal(actual_counts, expected_counts)
def get_bags_of_words(image_paths):
    """Build a bag-of-words histogram of SIFT descriptors for each image.

    The vocabulary is read from ``vocab.npy`` in the current working
    directory (one row per vocabulary word). For every image, SIFT descriptors
    are extracted, each descriptor is assigned to its closest vocabulary word,
    and the histogram bin of that word is incremented. For example, if the
    closest word to a descriptor is word 103, bin 103 is incremented.

    The descriptor settings must match the ones used when the vocabulary was
    built; otherwise the descriptors and the vocabulary live in different
    feature spaces and nothing will match.

    Inputs:
        image_paths: A Python list of strings, where each string is a complete
            path to one image on the disk.

    Outputs:
        A list with one entry per image, in the order of ``image_paths``. Each
        entry is a list of ``d`` integer counts, where ``d`` is the number of
        vocabulary words. ``np.asarray`` turns the result into an n x d matrix.
        An image without any SIFT descriptor yields an all-zero histogram.
    """
    vocab = np.load('vocab.npy')
    print('Loaded vocab from file.')

    # One histogram bin per vocabulary word (not per image).
    vocab_size = len(vocab)
    tree = KDTree(vocab)
    sift = cv2.xfeatures2d.SIFT_create()

    histograms = []
    for image_path in tqdm(image_paths, desc='SIFT'):
        image = cv2.imread(image_path)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, descriptors = sift.detectAndCompute(gray, None)

        # detectAndCompute returns None when no keypoint is found.
        if descriptors is None or len(descriptors) == 0:
            histograms.append([0] * vocab_size)
            continue

        _, nearest_ind = tree.query(descriptors, k=1)
        # Flatten so this works whether the tree returns shape (n,) or (n, 1).
        word_ids = np.asarray(nearest_ind).ravel()
        counts = np.bincount(word_ids, minlength=vocab_size)
        histograms.append(counts.tolist())
    return histograms
def test_gaussian_kde(n_samples=1000):
    """Check KDTree kernel density estimates against scipy.stats.gaussian_kde."""
    from scipy.stats import gaussian_kde

    rng = check_random_state(0)
    train = rng.normal(0, 1, n_samples)
    grid = np.linspace(-5, 5, 30)

    for bandwidth in (0.01, 0.1, 1):
        tree = KDTree(train[:, None])
        # scipy scales its bandwidth by the sample standard deviation.
        reference = gaussian_kde(train, bw_method=bandwidth / np.std(train))

        # kernel_density is not normalised by the number of samples.
        tree_density = tree.kernel_density(grid[:, None], bandwidth) / n_samples
        reference_density = reference.evaluate(grid)

        assert_array_almost_equal(tree_density, reference_density, decimal=3)
def test_kd_tree_query_radius(n_samples=100, n_features=10):
    """Check ``query_radius`` indices against a brute-force distance filter."""
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    tree = KDTree(X, leaf_size=5)
    norms = np.sqrt(((X - origin) ** 2).sum(1))

    for radius in np.linspace(norms[0], norms[-1], 100):
        padded_radius = radius + eps
        found = np.sort(tree.query_radius([origin], padded_radius)[0])
        expected = np.sort(np.flatnonzero(norms <= padded_radius))

        assert_array_almost_equal(expected, found)
def test_array_object_type():
    """Check that we do not accept object dtype array."""
    ragged_rows = [(1, 2, 3), (2, 5), (5, 5, 1, 2)]
    X = np.array(ragged_rows, dtype=object)

    expected_message = "setting an array element with a sequence"
    with pytest.raises(ValueError, match=expected_message):
        KDTree(X)
def test_kd_tree_query_radius_distance(n_samples=100, n_features=10):
    """Check the distances returned by ``query_radius`` against direct computation."""
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    tree = KDTree(X, leaf_size=5)
    norms = np.sqrt(((X - origin) ** 2).sum(1))

    for radius in np.linspace(norms[0], norms[-1], 100):
        indices, distances = tree.query_radius(
            [origin], radius + eps, return_distance=True
        )
        # Only one query point was passed, so unwrap its single result.
        indices, distances = indices[0], distances[0]

        expected = np.sqrt(((origin - X[indices]) ** 2).sum(1))
        assert_array_almost_equal(expected, distances)
def median_smallest_distance(points, tree=None):
    """Median over all points of the distance to their closest neighbor.

    This gives an idea of the "grid size" of a point dataset.

    When no ``tree`` is given, the points are de-duplicated and a KDTree is
    built from them. The estimate uses a fixed-seed random sample of at most
    100 points.
    """
    points = numpy.array(points)
    if tree is None:
        # numpy.unique(points, axis=0) would also work, but is too slow
        distinct_rows = {tuple(row) for row in points}
        points = numpy.array(list(distinct_rows))
        tree = KDTree(points)

    # Draw a reproducible sample of points to query
    n_points = len(points)
    generator = numpy.random.RandomState(89)
    chosen = generator.choice(n_points, min(n_points, 100), replace=False)

    # Ask for two neighbors: when the tree holds the sampled points, the
    # first hit is the point itself and the second is its closest neighbor.
    nearest, _ = tree.query(points[chosen], k=2, return_distance=True)

    return numpy.median(nearest[:, 1])
def test_kdtree_picklable_with_joblib():
    """Make sure that KDTree queries work when joblib memmaps.

    Non-regression test for #21685 and #21228."""
    X = np.random.RandomState(0).random_sample((10, 3))
    tree = KDTree(X, leaf_size=2)

    # max_nbytes=1 makes joblib hand the data to the workers as a read-only
    # memory map, which used to raise
    # "ValueError: buffer source array is read-only" in a previous version of
    # the Cython code.
    run_in_workers = Parallel(n_jobs=2, max_nbytes=1)
    query = delayed(tree.query)
    run_in_workers(query(data) for data in [X, X])
def test_kd_tree_kde(kernel, h):
    """Check tree-based kernel density against the slow reference computation.

    Every combination of relative tolerance, absolute tolerance and traversal
    order is checked.
    """
    n_samples, n_features = 100, 3
    shape = (n_samples, n_features)

    rng = check_random_state(0)
    X = rng.random_sample(shape)
    Y = rng.random_sample(shape)

    tree = KDTree(X, leaf_size=10)
    expected_density = compute_kernel_slow(Y, X, kernel, h)

    relative_tolerances = (0, 1E-5)
    absolute_tolerances = (1E-6, 1E-2)
    traversal_orders = (True, False)  # breadth-first, then depth-first

    for rtol in relative_tolerances:
        for atol in absolute_tolerances:
            for breadth_first in traversal_orders:
                check_results(
                    kernel, h, atol, rtol, breadth_first,
                    Y, tree, expected_density,
                )
def prepare_data():
    """Load the actors KDTree into the module-level ``__tree``, building it if needed.

    The tree is read from the pickle cache under ``paths.models``. When the
    cache cannot be opened or read (``OSError``), the tree is rebuilt from the
    latent vectors of the first 20000 actors and written back to the cache.
    """
    global __tree
    cache_path = paths.models + 'alternative_actors/actors_kdtree.pkl'
    try:
        # NOTE(review): pickle must only be used on trusted files; this
        # assumes the cache is always produced locally by this function.
        with open(cache_path, "rb") as cache_file:
            __tree = pickle.load(cache_file)
    except OSError:
        sample_size = 20000
        latent_vector_generator, actors_id = get_latent_vector_generator()
        actors_id = actors_id[0:sample_size]
        vectors = np.array(
            [latent_vector_generator(actor_id) for actor_id in actors_id])
        __tree = KDTree(vectors, leaf_size=2)
        # Close the file deterministically so the cache is fully flushed.
        with open(cache_path, "wb") as cache_file:
            pickle.dump(__tree, cache_file)
def join(
    original_data, augment_data_path, original_metadata, augment_metadata,
    writer,
    left_columns, right_columns,
    how='left', columns=None,
    agg_functions=None, temporal_resolution=None,
):
    """Join ``original_data`` with the augmentation CSV and write the result.

    The augmentation data is streamed from ``augment_data_path`` in chunks of
    ``CHUNK_SIZE_ROWS`` rows; each chunk is joined against the original data,
    the chunks are concatenated, aggregated with ``perform_aggregations`` and
    written as CSV through ``writer.open_file('w')``.

    Args:
        original_data: A ``pandas.DataFrame``, or an object with a ``read``
            attribute that is handed to ``pandas.read_csv``.
        augment_data_path: Source of the augmentation data, handed to
            ``pandas.read_csv``.
        original_metadata: Metadata of the original data; its ``'columns'``
            entry is used.
        augment_metadata: Metadata of the augmentation data; its ``'columns'``
            entry is used.
        writer: Object whose ``open_file('w')`` returns the output file.
        left_columns: Join columns of the original data, one group of column
            positions per join key.
        right_columns: Join columns of the augmentation data, paired with
            ``left_columns``.
        how: Join type passed to ``DataFrame.join``.
        columns: Optional selection of augmentation columns (used as
            positions into the augmentation column names).
        agg_functions: Passed through to ``perform_aggregations``.
        temporal_resolution: Passed through to ``match_temporal_resolutions``.

    Returns:
        A dict with the keys ``'columns'`` (metadata for each output column),
        ``'size'`` (taken from the write counter) and ``'qualities'``.

    Raises:
        TypeError: If ``original_data`` is neither a DataFrame nor file-like.
        AugmentationError: If a join key pairs groups of more than one column
            that are not both of length 2, or if the augmentation data is
            empty.
    """
    if isinstance(original_data, pd.DataFrame):
        pass
    elif hasattr(original_data, 'read'):
        # NOTE(review): error_bad_lines was deprecated in pandas 1.3 and
        # removed in 2.0 (on_bad_lines replaces it) -- confirm the pinned
        # pandas version.
        original_data = pd.read_csv(
            original_data,
            error_bad_lines=False,
            dtype=str,
        )
    else:
        raise TypeError(
            "join() argument 1 should be a file or a DataFrame, got "
            "%r" % type(original_data))

    augment_data_columns = [col['name'] for col in augment_metadata['columns']]

    # only converting data types for columns involved in augmentation
    original_join_columns_idx = []
    augment_join_columns_idx = []
    # (column positions, callable) pairs applied to every augmentation chunk
    augment_columns_transform = []
    for left, right in zip(left_columns, right_columns):
        if len(left) == 2 and len(right) == 2:
            # Spatial augmentation
            # Get those columns
            points = original_data.iloc[:, left]
            # De-duplicate
            points = pd.DataFrame(list(set(tuple(p) for p in points.values)))
            # Convert to numeric numpy array
            points = pd.DataFrame({
                'x': pd.to_numeric(
                    points.iloc[:, 0],
                    errors='coerce',
                    downcast='float',
                ),
                'y': pd.to_numeric(
                    points.iloc[:, 1],
                    errors='coerce',
                    downcast='float',
                ),
            }).values
            # Build KDTree
            tree = KDTree(points)
            # Compute max distance for nearest join
            max_dist = 2 * median_smallest_distance(points, tree)
            logger.info("Using nearest spatial join, max=%r", max_dist)
            # Store transformation
            augment_columns_transform.append((
                right,
                _tree_nearest(tree, max_dist),
            ))

            original_join_columns_idx.extend(left)
            augment_join_columns_idx.extend(right)
        elif len(left) > 1 or len(right) > 1:
            raise AugmentationError("Datamart currently does not support "
                                    "combination of columns for augmentation.")
        else:
            original_join_columns_idx.append(left[0])
            augment_join_columns_idx.append(right[0])

    original_data = set_data_index(
        original_data,
        original_join_columns_idx,
        original_metadata['columns'],
        drop=False,  # Keep the values of join columns from this side
    )

    # Add a column of unique indices which will be used to aggregate
    original_data[UNIQUE_INDEX_KEY] = pd.RangeIndex(len(original_data))

    logger.info("Performing join...")

    # Stream the data in
    augment_data_chunks = pd.read_csv(
        augment_data_path,
        error_bad_lines=False,
        chunksize=CHUNK_SIZE_ROWS,
    )
    # Read the first chunk eagerly so empty augmentation data fails early
    try:
        first_augment_data = next(augment_data_chunks)
    except StopIteration:
        raise AugmentationError("Empty augmentation data")

    # Columns to drop
    # NOTE(review): the first set holds column *names* (looked up by position
    # in augment_data_columns) while the subtracted sets appear to hold
    # column *positions*, so the difference may remove nothing and leave the
    # requested columns themselves in drop_columns -- confirm the intent.
    drop_columns = None
    if columns:
        drop_columns = list(
            # Drop all the columns in augment_data
            set(augment_data_columns[c] for c in columns)
            # except
            - (
                # the requested columns
                set(columns)
                # and the join columns
                | {col[0] for col in right_columns}))

    # Defer temporal alignment until reading the first block from companion
    # (and converting it to the right data types!)
    update_idx = None
    original_data_res = None

    # Streaming join
    start = time.perf_counter()
    join_ = []
    # Iterate over chunks of augment data
    for augment_data in itertools.chain([first_augment_data], augment_data_chunks):
        # Run transforms
        for cols, transform in augment_columns_transform:
            augment_data.iloc[:, cols] = transform(augment_data.iloc[:, cols])

        # Convert data types
        augment_data = set_data_index(
            augment_data,
            augment_join_columns_idx,
            augment_metadata['columns'],
            drop=True,  # Drop the join columns on that side (avoid duplicates)
        )

        if update_idx is None:
            # Guess temporal resolutions (on first chunk)
            update_idx = match_temporal_resolutions(
                original_data,
                augment_data,
                temporal_resolution,
            )
            # Re-index the original data once; reused for every chunk
            original_data_res = original_data.set_index(
                update_idx(original_data.index))

        # Match temporal resolutions
        augment_data.index = update_idx(augment_data.index)

        # Filter columns
        if drop_columns:
            augment_data = augment_data.drop(drop_columns, axis=1)

        # Join
        joined_chunk = original_data_res.join(augment_data, how=how, rsuffix='_r')

        # Drop the join columns we set as index
        joined_chunk.reset_index(drop=True, inplace=True)

        join_.append(joined_chunk)

    join_ = pd.concat(join_)
    logger.info("Join completed in %.4fs", time.perf_counter() - start)

    # Column names present on both sides; the join suffixes these with '_r'
    intersection = set(original_data.columns).intersection(
        set(first_augment_data.columns))

    # qualities
    qualities_list = []

    # map column names for the augmentation data
    augment_columns_map = {
        name: name + '_r' if name in intersection else name
        for name in first_augment_data.columns
    }

    # aggregations
    join_ = perform_aggregations(
        join_,
        list(original_data.columns),
        agg_functions,
        augment_columns_map,
    )

    # drop unique index
    join_.drop([UNIQUE_INDEX_KEY], axis=1, inplace=True)

    original_columns_set = set(original_data.columns)
    new_columns = [
        col for col in join_.columns if col not in original_columns_set
    ]
    qualities_list.append(
        dict(qualName='augmentation_info',
             qualValue=dict(new_columns=new_columns,
                            removed_columns=[],
                            nb_rows_before=original_data.shape[0],
                            nb_rows_after=join_.shape[0],
                            augmentation_type='join'),
             qualValueType='dict'))

    # NOTE(review): to_csv's line_terminator was renamed lineterminator in
    # pandas 1.5 and removed in 2.0 -- confirm the pinned pandas version.
    with WriteCounter(writer.open_file('w')) as fout:
        join_.to_csv(fout, index=False, line_terminator='\r\n')
        size = fout.size

    # Build a dict of information about all columns
    # It is keyed by every name an augmentation column can end up with:
    # plain or '_r'-suffixed, optionally prefixed by an aggregation name.
    columns_metadata = dict()
    for column in augment_metadata['columns']:
        for agg in itertools.chain(
            (None, ), AGGREGATION_FUNCTIONS,
        ):
            for name in (column['name'], column['name'] + '_r'):
                column_metadata = {
                    k: v for k, v in column.items()
                    if k in KEEP_COLUMN_FIELDS
                }
                if agg is not None:
                    name = agg + ' ' + name
                column_metadata['name'] = name
                # Aggregated columns get a fixed structural type and lose
                # their semantic types
                if agg in {'sum', 'mean'}:
                    column_metadata['structural_type'] = types.FLOAT
                    column_metadata['semantic_types'] = []
                elif agg == 'count':
                    column_metadata['structural_type'] = types.INTEGER
                    column_metadata['semantic_types'] = []
                columns_metadata[name] = column_metadata
    # Original columns are added last, so they win on a name clash
    for column in original_metadata['columns']:
        columns_metadata[column['name']] = column

    # Then construct column metadata by looking them up in the dict
    columns_metadata = [columns_metadata[name] for name in join_.columns]

    return {
        'columns': columns_metadata,
        'size': size,
        'qualities': qualities_list,
    }
def fit(self, X, y):
    """Resolve metric and algorithm settings, then dispatch the actual fit.

    The method first derives ``effective_metric_`` and
    ``effective_metric_params_`` from ``metric``, ``p`` and ``metric_params``,
    then records the training data and the fit method (``"brute"``,
    ``"kd_tree"`` or ``"ball_tree"``). ``X`` may be raw data, a ``KDTree``, a
    ``BallTree`` or an already fitted neighbors estimator. Finally the fit is
    handed to ``dispatch`` with the onedal and sklearn implementations.

    Returns:
        ``self``.

    Raises:
        ValueError: If the minkowski ``p`` is smaller than one, or if the fit
            method taken from a patched estimator is not recognized.
    """
    if Version(sklearn_version) >= Version("1.0"):
        self._check_feature_names(X, reset=True)
    # A "p" given through metric_params takes precedence over self.p
    if self.metric_params is not None and 'p' in self.metric_params:
        if self.p is not None:
            warnings.warn(
                "Parameter p is found in metric_params. "
                "The corresponding parameter from __init__ "
                "is ignored.", SyntaxWarning, stacklevel=2)
        self.effective_metric_params_ = self.metric_params.copy()
        effective_p = self.metric_params["p"]
    else:
        self.effective_metric_params_ = {}
        effective_p = self.p

    if self.metric in ["minkowski"]:
        if effective_p < 1:
            raise ValueError(
                "p must be greater or equal to one for minkowski metric")
        self.effective_metric_params_["p"] = effective_p

    self.effective_metric_ = self.metric
    # For minkowski distance, use more efficient methods where available
    if self.metric == "minkowski":
        p = self.effective_metric_params_.pop("p", 2)
        # NOTE(review): p was already validated just above for minkowski, so
        # this second check looks redundant -- confirm before removing.
        if p < 1:
            raise ValueError(
                "p must be greater or equal to one for minkowski metric")
        if p == 1:
            self.effective_metric_ = "manhattan"
        elif p == 2:
            self.effective_metric_ = "euclidean"
        elif p == np.inf:
            self.effective_metric_ = "chebyshev"
        else:
            self.effective_metric_params_["p"] = p

    # Note: this overwrites the constructor parameter p on the estimator
    if self.metric == "manhattan":
        self.p = 1

    # Raw data: validate it and pick the fit method here. Tree and estimator
    # inputs are handled further down.
    if not isinstance(X, (KDTree, BallTree, sklearn_NeighborsBase)):
        self._fit_X = _check_array(X, dtype=[np.float64, np.float32], accept_sparse=True)
        self.n_samples_fit_ = _num_samples(self._fit_X)
        self.n_features_in_ = _num_features(self._fit_X)

        if self.algorithm == "auto":
            # A tree approach is better for small number of neighbors or small
            # number of features, with KDTree generally faster when available
            is_n_neighbors_valid_for_brute = self.n_neighbors is not None and \
                self.n_neighbors >= self._fit_X.shape[0] // 2
            if self._fit_X.shape[1] > 15 or is_n_neighbors_valid_for_brute:
                self._fit_method = "brute"
            else:
                if self.effective_metric_ in VALID_METRICS["kd_tree"]:
                    self._fit_method = "kd_tree"
                elif callable(self.effective_metric_) or \
                        self.effective_metric_ in \
                        VALID_METRICS["ball_tree"]:
                    self._fit_method = "ball_tree"
                else:
                    self._fit_method = "brute"
        else:
            self._fit_method = self.algorithm

    # Discard the onedal estimator left over from a previous fit, if any
    if hasattr(self, '_onedal_estimator'):
        delattr(self, '_onedal_estimator')
    # To cover test case when we pass patched
    # estimator as an input for other estimator
    if isinstance(X, sklearn_NeighborsBase):
        # Copy the fitted state of the given estimator
        self._fit_X = X._fit_X
        self._tree = X._tree
        self._fit_method = X._fit_method
        self.n_samples_fit_ = X.n_samples_fit_
        self.n_features_in_ = X.n_features_in_
        if hasattr(X, '_onedal_estimator'):
            # NOTE(review): the tree is rebuilt on X (the input estimator)
            # after self._tree was already copied from it -- presumably so
            # the sklearn code path finds a tree on X; confirm.
            if self._fit_method == "ball_tree":
                X._tree = BallTree(
                    X._fit_X,
                    self.leaf_size,
                    metric=self.effective_metric_,
                    **self.effective_metric_params_,
                )
            elif self._fit_method == "kd_tree":
                X._tree = KDTree(
                    X._fit_X,
                    self.leaf_size,
                    metric=self.effective_metric_,
                    **self.effective_metric_params_,
                )
            elif self._fit_method == "brute":
                X._tree = None
            else:
                raise ValueError("algorithm = '%s' not recognized" % self.algorithm)

    elif isinstance(X, BallTree):
        # A prebuilt tree fixes both the training data and the fit method
        self._fit_X = X.data
        self._tree = X
        self._fit_method = 'ball_tree'
        self.n_samples_fit_ = X.data.shape[0]
        self.n_features_in_ = X.data.shape[1]

    elif isinstance(X, KDTree):
        self._fit_X = X.data
        self._tree = X
        self._fit_method = 'kd_tree'
        self.n_samples_fit_ = X.data.shape[0]
        self.n_features_in_ = X.data.shape[1]

    # Hand over to the backend chosen by dispatch
    dispatch(self, 'neighbors.KNeighborsClassifier.fit', {
        'onedal': self.__class__._onedal_fit,
        'sklearn': sklearn_KNeighborsClassifier.fit,
    }, X, y)
    return self