def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
                            metric='minkowski', p=2, leaf_size=40,
                            gen_min_span_tree=False, **kwargs):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    # The Cython routines used require contiguous arrays
    if not X.flags['C_CONTIGUOUS']:
        X = np.array(X, dtype=np.double, order='C')

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)

    dist_metric = DistanceMetric.get_metric(metric, **kwargs)

    # Get distance to kth nearest neighbour
    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1].copy(order='C')

    # Mutual reachability distance is implicit in mst_linkage_core_vector
    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)

    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
def DualTree(dataFlux, dDataFlux, modelFlux, modelParams, mcIts, columnsToScale=[]):
    '''
    Inputs:
        dataFlux = observed fluxes, array of size (#objects,#filters)
        dDataFlux = flux uncertainties, array of size (#objects,#filters)
        modelFlux = fluxes of models, array of size (#models,#filters)
        modelParams = parameters of each model to be recorded,
                      array of size (#models,#parameters)
        mcIts = number of times to perturb fluxes for each object, int
        columnsToScale = list of column indices in modelParams of parameters
                         that need to be multiplied by scale factor

    Output:
        NumPy array of size (#objects,mcIts,#params)
        e.g. the zeroth element gives you a 2d array where each row represents
        the fit parameters from one monte carlo iteration
    '''
    modelColors = modelFlux[:, 1:] / modelFlux[:, :-1]
    tree = BallTree(modelColors)
    fitParams = []
    for i in range(len(dataFlux)):
        newFlux = dataFlux[i] + dDataFlux[i] * np.random.randn(mcIts, len(dataFlux[i]))
        newColors = newFlux[:, 1:] / newFlux[:, :-1]
        query = tree.query(newColors, k=1, dualtree=True)
        s = fit_tools.Scale(modelFlux[query[1][:, 0]], newFlux, np.ones(np.shape(newFlux)))
        myParams = s
        for j in range(len(modelParams[0])):
            if j in columnsToScale:
                myParams = np.c_[myParams, np.multiply(s, modelParams[query[1][:, 0]][:, j])]
            else:
                myParams = np.c_[myParams, modelParams[query[1][:, 0]][:, j]]
        fitParams.append(myParams)
    return np.array(fitParams)
def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
                            metric='minkowski', p=2, leaf_size=40,
                            gen_min_span_tree=False):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size)

    dist_metric = DistanceMetric.get_metric(metric)

    # Get distance to kth nearest neighbour
    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1]

    # Mutual reachability distance is implicit in mst_linkage_core_cdist
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)

    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
def correrPruebaLocal(set_ampliado):
    print "running local test"
    train, targetTrain, test, targetTest = cargarDatosPruebaLocal(set_ampliado, 0.66)
    tree = BallTree(train, leaf_size=30)
    predictions = []
    correctas = 0
    incorrectas = 0
    for x in range(len(test)):
        dist, ind = tree.query(test[x], k=4)
        resultado = obtenerPrediccionknnEB(train, targetTrain, test[x], ind.ravel())
        predictions.append(resultado)
        print progreso(x, len(test))
        if resultado == targetTest[x]:
            correctas += 1
        else:
            incorrectas += 1
        print "Predictions --> Correct: " + str(correctas) + " Incorrect: " + str(incorrectas) + " Total: " + str(len(test))
        print('> predicted=' + repr(resultado) + ', actual=' + repr(targetTest[x]) + ' ' + progreso(x, len(test)))
    print "total accuracy"
    correct = 0
    for x in range(len(test)):
        if targetTest[x] == predictions[x]:
            correct += 1
    print (float(correct) / float(len(test))) * 100.0
def knn(a, b, k=1):
    "k nearest neighbors of point a among the rows of b (last entry of each row is a label; k was an implicit global in the original)"
    b = np.array([bb[:-1] for bb in b])
    tree = BallTree(b)
    __, indx = tree.query([a[:-1]], k=k)
    return [b[i] for i in indx[0]]
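# A minimal sanity check of the knn helper above (hedged sketch: the data and
# the numeric "label" column are made up purely for illustration, and
# scikit-learn/NumPy are assumed to be installed).
import numpy as np
from sklearn.neighbors import BallTree

labelled = np.array([[0.0, 0.0, 0], [1.0, 1.0, 1], [5.0, 5.0, 2]], dtype=float)
query_point = np.array([0.2, 0.1, -1.0])
# Expect the two feature vectors closest to (0.2, 0.1), ignoring the label column.
print(knn(query_point, labelled, k=2))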
def test_barnes_hut_angle():
    # When Barnes-Hut's angle=0 this corresponds to the exact method.
    angle = 0.0
    perplexity = 10
    n_samples = 100
    for n_components in [2, 3]:
        n_features = 5
        degrees_of_freedom = float(n_components - 1.0)

        random_state = check_random_state(0)
        distances = random_state.randn(n_samples, n_features)
        distances = distances.astype(np.float32)
        distances = distances.dot(distances.T)
        np.fill_diagonal(distances, 0.0)
        params = random_state.randn(n_samples, n_components)
        P = _joint_probabilities(distances, perplexity, False)
        kl, gradex = _kl_divergence(params, P, degrees_of_freedom, n_samples,
                                    n_components)

        k = n_samples - 1
        bt = BallTree(distances)
        distances_nn, neighbors_nn = bt.query(distances, k=k + 1)
        neighbors_nn = neighbors_nn[:, 1:]
        Pbh = _joint_probabilities_nn(distances, neighbors_nn,
                                      perplexity, False)
        kl, gradbh = _kl_divergence_bh(params, Pbh, neighbors_nn,
                                       degrees_of_freedom, n_samples,
                                       n_components, angle=angle,
                                       skip_num_points=0, verbose=False)
        assert_array_almost_equal(Pbh, P, decimal=5)
        assert_array_almost_equal(gradex, gradbh, decimal=5)
def DualTree(dataFlux, dDataFlux, modelFlux, modelParams, mcIts):
    """
    Inputs:
        dataFlux = observed fluxes, array of size (#objects,#filters)
        dDataFlux = flux uncertainties, array of size (#objects,#filters)
        modelFlux = fluxes of models, array of size (#models,#filters)
        modelParams = parameters of each model to be recorded,
                      array of size (#models,#parameters)
        mcIts = number of times to perturb fluxes for each object, int

    Output:
        NumPy array of size (#objects,mcIts,#params)
        e.g. the zeroth element gives you a 2d array where each row represents
        the fit parameters from one monte carlo iteration
    """
    modelColors = modelFlux[:, 1:] / modelFlux[:, :-1]
    tree = BallTree(modelColors)
    fitParams = []
    for i in range(len(dataFlux)):
        newFlux = dataFlux[i] + dDataFlux[i] * np.random.randn(mcIts, len(dataFlux[i]))
        newColors = newFlux[:, 1:] / newFlux[:, :-1]
        query = tree.query(newColors, k=1, dualtree=True)
        s = Scale(modelFlux[query[1][:, 0]], newFlux, np.ones(np.shape(newFlux)))
        myParams = s
        for j in range(len(modelParams[0])):
            myParams = np.c_[myParams, modelParams[query[1][:, 0]][:, j]]
        fitParams.append(myParams)
    return np.array(fitParams)
def run_single_trial(self, train_pairs, test_pairs, train_tune_data, test_tune_data): print "Running PCA..." train_pairs_pca, test_pairs_pca = self.fit_pca(train_pairs, test_pairs) ys = ys_from_pairs(train_pairs_pca) file_id = str(random.random())[2:] save_cvx_params(ys, file_id) run_cvx(file_id) M = load_cvx_result(file_id) dist = DistanceMetric.get_metric('mahalanobis', VI = M) train_a_sections = [x[0] for x in train_pairs_pca] train_b_sections = [x[1] for x in train_pairs_pca] test_a_sections = [x[0] for x in test_pairs_pca] test_b_sections = [x[1] for x in test_pairs_pca] train_given_sections = train_a_sections train_to_match_sections = train_b_sections test_given_sections = test_a_sections test_to_match_sections = test_b_sections if self.match_a_to_b: train_given_sections = train_b_sections train_to_match_sections = train_a_sections test_given_sections = test_b_sections test_to_match_sections = test_a_sections print "Constructing BallTrees..." train_bt = BallTree(train_to_match_sections, metric=dist) test_bt = BallTree(test_to_match_sections, metric=dist) train_top_fraction = int(len(train_given_sections) * self.correct_within_top_fraction) test_top_fraction = int(len(test_given_sections) * self.correct_within_top_fraction) print "Querying the BallTrees..." train_result = train_bt.query(train_given_sections, train_top_fraction) test_result = test_bt.query(test_given_sections, test_top_fraction) print "Looking at correctness of results..." train_correct = sum([int(i in train_result[1][i]) for i in xrange(len(train_given_sections))]) test_correct = sum([int(i in test_result[1][i]) for i in xrange(len(test_given_sections))]) print "Finding indices of correct matches..." test_result_full = test_bt.query(test_given_sections, len(test_given_sections)) def default_index(lst, i): ind = -1 try: ind = lst.index(i) except: pass return ind test_indices = [default_index(list(test_result_full[1][i]), i) for i in xrange(len(test_given_sections))] test_indices = [x for x in test_indices if x != -1] with open("successful_tunes_{}".format(file_id), 'w') as successful_tunes_f: for i, index in enumerate(test_indices): if index == 0: successful_tunes_f.write(str(test_tune_data[i]) + '\n\n') return [[train_correct, len(train_given_sections)], [test_correct, len(test_given_sections)]], test_indices
def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
                            metric='minkowski', p=2, leaf_size=40,
                            gen_min_span_tree=False):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    dim = X.shape[0]
    min_samples = min(dim - 1, min_samples)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size)

    dist_metric = DistanceMetric.get_metric(metric)

    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1]

    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)

    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
def _compute_nearest(xhs, rr, use_balltree=True, return_dists=False):
    """Find nearest neighbors

    Note: The rows in xhs and rr must all be unit-length vectors, otherwise
    the result will be incorrect.

    Parameters
    ----------
    xhs : array, shape=(n_samples, n_dim)
        Points of data set.
    rr : array, shape=(n_query, n_dim)
        Points to find nearest neighbors for.
    use_balltree : bool
        Use fast BallTree based search from scikit-learn. If scikit-learn
        is not installed it will fall back to the slow brute force search.
    return_dists : bool
        If True, return associated distances.

    Returns
    -------
    nearest : array, shape=(n_query,)
        Index of nearest neighbor in xhs for every point in rr.
    distances : array, shape=(n_query,)
        The distances. Only returned if return_dists is True.
    """
    if use_balltree:
        try:
            from sklearn.neighbors import BallTree
        except ImportError:
            logger.info('Nearest-neighbor searches will be significantly '
                        'faster if scikit-learn is installed.')
            use_balltree = False

    if xhs.size == 0 or rr.size == 0:
        if return_dists:
            return np.array([], int), np.array([])
        return np.array([], int)

    if use_balltree is True:
        ball_tree = BallTree(xhs)
        if return_dists:
            out = ball_tree.query(rr, k=1, return_distance=True)
            return out[1][:, 0], out[0][:, 0]
        else:
            nearest = ball_tree.query(rr, k=1, return_distance=False)[:, 0]
            return nearest
    else:
        from scipy.spatial.distance import cdist
        if return_dists:
            nearest = list()
            dists = list()
            for r in rr:
                d = cdist(r[np.newaxis, :], xhs)
                idx = np.argmin(d)
                nearest.append(idx)
                dists.append(d[0, idx])
            return (np.array(nearest), np.array(dists))
        else:
            nearest = np.array([np.argmin(cdist(r[np.newaxis, :], xhs))
                                for r in rr])
            return nearest
def compute_labels(X, C):
    """Compute the cluster labels for dataset X given centers C.
    """
    # labels = np.argmin(pairwise_distances(C, X), axis=0)  # THIS REQUIRES TOO MUCH MEMORY FOR LARGE X
    tree = BallTree(C)
    labels = tree.query(X, k=1, return_distance=False).squeeze()
    return labels
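# Hedged usage sketch for compute_labels above: assigning points to their
# nearest center with a BallTree instead of materialising a full pairwise
# distance matrix. The data below is made up purely for illustration.
import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.RandomState(0)
X_demo = rng.rand(1000, 3)           # 1000 points in 3-D
C_demo = rng.rand(10, 3)             # 10 candidate centers
labels_demo = compute_labels(X_demo, C_demo)
assert labels_demo.shape == (1000,)  # one center index per point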
def md_nearest_from_centroids(seeding, centroids):
    # mean distance
    ball_tree = BallTree(seeding)
    dist, idx = ball_tree.query(centroids)
    sum_dist = sum(d[0] for d in dist)
    mean = sum_dist / len(centroids)
    return mean
def _rsl_prims_balltree(X, cut, k=5, alpha=1.4142135623730951,
                        gamma=5, metric='minkowski', p=2):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    dim = X.shape[0]
    k = min(dim - 1, k)

    tree = BallTree(X, metric=metric)

    dist_metric = DistanceMetric.get_metric(metric)

    core_distances = tree.query(X, k=k)[0][:, -1]
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric)

    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)

    labels = single_linkage_tree.get_clusters(cut, gamma)

    return labels, single_linkage_tree
class BallTreeANN:
    def __init__(self):
        """
        Constructor
        """
        self.nbrs = None

    def build_index(self, dataset, leaf_size):
        self.nbrs = BallTree(dataset, leaf_size=leaf_size, metric="euclidean")
        return self.nbrs

    def build_store_index(self, dataset, path, leaf_size):
        self.build_index(dataset, leaf_size)
        self.store_index(path)

    def store_index(self, path):
        with open(path, "wb") as output1:
            pickle.dump(self.nbrs, output1, pickle.HIGHEST_PROTOCOL)

    def load_index(self, path):
        with open(path, "rb") as input1:
            self.nbrs = pickle.load(input1)

    def search_in_radious(self, vector, radious=2):
        # BallTree.query_radius returns (indices, distances) when return_distance=True
        indices, distances = self.nbrs.query_radius(vector, r=radious, return_distance=True)
        return distances, indices

    def search_neighbors(self, vector, num_neighbors):
        distances, indices = self.nbrs.query(vector, k=num_neighbors)
        return distances, indices
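# Hedged usage sketch for BallTreeANN above: build an index, persist it with
# pickle, reload it, and query. The file name and the random data are
# illustrative assumptions, not part of the original class.
import numpy as np

data = np.random.RandomState(42).rand(200, 8)
ann = BallTreeANN()
ann.build_store_index(data, "balltree_index.pkl", leaf_size=30)

ann2 = BallTreeANN()
ann2.load_index("balltree_index.pkl")
dist, ind = ann2.search_neighbors(data[:5], num_neighbors=3)
print(ind.shape)  # (5, 3): three neighbour indices for each of the five queries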
def _calc_tree(xx, yy, radius):
    X = np.zeros((len(xx), 2), dtype='float')
    X[:, 0] = xx[:]
    X[:, 1] = yy[:]
    tree = BallTree(X, metric='euclidean')
    ind = tree.query_radius(X, r=radius)
    ind_sw = tree.query_radius(X, r=VARIANCE_RADIUS_SW)
    return ind, ind_sw
def rmsd_nearest_from_centroids(seeding, centroids):
    # root mean squared distance from each centroid to its closest seeding point
    ball_tree = BallTree(seeding)
    dist, idx = ball_tree.query(centroids)
    # root mean squared distance
    sum_sqdist = sum(d[0] ** 2 for d in dist)
    mean = sum_sqdist / len(centroids)
    return mean ** 0.5
def eval(self, X):
    """Evaluate the kernel density estimation

    Parameters
    ----------
    X : array_like
        array of points at which to evaluate the KDE.  Shape is
        (n_points, n_dim), where n_dim matches the dimension of
        the training points.

    Returns
    -------
    dens : ndarray
        array of shape (n_points,) giving the density at each point.
        The density will be normalized for metric='gaussian' or
        metric='tophat', and will be unnormalized otherwise.
    """
    X = np.atleast_2d(X)
    if X.ndim != 2:
        raise ValueError('X must be two-dimensional')

    if X.shape[1] != self.X_.shape[1]:
        raise ValueError('dimensions of X do not match training dimension')

    if self.metric == 'gaussian':
        # wrangle gaussian into scikit-learn's 'rbf' kernel
        gamma = 0.5 / self.h / self.h
        D = pairwise_kernels(X, self.X_, metric='rbf', gamma=gamma)
        D /= np.sqrt(2 * np.pi * self.h ** (2 * X.shape[1]))
        dens = D.sum(1)

    elif self.metric == 'tophat':
        # use Ball Tree to efficiently count neighbors
        bt = BallTree(self.X_)
        counts = bt.query_radius(X, self.h, count_only=True)
        dens = counts / n_volume(self.h, X.shape[1])

    elif self.metric == 'exponential':
        D = pairwise_distances(X, self.X_)
        dens = np.exp(-abs(D) / self.h)
        dens = dens.sum(1)
        dens /= n_volume(self.h, X.shape[1]) * special.gamma(X.shape[1])

    elif self.metric == 'quadratic':
        D = pairwise_distances(X, self.X_)
        dens = (1 - (D / self.h) ** 2)
        dens[D > self.h] = 0
        dens = dens.sum(1)
        dens /= 2. * n_volume(self.h, X.shape[1]) / (X.shape[1] + 2)

    else:
        D = pairwise_kernels(X, self.X_, metric=self.metric, **self.kwargs)
        dens = D.sum(1)

    return dens
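# Hedged illustration of the 'tophat' branch above: query_radius with
# count_only=True returns, for each query point, how many training points fall
# within radius h. The data and bandwidth here are made up.
import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.RandomState(1)
train = rng.rand(500, 2)
queries = rng.rand(4, 2)
h = 0.1
counts = BallTree(train).query_radius(queries, h, count_only=True)
print(counts)  # array of 4 neighbour counts, one per query point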
def md_weighted_nearest_from_centroids(seeding, centroids, weights):
    assert len(centroids) == len(weights)
    sum_weight = sum(weights)
    ball_tree = BallTree(seeding)
    dist, idx = ball_tree.query(centroids)
    sum_weighted_dist = sum(d[0] * weight for d, weight in zip(dist, weights))
    mean = sum_weighted_dist / sum_weight
    return mean
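# Hedged usage sketch for the three centroid-distance helpers above
# (md_nearest_from_centroids, rmsd_nearest_from_centroids,
# md_weighted_nearest_from_centroids); the toy data is illustrative only.
import numpy as np

rng = np.random.RandomState(7)
seeding = rng.rand(100, 2)   # candidate seed points
centroids = rng.rand(5, 2)   # cluster centroids to evaluate
weights = np.ones(len(centroids))

print(md_nearest_from_centroids(seeding, centroids))
print(rmsd_nearest_from_centroids(seeding, centroids))
# With uniform weights the weighted mean reduces to the plain mean distance.
print(md_weighted_nearest_from_centroids(seeding, centroids, weights))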
def predict(self, X):
    # BallTree is built directly from the cluster centers; it has no separate fit() method.
    ball_tree = BallTree(self.cluster_centers_)
    _, indexes = ball_tree.query(X)
    result = []
    for idx, in indexes:
        result.append(self.labels_[idx])
    return result
def get_centroid_weights(X, centroids):
    assert isinstance(X, np.ndarray)
    assert isinstance(centroids, np.ndarray)
    ball_tree = BallTree(centroids)
    dist, indexes = ball_tree.query(X)
    weights = [0 for i in centroids]
    # query() returns an (n_samples, 1) index array; flatten it so each idx is a plain integer
    for idx in indexes.ravel():
        weights[idx] += 1
    return weights
def get_graph_topo(halos):
    x, y, z = cosmology.spherical_to_cartesian_with_redshift(halos['ra'], halos['dec'], halos['z'])
    box_coords = np.concatenate([x, y, z], axis=1)
    BT = BallTree(box_coords, leaf_size=5)
    list_conn = []
    for ih, vh in enumerate(halos):
        n_connections = 70
        bt_dx, bt_id = BT.query(box_coords[ih, :], k=n_connections)
        for ic, vc in enumerate(halos[bt_id]):
            pass
def build_knn_matrix(data_matrix):
    neighbours_matrix = np.zeros((voxel_num, K_NN - 1))
    tree = BallTree(data_matrix[:, 0:3])
    for voxel in range(voxel_num):
        dist, ind = tree.query(data_matrix[voxel, 0:3], k=K_NN)
        neighbours_matrix[voxel, :] = ind[0, 1:]
    for cur_voxel in range(voxel_num):
        neighbours = neighbours_matrix[cur_voxel, :]
        for ind in range(len(neighbours)):
            neighbour = int(neighbours[ind])
            if cur_voxel not in neighbours_matrix[neighbour, :]:
                neighbours_matrix[cur_voxel, ind] = -1
    return neighbours_matrix
def correrPruebaParaKaggle(set_ampliado):
    print "running tests for kaggle"
    train, targetTrain, test = cargarDatosParaKaggle(set_ampliado)
    tree = BallTree(train, leaf_size=30)
    predictions = []
    for x in range(len(test)):
        dist, ind = tree.query(test[x], k=4)
        resultado = obtenerPrediccionknnEB(train, targetTrain, test[x], ind.ravel())
        predictions.append(resultado)
        print progreso(x, len(test))
    guardarPrediccionesParaKaggle(predictions)
def __init__(self, data_points=None, ai_history=None, threshold=THRESHOLD):
    self.state_list = []
    self.weights_list = []

    if data_points is None:
        data_points = []
    if ai_history is None:
        ai_history = []

    for state, weights in data_points:
        assert(len(state) == 32)
        self.state_list.append(state)
        self.weights_list.append(weights)

    self._threshold = threshold
    self._ai_history = cp.deepcopy(ai_history)

    #self._featureTransform()

    self.X = np.array(self.state_list)
    assert(self.X.shape == (len(data_points), 32) or len(data_points) == 0)

    # Think about different distance metrics. Manhattan or minkowski? P < 1?
    if len(data_points) > 0:
        self._tree = BallTree(self.X, metric='manhattan')
    else:
        self._tree = None
def construct_tree(self, alpha = 1, beta = 1, theta = 1): # Initialize tree size self.treeSize = 0 # construct a tree with the mfcc of the 1st frame of each seg, and beat chroma self.feature_size = self.mfcc_size + self.chroma_size + self.rms_size X = np.zeros((len(self.segments), self.feature_size)) for idx, seg in enumerate(self.segments): X [idx,:] = seg.get_seg_feature() #X[idx,0:self.mfcc_size] = seg.get_head_mfcc(n=1) #X[idx,self.mfcc_size:-self.rms_size] = seg.get_head_chroma(n=1) #X[idx,-self.rms_size:] = seg.get_head_rms(n=5) seg.idx = self.treeSize self.treeSize += 1 def mydist(x, y): xMFCC = x[0:self.mfcc_size] yMFCC = y[0:self.mfcc_size] xChroma = x[self.mfcc_size:self.mfcc_size+self.chroma_size] yChroma = y[self.mfcc_size:self.mfcc_size+self.chroma_size] xRms = x[-self.rms_size:] yRms = y[-self.rms_size:] dist1 = np.sum((xMFCC-yMFCC)**2) / 5000 dist2 = 1.0 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) #dist2 = spatial.distance.cosine(xChroma,yChroma) #unknown why this doesn't work dist3 = np.sum((xRms-yRms)**2) dist = alpha * dist1 + beta * dist2 + theta * dist3 return dist self.tree = BallTree(X, leaf_size=2, metric = 'pyfunc',func = mydist)
def calc_vert_vals(verts, pts, vals, method='max', k_points=100):
    ball_tree = BallTree(pts)
    dists, pts_inds = ball_tree.query(verts, k=k_points, return_distance=True)
    near_vals = vals[pts_inds]
    # sig_dists = dists[np.where(abs(near_vals)>2)]
    cover = len(np.unique(pts_inds.ravel())) / float(len(pts))
    print('{}% of the points are covered'.format(cover * 100))
    if method == 'dist':
        n_dists = 1 / (dists ** 2)
        norm = 1 / np.sum(n_dists, 1)
        norm = np.reshape(norm, (len(norm), 1))
        n_dists = norm * n_dists
        verts_vals = np.sum(near_vals * n_dists, 1)
    elif method == 'max':
        verts_vals = near_vals[range(near_vals.shape[0]), np.argmax(abs(near_vals), 1)]
    return verts_vals
def fit(self, X_cv, y_true=None, weights=None):
    from sklearn.neighbors import BallTree
    from sklearn.metrics import accuracy_score
    import random
    import time

    if y_true is None:
        raise ValueError('we need y labels to supervise-fit!')
    else:
        t0 = time.time()
        predictions = []
        for name, model in self.models.iteritems():
            #predictions.append(model.predict(X_cv))
            # print len(predictions[-1])
            if self.common_neigh:
                X_tr = self.counter.fit_transform(X_cv)
                self.gt_tree = BallTree(X_tr.toarray(), leaf_size=20)
            else:
                X_tr = self.models_tr[name].transform(X_cv)
                if hasattr(X_tr, "toarray"):
                    self.trees[name] = BallTree(X_tr.toarray(), leaf_size=20)
                else:
                    self.trees[name] = BallTree(X_tr, leaf_size=20)
            self.predictions[name] = model.predict(X_cv)
        self.true = y_true
        print 'Fitting time %0.2f' % (time.time() - t0)
class BallTreeRecommender(object): """ Given input terms, provide k recipe recommendations """ def __init__(self, k=3, **kwargs): self.k = k self.trans_path = "svd.pkl" self.tree_path = "tree.pkl" self.transformer = False self.tree = None self.load() def load(self): """ Load a pickled transformer and tree from disk, if they exist. """ if os.path.exists(self.trans_path): self.transformer = joblib.load(open(self.trans_path, 'rb')) self.tree = joblib.load(open(self.tree_path, 'rb')) else: self.transformer = False self.tree = None def save(self): """ It takes a long time to fit, so just do it once! """ joblib.dump(self.transformer, open(self.trans_path, 'wb')) joblib.dump(self.tree, open(self.tree_path, 'wb')) def fit_transform(self, documents): # Transformer will be False if pipeline hasn't been fit yet, # Trigger fit_transform and save the transformer and lexicon. if self.transformer == False: self.transformer = Pipeline([ ('norm', TextNormalizer(minimum=50, maximum=200)), ('transform', Pipeline([ ('tfidf', TfidfVectorizer()), ('svd', TruncatedSVD(n_components=200)) ]) ) ]) self.lexicon = self.transformer.fit_transform(documents) self.tree = BallTree(self.lexicon) self.save() def query(self, terms): """ Given input list of ingredient terms, return the k closest matching recipes. :param terms: list of strings :return: list of document indices of documents """ vect_doc = self.transformer.named_steps['transform'].fit_transform( wordpunct_tokenize(terms) ) dists, inds = self.tree.query(vect_doc, k=self.k) return inds[0]
def populate_vector_space(self, vocabulary):
    #todo the exact data structure used here will need optimisation
    """
    Input is like:
    ('1-GRAM', ('Seattle/N',)),
    ('1-GRAM', ('Senate/N',)),
    ('1-GRAM', ('September/N',)),
    ('1-GRAM', ('Service/N',)),
    ('AN', ('similar/J', 'agreement/N')),
    ('AN', ('tough/J', 'stance/N')),
    ('AN', ('year-ago/J', 'period/N'))
    """
    logging.debug('Populating vector space with vocabulary %s', vocabulary)
    vectors = [c._get_vector(data).A
               for (feature_type, data) in vocabulary
               for c in self.composer_mapping[feature_type]
               if feature_type in self.composer_mapping and (feature_type, data) in c]
    self.feature_matrix = vstack(vectors)

    logging.debug('Building BallTree for matrix of size %s', self.feature_matrix.shape)
    feature_list = [ngram for ngram in vocabulary for _ in self.composer_mapping[ngram[0]]]
    #todo test if this entry index is correct
    self.entry_index = {i: ngram for i, ngram in enumerate(feature_list)}
    #assert len(feature_list) == self.feature_matrix.shape[0]

    #todo BallTree/KDTree only work with dense inputs
    #self.nbrs = KDTree(n_neighbors=1, algorithm='kd_tree').fit(self.feature_matrix)
    self.nbrs = BallTree(self.feature_matrix, metric=cosine)
    logging.debug('Done building BallTree')
    return self.nbrs
def get_evidence_grid(points, res_pts, intr_prms, exact=False):
    """
    Associate the "z-axis" value (evidence, overlap, etc...) res_pts with its
    corresponding point in the template bank (points). If exact is True, then
    the point must exactly match the point in the bank.
    """
    # The original built the tree from an undefined name `selected`; the
    # docstring implies it should be built over the template bank `points`.
    grid_tree = BallTree(points)
    grid_idx = []
    # Reorder the grid points to match their weight indices
    for res in res_pts:
        dist, idx = grid_tree.query(res, k=1)
        # Stupid floating point inexactitude...
        #print res, selected[idx[0][0]]
        #assert numpy.allclose(res, selected[idx[0][0]])
        grid_idx.append(idx[0][0])

    return points[grid_idx]
def trainSimilarCases(firstTime=False, type="lda", vector_size=70): if firstTime: # 1:读ygsc及对应xml名 ygscs = [] xml_names = [] labels = [] db = dbutil.get_mongodb_conn() cases_set = db.cases for line in cases_set.find({ "flag": 2 }, { "_id": 1, "ygscWords2": 1, "label": 1 }, no_cursor_timeout=True).batch_size(20): ygscs.append(line["ygscWords2"]) xml_names.append(line["_id"]) labels.append(line["label"]) ################## 2:转换为onehot ##################### cv = CountVectorizer(max_df=0.95, min_df=1, stop_words=None, token_pattern=r"(?u)\b\w+\b") one_hot_matrix = cv.fit_transform(ygscs).toarray() # 3:保存one-hot with open("checkpoint/cv.pk", "wb") as file: joblib.dump(cv, file) with open("checkpoint/onehot.pk", "wb") as file: joblib.dump(one_hot_matrix, file) with open("checkpoint/xml_names.pk", "wb") as file: joblib.dump(xml_names, file) with open("checkpoint/label.pk", "wb") as file: import numpy joblib.dump(numpy.asarray(labels), file) ################## 2':转换为tfidf ###################### tfidf = TfidfVectorizer(max_df=0.95, min_df=1, stop_words=None, token_pattern=r"(?u)\b\w+\b") tf_idf_matrix = tfidf.fit_transform(ygscs).toarray() # 3':保存tf-idf with open("checkpoint/tfidf.pk", "wb") as file: joblib.dump(tfidf, file) # 4:建balltree ball_tree = BallTree(tf_idf_matrix) with open("checkpoint/tfidf_ball_tree.pk", "wb") as file: joblib.dump(ball_tree, file) else: with open("checkpoint/onehot.pk", "rb") as file: one_hot_matrix = joblib.load(file) if type == "lda": # 0: 获取测试集 test_ygscs = [] db = dbutil.get_mongodb_conn() cases_set = db.cases for line in cases_set.find({ "flag": 4 }, { "ygscWords2": 1 }, no_cursor_timeout=True).batch_size(20): test_ygscs.append(line["ygscWords2"]) with open("checkpoint/cv.pk", "rb") as file: cv = joblib.load(file) test_matrix = cv.transform(test_ygscs).toarray() # 1:lda训练 lda = LatentDirichletAllocation(n_components=vector_size) lda_matrix = lda.fit_transform(one_hot_matrix) # 2: 评估 print("lda" + str(vector_size) + "train困惑度:" + str(lda.perplexity(one_hot_matrix))) # 查看困惑度 print("lda" + str(vector_size) + "test困惑度:" + str(lda.perplexity(test_matrix))) # 查看困惑度 # 3:保存 with open("checkpoint/lda.pk", "wb") as file: joblib.dump(lda, file) with open("checkpoint/feature_matric.pk", "wb") as file: joblib.dump(lda_matrix, file) # 4:建balltree ball_tree = BallTree(lda_matrix) with open("checkpoint/lda_ball_tree.pk", "wb") as file: joblib.dump(ball_tree, file) elif type == "svd": # 1:svd训练 svd = TruncatedSVD(n_components=vector_size) # 迭代次数 svd_matrix = svd.fit_transform(one_hot_matrix) # 2:保存 with open("checkpoint/svd.pk", "wb") as file: joblib.dump(svd, file) # 3:建balltree ball_tree = BallTree(svd_matrix) with open("checkpoint/svd_ball_tree.pk", "wb") as file: joblib.dump(ball_tree, file)
def create_ball_tree(self):
    entities = self.model.all_entity_vectors()
    self.tree = BallTree(entities, leaf_size=entities.shape[0])
else:
    word_vectors = {}
    with open("../embeddings/word_embeddings100.txt", 'r') as f:
        for line in f:
            item_id = line[line.index("(") + 2:line.index(",") - 1]
            vector = line[line.index("[") + 1:-3].split(",")
            vector = [float(x.strip()) for x in vector]
            word_vectors[item_id] = vector

tags = ["Sentimental", "folk-rock", "Disney", "1980s", "Swing-Jazz"]
k = 6

X = list(word_vectors.values())
ids = list(word_vectors.keys())
id_index_list = {}
num_items = len(ids)
for i in range(num_items):
    id_index_list[ids[i]] = i

tree = BallTree(X, leaf_size=40)

tag_recs = {}
for tag in tags:
    _, recs_ind = tree.query([word_vectors[tag]], k=k)
    recs = []
    for i in recs_ind[0]:
        # filter items in user history
        recs.append(ids[i])
    tag_recs[tag] = recs

for tag, recs in tag_recs.items():
    print(recs)
def spatial_interaction_internal(adata_subset, x_coordinate, y_coordinate, phenotype, method, radius, knn, permutation, imageid, subset, pval_method): print("Processing Image: " + str(adata_subset.obs[imageid].unique())) # Create a dataFrame with the necessary inforamtion data = pd.DataFrame({ 'x': adata_subset.obs[x_coordinate], 'y': adata_subset.obs[y_coordinate], 'phenotype': adata_subset.obs[phenotype] }) # Identify neighbourhoods based on the method used # a) KNN method if method == 'knn': print("Identifying the " + str(knn) + " nearest neighbours for every cell") tree = BallTree(data[['x', 'y']], leaf_size=2) ind = tree.query(data[['x', 'y']], k=knn, return_distance=False) neighbours = pd.DataFrame(ind.tolist(), index=data.index) # neighbour DF neighbours.drop(0, axis=1, inplace=True) # Remove self neighbour # b) Local radius method if method == 'radius': print("Identifying neighbours within " + str(radius) + " pixels of every cell") kdt = BallTree(data[['x', 'y']], metric='euclidean') ind = kdt.query_radius(data[['x', 'y']], r=radius, return_distance=False) for i in range(0, len(ind)): ind[i] = np.delete(ind[i], np.argwhere(ind[i] == i)) #remove self neighbours = pd.DataFrame(ind.tolist(), index=data.index) # neighbour DF # Map Phenotypes to Neighbours # Loop through (all functionized methods were very slow) phenomap = dict(zip(list(range(len(ind))), data['phenotype'])) # Used for mapping print("Mapping phenotype to neighbors") for i in neighbours.columns: neighbours[i] = neighbours[i].dropna().map(phenomap, na_action='ignore') # Drop NA neighbours = neighbours.dropna(how='all') # Collapse all the neighbours into a single column n = pd.DataFrame(neighbours.stack(), columns=["neighbour_phenotype"]) n.index = n.index.get_level_values(0) # Drop the multi index # Merge with real phenotype n = n.merge(data['phenotype'], how='inner', left_index=True, right_index=True) # Permutation print('Performing ' + str(permutation) + ' permutations') def permutation_pval(data): data = data.assign(neighbour_phenotype=np.random.permutation( data['neighbour_phenotype'])) #data['neighbour_phenotype'] = np.random.permutation(data['neighbour_phenotype']) data_freq = data.groupby(['phenotype', 'neighbour_phenotype']).size().unstack() data_freq = data_freq.fillna(0).stack().values return data_freq # Apply function final_scores = Parallel(n_jobs=-1)(delayed(permutation_pval)(data=n) for i in range(permutation)) perm = pd.DataFrame(final_scores).T # Consolidate the permutation results print('Consolidating the permutation results') # Calculate P value # real n_freq = n.groupby(['phenotype', 'neighbour_phenotype' ]).size().unstack().fillna(0).stack() # permutation mean = perm.mean(axis=1) std = perm.std(axis=1) # P-value calculation if pval_method == 'histocat': # real value - prem value / no of perm p_values = abs(n_freq.values - mean) / (permutation + 1) p_values = p_values[~np.isnan(p_values)].values if pval_method == 'zscore': z_scores = (n_freq.values - mean) / std z_scores[np.isnan(z_scores)] = 0 p_values = scipy.stats.norm.sf(abs(z_scores)) * 2 p_values = p_values[~np.isnan(p_values)] # Compute Direction of interaction (interaction or avoidance) direction = ((n_freq.values - mean) / abs(n_freq.values - mean)).fillna(1) # Normalize based on total cell count k = n.groupby(['phenotype', 'neighbour_phenotype']).size().unstack().fillna(0) # add neighbour phenotype that are not present to make k a square matrix columns_to_add = dict.fromkeys(np.setdiff1d(k.index, k.columns), 0) k = k.assign(**columns_to_add) 
total_cell_count = data['phenotype'].value_counts() total_cell_count = total_cell_count[ k. columns].values # keep only cell types that are present in the column of k # total_cell_count = total_cell_count.reindex(k.columns).values # replaced by above k_max = k.div(total_cell_count, axis=0) k_max = k_max.div(k_max.max(axis=1), axis=0).stack() # DataFrame with the neighbour frequency and P values count = (k_max.values * direction).values # adding directionallity to interaction neighbours = pd.DataFrame({ 'count': count, 'p_val': p_values }, index=k_max.index) #neighbours.loc[neighbours[neighbours['p_val'] > p_val].index,'count'] = np.NaN #del neighbours['p_val'] neighbours.columns = [ adata_subset.obs[imageid].unique()[0], 'pvalue_' + str(adata_subset.obs[imageid].unique()[0]) ] neighbours = neighbours.reset_index() #neighbours = neighbours['count'].unstack() # return return neighbours
def get_nearest_nodes(G, X, Y, method=None): """ Return the graph nodes nearest to a list of points. Pass in points as separate vectors of X and Y coordinates. The 'kdtree' method is by far the fastest with large data sets, but only finds approximate nearest nodes if working in unprojected coordinates like lat-lng (it precisely finds the nearest node if working in projected coordinates). The 'balltree' method is second fastest with large data sets, but it is precise if working in unprojected coordinates like lat-lng. Parameters ---------- G : networkx multidigraph X : list-like The vector of longitudes or x's for which we will find the nearest node in the graph Y : list-like The vector of latitudes or y's for which we will find the nearest node in the graph method : str {None, 'kdtree', 'balltree'} Which method to use for finding nearest node to each point. If None, we manually find each node one at a time using osmnx.utils.get_nearest_node and haversine. If 'kdtree' we use scipy.spatial.cKDTree for very fast euclidean search. If 'balltree', we use sklearn.neighbors.BallTree for fast haversine search. Returns ------- nn : array list of nearest node IDs """ start_time = time.time() if method is None: # calculate nearest node one at a time for each point nn = [ get_nearest_node(G, (y, x), method='haversine') for x, y in zip(X, Y) ] elif method == 'kdtree': # check if we were able to import scipy.spatial.cKDTree successfully if not cKDTree: raise ImportError( 'The scipy package must be installed to use this optional feature.' ) # build a k-d tree for euclidean nearest node search nodes = pd.DataFrame({ 'x': nx.get_node_attributes(G, 'x'), 'y': nx.get_node_attributes(G, 'y') }) tree = cKDTree(data=nodes[['x', 'y']], compact_nodes=True, balanced_tree=True) # query the tree for nearest node to each point points = np.array([X, Y]).T dist, idx = tree.query(points, k=1) nn = nodes.iloc[idx].index elif method == 'balltree': # check if we were able to import sklearn.neighbors.BallTree successfully if not BallTree: raise ImportError( 'The scikit-learn package must be installed to use this optional feature.' ) # haversine requires data in form of [lat, lng] and inputs/outputs in units of radians nodes = pd.DataFrame({ 'x': nx.get_node_attributes(G, 'x'), 'y': nx.get_node_attributes(G, 'y') }) nodes_rad = np.deg2rad(nodes[['y', 'x']].astype(np.float)) points = np.array([Y.astype(np.float), X.astype(np.float)]).T points_rad = np.deg2rad(points) # build a ball tree for haversine nearest node search tree = BallTree(nodes_rad, metric='haversine') # query the tree for nearest node to each point idx = tree.query(points_rad, k=1, return_distance=False) nn = nodes.iloc[idx[:, 0]].index else: raise ValueError('You must pass a valid method name, or None.') log('Found nearest nodes to {:,} points in {:,.2f} seconds'.format( len(X), time.time() - start_time)) return np.array(nn)
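# Hedged illustration of the 'balltree' branch in get_nearest_nodes above:
# scikit-learn's haversine metric expects [lat, lng] pairs in radians and
# returns great-circle distances in radians. The coordinates below are
# illustrative, and the 6371 km Earth radius is an assumption.
import numpy as np
from sklearn.neighbors import BallTree

node_latlng = np.array([[51.5074, -0.1278],   # London
                        [48.8566, 2.3522],    # Paris
                        [52.5200, 13.4050]])  # Berlin
query_latlng = np.array([[50.8503, 4.3517]])  # Brussels

tree = BallTree(np.deg2rad(node_latlng), metric='haversine')
dist_rad, idx = tree.query(np.deg2rad(query_latlng), k=1)
print(idx[0][0])                # index of the nearest node
print(dist_rad[0][0] * 6371.0)  # approximate distance in km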
def knn_neighbors(data, lpts, n_neighbor):
    X_semi = np.concatenate([lpts, data], axis=0)
    btree = BallTree(X_semi)
    nbr_id = btree.query(X_semi, k=n_neighbor + 1, return_distance=False)
    return nbr_id
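# Hedged note on knn_neighbors above: querying a tree with the same points it
# was built from returns each point as its own nearest neighbour, which is why
# k is set to n_neighbor + 1. A common follow-up (illustrative data only) is
# to drop that first column:
import numpy as np

rng = np.random.RandomState(3)
labelled_pts = rng.rand(10, 4)
unlabelled_pts = rng.rand(40, 4)
nbr_id = knn_neighbors(unlabelled_pts, labelled_pts, n_neighbor=5)
neighbours_only = nbr_id[:, 1:]  # discard the self match in column 0
print(neighbours_only.shape)     # (50, 5)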
    d = np.sqrt(d1**2.0 + d2**2.0 - 2 * d1 * d2 * np.cos(phi))
    return d

#coord is list of (RA,DEC,D) coordinates of primaries, 871 points in 3D.
coord = []
for i in range(len(primary_list)):
    coord.append([
        float(primary_list[i][1]),
        float(primary_list[i][2]),
        float(primary_list[i][7])
    ])

#Use Ball Tree with the above Haversine function as metric to find
# nearest neighbors quickly. See sklearn documentation for details.
tree = BallTree(coord, leaf_size=2, metric=D_dist)

ind_list = []   #list of indices of the nearest neighbors
dist_list = []  #list of distances (rad) of nearest neighbors

for i in range(len(coord)):
    dist, ind = tree.query(coord[i], k=2)  #k=2 since first "nearest neighbor" is sometimes itself.
    #sometimes, D_dist will return NaN when calculating D_dist for same
    # point. In this case, ind[0][0] will be the nearest neighbor index.
    # Otherwise ind[0][0] is i, and so ind[0][1] would be the nearest neighbor
    # index
    if ind[0][0] == i:
        ind_list.append(ind[0][1])
        dist_list.append(dist[0][1])
    else:
class Hospitals(Supergroup, MedicalFacilities): def __init__( self, hospitals: List["Hospital"], neighbour_hospitals: int = 5, box_mode: bool = False, ball_tree=True, ): """ Create a group of hospitals, and provide functionality to locate patients to a nearby hospital. It will check in order the first ```neighbour_hospitals```, when one has space available the patient is allocated to it. If none of the closest ones has beds available it will pick one of them at random and that hospital will overflow Parameters ---------- hospitals: list of hospitals to aggrupate neighbour_hospitals: number of closest hospitals to look for box_mode: whether to run in single box mode, or full simulation """ super().__init__(members=hospitals) self.box_mode = box_mode self.neighbour_hospitals = neighbour_hospitals if ball_tree and self.members: coordinates = np.array( [hospital.coordinates for hospital in hospitals]) self.init_trees(coordinates) @classmethod def for_box_mode(cls): hospitals = [] hospitals.append(Hospital( coordinates=None, n_beds=10, n_icu_beds=2, )) hospitals.append( Hospital( coordinates=None, n_beds=5000, n_icu_beds=5000, )) return cls(hospitals, neighbour_hospitals=None, box_mode=True, ball_tree=False) @classmethod def from_file( cls, filename: str = default_data_filename, config_filename: str = default_config_filename, ) -> "Hospitals": """ Initialize Hospitals from path to data frame, and path to config file. Parameters ---------- filename: path to hospital dataframe config_filename: path to hospital config dictionary Returns ------- Hospitals instance """ hospital_df = pd.read_csv(filename) with open(config_filename) as f: config = yaml.load(f, Loader=yaml.FullLoader) neighbour_hospitals = config["neighbour_hospitals"] logger.info(f"There are {len(hospital_df)} hospitals in the world.") hospitals = cls.init_hospitals(cls, hospital_df) return Hospitals(hospitals, neighbour_hospitals) @classmethod def for_geography( cls, geography, filename: str = default_data_filename, config_filename: str = default_config_filename, ): with open(config_filename) as f: config = yaml.load(f, Loader=yaml.FullLoader) neighbour_hospitals = config["neighbour_hospitals"] hospital_df = pd.read_csv(filename, index_col=4) area_names = [area.name for area in geography.areas] hospital_df = hospital_df.loc[hospital_df.index.isin(area_names)] logger.info( f"There are {len(hospital_df)} hospitals in this geography.") total_hospitals = len(hospital_df) hospitals = [] for area in geography.areas: if area.name in hospital_df.index: hospitals_in_area = hospital_df.loc[area.name] if isinstance(hospitals_in_area, pd.Series): hospital = cls.create_hospital_from_df_row( area, hospitals_in_area, ) hospitals.append(hospital) else: for _, row in hospitals_in_area.iterrows(): hospital = cls.create_hospital_from_df_row( area, row, ) hospitals.append(hospital) if len(hospitals) == total_hospitals: break return cls(hospitals, neighbour_hospitals, False) @classmethod def create_hospital_from_df_row( cls, area, row, ): coordinates = row[["latitude", "longitude"]].values.astype(np.float) n_beds = row["beds"] n_icu_beds = row["icu_beds"] trust_code = row["code"] hospital = Hospital( area=area, coordinates=coordinates, n_beds=n_beds, n_icu_beds=n_icu_beds, trust_code=trust_code, ) return hospital def init_hospitals( self, hospital_df: pd.DataFrame, ) -> List["Hospital"]: """ Create Hospital objects with the right characteristics, as given by dataframe. 
Parameters ---------- hospital_df: dataframe with hospital characteristics data """ hospitals = [] for index, row in hospital_df.iterrows(): n_beds = row["beds"] n_icu_beds = row["icu_beds"] trust_code = row["code"] coordinates = row[["latitude", "longitude"]].values.astype(np.float) hospital = Hospital( coordinates=coordinates, n_beds=n_beds, n_icu_beds=n_icu_beds, trust_code=trust_code, ) hospitals.append(hospital) return hospitals def init_trees(self, hospital_coordinates: np.array) -> BallTree: """ Reads hospital location and sizes, it initializes a KD tree on a sphere, to query the closest hospital to a given location. Parameters ---------- hospital_df: dataframe with hospital characteristics data Returns ------- Tree to query nearby schools """ self.hospital_trees = BallTree( np.deg2rad(hospital_coordinates), metric="haversine", ) def get_closest_hospitals_idx(self, coordinates: Tuple[float, float], k: int) -> Tuple[float, float]: """ Get the k-th closest hospital to a given coordinate Parameters --------- coordinates: latitude and longitude k: k-th neighbour Returns ------- ID of the k-th closest hospital """ k = min(k, len(list(self.hospital_trees.data))) distances, neighbours = self.hospital_trees.query( np.deg2rad(coordinates.reshape(1, -1)), k=k, sort_results=True, ) return neighbours[0] def get_closest_hospitals(self, coordinates: Tuple[float, float], k: int) -> Tuple[float, float]: """ Get the k-th closest hospital to a given coordinate Parameters --------- coordinates: latitude and longitude k: k-th neighbour Returns ------- ID of the k-th closest hospital """ k = min(k, len(list(self.hospital_trees.data))) distances, neighbours = self.hospital_trees.query( np.deg2rad(coordinates.reshape(1, -1)), k=k, sort_results=True, ) return [self.members[index] for index in neighbours[0]]
class XGBSEKaplanNeighbors(XGBSEBaseEstimator): """ Convert xgboost into a nearest neighbor model, where we use hamming distance to define similar elements as the ones that co-ocurred the most at the ensemble terminal nodes. Then, at each neighbor-set compute survival estimates with the Kaplan-Meier estimator. !!! Note * We recommend using dart as the booster to prevent any tree to dominate variance in the ensemble and break the leaf co-ocurrence similarity logic. * This method can be very expensive at scales of hundreds of thousands of samples, due to the nearest neighbor search, both on training (construction of search index) and scoring (actual search). Read more in [How XGBSE works](https://loft-br.github.io/xgboost-survival-embeddings/how_xgbse_works.html). """ def __init__(self, xgb_params=None, n_neighbors=30, radius=None): """ Args: xgb_params (Dict): Parameters for XGBoost model. If not passed, the following default parameters will be used: ``` DEFAULT_PARAMS = { "objective": "survival:aft", "eval_metric": "aft-nloglik", "aft_loss_distribution": "normal", "aft_loss_distribution_scale": 1, "tree_method": "hist", "learning_rate": 5e-2, "max_depth": 8, "booster": "dart", "subsample": 0.5, "min_child_weight": 50, "colsample_bynode": 0.5, } ``` Check <https://xgboost.readthedocs.io/en/latest/parameter.html> for more options. n_neighbors (Int): Number of neighbors for computing KM estimates radius (Float): If set, uses a radius around the point for neighbors search """ if xgb_params is None: xgb_params = DEFAULT_PARAMS self.xgb_params = xgb_params self.n_neighbors = n_neighbors self.radius = radius self.persist_train = False self.index_id = None self.radius = None self.feature_importances_ = None def fit( self, X, y, num_boost_round=1000, validation_data=None, early_stopping_rounds=None, verbose_eval=0, persist_train=True, index_id=None, time_bins=None, ): """ Transform feature space by fitting a XGBoost model and outputting its leaf indices. Build search index in the new space to allow nearest neighbor queries at scoring time. Args: X ([pd.DataFrame, np.array]): Design matrix to fit XGBoost model y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field, and time of event or time of censoring as second field. num_boost_round (Int): Number of boosting iterations. validation_data (Tuple): Validation data in the format of a list of tuples [(X, y)] if user desires to use early stopping early_stopping_rounds (Int): Activates early stopping. Validation metric needs to improve at least once in every **early_stopping_rounds** round(s) to continue training. See xgboost.train documentation. verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation. 
persist_train (Bool): Whether or not to persist training data to use explainability through prototypes index_id (pd.Index): User defined index if intended to use explainability through prototypes time_bins (np.array): Specified time windows to use when making survival predictions Returns: XGBSEKaplanNeighbors: Fitted instance of XGBSEKaplanNeighbors """ self.E_train, self.T_train = convert_y(y) if time_bins is None: time_bins = get_time_bins(self.T_train, self.E_train) self.time_bins = time_bins # converting data to xgb format dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"]) # converting validation data to xgb format evals = () if validation_data: X_val, y_val = validation_data dvalid = convert_data_to_xgb_format( X_val, y_val, self.xgb_params["objective"] ) evals = [(dvalid, "validation")] # training XGB self.bst = xgb.train( self.xgb_params, dtrain, num_boost_round=num_boost_round, early_stopping_rounds=early_stopping_rounds, evals=evals, verbose_eval=verbose_eval, ) self.feature_importances_ = self.bst.get_score() # creating nearest neighbor index leaves = self.bst.predict(dtrain, pred_leaf=True) self.tree = BallTree(leaves, metric="hamming", leaf_size=40) if persist_train: self.persist_train = True if index_id is None: index_id = X.index.copy() self.index_id = index_id return self def predict( self, X, time_bins=None, return_ci=False, ci_width=0.683, return_interval_probs=False, ): """ Make queries to nearest neighbor search index build on the transformed XGBoost space. Compute a Kaplan-Meier estimator for each neighbor-set. Predict the KM estimators. Args: X (pd.DataFrame): Dataframe with samples to generate predictions time_bins (np.array): Specified time windows to use when making survival predictions return_ci (Bool): Whether to return confidence intervals via the Exponential Greenwood formula ci_width (Float): Width of confidence interval return_interval_probs (Bool): Boolean indicating if interval probabilities are supposed to be returned. If False the cumulative survival is returned. Returns: (pd.DataFrame): A dataframe of survival probabilities for all times (columns), from a time_bins array, for all samples of X (rows). If return_interval_probs is True, the interval probabilities are returned instead of the cumulative survival probabilities. upper_ci (np.array): Upper confidence interval for the survival probability values lower_ci (np.array): Lower confidence interval for the survival probability values """ # converting to xgb format d_matrix = xgb.DMatrix(X) # getting leaves and extracting neighbors leaves = self.bst.predict(d_matrix, pred_leaf=True) if self.radius: assert self.radius > 0, "Radius must be positive" neighs, _ = self.tree.query_radius( leaves, r=self.radius, return_distance=True ) number_of_neighbors = np.array([len(neigh) for neigh in neighs]) if np.argwhere(number_of_neighbors == 1).shape[0] > 0: # If there is at least one sample without neighbors apart from itself # a warning is raised suggesting a radius increase warnings.warn( "Warning: Some samples don't have neighbors apart from itself. Increase the radius", RuntimeWarning, ) else: _, neighs = self.tree.query(leaves, k=self.n_neighbors) # gathering times and events/censors for neighbor sets T_neighs = self.T_train[neighs] E_neighs = self.E_train[neighs] # vectorized (very fast!) 
implementation of Kaplan Meier curves if time_bins is None: time_bins = self.time_bins # calculating z-score from width z = st.norm.ppf(0.5 + ci_width / 2) preds_df, upper_ci, lower_ci = calculate_kaplan_vectorized( T_neighs, E_neighs, time_bins, z ) if return_ci and return_interval_probs: raise ValueError( "Confidence intervals for interval probabilities is not supported. Choose between return_ci and return_interval_probs." ) if return_interval_probs: preds_df = calculate_interval_failures(preds_df) return preds_df if return_ci: return preds_df, upper_ci, lower_ci return preds_df
        dist[:, bi] = 1 - np.abs(np.dot(a, b[bi]))

    return dist


@jit(nopython=True)
def quatMetricNumba2(a, b):
    """ from DOI 10.1007/s10851-009-0161-2, #4 """
    return 1 - np.abs(np.dot(a, b))


""" distance =~ ( 1 - cos(θ) ) / 2 """

tree = BallTree(qgrid, metric=quatMetricNumba2)

theta = np.deg2rad(7.5)
rad = (1 - np.cos(theta)) / 2
# rad = np.max(tree.query(qgrid,k=4)[0])
# print(rad)

""" start loop """

fibre_e = {}
fibre_q = {}

nn_gridPts = {}
nn_gridDist = {}

for fi, fam in enumerate(pf.symHKL):
    fibre_e[fi] = {}
def indexBallTree(X, leafSize):
    tree = BallTree(X, leaf_size=leafSize)
    return tree
#print (np.mean(tt))
#np.mean(similarities2)
#print (synsets)

from nltk.metrics import *

#print(accuracy( similarities, similarities2))
#print (precision( similarities, similarities2))
#print(recall(similarities, similarities2))
#print()

from sklearn.neighbors import BallTree as BallTree

BT = BallTree(similarities, leaf_size=5, p=2)
dx, idx = BT.query(similarities[500, :], k=3)
# The original print statement here was garbled; print the query results directly.
print(dx, idx)

tfidf_vectorizer = TfidfVectorizer(stop_words=stopWords)

# Apply the vectoriser to the training set
Cardinality = 0
for files in document:
    if files.endswith('.txt'):
        Cardinality += 1

counts = CountVectorizer(input='nouns')
dtm = counts.fit_transform(document)  # a sparse matrix
vocab = counts.get_feature_names()  # a list
#type(dtm)
def train(self, df_gt):
    mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    mean_imputer.fit(df_gt.as_matrix())
    self.imputer = BallTree(mean_imputer.transform(df_gt.as_matrix()))
def build_tree(points):
    if points.shape[1] >= 20:
        # for large dimensions, use BallTree
        return BallTree(points, metric='chebyshev')
    return KDTree(points, metric='chebyshev')
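# Hedged usage sketch for build_tree above: the KD-tree is typically preferred
# in low dimensions, while the ball tree degrades more gracefully as the
# dimensionality grows. The data below is illustrative only.
import numpy as np
from sklearn.neighbors import BallTree, KDTree

rng = np.random.RandomState(0)
low_dim = rng.rand(1000, 3)    # -> KDTree branch
high_dim = rng.rand(1000, 25)  # -> BallTree branch
print(type(build_tree(low_dim)).__name__)   # 'KDTree'
print(type(build_tree(high_dim)).__name__)  # 'BallTree'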
def get_nearest_edges(G, X, Y, method=None, dist=0.0001): """ Return the graph edges nearest to a list of points. Pass in points as separate vectors of X and Y coordinates. The 'kdtree' method is by far the fastest with large data sets, but only finds approximate nearest edges if working in unprojected coordinates like lat-lng (it precisely finds the nearest edge if working in projected coordinates). The 'balltree' method is second fastest with large data sets, but it is precise if working in unprojected coordinates like lat-lng. Parameters ---------- G : networkx multidigraph X : list-like The vector of longitudes or x's for which we will find the nearest edge in the graph. For projected graphs, use the projected coordinates, usually in meters. Y : list-like The vector of latitudes or y's for which we will find the nearest edge in the graph. For projected graphs, use the projected coordinates, usually in meters. method : str {None, 'kdtree', 'balltree'} Which method to use for finding nearest edge to each point. If None, we manually find each edge one at a time using osmnx.utils.get_nearest_edge. If 'kdtree' we use scipy.spatial.cKDTree for very fast euclidean search. Recommended for projected graphs. If 'balltree', we use sklearn.neighbors.BallTree for fast haversine search. Recommended for unprojected graphs. dist : float spacing length along edges. Units are the same as the geom; Degrees for unprojected geometries and meters for projected geometries. The smaller the value, the more points are created. Returns ------- ne : ndarray array of nearest edges represented by their startpoint and endpoint ids, u and v, the OSM ids of the nodes. Info ---- The method creates equally distanced points along the edges of the network. Then, these points are used in a kdTree or BallTree search to identify which is nearest.Note that this method will not give the exact perpendicular point along the edge, but the smaller the *dist* parameter, the closer the solution will be. Code is adapted from an answer by JHuw from this original question: https://gis.stackexchange.com/questions/222315/geopandas-find-nearest-point -in-other-dataframe """ start_time = time.time() if method is None: # calculate nearest edge one at a time for each point ne = [get_nearest_edge(G, (x, y)) for x, y in zip(X, Y)] ne = [(u, v) for _, u, v in ne] elif method == 'kdtree': # check if we were able to import scipy.spatial.cKDTree successfully if not cKDTree: raise ImportError( 'The scipy package must be installed to use this optional feature.' 
) # transform graph into DataFrame edges = graph_to_gdfs(G, nodes=False, fill_edge_geometry=True) # transform edges into evenly spaced points edges['points'] = edges.apply( lambda x: redistribute_vertices(x.geometry, dist), axis=1) # develop edges data for each created points extended = edges['points'].apply([pd.Series]).stack().reset_index( level=1, drop=True).join(edges).reset_index() # Prepare btree arrays nbdata = np.array( list( zip(extended['Series'].apply(lambda x: x.x), extended['Series'].apply(lambda x: x.y)))) # build a k-d tree for euclidean nearest node search btree = cKDTree(data=nbdata, compact_nodes=True, balanced_tree=True) # query the tree for nearest node to each point points = np.array([X, Y]).T dist, idx = btree.query(points, k=1) # Returns ids of closest point eidx = extended.loc[idx, 'index'] ne = edges.loc[eidx, ['u', 'v']] elif method == 'balltree': # check if we were able to import sklearn.neighbors.BallTree successfully if not BallTree: raise ImportError( 'The scikit-learn package must be installed to use this optional feature.' ) # transform graph into DataFrame edges = graph_to_gdfs(G, nodes=False, fill_edge_geometry=True) # transform edges into evenly spaced points edges['points'] = edges.apply( lambda x: redistribute_vertices(x.geometry, dist), axis=1) # develop edges data for each created points extended = edges['points'].apply([pd.Series]).stack().reset_index( level=1, drop=True).join(edges).reset_index() # haversine requires data in form of [lat, lng] and inputs/outputs in units of radians nodes = pd.DataFrame({ 'x': extended['Series'].apply(lambda x: x.x), 'y': extended['Series'].apply(lambda x: x.y) }) nodes_rad = np.deg2rad(nodes[['y', 'x']].values.astype(np.float)) points = np.array([Y, X]).T points_rad = np.deg2rad(points) # build a ball tree for haversine nearest node search tree = BallTree(nodes_rad, metric='haversine') # query the tree for nearest node to each point idx = tree.query(points_rad, k=1, return_distance=False) eidx = extended.loc[idx[:, 0], 'index'] ne = edges.loc[eidx, ['u', 'v']] else: raise ValueError('You must pass a valid method name, or None.') log('Found nearest edges to {:,} points in {:,.2f} seconds'.format( len(X), time.time() - start_time)) return np.array(ne)
def nearest_correspondance(pts_src, pts_dest, data_src):
    tree = BallTree(pts_src, leaf_size=2)
    _, indices = tree.query(pts_dest, k=1)
    indices = indices.ravel()
    data_dest = data_src[indices]
    return data_dest
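# Hedged usage sketch for nearest_correspondance above: transferring per-point
# attributes from a source point cloud onto a destination cloud by nearest
# neighbour. The arrays below are made up for illustration.
import numpy as np

rng = np.random.RandomState(5)
pts_src = rng.rand(100, 3)   # source geometry
data_src = rng.rand(100)     # one scalar attribute per source point
pts_dest = rng.rand(20, 3)   # destination geometry
data_dest = nearest_correspondance(pts_src, pts_dest, data_src)
print(data_dest.shape)       # (20,): one transferred value per destination point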
def fit_point_cloud(src_pts, tgt_pts, rotate=True, translate=True, scale=0, x0=None, leastsq_args={}, out='params'): """Find a transform between unmatched sets of points. This minimizes the squared distance from each source point to its closest target point, using :func:`scipy.optimize.leastsq` to find a transformation using rotation, translation, and scaling (in that order). Parameters ---------- src_pts : array, shape = (n, 3) Points to which the transform should be applied. tgt_pts : array, shape = (m, 3) Points to which src_pts should be fitted; no point-to-point correspondence with src_pts is assumed. rotate : bool Allow rotation of the ``src_pts``. translate : bool Allow translation of the ``src_pts``. scale : 0 | 1 | 3 Number of scaling parameters. With 0, points are not scaled. With 1, points are scaled by the same factor along all axes. With 3, points are scaled by a separate factor along each axis. x0 : None | tuple Initial values for the fit parameters. leastsq_args : dict Additional parameters to submit to :func:`scipy.optimize.leastsq`. out : 'params' | 'trans' In what format to return the estimate: 'params' returns a tuple with the fit parameters; 'trans' returns a transformation matrix of shape (4, 4). Returns ------- x : array, shape = (n_params, ) Estimated parameters for the transformation. Notes ----- Assumes that the target points form a dense enough point cloud so that the distance of each src_pt to the closest tgt_pt can be used as an estimate of the distance of src_pt to tgt_pts. """ from scipy.optimize import leastsq kwargs = {'epsfcn': 0.01} kwargs.update(leastsq_args) # coerce arguments to the expected types src_pts = np.atleast_2d(src_pts) tgt_pts = np.atleast_2d(tgt_pts) translate = bool(translate) rotate = bool(rotate) scale = int(scale) if translate: src_pts = np.hstack((src_pts, np.ones((len(src_pts), 1)))) try: from sklearn.neighbors import BallTree tgt_pts = BallTree(tgt_pts) errfunc = _point_cloud_error_balltree except ImportError: warn("Sklearn could not be imported. Fitting points will be slower. " "To improve performance, install the sklearn module.") errfunc = _point_cloud_error # for efficiency, define parameter specific error function param_info = (rotate, translate, scale) if param_info == (True, False, 0): x0 = x0 or (0, 0, 0) def error(x): rx, ry, rz = x trans = rotation3d(rx, ry, rz) est = dot(src_pts, trans.T) err = errfunc(est, tgt_pts) return err elif param_info == (True, False, 1): x0 = x0 or (0, 0, 0, 1) def error(x): rx, ry, rz, s = x trans = rotation3d(rx, ry, rz) * s est = dot(src_pts, trans.T) err = errfunc(est, tgt_pts) return err elif param_info == (True, False, 3): x0 = x0 or (0, 0, 0, 1, 1, 1) def error(x): rx, ry, rz, sx, sy, sz = x trans = rotation3d(rx, ry, rz) * [sx, sy, sz] est = dot(src_pts, trans.T) err = errfunc(est, tgt_pts) return err elif param_info == (True, True, 0): x0 = x0 or (0, 0, 0, 0, 0, 0) def error(x): rx, ry, rz, tx, ty, tz = x trans = dot(translation(tx, ty, tz), rotation(rx, ry, rz)) est = dot(src_pts, trans.T) err = errfunc(est[:, :3], tgt_pts) return err else: raise NotImplementedError( "The specified parameter combination is not implemented: " "rotate=%r, translate=%r, scale=%r" % param_info) est, _, info, msg, _ = leastsq(error, x0, full_output=True, **kwargs) logger.debug("fit_point_cloud leastsq (%i calls) info: %s", info['nfev'], msg) if out == 'params': return est elif out == 'trans': return _trans_from_params(param_info, est) else: raise ValueError("Invalid out parameter: %r. Needs to be 'params' or " "'trans'." % out)
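# Hedged sketch of the closest-point error that fit_point_cloud minimizes. The real
# code delegates to _point_cloud_error_balltree (not shown here); the helper below is
# only an illustration of that idea, not the library's implementation: for every
# transformed source point, take the distance to its nearest target point.
import numpy as np
from sklearn.neighbors import BallTree

def _point_cloud_error_sketch(est_pts, tgt_tree):
    # est_pts: (n, 3) transformed source points; tgt_tree: BallTree over target points
    dist, _ = tgt_tree.query(est_pts, k=1)
    return dist.ravel()

tgt_tree = BallTree(np.random.rand(200, 3))
err = _point_cloud_error_sketch(np.random.rand(50, 3), tgt_tree)   # shape (50,)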
class SNNAP: def __init__(self, clip_runtime=True, feature_selection='chi-squared', top_n=3, k_neighbours=60): self._name = 'snnap' self._clip_runtime = clip_runtime self._feature_selection = feature_selection self._top_n = top_n self._k_neighbours = k_neighbours self._imputer = SimpleImputer() self._scaler = MaxAbsScaler() self._runtime_scaler = StandardScaler() self._models = [] self._rfr_params = { 'n_estimators': 100, 'criterion': 'mse', 'max_depth': None, 'min_samples_split': 2 } def get_name(self): return self._name def fit(self, scenario: ASlibScenario, fold: int, num_instances: int): self._num_algorithms = len(scenario.algorithms) self._top_n = min(self._num_algorithms, self._top_n) # resample `amount_of_training_instances` instances and preprocess them accordingly features, performances = self._resample_instances( scenario.feature_data.values, scenario.performance_data.values, num_instances, random_state=fold) # TODO: apply feature filtering such as chi-squared based selection technique features, performances = self._preprocess_scenario( scenario, features, performances) # train a runtime prediction model for each algorithm self._models = [ RandomForestRegressor(random_state=fold, **self._rfr_params) for alg in range(self._num_algorithms) ] for num, model in enumerate(self._models): model.fit(features, performances[:, num]) # build index to retrieve k nearest neighbours based on Jaccard distance of best n solvers self._index = BallTree(performances, leaf_size=30, metric='pyfunc', func=SNNAP._top_n_jaccard, metric_params={'top_n': self._top_n}) self._performances = np.copy(performances) def predict(self, features, instance_id: int): assert (features.ndim == 1), '`features` must be one dimensional' features = np.expand_dims(features, axis=0) features = self._imputer.transform(features) features = self._scaler.transform(features) # predict runtimes and get k nearest neighbours based on Jaccard distance of best n solvers predicted = np.asarray([ model.predict(features) for model in self._models ]).reshape(1, -1) neighbour_idx = np.squeeze( self._index.query(predicted, self._k_neighbours, return_distance=False)) # find best solver on the instance's k nearest neighbours (best avg. runtime / PAR10 score) sub_performances = self._performances[neighbour_idx, :] # the summed performance induces a valid ranking return np.sum(sub_performances, axis=0) def _resample_instances(self, feature_data, performance_data, num_instances, random_state): num_instances = min(num_instances, np.size( performance_data, axis=0)) if num_instances > 0 else np.size( performance_data, axis=0) return resample(feature_data, performance_data, n_samples=num_instances, random_state=random_state) def _preprocess_scenario(self, scenario: ASlibScenario, features, performances): # TODO: paper does not explicitly mention feature imputation & feature scaling features = self._imputer.fit_transform(features) features = self._scaler.fit_transform(features) # train predictors and select algorithms on running time instead of PAR10 if warranted if self._clip_runtime: performances = np.clip(performances, a_min=np.NINF, a_max=scenario.algorithm_cutoff_time) # scale performances to zero mean and unitary standard deviation performances = self._runtime_scaler.fit_transform(performances) return features, performances @staticmethod def _top_n_jaccard(x, y, **kwargs): top_n = kwargs['metric_params']['top_n'] top_n_1 = set(np.argpartition(x, top_n)[:top_n]) top_n_2 = set(np.argpartition(y, top_n)[:top_n]) return len(top_n_1.intersection(top_n_2)) / float( len(top_n_1.union(top_n_2)))
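# Illustrative check, on assumed toy runtimes, of the top-n Jaccard measure SNNAP
# uses above: the sets of the n fastest solvers of two instances are compared.
import numpy as np

x = np.array([1.0, 9.0, 2.0, 8.0])   # hypothetical runtimes of four solvers on instance A
y = np.array([1.5, 2.5, 9.0, 7.0])   # hypothetical runtimes on instance B
top_n = 2
best_x = set(np.argpartition(x, top_n)[:top_n])              # {0, 2}
best_y = set(np.argpartition(y, top_n)[:top_n])              # {0, 1}
overlap = len(best_x & best_y) / float(len(best_x | best_y))  # 1/3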
class BallsBound(Bound): def __init__(self, points, scale=1.0): """Simple mask based on coverage balls around inducing points. Args: points (Array): shape (num_points, n_dim) scale (float): Scale ball size (default 1.0) """ assert len(points.shape) == 2 self.X = points self._udim = self.X.shape[-1] self.bt = BallTree(self.X, leaf_size=2) self.epsilon = self._set_epsilon(self.X, self.bt, scale) self._volume = self._get_volume(self.X, self.epsilon, self.bt) @property def volume(self): return self._volume @property def udim(self): return self._udim @staticmethod def _set_epsilon(X, bt, scale): dims = X.shape[-1] k = [4, 5, 6] dist, ind = bt.query(X, k=k[dims - 1]) # 4th NN epsilon = np.median(dist[:, -1]) * scale * 1.5 return epsilon @staticmethod def _get_volume(X, epsilon, bt): N = 100 vol_est = [] d = X.shape[-1] area = {1: 2 * epsilon, 2: np.pi * epsilon**2}[d] for i in range(N): n = np.random.randn(*X.shape) norm = (n**2).sum(axis=1)**0.5 n = n / norm.reshape(-1, 1) r = np.random.rand(len(X))**(1 / d) * epsilon Y = X + n * r.reshape(-1, 1) in_bounds = ((Y >= 0.0) & (Y <= 1.0)).prod(axis=1, dtype="bool") Y = Y[in_bounds] counts = bt.query_radius(Y, epsilon, count_only=True) vol_est.append(area * sum(1.0 / counts)) vol_est = np.array(vol_est) out, err = vol_est.mean(), vol_est.std() / np.sqrt(N) rel = err / out if rel > 0.01: log.debug("WARNING: Rel volume uncertainty is %.4g" % rel) return out def sample(self, N): counter = 0 samples = [] d = self.X.shape[-1] while counter < N: n = np.random.randn(*self.X.shape) norm = (n**2).sum(axis=1)**0.5 n = n / norm.reshape(-1, 1) r = np.random.rand(len(self.X))**(1 / d) * self.epsilon Y = self.X + n * r.reshape(-1, 1) in_bounds = ((Y >= 0.0) & (Y <= 1.0)).prod(axis=1, dtype="bool") Y = Y[in_bounds] counts = self.bt.query_radius(Y, r=self.epsilon, count_only=True) p = 1.0 / counts w = np.random.rand(len(p)) Y = Y[p >= w] samples.append(Y) counter += len(Y) samples = np.vstack(samples) ind = np.random.choice(range(len(samples)), size=N, replace=False) return samples[ind] def __call__(self, u): u = u.reshape(len(u), -1) dist, ind = self.bt.query(u, k=1) return (dist < self.epsilon)[:, 0] @classmethod def from_IsolatedRatio(cls, ratio, obs, bound, n=10000, th=-13): """Generate new BallsBound object based on IsolatedRatio. Args: ratio (IsolatedRatio): Single ratio. obs (dict): Reference observation. bound (Bound): Bound of RatioEstimator. th (float): Threshold value, default -13 n (int): Number of random samples from bound to determine parameter boundaries. Note: All components of the RatioEstimator will be used. Avoid overlapping ratios. """ u = bound.sample(n) r = ratio(u) mask = r.max() - r < -th ind_points = u[mask] return cls(ind_points) def state_dict(self): return dict(tag="BallsBound", points=self.X, epsilon=self.epsilon, volume=self._volume) @classmethod def from_state_dict(cls, state_dict): obj = cls.__new__(cls) obj.X = state_dict["points"] assert len(obj.X.shape) == 2 obj._udim = obj.X.shape[-1] obj.epsilon = state_dict["epsilon"] obj._volume = state_dict["volume"] obj.bt = BallTree(obj.X, leaf_size=2) return obj
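# Hedged sketch (toy 2-d data) of the membership test BallsBound.__call__ performs:
# a query point lies inside the bound iff its nearest inducing point is closer than
# epsilon. The epsilon value here is an assumed placeholder; the class derives it
# from k-nearest-neighbour distances in _set_epsilon.
import numpy as np
from sklearn.neighbors import BallTree

inducing = np.random.rand(500, 2)   # assumed inducing points in the unit square
bt = BallTree(inducing, leaf_size=2)
epsilon = 0.05                      # placeholder ball radius
u = np.random.rand(10, 2)           # query points
dist, _ = bt.query(u, k=1)
inside = (dist < epsilon)[:, 0]     # boolean mask, mirroring __call__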
def __init__(self, features, labels, k=5): self.features = features self._kdtree = BallTree(features) self._y = labels self._k = k
class Knearest: """ kNN classifier """ def __init__(self, x, y, k=5): """ Creates a kNN instance :param x: Training data input :param y: Training data output :param k: The number of nearest points to consider in classification """ # You can modify the constructor, but you shouldn't need to. # Do not use another datastructure from anywhere else to # complete the assignment. self._kdtree = BallTree(x) self._y = y self._k = k def majority(self, item_indices): """ Given the indices of training examples, return the majority label. If there's a tie, return the median value (as implemented in numpy). :param item_indices: The indices of the k nearest neighbors """ assert len(item_indices) == self._k, "Did not get k inputs" # Finish this function to return the most common y value for # these indices # # http://docs.scipy.org/doc/numpy/reference/generated/numpy.median.html # If only one neighbor, the majority is definitely its label if len(item_indices) == 1: return self._y[item_indices[0]] labels = [self._y[x] for x in item_indices] u = numpy.unique( labels, return_counts=True ) # tuple, first element is the labels, second element is the count sorted_counts = numpy.argsort( u[1] ) # last element (i.e., sorted_counts[-1]) is the index of highest, etc majority_labels = [] for i in range(len(sorted_counts)): if u[1][sorted_counts[i]] == u[1][sorted_counts[-1]]: majority_labels.append(u[0][sorted_counts[i]]) return numpy.median(majority_labels) def classify(self, example): """ Given an example, classify the example. :param example: A representation of an example in the same format as training data """ # Finish this function to find the k closest points, query the # majority function, and return the value. dist, ind = self._kdtree.query([example], self._k) return self.majority(ind[0]) # return self.majority(list(random.randrange(len(self._y)) \ # for x in range(self._k))) def confusion_matrix(self, test_x, test_y): """ Given a matrix of test examples and labels, compute the confusion matrixfor the current classifier. Should return a dictionary of dictionaries where d[ii][jj] is the number of times an example with true label ii was labeled as jj. :param test_x: Test data representation :param test_y: Test data answers """ # Finish this function to build a dictionary with the # mislabeled examples. You'll need to call the classify # function for each example. d = defaultdict(dict) data_index = 0 for xx, yy in zip(test_x, test_y): data_index += 1 our_label = self.classify(xx) d[yy][our_label] = d.get(yy, {}).get(our_label, 0) + 1 # if data_index % 100 == 0: # print("%i/%i for confusion matrix" % (data_index, len(test_x))) return d @staticmethod def accuracy(confusion_matrix): """ Given a confusion matrix, compute the accuracy of the underlying classifier. """ # You do not need to modify this function total = 0 correct = 0 for ii in confusion_matrix: total += sum(confusion_matrix[ii].values()) correct += confusion_matrix[ii].get(ii, 0) if total: return float(correct) / float(total) else: return 0.0
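# Minimal usage sketch for the Knearest classifier above, on assumed toy data with
# k=3. The class itself expects numpy, BallTree and defaultdict to be imported.
import numpy

x_train = numpy.array([[0.0], [0.1], [0.2], [1.0], [1.1]])
y_train = numpy.array([0, 0, 0, 1, 1])
knn = Knearest(x_train, y_train, k=3)
print(knn.classify([0.05]))   # the three nearest neighbours all carry label 0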
def two_point_angular_window(coords_D, coords_R, bins, D_idx=None, R_idx=None, random_state=None): """Two-point correlation function Parameters ---------- data_D: data ra, dec in deg, shape = [2, n_samples] data_R: random ra, dec in deg, shape = [2, n_samples] D_idx: idx of data in field, None if all in field R_idx: idx of random in field, None if all in field random_state : integer, np.random.RandomState, or None specify the random state to use for generating background Returns ------- corr : ndarray the estimate of the correlation function within each bin shape = Nbins corr_in: """ rng = check_random_state(random_state) coords_D, coords_R = np.asanyarray(coords_D), np.asanyarray(coords_R) if D_idx is None: D_idx = np.arange(coords_D.shape[1], dtype=int) if R_idx is None: R_idx = np.arange(coords_R.shape[1], dtype=int) if bins.ndim != 1: raise ValueError("bins must be a 1D array") data = np.asarray(ra_dec_to_xyz(coords_D[0], coords_D[1]), order='F').T data_R = np.asarray(ra_dec_to_xyz(coords_R[0], coords_R[1]), order='F').T bins = angular_dist_to_euclidean_dist(bins) Nbins = len(bins) - 1 factor = len(data_R) * 1. / len(data) factor_in = len(R_idx) * 1. / len(D_idx) BT_D = BallTree(data) BT_R = BallTree(data_R) counts_DD = np.zeros(Nbins + 1, dtype=int) counts_RR = np.zeros(Nbins + 1, dtype=int) counts_DD_in = np.zeros(Nbins + 1, dtype=int) counts_RR_in = np.zeros(Nbins + 1, dtype=int) for i in range(Nbins + 1): count_listD = BT_D.query_radius(data, bins[i]) count_listR = BT_R.query_radius(data_R, bins[i]) countD = np.sum([len(count) for count in count_listD]) countR = np.sum([len(count) for count in count_listR]) countD_in = np.sum([len(count) for count in count_listD[D_idx]]) countR_in = np.sum([len(count) for count in count_listR[R_idx]]) counts_DD[i], counts_RR[i] = countD, countR counts_DD_in[i], counts_RR_in[i] = countD_in, countR_in DD = np.diff(counts_DD) RR = np.diff(counts_RR) DD_in = np.diff(counts_DD_in) RR_in = np.diff(counts_RR_in) # check for zero in the denominator RR_zero = np.where(RR == 0)[0] RR_in_zero = np.where(RR_in == 0)[0] RR[RR_zero] = 1 RR_in[RR_in_zero] = 1 corr = factor**2 * DD / RR - 1 corr_in = factor_in**2 * DD_in / RR_in - 1 corr[RR_zero] = np.nan corr_in[RR_in_zero] = np.nan return corr, corr_in
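# Hedged sketch (toy Euclidean points) of the cumulative pair-counting pattern used
# above: neighbour counts within each bin edge are accumulated and then differenced
# with np.diff to obtain per-bin pair counts. count_only=True is used here, which is
# equivalent to summing the lengths of the index lists returned in the code above.
import numpy as np
from sklearn.neighbors import BallTree

pts = np.random.rand(200, 3)
edges = np.linspace(0.05, 0.5, 6)
bt = BallTree(pts)
cumulative = np.array([bt.query_radius(pts, r, count_only=True).sum() for r in edges])
per_bin = np.diff(cumulative)   # pairs whose separation falls between consecutive edges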
def in_hull(p, hull): """ Test if points in `p` are in `hull`. `p` should be an `NxK` array of coordinates of `N` points in `K` dimensions. `hull` is either a scipy.spatial.Delaunay object or an `MxK` array of the coordinates of `M` points in `K` dimensions for which the Delaunay triangulation will be computed. """ if not isinstance(hull, Delaunay): hull = Delaunay(hull) return hull.find_simplex(p) >= 0 xyz = np.vstack([eastings, northings, elevations]).T neighbour_tree = BallTree(xyz) ref_elevation = -6380 east_grid = np.arange(min(eastings), max(eastings), grid_spacing) north_grid = np.arange(min(northings), max(northings), grid_spacing) elevation_grid = np.arange(min(elevations), max(elevations), grid_spacing) i_slice = np.where(elevation_grid < ref_elevation)[0][-1] grid_points = [] for i_east, e_grid in enumerate(east_grid): for i_north, n_grid in enumerate(north_grid): grid_points.append([e_grid, n_grid, ref_elevation]) ne_grid, nn_grid, nz_grid = len(east_grid), len(north_grid), len(elevation_grid)
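# Minimal usage sketch for in_hull above, with an assumed deterministic toy hull
# (the corners of the unit square) so the expected output is unambiguous.
import numpy as np
from scipy.spatial import Delaunay

square = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
queries = np.array([[0.5, 0.5], [2.0, 2.0]])
print(in_hull(queries, square))   # [ True False]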
from data import main from termination_criterion import cluster_evaluation from sklearn.neighbors import BallTree from identify_centroid import centroid, determine_radius from tqdm import tqdm import numpy as np all_latent, low_d, labels = main() # centroids = [np.mean(low_d[labels == l], axis=0) for l in range(2)] dists = [low_d[labels == l] for l in np.unique(labels)] dense_centroids = np.array([centroid(d, BallTree(d))[2] for d in dists]) print(f'Two? {cluster_evaluation(low_d, labels, dense_centroids)}') # ones = low_d[labels == 0] # tree = BallTree(ones) # one_points, radius, proposal = centroid(ones, tree) # def approx_equal(one: np.ndarray, two: np.ndarray) -> bool: # '''Are the two arrays approximately equal?''' # return (one - two < 1).all() # from matplotlib import pyplot as plt
class Vectors: def __init__(self, filename=None, optimize=True): self.word_index = {} self.vectors = [] self.words = [] self.ball_tree = None if filename: with open(filename) as infile: print('Reading vectors...') index = 0 for line in infile: line = line.split() if len(line) == 301: self.word_index[line[0]] = index self.vectors.append([float(x) for x in line[1:]]) self.words.append(line[0]) index += 1 print('Finished') if optimize and filename: self.optimize() def optimize(self): if not self.ball_tree: print('Optimizing search...') self.ball_tree = BallTree(self.vectors, metric=cosine) print('Finished.') else: print('Already optimized.') def save(self, filename): if filename.split('.')[-1] != 'vecs': filename += '.vecs' with open(filename, 'wb') as outfile: cPickle.dump( (self.word_index, self.vectors, self.words, self.ball_tree), outfile) print('saved as ' + filename) def load(self, filename): if filename.split('.')[-1] != 'vecs': filename += '.vecs' with open(filename, 'rb') as infile: self.word_index, self.vectors, self.words, self.ball_tree = cPickle.load( infile) def get(self, string, errors=True): # type: (str, bool) -> list return_vec = [0] * len(self.vectors[0]) for word in string.split(): try: return_vec = numpy.add(self.vectors[self.word_index[word]], return_vec) except Exception as e: if errors: raise e return return_vec def search(self, vector, k=1, return_distance=False): if self.ball_tree: a = self.ball_tree.query(numpy.array(vector).reshape(1, -1), k=k) dist, ind = a[0][0], a[1][0] else: dists = [cosine(vector, vec) for vec in self.vectors] ind = numpy.argsort(dists)[:k] dist = [dists[i] for i in ind] del dists if return_distance: return tuple( ((self.words[ind[i]], dist[i]) for i in xrange(len(ind)))) else: return tuple(self.words[ind[i]] for i in xrange(len(ind))) def distance(self, item1, item2, errors=True): if isinstance(item1, str): item1 = self.get(item1, errors=errors) if isinstance(item2, str): item2 = self.get(item2, errors=errors) return cosine(item1, item2)
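# Hedged sketch (toy vectors) of the brute-force fallback in Vectors.search when no
# ball tree has been built, assuming `cosine` is scipy.spatial.distance.cosine as the
# class appears to use: rank all stored vectors by cosine distance to the query.
import numpy
from scipy.spatial.distance import cosine

table = [[1.0, 0.0], [0.8, 0.2], [0.0, 1.0]]   # assumed toy embedding table
query = [1.0, 0.1]
dists = [cosine(query, vec) for vec in table]
nearest = numpy.argsort(dists)[:2]             # indices of the two closest vectors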
class SpatioTemporalModel(nn.Module): def __init__(self, u_size, v_size, t_size, emb_dim_u=32, emb_dim_v=32, emb_dim_t=16, hidden_dim=32, nb_cnt=100, sampling_list=None, vid_coor_rad=None, vid_pop=None, dropout=0.5): super(SpatioTemporalModel, self).__init__() self.emb_dim_u = emb_dim_u self.emb_dim_v = emb_dim_v self.emb_dim_t = emb_dim_t self.hidden_dim = hidden_dim self.u_size = u_size self.v_size = v_size self.t_size = t_size self.nb_cnt = nb_cnt self.dropout = dropout self.sampling_list = sampling_list self.vid_coor_rad = vid_coor_rad self.vid_pop = vid_pop self.tree = BallTree(vid_coor_rad.values(), leaf_size=40, metric='haversine') self.dist_metric = DistanceMetric.get_metric('haversine') self.uid_rid_sampling_info = {} for uid in range(0, u_size): self.uid_rid_sampling_info[uid] = {} self.rnn_short = nn.RNNCell(self.emb_dim_v, self.hidden_dim) #TODO check GRU self.rnn_long = nn.GRUCell(self.emb_dim_v, self.hidden_dim) self.embedder_u = nn.Embedding(self.u_size, self.emb_dim_u) self.embedder_v = nn.Embedding(self.v_size, self.emb_dim_v) self.embedder_t = nn.Embedding(self.t_size, self.emb_dim_t) self.embedder_v_context = nn.Embedding(self.v_size, self.hidden_dim * 2 + self.emb_dim_u + self.emb_dim_t) def forward(self, records_u, is_train, mod=0): predicted_scores = Variable(torch.zeros(records_u.get_predicting_records_cnt(mod=0), 1)) if is_train else [] rid_vids_true = [] rid_vids = [] vids_visited = set() records_al = records_u.get_records(mod=0) if is_train else records_u.get_records(mod=2) emb_u = self.embedder_u(Variable(torch.LongTensor([records_u.uid])).view(1, -1)).view(1, -1) hidden_long = self.init_hidden() idx = 0 for rid, record in enumerate(records_al[: -1]): if record.is_first: hidden_short = self.init_hidden() vids_visited.add(record.vid) emb_v = self.embedder_v(Variable(torch.LongTensor([record.vid])).view(1, -1)).view(1, -1) emb_t_next = self.embedder_t(Variable(torch.LongTensor([record.tid_next])).view(1, -1)).view(1, -1) hidden_long = self.rnn_long(emb_v, hidden_long) hidden_short = self.rnn_short(emb_v, hidden_short) if record.is_last: continue hidden = torch.cat((hidden_long.view(1, -1), hidden_short.view(1, -1), emb_u.view(1, -1), emb_t_next.view(1, -1)), 1) if is_train: rid_vids_true.append(record.vid_next) vid_candidates = self.get_vids_candidate(records_u.uid, rid, record.vid_next, vids_visited, True, False) scores = Variable(torch.zeros(1, self.nb_cnt + 1)) else: if rid >= records_u.test_idx: rid_vids_true.append(record.vid_next) vid_candidates = self.get_vids_candidate(records_u.uid, rid, record.vid_next, vids_visited, False, False) scores = Variable(torch.zeros(1, self.v_size)) predicted_scores.append([]) else: continue for vid_idx, vid_candidate in enumerate(vid_candidates): emb_v_context = self.embedder_v_context(Variable(torch.LongTensor([vid_candidate])).view(1, -1)).view(-1, 1) scores[0, vid_idx] = torch.mm(hidden, emb_v_context) predicted_scores[idx] = F.softmax(scores)[0, 0] if is_train else F.softmax(scores) rid_vids.append(vid_candidates) idx += 1 return predicted_scores, rid_vids, rid_vids_true def get_vids_candidate(self, uid, rid, vid_true=None, vids_visited=None, is_train=True, use_distance=True): if not use_distance: if is_train: vid_candidates = [vid_true] while len(vid_candidates) <= self.nb_cnt: vid_candidate = self.sampling_list[random.randint(0, len(self.sampling_list) - 1)] if vid_candidate != vid_true: vid_candidates.append(vid_candidate) return vid_candidates else: return range(self.v_size) else: if rid in self.uid_rid_sampling_info[uid]: vids, probs = self.uid_rid_sampling_info[uid][rid] else: nbs = set() for vid_visited in vids_visited: vids = self.tree.query_radius([self.vid_coor_rad[vid_visited]], r=0.000172657) for vid in vids[0]: if (not is_train) or (is_train and vid != vid_true): nbs.add(vid) vids = list(nbs) probs = np.array([self.vid_pop[vid] for vid in vids], dtype=np.float64) probs /= probs.sum() self.uid_rid_sampling_info[uid][rid] = (vids, probs) if is_train: id_cnt = np.random.multinomial(self.nb_cnt, probs) vid_candidates = [vid_true] for id, cnt in enumerate(id_cnt): for _ in range(cnt): vid_candidates.append(vids[id]) return vid_candidates else: return vids def init_hidden(self): return Variable(torch.zeros(1, self.hidden_dim))
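# Hedged sketch (toy coordinates, already in radians) of the haversine neighbourhood
# query that get_vids_candidate performs: all venues within a small angular radius of
# a visited venue. The radius value is the one hard-coded in the model above.
import numpy as np
from sklearn.neighbors import BallTree

coords_rad = np.radians([[40.75, -73.99], [40.76, -73.98], [48.85, 2.35]])
tree = BallTree(coords_rad, leaf_size=40, metric='haversine')
nearby = tree.query_radius([coords_rad[0]], r=0.000172657)   # array of index arrays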
class Knearest: """ kNN classifier """ def __init__(self, X, y, k=5): """ Creates a kNN instance :param x: Training data input :param y: Training data output :param k: The number of nearest points to consider in classification """ self._kdtree = BallTree(X) self._y = y self._k = k self._counts = self.label_counts() def label_counts(self): """ Given the training labels, return a dictionary d where d[y] is the number of times that label y appears in the training set. """ dictionary = { } for labels in self._y: if labels not in dictionary: dictionary[labels] = 0 dictionary[labels] += 1 return dictionary def majority(self, neighbor_indices): """ Given the indices of training examples, return the majority label. Break ties by choosing the tied label that appears most often in the training data. :param neighbor_indices: The indices of the k nearest neighbors """ assert len(neighbor_indices) == self._k, "Did not get k neighbor indices" neighbor_labels = [self._y[i] for i in neighbor_indices] #given indices, grab the corresponding labels from self._y labels_frequency = { } for labels in neighbor_labels: if labels not in labels_frequency: labels_frequency[labels] = 0 labels_frequency[labels] += 1 maximum = max(labels_frequency, key = labels_frequency.get) maximum_label = 0 if labels_frequency[maximum] == 1: for x in labels_frequency: if self._counts[x] > maximum_label: maximum_label = self._counts[x] maximum = x return maximum def classify(self, example): """ Given an example, return the predicted label. :param example: A representation of an example in the same format as a row of the training data """ dist, ind = self._kdtree.query(np.array(example).reshape(1, -1), k=self._k) return self.majority(ind[0]) def confusion_matrix(self, test_x, test_y): """ Given a matrix of test examples and labels, compute the confusion matrix for the current classifier. Should return a 2-dimensional numpy array of ints, C, where C[ii,jj] is the number of times an example with true label ii was labeled as jj. :param test_x: test data :param test_y: true test labels """ C = np.zeros((10,10), dtype=int) for xx, yy in zip(test_x, test_y): jj = self.classify(xx) #return the predicted label C[yy][jj] += 1 #increase by 1 where true label and predicted label intersect #print(C) return C @staticmethod def accuracy(C): """ Given a confusion matrix C, compute the accuracy of the underlying classifier. :param C: a confusion matrix """ return np.sum(C.diagonal()) / C.sum()
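# Minimal usage sketch for this Knearest variant, on assumed toy data with integer
# labels below 10 (the confusion matrix above is hard-coded to shape (10, 10)).
import numpy as np

X_train = np.array([[0.0], [0.1], [1.0], [1.1], [2.0]])
y_train = np.array([0, 0, 1, 1, 2])
knn = Knearest(X_train, y_train, k=3)
C = knn.confusion_matrix(np.array([[0.05], [1.05]]), np.array([0, 1]))
print(Knearest.accuracy(C))   # 1.0 on this easily separable toy set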
def knn_forward(coord_array): X = np.array(list(map(lambda x: [x[0] * PI_R, x[1] * PI_R], coord_array))) tree = BallTree(X, leaf_size=2, metric='haversine') res = tree.query_radius(X[-1].reshape(1, -1), r=R) return res[0]
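# Hedged usage sketch for knn_forward above. PI_R and R are not defined in the
# snippet, so purely illustrative values are assumed here: a degrees-to-radians
# factor and a search radius in the same (radian) units.
import numpy as np

PI_R = np.pi / 180.0    # assumed: degrees to radians
R = 0.01                # assumed: roughly 64 km on the unit sphere
coords = [(40.75, -73.99), (40.76, -73.98), (48.85, 2.35)]
print(knn_forward(coords))   # indices of points within R of the last coordinate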
def fit( self, X, y, num_boost_round=1000, validation_data=None, early_stopping_rounds=None, verbose_eval=0, persist_train=True, index_id=None, time_bins=None, ): """ Transform feature space by fitting a XGBoost model and outputting its leaf indices. Build search index in the new space to allow nearest neighbor queries at scoring time. Args: X ([pd.DataFrame, np.array]): Design matrix to fit XGBoost model y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field, and time of event or time of censoring as second field. num_boost_round (Int): Number of boosting iterations. validation_data (Tuple): Validation data in the format of a list of tuples [(X, y)] if user desires to use early stopping early_stopping_rounds (Int): Activates early stopping. Validation metric needs to improve at least once in every **early_stopping_rounds** round(s) to continue training. See xgboost.train documentation. verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation. persist_train (Bool): Whether or not to persist training data to use explainability through prototypes index_id (pd.Index): User defined index if intended to use explainability through prototypes time_bins (np.array): Specified time windows to use when making survival predictions Returns: XGBSEKaplanNeighbors: Fitted instance of XGBSEKaplanNeighbors """ self.E_train, self.T_train = convert_y(y) if time_bins is None: time_bins = get_time_bins(self.T_train, self.E_train) self.time_bins = time_bins # converting data to xgb format dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"]) # converting validation data to xgb format evals = () if validation_data: X_val, y_val = validation_data dvalid = convert_data_to_xgb_format( X_val, y_val, self.xgb_params["objective"] ) evals = [(dvalid, "validation")] # training XGB self.bst = xgb.train( self.xgb_params, dtrain, num_boost_round=num_boost_round, early_stopping_rounds=early_stopping_rounds, evals=evals, verbose_eval=verbose_eval, ) self.feature_importances_ = self.bst.get_score() # creating nearest neighbor index leaves = self.bst.predict(dtrain, pred_leaf=True) self.tree = BallTree(leaves, metric="hamming", leaf_size=40) if persist_train: self.persist_train = True if index_id is None: index_id = X.index.copy() self.index_id = index_id return self
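# Hedged sketch (toy leaf indices) of the neighbour index built in fit above: each row
# holds the leaf reached in every boosting round, and the Hamming metric measures the
# fraction of rounds in which two rows fall into different leaves.
import numpy as np
from sklearn.neighbors import BallTree

leaves = np.array([[1, 4, 2],
                   [1, 4, 3],
                   [7, 0, 5]])            # stands in for bst.predict(dtrain, pred_leaf=True)
index = BallTree(leaves, metric="hamming", leaf_size=40)
dist, idx = index.query(leaves[:1], k=2)  # dist[0] is [0.0, 1/3]: row 1 differs in one of three rounds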
def fit( self, X, y, persist_train=True, index_id=None, time_bins=None, ci_width=0.683, **xgb_kwargs, ): """ Fit a single decision tree using xgboost. For each leaf in the tree, build a Kaplan-Meier estimator. !!! Note * Differently from `XGBSEKaplanNeighbors`, in `XGBSEKaplanTree`, the width of the confidence interval (`ci_width`) must be specified at fit time. Args: X ([pd.DataFrame, np.array]): Design matrix to fit XGBoost model y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field, and time of event or time of censoring as second field. persist_train (Bool): Whether or not to persist training data to use explainability through prototypes index_id (pd.Index): User defined index if intended to use explainability through prototypes time_bins (np.array): Specified time windows to use when making survival predictions ci_width (Float): Width of confidence interval Returns: XGBSEKaplanTree: Trained instance of XGBSEKaplanTree """ E_train, T_train = convert_y(y) if time_bins is None: time_bins = get_time_bins(T_train, E_train) self.time_bins = time_bins # converting data to xgb format dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"]) # training XGB self.bst = xgb.train(self.xgb_params, dtrain, num_boost_round=1, **xgb_kwargs) self.feature_importances_ = self.bst.get_score() # getting leaves leaves = self.bst.predict(dtrain, pred_leaf=True) # organizing elements per leaf leaf_neighs = ( pd.DataFrame({"leaf": leaves}) .groupby("leaf") .apply(lambda x: list(x.index)) ) # getting T and E for each leaf T_leaves = _align_leaf_target(leaf_neighs, T_train) E_leaves = _align_leaf_target(leaf_neighs, E_train) # calculating z-score from width z = st.norm.ppf(0.5 + ci_width / 2) # vectorized (very fast!) implementation of Kaplan Meier curves ( self._train_survival, self._train_upper_ci, self._train_lower_ci, ) = calculate_kaplan_vectorized(T_leaves, E_leaves, time_bins, z) # adding leaf indexes self._train_survival = self._train_survival.set_index(leaf_neighs.index) self._train_upper_ci = self._train_upper_ci.set_index(leaf_neighs.index) self._train_lower_ci = self._train_lower_ci.set_index(leaf_neighs.index) if persist_train: self.persist_train = True if index_id is None: index_id = X.index.copy() self.tree = BallTree(leaves.reshape(-1, 1), metric="hamming", leaf_size=40) self.index_id = index_id return self
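# Quick check of the confidence-interval width used above: the default ci_width of
# 0.683 corresponds to roughly a one-sigma band, since the z-score comes out near 1.
import scipy.stats as st

z = st.norm.ppf(0.5 + 0.683 / 2)   # approximately 1.00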