Example #1
def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
                            metric='minkowski', p=2, leaf_size=40, gen_min_span_tree=False, **kwargs):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    # The Cython routines used require contiguous arrays
    if not X.flags['C_CONTIGUOUS']:
        X = np.array(X, dtype=np.double, order='C')

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)

    dist_metric = DistanceMetric.get_metric(metric, **kwargs)

    # Get distance to kth nearest neighbour
    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1].copy(order='C')

    # Mutual reachability distance is implicit in mst_linkage_core_vector
    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)
    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
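A minimal sketch of the core-distance step above, added for illustration (assumes scikit-learn and NumPy, synthetic data): the core distance of each point is its distance to the k-th nearest neighbour, read from a dual-tree BallTree query.

import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.RandomState(0)
X = rng.rand(100, 3)          # synthetic data: 100 points in 3 dimensions
min_samples = 5

tree = BallTree(X, leaf_size=40)
# query returns (distances, indices); the last distance column is the k-th neighbour
core_distances = tree.query(X, k=min_samples,
                            dualtree=True,
                            breadth_first=True)[0][:, -1]
print(core_distances.shape)   # (100,)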
Example #2
def DualTree(dataFlux,dDataFlux,modelFlux,modelParams,mcIts,columnsToScale=[]):
    '''
    Inputs:
            dataFlux = observed fluxes, array of size (#objects,#filters)
            dDataFlux = flux uncertainties, array of size (#objects,#filters)
            modelFlux = fluxes of models, array of size (#models,#filters)
            modelParams = parameters of each model to be recorded, array of size (#models,#parameters)
            mcIts = number of times to perturb fluxes for each object, int
            columnsToScale = list of column indices in modelParams of parameters that need to be multiplied by scale factor
            
    Output:
            NumPy array of size (#objects,mcIts,#params)
            e.g. the zeroth element gives you a 2d array where each row represents the
            fit parameters from one monte carlo iteration 
    '''
    modelColors = modelFlux[:,1:] / modelFlux[:,:-1]
    tree = BallTree(modelColors)
    fitParams = []
    for i in range(len(dataFlux)):
        newFlux = dataFlux[i] + dDataFlux[i] * np.random.randn(mcIts,len(dataFlux[i]))
        newColors = newFlux[:,1:] / newFlux[:,:-1]
        query = tree.query(newColors,k=1,dualtree=True)
        s = fit_tools.Scale(modelFlux[query[1][:,0]],newFlux,np.ones(np.shape(newFlux)))
        myParams = s
        for j in range(len(modelParams[0])):
            if j in columnsToScale:
                myParams = np.c_[myParams,np.multiply(s,modelParams[query[1][:,0]][:,j])]                
            else:
                myParams = np.c_[myParams,modelParams[query[1][:,0]][:,j]]
        fitParams.append(myParams)
    return(np.array(fitParams))
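A short sketch of the lookup performed inside the loop above, on synthetic data and without the project-specific fit_tools.Scale helper (array shapes are hypothetical): perturb one object's fluxes, convert to adjacent-band flux ratios, and find the nearest model with a dual-tree query.

import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.RandomState(0)
modelFlux = rng.rand(500, 6) + 0.1   # (#models, #filters)
dataFlux = rng.rand(6) + 0.1         # one observed object
dDataFlux = 0.05 * dataFlux          # its flux uncertainties
mcIts = 100

modelColors = modelFlux[:, 1:] / modelFlux[:, :-1]
tree = BallTree(modelColors)

newFlux = dataFlux + dDataFlux * rng.randn(mcIts, len(dataFlux))
newColors = newFlux[:, 1:] / newFlux[:, :-1]
dist, ind = tree.query(newColors, k=1, dualtree=True)
print(ind[:5, 0])                    # best-fitting model index per perturbation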
Example #3
def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
                            metric='minkowski', p=2, leaf_size=40, gen_min_span_tree=False):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size)

    dist_metric = DistanceMetric.get_metric(metric)

    #Get distance to kth nearest neighbour
    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1]

    # Mutual reachability distance is implicit in mst_linkage_core_cdist
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)
    #Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
    #Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
Example #4
def correrPruebaLocal(set_ampliado):
	
	print "running local test"

	train,targetTrain,test,targetTest = cargarDatosPruebaLocal(set_ampliado,0.66)

	tree = BallTree(train,leaf_size=30) 
	predictions=[]
	correctas=0
	incorrectas=0
	for x in range(len(test)):
		dist, ind = tree.query(test[x], k=4)
		resultado = obtenerPrediccionknnEB(train,targetTrain,test[x],ind.ravel())
		predictions.append(resultado)
		print progreso(x,len(test))	
		if resultado==targetTest[x]: 
			correctas+=1
		else:
			incorrectas+=1
		print "Predicciones -->  Correctas: " + str(correctas) + "Incorrectas: " + str(incorrectas)+ "Total: "+ str(len(test))
		print('> predicted=' + repr(resultado) + ', actual=' + repr(targetTest[x]) + ' ' + progreso(x,len(test)) )
	print "precision total"
	correct = 0
	for x in range(len(test)):
		if targetTest[x] == predictions[x]:
			correct += 1
	print (float(correct)/float(len(test))) * 100.0
Example #5
    def knn(a, b):
        "k nearest neighbors"
        b = np.array([bb[:-1] for bb in b])
        tree = BallTree(b)
        __, indx = tree.query(a[:-1], k)

        return [b[i] for i in indx]
Example #6
def test_barnes_hut_angle():
    # When Barnes-Hut's angle=0 this corresponds to the exact method.
    angle = 0.0
    perplexity = 10
    n_samples = 100
    for n_components in [2, 3]:
        n_features = 5
        degrees_of_freedom = float(n_components - 1.0)

        random_state = check_random_state(0)
        distances = random_state.randn(n_samples, n_features)
        distances = distances.astype(np.float32)
        distances = distances.dot(distances.T)
        np.fill_diagonal(distances, 0.0)
        params = random_state.randn(n_samples, n_components)
        P = _joint_probabilities(distances, perplexity, False)
        kl, gradex = _kl_divergence(params, P, degrees_of_freedom, n_samples,
                                    n_components)

        k = n_samples - 1
        bt = BallTree(distances)
        distances_nn, neighbors_nn = bt.query(distances, k=k + 1)
        neighbors_nn = neighbors_nn[:, 1:]
        Pbh = _joint_probabilities_nn(distances, neighbors_nn,
                                      perplexity, False)
        kl, gradbh = _kl_divergence_bh(params, Pbh, neighbors_nn,
                                       degrees_of_freedom, n_samples,
                                       n_components, angle=angle,
                                       skip_num_points=0, verbose=False)
        assert_array_almost_equal(Pbh, P, decimal=5)
        assert_array_almost_equal(gradex, gradbh, decimal=5)
Example #7
def DualTree(dataFlux, dDataFlux, modelFlux, modelParams, mcIts):
    """
    Inputs:
            dataFlux = observed fluxes, array of size (#objects,#filters)
            dDataFlux = flux uncertainties, array of size (#objects,#filters)
            modelFlux = fluxes of models, array of size (#models,#filters)
            modelParams = parameters of each model to be recorded, array of size (#models,#parameters)
            mcIts = number of times to perturb fluxes for each object, int
            
    Output:
            NumPy array of size (#objects,mcIts,#params)
            e.g. the zeroth element gives you a 2d array where each row represents the
            fit parameters from one monte carlo iteration 
    """
    modelColors = modelFlux[:, 1:] / modelFlux[:, :-1]
    tree = BallTree(modelColors)
    fitParams = []
    for i in range(len(dataFlux)):
        newFlux = dataFlux[i] + dDataFlux[i] * np.random.randn(mcIts, len(dataFlux[i]))
        newColors = newFlux[:, 1:] / newFlux[:, :-1]
        query = tree.query(newColors, k=1, dualtree=True)
        s = Scale(modelFlux[query[1][:, 0]], newFlux, np.ones(np.shape(newFlux)))
        myParams = s
        for j in range(len(modelParams[0])):
            myParams = np.c_[myParams, modelParams[query[1][:, 0]][:, j]]
        fitParams.append(myParams)
    return np.array(fitParams)
Example #8
    def run_single_trial(self, train_pairs, test_pairs, train_tune_data, test_tune_data):
        print "Running PCA..."
        train_pairs_pca, test_pairs_pca = self.fit_pca(train_pairs, test_pairs)
        ys = ys_from_pairs(train_pairs_pca)

        file_id = str(random.random())[2:]

        save_cvx_params(ys, file_id)
        run_cvx(file_id)
        M = load_cvx_result(file_id)
        dist = DistanceMetric.get_metric('mahalanobis', VI = M)
        train_a_sections = [x[0] for x in train_pairs_pca]
        train_b_sections = [x[1] for x in train_pairs_pca]
        test_a_sections = [x[0] for x in test_pairs_pca]
        test_b_sections = [x[1] for x in test_pairs_pca]

        train_given_sections = train_a_sections
        train_to_match_sections = train_b_sections
        test_given_sections = test_a_sections
        test_to_match_sections = test_b_sections
        if self.match_a_to_b:
            train_given_sections = train_b_sections
            train_to_match_sections = train_a_sections
            test_given_sections = test_b_sections
            test_to_match_sections = test_a_sections

        print "Constructing BallTrees..."
        train_bt = BallTree(train_to_match_sections, metric=dist)
        test_bt = BallTree(test_to_match_sections, metric=dist)

        train_top_fraction = int(len(train_given_sections) * self.correct_within_top_fraction)
        test_top_fraction = int(len(test_given_sections) * self.correct_within_top_fraction)

        print "Querying the BallTrees..."
        train_result = train_bt.query(train_given_sections, train_top_fraction)
        test_result = test_bt.query(test_given_sections, test_top_fraction)

        print "Looking at correctness of results..."
        train_correct = sum([int(i in train_result[1][i]) for i in xrange(len(train_given_sections))])
        test_correct = sum([int(i in test_result[1][i]) for i in xrange(len(test_given_sections))])

        print "Finding indices of correct matches..."
        test_result_full = test_bt.query(test_given_sections, len(test_given_sections))
        def default_index(lst, i):
          ind = -1
          try:
            ind = lst.index(i)
          except:
            pass
          return ind
        test_indices = [default_index(list(test_result_full[1][i]), i) for i in xrange(len(test_given_sections))]
        test_indices = [x for x in test_indices if x != -1]

        with open("successful_tunes_{}".format(file_id), 'w') as successful_tunes_f:
          for i, index in enumerate(test_indices):
            if index == 0:
              successful_tunes_f.write(str(test_tune_data[i]) + '\n\n')

        return [[train_correct, len(train_given_sections)],
            [test_correct, len(test_given_sections)]], test_indices
Example #9
def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
                            metric='minkowski', p=2, leaf_size=40, gen_min_span_tree=False):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    dim = X.shape[0]
    min_samples = min(dim - 1, min_samples)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size)

    dist_metric = DistanceMetric.get_metric(metric)

    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1]
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
Example #10
def _compute_nearest(xhs, rr, use_balltree=True, return_dists=False):
    """Find nearest neighbors

    Note: The rows in xhs and rr must all be unit-length vectors, otherwise
    the result will be incorrect.

    Parameters
    ----------
    xhs : array, shape=(n_samples, n_dim)
        Points of data set.
    rr : array, shape=(n_query, n_dim)
        Points to find nearest neighbors for.
    use_balltree : bool
        Use fast BallTree based search from scikit-learn. If scikit-learn
        is not installed it will fall back to the slow brute force search.
    return_dists : bool
        If True, return associated distances.

    Returns
    -------
    nearest : array, shape=(n_query,)
        Index of nearest neighbor in xhs for every point in rr.
    distances : array, shape=(n_query,)
        The distances. Only returned if return_dists is True.
    """
    if use_balltree:
        try:
            from sklearn.neighbors import BallTree
        except ImportError:
            logger.info('Nearest-neighbor searches will be significantly '
                        'faster if scikit-learn is installed.')
            use_balltree = False

    if xhs.size == 0 or rr.size == 0:
        if return_dists:
            return np.array([], int), np.array([])
        return np.array([], int)
    if use_balltree is True:
        ball_tree = BallTree(xhs)
        if return_dists:
            out = ball_tree.query(rr, k=1, return_distance=True)
            return out[1][:, 0], out[0][:, 0]
        else:
            nearest = ball_tree.query(rr, k=1, return_distance=False)[:, 0]
            return nearest
    else:
        from scipy.spatial.distance import cdist
        if return_dists:
            nearest = list()
            dists = list()
            for r in rr:
                d = cdist(r[np.newaxis, :], xhs)
                idx = np.argmin(d)
                nearest.append(idx)
                dists.append(d[0, idx])
            return (np.array(nearest), np.array(dists))
        else:
            nearest = np.array([np.argmin(cdist(r[np.newaxis, :], xhs))
                                for r in rr])
            return nearest
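A hypothetical call of the helper above, with both point sets normalised to unit length as the docstring requires; it returns, for every row of rr, the index of the nearest row in xhs.

import numpy as np

rng = np.random.RandomState(0)
xhs = rng.randn(1000, 3)
xhs /= np.linalg.norm(xhs, axis=1, keepdims=True)   # unit-length rows
rr = rng.randn(10, 3)
rr /= np.linalg.norm(rr, axis=1, keepdims=True)

nearest, dists = _compute_nearest(xhs, rr, return_dists=True)
print(nearest.shape, dists.shape)                   # (10,) (10,)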
Example #11
def compute_labels(X, C):
    """Compute the cluster labels for dataset X given centers C.
    """
    # labels = np.argmin(pairwise_distances(C, X), axis=0) # THIS REQUIRES TOO MUCH MEMORY FOR LARGE X
    tree = BallTree(C)
    labels = tree.query(X, k=1, return_distance=False).squeeze()
    return labels
Example #12
def md_nearest_from_centroids(seeding, centroids):
    # mean distance
    ball_tree = BallTree(seeding)
    dist, idx = ball_tree.query(centroids)
    sum_dist = sum(d[0] for d in dist)
    mean = sum_dist / len(centroids)
    return mean
Example #13
def _rsl_prims_balltree(X, cut, k=5, alpha=1.4142135623730951, gamma=5, metric='minkowski', p=2):

    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2 # Unused, but needs to be integer; assume euclidean

    dim = X.shape[0]
    k = min(dim - 1, k)

    tree = BallTree(X, metric=metric)

    dist_metric = DistanceMetric.get_metric(metric)

    core_distances = tree.query(X, k=k)[0][:,-1]
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric)

    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)

    labels = single_linkage_tree.get_clusters(cut, gamma)

    return labels, single_linkage_tree
Example #14
class BallTreeANN:
    def __init__(self):
        """
        Constructor
        """
        self.nbrs = None

    def build_index(self, dataset, leaf_size):
        self.nbrs = BallTree(dataset, leaf_size=leaf_size, metric="euclidean")
        return self.nbrs

    def build_store_index(self, dataset, path, leaf_size):
        self.build_index(dataset, leaf_size)
        self.store_index(path)

    def store_index(self, path):
        with open(path, "wb") as output1:
            pickle.dump(self.nbrs, output1, pickle.HIGHEST_PROTOCOL)

    def load_index(self, path):
        with open(path, "rb") as input1:
            self.nbrs = pickle.load(input1)

    def search_in_radious(self, vector, radious=2):
        # query_radius returns (indices, distances) when return_distance=True
        indices, distances = self.nbrs.query_radius(vector, r=radious, return_distance=True)
        return distances, indices

    def search_neighbors(self, vector, num_neighbors):
        distances, indices = self.nbrs.query(vector, k=num_neighbors)
        return distances, indices
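A hypothetical round trip with the class above: build an index, persist it with pickle, reload it, and query it (BallTree.query expects a 2-D array, so a single vector is passed as one row).

import numpy as np

data = np.random.rand(200, 16)
ann = BallTreeANN()
ann.build_store_index(data, "balltree.pkl", leaf_size=40)

ann2 = BallTreeANN()
ann2.load_index("balltree.pkl")
distances, indices = ann2.search_neighbors(data[:1], num_neighbors=5)
print(indices[0])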
Example #15
def _calc_tree(xx, yy, radius):
    X = np.zeros((len(xx), 2), dtype='float')
    X[:, 0] = xx[:]
    X[:, 1] = yy[:]
    tree = BallTree(X, metric='euclidean')
    ind = tree.query_radius(X, r=radius)
    ind_sw = tree.query_radius(X, r=VARIANCE_RADIUS_SW)
    return ind, ind_sw
Example #16
def rmsd_nearest_from_centroids(seeding, centroids):
    # root mean squared distance from each centroids to its closest seeding
    ball_tree = BallTree(seeding)
    dist, idx = ball_tree.query(centroids)

    # root mean squared distance
    sum_sqdist = sum(d[0] ** 2 for d in dist)
    mean = sum_sqdist / len(centroids)
    return mean ** 0.5
Example #17
    def eval(self, X):
        """Evaluate the kernel density estimation

        Parameters
        ----------
        X : array_like
            array of points at which to evaluate the KDE.  Shape is
            (n_points, n_dim), where n_dim matches the dimension of
            the training points.

        Returns
        -------
        dens : ndarray
            array of shape (n_points,) giving the density at each point.
            The density will be normalized for metric='gaussian' or
            metric='tophat', and will be unnormalized otherwise.
        """
        X = np.atleast_2d(X)
        if X.ndim != 2:
            raise ValueError('X must be two-dimensional')

        if X.shape[1] != self.X_.shape[1]:
            raise ValueError('dimensions of X do not match training dimension')

        if self.metric == 'gaussian':
            # wrangle gaussian into scikit-learn's 'rbf' kernel
            gamma = 0.5 / self.h / self.h
            D = pairwise_kernels(X, self.X_, metric='rbf', gamma=gamma)
            D /= np.sqrt(2 * np.pi * self.h ** (2 * X.shape[1]))
            dens = D.sum(1)

        elif self.metric == 'tophat':
            # use Ball Tree to efficiently count neighbors
            bt = BallTree(self.X_)
            counts = bt.query_radius(X, self.h,
                                     count_only=True)
            dens = counts / n_volume(self.h, X.shape[1])

        elif self.metric == 'exponential':
            D = pairwise_distances(X, self.X_)
            dens = np.exp(-abs(D) / self.h)
            dens = dens.sum(1)
            dens /= n_volume(self.h, X.shape[1]) * special.gamma(X.shape[1])

        elif self.metric == 'quadratic':
            D = pairwise_distances(X, self.X_)
            dens = (1 - (D / self.h) ** 2)
            dens[D > self.h] = 0
            dens = dens.sum(1)
            dens /= 2. * n_volume(self.h, X.shape[1]) / (X.shape[1] + 2)

        else:
            D = pairwise_kernels(X, self.X_, metric=self.metric, **self.kwargs)
            dens = D.sum(1)

        return dens
Example #18
def md_weighted_nearest_from_centroids(seeding, centroids, weights):
    assert len(centroids) == len(weights)

    sum_weight = sum(weights)

    ball_tree = BallTree(seeding)
    dist, idx = ball_tree.query(centroids)
    sum_weighted_dist = sum(d[0] * weight for d, weight in zip(dist, weights))
    mean = sum_weighted_dist / sum_weight
    return mean
Example #19
    def predict(self, X):
        # sklearn's BallTree takes the data in its constructor (it has no fit method)
        ball_tree = BallTree(self.cluster_centers_)

        _, indexes = ball_tree.query(X)
        result = []
        for idx, in indexes:
            result.append(self.labels_[idx])

        return result
Example #20
def get_centroid_weights(X, centroids):
    assert isinstance(X, np.ndarray)
    assert isinstance(centroids, np.ndarray)

    ball_tree = BallTree(centroids)
    dist, indexes = ball_tree.query(X)
    weights = [0 for i in centroids]
    # query returns indices with shape (n_samples, 1); unpack the single column
    for (idx,) in indexes:
        weights[idx] += 1

    return weights
Example #21
def get_graph_topo(halos):

    x,y,z = cosmology.spherical_to_cartesian_with_redshift(halos['ra'],halos['dec'],halos['z'])
    box_coords = np.concatenate( [x,y,z] , axis=1)
    BT = BallTree(box_coords, leaf_size=5)
    list_conn = []
    for ih,vh in enumerate(halos):
       
        n_connections=70
        bt_dx,bt_id = BT.query(box_coords[ih,:],k=n_connections)

        for ic,vc in enumerate(halos[bt_id]):

            pass
Example #22
def build_knn_matrix(data_matrix):
	neighbours_matrix = np.zeros((voxel_num,K_NN-1))
	tree = BallTree(data_matrix[:,0:3])
	for voxel in range(voxel_num):
		dist,ind = tree.query(data_matrix[voxel,0:3],k = K_NN)
		neighbours_matrix[voxel,:] = ind[0,1:]

	for cur_voxel in range(voxel_num):
		neighbours = neighbours_matrix[cur_voxel,:]
		for ind in range(len(neighbours)):
			neighbour = int(neighbours[ind])
			if(cur_voxel not in neighbours_matrix[neighbour,:]):
				neighbours_matrix[cur_voxel,ind] = -1

	return neighbours_matrix
Example #23
def correrPruebaParaKaggle(set_ampliado):
	print "running tests for kaggle"

	train,targetTrain,test = cargarDatosParaKaggle(set_ampliado)


	tree = BallTree(train,leaf_size=30) 
	predictions=[]
	for x in range(len(test)):
		dist, ind = tree.query(test[x], k=4)
		resultado = obtenerPrediccionknnEB(train,targetTrain,test[x],ind.ravel())
		predictions.append(resultado)
		print progreso(x,len(test))

	guardarPrediccionesParaKaggle(predictions)
Example #24
	def __init__(self, data_points = None, ai_history = None, threshold = THRESHOLD):
		self.state_list = []
		self.weights_list = []

		if data_points is None:
			data_points = []
		if ai_history is None:
			ai_history = []

		for state, weights in data_points:
			assert(len(state) == 32)
			self.state_list.append(state)
			self.weights_list.append(weights)

		self._threshold = threshold
		self._ai_history = cp.deepcopy(ai_history)

		#self._featureTransform()
		self.X = np.array(self.state_list)

		assert(self.X.shape == (len(data_points), 32) or len(data_points) == 0)
		#Think about different distance metrics. Manhattan or minkowski? P < 1?
		if len(data_points) > 0:
			self._tree = BallTree(self.X, metric='manhattan')
		else:
			self._tree = None
Example #25
    def construct_tree(self, alpha = 1, beta = 1, theta = 1):
        # Initialize tree size
        self.treeSize = 0
        # construct a tree with the mfcc of the 1st frame of each seg, and beat chroma 
        self.feature_size = self.mfcc_size + self.chroma_size + self.rms_size
        X = np.zeros((len(self.segments), self.feature_size))
        for idx, seg in enumerate(self.segments):
            X [idx,:] = seg.get_seg_feature()
            #X[idx,0:self.mfcc_size] = seg.get_head_mfcc(n=1)
            #X[idx,self.mfcc_size:-self.rms_size] = seg.get_head_chroma(n=1)
            #X[idx,-self.rms_size:] = seg.get_head_rms(n=5)
            seg.idx = self.treeSize
            self.treeSize += 1

        def mydist(x, y):
            
            xMFCC = x[0:self.mfcc_size]
            yMFCC = y[0:self.mfcc_size]
            xChroma = x[self.mfcc_size:self.mfcc_size+self.chroma_size]
            yChroma = y[self.mfcc_size:self.mfcc_size+self.chroma_size]
            xRms = x[-self.rms_size:]
            yRms = y[-self.rms_size:]
            
            dist1 = np.sum((xMFCC-yMFCC)**2) / 5000
            dist2 = 1.0 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
            #dist2 = spatial.distance.cosine(xChroma,yChroma)     #unknown why this doesn't work
            dist3 = np.sum((xRms-yRms)**2)
            dist = alpha * dist1 + beta * dist2 + theta * dist3
            return dist

        self.tree = BallTree(X, leaf_size=2, metric = 'pyfunc',func = mydist)
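The metric='pyfunc', func=... spelling above comes from older scikit-learn releases; recent versions accept the callable directly as metric. A minimal sketch under that assumption, with a toy distance standing in for the MFCC/chroma/RMS blend:

import numpy as np
from sklearn.neighbors import BallTree

def my_dist(x, y):
    # toy metric: plain Euclidean distance between two 1-D feature vectors
    return float(np.sqrt(np.sum((x - y) ** 2)))

X = np.random.rand(20, 8)
tree = BallTree(X, leaf_size=2, metric=my_dist)
dist, ind = tree.query(X[:1], k=3)
print(ind[0])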
Example #26
def calc_vert_vals(verts, pts, vals, method='max', k_points=100):
    ball_tree = BallTree(pts)
    dists, pts_inds = ball_tree.query(verts, k=k_points, return_distance=True)
    near_vals = vals[pts_inds]
    # sig_dists = dists[np.where(abs(near_vals)>2)]
    cover = len(np.unique(pts_inds.ravel()))/float(len(pts))
    print('{}% of the points are covered'.format(cover*100))
    if method=='dist':
        n_dists = 1/(dists**2)
        norm = 1/np.sum(n_dists, 1)
        norm = np.reshape(norm, (len(norm), 1))
        n_dists = norm * n_dists
        verts_vals = np.sum(near_vals * n_dists, 1)
    elif method=='max':
        verts_vals = near_vals[range(near_vals.shape[0]), np.argmax(abs(near_vals), 1)]
    return verts_vals
Example #27
    def fit(self, X_cv, y_true=None, weights=None):
        from sklearn.neighbors import BallTree
        from sklearn.metrics import accuracy_score
        import random
        import time

        if y_true is None:
            raise ValueError('we need y labels to supervise-fit!')
        else:
            t0 = time.time()
            predictions = []
            for name, model in self.models.iteritems():
                #predictions.append(model.predict(X_cv))
                # print len(predictions[-1])
                if self.common_neigh:
                    X_tr = self.counter.fit_transform(X_cv)
                    self.gt_tree = BallTree(X_tr.toarray(), leaf_size=20)
                else:
                    X_tr = self.models_tr[name].transform(X_cv)
                    if hasattr(X_tr, "toarray"):
                        self.trees[name] = BallTree(X_tr.toarray(), leaf_size=20)
                    else:
                        self.trees[name] = BallTree(X_tr, leaf_size=20)    
                self.predictions[name] = model.predict(X_cv)
            self.true = y_true
            print 'Fitting time %0.2f' % (time.time() - t0)
Example #28
class BallTreeRecommender(object):
    """
    Given input terms, provide k recipe recommendations
    """
    def __init__(self, k=3, **kwargs):
        self.k = k
        self.trans_path = "svd.pkl"
        self.tree_path = "tree.pkl"
        self.transformer = False
        self.tree = None
        self.load()

    def load(self):
        """
        Load a pickled transformer and tree from disk,
        if they exist.
        """
        if os.path.exists(self.trans_path):
            self.transformer = joblib.load(open(self.trans_path, 'rb'))
            self.tree = joblib.load(open(self.tree_path, 'rb'))
        else:
            self.transformer = False
            self.tree = None

    def save(self):
        """
        It takes a long time to fit, so just do it once!
        """
        joblib.dump(self.transformer, open(self.trans_path, 'wb'))
        joblib.dump(self.tree, open(self.tree_path, 'wb'))

    def fit_transform(self, documents):
        # Transformer will be False if pipeline hasn't been fit yet,
        # Trigger fit_transform and save the transformer and lexicon.
        if self.transformer == False:
            self.transformer = Pipeline([
                ('norm', TextNormalizer(minimum=50, maximum=200)),
                ('transform', Pipeline([
                    ('tfidf', TfidfVectorizer()),
                    ('svd', TruncatedSVD(n_components=200))
                ])
                 )
            ])
            self.lexicon = self.transformer.fit_transform(documents)
            self.tree = BallTree(self.lexicon)
            self.save()

    def query(self, terms):
        """
        Given input list of ingredient terms,
        return the k closest matching recipes.

        :param terms: list of strings
        :return: list of document indices of documents
        """
        vect_doc = self.transformer.named_steps['transform'].fit_transform(
            wordpunct_tokenize(terms)
        )
        dists, inds = self.tree.query(vect_doc, k=self.k)
        return inds[0]
Example #29
 def populate_vector_space(self, vocabulary):
     #todo the exact data structure used here will need optimisation
     """
     Input is like:
      ('1-GRAM', ('Seattle/N',)),
      ('1-GRAM', ('Senate/N',)),
      ('1-GRAM', ('September/N',)),
      ('1-GRAM', ('Service/N',)),
      ('AN', ('similar/J', 'agreement/N')),
      ('AN', ('tough/J', 'stance/N')),
      ('AN', ('year-ago/J', 'period/N'))
     """
     logging.debug('Populating vector space with vocabulary %s', vocabulary)
     vectors = [c._get_vector(data).A
                for (feature_type, data) in vocabulary
                for c in self.composer_mapping[feature_type]
                if feature_type in self.composer_mapping and (feature_type, data) in c]
     self.feature_matrix = vstack(vectors)
     logging.debug('Building BallTree for matrix of size %s', self.feature_matrix.shape)
     feature_list = [ngram for ngram in vocabulary for _ in self.composer_mapping[ngram[0]]]
     #todo test if this entry index is correct
     self.entry_index = {i: ngram for i, ngram in enumerate(feature_list)}
     #assert len(feature_list) == self.feature_matrix.shape[0]
     #todo BallTree/KDTree only work with dense inputs
     #self.nbrs = KDTree(n_neighbors=1, algorithm='kd_tree').fit(self.feature_matrix)
     self.nbrs = BallTree(self.feature_matrix, metric=cosine)
     logging.debug('Done building BallTree')
     return self.nbrs
Example #30
def get_evidence_grid(points, res_pts, intr_prms, exact=False):
    """
    Associate the "z-axis" value (evidence, overlap, etc...) res_pts with its
    corresponding point in the template bank (points). If exact is True, then
    the point must exactly match the point in the bank.
    """
    grid_tree = BallTree(selected)
    grid_idx = []
    # Reorder the grid points to match their weight indices
    for res in res_pts:
        dist, idx = grid_tree.query(res, k=1)
        # Stupid floating point inexactitude...
        #print res, selected[idx[0][0]]
        #assert numpy.allclose(res, selected[idx[0][0]])
        grid_idx.append(idx[0][0])
    return points[grid_idx]
Example #31
def trainSimilarCases(firstTime=False, type="lda", vector_size=70):
    if firstTime:
        # 1: read the ygsc texts and their corresponding xml names
        ygscs = []
        xml_names = []
        labels = []
        db = dbutil.get_mongodb_conn()
        cases_set = db.cases
        for line in cases_set.find({
                "flag": 2
        }, {
                "_id": 1,
                "ygscWords2": 1,
                "label": 1
        },
                                   no_cursor_timeout=True).batch_size(20):
            ygscs.append(line["ygscWords2"])
            xml_names.append(line["_id"])
            labels.append(line["label"])

        ################## 2: convert to one-hot   #####################
        cv = CountVectorizer(max_df=0.95,
                             min_df=1,
                             stop_words=None,
                             token_pattern=r"(?u)\b\w+\b")
        one_hot_matrix = cv.fit_transform(ygscs).toarray()

        # 3: save the one-hot artifacts
        with open("checkpoint/cv.pk", "wb") as file:
            joblib.dump(cv, file)
        with open("checkpoint/onehot.pk", "wb") as file:
            joblib.dump(one_hot_matrix, file)
        with open("checkpoint/xml_names.pk", "wb") as file:
            joblib.dump(xml_names, file)
        with open("checkpoint/label.pk", "wb") as file:
            import numpy
            joblib.dump(numpy.asarray(labels), file)

        ################## 2': convert to tf-idf   ######################
        tfidf = TfidfVectorizer(max_df=0.95,
                                min_df=1,
                                stop_words=None,
                                token_pattern=r"(?u)\b\w+\b")
        tf_idf_matrix = tfidf.fit_transform(ygscs).toarray()

        # 3': save tf-idf
        with open("checkpoint/tfidf.pk", "wb") as file:
            joblib.dump(tfidf, file)

        # 4: build the ball tree
        ball_tree = BallTree(tf_idf_matrix)
        with open("checkpoint/tfidf_ball_tree.pk", "wb") as file:
            joblib.dump(ball_tree, file)
    else:
        with open("checkpoint/onehot.pk", "rb") as file:
            one_hot_matrix = joblib.load(file)

    if type == "lda":
        # 0: fetch the test set
        test_ygscs = []
        db = dbutil.get_mongodb_conn()
        cases_set = db.cases
        for line in cases_set.find({
                "flag": 4
        }, {
                "ygscWords2": 1
        },
                                   no_cursor_timeout=True).batch_size(20):
            test_ygscs.append(line["ygscWords2"])

        with open("checkpoint/cv.pk", "rb") as file:
            cv = joblib.load(file)
        test_matrix = cv.transform(test_ygscs).toarray()

        # 1: train LDA
        lda = LatentDirichletAllocation(n_components=vector_size)
        lda_matrix = lda.fit_transform(one_hot_matrix)

        # 2: evaluate
        print("lda" + str(vector_size) + " train perplexity: " +
              str(lda.perplexity(one_hot_matrix)))  # check perplexity
        print("lda" + str(vector_size) + " test perplexity: " +
              str(lda.perplexity(test_matrix)))  # check perplexity

        # 3: save
        with open("checkpoint/lda.pk", "wb") as file:
            joblib.dump(lda, file)
        with open("checkpoint/feature_matric.pk", "wb") as file:
            joblib.dump(lda_matrix, file)
        # 4: build the ball tree
        ball_tree = BallTree(lda_matrix)
        with open("checkpoint/lda_ball_tree.pk", "wb") as file:
            joblib.dump(ball_tree, file)
    elif type == "svd":
        # 1: train SVD
        svd = TruncatedSVD(n_components=vector_size)  # number of components to keep
        svd_matrix = svd.fit_transform(one_hot_matrix)

        # 2: save
        with open("checkpoint/svd.pk", "wb") as file:
            joblib.dump(svd, file)

        # 3: build the ball tree
        ball_tree = BallTree(svd_matrix)
        with open("checkpoint/svd_ball_tree.pk", "wb") as file:
            joblib.dump(ball_tree, file)
Example #32
 def create_ball_tree(self):
     entities = self.model.all_entity_vectors()
     self.tree = BallTree(entities, leaf_size=entities.shape[0])
Example #33
word_vectors = {}
with open("../embeddings/word_embeddings100.txt", 'r') as f:
    for line in f:
        item_id = line[line.index("(") + 2:line.index(",") - 1]
        vector = line[line.index("[") + 1:-3].split(",")
        vector = [float(x.strip()) for x in vector]
        word_vectors[item_id] = vector
tags = ["Sentimental", "folk-rock", "Disney", "1980s", "Swing-Jazz"]

k = 6
X = list(word_vectors.values())
ids = list(word_vectors.keys())
id_index_list = {}
num_items = len(ids)
for i in range(num_items):
    id_index_list[ids[i]] = i
tree = BallTree(X, leaf_size=40)
tag_recs = {}

for tag in tags:
    _, recs_ind = tree.query([word_vectors[tag]], k=k)
    recs = []
    for i in recs_ind[0]:
        # filter items in user history
        recs.append(ids[i])
    tag_recs[tag] = recs

for tag, recs in tag_recs.items():
    print(recs)
Example #34
    def spatial_interaction_internal(adata_subset, x_coordinate, y_coordinate,
                                     phenotype, method, radius, knn,
                                     permutation, imageid, subset,
                                     pval_method):

        print("Processing Image: " + str(adata_subset.obs[imageid].unique()))

        # Create a dataFrame with the necessary information
        data = pd.DataFrame({
            'x': adata_subset.obs[x_coordinate],
            'y': adata_subset.obs[y_coordinate],
            'phenotype': adata_subset.obs[phenotype]
        })

        # Identify neighbourhoods based on the method used
        # a) KNN method
        if method == 'knn':
            print("Identifying the " + str(knn) +
                  " nearest neighbours for every cell")
            tree = BallTree(data[['x', 'y']], leaf_size=2)
            ind = tree.query(data[['x', 'y']], k=knn, return_distance=False)
            neighbours = pd.DataFrame(ind.tolist(),
                                      index=data.index)  # neighbour DF
            neighbours.drop(0, axis=1, inplace=True)  # Remove self neighbour

        # b) Local radius method
        if method == 'radius':
            print("Identifying neighbours within " + str(radius) +
                  " pixels of every cell")
            kdt = BallTree(data[['x', 'y']], metric='euclidean')
            ind = kdt.query_radius(data[['x', 'y']],
                                   r=radius,
                                   return_distance=False)
            for i in range(0, len(ind)):
                ind[i] = np.delete(ind[i],
                                   np.argwhere(ind[i] == i))  #remove self
            neighbours = pd.DataFrame(ind.tolist(),
                                      index=data.index)  # neighbour DF

        # Map Phenotypes to Neighbours
        # Loop through (all functionized methods were very slow)
        phenomap = dict(zip(list(range(len(ind))),
                            data['phenotype']))  # Used for mapping
        print("Mapping phenotype to neighbors")
        for i in neighbours.columns:
            neighbours[i] = neighbours[i].dropna().map(phenomap,
                                                       na_action='ignore')

        # Drop NA
        neighbours = neighbours.dropna(how='all')

        # Collapse all the neighbours into a single column
        n = pd.DataFrame(neighbours.stack(), columns=["neighbour_phenotype"])
        n.index = n.index.get_level_values(0)  # Drop the multi index

        # Merge with real phenotype
        n = n.merge(data['phenotype'],
                    how='inner',
                    left_index=True,
                    right_index=True)

        # Permutation
        print('Performing ' + str(permutation) + ' permutations')

        def permutation_pval(data):
            data = data.assign(neighbour_phenotype=np.random.permutation(
                data['neighbour_phenotype']))
            #data['neighbour_phenotype'] = np.random.permutation(data['neighbour_phenotype'])
            data_freq = data.groupby(['phenotype',
                                      'neighbour_phenotype']).size().unstack()
            data_freq = data_freq.fillna(0).stack().values
            return data_freq

        # Apply function
        final_scores = Parallel(n_jobs=-1)(delayed(permutation_pval)(data=n)
                                           for i in range(permutation))
        perm = pd.DataFrame(final_scores).T

        # Consolidate the permutation results
        print('Consolidating the permutation results')
        # Calculate P value
        # real
        n_freq = n.groupby(['phenotype', 'neighbour_phenotype'
                            ]).size().unstack().fillna(0).stack()
        # permutation
        mean = perm.mean(axis=1)
        std = perm.std(axis=1)
        # P-value calculation
        if pval_method == 'histocat':
            # (real value - perm value) / no of perm
            p_values = abs(n_freq.values - mean) / (permutation + 1)
            p_values = p_values[~np.isnan(p_values)].values
        if pval_method == 'zscore':
            z_scores = (n_freq.values - mean) / std
            z_scores[np.isnan(z_scores)] = 0
            p_values = scipy.stats.norm.sf(abs(z_scores)) * 2
            p_values = p_values[~np.isnan(p_values)]

        # Compute Direction of interaction (interaction or avoidance)
        direction = ((n_freq.values - mean) /
                     abs(n_freq.values - mean)).fillna(1)

        # Normalize based on total cell count
        k = n.groupby(['phenotype',
                       'neighbour_phenotype']).size().unstack().fillna(0)
        # add neighbour phenotype that are not present to make k a square matrix
        columns_to_add = dict.fromkeys(np.setdiff1d(k.index, k.columns), 0)
        k = k.assign(**columns_to_add)

        total_cell_count = data['phenotype'].value_counts()
        total_cell_count = total_cell_count[
            k.
            columns].values  # keep only cell types that are present in the column of k
        # total_cell_count = total_cell_count.reindex(k.columns).values # replaced by above
        k_max = k.div(total_cell_count, axis=0)
        k_max = k_max.div(k_max.max(axis=1), axis=0).stack()

        # DataFrame with the neighbour frequency and P values
        count = (k_max.values *
                 direction).values  # adding directionallity to interaction
        neighbours = pd.DataFrame({
            'count': count,
            'p_val': p_values
        },
                                  index=k_max.index)
        #neighbours.loc[neighbours[neighbours['p_val'] > p_val].index,'count'] = np.NaN
        #del neighbours['p_val']
        neighbours.columns = [
            adata_subset.obs[imageid].unique()[0],
            'pvalue_' + str(adata_subset.obs[imageid].unique()[0])
        ]
        neighbours = neighbours.reset_index()
        #neighbours = neighbours['count'].unstack()

        # return
        return neighbours
Example #35
def get_nearest_nodes(G, X, Y, method=None):
    """
    Return the graph nodes nearest to a list of points. Pass in points
    as separate vectors of X and Y coordinates. The 'kdtree' method
    is by far the fastest with large data sets, but only finds approximate
    nearest nodes if working in unprojected coordinates like lat-lng (it
    precisely finds the nearest node if working in projected coordinates).
    The 'balltree' method is second fastest with large data sets, but it 
    is precise if working in unprojected coordinates like lat-lng.
    
    Parameters
    ----------
    G : networkx multidigraph
    X : list-like
        The vector of longitudes or x's for which we will find the nearest
        node in the graph
    Y : list-like
        The vector of latitudes or y's for which we will find the nearest
        node in the graph
    method : str {None, 'kdtree', 'balltree'}
        Which method to use for finding nearest node to each point.
        If None, we manually find each node one at a time using 
        osmnx.utils.get_nearest_node and haversine. If 'kdtree' we use 
        scipy.spatial.cKDTree for very fast euclidean search. If
        'balltree', we use sklearn.neighbors.BallTree for fast 
        haversine search.
        
    Returns
    -------
    nn : array
        list of nearest node IDs
    """

    start_time = time.time()

    if method is None:

        # calculate nearest node one at a time for each point
        nn = [
            get_nearest_node(G, (y, x), method='haversine')
            for x, y in zip(X, Y)
        ]

    elif method == 'kdtree':

        # check if we were able to import scipy.spatial.cKDTree successfully
        if not cKDTree:
            raise ImportError(
                'The scipy package must be installed to use this optional feature.'
            )

        # build a k-d tree for euclidean nearest node search
        nodes = pd.DataFrame({
            'x': nx.get_node_attributes(G, 'x'),
            'y': nx.get_node_attributes(G, 'y')
        })
        tree = cKDTree(data=nodes[['x', 'y']],
                       compact_nodes=True,
                       balanced_tree=True)

        # query the tree for nearest node to each point
        points = np.array([X, Y]).T
        dist, idx = tree.query(points, k=1)
        nn = nodes.iloc[idx].index

    elif method == 'balltree':

        # check if we were able to import sklearn.neighbors.BallTree successfully
        if not BallTree:
            raise ImportError(
                'The scikit-learn package must be installed to use this optional feature.'
            )

        # haversine requires data in form of [lat, lng] and inputs/outputs in units of radians
        nodes = pd.DataFrame({
            'x': nx.get_node_attributes(G, 'x'),
            'y': nx.get_node_attributes(G, 'y')
        })
        nodes_rad = np.deg2rad(nodes[['y', 'x']].astype(np.float))
        points = np.array([Y.astype(np.float), X.astype(np.float)]).T
        points_rad = np.deg2rad(points)

        # build a ball tree for haversine nearest node search
        tree = BallTree(nodes_rad, metric='haversine')

        # query the tree for nearest node to each point
        idx = tree.query(points_rad, k=1, return_distance=False)
        nn = nodes.iloc[idx[:, 0]].index

    else:
        raise ValueError('You must pass a valid method name, or None.')

    log('Found nearest nodes to {:,} points in {:,.2f} seconds'.format(
        len(X),
        time.time() - start_time))

    return np.array(nn)
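A short sketch of the 'balltree' branch above: haversine expects [lat, lng] in radians, and the returned distance is the great-circle angle in radians (the coordinates below are only illustrative).

import numpy as np
from sklearn.neighbors import BallTree

nodes_deg = np.array([[51.5074, -0.1278],   # [lat, lng]: London
                      [48.8566, 2.3522],    # Paris
                      [52.5200, 13.4050]])  # Berlin
points_deg = np.array([[50.8503, 4.3517]])  # Brussels

tree = BallTree(np.deg2rad(nodes_deg), metric='haversine')
dist_rad, idx = tree.query(np.deg2rad(points_deg), k=1)
print(idx[0, 0], dist_rad[0, 0] * 6371.0)   # nearest node row, distance in km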
Example #36
def knn_neighbors(data, lpts, n_neighbor):
    X_semi = np.concatenate([lpts, data], axis = 0)
    btree = BallTree(X_semi)
    nbr_id = btree.query(X_semi, k=n_neighbor + 1, return_distance=False)
    return nbr_id
Example #37
    d = np.sqrt(d1**2.0 + d2**2.0 - 2 * d1 * d2 * np.cos(phi))
    return d


#coord is list of (RA,DEC,D) coordinates of primaries, 871 points in 3D.
coord = []
for i in range(len(primary_list)):
    coord.append([
        float(primary_list[i][1]),
        float(primary_list[i][2]),
        float(primary_list[i][7])
    ])

#Use Ball Tree with the above Haversine function as metric to find
# nearest neighbors quickly. See sklearn documentation for details.
tree = BallTree(coord, leaf_size=2, metric=D_dist)

ind_list = []  #list of indices of the nearest neighbors
dist_list = []  #list of distances (rad) of nearest neighbors
for i in range(len(coord)):
    dist, ind = tree.query(
        coord[i],
        k=2)  #k=2 since first "nearest neighbor" is sometimes itself.
    #sometimes, D_dist will return NaN when calculating D_dist for same
    # point. In this case, ind[0][0] will be the nearest neighbor index.
    # Otherwise ind[0][0] is i, and so ind[0][1] would be the nearest neighbor
    # index
    if ind[0][0] == i:
        ind_list.append(ind[0][1])
        dist_list.append(dist[0][1])
    else:
Example #38
class Hospitals(Supergroup, MedicalFacilities):
    def __init__(
        self,
        hospitals: List["Hospital"],
        neighbour_hospitals: int = 5,
        box_mode: bool = False,
        ball_tree=True,
    ):
        """
        Create a group of hospitals, and provide functionality to locate patients
        to a nearby hospital. It will check in order the first ```neighbour_hospitals```,
        when one has space available the patient is allocated to it. If none of the closest
        ones has beds available it will pick one of them at random and that hospital will
        overflow

        Parameters
        ----------
        hospitals:
            list of hospitals to group
        neighbour_hospitals:
            number of closest hospitals to look for
        box_mode:
            whether to run in single box mode, or full simulation
        """
        super().__init__(members=hospitals)
        self.box_mode = box_mode
        self.neighbour_hospitals = neighbour_hospitals
        if ball_tree and self.members:
            coordinates = np.array(
                [hospital.coordinates for hospital in hospitals])
            self.init_trees(coordinates)

    @classmethod
    def for_box_mode(cls):
        hospitals = []
        hospitals.append(Hospital(
            coordinates=None,
            n_beds=10,
            n_icu_beds=2,
        ))
        hospitals.append(
            Hospital(
                coordinates=None,
                n_beds=5000,
                n_icu_beds=5000,
            ))
        return cls(hospitals,
                   neighbour_hospitals=None,
                   box_mode=True,
                   ball_tree=False)

    @classmethod
    def from_file(
        cls,
        filename: str = default_data_filename,
        config_filename: str = default_config_filename,
    ) -> "Hospitals":
        """
        Initialize Hospitals from path to data frame, and path to config file.

        Parameters
        ----------
        filename:
            path to hospital dataframe
        config_filename:
            path to hospital config dictionary

        Returns
        -------
        Hospitals instance
        """

        hospital_df = pd.read_csv(filename)
        with open(config_filename) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        neighbour_hospitals = config["neighbour_hospitals"]
        logger.info(f"There are {len(hospital_df)} hospitals in the world.")
        hospitals = cls.init_hospitals(cls, hospital_df)
        return Hospitals(hospitals, neighbour_hospitals)

    @classmethod
    def for_geography(
        cls,
        geography,
        filename: str = default_data_filename,
        config_filename: str = default_config_filename,
    ):
        with open(config_filename) as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        neighbour_hospitals = config["neighbour_hospitals"]
        hospital_df = pd.read_csv(filename, index_col=4)
        area_names = [area.name for area in geography.areas]
        hospital_df = hospital_df.loc[hospital_df.index.isin(area_names)]
        logger.info(
            f"There are {len(hospital_df)} hospitals in this geography.")
        total_hospitals = len(hospital_df)
        hospitals = []
        for area in geography.areas:
            if area.name in hospital_df.index:
                hospitals_in_area = hospital_df.loc[area.name]
                if isinstance(hospitals_in_area, pd.Series):
                    hospital = cls.create_hospital_from_df_row(
                        area,
                        hospitals_in_area,
                    )
                    hospitals.append(hospital)
                else:
                    for _, row in hospitals_in_area.iterrows():
                        hospital = cls.create_hospital_from_df_row(
                            area,
                            row,
                        )
                        hospitals.append(hospital)
                if len(hospitals) == total_hospitals:
                    break
        return cls(hospitals, neighbour_hospitals, False)

    @classmethod
    def create_hospital_from_df_row(
        cls,
        area,
        row,
    ):
        coordinates = row[["latitude", "longitude"]].values.astype(np.float)
        n_beds = row["beds"]
        n_icu_beds = row["icu_beds"]
        trust_code = row["code"]
        hospital = Hospital(
            area=area,
            coordinates=coordinates,
            n_beds=n_beds,
            n_icu_beds=n_icu_beds,
            trust_code=trust_code,
        )
        return hospital

    def init_hospitals(
        self,
        hospital_df: pd.DataFrame,
    ) -> List["Hospital"]:
        """
        Create Hospital objects with the right characteristics,
        as given by dataframe.

        Parameters
        ----------
        hospital_df:
            dataframe with hospital characteristics data
        """
        hospitals = []
        for index, row in hospital_df.iterrows():
            n_beds = row["beds"]
            n_icu_beds = row["icu_beds"]
            trust_code = row["code"]
            coordinates = row[["latitude",
                               "longitude"]].values.astype(np.float)
            hospital = Hospital(
                coordinates=coordinates,
                n_beds=n_beds,
                n_icu_beds=n_icu_beds,
                trust_code=trust_code,
            )
            hospitals.append(hospital)
        return hospitals

    def init_trees(self, hospital_coordinates: np.array) -> BallTree:
        """
        Reads hospital locations and sizes and initializes a ball tree on a sphere,
        to query the closest hospital to a given location.

        Parameters
        ----------
        hospital_df: 
            dataframe with hospital characteristics data

        Returns
        -------
        Tree to query nearby hospitals
        """
        self.hospital_trees = BallTree(
            np.deg2rad(hospital_coordinates),
            metric="haversine",
        )

    def get_closest_hospitals_idx(self, coordinates: Tuple[float, float],
                                  k: int) -> Tuple[float, float]:
        """
        Get the indices of the k closest hospitals to a given coordinate

        Parameters
        ---------
        coordinates:
            latitude and longitude
        k:
            number of neighbours

        Returns
        -------
        Indices of the k closest hospitals

        """
        k = min(k, len(list(self.hospital_trees.data)))
        distances, neighbours = self.hospital_trees.query(
            np.deg2rad(coordinates.reshape(1, -1)),
            k=k,
            sort_results=True,
        )
        return neighbours[0]

    def get_closest_hospitals(self, coordinates: Tuple[float, float],
                              k: int) -> Tuple[float, float]:
        """
        Get the k closest hospitals to a given coordinate

        Parameters
        ---------
        coordinates:
            latitude and longitude
        k:
            number of neighbours

        Returns
        -------
        The k closest Hospital objects

        """
        k = min(k, len(list(self.hospital_trees.data)))
        distances, neighbours = self.hospital_trees.query(
            np.deg2rad(coordinates.reshape(1, -1)),
            k=k,
            sort_results=True,
        )
        return [self.members[index] for index in neighbours[0]]
Example #39
class XGBSEKaplanNeighbors(XGBSEBaseEstimator):
    """
    Convert xgboost into a nearest neighbor model, where we use hamming distance to define
    similar elements as the ones that co-occurred the most at the ensemble terminal nodes.

    Then, at each neighbor-set compute survival estimates with the Kaplan-Meier estimator.

    !!! Note
        * We recommend using dart as the booster to prevent any tree
        to dominate variance in the ensemble and break the leaf co-occurrence similarity logic.

        * This method can be very expensive at scales of hundreds of thousands of samples,
        due to the nearest neighbor search, both on training (construction of search index) and scoring (actual search).

    Read more in [How XGBSE works](https://loft-br.github.io/xgboost-survival-embeddings/how_xgbse_works.html).

    """

    def __init__(self, xgb_params=None, n_neighbors=30, radius=None):
        """
        Args:
            xgb_params (Dict): Parameters for XGBoost model.
                If not passed, the following default parameters will be used:

                ```
                DEFAULT_PARAMS = {
                    "objective": "survival:aft",
                    "eval_metric": "aft-nloglik",
                    "aft_loss_distribution": "normal",
                    "aft_loss_distribution_scale": 1,
                    "tree_method": "hist",
                    "learning_rate": 5e-2,
                    "max_depth": 8,
                    "booster": "dart",
                    "subsample": 0.5,
                    "min_child_weight": 50,
                    "colsample_bynode": 0.5,
                }
                ```

                Check <https://xgboost.readthedocs.io/en/latest/parameter.html> for more options.

            n_neighbors (Int): Number of neighbors for computing KM estimates

            radius (Float): If set, uses a radius around the point for neighbors search
        """
        if xgb_params is None:
            xgb_params = DEFAULT_PARAMS

        self.xgb_params = xgb_params
        self.n_neighbors = n_neighbors
        self.radius = radius
        self.persist_train = False
        self.index_id = None
        self.radius = None
        self.feature_importances_ = None

    def fit(
        self,
        X,
        y,
        num_boost_round=1000,
        validation_data=None,
        early_stopping_rounds=None,
        verbose_eval=0,
        persist_train=True,
        index_id=None,
        time_bins=None,
    ):
        """
        Transform feature space by fitting a XGBoost model and outputting its leaf indices.
        Build search index in the new space to allow nearest neighbor queries at scoring time.

        Args:
            X ([pd.DataFrame, np.array]): Design matrix to fit XGBoost model

            y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
                and time of event or time of censoring as second field.

            num_boost_round (Int): Number of boosting iterations.

            validation_data (Tuple): Validation data as a tuple (X_val, y_val),
                if the user desires to use early stopping

            early_stopping_rounds (Int): Activates early stopping.
                Validation metric needs to improve at least once
                in every **early_stopping_rounds** round(s) to continue training.
                See xgboost.train documentation.

            verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.

            persist_train (Bool): Whether or not to persist training data to use explainability
                through prototypes

            index_id (pd.Index): User defined index if intended to use explainability
                through prototypes

            time_bins (np.array): Specified time windows to use when making survival predictions

        Returns:
            XGBSEKaplanNeighbors: Fitted instance of XGBSEKaplanNeighbors
        """

        self.E_train, self.T_train = convert_y(y)
        if time_bins is None:
            time_bins = get_time_bins(self.T_train, self.E_train)
        self.time_bins = time_bins

        # converting data to xgb format
        dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])

        # converting validation data to xgb format
        evals = ()
        if validation_data:
            X_val, y_val = validation_data
            dvalid = convert_data_to_xgb_format(
                X_val, y_val, self.xgb_params["objective"]
            )
            evals = [(dvalid, "validation")]

        # training XGB
        self.bst = xgb.train(
            self.xgb_params,
            dtrain,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            evals=evals,
            verbose_eval=verbose_eval,
        )
        self.feature_importances_ = self.bst.get_score()

        # creating nearest neighbor index
        leaves = self.bst.predict(dtrain, pred_leaf=True)

        self.tree = BallTree(leaves, metric="hamming", leaf_size=40)

        if persist_train:
            self.persist_train = True
            if index_id is None:
                index_id = X.index.copy()
        self.index_id = index_id

        return self

    def predict(
        self,
        X,
        time_bins=None,
        return_ci=False,
        ci_width=0.683,
        return_interval_probs=False,
    ):
        """
        Make queries to the nearest neighbor search index built on the transformed XGBoost space.
        Compute a Kaplan-Meier estimator for each neighbor-set. Predict the KM estimators.

        Args:
            X (pd.DataFrame): Dataframe with samples to generate predictions

            time_bins (np.array): Specified time windows to use when making survival predictions

            return_ci (Bool): Whether to return confidence intervals via the Exponential Greenwood formula

            ci_width (Float): Width of confidence interval

            return_interval_probs (Bool): Boolean indicating if interval probabilities are
                supposed to be returned. If False the cumulative survival is returned.


        Returns:
            (pd.DataFrame): A dataframe of survival probabilities
            for all times (columns), from a time_bins array, for all samples of X
            (rows). If return_interval_probs is True, the interval probabilities are returned
            instead of the cumulative survival probabilities.

            upper_ci (np.array): Upper confidence interval for the survival
            probability values

            lower_ci (np.array): Lower confidence interval for the survival
            probability values
        """

        # converting to xgb format
        d_matrix = xgb.DMatrix(X)

        # getting leaves and extracting neighbors
        leaves = self.bst.predict(d_matrix, pred_leaf=True)

        if self.radius:
            assert self.radius > 0, "Radius must be positive"

            neighs, _ = self.tree.query_radius(
                leaves, r=self.radius, return_distance=True
            )

            number_of_neighbors = np.array([len(neigh) for neigh in neighs])

            if np.argwhere(number_of_neighbors == 1).shape[0] > 0:
                # If at least one sample has no neighbors apart from itself,
                # a warning is raised suggesting a radius increase
                warnings.warn(
                    "Warning: Some samples don't have neighbors apart from themselves. Increase the radius",
                    RuntimeWarning,
                )
        else:
            _, neighs = self.tree.query(leaves, k=self.n_neighbors)

        # gathering times and events/censors for neighbor sets
        T_neighs = self.T_train[neighs]
        E_neighs = self.E_train[neighs]

        # vectorized (very fast!) implementation of Kaplan Meier curves
        if time_bins is None:
            time_bins = self.time_bins

        # calculating z-score from width
        z = st.norm.ppf(0.5 + ci_width / 2)

        preds_df, upper_ci, lower_ci = calculate_kaplan_vectorized(
            T_neighs, E_neighs, time_bins, z
        )

        if return_ci and return_interval_probs:
            raise ValueError(
                "Confidence intervals for interval probabilities is not supported. Choose between return_ci and return_interval_probs."
            )

        if return_interval_probs:
            preds_df = calculate_interval_failures(preds_df)
            return preds_df

        if return_ci:
            return preds_df, upper_ci, lower_ci

        return preds_df
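The core idea of the class above (using the vector of terminal-leaf indices as an embedding and hamming distance as a leaf co-occurrence similarity) can be reproduced with plain xgboost and scikit-learn. A minimal sketch on toy regression data, not the XGBSE API itself:

import numpy as np
import xgboost as xgb
from sklearn.neighbors import BallTree

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = X[:, 0] + rng.normal(scale=0.1, size=200)

dtrain = xgb.DMatrix(X, label=y)
bst = xgb.train(
    {"max_depth": 3, "objective": "reg:squarederror"}, dtrain, num_boost_round=50
)

# one terminal-leaf index per sample per boosting round
leaves = bst.predict(dtrain, pred_leaf=True)

# hamming distance = fraction of trees where two samples end up in different leaves
tree = BallTree(leaves, metric="hamming", leaf_size=40)
_, neighbours = tree.query(leaves[:5], k=10)
print(neighbours.shape)  # (5, 10): indices of the most similar training samples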
Пример #40
0
        dist[:, bi] = 1 - np.abs(np.dot(a, b[bi]))

    return dist


@jit(nopython=True)
def quatMetricNumba2(a, b):
    """ from DOI 10.1007/s10851-009-0161-2, #4 """

    return 1 - np.abs(np.dot(a, b))


""" distance =~ ( 1 - cos(θ) ) / 2 """

tree = BallTree(qgrid, metric=quatMetricNumba2)
theta = np.deg2rad(7.5)
rad = (1 - np.cos(theta)) / 2
# rad = np.max(tree.query(qgrid,k=4)[0])
# print(rad)
""" start loop """

fibre_e = {}
fibre_q = {}

nn_gridPts = {}
nn_gridDist = {}

for fi, fam in enumerate(pf.symHKL):

    fibre_e[fi] = {}
Пример #41
0
def indexBallTree(X, leafSize):
    tree = BallTree(X, leaf_size=leafSize)
    return tree
Пример #42
0
    #print (np.mean(tt))


    #np.mean(similarities2)
    #print (synsets)

from nltk.metrics import *
#print(accuracy( similarities, similarities2))
#print (precision( similarities, similarities2))
#print(recall(similarities, similarities2))
#print()


from sklearn.neighbors import BallTree as BallTree
BT = BallTree(similarities, leaf_size=5, p=2)
dx, idx = BT.query(similarities[500, :].reshape(1, -1), k=3)
print(dx, idx)



tfidf_vectorizer = TfidfVectorizer(stop_words=stopWords)
# Apply the vectoriser to the training set
Cardinality=0
for files in document:
    if files.endswith('.txt'):
        Cardinality+=1
counts = CountVectorizer(input='filename')  # 'document' holds .txt filenames (see the check above)
dtm = counts.fit_transform(document)  # a sparse matrix
vocab = counts.get_feature_names()  # a list
#type(dtm)
	def train(self, df_gt):
		mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
		mean_imputer.fit(df_gt.as_matrix())
		self.imputer = BallTree(mean_imputer.transform(df_gt.as_matrix()))
Пример #44
0
def build_tree(points):
    if points.shape[1] >= 20:
        # for large dimensions, use BallTree
        return BallTree(points, metric='chebyshev')
    return KDTree(points, metric='chebyshev')
Пример #45
0
def get_nearest_edges(G, X, Y, method=None, dist=0.0001):
    """
    Return the graph edges nearest to a list of points. Pass in points
    as separate vectors of X and Y coordinates. The 'kdtree' method
    is by far the fastest with large data sets, but only finds approximate
    nearest edges if working in unprojected coordinates like lat-lng (it
    precisely finds the nearest edge if working in projected coordinates).
    The 'balltree' method is second fastest with large data sets, but it
    is precise if working in unprojected coordinates like lat-lng.

    Parameters
    ----------
    G : networkx multidigraph
    X : list-like
        The vector of longitudes or x's for which we will find the nearest
        edge in the graph. For projected graphs, use the projected coordinates,
        usually in meters.
    Y : list-like
        The vector of latitudes or y's for which we will find the nearest
        edge in the graph. For projected graphs, use the projected coordinates,
        usually in meters.
    method : str {None, 'kdtree', 'balltree'}
        Which method to use for finding nearest edge to each point.
        If None, we manually find each edge one at a time using
        osmnx.utils.get_nearest_edge. If 'kdtree' we use
        scipy.spatial.cKDTree for very fast euclidean search. Recommended for
        projected graphs. If 'balltree', we use sklearn.neighbors.BallTree for
        fast haversine search. Recommended for unprojected graphs.

    dist : float
        spacing length along edges. Units are the same as the geom; Degrees for
        unprojected geometries and meters for projected geometries. The smaller
        the value, the more points are created.

    Returns
    -------
    ne : ndarray
        array of nearest edges represented by their startpoint and endpoint ids,
        u and v, the OSM ids of the nodes.

    Info
    ----
    The method creates equally distanced points along the edges of the network.
    Then, these points are used in a kdTree or BallTree search to identify which
    is nearest. Note that this method will not give the exact perpendicular point
    along the edge, but the smaller the *dist* parameter, the closer the solution
    will be.

    Code is adapted from an answer by JHuw from this original question:
    https://gis.stackexchange.com/questions/222315/geopandas-find-nearest-point-in-other-dataframe
    """
    start_time = time.time()

    if method is None:
        # calculate nearest edge one at a time for each point
        ne = [get_nearest_edge(G, (x, y)) for x, y in zip(X, Y)]
        ne = [(u, v) for _, u, v in ne]

    elif method == 'kdtree':

        # check if we were able to import scipy.spatial.cKDTree successfully
        if not cKDTree:
            raise ImportError(
                'The scipy package must be installed to use this optional feature.'
            )

        # transform graph into DataFrame
        edges = graph_to_gdfs(G, nodes=False, fill_edge_geometry=True)

        # transform edges into evenly spaced points
        edges['points'] = edges.apply(
            lambda x: redistribute_vertices(x.geometry, dist), axis=1)

        # develop edges data for each created points
        extended = edges['points'].apply([pd.Series]).stack().reset_index(
            level=1, drop=True).join(edges).reset_index()

        # Prepare btree arrays
        nbdata = np.array(
            list(
                zip(extended['Series'].apply(lambda x: x.x),
                    extended['Series'].apply(lambda x: x.y))))

        # build a k-d tree for euclidean nearest node search
        btree = cKDTree(data=nbdata, compact_nodes=True, balanced_tree=True)

        # query the tree for nearest node to each point
        points = np.array([X, Y]).T
        dist, idx = btree.query(points, k=1)  # Returns ids of closest point
        eidx = extended.loc[idx, 'index']
        ne = edges.loc[eidx, ['u', 'v']]

    elif method == 'balltree':

        # check if we were able to import sklearn.neighbors.BallTree successfully
        if not BallTree:
            raise ImportError(
                'The scikit-learn package must be installed to use this optional feature.'
            )

        # transform graph into DataFrame
        edges = graph_to_gdfs(G, nodes=False, fill_edge_geometry=True)

        # transform edges into evenly spaced points
        edges['points'] = edges.apply(
            lambda x: redistribute_vertices(x.geometry, dist), axis=1)

        # develop edges data for each created points
        extended = edges['points'].apply([pd.Series]).stack().reset_index(
            level=1, drop=True).join(edges).reset_index()

        # haversine requires data in form of [lat, lng] and inputs/outputs in units of radians
        nodes = pd.DataFrame({
            'x': extended['Series'].apply(lambda x: x.x),
            'y': extended['Series'].apply(lambda x: x.y)
        })
        nodes_rad = np.deg2rad(nodes[['y', 'x']].values.astype(float))
        points = np.array([Y, X]).T
        points_rad = np.deg2rad(points)

        # build a ball tree for haversine nearest node search
        tree = BallTree(nodes_rad, metric='haversine')

        # query the tree for nearest node to each point
        idx = tree.query(points_rad, k=1, return_distance=False)
        eidx = extended.loc[idx[:, 0], 'index']
        ne = edges.loc[eidx, ['u', 'v']]

    else:
        raise ValueError('You must pass a valid method name, or None.')

    log('Found nearest edges to {:,} points in {:,.2f} seconds'.format(
        len(X),
        time.time() - start_time))

    return np.array(ne)
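A possible call pattern for the function above, assuming osmnx is installed and the graph is left unprojected (lat-lng), in which case 'balltree' is the precise choice; exact osmnx graph-building calls may vary between versions:

import numpy as np
import osmnx as ox

# hypothetical example: a small drive network and two query points
G = ox.graph_from_place("Piedmont, California, USA", network_type="drive")
X = np.array([-122.243, -122.245])  # longitudes
Y = np.array([37.824, 37.821])      # latitudes

ne = get_nearest_edges(G, X, Y, method="balltree", dist=0.0001)
print(ne)  # array of (u, v) node id pairs, one nearest edge per query point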
Пример #46
0
def nearest_correspondance(pts_src, pts_dest, data_src):
    tree = BallTree(pts_src, leaf_size=2)
    _, indices = tree.query(pts_dest, k=1)
    indices = indices.ravel()
    data_dest = data_src[indices]
    return data_dest
Пример #47
0
def fit_point_cloud(src_pts,
                    tgt_pts,
                    rotate=True,
                    translate=True,
                    scale=0,
                    x0=None,
                    leastsq_args={},
                    out='params'):
    """Find a transform between unmatched sets of points.

    This minimizes the squared distance from each source point to its closest
    target point, using :func:`scipy.optimize.leastsq` to find a
    transformation using rotation, translation, and scaling (in that order).

    Parameters
    ----------
    src_pts : array, shape = (n, 3)
        Points to which the transform should be applied.
    tgt_pts : array, shape = (m, 3)
        Points to which src_pts should be fitted. Each point in tgt_pts should
        correspond to the point in src_pts with the same index.
    rotate : bool
        Allow rotation of the ``src_pts``.
    translate : bool
        Allow translation of the ``src_pts``.
    scale : 0 | 1 | 3
        Number of scaling parameters. With 0, points are not scaled. With 1,
        points are scaled by the same factor along all axes. With 3, points are
        scaled by a separate factor along each axis.
    x0 : None | tuple
        Initial values for the fit parameters.
    leastsq_args : dict
        Additional parameters to submit to :func:`scipy.optimize.leastsq`.
    out : 'params' | 'trans'
        In what format to return the estimate: 'params' returns a tuple with
        the fit parameters; 'trans' returns a transformation matrix of shape
        (4, 4).

    Returns
    -------
    x : array, shape = (n_params, )
        Estimated parameters for the transformation.

    Notes
    -----
    Assumes that the target points form a dense enough point cloud so that
    the distance of each src_pt to the closest tgt_pt can be used as an
    estimate of the distance of src_pt to tgt_pts.
    """
    from scipy.optimize import leastsq
    kwargs = {'epsfcn': 0.01}
    kwargs.update(leastsq_args)

    # assert correct argument types
    src_pts = np.atleast_2d(src_pts)
    tgt_pts = np.atleast_2d(tgt_pts)
    translate = bool(translate)
    rotate = bool(rotate)
    scale = int(scale)

    if translate:
        src_pts = np.hstack((src_pts, np.ones((len(src_pts), 1))))

    try:
        from sklearn.neighbors import BallTree
        tgt_pts = BallTree(tgt_pts)
        errfunc = _point_cloud_error_balltree
    except ImportError:
        warn("Sklearn could not be imported. Fitting points will be slower. "
             "To improve performance, install the sklearn module.")
        errfunc = _point_cloud_error

    # for efficiency, define parameter specific error function
    param_info = (rotate, translate, scale)
    if param_info == (True, False, 0):
        x0 = x0 or (0, 0, 0)

        def error(x):
            rx, ry, rz = x
            trans = rotation3d(rx, ry, rz)
            est = dot(src_pts, trans.T)
            err = errfunc(est, tgt_pts)
            return err
    elif param_info == (True, False, 1):
        x0 = x0 or (0, 0, 0, 1)

        def error(x):
            rx, ry, rz, s = x
            trans = rotation3d(rx, ry, rz) * s
            est = dot(src_pts, trans.T)
            err = errfunc(est, tgt_pts)
            return err
    elif param_info == (True, False, 3):
        x0 = x0 or (0, 0, 0, 1, 1, 1)

        def error(x):
            rx, ry, rz, sx, sy, sz = x
            trans = rotation3d(rx, ry, rz) * [sx, sy, sz]
            est = dot(src_pts, trans.T)
            err = errfunc(est, tgt_pts)
            return err
    elif param_info == (True, True, 0):
        x0 = x0 or (0, 0, 0, 0, 0, 0)

        def error(x):
            rx, ry, rz, tx, ty, tz = x
            trans = dot(translation(tx, ty, tz), rotation(rx, ry, rz))
            est = dot(src_pts, trans.T)
            err = errfunc(est[:, :3], tgt_pts)
            return err
    else:
        raise NotImplementedError(
            "The specified parameter combination is not implemented: "
            "rotate=%r, translate=%r, scale=%r" % param_info)

    est, _, info, msg, _ = leastsq(error, x0, full_output=True, **kwargs)
    logger.debug("fit_point_cloud leastsq (%i calls) info: %s", info['nfev'],
                 msg)

    if out == 'params':
        return est
    elif out == 'trans':
        return _trans_from_params(param_info, est)
    else:
        raise ValueError("Invalid out parameter: %r. Needs to be 'params' or "
                         "'trans'." % out)
class SNNAP:
    def __init__(self,
                 clip_runtime=True,
                 feature_selection='chi-squared',
                 top_n=3,
                 k_neighbours=60):
        self._name = 'snnap'
        self._clip_runtime = clip_runtime
        self._feature_selection = feature_selection
        self._top_n = top_n
        self._k_neighbours = k_neighbours
        self._imputer = SimpleImputer()
        self._scaler = MaxAbsScaler()
        self._runtime_scaler = StandardScaler()
        self._models = []
        self._rfr_params = {
            'n_estimators': 100,
            'criterion': 'mse',
            'max_depth': None,
            'min_samples_split': 2
        }

    def get_name(self):
        return self._name

    def fit(self, scenario: ASlibScenario, fold: int, num_instances: int):
        self._num_algorithms = len(scenario.algorithms)
        self._top_n = min(self._num_algorithms, self._top_n)

        # resample `amount_of_training_instances` instances and preprocess them accordingly
        features, performances = self._resample_instances(
            scenario.feature_data.values,
            scenario.performance_data.values,
            num_instances,
            random_state=fold)
        # TODO: apply feature filtering such as chi-squared based selection technique
        features, performances = self._preprocess_scenario(
            scenario, features, performances)

        # train runtime prediction model for each model
        self._models = [
            RandomForestRegressor(random_state=fold, **self._rfr_params)
            for alg in range(self._num_algorithms)
        ]
        for num, model in enumerate(self._models):
            model.fit(features, performances[:, num])

        # build index to retrieve k nearest neighbours based on Jaccard distance of best n solvers
        self._index = BallTree(performances,
                               leaf_size=30,
                               metric='pyfunc',
                               func=SNNAP._top_n_jaccard,
                               metric_params={'top_n': self._top_n})
        self._performances = np.copy(performances)

    def predict(self, features, instance_id: int):
        assert (features.ndim == 1), '`features` must be one dimensional'
        features = np.expand_dims(features, axis=0)
        features = self._imputer.transform(features)
        features = self._scaler.transform(features)

        # predict runtimes and get k nearest neighbours based on Jaccard distance of best n solvers
        predicted = np.asarray([
            model.predict(features) for model in self._models
        ]).reshape(1, -1)
        neighbour_idx = np.squeeze(
            self._index.query(predicted,
                              self._k_neighbours,
                              return_distance=False))

        # find best solver on the instance's k nearest neighbours (best avg. runtime / PAR10 score)
        sub_performances = self._performances[neighbour_idx, :]

        # the summed performance induces a valid ranking
        return np.sum(sub_performances, axis=0)

    def _resample_instances(self, feature_data, performance_data,
                            num_instances, random_state):
        num_instances = min(num_instances, np.size(
            performance_data, axis=0)) if num_instances > 0 else np.size(
                performance_data, axis=0)
        return resample(feature_data,
                        performance_data,
                        n_samples=num_instances,
                        random_state=random_state)

    def _preprocess_scenario(self, scenario: ASlibScenario, features,
                             performances):
        # TODO: paper does not explicitly mention feature imputation & feature scaling
        features = self._imputer.fit_transform(features)
        features = self._scaler.fit_transform(features)

        # train predictors and select algorithms on running time instead of PAR10 if warranted
        if self._clip_runtime:
            performances = np.clip(performances,
                                   a_min=-np.inf,
                                   a_max=scenario.algorithm_cutoff_time)

        # scale performances to zero mean and unitary standard deviation
        performances = self._runtime_scaler.fit_transform(performances)

        return features, performances

    @staticmethod
    def _top_n_jaccard(x, y, **kwargs):
        top_n = kwargs['metric_params']['top_n']
        top_n_1 = set(np.argpartition(x, top_n)[:top_n])
        top_n_2 = set(np.argpartition(y, top_n)[:top_n])

        return len(top_n_1.intersection(top_n_2)) / float(
            len(top_n_1.union(top_n_2)))
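To make the neighbour metric above concrete, here is a standalone toy computation mirroring `_top_n_jaccard`: the similarity of two instances is the Jaccard overlap of their best-n solver sets (smallest runtimes); names and data are illustrative only:

import numpy as np

def top_n_jaccard(x, y, top_n=3):
    # indices of the top_n smallest runtimes, i.e. the best solvers on each instance
    best_x = set(np.argpartition(x, top_n)[:top_n])
    best_y = set(np.argpartition(y, top_n)[:top_n])
    return len(best_x & best_y) / float(len(best_x | best_y))

runtimes_a = np.array([1.2, 50.0, 0.8, 300.0, 7.5])
runtimes_b = np.array([0.9, 45.0, 1.1, 10.0, 250.0])
print(top_n_jaccard(runtimes_a, runtimes_b))  # overlap of the best-3 solver sets (0.5 here)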
Пример #49
0
class BallsBound(Bound):
    def __init__(self, points, scale=1.0):
        """Simple mask based on coverage balls around inducing points.

        Args:
            points (Array): shape (num_points, n_dim)
            scale (float): Scale ball size (default 1.0)
        """
        assert len(points.shape) == 2
        self.X = points
        self._udim = self.X.shape[-1]
        self.bt = BallTree(self.X, leaf_size=2)
        self.epsilon = self._set_epsilon(self.X, self.bt, scale)
        self._volume = self._get_volume(self.X, self.epsilon, self.bt)

    @property
    def volume(self):
        return self._volume

    @property
    def udim(self):
        return self._udim

    @staticmethod
    def _set_epsilon(X, bt, scale):
        dims = X.shape[-1]
        k = [4, 5, 6]
        dist, ind = bt.query(X, k=k[dims - 1])  # distance to the k-th NN (k depends on dimensionality)
        epsilon = np.median(dist[:, -1]) * scale * 1.5
        return epsilon

    @staticmethod
    def _get_volume(X, epsilon, bt):
        N = 100
        vol_est = []
        d = X.shape[-1]
        area = {1: 2 * epsilon, 2: np.pi * epsilon**2}[d]
        for i in range(N):
            n = np.random.randn(*X.shape)
            norm = (n**2).sum(axis=1)**0.5
            n = n / norm.reshape(-1, 1)
            r = np.random.rand(len(X))**(1 / d) * epsilon
            Y = X + n * r.reshape(-1, 1)
            in_bounds = ((Y >= 0.0) & (Y <= 1.0)).prod(axis=1, dtype="bool")
            Y = Y[in_bounds]
            counts = bt.query_radius(Y, epsilon, count_only=True)
            vol_est.append(area * sum(1.0 / counts))
        vol_est = np.array(vol_est)
        out, err = vol_est.mean(), vol_est.std() / np.sqrt(N)
        rel = err / out
        if rel > 0.01:
            log.debug("WARNING: Rel volume uncertainty is %.4g" % rel)
        return out

    def sample(self, N):
        counter = 0
        samples = []
        d = self.X.shape[-1]
        while counter < N:
            n = np.random.randn(*self.X.shape)
            norm = (n**2).sum(axis=1)**0.5
            n = n / norm.reshape(-1, 1)
            r = np.random.rand(len(self.X))**(1 / d) * self.epsilon
            Y = self.X + n * r.reshape(-1, 1)
            in_bounds = ((Y >= 0.0) & (Y <= 1.0)).prod(axis=1, dtype="bool")
            Y = Y[in_bounds]
            counts = self.bt.query_radius(Y, r=self.epsilon, count_only=True)
            p = 1.0 / counts
            w = np.random.rand(len(p))
            Y = Y[p >= w]
            samples.append(Y)
            counter += len(Y)
        samples = np.vstack(samples)
        ind = np.random.choice(range(len(samples)), size=N, replace=False)
        return samples[ind]

    def __call__(self, u):
        u = u.reshape(len(u), -1)
        dist, ind = self.bt.query(u, k=1)
        return (dist < self.epsilon)[:, 0]

    @classmethod
    def from_IsolatedRatio(cls, ratio, obs, bound, n=10000, th=-13):
        """Generate new BallsBound object based on IsolatedRatio.

        Args:
            ratio (IsolatedRatio): Single ratio.
            obs (dict): Reference observation.
            bound (Bound): Bound of RatioEstimator.
            th (float): Threshold value, default -13
            n (int): Number of random samples from bound to determine parameter boundaries.

        Note: All components of the RatioEstimator will be used.  Avoid overlapping ratios.
        """
        u = bound.sample(n)
        r = ratio(u)
        mask = r.max() - r < -th
        ind_points = u[mask]
        return cls(ind_points)

    def state_dict(self):
        return dict(tag="BallsBound",
                    points=self.X,
                    epsilon=self.epsilon,
                    volume=self._volume)

    @classmethod
    def from_state_dict(cls, state_dict):
        obj = cls.__new__(cls)
        obj.X = state_dict["points"]
        assert len(obj.X.shape) == 2
        obj._udim = obj.X.shape[-1]
        obj.epsilon = state_dict["epsilon"]
        obj._volume = state_dict["volume"]
        obj.bt = BallTree(obj.X, leaf_size=2)
        return obj
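A minimal sketch of how the bound above might be used, with hypothetical 2-D inducing points and assuming the class's module (its Bound base class and logger) is importable alongside it:

import numpy as np

rng = np.random.default_rng(0)
points = rng.uniform(0.2, 0.8, size=(500, 2))  # inducing points inside the unit square

bound = BallsBound(points, scale=1.0)
print(bound.volume, bound.udim)   # Monte Carlo volume of the union of balls, dimensionality

# membership test: which query points fall inside some coverage ball?
u = rng.uniform(0.0, 1.0, size=(10, 2))
print(bound(u))                   # boolean mask of length 10

# draw new points uniformly from the union of balls
samples = bound.sample(100)
print(samples.shape)              # (100, 2)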
Пример #50
0
 def __init__(self, features, labels, k=5):
     self.features = features
     self._kdtree = BallTree(features)
     self._y = labels
     self._k = k
Пример #51
0
class Knearest:
    """
    kNN classifier
    """
    def __init__(self, x, y, k=5):
        """
        Creates a kNN instance

        :param x: Training data input
        :param y: Training data output
        :param k: The number of nearest points to consider in classification
        """

        # You can modify the constructor, but you shouldn't need to.
        # Do not use another datastructure from anywhere else to
        # complete the assignment.

        self._kdtree = BallTree(x)
        self._y = y
        self._k = k

    def majority(self, item_indices):
        """
        Given the indices of training examples, return the majority label.  If
        there's a tie, return the median value (as implemented in numpy).

        :param item_indices: The indices of the k nearest neighbors
        """
        assert len(item_indices) == self._k, "Did not get k inputs"

        # Finish this function to return the most common y value for
        # these indices
        #
        # http://docs.scipy.org/doc/numpy/reference/generated/numpy.median.html

        # If only one neighbor, the majority is definitely its label
        if len(item_indices) == 1:
            return self._y[item_indices[0]]

        labels = [self._y[x] for x in item_indices]

        u = numpy.unique(
            labels, return_counts=True
        )  # tuple, first element is the labels, second element is the count

        sorted_counts = numpy.argsort(
            u[1]
        )  # last element (i.e., sorted_counts[-1]) is the index of highest, etc

        majority_labels = []

        for i in range(len(sorted_counts)):
            if u[1][sorted_counts[i]] == u[1][sorted_counts[-1]]:
                majority_labels.append(u[0][sorted_counts[i]])

        return numpy.median(majority_labels)

    def classify(self, example):
        """
        Given an example, classify the example.

        :param example: A representation of an example in the same
        format as training data
        """

        # Finish this function to find the k closest points, query the
        # majority function, and return the value.

        dist, ind = self._kdtree.query([example], self._k)

        return self.majority(ind[0])

        # return self.majority(list(random.randrange(len(self._y)) \
        #                           for x in range(self._k)))

    def confusion_matrix(self, test_x, test_y):
        """
        Given a matrix of test examples and labels, compute the confusion
        matrix for the current classifier.  Should return a dictionary of
        dictionaries where d[ii][jj] is the number of times an example
        with true label ii was labeled as jj.

        :param test_x: Test data representation
        :param test_y: Test data answers
        """

        # Finish this function to build a dictionary with the
        # mislabeled examples.  You'll need to call the classify
        # function for each example.

        d = defaultdict(dict)
        data_index = 0
        for xx, yy in zip(test_x, test_y):
            data_index += 1
            our_label = self.classify(xx)
            d[yy][our_label] = d.get(yy, {}).get(our_label, 0) + 1
            # if data_index % 100 == 0:
            #     print("%i/%i for confusion matrix" % (data_index, len(test_x)))
        return d

    @staticmethod
    def accuracy(confusion_matrix):
        """
        Given a confusion matrix, compute the accuracy of the underlying classifier.
        """

        # You do not need to modify this function

        total = 0
        correct = 0
        for ii in confusion_matrix:
            total += sum(confusion_matrix[ii].values())
            correct += confusion_matrix[ii].get(ii, 0)

        if total:
            return float(correct) / float(total)
        else:
            return 0.0
Пример #52
0
def two_point_angular_window(coords_D,
                             coords_R,
                             bins,
                             D_idx=None,
                             R_idx=None,
                             random_state=None):
    """Two-point correlation function

    Parameters
    ----------
    coords_D: data ra, dec in deg, shape = [2, n_samples]
    coords_R: random ra, dec in deg, shape = [2, n_samples]
    D_idx: idx of data in field, None if all in field
    R_idx: idx of random in field, None if all in field
    random_state : integer, np.random.RandomState, or None
        specify the random state to use for generating background

    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins
    corr_in : ndarray
        the same estimate restricted to the in-field sub-sample (D_idx / R_idx)
    """

    rng = check_random_state(random_state)

    coords_D, coords_R = np.asanyarray(coords_D), np.asanyarray(coords_R)

    if D_idx is None:
        D_idx = np.arange(coords_D.shape[1], dtype=int)

    if R_idx is None:
        R_idx = np.arange(coords_R.shape[1], dtype=int)

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    data = np.asarray(ra_dec_to_xyz(coords_D[0], coords_D[1]), order='F').T
    data_R = np.asarray(ra_dec_to_xyz(coords_R[0], coords_R[1]), order='F').T

    bins = angular_dist_to_euclidean_dist(bins)
    Nbins = len(bins) - 1

    factor = len(data_R) * 1. / len(data)
    factor_in = len(R_idx) * 1. / len(D_idx)

    BT_D = BallTree(data)
    BT_R = BallTree(data_R)

    counts_DD = np.zeros(Nbins + 1, dtype=int)
    counts_RR = np.zeros(Nbins + 1, dtype=int)
    counts_DD_in = np.zeros(Nbins + 1, dtype=int)
    counts_RR_in = np.zeros(Nbins + 1, dtype=int)
    for i in range(Nbins + 1):
        count_listD = BT_D.query_radius(data, bins[i])
        count_listR = BT_R.query_radius(data_R, bins[i])
        countD = np.sum([len(count) for count in count_listD])
        countR = np.sum([len(count) for count in count_listR])
        countD_in = np.sum([len(count) for count in count_listD[D_idx]])
        countR_in = np.sum([len(count) for count in count_listR[R_idx]])
        counts_DD[i], counts_RR[i] = countD, countR
        counts_DD_in[i], counts_RR_in[i] = countD_in, countR_in

    DD = np.diff(counts_DD)
    RR = np.diff(counts_RR)
    DD_in = np.diff(counts_DD_in)
    RR_in = np.diff(counts_RR_in)

    # check for zero in the denominator
    RR_zero = np.where(RR == 0)[0]
    RR_in_zero = np.where(RR_in == 0)[0]
    RR[RR_zero] = 1
    RR_in[RR_in_zero] = 1
    corr = factor**2 * DD / RR - 1
    corr_in = factor_in**2 * DD_in / RR_in - 1

    corr[RR_zero] = np.nan
    corr_in[RR_in_zero] = np.nan

    return corr, corr_in
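A possible call, assuming the helpers used above (ra_dec_to_xyz, angular_dist_to_euclidean_dist, check_random_state, e.g. from astroML and scikit-learn) are in scope; the catalogues and bins below are purely illustrative:

import numpy as np

rng = np.random.default_rng(42)

# hypothetical catalogues: RA/Dec in degrees, shape (2, n_samples)
coords_D = np.vstack([rng.uniform(0, 10, 2000), rng.uniform(0, 10, 2000)])
coords_R = np.vstack([rng.uniform(0, 10, 8000), rng.uniform(0, 10, 8000)])

bins = np.logspace(-1.5, 0.5, 10)  # angular separation bins in degrees

corr, corr_in = two_point_angular_window(coords_D, coords_R, bins, random_state=0)
print(corr)     # correlation estimate per bin, full sample
print(corr_in)  # same estimate restricted to the D_idx / R_idx sub-sample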
Пример #53
0
def in_hull(p, hull):
    """
    Test if points in `p` are in `hull`

    `p` should be a `NxK` coordinates of `N` points in `K` dimensions
    `hull` is either a scipy.spatial.Delaunay object or the `MxK` array of the 
    coordinates of `M` points in `K`dimensions for which Delaunay triangulation
    will be computed
    """
    if not isinstance(hull,Delaunay):
        hull = Delaunay(hull)

    return hull.find_simplex(p)>=0

xyz = np.vstack([eastings, northings, elevations]).T
neighbour_tree = BallTree(xyz)

ref_elevation = -6380
east_grid = np.arange(min(eastings),max(eastings),grid_spacing)
north_grid = np.arange(min(northings), max(northings), grid_spacing)
elevation_grid = np.arange(min(elevations), max(elevations), grid_spacing)

i_slice = np.where(elevation_grid<ref_elevation)[0][-1]


grid_points = []
for i_east,e_grid in enumerate(east_grid):
    for i_north, n_grid in enumerate(north_grid):
        grid_points.append([e_grid, n_grid, ref_elevation])

ne_grid, nn_grid, nz_grid = len(east_grid), len(north_grid), len(elevation_grid)
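A quick standalone check of the in_hull helper defined at the top of this example, using a toy square (the grid code above is independent of this check):

import numpy as np
from scipy.spatial import Delaunay

square = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
queries = np.array([[0.5, 0.5], [2.0, 2.0]])
print(in_hull(queries, square))  # [ True False]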
Пример #54
0

from data import main
from termination_criterion import cluster_evaluation

from sklearn.neighbors import BallTree
from identify_centroid import centroid, determine_radius
from tqdm import tqdm
import  numpy as np


all_latent, low_d, labels = main()
# centroids = [np.mean(low_d[labels == l], axis=0) for l in range(2)]

dists = [low_d[labels == l] for l in np.unique(labels)]
dense_centroids = np.array([centroid(d, BallTree(d))[2] for d in dists])

print(f'Two? {cluster_evaluation(low_d, labels, dense_centroids)}')

# ones = low_d[labels == 0]
# tree = BallTree(ones)
# one_points, radius, proposal = centroid(ones, tree)


# def approx_equal(one: np.ndarray, two: np.ndarray) -> bool:
#     '''Are the two arrays approximately equal?'''
#     return (one - two < 1).all()


# from matplotlib import pyplot as plt
Пример #55
0
class Vectors:
    def __init__(self, filename=None, optimize=True):
        self.word_index = {}
        self.vectors = []
        self.words = []
        self.ball_tree = None
        if filename:
            with open(filename) as infile:
                print('Reading vectors...')
                index = 0
                for line in infile:
                    line = line.split()
                    if len(line) == 301:
                        self.word_index[line[0]] = index
                        self.vectors.append([float(x) for x in line[1:]])
                        self.words.append(line[0])
                        index += 1
                print('Finished')
        if optimize and filename:
            self.optimize()

    def optimize(self):
        if not self.ball_tree:
            print('Optimizing search...')
            self.ball_tree = BallTree(self.vectors, metric=cosine)
            print('Finished.')
        else:
            print('Already optimized.')

    def save(self, filename):
        if filename.split('.')[-1] != 'vecs':
            filename += '.vecs'
        with open(filename, 'wb') as outfile:
            cPickle.dump(
                (self.word_index, self.vectors, self.words, self.ball_tree),
                outfile)
        print('saved as ' + filename)

    def load(self, filename):
        if filename.split('.')[-1] != 'vecs':
            filename += '.vecs'
        with open(filename, 'rb') as infile:
            self.word_index, self.vectors, self.words, self.ball_tree = cPickle.load(
                infile)

    def get(self, string, errors=True):
        # type: (str, bool) -> list
        return_vec = [0] * len(self.vectors[0])
        for word in string.split():
            try:
                return_vec = numpy.add(self.vectors[self.word_index[word]],
                                       return_vec)
            except Exception as e:
                if errors:
                    raise e
        return return_vec

    def search(self, vector, k=1, return_distance=False):
        if self.ball_tree:
            a = self.ball_tree.query(numpy.array(vector).reshape(1, -1), k=k)
            dist, ind = a[0][0], a[1][0]
        else:
            dists = [cosine(vector, vec) for vec in self.vectors]
            ind = numpy.argsort(dists)[:k]
            dist = [dists[i] for i in ind]
            del dists
        if return_distance:
            return tuple(
                ((self.words[ind[i]], dist[i]) for i in xrange(len(ind))))
        else:
            return tuple(self.words[ind[i]] for i in xrange(len(ind)))

    def distance(self, item1, item2, errors=True):
        if isinstance(item1, str):
            item1 = self.get(item1, errors=errors)
        if isinstance(item2, str):
            item2 = self.get(item2, errors=errors)
        return cosine(item1, item2)
Пример #56
0
class SpatioTemporalModel(nn.Module):
    def __init__(self, u_size, v_size, t_size, emb_dim_u=32, emb_dim_v=32, emb_dim_t=16, hidden_dim=32, nb_cnt=100, sampling_list=None, vid_coor_rad=None, vid_pop=None, dropout=0.5):
        super(SpatioTemporalModel, self).__init__()
        self.emb_dim_u = emb_dim_u
        self.emb_dim_v = emb_dim_v
        self.emb_dim_t = emb_dim_t
        self.hidden_dim = hidden_dim
        self.u_size = u_size
        self.v_size = v_size
        self.t_size = t_size
        self.nb_cnt = nb_cnt
        self.dropout = dropout
        self.sampling_list = sampling_list
        self.vid_coor_rad = vid_coor_rad
        self.vid_pop = vid_pop
        self.tree = BallTree(vid_coor_rad.values(), leaf_size=40, metric='haversine')
        self.dist_metric = DistanceMetric.get_metric('haversine')
        self.uid_rid_sampling_info = {}
        for uid in range(0, u_size):
            self.uid_rid_sampling_info[uid] = {}

        self.rnn_short = nn.RNNCell(self.emb_dim_v, self.hidden_dim) #TODO check GRU
        self.rnn_long = nn.GRUCell(self.emb_dim_v, self.hidden_dim)
        self.embedder_u = nn.Embedding(self.u_size, self.emb_dim_u)
        self.embedder_v = nn.Embedding(self.v_size, self.emb_dim_v)
        self.embedder_t = nn.Embedding(self.t_size, self.emb_dim_t)
        self.embedder_v_context = nn.Embedding(self.v_size, self.hidden_dim * 2 + self.emb_dim_u + self.emb_dim_t)

    def forward(self, records_u, is_train, mod=0):
        predicted_scores = Variable(torch.zeros(records_u.get_predicting_records_cnt(mod=0), 1)) if is_train else []
        rid_vids_true = []
        rid_vids = []
        vids_visited = set()

        records_al = records_u.get_records(mod=0) if is_train else records_u.get_records(mod=2)
        emb_u = self.embedder_u(Variable(torch.LongTensor([records_u.uid])).view(1, -1)).view(1, -1)
        hidden_long = self.init_hidden()
        idx = 0
        for rid, record in enumerate(records_al[: -1]):
            if record.is_first:
                hidden_short = self.init_hidden()
            vids_visited.add(record.vid)
            emb_v = self.embedder_v(Variable(torch.LongTensor([record.vid])).view(1, -1)).view(1, -1)
            emb_t_next = self.embedder_t(Variable(torch.LongTensor([record.tid_next])).view(1, -1)).view(1, -1)
            hidden_long = self.rnn_long(emb_v, hidden_long)
            hidden_short = self.rnn_short(emb_v, hidden_short)
            if record.is_last:
                continue

            hidden = torch.cat((hidden_long.view(1, -1), hidden_short.view(1, -1), emb_u.view(1, -1), emb_t_next.view(1, -1)), 1)
            if is_train:
                rid_vids_true.append(record.vid_next)
                vid_candidates = self.get_vids_candidate(records_u.uid, rid, record.vid_next, vids_visited, True, False)
                scores = Variable(torch.zeros(1, self.nb_cnt + 1))
            else:
                if rid >= records_u.test_idx:
                    rid_vids_true.append(record.vid_next)
                    vid_candidates = self.get_vids_candidate(records_u.uid, rid, record.vid_next, vids_visited, False, False)
                    scores = Variable(torch.zeros(1, self.v_size))
                    predicted_scores.append([])
                else:
                    continue
            for vid_idx, vid_candidate in enumerate(vid_candidates):
                emb_v_context = self.embedder_v_context(Variable(torch.LongTensor([vid_candidate])).view(1, -1)).view(-1, 1)
                scores[0, vid_idx] = torch.mm(hidden, emb_v_context)
            predicted_scores[idx] = F.softmax(scores)[0, 0] if is_train else F.softmax(scores)
            rid_vids.append(vid_candidates)
            idx += 1
        return predicted_scores, rid_vids, rid_vids_true

    def get_vids_candidate(self, uid, rid, vid_true=None, vids_visited=None, is_train=True, use_distance=True):
        if not use_distance:
            if is_train:
                vid_candidates = [vid_true]
                while len(vid_candidates) <= self.nb_cnt:
                    vid_candidate = self.sampling_list[random.randint(0, len(self.sampling_list) - 1)]
                    if vid_candidate != vid_true:
                        vid_candidates.append(vid_candidate)
                return vid_candidates
            else:
                return range(self.v_size)
        else:
            if rid in self.uid_rid_sampling_info[uid]:
                vids, probs = self.uid_rid_sampling_info[uid][rid]
            else:
                nbs = set()
                for vid_visited in vids_visited:
                    vids = self.tree.query_radius([self.vid_coor_rad[vid_visited]], r=0.000172657)
                    for vid in vids[0]:
                        if (not is_train) or (is_train and vid != vid_true):
                            nbs.add(vid)
                vids = list(nbs)
                probs = np.array([self.vid_pop[vid] for vid in vids], dtype=np.float64)
                probs /= probs.sum()
                self.uid_rid_sampling_info[uid][rid] = (vids, probs)
            if is_train:
                id_cnt = np.random.multinomial(self.nb_cnt, probs)
                vid_candidates = [vid_true]
                for id, cnt in enumerate(id_cnt):
                    for _ in range(cnt):
                        vid_candidates.append(vids[id])
                return vid_candidates
            else:
                return vids

    def init_hidden(self):
        return Variable(torch.zeros(1, self.hidden_dim))
Пример #57
0
class Knearest:
    """
    kNN classifier
    """
    def __init__(self, X, y, k=5):
        """
        Creates a kNN instance
        :param x: Training data input
        :param y: Training data output
        :param k: The number of nearest points to consider in classification
        """

        self._kdtree = BallTree(X)
        self._y = y
        self._k = k
        self._counts = self.label_counts()
        
    def label_counts(self): 
        """
        Given the training labels, return a dictionary d where d[y] is  
        the number of times that label y appears in the training set. 
        """
        dictionary = { }
        for labels in self._y:
            if labels not in dictionary:
                dictionary[labels] = 0
            dictionary[labels] += 1

        return dictionary

    def majority(self, neighbor_indices):
        """
        Given the indices of training examples, return the majority label. Break ties 
        by choosing the tied label that appears most often in the training data. 

        :param neighbor_indices: The indices of the k nearest neighbors
        """
        assert len(neighbor_indices) == self._k, "Did not get k neighbor indices"

        neighbor_labels = [self._y[i] for i in neighbor_indices] #given indices, grab the corresponding labels from self._y
        
        labels_frequency = { }
        for labels in neighbor_labels:
            if labels not in labels_frequency:
                labels_frequency[labels] = 0
            labels_frequency[labels] += 1
            
        maximum_frequency = max(labels_frequency.values())
        tied = [label for label, freq in labels_frequency.items()
                if freq == maximum_frequency]

        # break ties by the label that appears most often in the training data
        return max(tied, key=lambda label: self._counts[label])
    
    def classify(self, example):
        """
        Given an example, return the predicted label. 

        :param example: A representation of an example in the same
        format as a row of the training data
        """
        dist, ind = self._kdtree.query(np.array(example).reshape(1, -1), k=self._k)
        return self.majority(ind[0])

    def confusion_matrix(self, test_x, test_y):
        """
        Given a matrix of test examples and labels, compute the confusion
        matrix for the current classifier.  Should return a 2-dimensional
        numpy array of ints, C, where C[ii,jj] is the number of times an 
        example with true label ii was labeled as jj.

        :param test_x: test data 
        :param test_y: true test labels 
        """
        C = np.zeros((10,10), dtype=int)

        for xx, yy in zip(test_x, test_y):
            jj = self.classify(xx) #return the predicted label
            C[yy][jj] += 1 #increase by 1 where true label and predicted label intersect

        #print(C)
        return C 
            
    @staticmethod
    def accuracy(C):
        """
        Given a confusion matrix C, compute the accuracy of the underlying classifier.
        
        :param C: a confusion matrix 
        """
        return np.sum(C.diagonal()) / C.sum()
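A tiny usage sketch of this classifier on two well-separated 1-D clusters (it assumes the numpy and BallTree imports used by the snippet above):

X_train = np.array([[0.0], [0.1], [0.2], [5.0], [5.1], [5.2]])
y_train = np.array([0, 0, 0, 1, 1, 1])

knn = Knearest(X_train, y_train, k=3)
print(knn.classify([0.15]))  # expected label: 0

C = knn.confusion_matrix(X_train, y_train)
print(Knearest.accuracy(C))  # 1.0 on this toy training set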
Пример #58
0
def knn_forward(coord_array):
    X = np.array(list(map(lambda x: [x[0] * PI_R, x[1] * PI_R], coord_array)))
    tree = BallTree(X, leaf_size=2, metric='haversine')
    res = tree.query_radius(X[-1].reshape(1, -1), r=R)
    return res[0]
    def fit(
        self,
        X,
        y,
        persist_train=True,
        index_id=None,
        time_bins=None,
        ci_width=0.683,
        **xgb_kwargs,
    ):
        """
        Fit a single decision tree using xgboost. For each leaf in the tree,
        build a Kaplan-Meier estimator.

        !!! Note
            * Unlike `XGBSEKaplanNeighbors`, in `XGBSEKaplanTree` the width of
            the confidence interval (`ci_width`) must be specified at fit time.

        Args:

            X ([pd.DataFrame, np.array]): Design matrix to fit XGBoost model

            y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
                and time of event or time of censoring as second field.

            persist_train (Bool): Whether or not to persist training data to use explainability
                through prototypes

            index_id (pd.Index): User defined index if intended to use explainability
                through prototypes

            time_bins (np.array): Specified time windows to use when making survival predictions

            ci_width (Float): Width of confidence interval

        Returns:
            XGBSEKaplanTree: Trained instance of XGBSEKaplanTree
        """

        E_train, T_train = convert_y(y)
        if time_bins is None:
            time_bins = get_time_bins(T_train, E_train)
        self.time_bins = time_bins

        # converting data to xgb format
        dtrain = convert_data_to_xgb_format(X, y, self.xgb_params["objective"])

        # training XGB
        self.bst = xgb.train(self.xgb_params, dtrain, num_boost_round=1, **xgb_kwargs)
        self.feature_importances_ = self.bst.get_score()

        # getting leaves
        leaves = self.bst.predict(dtrain, pred_leaf=True)

        # organizing elements per leaf
        leaf_neighs = (
            pd.DataFrame({"leaf": leaves})
            .groupby("leaf")
            .apply(lambda x: list(x.index))
        )

        # getting T and E for each leaf
        T_leaves = _align_leaf_target(leaf_neighs, T_train)
        E_leaves = _align_leaf_target(leaf_neighs, E_train)

        # calculating z-score from width
        z = st.norm.ppf(0.5 + ci_width / 2)

        # vectorized (very fast!) implementation of Kaplan Meier curves
        (
            self._train_survival,
            self._train_upper_ci,
            self._train_lower_ci,
        ) = calculate_kaplan_vectorized(T_leaves, E_leaves, time_bins, z)

        # adding leaf indexes
        self._train_survival = self._train_survival.set_index(leaf_neighs.index)
        self._train_upper_ci = self._train_upper_ci.set_index(leaf_neighs.index)
        self._train_lower_ci = self._train_lower_ci.set_index(leaf_neighs.index)

        if persist_train:
            self.persist_train = True
            if index_id is None:
                index_id = X.index.copy()
            self.tree = BallTree(leaves.reshape(-1, 1), metric="hamming", leaf_size=40)
        self.index_id = index_id

        return self
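A short usage sketch of the tree variant, assuming the surrounding XGBSE package provides the `XGBSEKaplanTree` class whose fit is shown above (its predict is assumed to behave analogously to `XGBSEKaplanNeighbors.predict`); the structured target is built exactly as the docstring describes, event indicator first and time second:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(300, 4)), columns=list("abcd"))
time = rng.exponential(scale=10, size=300)   # observed or censoring times
event = rng.random(300) < 0.7                # ~70% observed events

# structured array: event indicator as first field, time as second field
y = np.array(list(zip(event, time)), dtype=[("event", bool), ("time", float)])

tree = XGBSEKaplanTree()
tree.fit(X, y, ci_width=0.683)
surv = tree.predict(X)   # survival curve per sample over the fitted time bins
print(surv.shape)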