def match(x,y,mytab):
    """Routine that matches the truth catalog
    with the input table
    
    Args:
    ----
        x: `float` RA of the truth objects to match (in degrees)
        y: `float` dec of the truth objects to match (in degrees)
        mytab: `astropy.table.Table` table containing the L2
            input catalog.

    Returns:
    -------
        ind: `int` array of indices to select the truth objects
            that match the detected objects
    """
    X = np.zeros((len(x),2))
    X[:,0]=x
    X[:,1]=y
    tree = KDTree(X,leaf_size=40)
    Y = np.zeros((len(mytab),2))
    Y[:,0]=mytab['coord_ra']*180/np.pi
    Y[:,1]=mytab['coord_dec']*180/np.pi
    dist, ind = tree.query(Y,k=1)
    print('Matches with distance > 1 px:', np.count_nonzero(dist > 1))
    return ind
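
A minimal usage sketch of match (hedged: ra_true, dec_true, and the table values are hypothetical; the function assumes numpy and sklearn.neighbors.KDTree are imported, and that the table's coord_ra/coord_dec columns are in radians):

import numpy as np
from astropy.table import Table

# hypothetical truth positions, in degrees
ra_true = np.array([10.001, 10.002, 10.003])
dec_true = np.array([-5.001, -5.002, -5.003])

# hypothetical detected catalog; coord_ra/coord_dec stored in radians
mytab = Table({'coord_ra': np.deg2rad([10.0011, 10.0032]),
               'coord_dec': np.deg2rad([-5.0012, -5.0029])})

ind = match(ra_true, dec_true, mytab)   # shape (len(mytab), 1), indices into the truth arrays
matched_ra = ra_true[ind.ravel()]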
def match_bright(x,y,x2,y2,mags,dist=1./3600.):
    """Routine that matches the truth catalog
    with the input table
    
    Args:
    ----
        x: `float` RA of the truth objects to match (in degrees)
        y: `float` dec of the truth objects to match (in degrees)
        x2: `float` RA of detected objects to match (in degrees)
        y2: `float` dec of detected objects to match (in degrees)
        mags: `float` array containing the true input magnitudes
        dist: `float` maximum distance in degrees considered to match
            the objects, the default is 1 arcsecond.
    Returns:
    -------
        brightest_ind: `int` array of indices to select the truth objects
            that match the detected objects, returns -1 if no match has
            been found for a particular object
    """
    X = np.zeros((len(x),2))
    X[:,0]=x
    X[:,1]=y
    Y = np.zeros((len(x2),2))
    Y[:,0]=x2
    Y[:,1]=y2
    tree = KDTree(X,leaf_size=40)
    ind = tree.query_radius(Y, r=dist)
    brightest_indices = np.zeros(len(ind),dtype=np.int64)
    for i,ii in enumerate(ind):
        sorted_indices = np.argsort(mags[ii])
        if(len(sorted_indices)>0):
            brightest_indices[i] = ii[sorted_indices[0]]
        else:
            brightest_indices[i]=-1 
    return brightest_indices
Example #3
def compute_centroids(X, C):
    """Compute the centroids for dataset X given centers C. Note: centers
    C may not belong to X.
    """
    tree = KDTree(X)
    centroids = tree.query(C, k=1, return_distance=False).squeeze()
    return centroids
def count_close(x,y,x2,y2,distances):
    """Routine that counts the number of 
    objects that are within certain radius
    
    Args:
    ----
        x: `float` position X of objects to count
        y: `float` position Y of objects to count
        x2: `float` position X of the objects that serve as the center
            of the circle where we look for neighbors 
        y2: `float` position Y of the objects that serve as the center
            of the circle where we look for neighbors  
        distances: `float` array of radii where to count the objects
    Returns:
    -------
        neighbors: `float` the mean number of neighbors in a circle of radii
        corresponding to each entry of distances
        err: `float` standard deviation of the number of neighbors in a circle
        of radii corresponding to each entry of distances
    """
    X = np.zeros((len(x),2))
    X[:,0]=x
    X[:,1]=y
    Y = np.zeros((len(x2),2))
    Y[:,0]=x2
    Y[:,1]=y2
    tree = KDTree(X,leaf_size=40)
    neighbors = np.zeros(len(distances))
    err = np.zeros(len(distances))
    for i,distance in enumerate(distances):
        counts = tree.query_radius(Y, r=distance, count_only=True)
        neighbors[i], err[i] = np.nanmean(counts), np.nanstd(counts)
    return neighbors, err
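
A quick usage sketch of count_close (hypothetical positions; assumes numpy and KDTree are in scope):

import numpy as np

rng = np.random.default_rng(0)
x, y = rng.uniform(0, 100, 1000), rng.uniform(0, 100, 1000)   # objects to count
x2, y2 = rng.uniform(0, 100, 10), rng.uniform(0, 100, 10)     # circle centers

radii = np.array([1.0, 2.0, 5.0])
mean_counts, scatter = count_close(x, y, x2, y2, radii)
# mean_counts[i] is the average number of objects within radii[i] of a center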
Example #5
def compute_labels(X, C):
    """Compute the cluster labels for dataset X given centers C.
    """
    # labels = np.argmin(pairwise_distances(C, X), axis=0) # THIS REQUIRES TOO MUCH MEMORY FOR LARGE X
    tree = KDTree(C)
    labels = tree.query(X, k=1, return_distance=False).squeeze()
    return labels
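
compute_centroids and compute_labels are mirror images: the first maps each center to the index of its nearest dataset point, the second maps each dataset point to the index of its nearest center. A standalone sketch:

import numpy as np
from sklearn.neighbors import KDTree

X = np.array([[0.0, 0.0], [1.0, 0.0], [10.0, 0.0]])   # dataset
C = np.array([[0.4, 0.0], [9.0, 0.0]])                # centers

print(KDTree(X).query(C, k=1, return_distance=False).squeeze())   # [0 2]
print(KDTree(C).query(X, k=1, return_distance=False).squeeze())   # [0 0 1]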
Example #6
File: enn.py Project: timo-stoettner/ENN
    def buildDistanceMap (self, X, Y):
        classes = np.unique(Y)
        nClasses = len(classes)
        tree = KDTree(X)
        nRows = X.shape[0]

        TSOri = np.array([]).reshape(0,self.k)

        distanceMap = np.array([]).reshape(0,self.k)
        labels = np.array([]).reshape(0,self.k)

        for row in range(nRows):
            distances, indicesOfNeighbors = tree.query(X[row].reshape(1,-1), k = self.k+1)

            distances = distances[0][1:]
            indicesOfNeighbors = indicesOfNeighbors[0][1:]

            distanceMap = np.append(distanceMap, np.array(distances).reshape(1,self.k), axis=0)
            labels = np.append(labels, np.array(Y[indicesOfNeighbors]).reshape(1,self.k),axis=0)

        for c in classes:
            nTraining = np.sum(Y == c)
            labelTmp = labels[Y.ravel() == c,:]

            tmpKNNClass = labelTmp.ravel()
            TSOri = np.append(TSOri, len(tmpKNNClass[tmpKNNClass == c]) / (nTraining*float(self.k)))

        return distanceMap, labels, TSOri    
def kdtree(data, lake_matrix, k_neighbors = 10, leaf_size = 20):
    # training
    kdtree = KDTree(data, leaf_size=leaf_size, metric='euclidean')

    # testing
    distances, indices = kdtree.query(lake_matrix, k=k_neighbors)
    return np.array(indices), distances
Example #8
def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,
                          metric='minkowski', p=2, leaf_size=40, gen_min_span_tree=False):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    dim = X.shape[0]
    min_samples = min(dim - 1, min_samples)

    tree = KDTree(X, metric=metric, leaf_size=leaf_size)

    dist_metric = DistanceMetric.get_metric(metric)

    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1]
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)

    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
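
The core distance used above is just each point's distance to its min_samples-th nearest neighbor (the point itself counts as the first neighbor, at distance 0). A standalone sketch of that one line:

import numpy as np
from sklearn.neighbors import KDTree

X = np.array([[0.0], [1.0], [3.0], [7.0]])
tree = KDTree(X)

min_samples = 2
core_distances = tree.query(X, k=min_samples)[0][:, -1]   # k-th neighbor distance
print(core_distances)   # [1. 1. 2. 4.]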
Example #9
File: vksz.py Project: amanzotti/vksz
def study_redmapper_lrg_3d(hemi='north'):
    # create 3d grid object
    grid = grid3d(hemi=hemi)
    
    # load SDSS data
    sdss = load_sdss_data_both_catalogs(hemi)
    
    # load redmapper catalog
    rm = load_redmapper(hemi=hemi)
    
    # get XYZ positions (Mpc) of both datasets
    x_sdss, y_sdss, z_sdss = grid.xyz_from_radecz(sdss['ra'], sdss['dec'], sdss['z'], applyzcut=False)
    x_rm, y_rm, z_rm = grid.xyz_from_radecz(rm['ra'], rm['dec'], rm['z_spec'], applyzcut=False)
    pos_sdss = np.vstack([x_sdss, y_sdss, z_sdss]).T
    pos_rm = np.vstack([x_rm, y_rm, z_rm]).T

    # build a couple of KDTree's, one for SDSS, one for RM.
    from sklearn.neighbors import KDTree
    tree_sdss = KDTree(pos_sdss, leaf_size=30)
    tree_rm = KDTree(pos_rm, leaf_size=30)

    lrg_counts = tree_sdss.query_radius(pos_rm, 100., count_only=True)
    pl.clf()
    pl.hist(lrg_counts, bins=50)
    
    
    ipdb.set_trace()
Example #10
File: photo.py Project: bd-j/pire
def match(x1, y1, x2=None, y2=None, k=5, kdt=None):
    X2 = np.vstack([x2, y2]).T
    X1 = np.vstack([x1, y1]).T
    if kdt is None:
        kdt = KDTree(X2, leaf_size=30, metric='euclidean')
    dists, inds = kdt.query(X1, k=k, return_distance=True)
    return dists, inds, kdt
Example #11
def _hdbscan_large_kdtree_cdist(X, min_cluster_size=5, min_samples=None, alpha=1.0,
                                metric='minkowski', p=2, gen_min_span_tree=False):

    if p is None:
        p = 2

    dim = X.shape[0]
    min_samples = min(dim - 1, min_samples)

    if metric == 'minkowski':
        tree = KDTree(X, metric=metric, p=p)
    else:
        tree = KDTree(X, metric=metric)

    core_distances = tree.query(X, k=min_samples)[0][:,-1]

    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, metric, p)
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    single_linkage_tree = label(min_spanning_tree)
    condensed_tree = condense_tree(single_linkage_tree,
                                   min_cluster_size)
    stability_dict = compute_stability(condensed_tree)
    cluster_list = get_clusters(condensed_tree, stability_dict)

    labels = -1 * np.ones(X.shape[0], dtype=int)
    probabilities = np.zeros(X.shape[0], dtype=float)
    for index, (cluster, prob) in enumerate(cluster_list):
        labels[cluster] = index
        probabilities[cluster] = prob
    return labels, probabilities, condensed_tree, single_linkage_tree, None
Example #12
def margin(indices, k, X, y):
    margins = []
    kd_tree = KDTree(X)
    for img_index in indices:
        margin = 0
        in_class = 0
        # most_frequent_class = 0
        current_class = y[img_index]
        # print current_class
        dists, neighbour_indices = kd_tree.query(X[img_index].reshape((1, X[img_index].shape[0])),
                                                 k)
        for index in neighbour_indices[0]:
            # print y[index]
            if y[index] == current_class:
                in_class += 1
        neighbour_dict = {}
        for index in neighbour_indices[0]:
            if y[index] in neighbour_dict:
                neighbour_dict[y[index]] += 1
            else:
                neighbour_dict[y[index]] = 1
        neighbour_dict.pop(current_class, None)
        most_frequent = 0
        if neighbour_dict:
            most_frequent = max(neighbour_dict.items(), key=lambda x: x[1])[1]
        margin = in_class - most_frequent
        margins.append(margin)
    return margins
Example #13
def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,
                          metric='minkowski', p=2, leaf_size=40, gen_min_span_tree=False):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)

    tree = KDTree(X, metric=metric, leaf_size=leaf_size)

    #TODO: Deal with p for minkowski appropriately
    dist_metric = DistanceMetric.get_metric(metric)

    #Get distance to kth nearest neighbour
    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1]
    #Mutual reachability distance is implicit in mst_linkage_core_cdist
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)

    #Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    #Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
def _rsl_prims_kdtree(X, cut, k=5, alpha=1.4142135623730951, gamma=5, metric='minkowski', p=2):

    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2 # Unused, but needs to be integer; assume euclidean

    dim = X.shape[0]
    k = min(dim - 1, k)

    tree = KDTree(X, metric=metric)

    dist_metric = DistanceMetric.get_metric(metric)

    core_distances = tree.query(X, k=k)[0][:,-1]
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric)

    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)

    labels = single_linkage_tree.get_clusters(cut, gamma)

    return labels, single_linkage_tree
Example #15
def margin_new(indices, k, X, y):
    margins = []
    kd_tree = KDTree(X)
    for img_index in indices:
        margin = 0
        dist_to_class = 0
        dist_to_others = 0
        current_class = y[img_index]
        dists, neighbour_indices = kd_tree.query(X[img_index].reshape((1, X[img_index].shape[0])),
                                                 k)
        classes = {}
        for i in range(neighbour_indices[0].shape[0]):
            index = neighbour_indices[0][i]
            if y[index] in classes:
                classes[y[index]] += dists[0][i]
            else:
                classes[y[index]] = dists[0][i]
        dist_to_class = classes[current_class]
        classes.pop(current_class)
        # print classes.items()
        if classes:
            dist_to_others = min(classes.items(), key=lambda x: x[1])[1]
        margin = dist_to_class - dist_to_others
        margins.append(margin)
    return margins
Example #16
File: robot.py Project: shivani1494/pa2
	def constructLMap(self):
		self.obstacleArray = []
		self.allPositions = []	
		#build your obstacle array 
		for i in range( len(self.map.grid) ):	
			for j in range( len(self.map.grid[0])):	
				[x, y] = self.map.cell_position(i, j) 
				if self.map.get_cell(x,y) == 1.0:
					self.obstacleArray.append(np.array(self.map.cell_position(i, j))) 
					#print self.map.cell_position(i, j)	
				self.allPositions.append(np.array(self.map.cell_position(i, j)))  
		#pass it into kdtree
		eExp = []
	
		kdt = KDTree(self.obstacleArray)
		dists = kdt.query(self.allPositions, k=1)[0][:]	
		self.laserStdDev = self.config["laser_sigma_hit"]
		constant = 1.0/( m.sqrt( 2 * m.pi) * self.laserStdDev )
		eExp = np.exp(-0.5*( dists**2  )/( self.laserStdDev**2 ) )
		probObsGivenLaser = eExp
		self.lMap.grid = probObsGivenLaser.reshape(self.lMap.grid.shape) 

		self.occupancyGridMsg = self.lMap.to_message()		
		
		self.lMapPublisher.publish(self.occupancyGridMsg) 
Example #17
def test_kdtree_projection(datas):

    from sklearn.neighbors import KDTree
    from sklearn import random_projection


    # datas = parse()
    Fs = fingerprints(datas)

    # The random projection
    transformer = random_projection.GaussianRandomProjection(n_components = 128)
    Fs_new = transformer.fit_transform(Fs)
    print(Fs_new.shape)

    tree = KDTree(Fs_new, leaf_size=20)

    # Select a random target
    target_i = random.choice(range(len( datas )))
    target = datas[target_i]
    Tf = np.vstack([fingerprint(target)])
    Tf_new = transformer.transform(Tf)

    # Match it
    with timer(10):
        for _ in range(10):
            dist, ind = tree.query(Tf_new, k=3)
    assert datas[ind[0][0]] == datas[target_i]
def uniform_points_points_sampling(limits, points, n):
    """Select the spatial uniform points in the sample by sampling uniform
    spatial points and getting the nearest ones in the available ones.

    Parameters
    ----------
    limits: numpy.ndarray, shape (2, 2)
        the limits of the space. There is the square four limits which defines
        the whole retrievable region.
    points: numpy.ndarray
        the points in the space selected.
    n: int
        the number of samples we want.

    Returns
    -------
    indices: numpy.ndarray, shape(n)
        the indices of the samples.

    """

    ## 0. Initialize retriever
    retriever = KDTree(points)
    ## 1. Compute spatial uniform points
    points_s = uniform_points_sampling(limits, n)
    ## 2. Get the nearest points in the sample
    result = retriever.query(points_s, k=1)
    indices = result[1]
    indices = indices.astype(int)
    return indices
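
A usage sketch (hedged: assumes the uniform_points_sampling helper it calls is importable, and uses hypothetical data):

import numpy as np

limits = np.array([[0., 1.], [0., 1.]])    # square region
points = np.random.random((200, 2))        # available points

idx = uniform_points_points_sampling(limits, points, n=50)
sample = points[idx.ravel()]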
Example #19
def concat_features_by_neighbors(df_labels, df_features,
                                 X_names=['Offense Type'],
                                 grid=["Latitude", "Longitude"],
                                 radius=1./500.,
                                 scale=np.array([1.,1.])):

    df_labels = df_labels.dropna(subset=grid)
    df_features = df_features.dropna(subset=grid)

    X = df_features[X_names].values
    xy_features = df_features[grid].values
    xy_labels = df_labels[grid].values
    tree = KDTree(xy_features*scale)

    vocabulary = set()
    features = []
    for nei in tree.query_radius(xy_labels*scale, radius):
        U,I = np.unique(X[nei], return_inverse=True)
        D = dict(zip(U,np.bincount(I)))
        vocabulary.update(D)
        features.append(D)

    vocabulary = sorted(vocabulary)
    return pd.concat([df_labels, pd.DataFrame([[fi.get(v) for v in vocabulary]
                      for fi in features],
                      index=df_labels.index,
                      columns=vocabulary).fillna(0.)], axis=1)
Example #20
def match_regions(polygons, regionlocs, n_dim=2):
    """

    Parameters
    ----------
    polygons: list or array_like
        the polygons information.
    regionlocs: array_like
        the location information of the regions.
    n_dim: integer
        the number of dimensions.

    Returns
    -------
    assign_r: array_like
        the assigned regions.
    """
    n = len(polygons)
    centroids = np.zeros((n, n_dim))
    for i in range(n):
        centroids[i, :] = np.array(polygons[i])
    ret = KDTree(regionlocs)
    assign_r = np.zeros(n).astype(int)
    for i in range(n):
        assign_r[i] = ret.query(centroids[[i]])[1][0]
    return assign_r
Example #21
File: hip.py Project: kynan/GetLost
def get_hip_rank(points, sub):
    sub_coords = sub[['lat', 'lng']].values
    if sub_coords.shape[0] == 0:
        return []
    sub_scores = sub.checkinsCount.apply(int).values
    kdt = KDTree(sub_coords, metric='euclidean')
    d, i = kdt.query(np.array(points), k=10)
    return (sub_scores[i] / d**2 * 1e-11).sum(axis=1)
Example #22
class KDBasedKNearestNeighbor(object):
    """
        KDTree-based KNN classifier with L2 distance
    """

    def __init__(self, k=1):
        self.k = k

    def fit(self, X_train, y_train):
        """
            Build KDtree using
            http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html
        """
        self.X_train = X_train
        self.y_train = y_train

        return self

    def calc_dist(self, X_test, metric, k=None):
        if k is None:
            k = self.k

        self.kd_tree = KDTree(self.X_train, metric=metric, leaf_size=self.k)

        return self

    def get_neighbors(self, X_test, k=None):
        if k is None:
            k = self.k

        neighbors = self.kd_tree.query(X_test, k)

        return neighbors[1]

    def predict_labels(self, X_test, k=None):
        """
            Make prediction using kdtree
            Return array of predicted labels
        """
        if k is None:
            k = self.k

        neighbors = self.kd_tree.query(X_test, k)

        num_test = X_test.shape[0]

        y_pred = numpy.zeros(num_test)

        for i in range(num_test):
            closest_y = self.y_train[neighbors[1][i]]
            count = Counter(closest_y)
            # print(count.most_common(1))
            y_pred[i] = count.most_common(1)[0][0]

        return y_pred
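
A hedged usage sketch of the class above (assumes numpy is imported as `numpy` and Counter from collections, as the class body requires):

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0)

clf = KDBasedKNearestNeighbor(k=5)
clf.fit(X_train, y_train)
clf.calc_dist(X_test, metric='euclidean')   # builds the KD-tree on X_train
y_pred = clf.predict_labels(X_test)
accuracy = (y_pred == y_test).mean()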
Example #23
def get_median_neighbors(df, n_neighbors, adj_r):
    '''
    INPUT: Pandas dataframe, and the number of comparable neighbors
    of each listing we'll take the median price of in adding the
    median_neighbor_prices feature
    OUTPUT: Pandas dataframe with the median prices of the n_neighbors
    closest comparables added as a feature. This is accomplished using a
    KD-Tree model to search for nearest-neighbors
    '''
    kd_df = df[['latitude', 'longitude']]
    kdvals = kd_df.values
    kd = KDTree(kdvals, leaf_size=1000)
    pickle.dump(kd, open('../models/kd_tree.pkl', 'wb'))
    neighbors = kd.query(kdvals, k=100)

    median_neighbor_prices = []

    for i in range(len(df)):
        listing_neighbors = neighbors[1][i]
        listing_id = df.loc[i, 'id']
        n_beds = df.loc[i, 'beds']
        sale_y = df.loc[i, 'sale_y']

        sub_df = df[(df.index.isin(listing_neighbors))]
        sub_df = sub_df[
            (sub_df['beds'] == n_beds) &
            (sub_df['id'] != listing_id)
            ]

        comp_listings = [item for item in listing_neighbors if item in sub_df.index]
        df_filtered = pd.DataFrame()
        df_filtered['last sale price'] = df['last sale price'][comp_listings][:n_neighbors]
        df_filtered['sale_y'] = df['sale_y'][comp_listings][:n_neighbors]

        df_filtered['price adjusted'] = df_filtered['last sale price'] * (1.0 + (sale_y - df_filtered['sale_y']) * adj_r)
        med_price = df_filtered['price adjusted'].median()
        if med_price > 0:
            median_neighbor_prices.append(med_price)
        else:
            # retry with a wider neighbor window before falling back to the full frame
            df_filtered = pd.DataFrame()
            df_filtered['last sale price'] = df['last sale price'][comp_listings][:n_neighbors + 10]
            df_filtered['sale_y'] = df['sale_y'][comp_listings][:n_neighbors + 10]

            df_filtered['price adjusted'] = df_filtered['last sale price'] * (1.0 + (sale_y - df_filtered['sale_y']) * adj_r)
            med_price = df_filtered['price adjusted'].median()

            if med_price > 0:
                median_neighbor_prices.append(med_price)
            else:
                df['price adjusted'] = df['last sale price'] * (1.0 + (sale_y - df['sale_y']) * adj_r)
                med_price = df['price adjusted'][comp_listings].median()
                median_neighbor_prices.append(med_price)

    df['med_neighbor_price'] = median_neighbor_prices

    rmse = np.mean((df['med_neighbor_price'] - df['last sale price'])**2)**0.5
    print('RMSE is', rmse)
    return df
Example #24
def environment(x_h, y_h, z_h, x, y, z, D3):
    DD = np.array([x, y, z])
    DD = DD.T
    tree = KDTree(DD, leaf_size=20000)
    index = np.where(x_h == x)[0]
    dist, ind = tree.query(DD[index], k=4)
    r3 = max(dist[0])
    delta3 = D3**3.0 * (1.0/(r3**3.0) - 1.0/(D3**3.0))
    return  delta3
Example #25
def retrieve_7major_cp(locs, raw_locs, raw_cps):
    raw_cps = np.array(raw_cps).astype(int)
    ret = KDTree(raw_locs)
    new_cps = []
    for i in range(len(locs)):
        neighs = ret.query(locs[[i]], 7)[1].ravel()
        c = Counter([raw_cps[nei] for nei in neighs])
        new_cps.append(c.most_common(1)[0][0])
    return new_cps
Example #26
def find_knn(pts0, eval_pts, k=15):
    '''
    find the points within `pts0` closest to `eval_pts`
    '''
    pts0range = (pts0.max(axis=0) - pts0.min(axis=0))
    neigh = KDTree(pts0 / pts0range)

    nni = neigh.query(eval_pts / pts0range, k=k, return_distance=False)
    return nni
Example #27
def main():

    digits = load_digits()

    X = digits.data
    y = digits.target

    num_classes = np.unique(y).shape[0]

    plot_digits(X)

    # TSNE
    # Barnes-Hut: O(d NlogN) where d is dim and N is the number of samples
    # Exact: O(d N^2)
    t0 = time()
    tsne = manifold.TSNE(n_components=2, init="pca", method="barnes_hut", verbose=1)
    X_tsne = tsne.fit_transform(X)
    t1 = time()
    print "t-SNE: %.2f sec" % (t1 - t0)
    tsne.get_params()

    plt.figure(2)
    for k in range(num_classes):
        plt.plot(X_tsne[y == k, 0], X_tsne[y == k, 1], "o")
    plt.title("t-SNE embedding of digits dataset")
    plt.xlabel("X1")
    plt.ylabel("X2")
    axes = plt.gca()
    axes.set_xlim([X_tsne[:, 0].min() - 1, X_tsne[:, 0].max() + 1])
    axes.set_ylim([X_tsne[:, 1].min() - 1, X_tsne[:, 1].max() + 1])
    plt.show()

    # ISOMAP
    # 1. Nearest neighbors search: O(d log k N log N)
    # 2. Shortest path graph search: O(N^2(k+log(N))
    # 3. Partial eigenvalue decomposition: O(dN^2)

    t0 = time()
    isomap = manifold.Isomap(n_neighbors=5, n_components=2)
    X_isomap = isomap.fit_transform(X)
    t1 = time()
    print "Isomap: %.2f sec" % (t1 - t0)
    isomap.get_params()

    plt.figure(3)
    for k in range(num_classes):
        plt.plot(X_isomap[y == k, 0], X_isomap[y == k, 1], "o", label=str(k), linewidth=2)
    plt.title("Isomap embedding of the digits dataset")
    plt.xlabel("X1")
    plt.ylabel("X2")
    plt.show()

    # Use KD-tree to find k-nearest neighbors to a query image
    kdt = KDTree(X_isomap)
    Q = np.array([[-160, -30], [-102, 14]])
    kdt_dist, kdt_idx = kdt.query(Q, k=20)
    plot_digits(X[kdt_idx.ravel(), :])
Example #28
def two_point(data, bins):
    """Two-point correlation function, using Landy-Szalay method

    Parameters
    ----------
    data : array_like
        input data, shape = [n_samples, n_features] (2D ndarray)
    bins : array_like
        bins within which to compute the 2-point correlation.
        shape = Nbins + 1 (1D ndarray)

    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins
    """
    data = np.asarray(data)
    bins = np.asarray(bins)
    rng = check_random_state(None)

    n_samples, n_features = data.shape
    Nbins = len(bins) - 1

    # shuffle around an axis, making background dist.
    data_R = data.copy()
    for i in range(n_features - 1):
        rng.shuffle(data_R[:, i])

    factor = len(data_R) * 1. / len(data)

    # Fast two-point correlation functions added in scikit-learn v. 0.14
    # Makes tree to embed pairwise distances, increasing look-up speed
    KDT_D = KDTree(data)  # actual distances
    KDT_R = KDTree(data_R)  # randomized background distances

    counts_DD = KDT_D.two_point_correlation(data, bins)  # number of points within bins[i] radius
    counts_RR = KDT_R.two_point_correlation(data_R, bins)  # " " for randomized background


    DD = np.diff(counts_DD)  # number of points in a disc from bins[i-1] to bins[i]
    RR = np.diff(counts_RR)  # " " for randomized background

    # make zeros 1 for numerical stability (finite difference problems)
    RR_zero = (RR == 0)  # mask creation
    RR[RR_zero] = 1  # apply update


    counts_DR = KDT_R.two_point_correlation(data, bins)  # cross-correlation betw. actual and random

    DR = np.diff(counts_DR)  # binned cross-corr

    corr = (factor ** 2 * DD - 2 * factor * DR + RR) / RR  # the Landy-Szalay formula

    corr[RR_zero] = np.nan  # back-apply the zeros found in RR

    return corr
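
A quick sanity check of two_point (assumes KDTree and check_random_state are in scope, as the function requires): uniformly distributed points should give a correlation near zero in every bin.

import numpy as np

rng = np.random.default_rng(42)
data = rng.random((500, 2))       # uniform 2D point set
bins = np.linspace(0.01, 0.3, 8)

corr = two_point(data, bins)
print(np.round(corr, 2))          # values should scatter around 0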
Example #29
def negativeLabels(features, positiveLabels):
    neg_lab = [[] for _ in range(len(features))]  # independent lists, not aliases of one list
    for i in range(1, len(features)):
        kdt = KDTree(features[i]['RegionCenter'], metric='euclidean')
        neighb = kdt.query(features[i-1]['RegionCenter'], k=3, return_distance=False)
        for j in range(1, len(features[i])):
            for m in range(0, neighb.shape[1]):
                neg_lab[i].append([j,neighb[j][m]])
    return neg_lab
Example #30
def patch_classify():
    """
        Patch visualization: observe how the patches relate,
        in PCA space, between the training data and the actual data.
        Builds a kd-tree for the neighbor searches.
    """
    with open('training_data_full.pickle', 'rb') as f:
        # Load the corresponding raw patches
        kk = open("raw_data_full.pickle", 'rb')
        raw_lib = pickle.load(kk)
        raw_lib = np.asarray(raw_lib, dtype='float32')

        # Load the data and convert the features
        training_data = pickle.load(f)
        patch_lib, feature_lib = training_data
        feature_lib, patch_lib = (np.asarray(feature_lib, dtype='float32'), np.asarray(patch_lib, dtype='float32'))
        feature_lib = feature_lib.reshape((-1, 4 * 9 * 9))

        # Build the KD-tree
        tree = KDTree(feature_lib, leaf_size=len(feature_lib) // 100)

        # Search the KD-tree for the 100 nearest points
        dist, ind1 = tree.query(feature_lib[[5678]], k=100)
        nn1 = feature_lib[ind1][0]

        dist, ind2 = tree.query(feature_lib[[10000]], k=100)
        nn2 = feature_lib[ind2][0]

        dist, ind3 = tree.query(feature_lib[[1233]], k=100)
        nn3 = feature_lib[ind3][0]

        # Fit PCA and project into the PCA space
        pca = PCA(n_components=2)
        d2_data = pca.fit_transform(feature_lib).T

        # Project the neighbors' high-dimensional coordinates into the low-dimensional PCA space
        r1 = pca.transform(nn1).T
        r2 = pca.transform(nn2).T
        r3 = pca.transform(nn3).T

        # Set up the plotting area
        ax = plt.axes([0.1, 0.1, 0.8, 0.8])

        # Scatter plot of all the data
        ax.scatter(d2_data[0], d2_data[1], c='g')
        # Scatter plots of the three neighborhoods
        ax.scatter(r1[0], r1[1], c='r')
        ax.scatter(r2[0], r2[1], c='b')
        ax.scatter(r3[0], r3[1], c='y')

        # patch_lib / raw_lib hold the residual patches and the raw patches respectively
        patch_show(raw_lib[ind1][0], [0.05, 0.05, 0.4, 0.4], 'red')
        patch_show(raw_lib[ind2][0], [0.05, 0.55, 0.4, 0.4], 'blue')
        patch_show(raw_lib[ind3][0], [0.55, 0.05, 0.4, 0.4], 'yellow')

        plt.show()
Example #31
File: S3DIS.py Project: zxczrx123/KPConv
    def load_subsampled_clouds(self, subsampling_parameter):
        """
        Presubsample point clouds and load into memory (load KDTree for neighbor searches)
        """

        if 0 < subsampling_parameter <= 0.01:
            raise ValueError('subsampling_parameter too low (should be over 1 cm)')

        # Create path for files
        tree_path = join(self.path, 'input_{:.3f}'.format(subsampling_parameter))
        if not exists(tree_path):
            makedirs(tree_path)

        # Initiate containers
        self.input_trees = {'training': [], 'validation': []}
        self.input_colors = {'training': [], 'validation': []}
        self.input_labels = {'training': [], 'validation': []}

        for i, file_path in enumerate(self.train_files):

            # Restart timer
            t0 = time.time()

            # get cloud name and split
            cloud_name = file_path.split('/')[-1][:-4]
            if self.all_splits[i] == self.validation_split:
                cloud_split = 'validation'
            else:
                cloud_split = 'training'

            # Name of the input files
            KDTree_file = join(tree_path, '{:s}_KDTree.pkl'.format(cloud_name))
            sub_ply_file = join(tree_path, '{:s}.ply'.format(cloud_name))

            # Check if inputs have already been computed
            if isfile(KDTree_file):
                print('\nFound KDTree for cloud {:s}, subsampled at {:.3f}'.format(cloud_name, subsampling_parameter))

                # read ply with data
                data = read_ply(sub_ply_file)
                sub_colors = np.vstack((data['red'], data['green'], data['blue'])).T
                sub_labels = data['class']

                # Read pkl with search tree
                with open(KDTree_file, 'rb') as f:
                    search_tree = pickle.load(f)

            else:
                print('\nPreparing KDTree for cloud {:s}, subsampled at {:.3f}'.format(cloud_name, subsampling_parameter))

                # Read ply file
                data = read_ply(file_path)
                points = np.vstack((data['x'], data['y'], data['z'])).T
                colors = np.vstack((data['red'], data['green'], data['blue'])).T
                labels = data['class']

                # Subsample cloud
                sub_points, sub_colors, sub_labels = grid_subsampling(points,
                                                                      features=colors,
                                                                      labels=labels,
                                                                      sampleDl=subsampling_parameter)

                # Rescale float color and squeeze label
                sub_colors = sub_colors / 255
                sub_labels = np.squeeze(sub_labels)

                # Get chosen neighborhoods
                search_tree = KDTree(sub_points, leaf_size=50)

                # Save KDTree
                with open(KDTree_file, 'wb') as f:
                    pickle.dump(search_tree, f)

                # Save ply
                write_ply(sub_ply_file,
                          [sub_points, sub_colors, sub_labels],
                          ['x', 'y', 'z', 'red', 'green', 'blue', 'class'])

            # Fill data containers
            self.input_trees[cloud_split] += [search_tree]
            self.input_colors[cloud_split] += [sub_colors]
            self.input_labels[cloud_split] += [sub_labels]

            size = sub_colors.shape[0] * 4 * 7
            print('{:.1f} MB loaded in {:.1f}s'.format(size * 1e-6, time.time() - t0))

        print('\nPreparing reprojection indices for testing')

        # Get number of clouds
        self.num_training = len(self.input_trees['training'])
        self.num_validation = len(self.input_trees['validation'])

        # Get validation and test reprojection indices
        self.validation_proj = []
        self.validation_labels = []
        i_val = 0
        for i, file_path in enumerate(self.train_files):

            # Restart timer
            t0 = time.time()

            # Get info on this cloud
            cloud_name = file_path.split('/')[-1][:-4]

            # Validation projection and labels
            if self.all_splits[i] == self.validation_split:
                proj_file = join(tree_path, '{:s}_proj.pkl'.format(cloud_name))
                if isfile(proj_file):
                    with open(proj_file, 'rb') as f:
                        proj_inds, labels = pickle.load(f)
                else:
                    data = read_ply(file_path)
                    points = np.vstack((data['x'], data['y'], data['z'])).T
                    labels = data['class']

                    # Compute projection inds
                    proj_inds = np.squeeze(self.input_trees['validation'][i_val].query(points, return_distance=False))
                    proj_inds = proj_inds.astype(np.int32)

                    # Save
                    with open(proj_file, 'wb') as f:
                        pickle.dump([proj_inds, labels], f)

                self.validation_proj += [proj_inds]
                self.validation_labels += [labels]
                i_val += 1
                print('{:s} done in {:.1f}s'.format(cloud_name, time.time() - t0))

        print()

        return
Example #32
File: knn.py Project: deltat99/Pyod
class KNN(BaseDetector):
    """
    kNN class for outlier detection.
    For an observation, its distance to its kth nearest neighbor could be
    viewed as the outlying score. It could be viewed as a way to measure
    the density. More to see the references below.

    Three kNN detectors are supported:
    largest: use the distance to the kth neighbor as the outlier score
    mean: use the average of all k neighbors as the outlier score
    median: use the median of the distance to k neighbors as the outlier score

    .. [1] Ramaswamy, S., Rastogi, R. and Shim, K., 2000, May.
           Efficient algorithms for mining outliers from large data sets. In
           ACM Sigmod Record (Vol. 29, No. 2, pp. 427-438). ACM.

    .. [2] Angiulli, F. and Pizzuti, C., 2002, August. Fast outlier detection
           in high dimensional spaces. In European Conference on Principles of
           Data Mining and Knowledge Discovery,pp. 15-27.

    :param contamination: the amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.
    :type contamination: float in (0, 0.5], optional (default=0.1)

    :param n_neighbors: Number of neighbors to use by default
        for k neighbors queries.
    :type n_neighbors: int, optional (default=5)

    :param method: {'largest', 'mean', 'median'}

        - largest: use the distance to the kth neighbor as the outlier score
        - mean: use the average of all k neighbors as the outlier score
        - median: use the median of the distance to k neighbors as the outlier score
    :type method: str, optional (default='largest')
    """
    def __init__(self, contamination=0.1, n_neighbors=5, method='largest'):
        super(KNN, self).__init__(contamination=contamination)
        self.n_neighbors = n_neighbors
        self.method = method

    def fit(self, X, y=None):

        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.tree_ = KDTree(X)

        neigh = NearestNeighbors(n_neighbors=self.n_neighbors)
        neigh.fit(X)

        dist_arr, _ = neigh.kneighbors(n_neighbors=self.n_neighbors,
                                       return_distance=True)

        if self.method == 'largest':
            dist = dist_arr[:, -1]
        elif self.method == 'mean':
            dist = np.mean(dist_arr, axis=1)
        elif self.method == 'median':
            dist = np.median(dist_arr, axis=1)

        self.decision_scores_ = dist.ravel()
        self._process_decision_scores()

        return self

    def decision_function(self, X):

        check_is_fitted(self,
                        ['tree_', 'decision_scores_', 'threshold_', 'labels_'])

        X = check_array(X)

        # initialize the output score
        pred_scores = np.zeros([X.shape[0], 1])

        for i in range(X.shape[0]):
            x_i = X[i, :]
            x_i = np.asarray(x_i).reshape(1, x_i.shape[0])

            # get the distance of the current point
            dist_arr, _ = self.tree_.query(x_i, k=self.n_neighbors)

            if self.method == 'largest':
                dist = dist_arr[:, -1]
            elif self.method == 'mean':
                dist = np.mean(dist_arr, axis=1)
            elif self.method == 'median':
                dist = np.median(dist_arr, axis=1)

            pred_score_i = dist[-1]

            # record the current item
            pred_scores[i, :] = pred_score_i

        return pred_scores.ravel()
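
A minimal usage sketch of this detector (hedged: assumes the module's own imports such as BaseDetector, check_array, NearestNeighbors, and KDTree are available):

import numpy as np

rng = np.random.default_rng(0)
X_train = np.vstack([rng.normal(0, 1, (95, 2)),
                     rng.normal(6, 1, (5, 2))])   # ~5% outliers

clf = KNN(contamination=0.05, n_neighbors=5, method='largest')
clf.fit(X_train)
scores = clf.decision_function(X_train)   # larger = more anomalous
labels = clf.labels_                      # 0 = inlier, 1 = outlier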
Example #33
def kd_nn(cities):
    points = [(city.x, city.y) for city in cities]
    tree = KDTree(points, leaf_size=10, metric='euclidean')
    results = tree.query(points, k=2, return_distance=False)
    return results
class SurfacePointCloud:
    def __init__(self, mesh, points, normals=None, scans=None):
        self.mesh = mesh
        self.points = points
        self.normals = normals
        self.scans = scans

        self.kd_tree = KDTree(points)

    def get_random_surface_points(self, count, use_scans=True):
        if use_scans:
            indices = np.random.choice(self.points.shape[0], count)
            return self.points[indices, :]
        else:
            return self.mesh.sample(count)

    def get_sdf(self, query_points, use_depth_buffer=False, sample_count=11):
        if use_depth_buffer:
            distances, _ = self.kd_tree.query(query_points)
            distances = distances.astype(np.float32).reshape(-1) * -1
            distances[self.is_outside(query_points)] *= -1
            return distances
        else:
            distances, indices = self.kd_tree.query(query_points,
                                                    k=sample_count)
            distances = distances.astype(np.float32)

            closest_points = self.points[indices]
            direction_to_surface = query_points[:,
                                                np.newaxis, :] - closest_points
            inside = np.einsum('ijk,ijk->ij', direction_to_surface,
                               self.normals[indices]) < 0
            inside = np.sum(inside, axis=1) > sample_count * 0.5
            distances = distances[:, 0]
            distances[inside] *= -1
            return distances

    def get_sdf_in_batches(self,
                           query_points,
                           use_depth_buffer=False,
                           sample_count=11,
                           batch_size=1000000):
        if query_points.shape[0] <= batch_size:
            return self.get_sdf(query_points,
                                use_depth_buffer=use_depth_buffer,
                                sample_count=sample_count)

        result = np.zeros(query_points.shape[0])
        for i in range(int(math.ceil(query_points.shape[0] / batch_size))):
            start = i * batch_size
            end = min(result.shape[0], (i + 1) * batch_size)
            result[start:end] = self.get_sdf(query_points[start:end, :],
                                             use_depth_buffer=use_depth_buffer,
                                             sample_count=sample_count)
        return result

    def get_voxels(self,
                   voxel_resolution,
                   use_depth_buffer=False,
                   sample_count=11,
                   pad=False,
                   check_result=False):
        from mesh_to_sdf.utils import get_raster_points, check_voxels

        sdf = self.get_sdf_in_batches(get_raster_points(voxel_resolution),
                                      use_depth_buffer, sample_count)
        voxels = sdf.reshape(
            (voxel_resolution, voxel_resolution, voxel_resolution))

        if check_result and not check_voxels(voxels):
            raise BadMeshException()

        if pad:
            voxels = np.pad(voxels, 1, mode='constant', constant_values=1)

        return voxels

    def sample_sdf_near_surface(self,
                                number_of_points=500000,
                                use_scans=True,
                                sign_method='normal',
                                normal_sample_count=11,
                                min_size=0):
        query_points = []
        surface_sample_count = int(number_of_points * 47 / 50) // 2
        surface_points = self.get_random_surface_points(surface_sample_count,
                                                        use_scans=use_scans)
        query_points.append(
            surface_points +
            np.random.normal(scale=0.0025, size=(surface_sample_count, 3)))
        query_points.append(
            surface_points +
            np.random.normal(scale=0.00025, size=(surface_sample_count, 3)))

        unit_sphere_sample_count = number_of_points - surface_sample_count * 2
        unit_sphere_points = np.random.uniform(-1,
                                               1,
                                               size=(unit_sphere_sample_count *
                                                     2, 3))
        unit_sphere_points = unit_sphere_points[
            np.linalg.norm(unit_sphere_points, axis=1) < 1]
        query_points.append(unit_sphere_points[:unit_sphere_sample_count, :])
        query_points = np.concatenate(query_points).astype(np.float32)

        if sign_method == 'normal':
            sdf = self.get_sdf_in_batches(query_points,
                                          use_depth_buffer=False,
                                          sample_count=normal_sample_count)
        elif sign_method == 'depth':
            sdf = self.get_sdf_in_batches(query_points,
                                          use_depth_buffer=True)
        else:
            raise ValueError(
                'Unknown sign determination method: {:s}'.format(sign_method))

        if min_size > 0:
            model_size = np.count_nonzero(
                sdf[-unit_sphere_sample_count:] < 0) / unit_sphere_sample_count
            if model_size < min_size:
                raise BadMeshException()

        return query_points, sdf

    def show(self):
        scene = pyrender.Scene()
        scene.add(pyrender.Mesh.from_points(self.points, normals=self.normals))
        pyrender.Viewer(scene, use_raymond_lighting=True, point_size=2)

    def is_outside(self, points):
        result = None
        for scan in self.scans:
            if result is None:
                result = scan.is_visible(points)
            else:
                result = np.logical_or(result, scan.is_visible(points))
        return result
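
The sign test inside get_sdf votes with surface normals: a query point is inside when the vector from a surface sample to the query opposes that sample's outward normal for a majority of the k neighbors. A standalone sketch of the rule on hypothetical data:

import numpy as np

query = np.array([0.0, 0.0, 0.0])                                 # one query point
closest_points = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])
normals = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])    # outward normals

direction_to_surface = query - closest_points
votes_inside = np.einsum('ij,ij->i', direction_to_surface, normals) < 0
print(votes_inside.sum() > len(votes_inside) * 0.5)   # True: the origin is inside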
Example #35
File: vis_tsne.py Project: yzhao062/DCSO
    ])

    # generate threshold sum
    target_test_threshold = np.sum(test_scores_norm.clip(0), axis=1)
    test_target_list.append(target_test_threshold)
    method_list.append('threshold')

    # generate average of maximum (AOM) and maximum of average (MOA)
    target_test_aom = aom(test_scores_norm, n_buckets, n_clf)
    target_test_moa = moa(test_scores_norm, n_buckets, n_clf)
    test_target_list.extend([target_test_aom, target_test_moa])
    method_list.extend(['aom', 'moa'])
    ###################################################################
    # use mean as the pseudo target
    for k in final_k_list:
        tree = KDTree(X_train_norm)
        dist_arr, ind_arr = tree.query(X_test_norm, k=k)

        m_list = [
            'a_dist_d', 'a_dist_r', 'a_dist_n', 'a_pear_d', 'a_pear_r',
            'a_pear_n'
        ]

        # initialize different buckets
        pred_scores_best = np.zeros([X_test.shape[0], len(m_list)])
        pred_scores_max_d = np.zeros([X_test.shape[0], len(m_list)])
        pred_scores_max_f5 = np.zeros([X_test.shape[0], len(m_list)])
        pred_scores_max_f10 = np.zeros([X_test.shape[0], len(m_list)])
        pred_scores_max_f15 = np.zeros([X_test.shape[0], len(m_list)])

        for i in range(X_test.shape[0]):  # X_test_norm.shape[0]
Example #36
File: knn.py Project: mugurd/pyod
class KNN(BaseDetector):
    # noinspection PyPep8
    """kNN class for outlier detection.
    For an observation, its distance to its kth nearest neighbor could be
    viewed as the outlying score. It could be viewed as a way to measure
    the density. See :cite:`ramaswamy2000efficient,angiulli2002fast` for
    details.

    Three kNN detectors are supported:
    largest: use the distance to the kth neighbor as the outlier score
    mean: use the average of all k neighbors as the outlier score
    median: use the median of the distance to k neighbors as the outlier score

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    n_neighbors : int, optional (default = 5)
        Number of neighbors to use by default for k neighbors queries.

    method : str, optional (default='largest')
        {'largest', 'mean', 'median'}

        - 'largest': use the distance to the kth neighbor as the outlier score
        - 'mean': use the average of all k neighbors as the outlier score
        - 'median': use the median of the distance to k neighbors as the
          outlier score

    radius : float, optional (default = 1.0)
        Range of parameter space to use by default for `radius_neighbors`
        queries.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use BallTree
        - 'kd_tree' will use KDTree
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or KDTree.  This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree.  The optimal value depends on the
        nature of the problem.

    metric : string or callable, default 'minkowski'
        metric to use for distance computation. Any metric from scikit-learn
        or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating the
        distance between them. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        Distance matrices are not supported.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
          'manhattan']

        - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
          'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
          'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
          'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
          'sqeuclidean', 'yule']

        See the documentation for scipy.spatial.distance for details on these
        metrics.

    p : integer, optional (default = 2)
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
        See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.
        Affects only kneighbors and kneighbors_graph methods.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """
    def __init__(self,
                 contamination=0.1,
                 n_neighbors=5,
                 method='largest',
                 radius=1.0,
                 algorithm='auto',
                 leaf_size=30,
                 metric='minkowski',
                 p=2,
                 metric_params=None,
                 n_jobs=1,
                 **kwargs):
        super(KNN, self).__init__(contamination=contamination)
        self.n_neighbors = n_neighbors
        self.method = method
        self.radius = radius
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs

        self.neigh_ = NearestNeighbors(n_neighbors=self.n_neighbors,
                                       radius=self.radius,
                                       algorithm=self.algorithm,
                                       leaf_size=self.leaf_size,
                                       metric=self.metric,
                                       p=self.p,
                                       metric_params=self.metric_params,
                                       n_jobs=self.n_jobs,
                                       **kwargs)

    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """

        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.tree_ = KDTree(X, leaf_size=self.leaf_size, metric=self.metric)
        self.neigh_.fit(X)

        dist_arr, _ = self.neigh_.kneighbors(n_neighbors=self.n_neighbors,
                                             return_distance=True)
        dist = self._get_dist_by_method(dist_arr)

        self.decision_scores_ = dist.ravel()
        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self,
                        ['tree_', 'decision_scores_', 'threshold_', 'labels_'])

        X = check_array(X)

        # initialize the output score
        pred_scores = np.zeros([X.shape[0], 1])

        for i in range(X.shape[0]):
            x_i = X[i, :]
            x_i = np.asarray(x_i).reshape(1, x_i.shape[0])

            # get the distance of the current point
            dist_arr, _ = self.tree_.query(x_i, k=self.n_neighbors)
            dist = self._get_dist_by_method(dist_arr)
            pred_score_i = dist[-1]

            # record the current item
            pred_scores[i, :] = pred_score_i

        return pred_scores.ravel()

    def _get_dist_by_method(self, dist_arr):
        """Internal function to decide how to process passed in distance array

        Parameters
        ----------
        dist_arr : numpy array of shape (n_samples, n_neighbors)
            Distance matrix.

        Returns
        -------
        dist : numpy array of shape (n_samples,)
            The outlier scores by distance.
        """

        if self.method == 'largest':
            return dist_arr[:, -1]
        elif self.method == 'mean':
            return np.mean(dist_arr, axis=1)
        elif self.method == 'median':
            return np.median(dist_arr, axis=1)
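
On a toy distance matrix the three reduction modes of _get_dist_by_method behave as follows (standalone illustration):

import numpy as np

dist_arr = np.array([[0.1, 0.4, 0.9],
                     [0.2, 0.3, 0.5]])   # k=3 neighbor distances for 2 samples

print(dist_arr[:, -1])               # 'largest': [0.9 0.5]
print(np.mean(dist_arr, axis=1))     # 'mean':    [0.46666667 0.33333333]
print(np.median(dist_arr, axis=1))   # 'median':  [0.4 0.3]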
Example #37
# dist(a, b) = np.sqrt((a.x - b.x)**2 +
#                      (a.y - b.y)**2 +
#                      alpha*(a.theta - b.theta)**2)


def dist_func(a, b):
    alpha = 1
    return np.sqrt((a[0] - b[0])**2 + (a[1] - b[1])**2 + alpha *
                   (a[2] - b[2])**2)


pyfunc = DistanceMetric.get_metric("pyfunc", func=dist_func)
model_states, X = make_states()
for i in range(X.shape[0]):
    print(X[i, :])
print("TREE TIME")
tree = KDTree(X, leaf_size=4, metric="euclidean")
pts = np.array([(0, 0, 0)])
dist, ind = tree.query(pts, k=1)
for i in ind:
    print(X[i])
    print(i.item())
    print(model_states[i.item()])

# print(dist)
# print(KDTree.valid_metrics)
# a = np.empty((5, 5, 3))
# b = np.empty((3, 5, 5))
#
# r = 0
# for i in range(a.shape[0]):
#     for j in range(a.shape[1]):
    'primary_focus_subject_Nutrition', 'primary_focus_subject_Other',
    'primary_focus_subject_Parent Involvement',
    'primary_focus_subject_Performing Arts',
    'primary_focus_subject_Social Sciences',
    'primary_focus_subject_Special Needs', 'primary_focus_subject_Team Sports',
    'primary_focus_subject_Visual Arts', 'poverty_level_high poverty',
    'poverty_level_highest poverty', 'poverty_level_low poverty',
    'poverty_level_moderate poverty', 'grade_level_Grades 3-5',
    'grade_level_Grades 6-8', 'grade_level_Grades 9-12',
    'grade_level_Grades PreK-2', 'school_metro_rural', 'school_metro_suburban',
    'school_metro_urban', 'resource_type_Books', 'resource_type_Other',
    'resource_type_Supplies', 'resource_type_Technology',
    'resource_type_Trips', 'resource_type_Visitors'
]

tree = KDTree(lookup[X], metric="chebyshev")

#---------- URLS AND WEB PAGES -------------#
app = flask.Flask(__name__)


@app.route("/")
def viz_page():
    with open("dc_prediction.html", 'r') as viz_file:
        return viz_file.read()


@app.route("/score", methods=["POST"])
def score():
    data = flask.request.json
    x = np.matrix(data["example"])
    loc, keypoints = blob_detect(gray_crop)
    # print(loc.shape)
    # print(loc)
    # im_with_keypoints = cv2.drawKeypoints(gray_crop, keypoints, np.array([]),
    #                                       (0, 0, 255), cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
    # cv2.imshow('keypoints', im_with_keypoints)

    if count == 0:

        loc_0 = loc.copy()
        recent_loc = loc.copy()
    elif count > 0:
        print('===========frame: {}================='.format(count))
        # print(loc_0[1,:])
        kdt = KDTree(loc, leaf_size=30, metric='euclidean')
        dist, ind = kdt.query(recent_loc, k=1)
        thd = (dist < 14) * 1
        thd_nz = np.where(thd)[0]
        # update point if close enough point are detected
        recent_loc[thd_nz] = np.reshape(loc[ind[thd_nz]], (len(thd_nz), 2))

        # visualize the displacement field
        loc_v = 2 * recent_loc - loc_0  # diff vector

        img_rgb = cv2.cvtColor(gray_crop, cv2.COLOR_GRAY2RGB)
        # draw image and save vectors
        for i in range(0, len(loc_0)):
            cv2.arrowedLine(
                img_rgb, (int(np.around(
                    recent_loc[i, 0])), int(np.around(recent_loc[i, 1]))),
Example #40
def get_top_n():
    ref_meta = load_csv(REF_CSV)
    query_meta = load_csv(QUERY_CSV)
    full_ref_xy = get_xy(ref_meta)
    full_query_xy = get_xy(query_meta)
    num_q = full_query_xy.shape[0]

    pca_f = np.array(load_pickle(PCA_LV_PICKLE))
    full_ref_f = np.array(load_pickle(REF_LV_PICKLE))
    full_query_f = np.array(load_pickle(QUERY_LV_PICKLE))

    full_xy_dists = pairwise_distances(full_query_xy,
                                       full_ref_xy,
                                       metric='euclidean')

    for d in DIMS:

        print(d)
        pca = PCA(whiten=True, n_components=d)
        pca = pca.fit(pca_f)
        pca_ref_f = pca.transform(full_ref_f)
        pca_query_f = pca.transform(full_query_f)

        for l in L:
            print(l)

            out_folder = os.path.join(OUT_ROOT, 'l{}_dim{}'.format(l, d))
            mkdir(out_folder)
            name = ''.join(os.path.basename(QUERY_LV_PICKLE).split('.')[:-1])
            out_pickle = os.path.join(out_folder, '{}.pickle'.format(name))

            if os.path.exists(out_pickle):
                print('{} already exists. Skipping.'.format(out_pickle))
                continue

            ref_idx = [0]
            for i in range(len(full_ref_xy)):
                if sum((full_ref_xy[i, :] - full_ref_xy[ref_idx[-1], :])**
                       2) >= l**2:
                    ref_idx.append(i)

            if len(ref_idx) < N:
                continue

            ref_f = np.array([pca_ref_f[i, :] for i in ref_idx])
            xy_dists = np.array([full_xy_dists[:, i]
                                 for i in ref_idx]).transpose()

            print('Building tree')
            ref_tree = KDTree(ref_f)

            print('Retrieving')
            top_f_dists, top_i = ref_tree.query(pca_query_f,
                                                k=N,
                                                return_distance=True,
                                                sort_results=True)

            top_g_dists = [[xy_dists[q, r] for r in top_i[q, :]]
                           for q in range(num_q)]

            gt_i = np.argmin(xy_dists, axis=1)
            gt_g_dist = np.min(xy_dists, axis=1)

            # Translate to original indices
            top_i = [[ref_idx[r] for r in top_i[q, :]] for q in range(num_q)]
            gt_i = [ref_idx[r] for r in gt_i]

            save_pickle(
                [top_i, top_g_dists, top_f_dists, gt_i, gt_g_dist, ref_idx],
                out_pickle)
Example #41
    xvals = np.random.uniform(xmin, xmax, num_samples)
    yvals = np.random.uniform(ymin, ymax, num_samples)
    zvals = np.random.uniform(zmin, zmax, num_samples)

    zvalsMax = np.max(zvals)

    points = np.array(list(zip(xvals, yvals, zvals)))

    #points[:10]
    #for point in points:
    #    if not collides(polygons, point):
    #        to_keep.append(point)

    nodes = []
    #points = np.random.random((100, 3))  # 10 points in 3 dimensions
    tree = KDTree(data[:, :3])

    for p in points:
        idxs = tree.query([p], k=1, return_distance=False)[0]
        print(idxs)
        if not collides(polygons[idxs[0]], p):
            nodes.append(p)
    # idxs = tree.query([points[1]], k=3, return_distance=False)[0]
    # print(idxs)

    # idxs = tree.query([points[0]], k=3, return_distance=False)[0]
    # TODO: connect nodes
    # Suggested method
    # 1) cast nodes into a graph called "g" using networkx
    # 2) write a method "can_connect()" that:
    # casts two points as a shapely LineString() object
Example #42
def createBatches(xyz, data, label, batchsize=2048):
    """ Create batches from numpy array with Nearest Neighbor approach.
        Leftover points are discarded. All input array should have the same amount of points.
    Input:
    Numpy Array xyz, this array is used to construct KDTree and determine nearest neighbors (Usually X,Y,Z coordinates) (num_points, 3)
    Numpy Array data, with features (num_points, num_features)
    Numpy Array label, with labels (num_points, )
    Int batchsize, with batchsize (default: 2048)
    Return:
    Tuple of
    Numpy Array, with original xyz coordinates for visul (num_batches, batchsize, 3)
    Numpy Array, with batches of data (num_batches, batchsize, num_features)
    Numpy Array, with labels (num_batches, batchsize, )
    """
    xyz_batches = []
    data_batches = []
    label_batches = []

    num_batches = xyz.shape[0] // batchsize  # floor division to get num batches

    # debug output
    print("Number of Batches: " + str(num_batches))

    start = time.time()

    for i in range(num_batches):
        # find points for each batch starting with next point in data
        #start = time.time()
        #tree = KDTree(xyz, leaf_size=xyz.shape[0])
        #end = time.time()

        #print("build time: " + str(end - start))
        #
        #start = time.time()
        #indices = tree.query(xyz[:1], k=batchsize, return_distance=False, sort_results=True)
        #end = time.time()

        #print("query time: " + str(end - start))

        tree = KDTree(xyz, leaf_size=xyz.shape[0])
        indices = tree.query(xyz[:1],
                             k=batchsize,
                             return_distance=False,
                             sort_results=True)

        # append batch to stores
        # np squeez to get rid of single dimensional shape entries
        # before (1, batchsize, 3) -> after (batchsize, 3)
        xyz_batches.append(np.squeeze(xyz[indices]))
        data_batches.append(np.squeeze(data[indices]))
        label_batches.append(np.squeeze(label[indices]))

        # remove allocated indices to prepare next iteration
        data = np.delete(data, indices, axis=0)
        label = np.delete(label, indices, axis=0)
        xyz = np.delete(xyz, indices, axis=0)

        # to monitor progress for huge input sets
        if i % 10 == 0:
            print(str(i / num_batches))

    # convert lists to numpy array and return tuple
    xyz = np.asarray(xyz_batches, dtype=np.float64)
    data = np.asarray(data_batches, dtype=np.float64)
    label = np.asarray(label_batches, dtype=np.int8)

    # create and return tuple
    return (xyz, data, label)
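
A usage sketch for createBatches (random toy arrays, shapes assumed for illustration):

import numpy as np

xyz = np.random.rand(5000, 3)
data = np.random.rand(5000, 6)
label = np.random.randint(0, 4, 5000)

xyz_b, data_b, label_b = createBatches(xyz, data, label, batchsize=2048)
print(xyz_b.shape, data_b.shape, label_b.shape)  # (2, 2048, 3) (2, 2048, 6) (2, 2048)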
        embed = embed_model.predict([x, m], batch_size=100, verbose=1)
        embed_dict[i] = embed
        del x, y, m
        np.save(filename, embed)
    del sequence_dict[i]
    print('embedded', i, rev_label_dict[i])

    #embed_dict[i] = embed_dict[i][0:1000]

del sequence_dict, model, embed_model
result = []

tree_dict = dict()

for i in range(N):
    tree_dict[i] = KDTree(embed_dict[i], leaf_size=10)
    print('tree', i)


def distance(embed, tree, embed_name, tree_name):
    path = "/mnt/data/computervision/tara/results64/"
    dist_filename = path + embed_name + "_" + tree_name + "_distances.npy"
    ind_filename = path + embed_name + "_" + tree_name + "_indices.npy"
    if os.path.exists(dist_filename):
        dists = np.load(dist_filename)
        indices = np.load(ind_filename)
    else:
        (dists, indices) = tree.query(embed, k=1)
        np.save(dist_filename, dists)
        np.save(ind_filename, indices)
    dist = np.mean(dists)
Example #44
File: KNN_PA3.py  Project: tkonz/KNN
class knn():
    def fit(self, xtrain, ytrain, k, tree=True):
        self.xtrain = xtrain
        self.ytrain = ytrain
        self.k = k
        self.tree = tree
        self.correct = 0

    def dist(self, a, b):
        return np.linalg.norm(a - b)

    def closest(self, row, k):
        best_dist = self.dist(row, self.xtrain[0])
        best_indx = 0

        for i in range(1, len(self.xtrain)):
            dist = self.dist(row, self.xtrain[i])
            if dist < best_dist:
                best_dist = dist
                best_indx = i
            #if i % 100 == 0:
            #print("Iteration ", i)
        #print(self.ytrain[best_indx])
        return self.ytrain[best_indx]

    def predictKD(self, xtest, k):
        self.predictions = []
        self.kdtree = KDTree(self.xtrain, leaf_size=40)
        dist, ind = self.kdtree.query(xtest, k=self.k)
        self.predictions = self.ytrain[ind[:, 0]]
        self.predictions = np.squeeze(self.predictions)
        return self.predictions

    def predict(self, xtest, k):
        self.predictions = []
        for row in xtest:
            label = self.closest(row, k)
            self.predictions.append(label)
        return self.predictions

    def accuracy_score(self, ytrue):
        self.correct = 0
        for i in range(len(ytrue)):
            if ytrue[i] == self.predictions[i]:
                self.correct += 1
        return (self.correct / float(len(ytrue))) * 100.0

    def get_results(self, ylabel):
        self.ylabel = ylabel
        size = len(self.ylabel)
        conf = confusion_matrix(self.ylabel, self.predictions)
        plt.figure(0).clf()
        plt.imshow(conf)
        print(classification_report(self.ylabel, self.predictions))
        fpr = (len(ylabel) - self.correct) / float(len(ylabel))
        tpr = self.correct / float(len(ylabel))
        plt.figure(1).clf()
        plt.scatter(fpr, tpr, marker='o', label='KNN ROC point')
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        plt.savefig('Log_ROC')
        plt.show()
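
A quick usage sketch for the class above (synthetic two-class data, names made up):

import numpy as np

xtrain = np.random.rand(200, 2)
ytrain = (xtrain[:, 0] > 0.5).astype(int)
xtest = np.random.rand(10, 2)

clf = knn()
clf.fit(xtrain, ytrain, k=3)
print(clf.predictKD(xtest, k=3))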

        # For each point cloud, a subsample of points will be used for the nearest neighbors and training
        sub_npy_file = NEW_PATH / folder / (file_name + '.npy')
        xyz = data[:,:3].astype(np.float32)
        colors = data[:,3:6].astype(np.uint8)

        if folder!=TEST_PATH or LABELS_AVAILABLE_IN_TEST_SET:
            labels = data[:,-1].astype(np.uint8)
            sub_xyz, sub_colors, sub_labels = DP.grid_sub_sampling(xyz, colors, labels, sub_grid_size)
            sub_colors = sub_colors / 255.0
            np.save(sub_npy_file, np.concatenate((sub_xyz, sub_colors, sub_labels), axis=1).T)

        else:
            sub_xyz, sub_colors = DP.grid_sub_sampling(xyz, colors, None, sub_grid_size)
            sub_colors = sub_colors / 255.0
            np.save(sub_npy_file, np.concatenate((sub_xyz, sub_colors), axis=1).T)

        # The search tree is the KD_tree saved for each point cloud
        search_tree = KDTree(sub_xyz)
        kd_tree_file = NEW_PATH / folder / (file_name + '_KDTree.pkl')
        with open(kd_tree_file, 'wb') as f:
            pickle.dump(search_tree, f)

        # The projection is, for each point of the full cloud, the index of its nearest subsampled point
        proj_idx = np.squeeze(search_tree.query(xyz, return_distance=False))
        proj_idx = proj_idx.astype(np.int32)
        proj_save = NEW_PATH / folder / (file_name + '_proj.pkl')
        with open(proj_save, 'wb') as f:
            pickle.dump([proj_idx, labels], f)
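
Because the fitted KDTree object is pickled whole, later runs can reload it instead of rebuilding; a minimal round-trip sketch reusing the names above:

import pickle
with open(kd_tree_file, 'rb') as f:
    search_tree = pickle.load(f)
nearest = search_tree.query(xyz[:5], return_distance=False)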
Example #46
def ConstructMatchingModelRandom(G1, G2, Type, AddTriplet):
    KP = ComputeFeatureDistance(G1.PFeature, G2.PFeature)
    KQ = ComputeKQ(G1, G2, Type)

    NP1 = G1.NofNodes
    NP2 = G2.NofNodes
    nT = int(NP1 * NP2)
    t1 = np.floor(np.random.rand(3, nT) * NP1)
    while (True):
        probFound = False
        for i in range(3):
            ind = (t1[i, :] == t1[(i + 1) % 3, :])
            if (np.sum(ind) != 0):
                idxs = np.nonzero(ind)
                t1[i][idxs] = np.floor(np.random.rand(1, len(idxs[0])) * NP1)
                probFound = True
        if (probFound == False):
            break

    t1 = t1.transpose()
    T = np.sort(t1, axis=1)
    T = T[np.lexsort(np.fliplr(T).T)]
    NRepeated = np.ones(T.shape[0], dtype=int)
    for i in range(1, T.shape[0]):
        if (np.sum(np.abs(T[i] - T[i - 1])) == 0):
            NRepeated[i] = 0

    NRepTri = np.nonzero(NRepeated)
    T = T[NRepTri]
    TF1 = np.zeros([T.shape[0], 3])
    for ti in range(T.shape[0]):
        TF1[ti] = computeTripletsFeatureSinAlpha(G1.P, T[ti])

    NofT2 = G2.NofNodes * (G2.NofNodes - 1) * (G2.NofNodes - 2)
    T2 = np.zeros([NofT2, 3], dtype=int)
    TF2 = np.zeros([6 * NofT2, 3], dtype=float)
    T2Cnt = 0
    for i1 in range(G2.NofNodes):
        for i2 in range(i1 + 1, G2.NofNodes):
            for i3 in range(i2 + 1, G2.NofNodes):
                T2[T2Cnt][0] = i1
                T2[T2Cnt][1] = i2
                T2[T2Cnt][2] = i3
                T2Cnt += 1
    T2 = PermunateTriplets(T2)

    for ti in range(T2.shape[0]):
        TF2[ti] = computeTripletsFeatureSinAlpha(G2.P, T2[ti])

    kdt = KDTree(TF2, metric='euclidean')
    nNN = T.shape[0]
    [distT, indicesT] = kdt.query(TF1, k=nNN, return_distance=True)
    distT = np.exp(-(distT / np.mean(distT)))
    KP = np.exp(-KP)
    NofNodes = G1.NofNodes
    NofStates = intArray(NofNodes)
    for i in range(NofNodes):
        NofStates[i] = NofNodes
    G = CFactorGraph(NofNodes, NofStates)
    bi = doubleArray(NofNodes)
    for ni in range(NofNodes):
        for xi in range(NofNodes):
            bi[xi] = float(KP[ni][xi])
        G.AddNodeBelief(ni, bi)
    nnzEdgeIdx = VecVecInt(KQ.shape[1])
    for ni in range(G2.Edges.shape[0]):
        CurrentAssign = VecInt(2)
        CurrentAssign[0] = int(G2.Edges[ni][0])
        CurrentAssign[1] = int(G2.Edges[ni][1])
        InvCurrentAssign = VecInt(2)
        InvCurrentAssign[0] = int(G2.Edges[ni][1])
        InvCurrentAssign[1] = int(G2.Edges[ni][0])
        nnzEdgeIdx[ni] = CurrentAssign
        nnzEdgeIdx[ni + G2.Edges.shape[0]] = InvCurrentAssign

    for ei in range(KQ.shape[0]):
        CEdgeVec = VecInt(2)
        CEdgeVec[0] = int(G1.Edges[ei][0])
        CEdgeVec[1] = int(G1.Edges[ei][1])
        CurrentNNZV = doubleArray(KQ.shape[1])
        for xij in range(KQ.shape[1]):
            CurrentNNZV[xij] = KQ[ei][xij]
        G.AddGenericGenericSparseFactor(CEdgeVec, nnzEdgeIdx, CurrentNNZV)

    for ti in range(distT.shape[0]):
        CTripletsVec = VecInt(3)
        CTripletsVec[0] = int(T[ti][0])
        CTripletsVec[1] = int(T[ti][1])
        CTripletsVec[2] = int(T[ti][2])
        nnzTripIdx = VecVecInt(distT.shape[1])
        nnzTripV = doubleArray(distT.shape[1])
        for xijk in range(distT.shape[1]):
            cIdxVec = VecInt(3)
            cIdxVec[0] = int(T2[indicesT[ti][xijk]][0])
            cIdxVec[1] = int(T2[indicesT[ti][xijk]][1])
            cIdxVec[2] = int(T2[indicesT[ti][xijk]][2])
            nnzTripIdx[xijk] = cIdxVec
            nnzTripV[xijk] = 6 * distT[ti][xijk]
        G.AddGenericGenericSparseFactor(CTripletsVec, nnzTripIdx, nnzTripV)

    G.AddAuctionFactor()

    return G
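
PermunateTriplets is an external helper that is not shown in this example; a plausible stand-in (purely illustrative, not the project's actual implementation) expands each index triplet into all six of its orderings:

import itertools
import numpy as np

def permunate_triplets(T):
    # Expand every row (i1, i2, i3) into its 6 permutations so that
    # triplet features can be computed for all orderings.
    rows = [list(p) for row in T for p in itertools.permutations(row)]
    return np.array(rows, dtype=int)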
Example #47
    def load_subsampled_clouds(self, subsampling_parameter):
        """
        Presubsample point clouds and load into memory (Load KDTree for neighbors searches
        """

        if 0 < subsampling_parameter <= 0.01:
            raise ValueError(
                'subsampling_parameter too low (should be over 1 cm')

        # Create path for files
        tree_path = join(self.path,
                         'input_{:.3f}'.format(subsampling_parameter))
        if not exists(tree_path):
            makedirs(tree_path)

        # List of training files
        self.train_files = np.sort([
            join(self.train_path, f) for f in listdir(self.train_path)
            if f[-4:] == '.ply'
        ])

        # Add test files
        self.test_files = np.sort([
            join(self.test_path, f) for f in listdir(self.test_path)
            if f[-4:] == '.ply'
        ])
        files = np.hstack((self.train_files, self.test_files))

        # Initiate containers
        self.input_trees = {'training': [], 'validation': [], 'test': []}
        self.input_colors = {'training': [], 'validation': [], 'test': []}
        self.input_labels = {'training': [], 'validation': []}

        # Advanced display
        N = len(files)
        progress_n = 30
        fmt_str = '[{:<' + str(progress_n) + '}] {:5.1f}%'
        print('\nPreparing KDTree for all scenes, subsampled at {:.3f}'.format(
            subsampling_parameter))

        for i, file_path in enumerate(files):

            # Restart timer
            t0 = time.time()

            # get cloud name and split
            cloud_name = file_path.split('/')[-1][:-4]
            cloud_folder = file_path.split('/')[-2]
            if 'train' in cloud_folder:
                if self.all_splits[i] == self.validation_split:
                    cloud_split = 'validation'
                else:
                    cloud_split = 'training'
            else:
                cloud_split = 'test'

            if (cloud_split != 'test'
                    and self.load_test) or (cloud_split == 'test'
                                            and not self.load_test):
                continue

            # Name of the input files
            KDTree_file = join(tree_path, '{:s}_KDTree.pkl'.format(cloud_name))
            sub_ply_file = join(tree_path, '{:s}.ply'.format(cloud_name))

            # Check if inputs have already been computed
            if isfile(KDTree_file):

                # read ply with data
                data = read_ply(sub_ply_file)
                sub_reflectance = np.expand_dims(data['reflectance'], 1)
                if cloud_split == 'test':
                    sub_labels = None
                else:
                    sub_labels = data['class']

                # Read pkl with search tree
                with open(KDTree_file, 'rb') as f:
                    search_tree = pickle.load(f)

            else:

                # Read ply file
                data = read_ply(file_path)
                points = np.vstack(
                    (data['x'], data['y'], data['z'])).astype(np.float32).T
                reflectance = np.expand_dims(data['reflectance'],
                                             1).astype(np.float32)
                if cloud_split == 'test':
                    int_features = None
                else:
                    int_features = data['class']

                # Saturate reflectance
                reflectance = np.minimum(reflectance, 50.0)

                # Subsample cloud
                sub_data = grid_subsampling(points,
                                            features=reflectance,
                                            labels=int_features,
                                            sampleDl=subsampling_parameter)

                # Rescale and saturate float reflectance
                sub_reflectance = sub_data[1] / 50.0

                # Get chosen neighborhoods
                search_tree = KDTree(sub_data[0], leaf_size=50)

                # Save KDTree
                with open(KDTree_file, 'wb') as f:
                    pickle.dump(search_tree, f)

                # Save ply
                if cloud_split == 'test':
                    sub_labels = None
                    write_ply(sub_ply_file, [sub_data[0], sub_reflectance],
                              ['x', 'y', 'z', 'reflectance'])
                else:
                    sub_labels = np.squeeze(sub_data[2])
                    write_ply(sub_ply_file,
                              [sub_data[0], sub_reflectance, sub_labels],
                              ['x', 'y', 'z', 'reflectance', 'class'])

            # Fill data containers
            self.input_trees[cloud_split] += [search_tree]
            self.input_colors[cloud_split] += [sub_reflectance]
            if cloud_split in ['training', 'validation']:
                self.input_labels[cloud_split] += [sub_labels]

            print('', end='\r')
            print(fmt_str.format('#' * (((i + 1) * progress_n) // N),
                                 100 * (i + 1) / N),
                  end='',
                  flush=True)

        # Get number of clouds
        self.num_training = len(self.input_trees['training'])
        self.num_validation = len(self.input_trees['validation'])
        self.num_test = len(self.input_trees['test'])

        # Get validation and test reprojection indices
        self.validation_proj = []
        self.validation_labels = []
        self.test_proj = []
        self.test_labels = []
        i_val = 0
        i_test = 0

        # Advanced display
        N = max(self.num_validation + self.num_test, 1)
        print('', end='\r')
        print(fmt_str.format('#' * progress_n, 100), flush=True)
        print('\nPreparing reprojection indices for validation and test')

        for i, file_path in enumerate(files):

            # get cloud name and split
            cloud_name = file_path.split('/')[-1][:-4]
            cloud_folder = file_path.split('/')[-2]

            # Validation projection and labels
            if (not self.load_test
                ) and 'train' in cloud_folder and self.all_splits[
                    i] == self.validation_split:
                proj_file = join(tree_path, '{:s}_proj.pkl'.format(cloud_name))
                if isfile(proj_file):
                    with open(proj_file, 'rb') as f:
                        proj_inds, labels = pickle.load(f)
                else:

                    # Get original points
                    data = read_ply(file_path)
                    points = np.vstack((data['x'], data['y'], data['z'])).T
                    labels = data['class']

                    # Compute projection inds
                    proj_inds = np.squeeze(
                        self.input_trees['validation'][i_val].query(
                            points, return_distance=False))
                    proj_inds = proj_inds.astype(np.int32)

                    # Save
                    with open(proj_file, 'wb') as f:
                        pickle.dump([proj_inds, labels], f)

                self.validation_proj += [proj_inds]
                self.validation_labels += [labels]
                i_val += 1

            # Test projection
            if self.load_test and 'test' in cloud_folder:
                proj_file = join(tree_path, '{:s}_proj.pkl'.format(cloud_name))
                if isfile(proj_file):
                    with open(proj_file, 'rb') as f:
                        proj_inds = pickle.load(f)
                else:

                    # Get original points
                    data = read_ply(file_path)
                    points = np.vstack((data['x'], data['y'], data['z'])).T

                    # Compute projection inds
                    proj_inds = np.squeeze(
                        self.input_trees['test'][i_test].query(
                            points, return_distance=False))
                    proj_inds = proj_inds.astype(np.int32)

                    # Save
                    with open(proj_file, 'wb') as f:
                        pickle.dump(proj_inds, f)

                self.test_proj += [proj_inds]
                self.test_labels += [np.zeros(0, dtype=np.int32)]
                i_test += 1

            print('', end='\r')
            print(fmt_str.format('#' * (((i_val + i_test) * progress_n) // N),
                                 100 * (i_val + i_test) / N),
                  end='',
                  flush=True)

        print('\n')

        return
def get_top_n():
    # check if complete:
    ld_checkpoints = get_checkpoints('obm')

    ld_cp_names = []
    for cp in ld_checkpoints:
        cp_name = cp.split('/')[-2]
        cp_name = ''.join(os.path.basename(cp_name).split('.'))  # Removing '.'
        cp_name += '_e{}'.format(cp[-1])
        ld_cp_names.append(cp_name)

    if any([x in QUERY_LV_PICKLE for x in ld_cp_names]):
        L = [0.0, 0.3, 1.0, 5.0]
        D = [64, 128, 256, 512, 1024, 2048, 4096]
    else:
        L = [0.0]
        D = [256]

    complete = True
    for l in L:
        for d in D:

            out_folder = os.path.join(OUT_ROOT, 'l{}_dim{}'.format(l, d))
            name = ''.join(os.path.basename(QUERY_LV_PICKLE).split('.')[:-1])
            out_pickle = os.path.join(out_folder, '{}.pickle'.format(name))

            if not os.path.exists(out_pickle):
                complete = False
                break
        if not complete:
            break

    if complete:
        print('Skipping complete {}'.format(QUERY_LV_PICKLE))
        return

    ref_meta = load_csv(REF_CSV)
    query_meta = load_csv(QUERY_CSV)
    full_ref_xy = get_xy(ref_meta)
    full_query_xy = get_xy(query_meta)
    num_q = full_query_xy.shape[0]

    pca_f = np.array(load_pickle(PCA_LV_PICKLE))
    full_ref_f = np.array(load_pickle(REF_LV_PICKLE))
    full_query_f = np.array(load_pickle(QUERY_LV_PICKLE))

    full_xy_dists = pairwise_distances(full_query_xy,
                                       full_ref_xy,
                                       metric='euclidean')

    for d in D:

        print(d)
        pca = PCA(whiten=True, n_components=d)
        pca = pca.fit(pca_f)
        pca_ref_f = pca.transform(full_ref_f)
        pca_query_f = pca.transform(full_query_f)

        for l in L:
            print(l)

            out_folder = os.path.join(OUT_ROOT, 'l{}_dim{}'.format(l, d))
            mkdir(out_folder)
            name = ''.join(os.path.basename(QUERY_LV_PICKLE).split('.')[:-1])
            out_pickle = os.path.join(out_folder, '{}.pickle'.format(name))

            if os.path.exists(out_pickle):
                print('{} already exists. Skipping.'.format(out_pickle))
                continue

            ref_idx = [0]
            for i in range(len(full_ref_xy)):
                if sum((full_ref_xy[i, :] - full_ref_xy[ref_idx[-1], :])**
                       2) >= l**2:
                    ref_idx.append(i)

            if len(ref_idx) < N:
                continue

            ref_f = np.array([pca_ref_f[i, :] for i in ref_idx])
            xy_dists = np.array([full_xy_dists[:, i]
                                 for i in ref_idx]).transpose()

            print('Building tree')
            ref_tree = KDTree(ref_f)

            print('Retrieving')
            top_f_dists, top_i = ref_tree.query(pca_query_f,
                                                k=N,
                                                return_distance=True,
                                                sort_results=True)

            top_g_dists = [[xy_dists[q, r] for r in top_i[q, :]]
                           for q in range(num_q)]

            gt_i = np.argmin(xy_dists, axis=1)
            gt_g_dist = np.min(xy_dists, axis=1)

            # Translate to original indices
            top_i = [[ref_idx[r] for r in top_i[q, :]] for q in range(num_q)]
            gt_i = [ref_idx[r] for r in gt_i]

            save_pickle(
                [top_i, top_g_dists, top_f_dists, gt_i, gt_g_dist, ref_idx],
                out_pickle)
Example #49
        cat_neighbors = cat_neighbors_z_slice[abs(cat_neighbors_z_slice['RA'] -
                                                  gal['RA']) < 0.7 / dis /
                                              np.pi * 180]
        cat_neighbors = cat_neighbors[
            abs(cat_neighbors['DEC'] - gal['DEC']) < 0.7 / dis / np.pi * 180]
        if len(cat_neighbors) == 0:  # central gals which has no companion
            coord_random_list, radial_bkg = bkg(cat_neighbors_z_slice,
                                                coord_massive_gal,
                                                mode=mode)
            radial_dist_bkg += radial_bkg
            cat_random_copy = cut_random_cat(cat_random_copy,
                                             coord_random_list)
            continue
        else:
            ind = KDTree(np.array(cat_neighbors['RA',
                                                'DEC']).tolist()).query_radius(
                                                    [(gal['RA'], gal['DEC'])],
                                                    0.7 / dis / np.pi * 180)
            cat_neighbors = cat_neighbors[ind[0]]
            cat_neighbors = cat_neighbors[
                cat_neighbors['NUMBER'] != gal['NUMBER']]
            if len(cat_neighbors) == 0:  # central gals which has no companion
                coord_random_list, radial_bkg = bkg(cat_neighbors_z_slice,
                                                    coord_massive_gal,
                                                    mode=mode)
                radial_dist_bkg += radial_bkg
                cat_random_copy = cut_random_cat(cat_random_copy,
                                                 coord_random_list)
                continue

        # isolation cut on central
        if gal[mass_keyname] < max(
Example #50
def local_optimize_nn(
    data,
    graph,
    hub_info,
    n_components,
    learning_rate,
    a,
    b,
    gamma,
    negative_sample_rate,
    n_epochs,
    init,
    random_state,
    parallel=False,
    verbose=False,
    label=None,
    k=0,
):

    graph = graph.tocoo()
    graph.sum_duplicates()
    n_vertices = graph.shape[1]

    graph.data[
        hub_info[graph.col] == 2
    ] = 1.0  # current (NNs) -- other (hubs): 1.0 weight
    graph.data[
        hub_info[graph.row] == 2
    ] = 0.0  # current (hubs) -- other (hubs, nns): 0.0 weight (remove)
    graph.data[graph.data < (graph.data.max() / float(n_epochs))] = 0.0
    graph.eliminate_zeros()

    init_data = np.array(init)
    if len(init_data.shape) == 2:
        if np.unique(init_data, axis=0).shape[0] < init_data.shape[0]:
            tree = KDTree(init_data)
            dist, ind = tree.query(init_data, k=2)
            nndist = np.mean(dist[:, 1])
            embedding = init_data + random_state.normal(
                scale=0.001 * nndist, size=init_data.shape
            ).astype(np.float32)
        else:
            embedding = init_data

    epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs)

    head = graph.row
    tail = graph.col

    embedding = (
        10.0
        * (embedding - np.min(embedding, 0))
        / (np.max(embedding, 0) - np.min(embedding, 0))
    ).astype(np.float32, order="C")

    rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    embedding = nn_layout_optimize(
        embedding,
        embedding,
        head,
        tail,
        hub_info,
        n_epochs,
        n_vertices,
        epochs_per_sample,
        a,
        b,
        rng_state,
        gamma=gamma,
        learning_rate=learning_rate,
        negative_sample_rate=negative_sample_rate,
        parallel=parallel,
        verbose=verbose,
        k=k,
        label=label,
    )

    return embedding
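
The KDTree block above handles a degenerate init: when some embedding rows coincide, every point is jittered by Gaussian noise scaled to a small fraction of the mean nearest-neighbor distance. A self-contained sketch of the same trick on toy data:

import numpy as np
from sklearn.neighbors import KDTree

init_data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0]])  # two duplicate rows
if np.unique(init_data, axis=0).shape[0] < init_data.shape[0]:
    tree = KDTree(init_data)
    dist, _ = tree.query(init_data, k=2)
    nndist = np.mean(dist[:, 1])  # mean distance to each point's nearest other point
    rng = np.random.RandomState(0)
    init_data = init_data + rng.normal(scale=0.001 * nndist, size=init_data.shape)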
        interest_indexes = np.where(hash_index == selected_id)[0]
        fuzzy_indexes = np.where((scalar_product <= upper_bound + bucket_residual) &
                                 (scalar_product >= lower_bound - bucket_residual))[0]

        # Restrict the points
        points_ = points[interest_indexes]
        labels_ = labels[interest_indexes]

        # 3.1 Voxelisation of the data
        N = points_.shape[0]
        indexes = np.arange(N)

        t0 = time.time()
        print('Building KDTree...')
        kd = KDTree(points_, metric='minkowski')
        t1 = time.time()
        print('KDTree built in {} sec'.format(t1 - t0))

        # Single-Shot query for cubical voxels
        radius = 0.1

        t0 = time.time()
        neighborhoods_inner_sphere = kd.query_radius(points_, r=radius)
        t1 = time.time()
        print('Query time for computing neighborhoods on all points: {} sec'.
              format(t1 - t0))

        t0 = time.time()
        neighborhoods_outer_sphere = kd.query_radius(points_,
                                                     r=radius * sqrt(3))
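
When only neighbor counts are needed (e.g. for density estimates), query_radius can skip building the index arrays; a small hedged example reusing the tree above:

counts = kd.query_radius(points_, r=radius, count_only=True)
print(counts[:10])  # neighbors inside the inner sphere, per point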
Example #52
def get_nearest_neighbors_from_set(node_set, k):
    tree = KDTree(node_set)
    return tree.query(node_set, k=k)
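
Note that querying a tree with its own construction set returns each point as its own nearest neighbor at distance zero; a toy check:

import numpy as np

node_set = np.random.rand(8, 2)
dist, ind = get_nearest_neighbors_from_set(node_set, k=3)
print(dist[:, 0])  # all zeros: each point matches itself first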
Example #53
    _has_mongo = False
else:
    _has_mongo = True

if __name__ == '__main__':

    client = MongoClient(mongoconnection.server)
    db = client[mongoconnection.db]
    if mongoconnection.passwd is not None:
        db.authenticate(mongoconnection.user, password=mongoconnection.passwd)
    col = db[mongoconnection.col]

    coords = np.load(wind_data_path + '/Coords.npy')
    memo = np.zeros(coords.shape[0])
    query = {"experiment": "rnnseq2seq", "site": ""}
    tree = KDTree(coords, leaf_size=1)
    count = 0
    for i in range(coords.shape[0]):
        # print(i, end=' ')
        # if i%100 == 0:
        #     print()
        # dist= tree.query_radius(coords[0,:].reshape(1, -1), r=0.1, return_distance=True, sort_results=True)
        dist = tree.query_radius(coords[i, :].reshape(1, -1),
                                 r=0.05,
                                 count_only=False,
                                 return_distance=False)[0]
        # print(i, dist)
        try:
            if len(dist) > 1:
                tsum = 0
                for j in dist:
Example #54
def detect_in_scope(vector_list, label_list):
    X = vector_list

    tree = KDTree(X, leaf_size=min(len(X) // 2, 400))
    return tree
Example #55
    def __getitem__(self, index):
        rets = {}
        imgs = np.zeros((self.nViews, *self.OutputSize[::-1]),
                        dtype=np.float32)
        if self.rgbd:
            imgs_rgb = np.zeros((self.nViews, *self.OutputSize[::-1], 3),
                                dtype=np.float32)
        if self.segm:
            segm = np.zeros((self.nViews, 1, *self.OutputSize[::-1]),
                            dtype=np.float32)
            if self.dynamicWeighting:
                dynamicW = np.zeros((self.nViews, 1, *self.OutputSize[::-1]),
                                    dtype=np.float32)
        if self.normal:
            normal = np.zeros((self.nViews, *self.OutputSize[::-1], 3),
                              dtype=np.float32)

        R = np.zeros((self.nViews, 4, 4))
        Q = np.zeros((self.nViews, 7))
        assert (self.nViews == 2)
        ct0, ct1 = self.__getpair__(index)
        imgsPath = []
        basePath = self.base_this
        frameid0 = f"{ct0:06d}"
        frameid1 = f"{ct1:06d}"

        if self.fullsize_rgbdn:
            imgs_rgb_full = np.zeros((self.nViews, 480, 640, 3),
                                     dtype=np.float32)
            imgs_full = np.zeros((self.nViews, 480, 640), dtype=np.float32)
            imgs_full[0] = self.LoadImage(
                os.path.join(basePath, 'obs_depth',
                             '{}.png'.format(frameid0))).copy()
            imgs_full[1] = self.LoadImage(
                os.path.join(basePath, 'obs_depth',
                             '{}.png'.format(frameid1))).copy()
            imgs_rgb_full[0] = self.LoadImage(os.path.join(
                basePath, 'obs_rgb', '{}.png'.format(frameid0)),
                                              depth=False).copy() / 255.
            imgs_rgb_full[1] = self.LoadImage(os.path.join(
                basePath, 'obs_rgb', '{}.png'.format(frameid1)),
                                              depth=False).copy() / 255.
            rets['rgb_full'] = imgs_rgb_full[np.newaxis, :]
            rets['depth_full'] = imgs_full[np.newaxis, :]

        imgs[0] = self.LoadImage(
            os.path.join(basePath, 'depth', '{}.png'.format(frameid0))).copy()
        imgs[1] = self.LoadImage(
            os.path.join(basePath, 'depth', '{}.png'.format(frameid1))).copy()
        dataMask = np.zeros((self.nViews, 1, *self.OutputSize[::-1]),
                            dtype=np.float32)
        dataMask[0, 0, :, :] = (imgs[0] != 0)
        dataMask[1, 0, :, :] = (imgs[1] != 0)
        rets['dataMask'] = dataMask[np.newaxis, :]

        if self.rgbd:
            imgs_rgb[0] = self.LoadImage(os.path.join(
                basePath, 'rgb', '{}.png'.format(frameid0)),
                                         depth=False).copy() / 255.
            imgs_rgb[1] = self.LoadImage(os.path.join(
                basePath, 'rgb', '{}.png'.format(frameid1)),
                                         depth=False).copy() / 255.

        R[0] = np.loadtxt(
            os.path.join(basePath, 'pose', frameid0 + '.pose.txt'))
        R[1] = np.loadtxt(
            os.path.join(basePath, 'pose', frameid1 + '.pose.txt'))
        Q[0, :4] = rot2Quaternion(R[0][:3, :3])
        Q[0, 4:] = R[0][:3, 3]
        Q[1, :4] = rot2Quaternion(R[1][:3, :3])
        Q[1, 4:] = R[1][:3, 3]
        imgsPath.append(f"{basePath}/{ct0:06d}")
        imgsPath.append(f"{basePath}/{ct1:06d}")

        if self.normal:
            tp = self.LoadImage(os.path.join(basePath, 'normal',
                                             '{}.png'.format(frameid0)),
                                depth=False).copy().astype('float')
            mask = (tp == 0).sum(2) < 3
            tp[mask] = tp[mask] / 255. * 2 - 1
            normal[0] = tp
            tp = self.LoadImage(os.path.join(basePath, 'normal',
                                             '{}.png'.format(frameid1)),
                                depth=False).copy().astype('float')
            mask = (tp == 0).sum(2) < 3
            tp[mask] = tp[mask] / 255. * 2 - 1
            normal[1] = tp

        if self.segm:
            tp = (self.LoadImage(os.path.join(basePath, 'semantic_idx',
                                              '{}.png'.format(frameid0)),
                                 depth=False).copy())[:, :, 1]
            segm[0] = tp.reshape(segm[0].shape)
            tp = (self.LoadImage(os.path.join(basePath, 'semantic_idx',
                                              '{}.png'.format(frameid1)),
                                 depth=False).copy())[:, :, 1]

            segm[1] = tp.reshape(segm[1].shape)

            segm_ = np.zeros((self.nViews, 1, *self.OutputSize[::-1]),
                             dtype=np.float32)
            segm_[0] = segm[0]
            segm_[1] = segm[1]
            segm_ = segm_[np.newaxis, :]

        if self.denseCorres:
            # get 3d point cloud for each pano
            pcs, masks = self.Pano2PointCloud(
                imgs[0],
                self.representation)  # be aware of the order of returned pc!!!
            pct, maskt = self.Pano2PointCloud(imgs[1], self.representation)

            #pct = np.matmul(R[0],np.matmul(np.linalg.inv(R[1]),np.concatenate((pct,np.ones([1,pct.shape[1]])))))[:3,:]
            pct = np.matmul(np.linalg.inv(R[1]),
                            np.concatenate(
                                (pct, np.ones([1, pct.shape[1]]))))[:3, :]
            pcs = np.matmul(np.linalg.inv(R[0]),
                            np.concatenate(
                                (pcs, np.ones([1, pcs.shape[1]]))))[:3, :]
            # find correspondence using kdtree
            tree = KDTree(pct.T)
            IdxQuery = np.random.choice(range(pcs.shape[1]), 5000)
            # sample 5000 query points
            pcsQuery = pcs[:, IdxQuery]
            nearest_dist, nearest_ind = tree.query(pcsQuery.T, k=1)
            hasCorres = (nearest_dist < 0.08)
            idxTgtNeg = []
            idxSrc = self.PanoIdx(masks[IdxQuery[np.where(hasCorres)[0]]],
                                  imgs.shape[1], imgs.shape[2],
                                  self.representation)
            idxTgt = self.PanoIdx(maskt[nearest_ind[hasCorres]], imgs.shape[1],
                                  imgs.shape[2], self.representation)

            if hasCorres.sum() < 200:
                rets['denseCorres'] = {
                    'idxSrc': np.zeros([1, 500, 2]),
                    'idxTgt': np.zeros([1, 500, 2]),
                    'valid': np.array([0]),
                    'idxTgtNeg': idxTgtNeg
                }

            else:
                # only pick 500 correspondences per pair
                idx500 = np.random.choice(range(idxSrc.shape[0]), 500)
                idxSrc = idxSrc[idx500][np.newaxis, :]
                idxTgt = idxTgt[idx500][np.newaxis, :]

                rets['denseCorres'] = {
                    'idxSrc': idxSrc,
                    'idxTgt': idxTgt,
                    'valid': np.array([1]),
                    'idxTgtNeg': idxTgtNeg
                }

        # reprojct the second image into the first image plane
        if self.reproj:

            assert (imgs.shape[1] == 160 and imgs.shape[2] == 640)
            h = imgs.shape[1]

            pct, mask = util.depth2pc(
                imgs[1, 80 - 33:80 + 33, 160 + 80 - 44:160 + 80 + 44],
                'scannet')  # be aware of the order of returned pc!!!

            colorpct = imgs_rgb[1, 80 - 33:80 + 33, 160 + 80 - 44:160 + 80 +
                                44, :].reshape(-1, 3)[mask]
            normalpct = normal[1, 80 - 33:80 + 33,
                               160 + 80 - 44:160 + 80 + 44, :].reshape(-1,
                                                                       3)[mask]
            depthpct = imgs[1, 80 - 33:80 + 33,
                            160 + 80 - 44:160 + 80 + 44].reshape(-1)[mask]

            R_this = np.matmul(R[0], np.linalg.inv(R[1]))
            R_this_p = R_this.copy()
            dR = util.randomRotation(epsilon=0.1)
            dRangle = angular_distance_np(dR[np.newaxis, :],
                                          np.eye(3)[np.newaxis, :])[0]

            R_this_p[:3, :3] = np.matmul(dR, R_this_p[:3, :3])
            R_this_p[:3, 3] += np.random.randn(3) * 0.1

            t2s_dr = np.matmul(R_this, np.linalg.inv(R_this_p))

            pct_reproj = np.matmul(
                R_this_p, np.concatenate(
                    (pct.T, np.ones([1, pct.shape[0]]))))[:3, :]
            pct_reproj_org = np.matmul(
                R_this, np.concatenate(
                    (pct.T, np.ones([1, pct.shape[0]]))))[:3, :]
            flow = pct_reproj_org - pct_reproj
            normalpct = np.matmul(R_this_p[:3, :3], normalpct.T).T
            flow = flow.T
            t2s_rgb = self.reproj_helper(pct_reproj_org, colorpct,
                                         imgs_rgb[0].shape, 'color')
            t2s_rgb_p = self.reproj_helper(pct_reproj, colorpct,
                                           imgs_rgb[0].shape, 'color')
            t2s_n_p = self.reproj_helper(pct_reproj, normalpct,
                                         imgs_rgb[0].shape, 'normal')
            t2s_d_p = self.reproj_helper(pct_reproj, depthpct,
                                         imgs_rgb[0].shape[:2], 'depth')

            t2s_flow_p = self.reproj_helper(pct_reproj, flow,
                                            imgs_rgb[0].shape, 'color')
            t2s_mask_p = (t2s_d_p != 0).astype('int')

            pct, mask = util.depth2pc(
                imgs[0, 80 - 33:80 + 33, 160 + 80 - 44:160 + 80 + 44],
                'scannet')  # be aware of the order of returned pc!!!

            colorpct = imgs_rgb[0, 80 - 33:80 + 33, 160 + 80 - 44:160 + 80 +
                                44, :].reshape(-1, 3)[mask]
            normalpct = normal[0, 80 - 33:80 + 33,
                               160 + 80 - 44:160 + 80 + 44, :].reshape(-1,
                                                                       3)[mask]
            depthpct = imgs[0, 80 - 33:80 + 33,
                            160 + 80 - 44:160 + 80 + 44].reshape(-1)[mask]

            R_this = np.matmul(R[1], np.linalg.inv(R[0]))
            R_this_p = R_this.copy()
            dR = util.randomRotation(epsilon=0.1)
            dRangle = angular_distance_np(dR[np.newaxis, :],
                                          np.eye(3)[np.newaxis, :])[0]

            R_this_p[:3, :3] = np.matmul(dR, R_this_p[:3, :3])
            R_this_p[:3, 3] += np.random.randn(3) * 0.1
            s2t_dr = np.matmul(R_this, np.linalg.inv(R_this_p))
            pct_reproj = np.matmul(
                R_this_p, np.concatenate(
                    (pct.T, np.ones([1, pct.shape[0]]))))[:3, :]
            pct_reproj_org = np.matmul(
                R_this, np.concatenate(
                    (pct.T, np.ones([1, pct.shape[0]]))))[:3, :]
            flow = pct_reproj_org - pct_reproj
            # assume always observe the second view(right view)

            normalpct = np.matmul(R_this_p[:3, :3], normalpct.T).T
            flow = flow.T

            s2t_rgb = self.reproj_helper(pct_reproj_org, colorpct,
                                         imgs_rgb[0].shape, 'color')
            s2t_rgb_p = self.reproj_helper(pct_reproj, colorpct,
                                           imgs_rgb[0].shape, 'color')
            s2t_n_p = self.reproj_helper(pct_reproj, normalpct,
                                         imgs_rgb[0].shape, 'normal')
            s2t_d_p = self.reproj_helper(pct_reproj, depthpct,
                                         imgs_rgb[0].shape[:2], 'depth')
            s2t_flow_p = self.reproj_helper(pct_reproj, flow,
                                            imgs_rgb[0].shape, 'color')
            s2t_mask_p = (s2t_d_p != 0).astype('int')

            # compute an envelop box
            try:
                tp = np.where(t2s_d_p.sum(0))[0]
                w0, w1 = tp[0], tp[-1]
                tp = np.where(t2s_d_p.sum(1))[0]
                h0, h1 = tp[0], tp[-1]
            except:
                w0, h0 = 0, 0
                w1, h1 = t2s_d_p.shape[1] - 1, t2s_d_p.shape[0] - 1
            t2s_box_p = np.zeros(t2s_d_p.shape)
            t2s_box_p[h0:h1, w0:w1] = 1

            try:
                tp = np.where(s2t_d_p.sum(0))[0]
                w0, w1 = tp[0], tp[-1]
                tp = np.where(s2t_d_p.sum(1))[0]
                h0, h1 = tp[0], tp[-1]
            except:
                w0, h0 = 0, 0
                w1, h1 = s2t_d_p.shape[1] - 1, s2t_d_p.shape[0] - 1
            s2t_box_p = np.zeros(s2t_d_p.shape)
            s2t_box_p[h0:h1, w0:w1] = 1

            rets['proj_dr'] = np.stack((t2s_dr, s2t_dr), 0)[np.newaxis, :]
            rets['proj_flow'] = np.stack((t2s_flow_p, s2t_flow_p),
                                         0).transpose(0, 3, 1,
                                                      2)[np.newaxis, :]
            rets['proj_rgb'] = np.stack((t2s_rgb, s2t_rgb),
                                        0).transpose(0, 3, 1, 2)[np.newaxis, :]
            rets['proj_rgb_p'] = np.stack(
                (t2s_rgb_p, s2t_rgb_p), 0).transpose(0, 3, 1, 2)[np.newaxis, :]
            rets['proj_n_p'] = np.stack((t2s_n_p, s2t_n_p),
                                        0).transpose(0, 3, 1, 2)[np.newaxis, :]
            rets['proj_d_p'] = np.stack((t2s_d_p, s2t_d_p),
                                        0).reshape(1, 2, 1, t2s_d_p.shape[0],
                                                   t2s_d_p.shape[1])
            rets['proj_mask_p'] = np.stack(
                (t2s_mask_p, s2t_mask_p),
                0).reshape(1, 2, 1, t2s_mask_p.shape[0], t2s_mask_p.shape[1])
            rets['proj_box_p'] = np.stack(
                (t2s_box_p, s2t_box_p), 0).reshape(1, 2, 1, t2s_box_p.shape[0],
                                                   t2s_box_p.shape[1])

        imgs = imgs[np.newaxis, :]
        if self.rgbd:
            imgs_rgb = imgs_rgb[np.newaxis, :].transpose(0, 1, 4, 2, 3)
        if self.normal:
            normal = normal[np.newaxis, :].transpose(0, 1, 4, 2, 3)
        R = R[np.newaxis, :]
        Q = Q[np.newaxis, :]
        if self.segm:
            rets['segm'] = segm_
            if self.dynamicWeighting:
                rets['dynamicW'] = dynamicW[np.newaxis, :]
        rets['interval'] = self.interval_this
        rets['norm'] = normal
        rets['rgb'] = imgs_rgb
        rets['depth'] = imgs
        rets['Q'] = Q
        rets['R'] = R
        rets['imgsPath'] = imgsPath
        return rets
Example #56
File: Practica1.py  Project: JJavier98/MH
def relief(data, tags):
    ####################################### LOOPS ####################################################
    """
	w = np.zeros(data.shape[1])
	closest_enemy_id = -4
	closest_friend_id = -4

	for i in range(data.shape[0]):
		enemy_distance = 999
		friend_distance = 999
		for j in range(data.shape[0]):
			if i != j:
				current_distance = np.linalg.norm(data[i] - data[j])

				if tags[i] == tags[j] and current_distance < friend_distance:
					friend_distance = current_distance
					closest_friend_id = j
				elif tags[i] != tags[j] and current_distance < enemy_distance:
					enemy_distance = current_distance
					closest_enemy_id = j

		w = w + np.abs(data[i] - data[closest_enemy_id]) - np.abs(data[i] - data[closest_friend_id])
	"""
    ######################################### KDTree ##################################################

    w = np.zeros(data.shape[1])
    closest_enemy_id = -4
    closest_friend_id = -4
    ally_found = False
    enemy_found = False

    tree = KDTree(data)
    nearest_ind = tree.query(data, k=data.shape[0],
                             return_distance=False)[:, 1:]

    for i in range(nearest_ind.shape[0]):
        for j in range(nearest_ind.shape[1]):
            if not ally_found and tags[i] == tags[nearest_ind[i, j]]:
                ally_found = True
                closest_friend_id = nearest_ind[i, j]
            elif not enemy_found and tags[i] != tags[nearest_ind[i, j]]:
                enemy_found = True
                closest_enemy_id = nearest_ind[i, j]
            if ally_found and enemy_found:
                break
        ally_found = enemy_found = False
        w = w + np.abs(data[i] - data[closest_enemy_id]) - np.abs(
            data[i] - data[closest_friend_id])

###########################################################################################

    w_max = np.max(w)
    w[w < 0.0] = 0.0
    w /= w_max

    # Commented out so as not to slow down the algorithm
    """
	for i in range(len(w)):
		plt.bar(i,w[i])
	plt.show()
	"""

    return w
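
A toy invocation of relief (synthetic two-class data, illustrative only):

import numpy as np

data = np.random.rand(100, 5)
tags = np.random.randint(0, 2, 100)
w = relief(data, tags)
print(w)  # per-feature weights, clipped to [0, 1]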
Example #57
if fil.crs != WGS:
    fil = fil.to_crs(WGS)
fil = fil.to_crs(UTM)
fil['area'] = fil.area
fil['centroid'] = fil['geometry'].centroid
fil = fil.to_crs(WGS)
fil = fil[['PID', 'centroid', 'area']]

#short = fil[:50000]
short = fil

area_dict = dict(zip(list(short.index), list(short['area'])))
matrix = list(
    zip(short.centroid.apply(lambda x: x.x),
        short.centroid.apply(lambda x: x.y)))
KD_tree = KDTree(matrix)
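
With the centroid tree built, the nearest parcel centroids of any coordinate come back in one call; a minimal sketch with a made-up lon/lat query:

dist, ind = KD_tree.query([(0.0, 0.0)], k=5)  # 5 nearest centroids to a made-up point
print(dist, ind)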


###
def Main(passed_dict):

    # unpack passed dict into local variables for this thread.
    short = passed_dict['df']
    thread_no = passed_dict['thread_no']
    print_thresh = passed_dict['print_thresh']
    save_thresh = passed_dict['save_thresh']

    # set up some counters / timings
    t = time.time()
    counter = 1
Example #58
dataFile = open("mnist.dat", "wb")
for x in x_train:
    dataFile.write(x.tobytes())
dataFile.close()
y_bools = [y % 2 == 0 for y in y_train]
y_str = [str(y) for y in y_train]

df = pd.DataFrame({"y": y_train, "even": y_bools, "name": y_str})
df.index.rename('index', inplace=True)
df.to_csv('mnist.csv')

# KNN data for tests
data = np.memmap("mnist.dat", dtype=np.float32)
data = data.reshape([-1, 784])

tree = KDTree(data, leaf_size=2)
dist, ind = tree.query(data[:100], k=5)

dist, ind = tree.query(np.zeros([1, 784], dtype=np.float32), k=5)

nbrs = {
    "d0": dist[:, 0],
    "d1": dist[:, 1],
    "d2": dist[:, 2],
    "d3": dist[:, 3],
    "d4": dist[:, 4],
    "i0": ind[:, 0],
    "i1": ind[:, 1],
    "i2": ind[:, 2],
    "i3": ind[:, 3],
    "i4": ind[:, 4],
Example #59
class parameter_estimation(object):
    def __init__(self, exclude_FDR = False, salary_growth_outlier_weight = 0.1):
        
        # Read data
        self.data = pd.read_csv('data_cleaned/main_data.txt', sep = '\t')  
        self.school_clustering = pd.read_csv('data_cleaned/school_clustering.txt', sep='\t')
        self.major_list = pd.read_csv('data_cleaned/major_list.txt', sep='\t')
        self.salary_growth = pd.read_csv('data_cleaned/salary_growth_data.txt', sep = '\t')
        self.salary_growth_outlier_weight = salary_growth_outlier_weight

        # Clean columns names
        self.data.columns = [x.lower() for x in self.data.columns]
        self.salary_growth.columns = [x.lower() for x in self.salary_growth.columns]

        if exclude_FDR:
            self.data = self.data[self.data['source']!='FDR Report'].copy()

        # Select data that are in the clustering data
        self.data = self.data[self.data['school_in_clustering'] == 'Y']
        self.salary_growth = self.salary_growth[self.salary_growth['school_in_clustering'] == 'Y']
        # Get sigma calculated from 25 - 75 percentile or Average - Median
        self.data['sigma_qt'] = self.data.apply(lambda row: self._sigma_qt(row),axis=1)
        # This sigma value is used when estimating the median from the mean

        median_average_ratio_t = self.data.query(''' median_salary>0 and average_salary>0 ''')[['median_salary','average_salary']].median()
        self.median_average_ratio = median_average_ratio_t[0]/median_average_ratio_t[1]

        # Get the estimated salary median
        self.data['salary_median'] = self.data.apply(lambda row: self._median(row, self.median_average_ratio),axis=1)
        
        # Identify the schools that have overall records only
        only_all_schools = set(self.data.loc[(pd.isna(self.data['majorcategoryid'])),'school_name_matched']) - set(self.data.loc[(~pd.isna(self.data['majorcategoryid'])),'school_name_matched'])
        self.data.loc[self.data['school_name_matched'].isin(only_all_schools), 'only_all_flag'] = 1
        self.data.fillna({'only_all_flag':0}, inplace=True)
        
        # Add salary growth match flag
        self.school_clustering.loc[self.school_clustering['school_name'].isin(self.salary_growth['school_name_matched']),'matched_flag_growth'] = 1
        self.school_clustering.fillna({'matched_flag_growth':0}, inplace=True)
        
        # Add salary growth else match flag
        self.school_clustering.loc[self.school_clustering['school_name'].isin(self.salary_growth.query('il_flag==0')['school_name_matched']),'matched_flag_growth_else'] = 1
        self.school_clustering.fillna({'matched_flag_growth_else':0}, inplace=True)
        
        # Add salary match flag
        self.school_clustering.loc[self.school_clustering['school_name'].isin(self.data.query('only_all_flag==0')['school_name_matched']),'matched_flag'] = 1
        self.school_clustering.fillna({'matched_flag':0}, inplace=True)
        
        # Add sigma match flag
        sigma_schools = list(self.data.loc[self.data['sigma_qt']>0,'school_name_matched'])
        self.school_clustering.loc[self.school_clustering['school_name'].isin(sigma_schools),'matched_flag_sigma'] = 1
        self.school_clustering.fillna({'matched_flag_sigma':0}, inplace=True)
        
        self.sigma_data = self.data.loc[self.data['sigma_qt']>0].copy()
        self.data.set_index('school_name_matched', inplace=True)
        self.salary_growth.set_index('school_name_matched', inplace=True)
        self.sigma_data.set_index('school_name_matched', inplace=True)
        
        ### create title_to_category_ratio
        self.data['major_category_median'] = self.data.groupby(['school_name_matched','state','school_name','major_category'])['salary_median'].transform(np.mean)
        self.data['ratio'] = self.data['major_category_median'] / self.data['salary_median']
        self.title_to_category_ratio = self.data[self.data['major_title'] != 'all'].groupby(['major_category','major_title'])['ratio'].mean().reset_index()
        self.title_to_category_ratio = pd.merge(self.major_list[['major_title','major_category']], self.title_to_category_ratio, on = ['major_title','major_category'], how = 'left')
        self.title_to_category_ratio = self.title_to_category_ratio.fillna(1)
        
        ### create category_to_school_ratio
        salary_median = self.data.loc[self.data['major_category'] == 'all'].reset_index().groupby('school_name_matched')['salary_median'].mean().reset_index(name = 'school_median')
        category_salary = self.data.loc[self.data['major_category'] != 'all',['major_category','major_title','salary_median']].reset_index().drop_duplicates(keep='first')
        school_median = pd.merge(salary_median, category_salary, on = 'school_name_matched', how = 'inner')
        school_median = school_median[school_median['major_title'] == 'all']
        school_median['ratio'] = school_median['salary_median'] / school_median['school_median']
        self.category_to_school_ratio = school_median.groupby('major_category')['ratio'].mean().reset_index()
        self.category_to_school_ratio = pd.merge(self.major_list[['major_category']].drop_duplicates(),self.category_to_school_ratio, on = ['major_category'], how = 'left')
        self.category_to_school_ratio = self.category_to_school_ratio.fillna(1)
        self.data = self.data.drop(columns = ['major_category_median','ratio'])
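        # Worked example (hypothetical numbers): if a school's rows for the
        # 'engineering' category average a salary_median of 60k while its
        # school-wide ('all') median is 50k, category_to_school_ratio for
        # 'engineering' is ~1.2; find_one_median later multiplies a school-wide
        # median by this ratio when a school has no records for that category.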
        
        ### Initialize functions
        self.build_tree()
        
    def _sigma_qt(self, row):
        # Estimate the log-normal sigma from quantiles, treating
        # salary_min/salary_max as the 25th/75th percentiles:
        # sigma = (ln q75 - ln q25) / (z_0.75 - z_0.25)
        if row['salary_min'] != 0:
            sigma = (np.log(row['salary_max']) - np.log(row['salary_min'])) \
                / (norm.ppf(0.75) - norm.ppf(0.25))
        # Otherwise use the log-normal identity mean = median * exp(sigma^2 / 2),
        # which requires average > median.
        elif row['average_salary'] != 0 and row['median_salary'] != 0 and row['average_salary'] > row['median_salary']:
            sigma = np.sqrt(2 * (np.log(row['average_salary']) - np.log(row['median_salary'])))
        else:
            sigma = 0
        return sigma
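    # Sanity check for the quartile-based sigma above (a sketch, assuming
    # salary_min/salary_max are the 25th/75th percentiles of a log-normal;
    # the numbers are hypothetical, not from the data):
    #
    #     sigma_true = 0.4
    #     q25, q75 = np.exp(11 + sigma_true * norm.ppf([0.25, 0.75]))
    #     sigma_est = (np.log(q75) - np.log(q25)) / (norm.ppf(0.75) - norm.ppf(0.25))
    #     assert np.isclose(sigma_est, sigma_true)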

    def _median(self, row, r):
        # Prefer the reported median; otherwise fall back to the geometric
        # mean of min/max (the median of a log-normal), then to the average
        # salary scaled by r.
        if row['median_salary'] != 0:
            return row['median_salary']
        elif row['salary_min'] != 0:
            return np.exp((np.log(row['salary_min']) + np.log(row['salary_max'])) / 2)
        else:
            # return np.exp(np.log(row['average_salary']) - np.power(max_sigma, 2) / 2)
            return row['average_salary'] * r
    
    def build_tree(self):
        # features used for clustering
        clustering_features = ['state', 'level', 'control', 'long_x', 'lat_y', 'student_count', 'rank_num',
                               'tuition', 'school_city_demo', 'school_city_gdp', 'matched_flag',
                               'matched_flag_growth_else', 'matched_flag_growth', 'matched_flag_sigma']

        clustering_data = self.school_clustering[clustering_features]

        # one-hot encode the categorical features
        cat_vars = ['state', 'level', 'control']
        clustering_data = pd.get_dummies(clustering_data, columns=cat_vars)

        # scale the features to [0, 1]
        scaler = MinMaxScaler(feature_range=(0, 1)).fit(clustering_data)
        clustering_data_trans = pd.DataFrame(scaler.transform(clustering_data), columns=clustering_data.columns)
        clustering_data_trans.index = self.school_clustering['school_name']

        # each tree is built only from the schools that carry the relevant
        # data source; the match flags themselves are excluded so they do
        # not enter the distance metric
        flag_cols = ['matched_flag', 'matched_flag_growth_else', 'matched_flag_growth', 'matched_flag_sigma']
        self.data_train1 = clustering_data_trans[clustering_data_trans['matched_flag'] == 1].drop(columns=flag_cols)
        self.data_train2 = clustering_data_trans[clustering_data_trans['matched_flag_growth_else'] == 1].drop(columns=flag_cols)
        self.data_train3 = clustering_data_trans[clustering_data_trans['matched_flag_growth'] == 1].drop(columns=flag_cols)
        self.data_train4 = clustering_data_trans[clustering_data_trans['matched_flag_sigma'] == 1].drop(columns=flag_cols)
        # the query (test) set is the full school list in every case
        self.data_test1 = clustering_data_trans.drop(columns=flag_cols)
        self.data_test2 = self.data_test1
        self.data_test3 = self.data_test1
        self.data_test4 = self.data_test1

        self.kdt1 = KDTree(np.array(self.data_train1))
        self.kdt2 = KDTree(np.array(self.data_train2))
        self.kdt3 = KDTree(np.array(self.data_train3))
        self.kdt4 = KDTree(np.array(self.data_train4))
        
    def get_salary_neighbors(self, school_name, k = 3):
        a = np.expand_dims(np.array(self.data_test1.loc[school_name]), axis=0)
        _, ind_list = self.kdt1.query(a, k)  
        x = self.data_train1.iloc[ind_list[0,:]].index
        return list(x)
    
    def get_growth_neighbors_else(self, school_name, k = 3):
        a = np.expand_dims(np.array(self.data_test2.loc[school_name]), axis=0)
        _, ind_list = self.kdt2.query(a, k)  
        x = self.data_train2.iloc[ind_list[0,:]].index
        return list(x)
    
    def get_growth_neighbors(self, school_name, k = 3):
        a = np.expand_dims(np.array(self.data_test3.loc[school_name]), axis=0)
        _, ind_list = self.kdt3.query(a, k)  
        x = self.data_train3.iloc[ind_list[0,:]].index
        return list(x)

    def get_sigma_neighbors(self, school_name, k = 3):
        a = np.expand_dims(np.array(self.data_test4.loc[school_name]), axis=0)
        _, ind_list = self.kdt4.query(a, k)  
        x = self.data_train4.iloc[ind_list[0,:]].index
        return list(x)
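    # The four *_neighbors methods above differ only in which KD-tree and
    # train/test frames they touch; a shared helper (a sketch, not part of
    # the original interface) could express them in one place:
    #
    #     def _neighbors(self, tree, train, test, school_name, k=3):
    #         a = np.expand_dims(np.array(test.loc[school_name]), axis=0)
    #         _, ind_list = tree.query(a, k)
    #         return list(train.iloc[ind_list[0, :]].index)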
    
    def find_one_median(self, school_name, major_title, major_category):
        matched_records = self.data.loc[[school_name]]

        ### if major_title is matched ###
        if major_title in list(matched_records.major_title):
            median = matched_records.loc[matched_records['major_title'] == major_title, 'salary_median'].mean()
            salary_similar = list(matched_records.loc[matched_records['major_title'] == major_title, 'salary_median'])
        ### if major_category is matched ###
        elif major_category in np.unique(matched_records.major_category):
            median_temp = matched_records.loc[matched_records['major_category'] == major_category, 'salary_median'].mean()
            ratio = self.title_to_category_ratio.loc[self.title_to_category_ratio['major_title'] == major_title, 'ratio']
            median = median_temp * float(ratio.iloc[0])
            salary_similar = list(matched_records.loc[matched_records['major_category'] == major_category, 'salary_median'])
        ### if neither is matched, scale the school-wide median ###
        else:
            median_temp = matched_records['salary_median'].mean()
            ratio1 = self.category_to_school_ratio.loc[self.category_to_school_ratio['major_category'] == major_category, 'ratio']
            ratio2 = self.title_to_category_ratio.loc[self.title_to_category_ratio['major_title'] == major_title, 'ratio']
            median = float(median_temp) * float(ratio1.iloc[0]) * float(ratio2.iloc[0])
            salary_similar = list(matched_records['salary_median'])

        return float(median), salary_similar

    
    def find_one_sigma(self, school_name, major_title, major_category):
        matched_records = self.sigma_data.loc[[school_name]]
        
        if major_title in list(matched_records.major_title):
            sigma = matched_records.loc[matched_records['major_title'] == major_title, 'sigma_qt'].mean()
        
        elif major_category in np.unique(matched_records.major_category):
            sigma = matched_records.loc[matched_records['major_category'] == major_category, 'sigma_qt'].mean()
        
        else:
            sigma = matched_records['sigma_qt'].mean()
            
        return sigma

    def get_value(self, school, m_title, k):
        # first get the major_category and school state from input
        m_category, state = self.match_input(school, m_title)
        # nearest neighbors drawn from the schools that have salary / sigma data
        similar_schools = self.get_salary_neighbors(school, k)
        self.salary_similar_schools = similar_schools
        similar_schools_sigma = self.get_sigma_neighbors(school, k)
        median_array = []
        similar_median_array = []
        sigma_array = []
        for s1 in similar_schools:
            a,b = self.find_one_median(s1, m_title, m_category)
            median_array.append(a)
            similar_median_array.append(b)

        for s2 in similar_schools_sigma:
            sigma_array.append(self.find_one_sigma(s2, m_title, m_category))
        # conservative estimate: average the two lowest neighbor medians
        median_knn = np.mean(np.sort(np.array(median_array))[:2])
        sigma_knn = np.mean(sigma_array)
        
        if school in self.data[self.data['only_all_flag']==0].index:
            median_self, _ = self.find_one_median(school, m_title, m_category)
            median = 0.8 * median_self + 0.2 * median_knn
        else:
            median = median_knn

        if school in self.sigma_data.index:
            sigma_self = self.find_one_sigma(school, m_title, m_category)
            sigma = 0.8 * sigma_self + 0.2 * sigma_knn
        else:
            sigma = sigma_knn
        
        return median, sigma, similar_median_array
    
    def find_growth(self, school_name, major_title, major_category):
        matched_records = self.salary_growth.loc[[school_name]]
        
        if major_title in list(matched_records.major_title):
            growth = list(matched_records.loc[matched_records['major_title'] == major_title, ['growth_rate_2','growth_rate_3','growth_rate_4','growth_rate_5']].mean())
        
        elif major_category in np.unique(matched_records.major_category):
            growth = list(matched_records.loc[matched_records['major_category'] == major_category, ['growth_rate_2','growth_rate_3','growth_rate_4','growth_rate_5']].mean())
        
        else:
            growth = list(matched_records[['growth_rate_2','growth_rate_3','growth_rate_4','growth_rate_5']].mean())
            
        return growth
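    # find_one_median, find_one_sigma and find_growth all share the same
    # fallback hierarchy: exact major_title -> major_category -> whole school.
    # A generic lookup (a sketch under that assumption, with `cols` naming the
    # value column(s) to average) would be:
    #
    #     def _fallback_mean(self, records, major_title, major_category, cols):
    #         if major_title in list(records.major_title):
    #             return records.loc[records['major_title'] == major_title, cols].mean()
    #         if major_category in np.unique(records.major_category):
    #             return records.loc[records['major_category'] == major_category, cols].mean()
    #         return records[cols].mean()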
            
    def get_growth(self, school, m_title):
        m_category, state = self.match_input(school, m_title)
        
        similar_schools = self.get_growth_neighbors(school, k = 3)
        similar_schools_else = self.get_growth_neighbors_else(school, k = 3)

        growth_array = []
        growth_array_else = []
        
        for ss in similar_schools:
            g1 = self.find_growth(ss, m_title, m_category)
            growth_array.append(g1)
        
        growth_array = np.array(growth_array)

        
        for sss in similar_schools_else:
            g2 = self.find_growth(sss, m_title, m_category)
            growth_array_else.append(g2)
        
        growth_array_else = np.array(growth_array_else)

        salary_growth_all = np.mean(growth_array, axis = 0)
        salary_growth_else = np.mean(growth_array_else, axis = 0)
        salary_growth = self.salary_growth_outlier_weight * salary_growth_all + (1-self.salary_growth_outlier_weight) * salary_growth_else
          
        return salary_growth
    
    def input_check(self,school):
        output = 1
        
        if school not in list(self.school_clustering['school_name']):
            print('Error: School name \"{}\" is not recorded!'.format(school))
            output = 0
        
        return output
    
    def match_input(self, school, major):
        mc = self.major_list.loc[self.major_list['major_title'] == major,'major_category'].values[0]
        state = np.array(self.school_clustering.loc[self.school_clustering['school_name']==school, 'state'])[0]
        return mc,state
    
    def find_estimate(self, school, majorID, k=5):
        
        school = school.lower()
        check_result = self.input_check(school)
        output = {}

        if check_result == 0:
            output['Error'] = {'salary_year_{}'.format(i): -1 for i in range(1, 6)}
            output['Error']['sigma'] = -1

        else: 

            if not isinstance(majorID, list):
                majorID = [majorID]
        
            for m in majorID:
                if int(m) < 1 or int(m) > max(self.major_list['majorID']):
                    print('Error: majorID does not exist!')
                    output['Error'] = {'salary_year_{}'.format(i): -1 for i in range(1, 6)}
                    output['Error']['sigma'] = -1
                    output['Error']['similar_schools'] = -1
                    output['Error']['similar_salary'] = -1
                    
                else:   
                    m_title = (self.major_list.loc[self.major_list['majorID'] == int(m), 'major_title'].values)[0]
                    output[m_title] = {}
                    median, sigma, similar_median = self.get_value(school, m_title, k)
                    salary_growth = self.get_growth(school, m_title)

                    output[m_title]['salary_year_1'] = median
                    output[m_title]['salary_year_2'] = output[m_title]['salary_year_1'] * (1 + salary_growth[0])
                    output[m_title]['salary_year_3'] = output[m_title]['salary_year_2'] * (1 + salary_growth[1])
                    output[m_title]['salary_year_4'] = output[m_title]['salary_year_3'] * (1 + salary_growth[2])
                    output[m_title]['salary_year_5'] = output[m_title]['salary_year_4'] * (1 + salary_growth[3])
                    output[m_title]['sigma'] = sigma
                    output[m_title]['similar_schools'] = self.salary_similar_schools
                    output[m_title]['similar_salary'] = similar_median
        
        return output
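
# Example usage (a sketch: the estimator class and its constructor are defined
# earlier in this example, so the names below are illustrative only):
#
#     estimator = SalaryEstimator(...)  # hypothetical constructor
#     result = estimator.find_estimate('university of illinois', majorID=[12], k=5)
#     for major, est in result.items():
#         print(major, est['salary_year_1'], est['sigma'])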
Example #60
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gps', '-G', type=str, help="GPS file to run")
    parser.add_argument('--atm', '-A', type=str, help='ATM directory to run')
    parser.add_argument('--hemisphere',
                        '-H',
                        type=int,
                        default=-1,
                        choices=(-1, 1),
                        help='hemisphere, must be 1 or -1')
    parser.add_argument('--query',
                        '-Q',
                        type=float,
                        default=100,
                        help='KD-Tree query radius')
    parser.add_argument('--median',
                        '-M',
                        default=False,
                        action='store_true',
                        help='Run block median')
    parser.add_argument('--scan',
                        '-S',
                        default=False,
                        action='store_true',
                        help='Run ATM scan fit')
    parser.add_argument('--verbose',
                        '-v',
                        default=False,
                        action='store_true',
                        help='verbose output of run')
    args = parser.parse_args()

    if args.hemisphere == 1:
        SRS_proj4 = '+proj=stere +lat_0=90 +lat_ts=70 +lon_0=-45 +k=1 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs '
    elif args.hemisphere == -1:
        SRS_proj4 = '+proj=stere +lat_0=-90 +lat_ts=-71 +lon_0=0 +k=1 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs'

    # tilde expansion of file arguments
    GPS_file = os.path.expanduser(args.gps)
    fileBasename, fileExtension = os.path.splitext(GPS_file)
    ATM_dir = os.path.expanduser(args.atm)

    print("working on GPS file {0}, ATM directory {1}".format(
        GPS_file, ATM_dir)) if args.verbose else None

    # find Qfit files within ATM_dir
    Qfit_regex = re.compile(
        r"ATM1B.*_(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2}).*\.h5")
    Qfit_files = [
        os.path.join(ATM_dir, f) for f in os.listdir(ATM_dir)
        if Qfit_regex.search(f)
    ]

    # output directory
    out_dir = os.path.join(ATM_dir, 'xovers')
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    # output file
    out_file = 'vs_{0}.h5'.format(os.path.basename(fileBasename))
    # check if output file exists
    if args.verbose and os.path.isfile(os.path.join(out_dir, out_file)):
        print("found: {0}".format(os.path.join(out_dir, out_file)))

    # read GPS HDF5 file
    GPS_field_dict = {None: ['latitude', 'longitude', 'z']}
    GPS = pc.data().from_h5(GPS_file,
                            field_dict=GPS_field_dict).get_xy(SRS_proj4)
    # run block median over GPS data
    if args.median:
        GPS = blockmedian_for_gps(GPS, 5)

    # read all Qfit files within ATM directory
    Qlist = list()
    for f in sorted(Qfit_files):
        Qlist.append(pc.ATM_Qfit.data().from_h5(f))
    # merge the list of ATM data and build the search tree
    Q_full = pc.data().from_list(Qlist).get_xy(SRS_proj4)

    # fit scan parameters to an ATM data structure
    if args.scan:
        Q_full = fit_ATM_data(Q_full)

    # run block median for qsub
    if args.median:
        Q_full = blockmedian_for_qsub(Q_full, 5)

    # construct search tree from ATM Qfit coords
    # pickle Qtree to save computational time for future runs
    # (delete tree.p to force a rebuild if the ATM file set changes)
    if os.path.isfile(os.path.join(ATM_dir, 'tree.p')):
        with open(os.path.join(ATM_dir, 'tree.p'), 'rb') as fid:
            Qtree = pickle.load(fid)
    else:
        Qtree = KDTree(np.c_[Q_full.x, Q_full.y])
        with open(os.path.join(ATM_dir, 'tree.p'), 'wb') as fid:
            pickle.dump(Qtree, fid)

    # output fields
    out_fields = [
        'x', 'y', 'z', 'longitude', 'latitude', 't_qfit', 'h_qfit_50m',
        'sigma_qfit_50m', 'dz_50m', 'RDE_50m', 'N_50m', 'hbar_20m',
        'h_qfit_10m', 'sigma_qfit_10m', 'dz_10m', 'RDE_10m', 'N_10m',
        'x_10m_mean', 'y_10m_mean'
    ]
    # append scan fields to output template
    if args.scan:
        out_fields.extend(['scan_XT_50m', 'scan_XT_10m'])
    out_template = {f: np.nan for f in out_fields}
    out = list()

    # query the search tree to find points within query radius
    Qquery = Qtree.query_radius(np.c_[GPS.x, GPS.y], args.query)
    # indices of GPS points that have at least one ATM neighbor
    # (test emptiness with .size: np.any() would wrongly drop a point
    # whose only neighbor has index 0)
    ind, = np.nonzero([i.size > 0 for i in Qquery])
    # loop over queries in the GPS data
    for i in ind:
        GPSsub = GPS.copy_subset(np.array([i]))
        # grab the Qfit bins around the GPS bin
        Qdata = Q_full.copy_subset(Qquery[i], by_row=True)
        Qdata.index(
            np.isfinite(Qdata.elevation) & np.isfinite(Qdata.latitude)
            & np.isfinite(Qdata.longitude))
        # create output dictionary of GPS and plane-fit ATM comparison
        this_out = compare_gps_with_qfit(GPSsub, Qdata, out_template)
        if this_out is not None:
            out.append(this_out)

    # if there were overlapping points between the GPS and ATM data
    if out:
        D = dict()
        with h5py.File(os.path.join(out_dir, out_file), 'w') as h5f:
            for field in out[0].keys():
                D[field] = np.array([ii[field] for ii in out])
                if args.verbose:
                    print(field, D[field].dtype)
                h5f.create_dataset(field, data=D[field])
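
# standard entry-point guard (assumed here; the original snippet defines
# main() without invoking it)
if __name__ == '__main__':
    main()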