def space_partitioning():
    """
    Read the input, answer every query and log timing/traversal statistics.

    This is the main function for reading the input file, processing
    queries, and printing the results. It takes a space-partitioning
    approach with two kd-trees: one built from every topic and a second,
    "pruned" tree built only from the topics that have questions. Both
    trees are handed to ``process_queries`` together with two lists whose
    contents are summarised afterwards as "nodes visited" and "passes".
    """
    # time.clock() no longer exists on Python 3.8+; only fall back to it on
    # interpreters that lack perf_counter() (i.e. Python 2).
    timer = getattr(time, 'perf_counter', None) or time.clock

    logging.info("Reading from sys.stdin...")
    data = read_input(sys.stdin)

    # Build the topics tree
    logging.info("Building a tree from {} topic points.".format(
        len(data['topics'])))
    t0 = timer()
    dimensions = ['x', 'y']
    tree = kdtree.KDTree(data['topics'], dimensions)
    t1 = timer()
    logging.info("Tree constructed, there are {} total nodes ({} s).".format(
        tree.number_nodes, t1 - t0))

    # Build another tree for questions queries, with empty topics excluded
    logging.info("Building a tree from {} topic points.".format(
        len(data['topics_with_questions'])))
    t0 = timer()
    dimensions = ['x', 'y']
    pruned_tree = kdtree.KDTree(data['topics_with_questions'].values(),
                                dimensions)
    t1 = timer()
    # Report the size of the tree that was just built; this line used to
    # repeat the node count of the full topics tree.
    logging.info("Tree constructed, there are {} total nodes ({} s).".format(
        pruned_tree.number_nodes, t1 - t0))

    # Actually process the queries
    logging.info("Starting {} queries...".format(len(data['queries'])))
    stat_list = []
    pass_list = []
    t0 = timer()
    process_queries(data, tree, pruned_tree, stat_list, pass_list)
    t1 = timer()
    logging.info("Queries finished ({} s)".format(t1 - t0))

    def log_distribution(header, values):
        """Log min / average / median / max of ``values`` under ``header``."""
        values.sort()
        logging.info(header.format(len(data['queries'])))
        if not values:
            # Nothing was recorded (e.g. there were no queries), so there
            # is no min/median/max to index into.
            return
        logging.info(" {} -> min".format(values[0]))
        logging.info(" {} -> average".format(sum(values) / len(values)))
        # Floor division keeps the index an int on Python 3 as well.
        logging.info(" {} -> median".format(values[len(values) // 2]))
        logging.info(" {} -> max".format(values[-1]))

    # Pull together some analysis for debugging and optimization.
    log_distribution("In {} queries, the number passes was:", pass_list)
    log_distribution("In {} queries, the number nodes visited was:",
                     stat_list)
def main():
    """
    Build kd-trees for the weather and accident records and dump them.

    The records are loaded from ``wcvaarr.db``, one tree is built per
    data set over latitude, longitude and date, and each tree's flattened
    root is written out as JSON.
    """
    # Calculate risk analysis
    weather_data, accident_data = load_data('wcvaarr.db')
    print('PRIMA')

    weather_tree = kdtree.KDTree(
        points=weather_data, features=['latitude', 'longitude', 'date'])
    accident_tree = kdtree.KDTree(
        points=accident_data, features=['latitude', 'longitude', 'date'])

    # Write the data out to file
    outputs = (
        (weather_tree, 'weather.json'),
        (accident_tree, 'accident.json'),
    )
    for built_tree, file_name in outputs:
        flattened = built_tree.root.flatten()
        with open(file_name, 'w') as out_file:
            json.dump(flattened, out_file)
def GetOffsets(patches, indices):
    """
    Build a kd-tree over ``patches`` and return the offsets computed from it.

    The tree is configured from ``cfg.KDT_LEAF_SIZE`` and ``cfg.TAU``; the
    offsets are the second value returned by ``kdtree.get_annf_offsets``
    (the first value is not used here). The elapsed time is printed.

    :param patches: patch data handed to the kd-tree and the offset search
    :param indices: indices forwarded unchanged to ``get_annf_offsets``
    :return: the offsets produced by ``kdtree.get_annf_offsets``
    """
    start = time()
    kd = kdtree.KDTree(patches, leafsize=cfg.KDT_LEAF_SIZE, tau=cfg.TAU)
    _, offsets = kdtree.get_annf_offsets(patches, indices, kd.tree, cfg.TAU)
    end = time()
    # Single-argument print works as a statement (Python 2) and as a
    # function (Python 3); the two spaces reproduce the separator the old
    # comma-separated print statement emitted.
    print("GetOffsets execution time:  {}".format(end - start))
    return offsets
def __init__(self, metric, method='bruteforce'):
    """Accepts either bruteforce, kdtree, or balltree methods. If kdtree,
    metric must be a weighted euclidean metric.

    :param metric: distance metric, stored on the instance and handed to
        the kd-tree when ``method`` is ``'kdtree'``
    :param method: ``'bruteforce'``, ``'kdtree'``, ``'balltree'`` or
        ``'se3balltree'``; any other value is treated like brute force.
        The ball-tree methods fall back to brute force when scikit-learn
        cannot be imported.
    """
    self.metric = metric
    self.method = method
    if self.method == 'kdtree':
        self.kdtree = kdtree.KDTree(self.metric)
        if check_kdtree:
            # Debug aid: keep a second structure to cross-check the kd-tree.
            self.checker = NearestNeighbors(self.metric)
            print("Debugging: Double checking KD-tree with nearest neighbors")
    elif self.method in ('balltree', 'se3balltree'):
        # Both ball-tree flavours start from the same empty state.
        try:
            # Imported only to find out whether scikit-learn is available.
            from sklearn.neighbors import BallTree, DistanceMetric  # noqa: F401
        except ImportError:
            print("NearestNeighbors: scikit-learn is not installed, falling back to brute force")
            self.method = 'bruteforce'
            self.nodes = []
        else:
            self.points = []
            self.datas = []
            self.dirty = True
    else:
        self.nodes = []
def mosaic(img_set, img_target, tile_size, nearest_imgs=0, blend=0):
    """
    Return an image of img_target composed of images from img_set with
    tile_size. The images in img_set might be rescaled and cropped to fit
    tile_size.

    :param img_set: images available as tiles
    :param img_target: image to reproduce
    :param tile_size: (width, height) of one tile
    :param nearest_imgs: how many of the closest-matching tiles are
        alternated between; values below 1 are treated as 1
    :param blend: alpha passed to ``Image.blend`` when mixing the mosaic
        with ``img_target``
    :return: the blended mosaic image
    """
    # With the default of 0 the selection loop below never ran and `neigh`
    # was left unbound; at least one candidate tile is always needed.
    candidates = max(nearest_imgs, 1)

    # number of tiles to be used (floor division on every Python version)
    tx = img_target.size[0] // tile_size[0] + 1
    ty = img_target.size[1] // tile_size[1] + 1

    # transform all images in set to tile_size
    img_set = [ImagePoint(rescale_crop(img, tile_size)) for img in img_set]

    # build mosaic image
    mosaic_img = Image.new('RGB', img_target.size)

    # rescale image into each pixel as a tile
    target_pixels = img_target.resize((tx, ty), Image.ANTIALIAS).load()

    # Create a KDTree of image set for tiles
    tree = kdtree.KDTree(img_set)

    # tiles tracking for alternation: positions where each rank was used
    last_tile_position = [[] for _ in range(candidates)]

    # for each tile, select the best image and compose mosaic
    for x in range(tx):
        for y in range(ty):
            here = (x, y)

            # selects neigh, the n-th neighbour farthest from its own
            # previous placements
            neigh = 0
            fartest = 0
            for p in range(candidates):
                if not last_tile_position[p]:
                    # This rank has never been used: take it right away.
                    neigh = p
                    break
                # find the closest distance
                closest = min(last_tile_position[p],
                              key=lambda e: distance(here, e))
                dist = distance(here, closest)
                if dist > fartest:
                    fartest = dist
                    neigh = p

            # sets current position as last
            last_tile_position[neigh].append(here)

            # calculate target's tile mean
            target_mean = target_pixels[x, y][:3]

            # query the closest tiles and pick the chosen rank
            best_tile = tree.query(target_mean, t=candidates)[neigh].image

            # apply best tile to mosaic image
            mosaic_img.paste(best_tile, (tile_size[0] * x, tile_size[1] * y))

    return Image.blend(mosaic_img, img_target, blend)
def __init__(self, metric, method='bruteforce'):
    """
    Set up the nearest-neighbour structure for the chosen method.

    :param metric: distance metric, stored on the instance and handed to
        the kd-tree when ``method`` is ``'kdtree'``
    :param method: ``'kdtree'`` builds a kd-tree; any other value starts
        with an empty ``nodes`` list
    """
    self.metric = metric
    self.method = method
    if self.method == 'kdtree':
        self.kdtree = kdtree.KDTree(self.metric)
        if check_kdtree:
            # Debug aid: keep a second structure to cross-check the kd-tree.
            self.checker = NearestNeighbors(self.metric)
            # Parenthesised single-argument print runs on Python 2 and 3.
            print("Debugging: Double checking KD-tree with nearest neighbors")
    else:
        self.nodes = []
def do_stuff(set_array, range_bounds):
    """Helper that builds a tree from the given parameters.

    The tree is built, a range search over the given bounds is run on it,
    and the tree is printed before being returned.

    :param set_array: set of points
    :param range_bounds: search region; its first two items are passed to
        ``search_range``
    :return: the kd-tree
    """
    built_tree = kdt.KDTree(set_array)
    lower_bound = range_bounds[0]
    upper_bound = range_bounds[1]
    built_tree.search_range(lower_bound, upper_bound)
    print(built_tree)
    return built_tree
def rmspe(conf, point_list_brd):
    """
    Return the RMSPE error statistic generated from K-fold cross validation.

    Take the given KFoldConf object and an ordered broadcasted list of
    point objects and return the desired error statistic. For every fold,
    ``conf.m`` bags are sampled with replacement from the training set, a
    kd-tree is built per bag, and each validation point's estimate is the
    average of the per-tree interpolations.

    :param conf: configuration; the attributes read here are ``folds``,
        ``time_scale``, ``alpha``, ``m``, ``neighbors`` and ``power``
    :param point_list_brd: broadcast object whose ``value`` is the point list
    :return: the mean over all folds of the per-fold RMSPE, in percent
    """
    # deep copy point_list so scaling does not touch the broadcast value
    points = copy.deepcopy(point_list_brd.value)

    # scale time dimensions
    for p in points:
        p.scale_time(conf.time_scale)

    # build a list of lists representing the relevant partition
    # (points are dealt round-robin into the folds)
    partition = [[] for _ in range(conf.folds)]
    for index, p in enumerate(points):
        partition[index % conf.folds].append(p)

    # generate results for kfold cross validation with this err stat
    results = [0.0] * conf.folds
    for i in range(conf.folds):
        # initialize validation_set and training_set
        validation_set = partition[i]
        training_set = []
        for j in range(conf.folds):
            if j != i:
                training_set.extend(partition[j])

        # generate conf.m bags at conf.alpha by sampling with replacement.
        # The comprehension variable must not be called `i`: on Python 2 it
        # leaks into this scope and would overwrite the fold index that
        # `results[i]` relies on below.
        n_prime = int(len(training_set) * conf.alpha)
        bags = [sample_with_replacement(training_set, n_prime)
                for _ in range(conf.m)]
        trees = [kdtree.KDTree(bag) for bag in bags]

        for point in validation_set:
            # compute the average estimate for pollution at point over bags
            avg_estimate = 0.0
            for tree in trees:
                nnl = tree.query(point, conf.neighbors)
                avg_estimate += point.interpolate(nnl, conf.power)
            avg_estimate /= conf.m

            # incorporate this information into the results vector
            results[i] += ((avg_estimate - point.value()) /
                           point.value()) ** 2.0

        results[i] /= len(validation_set)
        results[i] = math.sqrt(results[i]) * 100

    # return the average of the elements in the results vector
    return sum(results) / len(results)
def integration_test():
    """
    Compare thresholded kernel-density estimates with the exact density.

    Points are drawn from a standard ``k``-dimensional normal, a kd-tree
    is built over the training sample, and the estimator's values on a
    test sample are compared against the true pdf: the function prints how
    many test points fall on different sides of the density threshold
    under the two, together with the values for those points.
    """
    k = 2
    p = 0.01
    TRAIN_SIZE = 100000
    # NOTE(review): TEST_SIZE is never used; the test sample below is
    # drawn with a hard-coded size of 1000 - confirm which one is intended.
    TEST_SIZE = 2000
    mu = np.zeros(k)
    cov = np.diag(np.ones(k))
    dist = scipy.stats.multivariate_normal(mean=mu, cov=cov)

    # Estimate the density value at the p-quantile from a large sample.
    sample_size = 1000000
    np.random.seed(0)
    threshold = np.percentile(
        dist.pdf(np.random.multivariate_normal(
            mean=mu, cov=cov, size=sample_size)),
        p * 100)
    print("Threshold: {}".format(threshold))

    np.random.seed(0)
    training_data = np.random.multivariate_normal(
        mean=mu, cov=cov, size=TRAIN_SIZE)
    # Float numerator: with integer division (Python 2) the exponent
    # -1 / (k + 4) would floor to -1.
    bw = TRAIN_SIZE ** (-1.0 / (k + 4))
    print("BW: {}".format(bw))
    # kernel = scipy.stats.multivariate_normal(mean=mu, cov=cov * (bw*bw))
    kernel = Kernel(k=k, bw=bw)

    start_time = time.time()
    t = kdtree.KDTree(dim=k).build(training_data)
    print("Constructed Tree in: {}".format(time.time() - start_time))

    raw_threshold = threshold * TRAIN_SIZE
    eps = 0.01
    # The estimator must not be bound to the name `tkde`: assigning to that
    # name made it local to this function, so looking up the `tkde` module
    # on the same line raised UnboundLocalError.
    estimator = tkde.TKDE(
        t, kernel,
        threshold=raw_threshold,
        epsilon=eps * raw_threshold
    )

    np.random.seed(1)
    test_data = np.random.multivariate_normal(mean=mu, cov=cov, size=1000)
    test_pdfs = np.array(
        [estimator.calc(test_query)[0] for test_query in test_data])
    est_threshold = np.percentile(test_pdfs, p * 100)
    print("Est Threshold: {}".format(est_threshold))

    actual_test_pdfs = dist.pdf(test_data)
    # Points that the exact and the estimated density classify differently.
    disagree_on = (~((actual_test_pdfs < threshold) ==
                     (test_pdfs < threshold)))
    n_disagree = np.sum(disagree_on)
    print("disagree on: {} ".format(n_disagree))
    print(test_pdfs[disagree_on])
    print(actual_test_pdfs[disagree_on])
def rmspe(conf, point_list_brd, radius_table_brd):
    """
    Compute the RMSPE error statistic via K-fold cross validation.

    Takes a KFoldConf object and an ordered broadcasted list of point
    objects, plus a broadcast table of distance limits keyed by the string
    form of a point's ``time_scale``, and returns the mean over all folds
    of the per-fold RMSPE (in percent).
    """
    # Work on a private copy of the broadcast point list.
    points = copy.deepcopy(point_list_brd.value)

    # Rescale the time dimension of every point.
    for pt in points:
        pt.scale_time(conf.time_scale)

    # Deal the points round-robin into conf.folds groups.
    partition = [[] for _ in range(conf.folds)]
    for position, pt in enumerate(points):
        partition[position % conf.folds].append(pt)

    results = [0.0] * conf.folds
    for fold in range(conf.folds):
        # The current fold validates; every other fold trains.
        validation_set = partition[fold]
        training_set = [
            member
            for other, group in enumerate(partition) if other != fold
            for member in group
        ]

        # Index the training points with a kd-tree.
        tree = kdtree.KDTree(training_set)

        # Accumulate the squared relative error over the validation set.
        squared_error = 0.0
        for pt in validation_set:
            neighbours = tree.query(pt, conf.neighbors)
            # The modification below was made for experiment #03B
            limit = radius_table_brd.value[str(pt.time_scale)]
            neighbours = exclude.exclude_nodes(neighbours, pt, limit)
            estimate = pt.interpolate(neighbours, conf.power)
            squared_error += ((estimate - pt.value()) / pt.value())**2.0

        squared_error /= len(validation_set)
        results[fold] = math.sqrt(squared_error) * 100

    # Average the per-fold statistics.
    return sum(results) / len(results)
def mare(conf, point_list_brd):
    """
    Compute the MARE error statistic via K-fold cross validation.

    Takes a KFoldConf object and an ordered broadcasted list of point
    objects and returns the mean over all folds of the per-fold mean
    absolute relative error.
    """
    # Work on a private copy of the broadcast point list.
    points = copy.deepcopy(point_list_brd.value)

    # Rescale the time dimension of every point.
    for pt in points:
        pt.scale_time(conf.time_scale)

    # Deal the points round-robin into conf.folds groups.
    partition = [[] for _ in range(conf.folds)]
    for position, pt in enumerate(points):
        partition[position % conf.folds].append(pt)

    results = [0.0] * conf.folds
    for fold in range(conf.folds):
        # The current fold validates; every other fold trains.
        validation_set = partition[fold]
        training_set = [
            member
            for other, group in enumerate(partition) if other != fold
            for member in group
        ]

        # Index the training points with a kd-tree.
        tree = kdtree.KDTree(training_set)

        # Accumulate the absolute relative error over the validation set.
        relative_error = 0.0
        for pt in validation_set:
            neighbours = tree.query(pt, conf.neighbors)
            estimate = pt.interpolate(neighbours, conf.power)
            relative_error += (abs(estimate - pt.value()) / pt.value())

        relative_error /= len(validation_set)
        results[fold] = relative_error

    # Average the per-fold statistics.
    return sum(results) / len(results)
def weakly_simplefy_polygon(polygon, cutouts):
    """
    Splice every cutout into ``polygon`` and return the combined ring.

    On each pass the cutout vertex with the smallest ``find_nearest``
    limit to any polygon vertex is selected, and that cutout is inserted
    into ``polygon`` at the matched polygon vertex. Progress information
    is written to stderr.

    Both arguments are modified in place: every cutout is reversed first,
    ``polygon`` grows with each splice, and ``cutouts`` is empty when the
    function returns.

    :param polygon: list of (x, y) vertices; the last entry is skipped when
        indexing, presumably because the ring is closed - TODO confirm
    :param cutouts: list of vertex lists treated the same way as ``polygon``
    :return: the same ``polygon`` list, with all cutouts merged in
    """
    for c in cutouts:
        c.reverse()
    while len(cutouts) > 0:
        # NOTE(review): kdtree.c looks like a module-level counter that the
        # kd-tree updates during searches; it is reset here and reported
        # below - confirm against the kdtree module.
        kdtree.c = 0
        sys.stderr.write('todo:' + str(len(cutouts)) + '\n')
        # Index the current polygon vertices; the third tuple item is the
        # vertex's position in `polygon`.
        tree = kdtree.KDTree([(x, y, i)
                              for i, (x, y) in enumerate(polygon[:-1])])
        c_best, best, limit, c_best_n = None, None, None, None
        for c in cutouts:
            for i, (x, y) in enumerate(c[:-1]):
                # The best limit found so far is passed back in on every
                # search.
                n_best, n_limit = tree.find_nearest((x, y, c, i), limit)
                if best == None or limit > n_limit:
                    c_best, c_best_n, best, limit = c, i, n_best, n_limit
        # Position in `polygon` of the matched vertex.
        pn = best[2]
        sys.stderr.write(
            str(kdtree.c) + ' ' + str(polygon[pn]) + ' ' +
            str(c_best[c_best_n]) + '\n')
        # Insert, before polygon[pn]: a copy of polygon[pn], then the cutout
        # walked from its matched vertex around and back to that vertex.
        polygon[pn:pn] = [polygon[pn]
                          ] + c_best[c_best_n:-1] + c_best[:c_best_n + 1]
        cutouts.remove(c_best)
    return polygon
def reset(self):
    """
    Discard all stored data and start again from an empty structure.

    For the ``'kdtree'`` method a fresh kd-tree is created (and, when
    ``check_kdtree`` is set, a fresh checker as well); for every other
    method the ``nodes`` list is emptied.
    """
    if self.method != 'kdtree':
        self.nodes = []
        return

    self.kdtree = kdtree.KDTree(self.metric)
    if check_kdtree:
        self.checker = NearestNeighbors(self.metric)
def test_verbose():
    """
    Processes the queries and displays output for checking accuracy,
    instead of just printing out query results. Very verbose, so running
    this on more than 25 topics or queries is a mistake.

    Input is read from ``sys.stdin``; everything is reported through
    ``logging.info``.
    """
    # time.clock() no longer exists on Python 3.8+; only fall back to it on
    # interpreters that lack perf_counter() (i.e. Python 2).
    timer = getattr(time, 'perf_counter', None) or time.clock

    logging.info("Reading from sys.stdin...")
    data = read_input(sys.stdin)
    show_topics(data['topics'])
    show_queries(data['queries'])

    # Nature of the dataset
    logging.info("There are {} topics, {} questions, and {} queries.".format(
        len(data['topics']), len(data['questions']), len(data['queries'])))
    logging.info("There are {} topics that have no questions.".format(
        data['num_topics_without_questions']))
    logging.info("There are {} questions that have no topics.".format(
        data['num_questions_without_topics']))

    # Build the topic tree
    logging.info("Building tree from {} topic points...".format(
        len(data['topics'])))
    t0 = timer()
    dimensions = ['x', 'y']
    tree = kdtree.KDTree(data['topics'], dimensions)
    t1 = timer()
    logging.info("Tree constructed, there are {} total nodes ({} s).".format(
        tree.number_nodes, t1 - t0))
    # To inspect the structure while debugging: tree.root.print_tree()

    # Build the pruned topic tree for questions queries, with empty topics
    # excluded
    logging.info("Building pruned tree from {} topic points...".format(
        len(data['topics_with_questions'])))
    t0 = timer()
    dimensions = ['x', 'y']
    pruned_tree = kdtree.KDTree(data['topics_with_questions'].values(),
                                dimensions)
    t1 = timer()
    logging.info("Tree constructed, there are {} total nodes ({} s).".format(
        pruned_tree.number_nodes, t1 - t0))
    # To inspect the structure while debugging: pruned_tree.root.print_tree()

    # Filled in by the tree on every query ('nodes' and 'passes' are read
    # back below).
    stats = {}
    for query in data['queries']:
        # Pull out the number of results desired for the query.
        num_results = query['count']
        logging.info(
            "Query: The {query[count]} {type}'s nearest to ({query[x]:0.2f}, {query[y]:0.2f})"
            .format(query=query, type=query['type']))

        # Topic queries are straight up nearest neighbor queries.
        if query['type'] == 't':
            nearest = tree.k_nearest(query, num_results, stats)
            nearest['list'].sort(key=itemgetter('distance'))
            # Just print out the topics
            for count, result in enumerate(nearest['list']):
                logging.info(
                    " Topic {0} - ({1[point]}), distance {1[distance]:0.2f}".
                    format(count, result))
            # And some nice info.
            logging.info(
                " {} nodes (over {} passes) in the {}-node tree were traversed to get this result."
                .format(stats['nodes'], stats['passes'], tree.number_nodes))

        # Otherwise search is more complicated because we care about number
        # of records associated with the nearest point(s)
        elif query['type'] == 'q':
            # NOTE(review): question queries run against the full tree, so
            # pruned_tree above is built and timed but never queried -
            # confirm whether it was meant to be used here.
            nearest = tree.k_nearest_linked_records(
                query, num_results, 'questions',
                data['max_possible_questions'], stats)
            nearest['questions'].sort(key=itemgetter('distance'))
            for count, result in enumerate(nearest['questions']):
                logging.info(
                    " Question {1[id]}, distance {1[distance]:0.2f}".format(
                        count, result))
            logging.info(
                " {} nodes (over {} passes) in the {}-node tree were traversed to get this result."
                .format(stats['nodes'], stats['passes'], tree.number_nodes))
def stress_test():
    """
    This is a function to give an idea of the order of magnitude of running
    time on large inputs.

    A 2d-tree is built from randomly sampled points, then one round of
    single-nearest-neighbor queries and one round of k-nearest-neighbor
    queries are timed, and the distribution of the traversal counters
    reported by the tree is printed for each round.
    """
    # time.clock() no longer exists on Python 3.8+; only fall back to it on
    # interpreters that lack perf_counter() (i.e. Python 2).
    timer = getattr(time, 'perf_counter', None) or time.clock

    def print_distribution(header, values):
        """Print min / average / median / max of ``values`` under ``header``."""
        values.sort()
        print(header)
        print(" {} -> min".format(values[0]))
        print(" {} -> average".format(sum(values) / len(values)))
        # Floor division keeps the index an int on Python 3 as well.
        print(" {} -> median".format(values[len(values) // 2]))
        print(" {} -> max".format(values[-1]))

    # Sample number points randomly on a square of specified origin and size.
    origin = {'x': 0, 'y': 0}
    size = 1000000
    number = 10000
    data = sample_square(origin, size, number)
    print(
        "Building a 2d-tree from {number} points sampled on a square of size {size}..."
        .format(size=size, number=number))
    dimensions = ['x', 'y']
    tree = kdtree.KDTree(data, dimensions)
    print("Tree constructed, there are {} total nodes.".format(
        tree.number_nodes))

    stats = {'nodes': 0}
    queries = 10000
    print("Randomly generating {} test points for querying the tree...".format(
        queries))
    test_points = sample_square(origin, size, queries)
    print("Test points created.")

    print("Start nearest-neighbor queries...")
    t0 = timer()
    stat_list = []
    for death in test_points:
        # Find the single nearest neighbor to the query point; only the
        # traversal counter is of interest here.
        tree.root.nearest(death, stats)
        stat_list.append(stats['nodes'])
    t1 = timer()
    time_elapsed = t1 - t0

    # Print some stats to give an idea of the number of nodes traversed.
    print("Queries finished ({} s).".format(time_elapsed))
    print_distribution(
        "In {} NN queries, the number nodes visited was:".format(queries),
        stat_list)

    k = 10
    print("Starting {}-nearest-neighbor queries...".format(k))
    t0 = timer()
    stat_list = []
    pass_list = []
    for death in test_points:
        # Find the k nearest neighbors to the query point
        tree.root.k_nearest(death, k, stats)
        stat_list.append(stats['nodes'])
        pass_list.append(stats['passes'])
    time_elapsed = timer() - t0

    # Print some stats to give an idea of the number of nodes traversed.
    print("Queries finished ({} s)".format(time_elapsed))
    print_distribution(
        "In {} {}NN queries, the number passes was:".format(queries, k),
        pass_list)
    print_distribution(
        "In {} {}NN queries, the number nodes visited was:".format(
            queries, k),
        stat_list)
def check_tree():
    """
    This is a function for testing the accuracy of results.

    A small 2d-tree is built from random points and printed, then
    ``search``, ``nearest`` and ``k_nearest`` are exercised on it. The
    brute-force distances of the closest data points are printed for
    comparison, and the partitions, data points and search point are
    written out through the ``*_to_file`` helpers.
    """
    # Sample number points randomly on a square of specified origin and size.
    origin = {'x': 2, 'y': 1}
    side_length = 100
    number = 10
    data = sample_square(origin, side_length, number)

    # Build a kd-tree, using the sort / scan / sublist method. (The tree
    # used to be constructed twice from the same data; once is enough.)
    print(
        "Building a 2d-tree from {number} points sampled on a square of size {size}..."
        .format(size=side_length, number=number))
    dimensions = ['x', 'y']
    tree = kdtree.KDTree(data, dimensions)
    print("Tree constructed, there are {} total nodes.".format(
        tree.number_nodes))
    print("Here's what the tree structure looks like: ")
    tree.root.print_tree()

    # Test searching for a point that is guaranteed to be in the tree
    print("Searching for a point guaranteed to be in the tree...")
    result = tree.root.search(data[0])
    print("Search result for ({0[x]:0.2f},{0[y]:0.2f}) is: {1}".format(
        data[0], result))

    # Dictionary to hold stats about kd-tree traversals.
    stats = {}

    # How many queries to make (and how many random test points to create)
    queries = 1
    print("Randomly generating a test point for querying the tree...")
    test_points = sample_square(origin, side_length, queries)
    target = test_points[0]

    # Test searching for a point not in the tree, to get the potential parent
    print("Searching for a point not in the tree...")
    result = tree.root.search(target)
    print("Search result for ({0[x]:0.2f},{0[y]:0.2f}) is: {1}".format(
        target, result))

    # Test nearest, which finds the single closest point to the query
    nearest = tree.root.nearest(target, stats)
    print(
        "Result of nearest for ({p[x]:0.2f},{p[y]:0.2f}) is: {result}".format(
            p=target, result=nearest['point']))
    print(
        "And {} nodes in the {}-node tree were traversed to get this result.".
        format(stats['nodes'], tree.number_nodes))

    # Test k-nearest, which finds the k nearest points to the query
    num_results = 5
    nearest = tree.root.k_nearest(target, num_results, stats)
    print("Result of {k}-nearest for ({p[x]:0.2f},{p[y]:0.2f}) is:".format(
        p=target, k=num_results))
    nearest['list'].sort(key=itemgetter('distance'))
    for count, point in enumerate(nearest['list']):
        print(" {0} - ({1[point]}), distance {1[distance]:0.2f}".format(
            count, point))
    print(
        "And {} nodes (over {} passes) in the {}-node tree were traversed to get this result."
        .format(stats['nodes'], stats['passes'], tree.number_nodes))

    # Calculate and display the actual distances of each point from the
    # target
    print("And here are the top {} nearest points to the target: ".format(
        num_results))
    results = []
    for point in data:
        distance = kdtree.KDTreeNode.distance(point, target)
        results.append({
            'x': point['x'],
            'y': point['y'],
            'distance': distance
        })
    results.sort(key=itemgetter('distance'))
    for result in results[:num_results]:
        print(" ({0[x]:0.2f}, {0[y]:0.2f}) -> {0[distance]:0.2f}".format(
            result))

    partitions_to_file(tree, target, origin, {
        'x': origin['x'] + side_length,
        'y': origin['y'] + side_length
    })
    datapoints_to_file(data)
    searchpoints_to_file(test_points)
def check_k_nearest_accuracy():
    """
    This is a function for verifying the accuracy of results by calculating
    the actual nearest neighbors by brute force. Needless to say this is
    for debugging only.

    For every random test point the tree's k nearest neighbors are compared
    with the k closest data points found by an exhaustive scan; mismatching
    result sets are printed, followed by a summary of how many queries
    matched.
    """
    # Sample number points randomly on a square of specified origin and size.
    origin = {'x': 0, 'y': 0}
    size = 1000000
    number = 10000
    k = 10
    data = sample_square(origin, size, number)
    # The placeholder used to be printed literally because .format(k) was
    # missing.
    print("Testing the accuracy of {} nearest neighbors results...".format(k))
    print(
        "Building a 2d-tree from {number} points sampled on a square of size {size}..."
        .format(size=size, number=number))
    dimensions = ['x', 'y']
    tree = kdtree.KDTree(data, dimensions)
    print("Tree constructed, there are {} total nodes.".format(
        tree.number_nodes))

    queries = 100
    print("Randomly generating {} test points for querying the tree...".format(
        queries))
    test_points = sample_square(origin, size, queries)
    print("Test points created.")

    print("Start queries...")
    result_list = []
    stats = {}
    for test_point in test_points:
        # Find the k nearest neighbors to the query point. The other
        # callers in this file sort the returned list by distance before
        # using it, so do the same before comparing position by position.
        k_nearest = tree.k_nearest(test_point, k, stats)
        found = sorted(k_nearest['list'], key=itemgetter('distance'))

        # Now calculate the actual distances of each point from the target
        # for the purpose of testing accuracy
        all_points = []
        for point in data:
            distance = kdtree.KDTreeNode.distance(point, test_point)
            all_points.append({
                'x': point['x'],
                'y': point['y'],
                'distance': distance
            })

        # Now sort the list of points by distance from the test point and
        # pull out the k points with minimum distance
        all_points.sort(key=itemgetter('distance'))
        real_nearest = all_points[:k]

        # Mark whether the result was correct or not in the result_list. So
        # if result_list[4] is false it means the nearest calculation was
        # wrong for test_points[4].
        # The == operator should work here because the numbers are
        # pulled/calculated in exactly the same way.
        # A result of the wrong size (including an empty one, which used to
        # leave `correct` unbound) can never match.
        correct = len(found) == len(real_nearest)
        if correct:
            for expected, neighbor in zip(real_nearest, found):
                correct = (
                    expected['x'] == neighbor['point'].point['x'] and
                    expected['y'] == neighbor['point'].point['y'] and
                    expected['distance'] == neighbor['distance'])
                # Fail out on first non-match.
                if not correct:
                    break
        result_list.append(correct)

        if not correct:
            # Print results if they don't match.
            print("Bruteforce results: ")
            for index, result in enumerate(real_nearest):
                print(
                    "{0}: {1[x]:0.2f}, {1[y]:0.2f}, distance {1[distance]:0.2f}"
                    .format(index, result))
            print("Kdtree results: ")
            for index, neighbor in enumerate(found):
                print(
                    "{0}: {1[x]:0.2f}, {1[y]:0.2f}, distance {2:0.2f}".format(
                        index, neighbor['point'].point, neighbor['distance']))

    # Summarise how many queries agreed with the brute-force answer.
    print("Queries and testing finished.")
    frequencies = Counter(result_list)
    print("In {} queries, {} were correct and {} were incorrect.".format(
        queries, frequencies[True], frequencies[False]))
def _plot_plane(ax, node, num_dims, default_plane_width=10, num_samples=10):
    """
    Draw the splitting plane of ``node`` on the 3D axes ``ax``.

    The plane is constant along ``node.axis`` (at the node's own
    coordinate) and spans the node's boundaries in the other two
    dimensions; a missing boundary is replaced by
    ``default_plane_width``.
    """
    axis = node.axis
    split_value = node.data[axis]

    # Rotate the boundaries so the splitting axis comes first.
    limits = helper._boundaries(node, num_dims)
    limits = limits[axis:] + limits[:axis]

    first_span = _dim_range(limits[1], default_plane_width, num_samples)
    second_span = _dim_range(limits[2], default_plane_width, num_samples)
    first_grid, second_grid = np.meshgrid(first_span, second_span)

    # A grid that holds the split coordinate everywhere.
    flat_span = np.linspace(split_value, split_value, num_samples)
    flat_grid = np.meshgrid(flat_span, flat_span)[0]

    # Rotate back so every grid lands on the axis it belongs to.
    grids = [flat_grid, first_grid, second_grid]
    grids = grids[-axis:] + grids[:-axis]
    ax.plot_surface(grids[0], grids[1], grids[2], alpha=0.8)


def _dim_range(boundary, default_plane_width, num_samples):
    """
    Return ``num_samples`` evenly spaced values across ``boundary``.

    A ``None`` lower or upper bound is replaced by ``-default_plane_width``
    or ``default_plane_width`` respectively.
    """
    lower = boundary[0]
    if lower is None:
        lower = -default_plane_width
    upper = boundary[1]
    if upper is None:
        upper = default_plane_width
    return np.linspace(lower, upper, num_samples)


if __name__ == "__main__":
    # Demo: k-nearest-neighbour search on a 3D sample tree.
    num_dims = 3
    k = 7
    tree = kdtree.KDTree(test_data.list3d_2, num_dims)
    point = test_data.rand_point(num_dims)
    result = tree.knn(point, k)
    knn(tree, point, result)
def _data_interpolation(centroid_rdd, pollutant):
    """
    Run the interpolation of a pollutant at the centroid locations.

    ``pollutant == 'ozone'`` selects the ozone data file, time scale and
    output directory; any other value selects the PM2.5 ones. The measured
    points are bagged ``NUM_BAGS`` times, one kd-tree is built per bag and
    broadcast, every centroid query point is interpolated against each
    tree, and the averaged estimates are saved as text lines of the form
    ``blk_id,month,year,max_est,mean_est``.

    :param centroid_rdd: RDD whose elements unpack into
        ``(record, month)`` for building query points
    :param pollutant: ``'ozone'`` or anything else for PM2.5
    """
    # Set parameters unique for this interpolation task.
    if pollutant == 'ozone':
        time_scale = (0.4 + 2.0) / 2.0
        data_file = '../data/clean/monthly_ozone_1990-2015.csv'
        output_dir = 'ozone_inter_output'
    else:
        time_scale = (0.18 + 0.16) / 2.0
        data_file = '../data/clean/monthly_pm25_1990-2015.csv'
        output_dir = 'pm25_inter_output'

    point_list = point.load_point_file(data_file)
    # NOTE(review): this keeps whatever scale_time() returns, so it relies
    # on scale_time() returning the scaled point - confirm against the
    # point module.
    point_list = [p.scale_time(time_scale) for p in point_list]

    # Bag the point list and produce a list of trees to use for prediction.
    bag_size = int(len(point_list) * ALPHA)
    bags = [kfold.sample_with_replacement(point_list, bag_size)
            for _ in range(NUM_BAGS)]
    trees = [kdtree.KDTree(bag) for bag in bags]
    tree_tuple_brd = SC.broadcast(trees)

    # Define a mapper for interpolating each query point.
    def interpolation_mapper(query_point, tree_tuple_brd):
        """
        Set the max and mean estimates for query_point using the list of
        KDTree objects for interpolation.
        """
        # Generate a list of estimates for this query point.
        estimates = []
        for tree in tree_tuple_brd.value:
            nodes = tree.query(query_point, NEIGHBORS)
            estimates.append(query_point.interpolate(nodes, POWER))

        # Average the estimates from each bag.
        max_est = sum([est[0] for est in estimates]) / len(estimates)
        mean_est = sum([est[1] for est in estimates]) / len(estimates)

        # Fix the results within query_point.
        query_point.max_est = max_est
        query_point.mean_est = mean_est
        return query_point

    # Transform centroid_rdd into an RDD of query points, scale the time
    # dimension, and cache the intermediate result.
    def query_point_factory(record, month):
        """
        Build a QueryPoint from a CSV record and a month value.
        """
        result = point.QueryPoint(record)
        result.month = month
        return result

    query_point_rdd = centroid_rdd.map(lambda p: query_point_factory(*p))
    query_point_rdd = query_point_rdd.map(lambda q: q.scale_time(time_scale))
    query_point_rdd = query_point_rdd.cache()

    # Map the query_point_rdd through the interpolation mapper.
    query_point_rdd = query_point_rdd.map(
        lambda q: interpolation_mapper(q, tree_tuple_brd)).cache()

    # ---------------------- Aggregation ----------------------------------
    # TESTING
    # -------
    def simple_report(query_point):
        """
        Format one query point as a comma-separated output line.
        """
        month = (query_point.month % 12) + 1
        # Floor division: a plain `/` yields a fractional year on Python 3.
        year = (query_point.month // 12) + 1990
        return ','.join([
            query_point.blk_id,
            str(month),
            str(year),
            str(query_point.max_est),
            str(query_point.mean_est),
        ])

    # Write the output to a file.
    query_point_rdd.map(simple_report).saveAsTextFile(output_dir)
args = parser.parse_args() # Construct database fields = ["x","y"] if args.quadtree: fields.append("quad") dtb = db.Database(fields) field_idx = dtb.fields() # Load the data data_loader = dl.DataLoader() data_loader.load(args,dtb) # Create KDTree tree = kd.KDTree(dtb, {'max-depth' : args.max_depth, 'max-elements' : args.max_elements}) plotter = pl.Plotter(tree,dtb,args) # Testing: Implementing the QuadTree if args.quadtree: quadtree = qt.QuadTree(tree.bounding_box(), args.quadtree) if args.quadshow: plotter.add_quadtree(quadtree) # Testing: Implementing the KDTree if args.closest: # This is for testing, to check if your closest query is correclty implemented # Step 1 query and fetch
BLUE = (0, 0, 255) dd = [1000, 1000] game_display = pygame.display.set_mode(dd) pygame.display.set_caption('Quad Tree Test') pygame.display.update() game_exit = False clock = pygame.time.Clock() fps = 100 one_pressed = False three_pressed = False state = True counter = 0 boundary = kdtr.Boundary(0, 0, dd[0], dd[1]) quadtree = kdtr.KDTree(boundary, 4) rectangle = None points = [] for i in range(1000): p = kdtr.Point(random.gauss(dd[0] / 2, dd[0] / 8), random.gauss(dd[1] / 2, dd[1] / 8)) quadtree.insert(p) points.append(p) # for j in range((dd[0]/2)-10, (dd[0]/2)+10): # for k in range((dd[1]/2)-10, (dd[1]/2)+10): # p = kdtr.Point(j, k) # points.append(p) # quadtree.insert(p) while not game_exit: points_in_range = []
def database_to_tree(c, dim=512):
    """
    Load every row of ``responseData`` into a kd-tree.

    Each row is unpacked into a dict with the keys ``sentence``,
    ``source`` and ``vector``.

    :param c: database cursor used to run the query
    :param dim: second argument passed to ``kdtree.KDTree``
    :return: the kd-tree built from all rows
    """
    c.execute("SELECT * FROM responseData")
    records = [
        {"sentence": sentence, "source": source, "vector": vector}
        for sentence, source, vector in c.fetchall()
    ]
    return kdtree.KDTree(records, dim)
def _plot_line(node, num_dims, default_plane_width=10, num_samples=10):
    """
    Draw the splitting line of ``node`` on the current pyplot figure.

    The line is constant along ``node.axis`` (at the node's own
    coordinate) and spans the node's boundaries in the other dimension; a
    missing boundary is replaced by ``default_plane_width``.
    """
    axis = node.axis
    split_value = node.data[axis]

    # Rotate the boundaries so the splitting axis comes first.
    limits = helper._boundaries(node, num_dims)
    limits = limits[axis:] + limits[:axis]

    free_span = _dim_range(limits[1], default_plane_width, num_samples)
    fixed_span = np.linspace(split_value, split_value, num_samples)

    # Rotate back so each coordinate list lands on the axis it belongs to.
    coords = [fixed_span, free_span]
    coords = coords[-axis:] + coords[:-axis]
    plt.plot(coords[0], coords[1], alpha=0.8)


def _dim_range(boundary, default_plane_width, num_samples):
    """
    Return ``num_samples`` evenly spaced values across ``boundary``.

    A ``None`` lower or upper bound is replaced by ``-default_plane_width``
    or ``default_plane_width`` respectively.
    """
    lower = boundary[0]
    if lower is None:
        lower = -default_plane_width
    upper = boundary[1]
    if upper is None:
        upper = default_plane_width
    return np.linspace(lower, upper, num_samples)


if __name__ == "__main__":
    # Demo: k-nearest-neighbour search on a 2D sample tree.
    num_dims = 2
    k = 3
    tree = kdtree.KDTree(test_data.list2d_1, num_dims)
    point = test_data.rand_point(num_dims)
    result = tree.knn(point, k)
    knn(tree, point, result)
import math import random import re import sqlite3 import string import sys import kdtree app = flask.Flask(__name__) # Load the trees try: print('Loading trees...', end='') sys.stdout.flush() weather_tree = kdtree.KDTree(features=['latitude', 'longitude', 'date'], json_file='json/weather.json') accident_tree = kdtree.KDTree(features=['latitude', 'longitude', 'date'], json_file='json/accident.json') print('done', end='\n\n') except: print( 'Unable to load trees. "json/accident.json" or "json/weather.json" may not exist.', file=sys.stderr) sys.exit(1) def convert_from_dms(degree, minute, second, direction): """ Convert from degrees-minute-second form to decimal form and return it. """ return direction * (degree + (1.0 / 60.0) * minute +
def check_nearest_accuracy():
    """
    This is a function for verifying the accuracy of results by calculating
    the actual nearest neighbors by brute force. Needless to say this is
    for debugging only.

    For every random test point the tree's nearest neighbor is compared
    with the closest data point found by an exhaustive scan. A summary of
    matches and of the traversal counter reported by the tree is printed.
    """
    print("Testing the accuracy of nearest neighbor results...")

    # Sample number points randomly on a square of specified origin and size.
    origin = {'x': 0, 'y': 0}
    size = 1000000
    number = 10000
    data = sample_square(origin, size, number)
    print(
        "Building a 2d-tree from {number} points sampled on a square of size {size}..."
        .format(size=size, number=number))
    dimensions = ['x', 'y']
    tree = kdtree.KDTree(data, dimensions)
    print("Tree constructed, there are {} total nodes.".format(
        tree.number_nodes))

    stats = {'nodes': 0}
    queries = 100
    print("Randomly generating {} test points for querying the tree...".format(
        queries))
    test_points = sample_square(origin, size, queries)
    print("Test points created.")

    print("Start queries...")
    stat_list = []
    result_list = []
    for test_point in test_points:
        # Find the single nearest neighbor to the query point
        nearest = tree.root.nearest(test_point, stats)
        stat_list.append(stats['nodes'])

        # Now calculate the actual distances of each point from the target
        # for the purpose of testing accuracy
        all_points = []
        for point in data:
            distance = kdtree.KDTreeNode.distance(point, test_point)
            all_points.append({
                'x': point['x'],
                'y': point['y'],
                'distance': distance
            })

        # Now sort the list of points by distance from the test point and
        # pull out the point with minimum distance
        all_points.sort(key=itemgetter('distance'))
        real_nearest = all_points[0]

        # Mark whether the result was correct or not in the result_list. So
        # if result_list[4] is false it means the nearest calculation was
        # wrong for test_points[4].
        # The == operator should work here because the numbers are
        # pulled/calculated in exactly the same way.
        correct = (real_nearest['x'] == nearest['point'].point['x'] and
                   real_nearest['y'] == nearest['point'].point['y'] and
                   real_nearest['distance'] == nearest['distance'])
        result_list.append(correct)

    # Print some stats to give an idea of the number of nodes traversed.
    print("Queries and testing finished.")
    frequencies = Counter(result_list)
    print("In {} queries, {} were correct and {} were incorrect.".format(
        queries, frequencies[True], frequencies[False]))

    stat_list.sort()
    print("In {} queries on the {}-node tree, the number nodes visited was:".
          format(queries, tree.number_nodes))
    print(" {} -> min".format(stat_list[0]))
    print(" {} -> average".format(sum(stat_list) / len(stat_list)))
    # Floor division keeps the index an int on Python 3 as well.
    print(" {} -> median".format(stat_list[len(stat_list) // 2]))
    print(" {} -> max".format(stat_list[-1]))