def compute_cluster_labels(feature_vectors, search_box_half_span, min_cluster_size):
    """Cluster a set of points with DBSCAN.

    DBSCAN looks for regions of high density in a set of points.
    Connected regions of high density are identified as clusters.
    Small regions of low density, or even single points, are
    identified as noise (belonging to no cluster).

    Args:
        feature_vectors: Indexable sequence of points to cluster.  The
            length of the first point selects which native DBSCAN
            routine is used.
        search_box_half_span: Size of the search box that defines
            "nearby" points.  It is converted to a feature vector
            exactly like the points are.
        min_cluster_size: Minimum number of points that you are
            willing to call a cluster.

    Returns:
        Whatever the selected native ``dbscan_learn_cluster_ids_<N>``
        routine returns for the converted inputs.
    """
    converted_points = [
        convert_to_feature_vector(point) for point in feature_vectors
    ]
    converted_half_span = convert_to_feature_vector(search_box_half_span)

    # The native module exposes one routine per point dimension; look
    # it up by name using the dimension of the first point.
    dimension = len(feature_vectors[0])
    engine = getattr(
        _dbscan_clustering,
        'dbscan_learn_cluster_ids_{}'.format(dimension))

    return engine(converted_points, converted_half_span, min_cluster_size)
def test_feature_vector_repr(dimension):
    """Check that repr() of a feature vector has the expected form.

    For a feature vector with components 0.5, 1.5, 2.5 the expected
    representation is
    'tracktable.domain.feature_vectors.FeatureVector3(0.5, 1.5, 2.5)'.

    Arguments:
        dimension {int}: How many components to give the feature vector

    Returns:
        0 on success, 1 on error (an error message is also logged)
    """
    components = [index + 0.5 for index in range(dimension)]
    my_feature_vector = convert_to_feature_vector(components)

    component_text = ', '.join(str(value) for value in components)
    expected_representation = (
        'tracktable.domain.feature_vectors.FeatureVector{}({})').format(
            dimension, component_text)

    if expected_representation == repr(my_feature_vector):
        return 0

    message = ('Expected repr(my_feature_vector) to be "'
               '{}" but got "{}" instead').format(
                   expected_representation, repr(my_feature_vector))
    logging.getLogger(__name__).error(message)
    return 1
def distance_geometry_signature(trajectory, num_control_points=4, normalize_distance=True):
    """Compute a distance-geometry signature for a trajectory.

    Control points are placed at evenly spaced fractions (0, 1/(n-1),
    ..., 1) along the trajectory using ``point_at_length_fraction``.
    The signature is the list of distances between pairs of control
    points, ordered from the widest step (first to last control point)
    down to adjacent control points.  It has n * (n - 1) / 2 entries.

    Args:
        trajectory: Trajectory to describe.
        num_control_points: Number of control points to sample.  Must
            be at least 2.
        normalize_distance: When true, every distance is divided by
            the largest distance in the signature so that shapes can
            be compared independent of scale.  If the largest distance
            is zero, every entry becomes 0.

    Returns:
        The signature converted with ``convert_to_feature_vector``.

    Raises:
        ValueError: If ``num_control_points`` is less than 2.
    """
    # Fewer than two control points would divide by zero below (n == 1)
    # or yield an empty, meaningless signature (n <= 0).
    if num_control_points < 2:
        raise ValueError(
            'num_control_points must be at least 2 (got {})'.format(
                num_control_points))

    # Fractions of the trajectory at which the control points sit.
    control_point_increment = 1.0 / (num_control_points - 1)
    control_point_fractions = [
        control_point_increment * i for i in range(num_control_points)
    ]
    control_points = [
        point_at_length_fraction(trajectory, t)
        for t in control_point_fractions
    ]

    # Distances between control points, widest separation first.
    signature = []
    for stepsize in range(num_control_points - 1, 0, -1):
        for start in range(0, num_control_points - stepsize):
            end = start + stepsize
            signature.append(
                distance(control_points[start], control_points[end]))

    # Normalize distances so that only the shape of the trajectory matters.
    if normalize_distance:
        largest_distance = max(signature)
        if largest_distance:
            signature = [d / largest_distance for d in signature]
        else:
            # Every control point coincides; avoid dividing by zero.
            signature = [0 for _ in signature]

    return convert_to_feature_vector(signature)
def __init__(self, points=None):
    """Set up the instance, optionally populating it with points.

    Args:
        points: Optional iterable of points.  When supplied, each one
            is converted with ``convert_to_feature_vector`` and the
            tree is built via ``self._setup_tree()``.  When omitted,
            the tree, the stored points and the feature vectors are
            all left as ``None``.
    """
    self._tree = None
    self._original_points = None
    # Always define this attribute so that code reading it on an
    # instance built without points does not hit AttributeError.
    self._feature_vectors = None

    if points is not None:
        self._original_points = points
        self._feature_vectors = [
            convert_to_feature_vector(p) for p in points
        ]
        self._setup_tree()
def points(self, new_points):
    """Replace the stored points and rebuild the tree.

    The supplied points are copied into a new list, each one is
    converted with ``convert_to_feature_vector``, and
    ``self._setup_tree()`` is called.  Nothing happens when the new
    points compare equal to the points already stored.

    NOTE: This version of the code does copy the points.  A future
    version might get around that.

    Args:
        new_points: List of points to use
    """
    unchanged = not (new_points != self._original_points)
    if unchanged:
        return

    copied_points = list(new_points)
    self._original_points = copied_points
    self._feature_vectors = [
        convert_to_feature_vector(point) for point in copied_points
    ]
    self._setup_tree()
def find_nearest_neighbors(self, seed_point, num_neighbors):
    """Ask the underlying tree for the nearest neighbors of a point.

    Args:
        seed_point: Point to search around.  It is converted with
            ``convert_to_feature_vector`` before the query.
        num_neighbors: Passed through unchanged to the tree's
            ``find_nearest_neighbors``.

    Returns:
        Whatever ``self._tree.find_nearest_neighbors`` returns.
    """
    query = self._tree.find_nearest_neighbors
    native_seed = convert_to_feature_vector(seed_point)
    return query(native_seed, num_neighbors)
def find_points_in_box(self, min_corner, max_corner):
    """Ask the underlying tree for the points inside a box.

    Args:
        min_corner: One corner of the box.  Converted with
            ``convert_to_feature_vector`` before the query.
        max_corner: The opposite corner of the box.  Converted the
            same way.

    Returns:
        Whatever ``self._tree.find_points_in_box`` returns.
    """
    query = self._tree.find_points_in_box
    native_min = convert_to_feature_vector(min_corner)
    native_max = convert_to_feature_vector(max_corner)
    return query(native_min, native_max)
def find_nearest_neighbors(self, seed_point, num_neighbors):
    """Return the tree's nearest-neighbor result for a seed point.

    The seed point is converted with ``convert_to_feature_vector`` and
    the query is delegated to ``self._tree.find_nearest_neighbors``.

    Args:
        seed_point: Point to search around.
        num_neighbors: Forwarded unchanged to the tree.

    Returns:
        The value returned by ``self._tree.find_nearest_neighbors``.
    """
    tree_query = self._tree.find_nearest_neighbors
    seed_vector = convert_to_feature_vector(seed_point)
    return tree_query(seed_vector, num_neighbors)
def find_points_in_box(self, min_corner, max_corner):
    """Return the tree's result for a box query.

    Both corners are converted with ``convert_to_feature_vector`` and
    the query is delegated to ``self._tree.find_points_in_box``.

    Args:
        min_corner: One corner of the search box.
        max_corner: The opposite corner of the search box.

    Returns:
        The value returned by ``self._tree.find_points_in_box``.
    """
    tree_query = self._tree.find_points_in_box
    low_corner = convert_to_feature_vector(min_corner)
    high_corner = convert_to_feature_vector(max_corner)
    return tree_query(low_corner, high_corner)
def get_features(trajectory):
    """Build a two-component feature vector for a trajectory.

    The components are, in order, ``cha(trajectory)`` and the distance
    between the first and last points of the trajectory.

    Args:
        trajectory: Indexable trajectory with a length.

    Returns:
        The two values converted with ``convert_to_feature_vector``.
    """
    hull_value = cha(trajectory)
    last_index = len(trajectory) - 1
    end_to_end = distance(trajectory[0], trajectory[last_index])
    return convert_to_feature_vector([hull_value, end_to_end])
def compute_cluster_labels(feature_vectors, search_box_half_span, min_cluster_size):
    """Use DBSCAN to compute clusters for a set of points.

    DBSCAN is a clustering algorithm that looks for regions of high
    density in a set of points.  Connected regions of high density are
    identified as clusters.  Small regions of low density or even
    single points get identified as noise (belonging to no cluster).

    Points may be supplied bare or "decorated".  A decorated entry is
    a pair ``(point, my_vertex_id)``: the point comes first and your
    own identifier second.  Decoration is detected from the first
    entry only: it must have length 2 and its first element must
    itself have a non-zero length.

    Args:
        feature_vectors: Sequence of points, or of
            ``(point, my_vertex_id)`` pairs.
        search_box_half_span: Size of the search box that defines
            "nearby" points.  Converted to a feature vector like the
            points are.
        min_cluster_size: Minimum number of points that you are
            willing to call a cluster.

    Returns:
        A list of ``(vertex_id, cluster_id)`` pairs.  For bare points
        the vertex IDs are indices into ``feature_vectors``; for
        decorated points they are the IDs you supplied.  An empty
        input yields an empty list.
    """
    # Nothing to cluster; also avoids indexing into an empty sequence.
    if len(feature_vectors) == 0:
        return []

    logger = logging.getLogger(__name__)
    first_point = feature_vectors[0]
    logger.debug("Testing for point decoration.  First point: {}".format(
        first_point))

    # Are we dealing with decorated points?
    decorated_points = False
    try:
        if len(first_point) == 2 and len(first_point[0]) > 0:
            decorated_points = True
    except TypeError:
        # The first element of the entry has no len().  It is probably
        # a coordinate, meaning we have bare points.
        pass

    if decorated_points:
        logger.debug(
            ("Points are decorated.  First point: {}").format(
                first_point))
        bare_points = [entry[0] for entry in feature_vectors]
        vertex_ids = [entry[1] for entry in feature_vectors]
    else:
        logger.debug("Points are not decorated")
        bare_points = feature_vectors
        vertex_ids = list(range(len(feature_vectors)))

    native_feature_vectors = [
        convert_to_feature_vector(p) for p in bare_points
    ]
    native_box_half_span = convert_to_feature_vector(search_box_half_span)

    # The native module exposes one routine per point dimension.
    point_size = len(bare_points[0])
    cluster_engine_name = 'dbscan_learn_cluster_ids_{}'.format(point_size)
    dbscan_learn_cluster_labels = getattr(_dbscan_clustering,
                                          cluster_engine_name)

    integer_labels = dbscan_learn_cluster_labels(
        native_feature_vectors,
        native_box_half_span,
        min_cluster_size
    )

    # Map the engine's positional indices back to the caller's IDs.
    return [
        (vertex_ids[vertex_index], cluster_id)
        for (vertex_index, cluster_id) in integer_labels
    ]