Пример #1
0
 def calculate_distance(self):
     """Sum the length of every route and the amount by which routes exceed their depot's limit.

     Each route is measured as depot -> customers in route order -> depot using
     euclidean_distance. Route lengths are memoised in Chromosome.route_memo under
     hash((depot_id, tuple(route))). A depot limit of 0 means the route is not checked.

     :return: tuple (total distance, total excess over the per-depot limits)
     """
     total_distance = 0
     total_excess = 0
     for depot_id, depot_routes in self.routes.items():
         depot_xy = self.depots[depot_id][0]
         duration_limit = self.depots[depot_id][1]
         for route in depot_routes:
             # NOTE(review): the memo is keyed by the hash value, not by the tuple itself,
             # so two different routes with colliding hashes would share one entry.
             memo_key = hash((depot_id, tuple(route)))
             if memo_key not in Chromosome.route_memo:
                 # Memo miss: walk depot -> customers -> depot and remember the length.
                 stops = [depot_xy] + [self.customers[stop][0] for stop in route] + [depot_xy]
                 length = 0
                 for here, there in zip(stops, stops[1:]):
                     length += euclidean_distance(here, there)
                 Chromosome.route_memo[memo_key] = length
             length = Chromosome.route_memo[memo_key]
             total_distance += length
             if duration_limit != 0 and length > duration_limit:
                 total_excess += length - duration_limit
     return total_distance, total_excess
    def get_nearest_neighbhours(self, test_instance, training_data=None, k_neighbours=None):
        """
        Computes the euclidean distance from the test instance to every training item and returns the
        K closest items.
        :param test_instance: the feature vector every training item is compared against
        :param training_data: iterable of items whose first element (item[0]) is the feature vector;
                              falls back to self.training_data when omitted
        :param k_neighbours: how many of the closest items to return; defaults to 51 when omitted
        :return: a list of at most K dicts {"item": <training item>, "distance": <distance>}, sorted by
                 ascending distance
        :raises ValueError: if no training data is passed and self.training_data is None
        """
        if k_neighbours is None:
            k_neighbours = 51

        if training_data is None:
            training_data = self.training_data
            if training_data is None:
                raise ValueError("KNN Model has not been given with any training data to compute neighbours")

        items_with_distances = [
            {"item": item, "distance": util.euclidean_distance(item[0], test_instance)}
            for item in training_data
        ]
        # Order the candidates by their 'distance' entry, closest first.
        items_with_distances.sort(key=lambda entry: entry["distance"])
        # k_neighbours can no longer be None here, so the slice is always applied.
        return items_with_distances[:k_neighbours]
Пример #3
0
    def cluster(self, vectors, assign_clusters=False,ClusterNum=None, DisType='euc',Stype='mean',trace=False):
        """Rebuild the pairwise distance map for ``vectors`` and run the base clusterer.

        ``DisType`` selects the metric: 'cos' uses cosine_distance, 'euc' uses
        euclidean_distance; any other value leaves the distance map empty.
        The remaining arguments are forwarded unchanged to VectorSpaceClusterer.cluster.
        Returns [] for empty input, otherwise the base clusterer's result.
        """
        # The map holds distances of the previous sample set; it must be reset
        # before clustering a different one.
        self._distMap.clear()

        count = len(vectors)
        if not count:
            return []

        if DisType == 'cos':
            metric = cosine_distance
        elif DisType == 'euc':
            metric = euclidean_distance
        else:
            metric = None

        if metric is not None:
            # Only the upper triangle (i < j) is stored, keyed by the index pair.
            for first in range(count):
                for second in range(first + 1, count):
                    self._distMap[(first, second)] = metric(vectors[first], vectors[second])

        result = VectorSpaceClusterer.cluster(self, vectors, assign_clusters, ClusterNum, Stype, trace)

        # Two-dimensional samples are additionally visualised.
        if len(vectors[0]) == 2:
            draw_2D_cluster(vectors, result)

        return result
Пример #4
0
    def cluster(self,
                vectors,
                assign_clusters=False,
                ClusterNum=None,
                DisType='cos',
                Stype='avg',
                trace=False):
        """Rebuild the distance map and dendrogram for ``vectors``, then run the base clusterer.

        ``DisType`` selects the metric: 'cos' uses cosine_distance, 'euc' uses
        euclidean_distance; any other value leaves the distance map empty.
        The remaining arguments are forwarded unchanged to VectorSpaceClusterer.cluster,
        whose result is returned.
        """
        # Distances from the previous sample set must be discarded before
        # clustering a different one.
        self._distMap.clear()

        if DisType == 'cos':
            metric = cosine_distance
        elif DisType == 'euc':
            metric = euclidean_distance
        else:
            metric = None

        if metric is not None:
            total = len(vectors)
            # Store each unordered pair once, keyed by (smaller index, larger index).
            for a in range(total):
                for b in range(a + 1, total):
                    self._distMap[(a, b)] = metric(vectors[a], vectors[b])

        leaves = [numpy.array(vector, numpy.float64) for vector in vectors]
        self._dendrogram = Dendrogram(leaves)
        result = VectorSpaceClusterer.cluster(self, vectors, assign_clusters,
                                              ClusterNum, Stype, trace)

        # Two-dimensional samples are additionally visualised.
        if len(vectors[0]) == 2:
            self.draw_2D(vectors, result)

        return result
Пример #5
0
    def __init__(self, position: VECTOR, radius: float, center: VECTOR):
        """Store the body's geometry and derive its orbit radius and rotation step.

        orbit_radius is euclidean_distance(*position, *center); rotation_angle is
        365 / (radius * orbit_radius), divided by 10.
        """
        self.position, self.radius, self.center = position, radius, center

        self.orbit_radius = euclidean_distance(*position, *center)
        # The trailing division by 10 is an arbitrary constant to slow things down.
        self.rotation_angle = 365 / (self.radius * self.orbit_radius) / 10
Пример #6
0
 def get_swap_cluster(self) -> Dict:
     """List, for every customer, the depots it may be swapped to.

     A depot qualifies when its distance d to the customer satisfies
     (d - d_min) / d_min <= 2, where d_min is the distance to the customer's
     nearest depot.

     :return: defaultdict mapping customer id -> list of qualifying depot ids,
              in the iteration order of self.depots
     """
     swap_cluster = defaultdict(list)
     for customer_id in self.customers:
         customer_coordinate = self.customers[customer_id][0]
         # Measure every customer/depot pair once and reuse it for both the
         # minimum search and the filter.
         depot_distances = [
             (depot_id, euclidean_distance(customer_coordinate, self.depots[depot_id][0]))
             for depot_id in self.depots
         ]
         if not depot_distances:
             continue
         nearest = min(distance for _, distance in depot_distances)
         for depot_id, distance in depot_distances:
             if nearest == 0:
                 # The customer sits exactly on a depot: the relative slack would be a
                 # division by zero, so only depots at distance 0 qualify.
                 qualifies = distance == 0
             else:
                 qualifies = (distance - nearest) / nearest <= 2
             if qualifies:
                 swap_cluster[customer_id].append(depot_id)
     return swap_cluster
Пример #7
0
def make_label(dim, radius):
    """Build a dim x dim label grid: 1 inside a centred disc, -1 everywhere else.

    A cell (i, j) is set to 1 when euclidean_distance(i, j, center, center) <= radius,
    with center = int(dim / 2.0).

    :param dim: side length of the square grid
    :param radius: disc radius in cells
    :return: numpy array of shape (dim, dim) holding -1 and 1
    """
    label = np.full((dim, dim), -1)
    center = int(dim / 2.0)
    reach = ceil(radius)
    # Clamp the scan window to the grid. Without this a radius larger than the
    # centre offset yields negative indices (which silently wrap around to the far
    # edge) or indices >= dim (IndexError).
    start = max(0, center - reach)
    end = min(dim - 1, center + reach)
    for i in inclusive_range(start, end):
        for j in inclusive_range(start, end):
            if euclidean_distance(i, j, center, center) <= radius:
                label[i, j] = 1
    return label
Пример #8
0
 def get_neighbourhood(self, index: tensor, radius: float) -> List[Tuple]:
     """Collect the grid cells within ``radius`` of ``index``, excluding ``index`` itself.

     Scans every (row, col) with row < self.output_rows and col < self.output_cols.

     :return: list of ((row, col), distance) pairs in row-major scan order
     """
     neighbours = []
     for row in range(self.output_rows):
         for col in range(self.output_cols):
             cell = (row, col)
             cell_tensor = tensor(list(cell))
             distance = euclidean_distance(index, cell_tensor)
             # Keep cells inside the radius, but never the queried cell itself.
             if distance <= radius and not np.array_equal(index, cell_tensor):
                 neighbours.append((cell, distance))
     return neighbours
Пример #9
0
def get_relational_features(target, landmark):
    """Describe where ``landmark`` lies relative to ``target``.

    Both arguments are mappings with 'pos_x' and 'pos_y' entries.

    Returns a dict with:
      'ab'    -- 0 if the landmark's y is smaller than the target's, else 1
      'lr'    -- 0 if the landmark's x is larger than the target's, else 1
      'xdiff' -- landmark x minus target x
      'ydiff' -- landmark y minus target y
      'dist'  -- util.euclidean_distance between the two points
    """
    target_xy = (target['pos_x'], target['pos_y'])
    landmark_xy = (landmark['pos_x'], landmark['pos_y'])
    dx = landmark_xy[0] - target_xy[0]
    dy = landmark_xy[1] - target_xy[1]
    separation = util.euclidean_distance(target_xy, landmark_xy)

    if dy < 0:
        above_below = 0
    else:
        above_below = 1
    if dx > 0:
        left_right = 0
    else:
        left_right = 1

    return {
        'ab': above_below,
        'lr': left_right,
        'xdiff': dx,
        'ydiff': dy,
        'dist': separation,
    }
Пример #10
0
 def calculate_food_distance(self, game_state, pacman_position, food_position):
     """Return util.euclidean_distance between Pacman's position and the food position.

     ``game_state`` is accepted but not used here. The docstring previously held a
     disabled variant that measured the length of an A* path
     (graphSearchProblem.PositionSearchProblem / aStarSearch) instead.
     """
     return util.euclidean_distance(pacman_position, food_position)
Пример #11
0
def averageCost(data, costF_idx, medoids_idx, cacheOn=False):
    '''
    Assign every data point to its cheapest medoid and return the summed
    per-cluster average cost together with the clustering.

    data        -- indexable collection of points; data[k] is passed to the cost function
    costF_idx   -- 0: euclidean_distance, 1: manhattan_distance, 2: pearson_distance
    medoids_idx -- iterable of indices into data that act as medoids
    cacheOn     -- when true, pairwise costs are read from and stored in the
                   module-level distances_cache under the key (medoid, point)

    Returns (avg_cost, medoids): medoids maps each medoid index to the list of
    point indices assigned to it; avg_cost is the sum over medoids of
    (total cost of the cluster / size of the cluster). Empty clusters add nothing.

    Raises ValueError when a cost has to be computed and costF_idx is not 0, 1 or 2.
    '''
    # Init the cluster
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        for m in medoids:
            # None means "cost not known yet" (cache disabled or cache miss).
            tmp = distances_cache.get((m, i)) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                else:
                    # The old message had no conversion specifier, so formatting it
                    # raised TypeError; fail with a clear error instead.
                    raise ValueError('Error: unknown cost function idx: %s' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        medoids[choice].append(i)
        total_cost[choice] += min_cost

    # Compute the average cost
    avg_cost = 0.0
    for idx in medoids_idx:
        members = medoids[idx]
        if members:  # an empty cluster would otherwise divide by zero
            avg_cost += total_cost[idx] / len(members)

    # Return the average cost and clustering
    return (avg_cost, medoids)
Пример #12
0
def averageCost(data, costF_idx, medoids_idx, cacheOn=False):
    '''
    Assign every data point to its cheapest medoid and return the summed
    per-cluster average cost together with the clustering.

    data        -- indexable collection of points; data[k] is passed to the cost function
    costF_idx   -- 0: euclidean_distance, 1: manhattan_distance, 2: pearson_distance
    medoids_idx -- iterable of indices into data that act as medoids
    cacheOn     -- when true, pairwise costs are read from and stored in the
                   module-level distances_cache under the key (medoid, point)

    Returns (avg_cost, medoids): medoids maps each medoid index to the list of
    point indices assigned to it; avg_cost is the sum over medoids of
    (total cost of the cluster / size of the cluster). Empty clusters add nothing.

    Raises ValueError when a cost has to be computed and costF_idx is not 0, 1 or 2.
    '''
    # Init the cluster
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        for m in medoids:
            # None means "cost not known yet" (cache disabled or cache miss).
            tmp = distances_cache.get((m, i)) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                else:
                    # The old message had no conversion specifier, so formatting it
                    # raised TypeError; fail with a clear error instead.
                    raise ValueError('Error: unknown cost function idx: %s' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        medoids[choice].append(i)
        total_cost[choice] += min_cost

    # Compute the average cost
    avg_cost = 0.0
    for idx in medoids_idx:
        members = medoids[idx]
        if members:  # an empty cluster would otherwise divide by zero
            avg_cost += total_cost[idx] / len(members)

    # Return the average cost and clustering
    return (avg_cost, medoids)
Пример #13
0
 def get_customer_cluster(self) -> Dict:
     """Group customer ids by the depot that is closest to them.

     Distances come from euclidean_distance on the first element of each
     customer / depot entry. On ties the depot met first while iterating
     self.depots wins. A customer for which no depot beats float('inf') is
     filed under the key None.

     :return: defaultdict mapping depot id -> list of customer ids
     """
     groups = defaultdict(list)
     for customer_id in self.customers:
         origin = self.customers[customer_id][0]
         nearest_depot, nearest_distance = None, float('inf')
         for depot_id in self.depots:
             candidate = euclidean_distance(origin, self.depots[depot_id][0])
             if candidate < nearest_distance:
                 nearest_depot, nearest_distance = depot_id, candidate
         groups[nearest_depot].append(customer_id)
     return groups
Пример #14
0
    def __get_neighbors(self, data, k):
        """Return the k training rows closest to ``data``.

        Each returned row is the training feature list with its label appended
        (self.train_x[i] + [self.train_y[i]]). Rows are ordered by ascending
        euclidean_distance to ``data``. When k exceeds the number of training
        rows, all rows are returned; a non-positive k returns an empty list.
        """
        distances = []
        for i, features in enumerate(self.train_x):
            dist = euclidean_distance(data, features)
            distances.append((features + [self.train_y[i]], dist))

        distances.sort(key=operator.itemgetter(1))

        # Slicing (instead of indexing 0..k-1) tolerates k > len(distances);
        # max() keeps a negative k from being read as "all but the last |k|".
        return [row for row, _ in distances[:max(k, 0)]]
Пример #15
0
def cluster_points(points, cluster_dist=7):
    """Greedily merge nearby points and return one centroid per cluster.

    Repeatedly takes the first remaining point as a seed, groups it with every
    remaining point whose util.euclidean_distance to the seed is below
    ``cluster_dist``, appends util.get_centroid of that group to the result and
    removes the group from the pool.

    :param points: sequence of points (converted with np.array)
    :param cluster_dist: distance threshold (exclusive) for joining the seed's cluster
    :return: list of centroids, one per cluster, in discovery order
    """
    old_points = np.array(points)
    new_points = []

    # "> 0" rather than "> 1": the last remaining point forms a cluster of its own
    # instead of being dropped.
    while len(old_points) > 0:
        p1 = old_points[0]
        distances = np.array(
            [util.euclidean_distance(p1, p2) for p2 in old_points])
        idx = (distances < cluster_dist)
        # The seed always belongs to its own cluster. This guarantees that the pool
        # shrinks every round, even when cluster_dist <= 0 or a distance is NaN.
        idx[0] = True
        points_cluster = old_points[idx]
        centroid = util.get_centroid(points_cluster)
        new_points.append(centroid)
        old_points = old_points[np.invert(idx)]

    return new_points
Пример #16
0
    def update_distance_from_car(self, car_pose):
        """Refresh distance_from_car and record whether the car got closer or farther.

        The new distance is util.euclidean_distance(self.x, self.y,
        car_pose.position.x, car_pose.position.y). shift_relative_to_car is only
        touched when a previous distance exists and differs (per floats_equal)
        from the new one.
        """
        current = util.euclidean_distance(self.x,
                                          self.y,
                                          car_pose.position.x,
                                          car_pose.position.y)
        previous = self.distance_from_car

        # NOTE(review): a previous distance of exactly 0.0 is falsy and therefore
        # treated like "no previous distance" - confirm this is intended.
        if previous and not floats_equal(current, previous):
            if current < previous:
                self.shift_relative_to_car = LIGHT_GETTING_CLOSER
            elif current > previous:
                self.shift_relative_to_car = LIGHT_GETTING_FARTHER

        self.distance_from_car = current
Пример #17
0
    def cluster(self,
                vectors,
                assign_clusters=False,
                ClusterNum=None,
                DisType='euc',
                Stype='mean',
                trace=False):
        """Rebuild the pairwise distance map for ``vectors`` and run the base clusterer.

        ``DisType`` selects the metric: 'cos' uses cosine_distance, 'euc' uses
        euclidean_distance; any other value leaves the distance map empty.
        The remaining arguments are forwarded unchanged to VectorSpaceClusterer.cluster.
        Returns [] for empty input, otherwise the base clusterer's result.
        """
        # Stale distances from an earlier sample set must not leak into this run.
        self._distMap.clear()

        sample_count = len(vectors)
        if sample_count == 0:
            return []

        if DisType == 'cos':
            distance_of = cosine_distance
        elif DisType == 'euc':
            distance_of = euclidean_distance
        else:
            distance_of = None

        if distance_of is not None:
            # Upper triangle only: every unordered pair is stored once as (low, high).
            for low in range(sample_count):
                for high in range(low + 1, sample_count):
                    self._distMap[(low, high)] = distance_of(
                        vectors[low], vectors[high])

        result = VectorSpaceClusterer.cluster(self, vectors, assign_clusters,
                                              ClusterNum, Stype, trace)

        # Two-dimensional samples are additionally visualised.
        if len(vectors[0]) == 2:
            draw_2D_cluster(vectors, result)

        return result
Пример #18
0
    def get_features(self, data):
        """Reduce every row of ``data`` to its feature entries, in place.

        For each key of ``data`` the row data[key] loses the entries
        'episode_id', 'position', 'id', 'v_skew', 'h_skew' and 'orientation'
        (a missing entry raises KeyError) and gains 'c_diff', the
        util.euclidean_distance between (320, 240) and (row['pos_x'], row['pos_y']).

        Returns the same ``data`` object.
        """
        dropped_keys = ('episode_id', 'position', 'id', 'v_skew', 'h_skew', 'orientation')
        center = (320, 240)

        for eid in data:
            row = data[eid]
            for key in dropped_keys:
                del row[key]
            # distance from center
            row['c_diff'] = util.euclidean_distance(center, (row['pos_x'], row['pos_y']))

        return data
Пример #19
0
    def cluster(self, vectors, assign_clusters=False, DisType='cos',Stype='avg',trace=False):
        """Rebuild the distance map and dendrogram for ``vectors``, then run the base clusterer.

        ``DisType`` selects the metric: 'cos' uses cosine_distance, 'euc' uses
        euclidean_distance; any other value leaves the distance map empty.
        Returns whatever VectorSpaceClusterer.cluster returns.
        """
        # The map must be reset before clustering a different sample set.
        self._distMap.clear()

        if DisType == 'cos':
            measure = cosine_distance
        elif DisType == 'euc':
            measure = euclidean_distance
        else:
            measure = None

        if measure is not None:
            size = len(vectors)
            for row in range(size):
                for col in range(row + 1, size):
                    self._distMap[(row, col)] = measure(vectors[row], vectors[col])

        leaves = [numpy.array(vector, numpy.float64) for vector in vectors]
        self._dendrogram = Dendrogram(leaves)
        return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, Stype, trace)
Пример #20
0
    def cluster(self, matrix):
        """Rank samples by the distance to their m-th (m = 3) nearest neighbour.

        Fills self.distList with the symmetric euclidean_distance matrix of the
        rows of ``matrix`` (the diagonal is set to infinity), then:

        * ADist -- the m-th smallest distance of every sample, sorted ascending;
        * noise -- the indices of the len(matrix) // 5 samples whose m-th nearest
                   neighbour is farthest away.

        Both lists (and the ranked neighbour table) are printed.
        Needs either no samples or at least m samples; fewer raise IndexError.

        Returns (ADist, noise).
        """
        l = len(matrix)
        #---------------------------------------------------------------------------------------
        # Builtin float replaces the removed numpy alias np.float (same dtype).
        self.distList = np.zeros((l, l), float)
        for i in range(l):
            self.distList[i][i] = float('inf')      # a sample is never compared with itself
            for j in range(i + 1, l):
                self.distList[i][j] = euclidean_distance(np.array(matrix[i]), np.array(matrix[j]))
                self.distList[j][i] = self.distList[i][j]
        #----------------------------------------------------------------------------------------
        m = 3
        marks = list(range(l))

        # For sample i: the m closest samples as (index, distance) pairs, nearest first.
        mostSimList = []
        for i in range(l):
            row = zip(marks, self.distList[i].tolist())
            mostSimList.append(sorted(row, key=lambda x: x[1])[0:m])

        # Sort once after collecting instead of re-sorting on every append.
        ADist = sorted(neighbours[m - 1][1] for neighbours in mostSimList)

        mostSimList = sorted(zip(marks, mostSimList),
                             key=lambda x: x[1][m - 1][1], reverse=True)

        # Floor division keeps the slice bound an integer under true division as well.
        noise = [entry[0] for entry in mostSimList[0:l // 5]]

        #-----------------------------------------------------------------------------------------
        print(mostSimList)
        print(ADist)
        print(noise)
        return ADist, noise
Пример #21
0
def targetFunction(data,
                   costF_idx,
                   medoids_idx,
                   cacheOn=False,
                   distDict={},
                   simDict={},
                   affinities={},
                   costType=CostType,
                   namedPoints=True):
    '''
    Assign every data point to a medoid and evaluate the chosen target function.

    data        -- indexable collection of points; data[k] is passed to the cost function
    costF_idx   -- 0: euclidean_distance, 1: manhattan_distance, 2: pearson_distance,
                   3: direct_distance (uses distDict), 4: similarity_distance (uses simDict)
    medoids_idx -- indices into data that act as medoids
    cacheOn     -- when true, pairwise costs are read from and stored in the
                   module-level distances_cache under the key (medoid, point)
    distDict, simDict, affinities -- passed through to the cost functions and
                   modularity(); this function itself never modifies them
    costType    -- "total", "average" or "modularity"
    namedPoints -- for "modularity": pass clusters keyed by data values
                   instead of by indices

    Returns (value, medoids) where medoids maps medoid index -> assigned point indices:
      "total"      -> sum of all cluster costs
      "average"    -> sum over medoids of cluster cost / cluster size
                      (empty clusters add nothing)
      "modularity" -> the negated modularity(...) value
    Returns 1 (after printing a message) for an unknown costType.

    Raises ValueError when a cost has to be computed and costF_idx is not 0..4.
    '''
    if costType not in ["total", "average", "modularity"]:
        print("unknown target function - check the global variables in the code")
        return (1)

    # Init the cluster
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0
    assignErrors = []

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        # medoids themselves are also included into resulting cluster lists
        for m in medoids:
            # None means "cost not known yet" (cache disabled or cache miss).
            tmp = distances_cache.get((m, i)) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    tmp = similarity_distance(data[m], data[i], simDict)
                else:
                    # The old message had no conversion specifier, so formatting it
                    # raised TypeError; fail with a clear error instead.
                    raise ValueError('Error: unknown cost function idx: %s' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp

            # Randomization for nodes/points isolated from all the medoids
            # in order to assign them to random clusters. Hope averaging will
            # be able to glean cases for which some medoids did appear in the
            # same connected component, and group those nodes together.
            if tmp == 0.0 and min_cost == 0.0:  # no connection to either medoid
                rv = bernoulli.rvs(1. / len(medoids_idx), size=1)
                if rv[0] == 1.:
                    choice = m
            elif tmp < min_cost:
                choice = m
                min_cost = tmp
        # Done the clustering
        if choice == -1:
            print("ERROR: the node cannot be assigned")
            assignErrors.append(i)
        else:
            medoids[choice].append(i)
            total_cost[choice] += min_cost

    # Compute the target function
    if costType == "total":
        return (sum(total_cost.values()), medoids)

    if costType == "average":
        avg_cost = 0.0
        for idx in medoids_idx:
            members = medoids[idx]
            if members:  # an empty cluster would otherwise divide by zero
                avg_cost += total_cost[idx] / len(members)
        return (avg_cost, medoids)

    # Only "modularity" is left: costType was validated at the top.
    if namedPoints:
        # If the points are named, hand the names over instead of the indices
        clusters = {}
        for medID in medoids_idx:
            clusters[data[medID]] = []
            for pointID in medoids[medID]:
                clusters[data[medID]].append(data[pointID])
    else:
        clusters = medoids
    # "-" because we maximize modularity
    mod = -modularity(data,
                      COST=costF_idx,
                      distDict=distDict,
                      edgeDict=affinities,
                      medoids=clusters)
    print("modularity computed")

    if len(assignErrors) > 0:
        print("unassigned nodes:  %s" % (assignErrors,))
    else:
        print("no unassigned nodes, all right")

    return (mod, medoids)
Пример #22
0
def is_BIH_inlier(all_BIH_ip, corner, pix_dist=5):
    """Return True if any point of ``all_BIH_ip`` lies within ``pix_dist`` of ``corner``.

    Distances come from util.euclidean_distance. The generator lets any() stop
    at the first match instead of measuring every point first.
    """
    return any(util.euclidean_distance(ip, corner) <= pix_dist
               for ip in all_BIH_ip)
Пример #23
0
 def pacman_will_die(self, next_pacman_position, next_ghost_positions):
     """Tell whether any ghost's next position is dangerously close to Pacman's.

     A ghost is dangerous when util.euclidean_distance between the two next
     positions is at most ReflexAgent.pacman_distance_from_ghost_coefficient.
     """
     return any(
         util.euclidean_distance(next_pacman_position, ghost_position)
         <= ReflexAgent.pacman_distance_from_ghost_coefficient
         for ghost_position in next_ghost_positions
     )
Пример #24
0
def totalCost(data, costF_idx, medoids_idx, cacheOn=CacheOn, distDict={}, simDict={}, acceleration=0):
    '''
    Compute the total cost and do the clustering based on certain cost function
    (that is, assign each data point to certain cluster given the medoids)

    data         -- indexable collection of points; data[k] is passed to the cost function
    costF_idx    -- 0: euclidean_distance, 1: manhattan_distance, 2: pearson_distance,
                    3: direct_distance (uses distDict), 4: similarity_distance (uses simDict)
    medoids_idx  -- indices into data that act as medoids
    cacheOn      -- when true, pairwise costs are read from and stored in the
                    module-level distances_cache under the key (medoid, point)
    acceleration -- when 2, the clustering is returned as
                    {str(position): {'med': medoid, 'nodes': [points]}}

    Returns (total_cost, medoids). A point whose cheapest cost is exactly 0 is
    left out of every cluster list; its cost (0) is still added to the total.

    Raises ValueError when a cost has to be computed and costF_idx is not 0..4.
    A failure inside similarity_distance is reported on stdout and re-raised.
    '''
    # Init the cluster
    size = len(data)
    total_cost = 0.0
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        for m in medoids:
            # None means "cost not known yet" (cache disabled or cache miss).
            tmp = distances_cache.get((m, i)) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    try:
                        tmp = similarity_distance(data[m], data[i], simDict)
                    except Exception:
                        # Show the offending pair, then propagate: carrying on would
                        # silently reuse a stale (or missing) cost for this pair.
                        print("%s %s" % (m, i))
                        print(data[m])
                        print(data[i])
                        raise
                else:
                    # Merely printing here let the loop continue with a stale cost.
                    raise ValueError('Error: unknown cost function idx: %s' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp
            # Clustering
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        # Done the clustering; a cheapest cost of 0 (0 similarity to all the
        # medoids) leaves the point unassigned.
        if min_cost != 0:
            medoids[choice].append(i)
        total_cost += min_cost

    if acceleration == 2:
        transformed_medoids = {}
        for position, m in enumerate(medoids):
            transformed_medoids[str(position)] = {'med': m, 'nodes': medoids[m]}
        return (total_cost, transformed_medoids)

    # Return the total cost and clustering
    return (total_cost, medoids)
Пример #25
0
 def overlaps(self, other: "Planet") -> bool:
     """Return True when the two planets' centres are no farther apart than the sum of their radii."""
     reach = self.radius + other.radius
     return euclidean_distance(*self.position, *other.position) <= reach
Пример #26
0
 def _overlaps(self, other: "Cell") -> bool:
     """Return True when the centre distance of the two cells is at most the sum of their radii."""
     return euclidean_distance(self.x, self.y, other.x, other.y) <= self.radius + other.radius
Пример #27
0
def totalCost(data,
              costF_idx,
              medoids_idx,
              cacheOn=CacheOn,
              distDict={},
              simDict={},
              acceleration=0):
    '''
    Compute the total cost and do the clustering based on certain cost function
    (that is, assign each data point to certain cluster given the medoids)

    data         -- indexable collection of points; data[k] is passed to the cost function
    costF_idx    -- 0: euclidean_distance, 1: manhattan_distance, 2: pearson_distance,
                    3: direct_distance (uses distDict), 4: similarity_distance (uses simDict)
    medoids_idx  -- indices into data that act as medoids
    cacheOn      -- when true, pairwise costs are read from and stored in the
                    module-level distances_cache under the key (medoid, point)
    acceleration -- when 2, the clustering is returned as
                    {str(position): {'med': medoid, 'nodes': [points]}}

    Returns (total_cost, medoids). A point whose cheapest cost is exactly 0 is
    left out of every cluster list; its cost (0) is still added to the total.

    Raises ValueError when a cost has to be computed and costF_idx is not 0..4.
    A failure inside similarity_distance is reported on stdout and re-raised.
    '''
    # Init the cluster
    size = len(data)
    total_cost = 0.0
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        for m in medoids:
            # None means "cost not known yet" (cache disabled or cache miss).
            tmp = distances_cache.get((m, i)) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    try:
                        tmp = similarity_distance(data[m], data[i], simDict)
                    except Exception:
                        # Show the offending pair, then propagate: carrying on would
                        # silently reuse a stale (or missing) cost for this pair.
                        print("%s %s" % (m, i))
                        print(data[m])
                        print(data[i])
                        raise
                else:
                    # Merely printing here let the loop continue with a stale cost.
                    raise ValueError('Error: unknown cost function idx: %s' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp
            # Clustering
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        # Done the clustering; a cheapest cost of 0 (0 similarity to all the
        # medoids) leaves the point unassigned.
        if min_cost != 0:
            medoids[choice].append(i)
        total_cost += min_cost

    if acceleration == 2:
        transformed_medoids = {}
        for position, m in enumerate(medoids):
            transformed_medoids[str(position)] = {'med': m, 'nodes': medoids[m]}
        return (total_cost, transformed_medoids)

    # Return the total cost and clustering
    return (total_cost, medoids)
Пример #28
0
def targetFunction(data, costF_idx, medoids_idx, cacheOn=False, distDict={},
                   simDict={}, affinities={}, costType=CostType,
                   namedPoints=True):
    '''
    Assign every data point to a medoid and evaluate the chosen target function.

    data        -- indexable collection of points; data[k] is passed to the cost function
    costF_idx   -- 0: euclidean_distance, 1: manhattan_distance, 2: pearson_distance,
                   3: direct_distance (uses distDict), 4: similarity_distance (uses simDict)
    medoids_idx -- indices into data that act as medoids
    cacheOn     -- when true, pairwise costs are read from and stored in the
                   module-level distances_cache under the key (medoid, point)
    distDict, simDict, affinities -- passed through to the cost functions and
                   modularity(); this function itself never modifies them
    costType    -- "total", "average" or "modularity"
    namedPoints -- for "modularity": pass clusters keyed by data values
                   instead of by indices

    Returns (value, medoids) where medoids maps medoid index -> assigned point indices:
      "total"      -> sum of all cluster costs
      "average"    -> sum over medoids of cluster cost / cluster size
                      (empty clusters add nothing)
      "modularity" -> the negated modularity(...) value
    Returns 1 (after printing a message) for an unknown costType.

    Raises ValueError when a cost has to be computed and costF_idx is not 0..4.
    '''
    if costType not in ["total", "average", "modularity"]:
        print("unknown target function - check the global variables in the code")
        return(1)

    # Init the cluster
    size = len(data)
    total_cost = {}
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
        total_cost[idx] = 0.0
    assignErrors = []

    # Compute the distance and do the clustering
    for i in range(size):
        choice = -1
        min_cost = float('inf')
        # medoids themselves are also included into resulting cluster lists
        for m in medoids:
            # None means "cost not known yet" (cache disabled or cache miss).
            tmp = distances_cache.get((m, i)) if cacheOn else None
            if tmp is None:
                if costF_idx == 0:
                    tmp = euclidean_distance(data[m], data[i])
                elif costF_idx == 1:
                    tmp = manhattan_distance(data[m], data[i])
                elif costF_idx == 2:
                    tmp = pearson_distance(data[m], data[i])
                elif costF_idx == 3:
                    tmp = direct_distance(data[m], data[i], distDict)
                elif costF_idx == 4:
                    tmp = similarity_distance(data[m], data[i], simDict)
                else:
                    # The old message had no conversion specifier, so formatting it
                    # raised TypeError; fail with a clear error instead.
                    raise ValueError('Error: unknown cost function idx: %s' % (costF_idx,))
                if cacheOn:
                    # Save the distance for acceleration
                    distances_cache[(m, i)] = tmp

            # Randomization for nodes/points isolated from all the medoids
            # in order to assign them to random clusters. Hope averaging will
            # be able to glean cases for which some medoids did appear in the
            # same connected component, and group those nodes together.
            if tmp == 0.0 and min_cost == 0.0:  # no connection to either medoid
                rv = bernoulli.rvs(1. / len(medoids_idx), size=1)
                if rv[0] == 1.:
                    choice = m
            elif tmp < min_cost:
                choice = m
                min_cost = tmp
        # Done the clustering
        if choice == -1:
            print("ERROR: the node cannot be assigned")
            assignErrors.append(i)
        else:
            medoids[choice].append(i)
            total_cost[choice] += min_cost

    # Compute the target function
    if costType == "total":
        return(sum(total_cost.values()), medoids)

    if costType == "average":
        avg_cost = 0.0
        for idx in medoids_idx:
            members = medoids[idx]
            if members:  # an empty cluster would otherwise divide by zero
                avg_cost += total_cost[idx] / len(members)
        return(avg_cost, medoids)

    # Only "modularity" is left: costType was validated at the top.
    if namedPoints:
        # If the points are named, hand the names over instead of the indices
        clusters = {}
        for medID in medoids_idx:
            clusters[data[medID]] = []
            for pointID in medoids[medID]:
                clusters[data[medID]].append(data[pointID])
    else:
        clusters = medoids
    # "-" because we maximize modularity
    mod = -modularity(data, COST=costF_idx, distDict=distDict, edgeDict=affinities, medoids=clusters)
    print("modularity computed")

    if len(assignErrors) > 0:
        print("unassigned nodes:  %s" % (assignErrors,))
    else:
        print("no unassigned nodes, all right")

    return(mod, medoids)
Пример #29
0
def cutNoise(matrix):
    '''
    Separate meaningful samples from noise with a density-clustering-like heuristic.

    Two kinds of samples are filtered out:
      * meaningless samples, i.e. vectors with too few non-zero features
        (fewer than 3 when the vectors have more than 100 columns, fewer
        than 1 otherwise); these are dropped and not reported;
      * noise samples, i.e. vectors whose nearest remaining neighbour is far
        away (they belong to no cluster, or form a cluster of their own).

    :param matrix: sequence of feature vectors (rows); may be empty
    :return: a 6-tuple ``(real_matrix, noise_matrix, tmp, angles, real, noise)``
        where ``real_matrix`` / ``noise_matrix`` are the kept / noisy vectors,
        ``real`` / ``noise`` are their (sorted) row indices in ``matrix``, and
        ``tmp`` / ``angles`` are passed through from ``min_Angle_part`` (both
        are empty lists when at most one vector survives the feature filter,
        in which case every surviving vector is reported as noise).
    '''
    print("total: %d" % len(matrix))

    # Guard against an empty matrix before looking at the first row's width.
    wide = len(matrix) > 0 and len(matrix[0]) > 100
    low = 3 if wide else 1      # minimum number of non-zero features required

    # `discard` is not returned: it can be derived from `noise` and `real`.
    discard = []
    real = []                   # original row indices of the surviving vectors
    rest = []                   # the surviving vectors themselves
    for i, vector in enumerate(matrix):
        nonzero = sum(1 for value in vector if 0 != value)
        if nonzero < low:
            discard.append(i)
        else:
            real.append(i)
            rest.append(vector)

    print("discard: %d" % len(discard))

    m = 1                       # rank of the neighbour whose distance is used
    n_rest = len(rest)
    if n_rest <= m:
        # Too few vectors to compare with each other: all of them are noise.
        return [], rest, [], [], [], list(real)

    # Symmetric pairwise distance matrix; the diagonal is infinite so that a
    # sample is never considered its own neighbour.
    vectors = [np.array(vector) for vector in rest]
    distList = np.zeros((n_rest, n_rest), float)
    for i in range(n_rest):
        distList[i][i] = float('inf')
        for j in range(i + 1, n_rest):
            dist = euclidean_distance(vectors[i], vectors[j])
            distList[i][j] = dist
            distList[j][i] = dist

    # Distance from each sample to its m-th most similar sample.
    mostSimList = [sorted(row.tolist())[m - 1] for row in distList]

    # (position in `rest`, distance) pairs, most isolated samples first.
    ADist = sorted(zip(range(n_rest), mostSimList),
                   key=lambda pair: pair[1], reverse=True)

    # min_Angle_part expects the distances in ascending order.
    Dlist = [dist for _, dist in reversed(ADist)]
    tmp, angles, part = min_Angle_part(Dlist)
    print(part)

    # The (1 + part) tenths of the samples with the largest neighbour distance
    # are treated as noise; floor division keeps the slice bound an integer.
    cut = (1 + part) * n_rest // 10
    noise = sorted(real[pos] for pos, _ in ADist[0:cut])
    noise_set = set(noise)

    real_matrix = []
    noise_matrix = []
    kept = []
    # real[k] is the original row index of rest[k].
    for index, vector in zip(real, rest):
        if index in noise_set:
            noise_matrix.append(vector)
        else:
            kept.append(index)
            real_matrix.append(vector)

    return real_matrix, noise_matrix, tmp, angles, kept, noise