示例#1
0
def dataset_values(coord_array):
    """Collect distance statistics describing a set of coordinates.

    Two groups of figures are produced: statistics over the distances of
    every unordered pair of points, and statistics relative to the centroid
    reported by ``statistic.stdev_avg_array``.

    Args:
        coord_array: Re-iterable collection of coordinates accepted by
            ``geom.distance`` and ``statistic.stdev_avg_array``.

    Returns:
        dict mapping a description string to the computed value.
    """
    # Distance of every unordered pair of points.
    pair_distances = [
        geom.distance(first, second)
        for first, second in itertools.combinations(coord_array, 2)
    ]
    # Index 0 is used as the stdev and index 1 as the average.
    pair_stats = statistic.stdev_avg(pair_distances)
    summary = {
        "internal ditances avg": pair_stats[1],
        "internal ditances stdev": pair_stats[0],
        "internal distances max": max(pair_distances),
        "internal distances min": min(pair_distances),
    }

    # Same (stdev, avg) layout, this time for the coordinates themselves.
    centroid_stats = statistic.stdev_avg_array(coord_array)
    center = centroid_stats[1]
    summary["centroid"] = center
    summary["stdev of coordinates against centroid"] = centroid_stats[0]

    center_distances = [geom.distance(point, center) for point in coord_array]
    center_stats = statistic.stdev_avg(center_distances)
    summary["avg distance from centroid"] = center_stats[1]
    summary["stdev distance from centroid"] = center_stats[0]
    summary["min distance from centroid"] = min(center_distances)
    summary["max distance from centroid"] = max(center_distances)
    return summary
示例#2
0
def general_evaluate_clustered_object(data_obj):
    """Summarise distance and size statistics of a clustered data object.

    Every collected list of values is condensed with
    ``statistic.dict_evaluation`` before being stored in the result.

    Args:
        data_obj: Object providing ``clusters_into_lists_dict()`` (mapping a
            cluster key to the list of its points) and ``labels``.

    Returns:
        dict with the keys ``"internal_distances"``,
        ``"internal_distances_per_cluster"``, ``"cluster_sizes"`` and
        ``"centroid_and_their_points_distances"``. When ``data_obj.labels``
        holds more than one distinct label, ``"external_distances"`` and
        ``"centroid_distances"`` are added as well.
    """
    out = {}
    cluster_dict = data_obj.clusters_into_lists_dict()

    # Pairwise distances inside each cluster, pooled and averaged per cluster.
    internal_distances = []
    avg_internal_distances_per_cluster = []
    for points in cluster_dict.values():
        current_internal_distances = [
            geom.distance(a, b) for a, b in itertools.combinations(points, 2)
        ]
        internal_distances.extend(current_internal_distances)
        avg_internal_distances_per_cluster.append(
            statistic.avg(current_internal_distances))
    out["internal_distances"] = statistic.dict_evaluation(internal_distances)

    out["internal_distances_per_cluster"] = statistic.dict_evaluation(
        avg_internal_distances_per_cluster)

    # Distances between points of different clusters; only reported (and
    # therefore only computed) when more than one label is present.
    if len(set(data_obj.labels)) > 1:
        external_distances = []
        for first_key, second_key in itertools.combinations(cluster_dict, 2):
            cross_pairs = itertools.product(cluster_dict[first_key],
                                            cluster_dict[second_key])
            external_distances.extend(
                geom.distance(a, b) for a, b in cross_pairs)
        out["external_distances"] = statistic.dict_evaluation(
            external_distances)

    # Size of each cluster. The previous code indexed with a stale variable
    # left over from an earlier loop, so every entry was the size of the
    # last cluster.
    cluster_sizes = [len(points) for points in cluster_dict.values()]
    out["cluster_sizes"] = statistic.dict_evaluation(cluster_sizes)

    centroids = []
    distances_between_centroids_and_their_points = []
    for points in cluster_dict.values():
        centroid = statistic.avg_array(points)
        centroids.append(centroid)
        distances_between_centroids_and_their_points.extend(
            geom.distance(centroid, point) for point in points)

    out["centroid_and_their_points_distances"] = statistic.dict_evaluation(
        distances_between_centroids_and_their_points)

    if len(set(data_obj.labels)) > 1:
        centroid_distances = [
            geom.distance(a, b)
            for a, b in itertools.combinations(centroids, 2)
        ]
        out["centroid_distances"] = statistic.dict_evaluation(
            centroid_distances)

    return out
示例#3
0
def dunn_index(data_obj):
    """Compute a Dunn-style separation/compactness ratio for a clustering.

    The size of a cluster is the mean distance of its points to its
    centroid. The distance between two clusters is the mean of their
    centroid distance and the distance of their closest pair of points.
    The result is the smallest cluster-to-cluster distance divided by the
    largest cluster size.

    Args:
        data_obj: Object with ``coords`` and a parallel ``labels`` sequence.

    Returns:
        The ratio described above.
    """
    # Group the points by label, keeping first-seen label order.
    members_by_label = {}
    for position, point in enumerate(data_obj.coords):
        members_by_label.setdefault(data_obj.labels[position],
                                    []).append(point)

    centers = {}
    mean_radius = {}
    for label, members in members_by_label.items():
        center = statistic.avg_coords(members)
        centers[label] = center
        radii = [geom.distance(center, point) for point in members]
        mean_radius[label] = sum(radii) / len(radii)

    widest_cluster = max(mean_radius.values())

    separations = []
    for first, second in itertools.combinations(members_by_label, 2):
        centroid_gap = geom.distance(centers[first], centers[second])
        closest_gap = min(
            geom.distance(p, q) for p, q in itertools.product(
                members_by_label[first], members_by_label[second]))
        separations.append((centroid_gap + closest_gap) / 2.0)

    return min(separations) / widest_cluster
def _init_plus_plus(data_obj):
    """Seed ``data_obj.c_positions`` with k-means++ style initial centers.

    The first center is drawn uniformly from ``data_obj.coords``. Every
    further center is drawn with probability proportional to the squared
    distance between a point and its nearest already chosen center.

    Args:
        data_obj: Object with ``coords`` and ``c_number`` (the number of
            centers wanted). ``c_positions`` is overwritten in place.
    """
    data_obj.c_positions = []
    remaining = data_obj.c_number
    if remaining > 0:
        remaining -= 1
        data_obj.c_positions.append(random.choice(data_obj.coords))

    point_count = len(data_obj.coords)
    # Squared distance of each point to its nearest chosen center so far.
    distances = [float("inf")] * point_count
    indexes = list(range(point_count))
    while remaining > 0:
        remaining -= 1
        last_center = data_obj.c_positions[-1]
        # Only the newest center can shrink a point's nearest distance.
        for num, current_coord in enumerate(data_obj.coords):
            newdistance = math.pow(
                geom.distance(last_center, current_coord), 2.0)
            distances[num] = min(distances[num], newdistance)

        total = sum(distances)
        if total > 0:
            weights = [d / total for d in distances]
        else:
            # Every point coincides with a chosen center, so the weights
            # cannot be normalised; fall back to a uniform draw instead of
            # dividing by zero.
            weights = [1.0 / point_count] * point_count

        # NOTE(review): `choice` is called with the numpy.random.choice
        # signature (size=1, p=...), which returns a length-1 array; the
        # single drawn index is unwrapped before indexing coords. Confirm
        # against the file's imports.
        picked = choice(indexes, 1, p=weights)
        data_obj.c_positions.append(data_obj.coords[picked[0]])
def linear_circle_point(coords, radius):
    """Draw a random point inside a circle by rejection sampling.

    Candidates are drawn uniformly from the square ``[-radius, radius]`` on
    both axes and rejected until ``geom.distance`` from the origin is
    strictly below ``radius``.

    Args:
        coords: Center of the circle; indexes 0 and 1 are used as x and y.
        radius: Radius of the circle.

    Returns:
        Tuple ``(x, y)`` of the accepted candidate shifted by ``coords``.
    """
    inside = False
    while not inside:
        x = random.uniform(-radius, radius)
        y = random.uniform(-radius, radius)
        inside = geom.distance((0.0, 0.0), (x, y)) < radius
    return (x + coords[0], y + coords[1])
示例#6
0
def triangulation_distance_within(coords):
    """Return mean and standard deviation of triangulation pair distances.

    Args:
        coords: Coordinates handed to ``geom.triangulate_set``.

    Returns:
        Tuple ``(avg, stdev)`` over the distances between the two members
        of every item returned by ``geom.triangulate_set``; the deviation
        divides by the number of distances (population form).
    """
    lengths = [
        geom.distance(pair[0], pair[1])
        for pair in geom.triangulate_set(coords)
    ]
    count = len(lengths)
    mean = sum(lengths) / count
    squared_error = sum(math.pow(mean - length, 2) for length in lengths)
    return (mean, math.sqrt(squared_error / count))
def agglomerative_single_link(data_obj, **kwargs):
    """Cluster ``data_obj.coords`` by single-link agglomeration.

    Every point starts as its own cluster. The two closest clusters are
    merged repeatedly until only ``data_obj.c_number`` clusters remain. The
    distance between two clusters is the smallest distance between any of
    their points.

    Args:
        data_obj: Object with ``coords`` and ``c_number``; ``labels`` is
            overwritten.
        **kwargs: Optional ``anim_obj``; when given, its
            ``add_step(data_obj)`` is called after every merge with the
            labels brought up to date.

    Returns:
        ``data_obj``, with ``labels[i]`` set to the key of the cluster that
        contains point ``i`` (the index of one of that cluster's points).
    """
    anim_obj = kwargs.get("anim_obj")
    animated = anim_obj is not None

    # Cluster key -> indexes of the member points; a cluster is initially
    # keyed by the index of its only point.
    clusters = {index: [index] for index in range(len(data_obj.coords))}
    data_obj.labels = [0 for _ in data_obj.coords]

    def write_labels():
        # Copy the current cluster membership into data_obj.labels.
        for cluster_key, members in clusters.items():
            for point_index in members:
                data_obj.labels[point_index] = cluster_key

    # frozenset({key_a, key_b}) -> distance between the two clusters.
    distances = {}
    for first, second in itertools.combinations(clusters.keys(), 2):
        distances[frozenset((first, second))] = geom.distance(
            data_obj.coords[first], data_obj.coords[second])

    while len(clusters) > data_obj.c_number:
        # The closest pair of clusters: `keep` absorbs `drop`.
        keep, drop = min(distances, key=distances.get)
        clusters[keep].extend(clusters.pop(drop))

        # Fold the distances of the dropped cluster into the kept one.
        # Single link keeps the SMALLER of the two distances.
        for other in clusters:
            if other == keep:
                # The merged pair itself no longer has a distance.
                distances.pop(frozenset((keep, drop)), None)
                continue
            kept_pair = frozenset((other, keep))
            dropped_pair = frozenset((other, drop))
            distances[kept_pair] = min(distances[kept_pair],
                                       distances[dropped_pair])
            distances.pop(dropped_pair, None)

        if animated:
            write_labels()
            anim_obj.add_step(data_obj)

    write_labels()
    return data_obj
def simplified_k_means(data_obj, indexes, iterations, cluster_num):
    """Run a fixed number of k-means iterations on a subset of the points.

    Args:
        data_obj: Object whose ``coords`` holds all points.
        indexes: Sequence of indexes into ``data_obj.coords`` selecting the
            points to cluster.
        iterations: Number of assign/update rounds to run.
        cluster_num: Number of sub-clusters to form.

    Returns:
        List of ``cluster_num`` lists. Each entry is a POSITION within
        ``indexes`` (not an index into ``data_obj.coords``), as assigned in
        the last round. All lists are empty when ``iterations`` is 0.
    """
    centers = [data_obj.coords[i] for i in random.sample(indexes, cluster_num)]
    # The selected points, addressed by their position within `indexes`.
    points = [data_obj.coords[i] for i in indexes]

    # Defined up front so that zero iterations still returns a value.
    subclusters = [[] for _ in range(cluster_num)]

    for _ in range(iterations):
        subclusters = [[] for _ in range(cluster_num)]
        # Assignment step: each point joins its nearest center.
        for position, point in enumerate(points):
            to_centers = [geom.distance(point, center) for center in centers]
            subclusters[to_centers.index(min(to_centers))].append(position)

        # Update step. Members are positions within `indexes`, so they are
        # resolved through `points`; indexing data_obj.coords with them
        # directly averaged the wrong points for any real subset.
        for num, members in enumerate(subclusters):
            if members:
                centers[num] = statistic.avg_coords(
                    [points[position] for position in members])
            else:
                # An empty cluster is re-seeded with a random point.
                centers[num] = random.choice(data_obj.coords)
    return subclusters
def _balanced_centers(field_size, dist_function, dist_function_params,
                      num_of_clusters):
    """Place cluster centers at random while respecting a spacing rule.

    Each attempt starts from one random center and keeps proposing further
    ones, accepting a proposal only if the spacing rule holds against every
    center accepted so far. An attempt is abandoned after
    ``100 * num_of_clusters`` proposals and a fresh one is started; this
    repeats until an attempt yields exactly ``num_of_clusters`` centers.

    Args:
        field_size: Passed to ``linear_square_point`` as the size, and
            halved to form the point it is centered on.
        dist_function: Callable ``(a, b, dist_function_params)`` returning
            a truthy value when ``a`` and ``b`` are acceptably spaced.
        dist_function_params: Forwarded unchanged to ``dist_function``.
        num_of_clusters: Number of centers to produce.

    Returns:
        A new ``C`` instance with ``c_number`` and ``c_positions`` set.
    """
    data_obj = C()
    data_obj.c_number = num_of_clusters
    half = field_size / 2.0
    passes = False
    while not passes:
        print("trying")
        data_obj.c_positions = [linear_square_point([half, half], field_size)]
        # A single requested center is already satisfied by the seed point.
        passes = len(data_obj.c_positions) == data_obj.c_number
        # The proposal budget is per attempt. It used to be shared across
        # attempts, so once it ran out no proposal was ever made again and
        # the outer loop spun forever.
        counter = 0
        while not passes and counter < 100 * data_obj.c_number:
            newobj = linear_square_point([half, half], field_size)
            # NOTE(review): a proposal at distance exactly 0 from an
            # existing center is never rejected here; confirm intended.
            rejected = any(
                not dist_function(existing, newobj, dist_function_params)
                and geom.distance(existing, newobj) > 0
                for existing in data_obj.c_positions)
            if not rejected:
                data_obj.c_positions.append(newobj)

            counter += 1
            passes = len(data_obj.c_positions) == data_obj.c_number
    return data_obj
def _reassign_to_cluster_centers(data_obj):
    """Label every point with the position of its nearest cluster center.

    ``data_obj.labels`` is updated in place; each label is an index into
    ``data_obj.c_positions``. Ties go to the earliest center.
    """
    for position, point in enumerate(data_obj.coords):
        gaps = [
            geom.distance(point, center) for center in data_obj.c_positions
        ]
        data_obj.labels[position] = gaps.index(min(gaps))
def _euclidean_balanced(a, b, min_length):
    """Tell whether ``a`` and ``b`` are strictly more than ``min_length`` apart."""
    separation = geom.distance(a, b)
    return separation > min_length