def standardised_euclidean_distance(data,data_instance1, data_instance2):
    """Return the standardised Euclidean distance between two instances.

    Each squared difference between matching entries of the two feature
    vectors is weighted by the inverse variance (``1 / std ** 2``) of the
    corresponding attribute. The standard deviations are computed from the
    columns that ``pick_up_all_data_columns(data)`` yields.

    Args:
        data: Data set handed to ``pick_up_all_data_columns``. Its first
            column is treated as the label column and is skipped.
        data_instance1: Object exposing a ``feature_vector`` sequence.
        data_instance2: Object exposing a ``feature_vector`` sequence.

    Returns:
        The square root of the weighted sum of squared differences, or
        ``None`` (after printing a message) when the two feature vectors
        have different lengths.
    """
    vector_a = data_instance1.feature_vector
    vector_b = data_instance2.feature_vector

    if len(vector_a) != len(vector_b):
        print("The two points must be defined in the same dimensional space.")
        return

    # Column 0 always carries the data labels, so only the remaining
    # columns contribute a standard deviation.
    deviations = [
        standard_deviation(column)
        for position, column in enumerate(pick_up_all_data_columns(data))
        if position != 0
    ]

    # Accumulate with an explicit running total so the floating-point
    # additions happen one attribute at a time, in attribute order.
    weighted_total = 0.0
    for index, (value_a, value_b) in enumerate(zip(vector_a, vector_b)):
        # The weight is the inverse of the variance, i.e. of the squared
        # standard deviation, not of the standard deviation itself.
        # NOTE(review): a standard deviation of zero makes this weight a
        # division by zero - confirm constant attributes cannot occur.
        inverse_variance = 1 / deviations[index] ** 2
        weighted_total += inverse_variance * (value_a - value_b) ** 2

    return sqrt(weighted_total)
def standardisation(data_list):
    """Standardise a collection of values into z-scores.

    Every value is shifted by the mean of ``data_list`` and divided by its
    standard deviation, as computed by ``mean`` and ``standard_deviation``.

    Args:
        data_list: Collection of numeric values. It is passed to ``mean``
            and ``standard_deviation`` and then iterated once more, so it
            must support repeated iteration.

    Returns:
        A new list holding ``(value - mean) / standard_deviation`` for each
        value, in the original order. When the standard deviation is zero,
        the list holds ``0.0`` for every value instead.
    """
    m = mean(data_list)
    sd = standard_deviation(data_list)

    if sd == 0:
        # A zero standard deviation means the values do not vary, so no
        # value deviates from the mean. Report a z-score of 0.0 for each
        # instead of dividing by zero.
        return [0.0 for _ in data_list]

    return [(value - m) / sd for value in data_list]