def chi_square_distance(data, data_instance1, data_instance2):
    """Return the chi-square distance between two data instances.

    This distance metric is used only for categorical data.

    Each feature vector is turned into a profile (every feature divided by
    the vector's total). The squared differences between the two profiles
    are weighted by the inverse of the average profile of the matching
    column of ``data`` and summed; the square root of that sum is returned.

    Args:
        data: The data set handed to ``pick_up_all_data_columns``. The first
            column it yields is skipped as the label column.
        data_instance1: Object exposing a numeric ``feature_vector``.
        data_instance2: Object exposing a numeric ``feature_vector`` of the
            same length as the first one.

    Returns:
        The chi-square distance as a float, or ``None`` (after printing a
        message) when the two feature vectors differ in length.

    Raises:
        ZeroDivisionError: If a feature vector, the whole data set, or a
            single data column sums to zero.
    """
    vector1 = data_instance1.feature_vector
    vector2 = data_instance2.feature_vector
    if len(vector1) != len(vector2):
        print("The two points must be defined in the same dimensional space.")
        return

    # Total mass of every attribute column; column 0 holds the labels.
    column_totals = [
        sum(column)
        for i, column in enumerate(pick_up_all_data_columns(data))
        if i != 0
    ]
    grand_total = sum(column_totals)
    average_profiles = [total / grand_total for total in column_totals]

    # Each profile must be built from its own instance's features and total.
    # The previous code iterated over the first instance for both profiles.
    total1 = sum(vector1)
    total2 = sum(vector2)
    profile1 = [feature / total1 for feature in vector1]
    profile2 = [feature / total2 for feature in vector2]

    sum_of_squares = 0.0
    for i, (share1, share2) in enumerate(zip(profile1, profile2)):
        sum_of_squares += (1 / average_profiles[i]) * pow(share1 - share2, 2)
    return sqrt(sum_of_squares)
def standardised_euclidean_distance(data, data_instance1, data_instance2):
    """Compute the standardised euclidean distance between two instances.

    The squared difference of every pair of features is weighted by the
    inverse of the variance of the matching attribute column of ``data``;
    the square root of the weighted sum is returned.

    Args:
        data: The data set handed to ``pick_up_all_data_columns``. The first
            column it yields is skipped as the label column.
        data_instance1: Object exposing a numeric ``feature_vector``.
        data_instance2: Object exposing a numeric ``feature_vector`` of the
            same length as the first one.

    Returns:
        The distance as a float, or ``None`` (after printing a message) when
        the two feature vectors differ in length.
    """
    vector_a = data_instance1.feature_vector
    vector_b = data_instance2.feature_vector
    if len(vector_a) != len(vector_b):
        print("The two points must be defined in the same dimensional space.")
        return

    # Column 0 always carries the data labels, so it has no deviation.
    deviations = [
        standard_deviation(values)
        for position, values in enumerate(pick_up_all_data_columns(data))
        if position != 0
    ]

    weighted_total = 0.0
    for position, (left, right) in enumerate(zip(vector_a, vector_b)):
        # Squaring the standard deviation gives the variance used as weight.
        weighted_total += 1 / pow(deviations[position], 2) * pow(left - right, 2)
    return sqrt(weighted_total)