def test_matching_dissim(self):
    """Check ``matching_dissim`` on integer, NaN-containing and string input."""
    # Integer categories: the rows differ only in the last attribute.
    a = np.array([[0, 1, 2, 0, 1, 2]])
    b = np.array([[0, 1, 2, 0, 1, 0]])
    assert_equal(1, matching_dissim(a, b))

    # A NaN entry is expected to count as one additional mismatch.
    # ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
    a = np.array([[np.nan, 1, 2, 0, 1, 2]])
    b = np.array([[0, 1, 2, 0, 1, 0]])
    assert_equal(2, matching_dissim(a, b))

    # String categories, one row compared against two rows: the expected
    # result holds one dissimilarity per row of ``b``.
    a = np.array([['a', 'b', 'c', 'd']])
    b = np.array([['a', 'b', 'c', 'd'],
                  ['d', 'c', 'b', 'a']])
    assert_array_equal(np.array([0, 4]), matching_dissim(a, b))
def _get_initial_centers(self, dataset, categorical_indices):
    """Pick initial centroids by distance-weighted random sampling.

    The first centroid is a randomly selected row of ``dataset``. Every
    further centroid is drawn with probability proportional to the row's
    combined distance to its closest already-chosen centroid, where the
    combined distance is ``dissimilarity_python.euclidean`` on the numeric
    columns plus ``categorical_weight`` times ``matching_dissim`` on the
    categorical columns.

    Args:
        dataset: Table-like object exposing ``take``, ``drop``, ``columns``
            and ``shape`` (presumably a pandas DataFrame -- confirm against
            the caller).
        categorical_indices: Positional indices of the categorical columns.

    Returns:
        A two-element list ``[numeric_centroids, categorical_centroids]``.
        Both are float arrays with ``self.cluster_number`` rows and one
        column per numeric / categorical column respectively.
    """
    dataset_cat = dataset.take(categorical_indices, axis=1).values

    # Set membership keeps the column scan linear in the number of columns.
    categorical_positions = set(categorical_indices)
    categorical_labels = [
        column
        for index, column in enumerate(dataset.columns)
        if index in categorical_positions
    ]
    dataset_num = dataset.drop(categorical_labels, axis=1).values
    n_samples = dataset.shape[0]

    # A missing or negative weight falls back to half the overall standard
    # deviation of the numeric values.
    categorical_weight = self.categorical_weight
    if categorical_weight is None or categorical_weight < 0:
        categorical_weight = 0.5 * dataset_num.std()

    # NOTE(review): both buffers are float arrays, so categorical values
    # that are not numerically encoded cannot be stored here -- confirm the
    # encoding with the caller.
    initial_centroids_num = np.zeros(
        (self.cluster_number, dataset_num.shape[1]))
    initial_centroids_cat = np.zeros(
        (self.cluster_number, dataset_cat.shape[1]))

    # NOTE(review): the upper bound assumes ``randint`` includes both ends
    # (as ``random.randint`` does); with an exclusive upper bound the last
    # row could never be chosen -- confirm which ``randint`` is imported.
    rand_index = randint(0, n_samples - 1)
    initial_centroids_num[0] = dataset_num[rand_index]
    initial_centroids_cat[0] = dataset_cat[rand_index]

    # Running minimum of the combined distance from every row to the
    # centroids chosen so far. Updating it with only the newest centroid
    # gives the same values as recomputing all distances on each pass.
    closest = np.full(n_samples, np.inf)
    for i in range(1, self.cluster_number):
        newest = i - 1
        numeric_dist = np.asarray(
            dissimilarity_python.euclidean(
                dataset_num, initial_centroids_num[newest]),
            dtype=np.float64)
        categorical_dist = np.asarray(
            matching_dissim(dataset_cat, initial_centroids_cat[newest]),
            dtype=np.float64)
        closest = np.minimum(
            closest, numeric_dist + categorical_weight * categorical_dist)

        total = np.sum(closest)
        if total == 0:
            # Every row coincides with a chosen centroid. Dividing by zero
            # would hand NaN probabilities to ``np.random.choice`` (which
            # raises), so draw uniformly instead.
            chosen_point = np.random.choice(n_samples)
        else:
            chosen_point = np.random.choice(n_samples, p=closest / total)

        initial_centroids_num[i] = dataset_num[chosen_point]
        initial_centroids_cat[i] = dataset_cat[chosen_point]

    return [initial_centroids_num, initial_centroids_cat]
def test_ng_dissim(self):
    """Check ``ng_dissim`` against fixed values and against ``matching_dissim``.

    Each scenario is run twice: once with a populated membership matrix
    (fixed expected values) and once with an all-zero membership matrix,
    where the result is expected to equal ``matching_dissim``.
    """
    X = np.array([[0, 1, 2, 0, 1, 2],
                  [0, 1, 2, 0, 1, 1]])
    centroids = X
    membship = np.array([[1, 0],
                         [0, 1]])
    assert_array_equal(np.array([0., 1.]),
                       ng_dissim(centroids, X[0], X=X, membship=membship))
    assert_array_equal(np.array([1., 0.]),
                       ng_dissim(centroids, X[1], X=X, membship=membship))

    # Unit test for initialization (i.e., same as matching_dissim)
    membship = np.array([[0, 0],
                         [0, 0]])
    mdiss_00 = matching_dissim(np.array([X[0]]), np.array([X[0]]))[0]
    mdiss_01 = matching_dissim(np.array([X[0]]), np.array([X[1]]))[0]
    mdiss_11 = matching_dissim(np.array([X[1]]), np.array([X[1]]))[0]
    assert_array_equal(np.array([mdiss_00, mdiss_01]),
                       ng_dissim(centroids, X[0], X=X, membship=membship))
    assert_array_equal(np.array([mdiss_01, mdiss_11]),
                       ng_dissim(centroids, X[1], X=X, membship=membship))

    # Unit test for NaN
    # ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
    X = np.array([[np.nan, 1, 2, 0, 1, 2],
                  [0, 1, 2, 0, 1, 1]])
    centroids = X
    membship = np.array([[1, 0],
                         [0, 1]])
    assert_array_equal(np.array([1., 2.]),
                       ng_dissim(centroids, X[0], X=X, membship=membship))
    assert_array_equal(np.array([2., 0.]),
                       ng_dissim(centroids, X[1], X=X, membship=membship))

    # Unit test for initialization with NaN (i.e., same as matching_dissim)
    membship = np.array([[0, 0],
                         [0, 0]])
    mdiss_00 = matching_dissim(np.array([X[0]]), np.array([X[0]]))[0]
    mdiss_01 = matching_dissim(np.array([X[0]]), np.array([X[1]]))[0]
    mdiss_11 = matching_dissim(np.array([X[1]]), np.array([X[1]]))[0]
    assert_array_equal(np.array([mdiss_00, mdiss_01]),
                       ng_dissim(centroids, X[0], X=X, membship=membship))
    assert_array_equal(np.array([mdiss_01, mdiss_11]),
                       ng_dissim(centroids, X[1], X=X, membship=membship))

    # String categories: three points, two centroids.
    X = np.array([['a', 'b', 'c', 'd'],
                  ['a', 'b', 'e', 'd'],
                  ['d', 'c', 'b', 'a']])
    centroids = np.array([['a', 'b', 'c', 'd'],
                          ['d', 'c', 'b', 'a']])
    membship = np.array([[1, 1, 0],
                         [0, 0, 1]])
    assert_array_equal(np.array([0.5, 4.]),
                       ng_dissim(centroids, X[0], X=X, membship=membship))
    assert_array_equal(np.array([1., 4.]),
                       ng_dissim(centroids, X[1], X=X, membship=membship))
    assert_array_equal(np.array([4., 0.]),
                       ng_dissim(centroids, X[2], X=X, membship=membship))

    # Unit test for initialization (i.e., same as matching_dissim)
    # The centroids equal X[0] and X[2], so the expected pair for point
    # X[k] is (mdiss_0k, mdiss_k2).
    membship = np.array([[0, 0, 0],
                         [0, 0, 0]])
    mdiss_00 = matching_dissim(np.array([X[0]]), np.array([X[0]]))[0]
    mdiss_01 = matching_dissim(np.array([X[0]]), np.array([X[1]]))[0]
    mdiss_02 = matching_dissim(np.array([X[0]]), np.array([X[2]]))[0]
    mdiss_12 = matching_dissim(np.array([X[1]]), np.array([X[2]]))[0]
    mdiss_22 = matching_dissim(np.array([X[2]]), np.array([X[2]]))[0]
    assert_array_equal(np.array([mdiss_00, mdiss_02]),
                       ng_dissim(centroids, X[0], X=X, membship=membship))
    assert_array_equal(np.array([mdiss_01, mdiss_12]),
                       ng_dissim(centroids, X[1], X=X, membship=membship))
    assert_array_equal(np.array([mdiss_02, mdiss_22]),
                       ng_dissim(centroids, X[2], X=X, membship=membship))