import numpy as np
from itertools import permutations

# NOTE: the import path below is an assumption; point it at wherever this
# repo defines KMeans, GMM, generate_cluster_data, and adjusted_mutual_info.
from code import KMeans, GMM, generate_cluster_data, adjusted_mutual_info


def _test_gmm_parameters(covariance_type):
    n_samples = [1000]
    n_centers = [2]
    stds = [.1, .5]
    n_features = [2, 4]
    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(
                        n_samples=n, n_features=f, n_centers=c, cluster_stds=s)

                    # make model and fit
                    model = GMM(c, covariance_type=covariance_type)
                    model.fit(features)

                    # each learned standard deviation (sqrt of the variance
                    # terms) should be close to the generating std
                    covariances = model.covariances
                    for cov in covariances:
                        assert np.abs(np.sqrt(cov) - s).mean() < 1e-1

                    # cluster labels are arbitrary, so compare learned means
                    # to the empirical cluster means under the best ordering
                    means = model.means
                    orderings = permutations(means)
                    distance_to_true_means = []
                    actual_means = np.array([
                        features[targets == i, :].mean(axis=0)
                        for i in range(targets.max() + 1)
                    ])
                    for ordering in orderings:
                        _means = np.array(list(ordering))
                        distance_to_true_means.append(
                            np.abs(_means - actual_means).sum())
                    assert min(distance_to_true_means) < 1e-1

                    # learned mixing weights should match the empirical
                    # cluster proportions, again up to ordering
                    mixing_weights = model.mixing_weights
                    orderings = permutations(mixing_weights)
                    distance_to_true_mixing_weights = []
                    actual_mixing_weights = np.array([
                        features[targets == i, :].shape[0]
                        for i in range(targets.max() + 1)
                    ])
                    actual_mixing_weights = (
                        actual_mixing_weights / actual_mixing_weights.sum())
                    for ordering in orderings:
                        _mixing_weights = np.array(list(ordering))
                        distance_to_true_mixing_weights.append(
                            np.abs(_mixing_weights -
                                   actual_mixing_weights).sum())
                    assert min(distance_to_true_mixing_weights) < 1e-1

                    # predict and calculate adjusted mutual info
                    labels = model.predict(features)
                    acc = adjusted_mutual_info(targets, labels)
                    assert acc >= .9
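

# _test_gmm_parameters is a helper (leading underscore), so pytest will not
# collect it on its own. A minimal sketch of the per-covariance-type entry
# points: 'spherical' is the value used in test_gmm_spec below, while
# 'diagonal' is an assumed second covariance type; rename it to match
# whatever the GMM implementation actually accepts.
def test_gmm_spherical():
    _test_gmm_parameters('spherical')


def test_gmm_diagonal():
    _test_gmm_parameters('diagonal')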


def test_kmeans_spec():
    features, targets = generate_cluster_data(
        n_samples=100, n_features=2, n_centers=2, cluster_stds=.1)

    model = KMeans(2)

    # save and restore the global random state so the student's fit() can't
    # perturb the data generated for later tests (see the longer comment in
    # test_kmeans_on_generated)
    rng_state = np.random.get_state()
    model.fit(features)
    np.random.set_state(rng_state)

    assert hasattr(model, 'means')


def test_gmm_spec():
    features, targets = generate_cluster_data(
        n_samples=100, n_features=2, n_centers=2, cluster_stds=.1)

    gmm = GMM(2, 'spherical')
    gmm.fit(features)

    assert hasattr(gmm, 'means')
    assert hasattr(gmm, 'covariances')
    assert hasattr(gmm, 'mixing_weights')


def test_kmeans_on_generated():
    n_samples = [1000, 10000]
    n_centers = [2]
    stds = [.1]
    n_features = [1, 2, 4]
    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(
                        n_samples=n, n_features=f, n_centers=c, cluster_stds=s)

                    # make model and fit
                    model = KMeans(c)

                    # Depending on how many random() calls the student code
                    # makes, it can mess with the random state used to generate
                    # data for subsequent tests and lead to an "impossible"
                    # input distribution that can't achieve the desired
                    # performance. To avoid this, we save and restore the
                    # random state so the student code can't interfere with it.
                    rng_state = np.random.get_state()
                    model.fit(features)
                    np.random.set_state(rng_state)

                    # cluster labels are arbitrary, so compare learned means
                    # to the empirical cluster means under the best ordering
                    means = model.means
                    orderings = permutations(means)
                    distance_to_true_means = []
                    actual_means = np.array([
                        features[targets == i, :].mean(axis=0)
                        for i in range(targets.max() + 1)
                    ])
                    for ordering in orderings:
                        _means = np.array(list(ordering))
                        distance_to_true_means.append(
                            np.abs(_means - actual_means).sum())
                    assert min(distance_to_true_means) < 1e-1

                    # predict and calculate adjusted mutual info
                    labels = model.predict(features)
                    acc = adjusted_mutual_info(targets, labels)
                    assert acc >= .9
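

# The save/restore pattern above leaks state if fit() raises before
# set_state() runs. A hypothetical context manager (not part of the original
# suite) that restores the state no matter how the block exits:
from contextlib import contextmanager


@contextmanager
def preserved_rng_state():
    """Snapshot numpy's global RNG state and restore it on exit."""
    state = np.random.get_state()
    try:
        yield
    finally:
        np.random.set_state(state)

# Usage:
#     with preserved_rng_state():
#         model.fit(features)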


def test_generate_cluster_data():
    n_samples = [2000, 20000]
    n_centers = [1, 2]
    stds = [.1, .5, 1.0, 2.0]
    n_features = [1, 2, 4]
    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    X, y = generate_cluster_data(
                        n_samples=n, n_features=f, n_centers=c, cluster_stds=s)

                    assert X.shape == (n, f)
                    assert y.max() == c - 1

                    # check every cluster; note range(y.max() + 1) rather than
                    # range(y.max()), which would skip the last cluster
                    for i in range(y.max() + 1):
                        subset = X[y == i]
                        assert np.abs(np.std(subset, axis=0) - s).mean() < s
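

# For reference, a minimal sketch of a generator satisfying the contract this
# test checks (isotropic Gaussian blobs with per-cluster std cluster_stds);
# the real generate_cluster_data lives elsewhere in the repo and may differ.
def _reference_cluster_data(n_samples, n_features, n_centers, cluster_stds):
    # spread the centers out so blobs are unlikely to overlap heavily
    centers = np.random.uniform(-10, 10, size=(n_centers, n_features))
    y = np.random.randint(n_centers, size=n_samples)
    X = centers[y] + np.random.randn(n_samples, n_features) * cluster_stds
    return X, y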