Example No. 1
import numpy as np
from itertools import permutations

# GMM, generate_cluster_data, and adjusted_mutual_info are assumed to be
# provided by the project under test; import them from wherever they live
# in your codebase.


def _test_gmm_parameters(covariance_type):
    n_samples = [1000]
    n_centers = [2]
    stds = [.1, .5]
    n_features = [2, 4]

    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(n_samples=n,
                                                              n_features=f,
                                                              n_centers=c,
                                                              cluster_stds=s)
                    # make model and fit
                    model = GMM(c, covariance_type=covariance_type)
                    model.fit(features)
                    covariances = model.covariances
                    # each fitted covariance should recover the generating
                    # cluster std (covariance ~ std**2, hence the sqrt)
                    for cov in covariances:
                        assert (np.abs(np.sqrt(cov) - s).mean() < 1e-1)

                    # cluster labels are arbitrary, so compare every ordering
                    # of the fitted means against the empirical cluster means
                    means = model.means
                    orderings = permutations(means)
                    distance_to_true_means = []

                    actual_means = np.array([
                        features[targets == i, :].mean(axis=0)
                        for i in range(targets.max() + 1)
                    ])

                    for ordering in orderings:
                        _means = np.array(list(ordering))

                        distance_to_true_means.append(
                            np.abs(_means - actual_means).sum())
                    assert (min(distance_to_true_means) < 1e-1)

                    # mixing weights should match the empirical cluster
                    # proportions, again up to a permutation of the labels
                    mixing_weights = model.mixing_weights
                    orderings = permutations(mixing_weights)
                    distance_to_true_mixing_weights = []

                    actual_mixing_weights = np.array([
                        features[targets == i, :].shape[0]
                        for i in range(targets.max() + 1)
                    ])
                    actual_mixing_weights = (actual_mixing_weights /
                                             actual_mixing_weights.sum())

                    for ordering in orderings:
                        _mixing_weights = np.array(list(ordering))

                        distance_to_true_mixing_weights.append(
                            np.abs(_mixing_weights -
                                   actual_mixing_weights).sum())
                    assert (min(distance_to_true_mixing_weights) < 1e-1)

                    # predict and calculate adjusted mutual info
                    labels = model.predict(features)
                    acc = adjusted_mutual_info(targets, labels)
                    assert (acc >= .9)
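
For reference, generate_cluster_data is not shown in these examples; below is a minimal sketch of what the tests assume it does, written here on top of sklearn.datasets.make_blobs. The wrapper name and keyword arguments mirror the calls above, but the real project implementation may differ:

from sklearn.datasets import make_blobs


def generate_cluster_data(n_samples=1000, n_features=2, n_centers=2,
                          cluster_stds=0.1):
    # Draw isotropic Gaussian blobs; targets holds the true cluster index of
    # each sample, which the tests compare the fitted model against.
    features, targets = make_blobs(n_samples=n_samples,
                                   n_features=n_features,
                                   centers=n_centers,
                                   cluster_std=cluster_stds)
    return features, targets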
Example No. 2
import numpy as np
from itertools import permutations

# KMeans, generate_cluster_data, and adjusted_mutual_info are assumed to be
# provided by the project under test; import them from wherever they live
# in your codebase.


def test_kmeans_on_generated():
    n_samples = [1000, 10000]
    n_centers = [2]
    stds = [.1]
    n_features = [1, 2, 4]

    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(n_samples=n,
                                                              n_features=f,
                                                              n_centers=c,
                                                              cluster_stds=s)
                    # make model and fit
                    model = KMeans(c)

                    # Depending on how many random() calls the student code
                    # makes, it can mess with the random state used to generate
                    # data for subsequent tests and lead to an "impossible"
                    # input distribution that can't achieve the desired
                    # performance.  To avoid this, we save and restore the
                    # random state so the student code can't interfere with it.
                    rng_state = np.random.get_state()
                    model.fit(features)
                    np.random.set_state(rng_state)

                    # cluster labels are arbitrary, so compare every ordering
                    # of the fitted means against the empirical cluster means
                    means = model.means
                    orderings = permutations(means)
                    distance_to_true_means = []

                    actual_means = np.array([
                        features[targets == i, :].mean(axis=0)
                        for i in range(targets.max() + 1)
                    ])

                    for ordering in orderings:
                        _means = np.array(list(ordering))

                        distance_to_true_means.append(
                            np.abs(_means - actual_means).sum())

                    assert (min(distance_to_true_means) < 1e-1)

                    # predict and calculate adjusted mutual info
                    labels = model.predict(features)
                    acc = adjusted_mutual_info(targets, labels)
                    assert (acc >= .9)
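
The save/restore trick described in the comment above can also be packaged as a small context manager. The name preserve_np_random_state below is not part of the tests; it is just an illustration of the same np.random.get_state/np.random.set_state pattern:

import contextlib

import numpy as np


@contextlib.contextmanager
def preserve_np_random_state():
    # Snapshot the global NumPy RNG state, run the wrapped block, then restore
    # the state so the block cannot perturb later random draws.
    state = np.random.get_state()
    try:
        yield
    finally:
        np.random.set_state(state)


# Usage:
# with preserve_np_random_state():
#     model.fit(features)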
Example No. 3
import numpy as np
from itertools import permutations

# KMeans, generate_cluster_data, and adjusted_mutual_info are assumed to be
# provided by the project under test; import them from wherever they live
# in your codebase.


def test_kmeans_on_generated():
    n_samples = [1000, 10000]
    n_centers = [2]
    stds = [.1]
    n_features = [1, 2, 4]

    for n in n_samples:
        for f in n_features:
            for c in n_centers:
                for s in stds:
                    features, targets = generate_cluster_data(n_samples=n,
                                                              n_features=f,
                                                              n_centers=c,
                                                              cluster_stds=s)
                    # make model and fit
                    model = KMeans(c)
                    model.fit(features)

                    # cluster labels are arbitrary, so compare every ordering
                    # of the fitted means against the empirical cluster means
                    means = model.means
                    orderings = permutations(means)
                    distance_to_true_means = []

                    actual_means = np.array([
                        features[targets == i, :].mean(axis=0)
                        for i in range(targets.max() + 1)
                    ])

                    for ordering in orderings:
                        _means = np.array(list(ordering))

                        distance_to_true_means.append(
                            np.abs(_means - actual_means).sum())

                    assert (min(distance_to_true_means) < 1e-1)

                    # predict and calculate adjusted mutual info
                    labels = model.predict(features)
                    acc = adjusted_mutual_info(targets, labels)
                    assert (acc >= .9)