Example #1
def plot_km_em_clusters(x_train_scaled, x=0, y=1, z=0, dataset_name="", km_clusters=12, em_clusters=12):
    kmeans = KMeans(n_clusters=km_clusters, random_state=random_state)
    y_pred = kmeans.fit_predict(x_train_scaled)
    plot_clusters_3d(x_train_scaled, y_pred, x, y, z, dataset_name=dataset_name, classifier="K-means")
    gm = BayesianGaussianMixture(n_components=em_clusters, random_state=random_state, reg_covar=1e-01)
    y_pred = gm.fit_predict(x_train_scaled)
    plot_clusters_3d(x_train_scaled, y_pred, x, y, z, dataset_name=dataset_name, classifier="EM")
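The plot_clusters_3d helper is not shown in this example. A minimal sketch of what it presumably does (a 3D scatter of three chosen feature columns, colored by cluster label; the signature follows the calls above):

import matplotlib.pyplot as plt

def plot_clusters_3d(data, labels, x, y, z, dataset_name="", classifier=""):
    # hypothetical stand-in: scatter columns x/y/z of `data`, colored by label
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.scatter(data[:, x], data[:, y], data[:, z], c=labels, cmap='tab20', s=10)
    ax.set_title(f"{classifier} clusters ({dataset_name})")
    plt.show()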
Example #2
def view_gmm_graph(selected,  # scipy sparse matrix (densified via .toarray below)
                   validation_type: str = 'default',
                   df: pd.DataFrame = None,
                   n_init: int = 5,
                   max_k: int = 150) -> tuple:
    scores = []
    temp_df = df.copy() if df is not None else None
    temp_selected = selected.toarray()
    if validation_type == 'default':
        for i in range(10, max_k, 10):
            model = BayesianGaussianMixture(n_components=i,
                                            covariance_type='diag',
                                            n_init=n_init,
                                            init_params='kmeans',
                                            random_state=12)
            scores.append(model.fit(temp_selected).score(temp_selected))
            print(f'k={i}, score={scores[-1]}')
    elif validation_type == 'purity':
        for i in range(10, max_k, 10):
            model = BayesianGaussianMixture(n_components=i,
                                            covariance_type='diag',
                                            n_init=n_init,
                                            init_params='kmeans',
                                            random_state=12)
            temp_df['KMeans'] = model.fit_predict(temp_selected)  # column name kept for cluster_purity
            scores.append(cluster_purity(temp_df))
            print(f'k={i}, score={scores[-1]}')
    plt.plot(range(10, max_k, 10), scores)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score')
    plt.show()
    return list(range(10, max_k, 10)), scores
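cluster_purity is not defined in this snippet. A minimal sketch under the assumption that temp_df holds the predicted clusters in the 'KMeans' column and the ground-truth classes in a hypothetical 'label' column:

def cluster_purity(df, cluster_col='KMeans', label_col='label'):
    # purity = (sum over clusters of the dominant true-label count) / n_samples
    dominant = df.groupby(cluster_col)[label_col].agg(lambda s: s.value_counts().iloc[0])
    return dominant.sum() / len(df)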
Example #3
def test_bayesian_mixture_fit_predict_n_init():
    # Check that fit_predict is equivalent to fit.predict, when n_init > 1
    X = np.random.RandomState(0).randn(50, 5)
    gm = BayesianGaussianMixture(n_components=5, n_init=10, random_state=0)
    y_pred1 = gm.fit_predict(X)
    y_pred2 = gm.predict(X)
    assert_array_equal(y_pred1, y_pred2)
Example #4
def test_bayesian_mixture_fit_predict_n_init():
    # Check that fit_predict is equivalent to fit.predict, when n_init > 1
    X = np.random.RandomState(0).randn(1000, 5)
    gm = BayesianGaussianMixture(n_components=5, n_init=10, random_state=0)
    y_pred1 = gm.fit_predict(X)
    y_pred2 = gm.predict(X)
    assert_array_equal(y_pred1, y_pred2)
Example #5
def BayesianGaussianMixture(V, **kwargs):
    """Performs clustering on *V* by using Gaussian mixture models with variational
    inference. The function uses :class:`sklearn.mixture.BayesianGaussianMixture`.
    See the sklearn documentation for details.

    :arg V: row-normalized eigenvectors for the purpose of clustering.
    :type V: :class:`numpy.ndarray`

    :arg n_clusters: specifies the number of clusters.
    :type n_clusters: int
    """

    try:
        from sklearn.mixture import BayesianGaussianMixture
    except ImportError:
        raise ImportError('Use of this function (BayesianGaussianMixture) requires the '
                          'installation of sklearn.')

    # accept either n_components or n_clusters, defaulting to 1
    n_components = kwargs.pop('n_components', None)
    if n_components is None:
        n_components = kwargs.pop('n_clusters', None)
        if n_components is None:
            n_components = 1

    n_init = kwargs.pop('n_init', 1)

    # fit once and return the hard cluster assignments
    mixture = BayesianGaussianMixture(n_components=n_components, n_init=n_init, **kwargs)
    return mixture.fit_predict(V)
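A quick usage sketch of the wrapper above (assumes scikit-learn is installed; the data is random, purely for illustration):

import numpy as np

V = np.random.RandomState(0).randn(100, 3)  # stand-in for row-normalized eigenvectors
labels = BayesianGaussianMixture(V, n_clusters=4)
print(np.unique(labels))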
Example #6
def model_dpgmm(k, x_scaled):
    global labels

    print(f"\nDirichlet Process Gaussian Mixture ... [{len(x_scaled)} training samples -> {k} initial clusters]")
    model = BayesianGaussianMixture(n_components=k, covariance_type='full')
    labels = model.fit_predict(x_scaled)
    print('Done! Final number of clusters =', len(np.unique(labels)))
    do_dataframe()
Example #7
def identifyTransitionsComplete(p, window_size):
    joint_angles = (p['X'][:, :, 0:7])
    joint_velocity = (p['X'][:, :, 7:14])
    torque = (p['U'][:, :, :])
    endeff_pose = (p['EX'][:, :, 0:6])
    endeff_velocity = (p['EX'][:, :, 6:12])
    force_feedback = (p['F'][:, :, :])
    total_rollout = joint_velocity.shape[0]
    traj = []
    for index in range(0, total_rollout):
        traj_feature = np.hstack(
            (joint_angles[index, :, :], endeff_pose[index, :, :],
             force_feedback[index, :, :]))
        traj.append(traj_feature)
    traj = np.array(traj)
    traj_time = traj.shape[1]
    dim = traj.shape[2]
    total_size = total_rollout * traj_time
    # NOTE: only total_rollout * (traj_time - window_size) rows are filled by
    # the loop below; when total_rollout > 1 the trailing rows stay all-zero
    # and are clustered together with the real windows.
    demo_data_array = np.zeros((total_size - window_size, dim * window_size))
    inc = 0
    for j in range(0, total_rollout):
        trajC = traj[j, :, :]
        for i in range(window_size, traj_time):
            window = trajC[i - window_size:i, :]
            demo_data_array[inc, :] = np.reshape(window,
                                                 (1, dim * window_size))
            inc = inc + 1

    estimator = BayesianGaussianMixture(n_components=10,
                                        n_init=10,
                                        max_iter=300,
                                        weight_concentration_prior=1e-1,
                                        init_params='random',
                                        verbose=False)
    labels = estimator.fit_predict(demo_data_array)
    # print(estimator.weights_)
    filtabels = smoothing(labels)
    # print(labels)
    inc = 0
    transitions = []
    for j in range(window_size, total_size):

        if inc == 0 or j == window_size:
            pass  # self._transitions.append((i,0))
        elif j == (total_size - 1):
            pass  # self._transitions.append((i,n-1))
        elif filtabels[inc - 1] != filtabels[inc]:
            transitions.append(j - window_size)
        inc = inc + 1

    transitions.append(0)
    transitions.append(total_size - 1)
    transitions.sort()
    print("[TSC] Discovered Transitions (number): ", len(transitions))

    return transitions
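The smoothing helper used by this and the later transition-detection examples is not included. A plausible sketch, assuming it is a sliding-window majority filter that suppresses one-off label flips:

import numpy as np

def smoothing(labels, window=15):
    # hypothetical majority-vote filter: replace each label with the most
    # frequent label inside a centered window of the given width
    labels = np.asarray(labels)
    out = labels.copy()
    half = window // 2
    for i in range(len(labels)):
        seg = labels[max(0, i - half):i + half + 1]
        out[i] = np.bincount(seg).argmax()
    return out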
Example #8
    def partition_data(self, j):
        dp = BayesianGaussianMixture(n_components=int(self.alpha * np.log(self.N)),
                                     weight_concentration_prior=self.alpha,
                                     init_params='kmeans',
                                     weight_concentration_prior_type='dirichlet_process')
        Z = dp.fit_predict(self.X[self.U[j]])
        le = LE()  # LE is presumably sklearn's LabelEncoder, aliased at import
        Z = le.fit_transform(Z)
        Z_count = np.bincount(Z)
        assert Z.max() + 1 == Z_count.size
        self.K[j] = int(Z_count.size)
        self.marginal_LL_k[j] = {k: 0 for k in range(int(self.K[j]))}
        return Z, Z_count
Example #9
def cluster(points,
            clouds=None,
            concentration_prior=None,
            K=100,
            restarts=10,
            seed=0):
    """
    Clusters a set of data points lying in an arbitrary number of clusters.
    Arguments:
        points (list of lists of floats): list of data points to be clustered.
        clouds (list of lists of floats, same second dimension as points): bootstrapped bins for clustering
        concentration_prior (float): Tuning parameter for clustering, must be between 0 and 1. Used to determine concentration
            of points in clusters -- higher favors more clusters, lower favors fewer clusters.
        K (int): maximum number of clusters to infer
        restarts (int): number of initializations to try for GMM
        seed (int): random number generator seed for GMM
    Returns:
        mus (list of lists of floats): List of cluster means.
        sigmas (list of 2D lists of floats): List of cluster covariances.
        clusterAssignments (list of ints): The assignment of each interval to a cluster, where an entry
                                            j at index i means the ith interval has been assigned to the
                                            jth meta-interval.
        numPoints (list of ints): Number of points assigned to each cluster
        numClusters (int): The number of clusters.
    """
    from sklearn.mixture import BayesianGaussianMixture
    from collections import Counter

    sp.log(msg="## Clustering with K={} and c={}...\n".format(
        K, concentration_prior),
           level="INFO")
    total = list(points)
    if clouds is not None:
        total.extend(list(clouds))
    npArray = np.array(total)

    gmm = BayesianGaussianMixture(
        n_components=K,
        n_init=restarts,
        weight_concentration_prior=concentration_prior,
        max_iter=int(1e6),
        random_state=seed)
    targetAssignments = gmm.fit_predict(npArray)
    targetAssignments = targetAssignments[:len(points)]
    mus = gmm.means_
    sigmas = gmm.covariances_
    cntr = Counter(targetAssignments)
    numPoints = [cntr[i] if i in cntr else 0 for i in range(K)]
    numClusters = len(cntr)

    return mus, sigmas, targetAssignments, numPoints, numClusters
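A usage sketch with synthetic 2-D points (assumes the sp logging module the function relies on is importable in scope):

import numpy as np

points = np.random.RandomState(0).randn(200, 2).tolist()
mus, sigmas, assignments, num_points, num_clusters = cluster(
    points, concentration_prior=0.1, K=20, restarts=3, seed=0)
print(f"{num_clusters} clusters inferred from {len(points)} points")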
Example #10
def add_gmm_labels(df: pd.DataFrame,
                   selected,  # scipy sparse matrix (densified via .toarray below)
                   k: int = 60,
                   n_init: int = 5):
    dense_selected = selected.toarray()
    model = BayesianGaussianMixture(n_components=k,
                                    covariance_type='diag',
                                    n_init=n_init,
                                    init_params='kmeans',
                                    random_state=12)
    labels = model.fit_predict(dense_selected)
    df['GMM'] = labels
    return model
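A usage sketch, assuming selected is a scipy sparse matrix (e.g., TF-IDF features) aligned row-for-row with df:

import numpy as np
import pandas as pd
from scipy import sparse

df = pd.DataFrame({'doc_id': range(100)})  # hypothetical frame
selected = sparse.csr_matrix(np.random.RandomState(0).rand(100, 8))
model = add_gmm_labels(df, selected, k=5)
print(df['GMM'].value_counts())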
Example #11
    def ms_VB(self):
        # use sklearn's VBGMM (BayesianGaussianMixture) directly
        self.K = MAX_K
        clf = BayesianGaussianMixture(n_components=MAX_K,
                                      covariance_type="full",
                                      max_iter=200,
                                      random_state=0)
        y = self.correct_order(clf.fit_predict(self.x), clf)
        self.mus = clf.means_
        print(y)
        self.show_scatter(y, "VBGMM")
        accuracy = np.mean(self.real_y.ravel() == y.ravel())
        print(accuracy)
Example #12
def relabel(dataset, n_components=4):
    if not len(dataset):
        return np.array([])

    new_labels = -1 * np.ones(len(dataset.labels))
    for l in np.unique(dataset.labels):
        gmm = BayesianGaussianMixture(n_components=n_components,
                                      weight_concentration_prior=1 /
                                      (n_components * 2),
                                      max_iter=200)
        ttt = PCA(n_components=6).fit_transform(
            dataset.waveforms[dataset.labels == l])
        gmm_labels = gmm.fit_predict(ttt)
        new_labels[dataset.labels == l] = gmm_labels + 1 + np.max(new_labels)
    return new_labels
Example #13
def identifyTransitions(traj, window_size, weight_prior, n_components):
    '''
    Transition detection function based on DPGMM and windowing approach
    :param traj: trajectory with states, action, contact forces
    :param window_size: windows size used to accumulate states
    :param weight_prior, n_components: parameter used for DPGMM
    :return: transition points (index) in the trajectory
    '''
    total_size = traj.shape[0]
    dim = traj.shape[1]
    demo_data_array = np.zeros((total_size - window_size, dim * window_size))
    inc = 0
    for i in range(window_size, total_size):
        window = traj[i - window_size:i, :]
        demo_data_array[inc, :] = np.reshape(window, (1, dim * window_size))
        inc = inc + 1

    estimator = BayesianGaussianMixture(
        n_components=n_components,
        n_init=10,
        max_iter=300,
        weight_concentration_prior=weight_prior,
        init_params='random',
        verbose=False)
    labels = estimator.fit_predict(demo_data_array)
    filtabels = smoothing(labels)
    inc = 0
    transitions = []
    for j in range(window_size, total_size):

        if inc == 0 or j == window_size:
            pass  # self._transitions.append((i,0))
        elif j == (total_size - 1):
            pass  # self._transitions.append((i,n-1))
        elif filtabels[inc - 1] != filtabels[inc]:
            transitions.append(j - window_size)
        inc = inc + 1

    transitions.append(0)
    transitions.append(total_size - 1)
    transitions.sort()

    # print("[TSC] Discovered Transitions (number): ", len(transitions))
    return transitions
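A usage sketch on a synthetic trajectory with one abrupt regime change (assumes the smoothing helper sketched earlier is in scope):

import numpy as np

rng = np.random.RandomState(0)
traj = np.vstack([rng.randn(100, 3), rng.randn(100, 3) + 5.0])  # change at t=100
points = identifyTransitions(traj, window_size=5, weight_prior=1e-2, n_components=5)
print(points)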
Example #14
    def execute(self, namespace):
        from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
        from PYME.IO import MetaDataHandler

        points = namespace[self.input_points]
        X = np.stack([points['x'], points['y'], points['z']], axis=1)

        if self.mode == 'n':
            gmm = GaussianMixture(n_components=self.n,
                                  covariance_type=self.covariance)
            predictions = gmm.fit_predict(X)

        elif self.mode == 'bic':
            n_components = range(1, self.n + 1)
            bic = np.zeros(len(n_components))
            for ind in range(len(n_components)):
                gmm = GaussianMixture(n_components=n_components[ind],
                                      covariance_type=self.covariance)
                gmm.fit(X)
                bic[ind] = gmm.bic(X)
                logger.debug('%d BIC: %f' % (n_components[ind], bic[ind]))

            best = n_components[np.argmin(bic)]
            if best == self.n or (self.n > 10 and best > 0.9 * self.n):
                logger.warning(
                    'BIC optimization selected n components near n max')

            gmm = GaussianMixture(n_components=best,
                                  covariance_type=self.covariance)
            predictions = gmm.fit_predict(X)

        elif self.mode == 'bayesian':
            bgm = BayesianGaussianMixture(n_components=self.n,
                                          covariance_type=self.covariance)
            predictions = bgm.fit_predict(X)

        out = tabular.MappingFilter(points)
        try:
            out.mdh = MetaDataHandler.DictMDHandler(points.mdh)
        except AttributeError:
            pass

        out.addColumn(self.label_key, predictions)
        namespace[self.output_labeled] = out
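The 'bic' branch above picks the component count by minimizing the Bayesian information criterion; the same selection loop in isolation, on synthetic data (names hypothetical):

import numpy as np
from sklearn.mixture import GaussianMixture

X = np.random.RandomState(0).randn(300, 3)
# fit one GMM per candidate size and keep the BIC of each
bics = [GaussianMixture(n_components=n, random_state=0).fit(X).bic(X) for n in range(1, 6)]
best_n = int(np.argmin(bics)) + 1  # offset because the range starts at 1
print(best_n, bics)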
Example #15
    def get_direction(self, X, mu):
        r"""
        Generate direction vectors.

        Parameters
        ----------
            X : array
                Array of shape ``(nwalkers//2, ndim)`` with the walker positions of the complementary ensemble.
            mu : float
                The value of the scale factor ``mu``.
        
        Returns
        -------
            directions : array
                Array of direction vectors of shape ``(nwalkers//2, ndim)``.
        """

        if not self.tune:
            mu = self.mu0

        n = X.shape[0]

        mixture = BayesianGaussianMixture(n_components=self.n_components)
        labels = mixture.fit_predict(X)
        means = mixture.means_
        covariances = mixture.covariances_

        i, j = np.random.choice(labels, 2, replace=False)
        if i != j:
            directions = np.random.multivariate_normal(
                means[i], covariances[i] * self.rescale_cov,
                size=n) - np.random.multivariate_normal(
                    means[j], covariances[j] * self.rescale_cov, size=n)
            tune_once = False
        else:
            directions = mu * np.random.multivariate_normal(
                np.zeros_like(means[i]), covariances[i], size=n)
            if self.tune:
                tune_once = True
            else:
                tune_once = False

        return 2.0 * directions, tune_once
Example #16
def identifyTransitions(traj, window_size):
    """
        Identify transition by accumulating data points using sliding window and using DP GMM to find
        clusters in a single trajectory
    """
    total_size = traj.shape[0]
    dim = traj.shape[1]
    demo_data_array = np.zeros((total_size - window_size, dim * window_size))
    inc = 0
    for i in range(window_size, total_size):
        window = traj[i - window_size:i, :]
        demo_data_array[inc, :] = np.reshape(window, (1, dim * window_size))
        inc = inc + 1

    estimator = BayesianGaussianMixture(n_components=5,
                                        n_init=10,
                                        max_iter=300,
                                        weight_concentration_prior=0.01,
                                        init_params='random',
                                        verbose=False)
    labels = estimator.fit_predict(demo_data_array)
    # print(estimator.weights_)
    filtabels = smoothing(labels)
    # print(labels)
    inc = 0
    transitions = []
    for j in range(window_size, total_size):

        if inc == 0 or j == window_size:
            pass  # self._transitions.append((i,0))
        elif j == (total_size - 1):
            pass  # self._transitions.append((i,n-1))
        elif filtabels[inc - 1] != filtabels[inc]:
            transitions.append(j - window_size)
        inc = inc + 1

    transitions.append(0)
    transitions.append(total_size - 1)
    transitions.sort()

    print("[TSC] Discovered Transitions (number): ", len(transitions))
    return transitions
Example #17
def identifyTransitions(traj, window_size):
    total_size = traj.shape[0]
    dim = traj.shape[1]
    demo_data_array = np.zeros((total_size - window_size, dim * window_size))
    inc = 0
    for i in range(window_size, total_size):
        window = traj[i - window_size:i, :]
        demo_data_array[inc, :] = np.reshape(window, (1, dim * window_size))
        inc = inc + 1

    estimator = BayesianGaussianMixture(n_components=10,
                                        n_init=10,
                                        max_iter=300,
                                        weight_concentration_prior=1e-2,
                                        init_params='random',
                                        verbose=False)
    labels = estimator.fit_predict(demo_data_array)
    # print(estimator.weights_)
    filtabels = smoothing(labels)
    # print(labels)
    inc = 0
    transitions = []
    for j in range(window_size, total_size):

        if inc == 0 or j == window_size:
            pass  # self._transitions.append((i,0))
        elif j == (total_size - 1):
            pass  # self._transitions.append((i,n-1))
        elif filtabels[inc - 1] != filtabels[inc]:
            transitions.append(j - window_size)
        inc = inc + 1

    transitions.append(0)
    transitions.append(total_size - 1)
    transitions.sort()

    # print("[TSC] Discovered Transitions (time): ", transitions)
    return transitions
Example #18
def local_gmm(global_label, levels, n=10, image=None):
    assert (
        type(image) is np.ndarray
        and image.dtype == np.uint8
        and len(image.shape) == 2
        or image is None
    ), "The input image has to be a uint8 2D numpy array, or omitted."
    assert (
        type(global_label) is np.ndarray
        and global_label.dtype == np.uint8
        and len(global_label.shape) == 2
    ), "The input global_label has to be a uint8 2D numpy array."
    assert len(levels) == np.max(
        global_label
    ), "The number of levels should match that of the global labels."
    assert type(n) is int
    local_label = np.zeros(global_label.shape, dtype=np.uint8)
    blob_levels = []
    for i in range(len(levels)):
        lvl_ind = (global_label == i + 1).nonzero()
        data = np.transpose(lvl_ind)
        gmm = BayesianGaussianMixture(n_components=min(n, len(data)), random_state=123)
        prediction = gmm.fit_predict(data)
        for j in np.unique(prediction):
            blob_levels.append(levels[i])
            blob_pts = prediction == j
            local_label[lvl_ind[0][blob_pts], lvl_ind[1][blob_pts]] = len(blob_levels)
    label_resized = (
        local_label
        if image is None or image.shape == local_label.shape
        else cv2.resize(local_label, image.shape[::-1], interpolation=cv2.INTER_NEAREST)
    )
    image_labeled = (
        None
        if image is None
        else img_as_ubyte(label2rgb(label_resized, image, bg_label=0))
    )
    return image_labeled, local_label, blob_levels
Example #19
def cluster_vbgm(aligned_maps):
    # sample_by_features = np.vstack([xmap.flatten() for xmap in aligned_maps])
    embedding = embed(aligned_maps)
    clusterer = BayesianGaussianMixture(n_components=10)
    return clusterer.fit_predict(embedding)
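embed is defined elsewhere in that project; a minimal stand-in consistent with the commented-out line above (flatten each aligned map, then reduce with PCA):

import numpy as np
from sklearn.decomposition import PCA

def embed(aligned_maps, n_components=20):
    # hypothetical embedding: one flattened row per map, projected by PCA
    sample_by_features = np.vstack([xmap.flatten() for xmap in aligned_maps])
    n_components = min(n_components, *sample_by_features.shape)
    return PCA(n_components=n_components).fit_transform(sample_by_features)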
Example #20
                      n_features=2,
                      center_box=[-5, 5],
                      centers=nb_centers,
                      random_state=1000)

    # Train the model with concentration 1000 and 0.1
    for c in (1000.0, 0.1):
        gm = BayesianGaussianMixture(n_components=5,
                                     weight_concentration_prior=c,
                                     max_iter=10000,
                                     random_state=1000)
        gm.fit(X)

        print('Weights: {}'.format(gm.weights_))

        Y_pred = gm.predict(X)  # the model was already fitted above

        for i in range(5):
            print((Y_pred == i).sum())

        # Compute the parameters of the Gaussian mixture
        m1 = gm.means_[0]
        m2 = gm.means_[1]
        m3 = gm.means_[2]
        m4 = gm.means_[3]
        m5 = gm.means_[4]

        c1 = gm.covariances_[0]
Example #21
    # for i in range(len(nc_array)):
    #     dp = GaussianMixture(n_components=nc_array[i], covariance_type='full', max_iter=10000, verbose=0)
    #     dp.fit(label_data)
    #
    #     dpgmm_list.append(dp)
    #     log_acc[i] = dp.lower_bound_ - nc_array[i]/rho
    #
    # dpgmm = dpgmm_list[np.argmax(log_acc)]
    # print(len(dpgmm.covariances_), log_acc)

    dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=50000,
                                    weight_concentration_prior_type='dirichlet_process', mean_precision_prior=.8,
                                    weight_concentration_prior=gamma, n_init=1, init_params='random', reg_covar=1e-6)

    # gmm_labels = dpgmm.predict(label_data)
    gmm_labels = dpgmm.fit_predict(label_data)
    # print('Variational', max(gmm_labels)+1)

    for k, cov in enumerate(dpgmm.covariances_):
        c1, c2 = np.diag(cov)
        if c1/c2 < cov_ratio and c2/c1 < cov_ratio:
            em_data.extend(label_data[gmm_labels == k])
            # import pdb; pdb.set_trace()

    if np.mod(m, 10) == 0:
        print('cluster {} out of {}'.format(m, len(arg_labels)))

    # plot_results(label_data, gmm_labels, dpgmm.means_, dpgmm.covariances_)
    # plt.show()

Example #22
def model_dpgmm(k, x_scaled):
    global labels

    model = BayesianGaussianMixture(n_components=k, covariance_type='full')
    labels = model.fit_predict(x_scaled)
Example #23
    X_pen_scaled = X_pen_scaled.reshape(-1, 1)

    X_train_pen, X_test_pen, y_train_pen, y_test_pen = train_test_split(
        X_pen_scaled, ypen, test_size=0.20)

    pen_classifier.fit(X_train_pen, y_train_pen)
    pen_pred = pen_classifier.predict(X_test_pen)
    pen_error_kmean.append(1 - metrics.accuracy_score(pen_pred, y_test_pen))
#===========================================================
#===========================EM=============================
from sklearn.decomposition import FastICA
for i in range(1, 31):
    X_pen_scaled = pen_scaler.fit_transform(Xpen)

    pen_bgm = BayesianGaussianMixture(n_components=i)
    X_pen_scaled = pen_bgm.fit_predict(X_pen_scaled)
    X_pen_scaled = X_pen_scaled.reshape(-1, 1)

    X_train_pen, X_test_pen, y_train_pen, y_test_pen = train_test_split(
        X_pen_scaled, ypen, test_size=0.20)

    pen_classifier.fit(X_train_pen, y_train_pen)
    pen_pred = pen_classifier.predict(X_test_pen)
    pen_error_em.append(1 - metrics.accuracy_score(pen_pred, y_test_pen))
#===========================================================

plt.figure(figsize=(12, 6))
plt.plot(range(1, 31),
         pen_error,
         label='No Clustering',
         color='red',
Example #24
    row_ix = np.where(yhat == cluster)
    # create scatter of these samples
    plt.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
plt.show()

# advanced
#================================================
ddgmm = BayesianGaussianMixture(
    n_components=2,
    covariance_type='full',
    weight_concentration_prior=100,
    weight_concentration_prior_type="dirichlet_distribution",
    max_iter=100,
    random_state=1337).fit(X)
yhat = ddgmm.predict(X)  # ddgmm was already fitted by the chained .fit(X) above
# retrieve unique clusters
clusters = np.unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    # get row indexes for samples with this cluster
    row_ix = np.where(yhat == cluster)
    # create scatter of these samples
    plt.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
plt.show()

dpgmm = BayesianGaussianMixture(
    n_components=2,
    covariance_type='full',
    weight_concentration_prior=100,
Example #25
print(test_mean[np.argmax(test_mean)])
print(train_mean[np.argmax(test_mean)])

mlp_learner = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='sgd', learning_rate='adaptive', learning_rate_init=0.07)
train_sizes, train_scores, test_scores = learning_curve(mlp_learner, x_train_scaled_km, y_train, cv=5, n_jobs=-1, train_sizes=np.linspace(0.01, 1.0, 100))
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plot_data(train_sizes, test_mean, title="Neural Network Learning Curve - Dataset 1 + Clustering Result", x_label="Training Size", y_label="Accuracy Score", color="orange", label='CV (+K-means Result)', linestyle='dashed')
plot_data(train_sizes, train_mean, title="Neural Network Learning Curve - Dataset 1 + Clustering Result", x_label="Training Size", y_label="Accuracy Score", color="orange", label='Training (+K-means Result)')
print(train_sizes)
print(train_sizes[np.argmax(test_mean)])
print(test_mean[np.argmax(test_mean)])
print(train_mean[np.argmax(test_mean)])

gm = BayesianGaussianMixture(n_components=14, random_state=random_state, reg_covar=1e-01)
y_pred = gm.fit_predict(x_projected_pca)
x_train_scaled_em = np.column_stack((x_train_scaled,y_pred))


mlp_learner = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='sgd', learning_rate='adaptive', learning_rate_init=0.07)
train_sizes, train_scores, test_scores = learning_curve(mlp_learner, x_train_scaled_em, y_train, cv=5, n_jobs=-1, train_sizes=np.linspace(0.01, 1.0, 100))
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plot_data(train_sizes, test_mean, title="Neural Network Learning Curve - Dataset 1 + Clustering Result", x_label="Training Size", y_label="Accuracy Score", color="blue", label='CV (+EM Result)', linestyle='dashed')
plot_data(train_sizes, train_mean, title="Neural Network Learning Curve - Dataset 1 + Clustering Result", x_label="Training Size", y_label="Accuracy Score", color="blue", label='Training (+EM Result)')
print(train_sizes)
print(train_sizes[np.argmax(test_mean)])
print(test_mean[np.argmax(test_mean)])
print(train_mean[np.argmax(test_mean)])

plt.savefig('Dataset 1 + Clustering NN learning curve.png')
Example #26
scl = [0, "rgb(150,0,90)"], [0.125, "rgb(0, 0, 200)"], [0.25, "rgb(0, 25, 255)"], \
      [0.375, "rgb(0, 152, 255)"], [0.5, "rgb(44, 255, 150)"], [0.625, "rgb(151, 255, 0)"], \
      [0.75, "rgb(255, 234, 0)"], [0.875, "rgb(255, 111, 0)"], [1, "rgb(255, 0, 0)"]

if __name__ == '__main__':

    scoords = SitesCoords()
    sites_i = 31265
    sites_f = 12100
    nc = 50
    mutual = True
    lsites = scoords.get_direct_neighbors(sites_i, 0.35)
    # lsites = range(sites_i, sites_f)
    lclust = compute_clusterings(lsites, nc, mutual=mutual)
    mdist = compute_distance_matrix(lclust, mutual=mutual)
    #plot_md_scaling(mdist)
    tdata = md_scaling(mdist)

    #cs = adjust_nc(tdata)
    #kmeans = KMeans(n_clusters=cs)
    #labels = kmeans.fit_predict(tdata)

    gmm = BayesianGaussianMixture(n_components=10,
                                  covariance_type='full',
                                  max_iter=1000,
                                  n_init=10,
                                  tol=0.00001)
    labels = gmm.fit_predict(tdata)
    create_plot(data_plot(lsites, labels), str(sites_i))
Example #27
#============================1. Uncomment this section for use with plots A, B, C, D =======================================================
for i in range(20):

    # Chose BayesianGaussianMixture because it received better accuracy
    # scores than plain GaussianMixture
    from sklearn.mixture import BayesianGaussianMixture
    bgm_sat = BayesianGaussianMixture(
        n_components=7,
        covariance_type='tied',
        weight_concentration_prior=params[i],
        max_iter=500)  # 7 categories, domain knowledge
    bgm_pen = BayesianGaussianMixture(
        n_components=10,
        covariance_type='full',
        weight_concentration_prior=params[i],
        max_iter=500)  # 10 categories, domain knowledge
    start_time = time.time()
    sat_labels_pred = bgm_sat.fit_predict(X_train_sat_og)
    #=======2. Use only for Plot C==================================
    # sat_labels_train = bgm_sat.fit_predict(X_train_sat_og)
    # sat_labels_test = bgm_sat.predict(X_test_sat_og)
    #=======================================================
    end_time = time.time()
    sat_time = end_time - start_time
    start_time = time.time()
    pen_labels_pred = bgm_pen.fit_predict(X_train_pen_og)
    #==========3. Use only for making Plot C======================
    # pen_labels_train = bgm_sat.fit_predict(X_train_pen_og)
    # pen_labels_test = bgm_sat.predict(X_test_pen_og)
    #=====================================================
    end_time = time.time()
    pen_time = end_time - start_time
    #=====================================================================================================================================
Example #28
    def execute(self, namespace):
        from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
        from PYME.IO import MetaDataHandler

        points = namespace[self.input_points]
        X = np.stack([points['x'], points['y'], points['z']], axis=1)

        if self.mode == 'n':
            gmm = GaussianMixture(n_components=self.n,
                                  covariance_type=self.covariance,
                                  max_iter=self.max_iter,
                                  init_params=self.init_params)
            predictions = gmm.fit_predict(X) + 1  # PYME labeling scheme
            log_prob = gmm.score_samples(X)
            if not gmm.converged_:
                logger.error('GMM fitting did not converge')
                predictions = np.zeros(len(points), int)
                log_prob = -np.inf * np.ones(len(points))

        elif self.mode == 'bic':
            n_components = range(1, self.n + 1)
            bic = np.zeros(len(n_components))
            for ind in range(len(n_components)):
                gmm = GaussianMixture(n_components=n_components[ind],
                                      covariance_type=self.covariance,
                                      max_iter=self.max_iter,
                                      init_params=self.init_params)
                gmm.fit(X)
                bic[ind] = gmm.bic(X)
                logger.debug('%d BIC: %f' % (n_components[ind], bic[ind]))

            best = n_components[np.argmin(bic)]
            if best == self.n or (self.n > 10 and best > 0.9 * self.n):
                logger.warning(
                    'BIC optimization selected n components near n max')

            gmm = GaussianMixture(n_components=best,
                                  covariance_type=self.covariance,
                                  max_iter=self.max_iter,
                                  init_params=self.init_params)
            predictions = gmm.fit_predict(X) + 1  # PYME labeling scheme
            log_prob = gmm.score_samples(X)
            if not gmm.converged_:
                logger.error('GMM fitting did not converge')
                predictions = np.zeros(len(points), int)
                log_prob = -np.inf * np.ones(len(points))

        elif self.mode == 'bayesian':
            bgm = BayesianGaussianMixture(n_components=self.n,
                                          covariance_type=self.covariance,
                                          max_iter=self.max_iter,
                                          init_params=self.init_params)
            predictions = bgm.fit_predict(X) + 1  # PYME labeling scheme
            log_prob = bgm.score_samples(X)
            if not bgm.converged_:
                logger.error('GMM fitting did not converge')
                predictions = np.zeros(len(points), int)
                log_prob = -np.inf * np.ones(len(points))

        out = tabular.MappingFilter(points)
        try:
            out.mdh = MetaDataHandler.DictMDHandler(points.mdh)
        except AttributeError:
            pass

        out.addColumn(self.label_key, predictions)
        out.addColumn(self.label_key + '_log_prob', log_prob)
        avg_log_prob = np.empty_like(log_prob)
        for label in np.unique(predictions):
            mask = label == predictions
            avg_log_prob[mask] = np.mean(log_prob[mask])
        out.addColumn(self.label_key + '_avg_log_prob', avg_log_prob)
        namespace[self.output_labeled] = out
Example #29
    n_components=args.nclusters,
    covariance_type='diag',
    max_iter=1000,
    weight_concentration_prior_type='dirichlet_process')

dimred = TSNE(n_components=2)
fig2, ax2 = plt.subplots(1, 1)
cmap = iter([plt.cm.tab20(x) for x in range(0, 20)])

with torch.no_grad():
    diter = iter(train_loader)
    y, lab = next(diter)
    mu, lvar = model.encode(y.view(-1, n_genes))
    y2 = model.reparam(mu, lvar)
    idx = clustering.fit_predict(y2.numpy())  # fit and assign in one pass
    #    idx = km.fit_predict(y2.numpy())
    dr = dimred.fit_transform(y2.numpy())
    mx, mn = np.max(dr), np.min(dr)

    ax2.set_xlim([mn, mx])

    for ii in np.unique(idx):
        clr = np.array(next(cmap)).reshape(1, -1)
        ax2.scatter(dr[idx == ii, 0], dr[idx == ii, 1], c=clr, **params)

fig2.tight_layout()
fig2.savefig(osp.join(gdir, ''.join([tag, '_tsne.png'])))
inlatent = pd.DataFrame(y2.numpy(),
                        index=lab,
                        columns=[str(x) for x in range(args.latent_dim)])
Example #30
def train(data: np.ndarray,
          obs_len: int,
          filter_name: str,
          model_dir: str,
          result_dir: str,
          save_model: bool = True) -> None:
	
	print('[Bayesian Gaussian Mixture Clustering][train] creating model...')

	bgm = BayesianGaussianMixture(n_components=3,
						  		  covariance_type="full",
						  		  max_iter=1000,
						  		  tol=1e-5,
						  		  n_init=10,
						  		  random_state=7,
						  		  weight_concentration_prior_type='dirichlet_process',
						  		  init_params="kmeans")

	print('[Bayesian Gaussian Mixture Clustering][train] training...')

	_y = bgm.fit_predict(X=data)
	_y = np.expand_dims(_y, axis=1)

	print(f'[Bayesian Gaussian Mixture Clustering][train] converged?:{bgm.converged_}')

	print('[Bayesian Gaussian Mixture Clustering][train] params (center and covariance):')
	for i, m, c, w in zip(range(1, 4), bgm.means_, bgm.covariances_, bgm.weights_):
		print(f'\tc_{i}-> mean: {m}')
		print(f'\t\tcov: {c}')
		print(f'\t\tweight: {w}')

	print('[Bayesian Gaussian Mixture Clustering][train] results:')
	_c, _l = np.unique(_y, return_counts=True)
	for i, c in zip(_c,_l):
		print (f'\tc_{i}: {c}')

	if save_model:
		model_file=f'bgm_{obs_len}s_{filter_name}.pkl'
		print (f'[Bayesian Gaussian Mixture Clustering][train] saving model ({model_file})...')
		with open(os.path.join(model_dir, model_file), 'wb') as f:
			pickle.dump(bgm, f)


	result_file = f'results_bgm_train_{obs_len}s_{filter_name}.csv'
	print (f'[Bayesian Gaussian Mixture Clustering][train] saving results ({result_file})...')
	labels = ['mean_velocity', 
			  'mean_acceleration', 
			  'mean_deceleration', 
			  'std_lateral_jerk', 
			  'driving_style']

	result = np.concatenate((data, _y), axis=1)
	df = pd.DataFrame(data=result, columns=labels)
	df.to_csv(os.path.join(result_dir,result_file))

	result_file = result_file.replace('results', 'params').replace('csv', 'json')
	print (f'[Bayesian Gaussian Mixture Clustering][train] saving results ({result_file})...')
	_d = {}
	_d['means'] = bgm.means_.tolist()
	_d['covariances'] = bgm.covariances_.tolist()
	_d['weights'] = bgm.weights_.tolist()
	with open(os.path.join(result_dir, result_file), 'w') as f:
		json.dump(_d, f)
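A sketch of reloading the pickled model for inference, matching the file naming used above (paths and arguments are hypothetical):

import os
import pickle

def load_bgm(model_dir, obs_len, filter_name):
    # mirrors the model_file pattern used in train() above
    model_file = f'bgm_{obs_len}s_{filter_name}.pkl'
    with open(os.path.join(model_dir, model_file), 'rb') as f:
        return pickle.load(f)

# bgm = load_bgm('models', 5, 'ekf')  # hypothetical arguments
# styles = bgm.predict(new_feature_rows)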
Example #31
def km_em(x_train_scaled, dataset_name="", true_vals=y_train, reg_covar=1e-01):
    distortions = []
    n = 22
    # v_measure = []
    homogeneity = []
    completeness = []
    mutual_info = []
    adj_rand_score = []
    sil = []
    kmeans_times = []
    homogeneity_em = []
    completeness_em = []
    mutual_info_em = []
    adj_rand_score_em = []
    sil_em = []
    em_times = []
    em_likelihood = []
    for i in range(2,n+1):
#         print(i)
        start_time = time.time()
        kmeans = KMeans(n_clusters=i, random_state=random_state)
        kmeans.fit(x_train_scaled)
        distortions.append(kmeans.inertia_)
        y_pred = kmeans.predict(x_train_scaled)
        kmeans_times.append(time.time()-start_time)
        homogeneity.append(homogeneity_score(true_vals, y_pred.tolist()))
        completeness.append(completeness_score(true_vals, y_pred.tolist()))
        mutual_info.append(adjusted_mutual_info_score(true_vals, y_pred.tolist()))
        adj_rand_score.append(adjusted_rand_score(true_vals, y_pred.tolist()))
        sil.append(silhouette_score(x_train_scaled, kmeans.labels_, metric='euclidean'))
        start_time = time.time()
        gm = BayesianGaussianMixture(n_components=i, random_state=random_state, reg_covar=reg_covar)
        y_pred = gm.fit_predict(x_train_scaled)
        em_times.append(time.time()-start_time)
        homogeneity_em.append(homogeneity_score(true_vals, y_pred.tolist()))
        completeness_em.append(completeness_score(true_vals, y_pred.tolist()))
        mutual_info_em.append(adjusted_mutual_info_score(true_vals, y_pred.tolist()))
        adj_rand_score_em.append(adjusted_rand_score(true_vals, y_pred.tolist()))
        if len(set(y_pred))>1:
            sil_em.append(silhouette_score(x_train_scaled, y_pred, metric='euclidean'))
        else:
            sil_em.append(1)
        em_likelihood.append(gm.score(x_train_scaled))
    # plot
    plt.plot(range(2, n+1), distortions, marker='o')
    plt.title("K-means Elbow ("+(str(dataset_name))+")")
    plt.xlabel('Number of clusters')
    plt.ylabel('Sum of Squared Distances')
    plt.savefig((str(dataset_name))+' km elbow.png')
    plt.show()

    plt.plot(range(2, n+1), sil, marker='o')
    plt.title('K-means Silhouette Scores ('+(str(dataset_name))+')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.savefig((str(dataset_name))+' km silho.png')
    plt.show()

    plt.plot(range(2, n+1), em_likelihood, marker='o')
    plt.title('EM likelihood ('+(str(dataset_name))+')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Likelihood')
    plt.savefig((str(dataset_name))+' em likelihood.png')
    plt.show()
    
    plt.plot(range(2, n+1), sil_em, marker='o')
    plt.title('EM Silhouette Scores ('+(str(dataset_name))+')')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.savefig((str(dataset_name))+' em silho.png')
    plt.show()
    
    plt.close()
    plot_data(list(range(2, n+1)), homogeneity, title="Performance Evaluation k-means ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="blue", label='Homogeneity')
    plot_data(list(range(2, n+1)), completeness, title="Performance Evaluation k-means ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="orange", label='Completeness')
    plot_data(list(range(2, n+1)), mutual_info, title="Performance Evaluation k-means ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="red", label='Adjusted Mutual Info')
    plot_data(list(range(2, n+1)), adj_rand_score, title="Performance Evaluation k-means ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="green", label='Adjusted Rand Index')
    # plot_data(list(range(1, n)), v_measure, title="Performance Evaluation k-means", x_label="Number of Clusters", y_label="Score", color="brown", label='V-measure')
    plt.savefig((str(dataset_name))+' km perfo.png')
    plt.show()

    plt.close()
    plot_data(list(range(2, n+1)), homogeneity_em, title="Performance Evaluation EM ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="blue", label='Homogeneity')
    plot_data(list(range(2, n+1)), completeness_em, title="Performance Evaluation EM ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="orange", label='Completeness')
    plot_data(list(range(2, n+1)), mutual_info_em, title="Performance Evaluation EM ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="red", label='Adjusted Mutual Info')
    plot_data(list(range(2, n+1)), adj_rand_score_em, title="Performance Evaluation EM ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Score", color="green", label='Adjusted Rand Index')
    # plot_data(list(range(1, n)), v_measure, title="Performance Evaluation EM", x_label="Number of Clusters", y_label="Score", color="brown", label='V-measure')
    plt.savefig((str(dataset_name))+' em perfo.png')
    plt.show()

    plt.close()
    plot_data(list(range(2, n+1)), kmeans_times, title="k-means/EM Running Time ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Time", color="red", label='k-means')
    plot_data(list(range(2, n+1)), em_times, title="k-means/EM Running Time ("+(str(dataset_name))+")", x_label="Number of Clusters", y_label="Time", color="blue", label='EM')
    plt.savefig((str(dataset_name))+' km-em time.png')
    plt.show()
    print('kmeans_times')
    print(kmeans_times)
    print('em_times')
    print(em_times)
    
    return {
        'sil': sil,
        'kmeans_times': kmeans_times,
        'em_times': em_times,
        'homogeneity': homogeneity,
        'completeness': completeness,
        'mutual_info': mutual_info,
        'adj_rand_score': adj_rand_score,
        'homogeneity_em': homogeneity_em,
        'completeness_em': completeness_em,
        'mutual_info_em': mutual_info_em,
        'adj_rand_score_em': adj_rand_score_em,
    }