Example #1
def categorise_dataset(contents):
    iris_setosa = []
    iris_versicolor = []
    iris_virginica = []
    for each_tuple in contents:
        if each_tuple[4] == 'Iris-virginica':
            iris_virginica.append(each_tuple[:4])
        elif each_tuple[4] == 'Iris-versicolor':
            iris_versicolor.append(each_tuple[:4])
        elif each_tuple[4] == 'Iris-setosa':
            iris_setosa.append(each_tuple[:4])

    kwargs = {
        'n_init': 5,
        # depends on number of cores in your machine.
        'n_jobs': 3,
        'n_clusters': 3,
    }
    kmeans = KMeans()
    kmeans.set_params(**kwargs)
    # apply kmeans
    iris_setosa_centroids_indices = kmeans.fit_predict(np.array(iris_setosa))
    iris_setosa_centroids = kmeans.cluster_centers_

    iris_versicolor_centroids_indices = kmeans.fit_predict(
        np.array(iris_versicolor))
    iris_versicolor_centroids = kmeans.cluster_centers_

    iris_virginica_centroids_indices = kmeans.fit_predict(
        np.array(iris_virginica))
    iris_virginica_centroids = kmeans.cluster_centers_
    return (iris_setosa_centroids, iris_versicolor_centroids,
            iris_virginica_centroids)
Example #2
def run_kmeans_2(X_train, X_test, y_test, dataset):
    #find k
    cluster_counts = {
        'wine': 3,
        'wage': 2,
    }
    model = KMeans()
    visualizer = KElbowVisualizer(model,
                                  k=(2, 100),
                                  metric='calinski_harabasz',
                                  timings=True)
    visualizer.fit(X_train)
    visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/km_sl_' + dataset + '.png')

    #validation
    model.set_params(n_clusters=cluster_counts[dataset])
    model.fit(X_train)
    score_fns = [
        v_measure_score,
        homogeneity_score,
        completeness_score,
    ]
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__,
                                  'score'] = score(y_test[y_test.columns[0]],
                                                   model.predict(X_test))
    print(cluster_validation_df)
Example #3
def partition_cells_by_kmeans(data: AnnData, rep: str, n_jobs: int,
                              n_clusters: int, n_clusters2: int,
                              n_init: int, random_state: int) -> List[int]:
    start = time.time()

    n_jobs = effective_n_jobs(n_jobs)

    rep_key = "X_" + rep
    X = data.obsm[rep_key].astype("float64")

    km = KMeans(n_clusters=n_clusters, n_jobs=n_jobs, n_init=n_init,
                random_state=random_state)
    km.fit(X)
    coarse = km.labels_.copy()

    km.set_params(n_init=1)
    labels = coarse.copy()
    base_sum = 0
    for i in range(n_clusters):
        idx = coarse == i
        nc = min(n_clusters2, idx.sum())
        km.set_params(n_clusters=nc)
        km.fit(X[idx,:])
        labels[idx] = base_sum + km.labels_
        base_sum += nc

    end = time.time()
    logger.info("partition_cells_by_kmeans finished in {:.2f}s.".format(end - start))

    return labels
Example #4
def categorise_dataset(contents):
    iris_setosa = []
    iris_versicolor = []
    iris_virginica = []
    for each_tuple in contents:
        if each_tuple[4] == 'Iris-virginica':
            iris_virginica.append(each_tuple[:4])
        elif each_tuple[4] == 'Iris-versicolor':
            iris_versicolor.append(each_tuple[:4])
        elif each_tuple[4] == 'Iris-setosa':
            iris_setosa.append(each_tuple[:4])

    kwargs = {
        'n_init': 5,
        # depends on number of cores in your machine.
        'n_jobs': 3,
        'n_clusters': 3,
    }
    kmeans = KMeans()
    kmeans.set_params(**kwargs)
    # apply kmeans
    iris_setosa_centroids_indices = kmeans.fit_predict(np.array(iris_setosa))
    iris_setosa_centroids = kmeans.cluster_centers_

    iris_versicolor_centroids_indices = kmeans.fit_predict(np.array(iris_versicolor))
    iris_versicolor_centroids = kmeans.cluster_centers_

    iris_virginica_centroids_indices = kmeans.fit_predict(np.array(iris_virginica))
    iris_virginica_centroids = kmeans.cluster_centers_
    return (iris_setosa_centroids,
            iris_versicolor_centroids,
            iris_virginica_centroids)
Example #5
    def cluster(self, X, num_clusters, k_means=None):
        if k_means is None:
            k_means = KMeans()
        k_means.set_params(n_clusters=num_clusters)
        X_cluster_space = k_means.fit_transform(X)
        cluster_labels = k_means.predict(X)
        return k_means, X_cluster_space, cluster_labels
Example #6
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """
        random_state = check_random_state(self.random_state)

        # Compute the number of cluster needed
        if self.ratio == 'auto':
            num_samples = self.stats_c_[self.min_c_]
        else:
            num_samples = int(self.stats_c_[self.min_c_] / self.ratio)

        # Create the clustering object
        kmeans = KMeans(n_clusters=num_samples, random_state=random_state)
        kmeans.set_params(**self.kwargs)

        # Start with the minority class
        X_min = X[y == self.min_c_]
        y_min = y[y == self.min_c_]

        # All the minority class samples will be preserved
        X_resampled = X_min.copy()
        y_resampled = y_min.copy()

        # Loop over the other classes under picking at random
        for key in self.stats_c_.keys():

            # If the minority class is up, skip it.
            if key == self.min_c_:
                continue

            # Find the centroids via k-means
            kmeans.fit(X[y == key])
            centroids = kmeans.cluster_centers_

            # Concatenate to the minority class
            X_resampled = np.concatenate((X_resampled, centroids), axis=0)
            y_resampled = np.concatenate(
                (y_resampled, np.array([key] * num_samples)), axis=0)

        self.logger.info('Under-sampling performed: %s', Counter(y_resampled))

        return X_resampled, y_resampled
Example #7
def kMeansScore(data, allKs=[1]):
    # K-MEANS
    km = KMeans()
    ks = []
    kmScore = []
    for k in allKs:
        km.set_params(n_clusters=k)
        ks.append(k)
        km.fit(data)
        kmScore.append(-km.inertia_)

    return ks, kmScore
Example #8
def cluster_pieces(array, k_start=3, k_stop=15):
    '''
    cluster pieces of puzzle
    '''
    km = KMeans()
    param_grid_km = {'n_clusters': np.arange(k_start, k_stop),
                     'algorithm': ['full', 'elkan']}
    km_cv = GridSearchCV(km, param_grid_km, cv=5).fit(array)
    print(f'Best parameters for KMeans model: {km_cv.best_params_}')
    km.set_params(**km_cv.best_params_).fit(array)
    return km
Example #9
def evaluate_kmeans(X, y, problem, out='./results/Clustering/'):
    """Also evaluate kmeans and em both"""
    sm = SMOTE()
    X_res, y_res = sm.fit_sample(X, y)

    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    distort_km = []
    distort_gm = []
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = KMeans(random_state=5)
    gm = GM(random_state=5)

    st = clock()
    clusters = [2, 3, 4, 5, 6]
    for k in clusters:
        print('now doing k=' + str(k))
        km.set_params(n_clusters=k)
        gm.set_params(n_components=k)
        km.fit(X_res)
        gm.fit(X_res)

        #distort_km.append(sum(np.min(cdist(X, km.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
        ##distort_gm.append(sum(np.min(cdist(X, gm.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
        SSE[k][problem] = km.score(X_res)
        ll[k][problem] = gm.score(X_res)
        print('km score:', SSE[k][problem])
        print('gm score:', ll[k][problem])
        acc[k][problem]['Kmeans'] = cluster_acc(y_res, km.predict(X_res))
        acc[k][problem]['GM'] = cluster_acc(y_res, gm.predict(X_res))
        adjMI[k][problem]['Kmeans'] = metrics.adjusted_mutual_info_score(
            y_res, km.predict(X_res))
        adjMI[k][problem]['GM'] = metrics.adjusted_mutual_info_score(
            y_res, gm.predict(X_res))

    print(k, clock() - st)

    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    acc = pd.Panel(acc)
    adjMI = pd.Panel(adjMI)

    SSE.to_csv(out + problem + ' SSE.csv')
    ll.to_csv(out + problem + ' logliklihood.csv')
    acc.ix[:, :, problem].to_csv(out + problem + ' acc.csv')
    adjMI.ix[:, :, problem].to_csv(out + problem + ' adjMI.csv')

    return SSE, ll, acc, adjMI, km, gm
Example #10
def test_cluster_purity(X, Y, X_test=None, Y_test=None, label_names=None, mnist=False):
    num_clusters = 4
    num_dims = None
    if mnist and num_dims is not None:
        pca = PCA(n_components=num_dims)
        X_orig = X
        X_pca = pca.fit_transform(X)
        X = X_pca
        print('PCA dims=' + str(num_dims))

    k_means = KMeans()
    k_means.set_params(n_clusters=num_clusters)
    print_cluster_purity(k_means, X, Y)
Example #11
File: cw1.py Project: arlyon/dmml
def cluster_sweep(features,
                  labels,
                  seed,
                  save_plot,
                  show_plot,
                  n_clusters=20,
                  step=1):
    """
    Performs a sweep across different numbers of clusters to determine the optimal number of
    for classification for this data set.
    """
    cluster_analysis = []
    model = KMeans()

    # Sweeps through the range of numbers of clusters, starting at 10 because of the 10 initial classes.
    print("Performing sweep of clusters...")
    with click.progressbar(range(10, n_clusters + 1, step)) as cluster_range:
        for cluster_size in cluster_range:
            model.set_params(n_clusters=cluster_size)
            numpy.random.set_state(seed)
            predictions = model.fit_predict(features)
            cluster_analysis.append(
                (cluster_size, score_clustering(labels, predictions)))

    # Plots the results from the cluster sweep
    data = list(zip(*[(x, *y.values()) for x, y in cluster_analysis]))
    if show_plot or save_plot:
        name = "Cluster"
        handles = plt.plot(data[0], data[3], '-b', label=name + " V Score")
        handles += plt.plot(data[0], data[4], '--b', label=name + " Rand")
        plt.legend(handles, loc="lower left")
        plt.xlabel("Number of Clusters")
        plt.title("Performance Comparison with K-Clusters")
        if save_plot is not None:
            path = os.path.join(save_plot, "cluster_sweep.png")
            plt.savefig(path)
            print("")
            print("saved figure to " + path)
        if show_plot:
            plt.show()
        plt.clf()

    # Returns the best scoring number of clusters based only on the adjusted_random_score
    # because this score is more accurate for high numbers of clusters.
    score = [
        rand for _, _, _, rand in
        [scores.values() for (_, scores) in cluster_analysis]
    ]
    score = numpy.argmax(score) + 10
    print(f"Best performance out of {n_clusters} clusters: {score}")
    return score
Example #12
    def kmeans_training(self, X, num_clusters):
        """
		in : - X < [studentID, x, y] >
		     - max_num_clusters
		out: - < [studentID, x, y, clusterID] >
		"""
        model = KMeans()
        model.set_params(n_clusters=num_clusters)
        # No need for studentID column during training
        model.fit(X.T[1:].T)
        students_clusters = model.labels_
        # Add a final column with students_clusters labels
        output = np.zeros((len(X), len(X[0]) + 1))
        output[:, :-1] = X
        output.T[-1] = students_clusters
        return output
Example #13
    def tester(self):
        meandist = []
        homogeneity_scores = []
        completeness_scores = []
        accuracy_scores = []
        silhouette_scores = []
        km = KMeans(max_iter=500, random_state=rand_state, init='k-means++')

        for k in self.num_clusters:
            km = km.set_params(n_clusters=k)
            km.fit_transform(self.data)
            predicts = km.labels_

            min_sq_dists = np.min(np.square(
                cdist(self.data, km.cluster_centers_, 'euclidean')),
                axis=1)
            value = np.mean(min_sq_dists)
            meandist.append(value)
            homogeneity_scores.append(
                metrics.homogeneity_score(self.target, predicts))
            completeness_scores.append(
                metrics.completeness_score(self.target, predicts))
            silhouette_scores.append(
                metrics.silhouette_score(self.data, predicts))
            y_pred = cluster_predictions(self.target, predicts)
            accuracy_scores.append(metrics.accuracy_score(self.target, y_pred))
        df_sil = pd.DataFrame(silhouette_scores)
        df_acc = pd.DataFrame(accuracy_scores)
        df_sil.to_csv('../data/plots/' + self.title +
                      '-KM-silhouette_scores.csv')
        df_acc.to_csv('../data/plots/' + self.title +
                      '-KM-accuracy_scores.csv')

        if self.gen_plot:
            self.plot(meandist, homogeneity_scores, completeness_scores)
Example #14
    def resample(self):
        '''


        :param ratio:
            The ratio of number of majority cluster centroids with respect to

        :param n_jobs:
        :param kargs:
        :return:
        '''

        # Create the clustering object
        from sklearn.cluster import KMeans
        kmeans = KMeans(random_state=self.rs)
        kmeans.set_params(**self.kargs)

        # Start with the minority class
        underx = self.x[self.y == self.minc]
        undery = self.y[self.y == self.minc]

        # Loop over the other classes under picking at random
        print('Finding cluster centroids...', end="")
        for key in self.ucd.keys():
            # If the minority class is up, skip it.
            if key == self.minc:
                continue

            # Set the number of clusters to be no more than the number of samples
            if self.ratio * self.ucd[self.minc] > self.ucd[key]:
                nclusters = self.ucd[key]
            else:
                nclusters = int(self.ratio * self.ucd[self.minc])

            # Set the number of clusters and find the centroids
            kmeans.set_params(n_clusters=nclusters)
            kmeans.fit(self.x[self.y == key])
            centroids = kmeans.cluster_centers_

            # Concatenate to the minority class
            underx = concatenate((underx, centroids), axis=0)
            undery = concatenate((undery, ones(nclusters) * key), axis=0)
            print(".", end="")

        print("done!")

        return underx, undery
Example #15
def run_kmeans(X_train, X_test, y_train, y_test):
    LOGGER.info('kmeans, train: {}, test: {}'.format(X_train.shape[0],
                                                     X_test.shape[0]))

    # max_clusters = 7
    # clusters=[2**x for x in range(1,max_clusters)]
    clusters = [x for x in range(1, 100)]
    # split_ratio = 0.33
    # X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=split_ratio,random_state=0)
    # LOGGER.debug('train test split: {}'.format(split_ratio))

    model = KMeans(random_state=0)

    # kms = [KMeans(n_clusters=i) for i in clusters]
    # choose_scores = [kms[i].fit(X_train).score(X) for i in range(len(kms))]

    # Validation

    score_fns = [
        # mutual_info_score,
        v_measure_score,
        homogeneity_score,
        completeness_score,
        # adjusted_mutual_info_score,
        # calinski_harabasz_score,
    ]

    # validation_score = pd.DataFrame(index=clusters,columns=sum([[score.__name__+'_train',score.__name__+'_test'] for score in score_fns],[]))
    validation_score = pd.DataFrame(index=clusters)
    choose_score = pd.DataFrame(index=clusters, columns=['score'])

    for k in clusters:
        # model = KMeans(random_state=0)
        LOGGER.debug('clusters: {}'.format(k))
        model.set_params(n_clusters=k)
        model.fit(X_train)
        sse_score = model.score(X_train)
        choose_score.loc[k, 'score'] = sse_score

        for score in score_fns:
            LOGGER.debug('evaluation: {}'.format(score.__name__))
            # validation_score.loc[k,score.__name__+'_train'] = score(y_train[y_train.columns[0]],model.predict(X_train))
            validation_score.loc[k, score.__name__ + '_test'] = score(
                y_test[y_test.columns[0]], model.predict(X_test))
            # validation_score.loc[k,'k'] = k

    return validation_score, choose_score, model
Example #16
    def resample(self):
        '''


        :param ratio:
            The ratio of number of majority cluster centroids with respect to

        :param n_jobs:
        :param kargs:
        :return:
        '''

        # Create the clustering object
        from sklearn.cluster import KMeans
        kmeans = KMeans(random_state=self.rs)
        kmeans.set_params(**self.kargs)

        # Start with the minority class
        underx = self.x[self.y == self.minc]
        undery = self.y[self.y == self.minc]

        # Loop over the other classes under picking at random
        print('Finding cluster centroids...', end="")
        for key in self.ucd.keys():
            # If the minority class is up, skip it.
            if key == self.minc:
                continue

            # Set the number of clusters to be no more than the number of samples
            if self.ratio * self.ucd[self.minc] > self.ucd[key]:
                nclusters = self.ucd[key]
            else:
                nclusters = int(self.ratio * self.ucd[self.minc])

            # Set the number of clusters and find the centroids
            kmeans.set_params(n_clusters=nclusters)
            kmeans.fit(self.x[self.y == key])
            centroids = kmeans.cluster_centers_

            # Concatenate to the minority class
            underx = concatenate((underx, centroids), axis=0)
            undery = concatenate((undery, ones(nclusters) * key), axis=0)
            print(".", end="")

        print("done!")

        return underx, undery
Example #17
    def resample(self):
        """
        ???

        :return:
        """

        # Compute the ratio if it is auto
        if self.ratio == 'auto':
            self.ratio = 1.

        # Create the clustering object
        from sklearn.cluster import KMeans
        kmeans = KMeans(random_state=self.rs)
        kmeans.set_params(**self.kwargs)

        # Start with the minority class
        underx = self.x[self.y == self.minc]
        undery = self.y[self.y == self.minc]

        # Loop over the other classes under picking at random
        for key in self.ucd.keys():
            # If the minority class is up, skip it.
            if key == self.minc:
                continue

            # Set the number of clusters to be no more than the number of
            # samples
            if self.ratio * self.ucd[self.minc] > self.ucd[key]:
                n_clusters = self.ucd[key]
            else:
                n_clusters = int(self.ratio * self.ucd[self.minc])

            # Set the number of clusters and find the centroids
            kmeans.set_params(n_clusters=n_clusters)
            kmeans.fit(self.x[self.y == key])
            centroids = kmeans.cluster_centers_

            # Concatenate to the minority class
            underx = concatenate((underx, centroids), axis=0)
            undery = concatenate((undery, ones(n_clusters) * key), axis=0)

        if self.verbose:
            print("Under-sampling performed: " + str(Counter(undery)))

        return underx, undery
Example #18
def partition_cells_by_kmeans(
    X: np.ndarray,
    n_clusters: int,
    n_clusters2: int,
    n_init: int,
    n_jobs: int,
    random_state: int,
    min_avg_cells_per_final_cluster: Optional[int] = 10,
) -> List[int]:

    n_clusters = min(n_clusters,
                     max(X.shape[0] // min_avg_cells_per_final_cluster, 1))
    if n_clusters == 1:
        return np.zeros(X.shape[0], dtype=np.int32)

    n_jobs = eff_n_jobs(n_jobs)

    kmeans_params = {
        'n_clusters': n_clusters,
        'n_init': n_init,
        'random_state': random_state,
    }
    km = KMeans(**kmeans_params)

    with threadpool_limits(limits=n_jobs):
        km.fit(X)
        coarse = km.labels_.copy()

        km.set_params(n_init=1)
        labels = coarse.copy()
        base_sum = 0
        for i in range(n_clusters):
            idx = coarse == i
            nc = min(n_clusters2,
                     max(idx.sum() // min_avg_cells_per_final_cluster, 1))
            if nc == 1:
                labels[idx] = base_sum
            else:
                km.set_params(n_clusters=nc)
                km.fit(X[idx, :])
                labels[idx] = base_sum + km.labels_
            base_sum += nc

    return labels
Example #19
def plot_selected_clusternumber_silhouette_scores(
    data,
    min_clusters,
    max_clusters,
):
    print("Tuning Silhouette: ")
    # candidate values for the number of clusters
    parameters = range(min_clusters, max_clusters)
    # instantiating ParameterGrid, pass number of clusters as input
    parameter_grid = ParameterGrid({'n_clusters': parameters})
    best_score = -1
    kmeans_model = KMeans(random_state=42)  # instantiating KMeans model
    silhouette_scores = []
    # evaluation based on silhouette_score
    for p in parameter_grid:
        kmeans_model.set_params(**p)  # set current hyper parameter
        kmeans_model.fit(
            data
        )  # fit model on wine dataset, this will find clusters based on parameter p
        ss = silhouette_score(
            data, kmeans_model.labels_)  # calculate silhouette_score
        silhouette_scores += [ss]  # store all the scores
        #print('Parameter:', p, 'Score', ss)
        # check p which has the best score
        if ss > best_score:
            best_score = ss
            best_grid = p
    # plotting silhouette score
    plt.figure(figsize=(10, 8))
    plt.bar(range(len(silhouette_scores)),
            list(silhouette_scores),
            align='center',
            color='#722f59',
            width=0.4)
    plt.xticks(range(len(silhouette_scores)), list(parameters))
    plt.title('Silhouette Score', fontweight='bold')
    plt.xlabel('Number of Clusters', fontsize=14)
    plt.show()
Example #20
    def elbow_method(self, X, max_num_clusters):
        """
		in : - X < [studentID, x, y] >
		     - max_num_clusters
		out: - output scores for different #clusters
		"""

        # Remove the studentID column
        X = X.T[1:].T
        model = KMeans()
        k_clusters = np.arange(1, max_num_clusters)
        scores = []
        for k in k_clusters:
            model.set_params(n_clusters=k)
            model.fit(X)
            scores.append(model.score(X))
        plt.plot(k_clusters, scores)
        plt.title('Elbow Method')
        plt.xlabel('Number of clusters')
        plt.ylabel('Scores')
        plt.show()
Example #21
def kMeansScore(data, allKs=[1], datasetType=None, target=None):
    # K-MEANS
    km = KMeans()
    ks = []
    kmScore = []
    f = open('plots/carPlots/KMeanClusterStats_' + datasetType + '.txt', 'w')
    targetLabels, targetStats = np.unique(target, return_counts=True)
    f.write("Data Target Labels: " + str(targetLabels) + "\n")
    f.write("Data Target Stats: " + str(targetStats) + "\n\n\n")
    for k in allKs:
        km.set_params(n_clusters=k)
        ks.append(k)
        km.fit(data)
        kmScore.append(-km.inertia_)
        labels, stats = np.unique(km.labels_, return_counts=True)
        f.write("Cluster Stats For K = " + str(k) + "\n")
        f.write("Unique Labels: " + str(labels) + "\n")
        f.write("Status Corresponding To The Labels: " + str(stats) + "\n\n")

    f.close()
    return ks, kmScore
Example #22
class ClusterCentroids(BaseUnderSampler):
    """Perform under-sampling by generating centroids based on
    clustering methods.

    Method that under samples the majority class by replacing a
    cluster of majority samples by the cluster centroid of a KMeans
    algorithm.  This algorithm keeps N majority samples by fitting the
    KMeans algorithm with N cluster to the majority class and using
    the coordinates of the N cluster centroids as the new majority
    samples.

    Read more in the :ref:`User Guide <cluster_centroids>`.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    estimator : object, optional(default=KMeans())
        Pass a :class:`sklearn.cluster.KMeans` estimator.

    voting : str, optional (default='auto')
        Voting strategy to generate the new samples:

        - If ``'hard'``, the nearest-neighbors of the centroids found using the
          clustering algorithm will be used.
        - If ``'soft'``, the centroids found by the clustering algorithm will
          be used.
        - If ``'auto'``, if the input is sparse, it will default to ``'hard'``
          otherwise, ``'soft'`` will be used.

        .. versionadded:: 0.3.0

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    ratio : str, dict, or callable
        .. deprecated:: 0.4
           Use the parameter ``sampling_strategy`` instead. It will be removed
           in 0.6.

    Notes
    -----
    Supports multi-class resampling by sampling each class independently.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
ClusterCentroids # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> cc = ClusterCentroids(random_state=42)
    >>> X_res, y_res = cc.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    ... # doctest: +ELLIPSIS
    Resampled dataset shape Counter({{...}})

    """

    def __init__(self,
                 sampling_strategy='auto',
                 random_state=None,
                 estimator=None,
                 voting='auto',
                 n_jobs=1,
                 ratio=None):
        super().__init__(
            sampling_strategy=sampling_strategy, ratio=ratio)
        self.random_state = random_state
        self.estimator = estimator
        self.voting = voting
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Private function to create the KMeans estimator"""
        if self.estimator is None:
            self.estimator_ = KMeans(
                random_state=self.random_state, n_jobs=self.n_jobs)
        elif isinstance(self.estimator, KMeans):
            self.estimator_ = clone(self.estimator)
        else:
            raise ValueError('`estimator` has to be a KMeans clustering.'
                             ' Got {} instead.'.format(type(self.estimator)))

    def _generate_sample(self, X, y, centroids, target_class):
        if self.voting_ == 'hard':
            nearest_neighbors = NearestNeighbors(n_neighbors=1)
            nearest_neighbors.fit(X, y)
            indices = nearest_neighbors.kneighbors(
                centroids, return_distance=False)
            X_new = safe_indexing(X, np.squeeze(indices))
        else:
            if sparse.issparse(X):
                X_new = sparse.csr_matrix(centroids, dtype=X.dtype)
            else:
                X_new = centroids
        y_new = np.array([target_class] * centroids.shape[0], dtype=y.dtype)

        return X_new, y_new

    def _fit_resample(self, X, y):
        self._validate_estimator()

        if self.voting == 'auto':
            if sparse.issparse(X):
                self.voting_ = 'hard'
            else:
                self.voting_ = 'soft'
        else:
            if self.voting in VOTING_KIND:
                self.voting_ = self.voting
            else:
                raise ValueError("'voting' needs to be one of {}. Got {}"
                                 " instead.".format(VOTING_KIND, self.voting))

        X_resampled, y_resampled = [], []
        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                n_samples = self.sampling_strategy_[target_class]
                self.estimator_.set_params(**{'n_clusters': n_samples})
                self.estimator_.fit(X[y == target_class])
                X_new, y_new = self._generate_sample(
                    X, y, self.estimator_.cluster_centers_, target_class)
                X_resampled.append(X_new)
                y_resampled.append(y_new)
            else:
                target_class_indices = np.flatnonzero(y == target_class)
                X_resampled.append(safe_indexing(X, target_class_indices))
                y_resampled.append(safe_indexing(y, target_class_indices))

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, np.array(y_resampled, dtype=y.dtype)

    def _more_tags(self):
        return {'sample_indices': False}
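A minimal usage sketch for the ClusterCentroids sampler defined above, assuming imbalanced-learn and scikit-learn are installed. The synthetic dataset mirrors the docstring example, while the n_init/random_state values and the choice of 'hard' voting are illustrative assumptions rather than values taken from the example.

from collections import Counter

from sklearn.cluster import KMeans
from sklearn.datasets import make_classification
from imblearn.under_sampling import ClusterCentroids

# Imbalanced toy data: roughly 10% minority class.
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=1000, random_state=10)

# Pass a custom KMeans estimator; voting='hard' keeps the real samples nearest
# to each centroid instead of the centroid coordinates themselves.
cc = ClusterCentroids(estimator=KMeans(n_init=10, random_state=0),
                      voting='hard', random_state=0)
X_res, y_res = cc.fit_resample(X, y)
print(Counter(y_res))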
Example #23
from sklearn.neighbors.nearest_centroid import NearestCentroid

km = KMeans(random_state=6, n_init=10)
gmm = GaussianMixture(random_state=6, n_init=1)


def plot_sil(X, mod, t):
    mod.fit(X)
    mod.score(X)
    pred = mod.predict(X)
    clf = NearestCentroid()
    clf.fit(X, pred)
    plot_silhouettes(X, pred, clf.centroids_, title=t)


km.set_params(n_clusters=5)
t = "Silhouette Analysis, Titanic k-Means with n_clusters = %d" % km.n_clusters
X = datasetsPCA['Titanic']['X_train']
plot_sil(X, km, t)

gmm.set_params(n_components=15)
t = "Post-PCA Silhouette Analysis, Titanic GMM with %d components" % gmm.n_components
plot_sil(X, gmm, t)
#plt.gca().set_xlim([-0.6,1.5])

km.set_params(n_clusters=6)
t = "Silhouette Analysis, Wilt k-Means with n_clusters = %d" % km.n_clusters
X = datasetsPCA['Wilt']['X_train']
plot_sil(X, km, t)
gmm.set_params(n_components=6)
t = "Post-PCA Silhouette Analysis, Wilt GMM with %d components" % gmm.n_components
class ClusterCentroids(BaseMulticlassSampler):
    """Perform under-sampling by generating centroids based on
    clustering methods.

    Method that under samples the majority class by replacing a
    cluster of majority samples by the cluster centroid of a KMeans
    algorithm.  This algorithm keeps N majority samples by fitting the
    KMeans algorithm with N cluster to the majority class and using
    the coordinates of the N cluster centroids as the new majority
    samples.

    Parameters
    ----------
    ratio : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio is defined as the number
        of samples in the minority class over the number of samples
        in the majority class.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random.

    estimator : object, optional(default=KMeans())
        Pass a `sklearn.cluster.KMeans` estimator.

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    Attributes
    ----------
    min_c_ : str or int
        The identifier of the minority class.

    max_c_ : str or int
        The identifier of the majority class.

    stats_c_ : dict of str/int : int
        A dictionary in which the number of occurrences of each class is
        reported.

    X_shape_ : tuple of int
        Shape of the data `X` during fitting.

    Notes
    -----
    This class supports multi-class resampling.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
    ClusterCentroids # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape {}'.format(Counter(y)))
    Original dataset shape Counter({1: 900, 0: 100})
    >>> cc = ClusterCentroids(random_state=42)
    >>> X_res, y_res = cc.fit_sample(X, y)
    >>> print('Resampled dataset shape {}'.format(Counter(y_res)))
    Resampled dataset shape Counter({0: 100, 1: 100})

    """

    def __init__(self,
                 ratio='auto',
                 random_state=None,
                 estimator=None,
                 n_jobs=1):
        super(ClusterCentroids, self).__init__(
            ratio=ratio, random_state=random_state)
        self.estimator = estimator
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Private function to create the NN estimator"""

        if self.estimator is None:
            self.estimator_ = KMeans(
                random_state=self.random_state, n_jobs=self.n_jobs)
        elif isinstance(self.estimator, KMeans):
            self.estimator_ = self.estimator
        else:
            raise ValueError('`estimator` has to be a KMeans clustering.'
                             ' Got {} instead.'.format(type(self.estimator)))

    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """

        super(ClusterCentroids, self).fit(X, y)

        self._validate_estimator()

        return self

    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """

        # Compute the number of cluster needed
        if self.ratio == 'auto':
            num_samples = self.stats_c_[self.min_c_]
        else:
            num_samples = int(self.stats_c_[self.min_c_] / self.ratio)

        # Set the number of sample for the estimator
        self.estimator_.set_params(**{'n_clusters': num_samples})

        # Start with the minority class
        X_min = X[y == self.min_c_]
        y_min = y[y == self.min_c_]

        # All the minority class samples will be preserved
        X_resampled = X_min.copy()
        y_resampled = y_min.copy()

        # Loop over the other classes under picking at random
        for key in self.stats_c_.keys():

            # If the minority class is up, skip it.
            if key == self.min_c_:
                continue

            # Find the centroids via k-means
            self.estimator_.fit(X[y == key])
            centroids = self.estimator_.cluster_centers_

            # Concatenate to the minority class
            X_resampled = np.concatenate((X_resampled, centroids), axis=0)
            y_resampled = np.concatenate(
                (y_resampled, np.array([key] * num_samples)), axis=0)

        self.logger.info('Under-sampling performed: %s', Counter(y_resampled))

        return X_resampled, y_resampled
Example #25
class KMeans(ModelBase):
    """
    KMeans:

    Fits an Sklearn KMeans model to X.


    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html


    Attributes
    ----------
    n_clusters_ : int
                  The number of clusters, K

    cluster_inertia_ : float
                       Sum of squared distances of samples to their closest cluster center

    cluster_labels_ : array, [n_clusters_]
                      Labels indicating the membership of each point

    cluster_centers_ : array, [n_clusters, n_features]
                       Coordinates of cluster centers

    sample_labels_ : array, [n_samples]
                     Labels for each of the samples in X

    sample_distances_ : array, [n_samples]
                        The distance between each sample point and its cluster's center


    Constants
    ---------
    SAMPLE_CUTOFF_ : int
                     If n_samples > SAMPLE_CUTOFF_ then sample distances
                     are NOT recorded
    """

    SAMPLE_CUTOFF_ = 1000

    def __init__(self):
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    @property
    def cluster_inertia_(self):
        # Sum of squared distances of samples to their closest cluster center
        return None if self.model_ is None else \
            self.model_.inertia_

    @property
    def cluster_labels_(self):
        # Cluster membership labels for each point
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.labels_)

    @property
    def cluster_centers_(self):
        # Coordinates of the cluster centers
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.cluster_centers_)

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    def fit(self, X, K, sample_labels=None, estimator_params=None):
        """Fits a Sklearn KMeans model to X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        K : int
            The number of clusters.

        sample_labels : array-like, shape (n_samples), optional
                        Labels for each of the samples in X.

        estimator_params : dict, optional
                           The parameters to pass to the KMeans estimators.


        Returns
        -------
        self
        """
        self._reset()
        # Note: previously set n_init=50
        self.model_ = SklearnKMeans(K)
        if estimator_params is not None:
            assert isinstance(estimator_params, dict)
            self.model_.set_params(**estimator_params)

        # Compute Kmeans model
        self.model_.fit(X)
        if sample_labels is None:
            sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
        assert len(sample_labels) == X.shape[0]
        self.sample_labels_ = np.array(sample_labels)
        self.n_clusters_ = K

        # Record sample label/distance from its cluster center
        self.sample_distances_ = OrderedDict()
        for cluster_label in range(self.n_clusters_):
            assert cluster_label not in self.sample_distances_
            member_rows = X[self.cluster_labels_ == cluster_label, :]
            member_labels = self.sample_labels_[self.cluster_labels_ ==
                                                cluster_label]
            centroid = np.expand_dims(self.cluster_centers_[cluster_label],
                                      axis=0)

            # "All clusters must have at least 1 member!"
            if member_rows.shape[0] == 0:
                return None

            # Calculate distance between each member row and the current cluster
            dists = np.empty(member_rows.shape[0])
            dist_labels = []
            for j, (row, label) in enumerate(zip(member_rows, member_labels)):
                dists[j] = cdist(np.expand_dims(row, axis=0), centroid,
                                 "euclidean").squeeze()
                dist_labels.append(label)

            # Sort the distances/labels in ascending order
            sort_order = np.argsort(dists)
            dists = dists[sort_order]
            dist_labels = np.array(dist_labels)[sort_order]
            self.sample_distances_[cluster_label] = {
                "sample_labels": dist_labels,
                "distances": dists,
            }
        return self

    def get_closest_samples(self):
        """Returns a list of the labels of the samples that are located closest
           to their cluster's center.


        Returns
        ----------
        closest_samples : list
                  A list of the sample labels that are located the closest to
                  their cluster's center.
        """
        if self.sample_distances_ is None:
            raise Exception("No model has been fit yet!")

        return [
            samples['sample_labels'][0]
            for samples in list(self.sample_distances_.values())
        ]

    def get_memberships(self):
        '''
        Return the memberships in each cluster
        '''
        memberships = OrderedDict()
        for cluster_label, samples in list(self.sample_distances_.items()):
            memberships[cluster_label] = OrderedDict([
                (l, d)
                for l, d in zip(samples["sample_labels"], samples["distances"])
            ])
        return json.dumps(memberships, indent=4)
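A short, hypothetical usage sketch of the KMeans wrapper defined above. The tiny feature matrix, the sample names, the choice of K, and the estimator_params values are made-up illustrations; numpy and the wrapper class itself are assumed to be importable.

import numpy as np

# Hypothetical data: six 2-D points forming two obvious groups.
X = np.array([[0.0, 0.1], [0.2, 0.0], [0.1, 0.2],
              [5.0, 5.1], [5.2, 4.9], [4.8, 5.0]])
names = ["a", "b", "c", "d", "e", "f"]

model = KMeans()  # the wrapper class defined above, not sklearn's
model.fit(X, K=2, sample_labels=names,
          estimator_params={"n_init": 10, "random_state": 0})

print(model.cluster_inertia_)       # sum of squared distances to the centers
print(model.get_closest_samples())  # one sample label per cluster
print(model.get_memberships())      # JSON dump of per-cluster distances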
Example #26
@pytest.mark.parametrize(
    'X,y,oversampler',
    [
        (
            np.array([(0.0, 0.0), (1.0, 1.0), (2.0, 2.0), (3.0, 3.0),
                      (4.0, 4.0)]),
            np.array([0, 0, 1, 1, 1]),
            ClusterOverSampler(
                oversampler=SMOTE(k_neighbors=5, random_state=RANDOM_STATE)),
        ),
        (
            np.array([(0.0, 0.0), (1.0, 1.0), (2.0, 2.0), (3.0, 3.0),
                      (4.0, 4.0)]),
            np.array([0, 0, 1, 1, 1]),
            ClusterOverSampler(
                oversampler=SMOTE(k_neighbors=5, random_state=RANDOM_STATE),
                clusterer=CLUSTERER.set_params(n_clusters=3),
                random_state=RANDOM_STATE,
            ),
        ),
    ],
)
def test_fit_resample_intra_corner_cases(X, y, oversampler):
    """Test the fit_resample method for various
    corner cases and oversamplers."""
    X_res, y_res = oversampler.fit_resample(X, y)
    y_count = Counter(y_res)
    assert y_count[0] == y_count[1]
    assert X.item(0, 0) <= X_res.item(-1, 0) <= X.item(1, 0)
    assert X.item(0, 1) <= X_res.item(-1, 1) <= X.item(1, 1)

Example #27
    def fit(self, X, y=None, **kwargs):
        """Fit the encoder on a collection of data, e.g. image patches.

        Parameters
        ----------
        X: array-like, shape: n_samples, n_features
            the patch data to be fitted

        Returns
        -------
        self: object
            Returns the object itself
        """
        X = np.atleast_2d(X)
        n_samples, n_features = X.shape
        # normalize each patch individually
        if self.local_contrast:
            if self.verbose:
                print "Local contrast normalization of the data"
            X = self.local_contrast_normalization(X)

        # kmeans model to find the filters
        if self.verbose:
            print "About to extract atoms from %d samples" % n_samples
        kmeans = KMeans(n_clusters=self.n_atoms,
                        init='k-means++',
                        max_iter=self.max_iter,
                        n_init=self.n_init,
                        tol=self.tol,
                        verbose=self.verbose)

        if self.whiten:
            if self.verbose:
                print "Whitening PCA of the samples"
            self.pca = pca = PCA(whiten=True, n_components=self.n_components)
            pca.fit(X)
            X = pca.transform(X)

            # compute the KMeans centers
            if 0 < self.n_prefit < pca.n_components:
                if self.verbose:
                    print "First KMeans in simplified curriculum space"
                # starting the k-means on the projection onto the first
                # singular components: curriculum learning trick by Andrej Karpathy
                kmeans.fit(X[:, :self.n_prefit])

                # warm restart by padding previous centroids with zeros
                # with full dimensionality this time
                kmeans.init = np.zeros((self.n_atoms, pca.n_components),
                                       dtype=kmeans.cluster_centers_.dtype)
                kmeans.init[:, :self.n_prefit] = kmeans.cluster_centers_
                if self.verbose:
                    print "Second KMeans in full whitened sample space"
                kmeans.set_params(n_init=1).fit(X)
            else:
                if self.verbose:
                    print "KMeans in full original sample space"
                # regular kmeans fit (without the curriculum trick)
                kmeans.fit(X)

            # project back the centers in original, non-whitened space (useful
            # for qualitative inspection of the filters)
            self.components_ = self.pca.inverse_transform(
                kmeans.cluster_centers_)
        else:
            # find the kernel in the raw original dimensional space
            # TODO: experiment with component wise scaling too
            self.pca = None
            kmeans.fit(X)
            self.components_ = kmeans.cluster_centers_

        self.kmeans = kmeans
        self.inertia_ = kmeans.inertia_
        return self
Example #28
class ClusterCentroids(BaseMulticlassSampler):
    """Perform under-sampling by generating centroids based on
    clustering methods.

    Experimental method that under samples the majority class by replacing a
    cluster of majority samples by the cluster centroid of a KMeans algorithm.
    This algorithm keeps N majority samples by fitting the KMeans algorithm
    with N cluster to the majority class and using the coordinates of the N
    cluster centroids as the new majority samples.

    Parameters
    ----------
    ratio : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio is defined as the number
        of samples in the minority class over the number of samples
        in the majority class.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random.

    estimator : object, optional(default=KMeans())
        Pass a `sklearn.cluster.KMeans` estimator.

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    Attributes
    ----------
    min_c_ : str or int
        The identifier of the minority class.

    max_c_ : str or int
        The identifier of the majority class.

    stats_c_ : dict of str/int : int
        A dictionary in which the number of occurrences of each class is
        reported.

    X_shape_ : tuple of int
        Shape of the data `X` during fitting.

    Notes
    -----
    This class supports multi-class resampling.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
    ClusterCentroids # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape {}'.format(Counter(y)))
    Original dataset shape Counter({1: 900, 0: 100})
    >>> cc = ClusterCentroids(random_state=42)
    >>> X_res, y_res = cc.fit_sample(X, y)
    >>> print('Resampled dataset shape {}'.format(Counter(y_res)))
    Resampled dataset shape Counter({0: 100, 1: 100})

    """

    def __init__(self,
                 ratio='auto',
                 random_state=None,
                 estimator=None,
                 n_jobs=1):
        super(ClusterCentroids, self).__init__(
            ratio=ratio, random_state=random_state)
        self.estimator = estimator
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Private function to create the NN estimator"""

        if self.estimator is None:
            self.estimator_ = KMeans(
                random_state=self.random_state, n_jobs=self.n_jobs)
        elif isinstance(self.estimator, KMeans):
            self.estimator_ = self.estimator
        else:
            raise ValueError('`estimator` has to be a KMeans clustering.')

    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """

        super(ClusterCentroids, self).fit(X, y)

        self._validate_estimator()

        return self

    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """

        # Compute the number of cluster needed
        if self.ratio == 'auto':
            num_samples = self.stats_c_[self.min_c_]
        else:
            num_samples = int(self.stats_c_[self.min_c_] / self.ratio)

        # Set the number of sample for the estimator
        self.estimator_.set_params(**{'n_clusters': num_samples})

        # Start with the minority class
        X_min = X[y == self.min_c_]
        y_min = y[y == self.min_c_]

        # All the minority class samples will be preserved
        X_resampled = X_min.copy()
        y_resampled = y_min.copy()

        # Loop over the other classes under picking at random
        for key in self.stats_c_.keys():

            # If the minority class is up, skip it.
            if key == self.min_c_:
                continue

            # Find the centroids via k-means
            self.estimator_.fit(X[y == key])
            centroids = self.estimator_.cluster_centers_

            # Concatenate to the minority class
            X_resampled = np.concatenate((X_resampled, centroids), axis=0)
            y_resampled = np.concatenate(
                (y_resampled, np.array([key] * num_samples)), axis=0)

        self.logger.info('Under-sampling performed: %s', Counter(y_resampled))

        return X_resampled, y_resampled
Example #29
class ClusterCentroids(BaseUnderSampler):
    """Perform under-sampling by generating centroids based on
    clustering methods.

    Method that under samples the majority class by replacing a
    cluster of majority samples by the cluster centroid of a KMeans
    algorithm.  This algorithm keeps N majority samples by fitting the
    KMeans algorithm with N cluster to the majority class and using
    the coordinates of the N cluster centroids as the new majority
    samples.

    Read more in the :ref:`User Guide <cluster_centroids>`.

    Parameters
    ----------
    ratio : str, dict, or callable, optional (default='auto')
        Ratio to use for resampling the data set.

        - If ``str``, has to be one of: (i) ``'minority'``: resample the
          minority class; (ii) ``'majority'``: resample the majority class,
          (iii) ``'not minority'``: resample all classes apart of the minority
          class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``:
          corresponds to ``'all'`` for over-sampling methods and ``'not
          minority'`` for under-sampling methods. The classes targeted will be
          over-sampled or under-sampled to achieve an equal number of sample
          with the majority or minority class.
        - If ``dict``, the keys correspond to the targeted classes. The values
          correspond to the desired number of samples.
        - If callable, function taking ``y`` and returns a ``dict``. The keys
          correspond to the targeted classes. The values correspond to the
          desired number of samples.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, ``random_state`` is the seed used by the random number
        generator; If ``RandomState`` instance, random_state is the random
        number generator; If ``None``, the random number generator is the
        ``RandomState`` instance used by ``np.random``.

    estimator : object, optional(default=KMeans())
        Pass a :class:`sklearn.cluster.KMeans` estimator.

    voting : str, optional (default='auto')
        Voting strategy to generate the new samples:

        - If ``'hard'``, the nearest-neighbors of the centroids found using the
          clustering algorithm will be used.
        - If ``'soft'``, the centroids found by the clustering algorithm will
          be used.
        - If ``'auto'``, if the input is sparse, it will default to ``'hard'``
          otherwise, ``'soft'`` will be used.

        .. versionadded:: 0.3.0

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    Notes
    -----
    Supports multi-class resampling by sampling each class independently.

    See :ref:`sphx_glr_auto_examples_under-sampling_plot_cluster_centroids.py`.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
ClusterCentroids # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape {}'.format(Counter(y)))
    Original dataset shape Counter({1: 900, 0: 100})
    >>> cc = ClusterCentroids(random_state=42)
    >>> X_res, y_res = cc.fit_sample(X, y)
    >>> print('Resampled dataset shape {}'.format(Counter(y_res)))
    ... # doctest: +ELLIPSIS
    Resampled dataset shape Counter({...})

    """

    def __init__(self,
                 ratio='auto',
                 random_state=None,
                 estimator=None,
                 voting='auto',
                 n_jobs=1):
        super(ClusterCentroids, self).__init__(
            ratio=ratio)
        self.random_state = random_state
        self.estimator = estimator
        self.voting = voting
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Private function to create the KMeans estimator"""
        if self.estimator is None:
            self.estimator_ = KMeans(
                random_state=self.random_state, n_jobs=self.n_jobs)
        elif isinstance(self.estimator, KMeans):
            self.estimator_ = self.estimator
        else:
            raise ValueError('`estimator` has to be a KMeans clustering.'
                             ' Got {} instead.'.format(type(self.estimator)))

    def _generate_sample(self, X, y, centroids, target_class):
        if self.voting_ == 'hard':
            nearest_neighbors = NearestNeighbors(n_neighbors=1)
            nearest_neighbors.fit(X, y)
            indices = nearest_neighbors.kneighbors(centroids,
                                                   return_distance=False)
            X_new = safe_indexing(X, np.squeeze(indices))
        else:
            if sparse.issparse(X):
                X_new = sparse.csr_matrix(centroids)
            else:
                X_new = centroids
        y_new = np.array([target_class] * centroids.shape[0])

        return X_new, y_new

    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        """
        self._validate_estimator()

        if self.voting == 'auto':
            if sparse.issparse(X):
                self.voting_ = 'hard'
            else:
                self.voting_ = 'soft'
        else:
            if self.voting in VOTING_KIND:
                self.voting_ = self.voting
            else:
                raise ValueError("'voting' needs to be one of {}. Got {}"
                                 " instead.".format(VOTING_KIND, self.voting))

        X_resampled, y_resampled = [], []
        for target_class in np.unique(y):
            if target_class in self.ratio_.keys():
                n_samples = self.ratio_[target_class]
                self.estimator_.set_params(**{'n_clusters': n_samples})
                self.estimator_.fit(X[y == target_class])
                X_new, y_new = self._generate_sample(
                    X, y, self.estimator_.cluster_centers_, target_class)
                X_resampled.append(X_new)
                y_resampled.append(y_new)
            else:
                target_class_indices = np.flatnonzero(y == target_class)
                X_resampled.append(safe_indexing(X, target_class_indices))
                y_resampled.append(safe_indexing(y, target_class_indices))

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, np.array(y_resampled)
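# A minimal sketch of what the 'soft' and 'hard' voting strategies above produce
# for a single class, using plain scikit-learn; the data and cluster count are illustrative.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X_major = rng.normal(size=(200, 2))      # samples of the class being under-sampled
n_keep = 10                              # desired number of samples after resampling

km = KMeans(n_clusters=n_keep, n_init=10, random_state=0).fit(X_major)

# 'soft' voting: keep the centroids themselves (synthetic points).
soft_samples = km.cluster_centers_

# 'hard' voting: keep the real sample closest to each centroid.
nn = NearestNeighbors(n_neighbors=1).fit(X_major)
nearest = nn.kneighbors(km.cluster_centers_, return_distance=False).ravel()
hard_samples = X_major[nearest]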
Exemplo n.º 30
0
dim = [2, 3, 4, 5]

km = KM(random_state=42)
gmm = GMM(random_state=42)

Score = defaultdict(list)
adjMI = defaultdict(list)
S_homog = defaultdict(list)
S_adjMI = defaultdict(list)
S_vm = defaultdict(list)

for i in dim:
    reduced_X = PCA(n_components=i,
                    random_state=42).fit_transform(X_train_scaled)
    k = 30
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(reduced_X)
    gmm.fit(reduced_X)
    S_homog['km'].append(
        metrics.homogeneity_score(labels, km.predict(reduced_X)))
    S_homog['gmm'].append(
        metrics.homogeneity_score(labels, gmm.predict(reduced_X)))
    S_adjMI['km'].append(
        metrics.adjusted_mutual_info_score(labels, km.predict(reduced_X)))
    S_adjMI['gmm'].append(
        metrics.adjusted_mutual_info_score(labels, gmm.predict(reduced_X)))
    S_vm['km'].append(metrics.v_measure_score(labels, km.predict(reduced_X)))
    S_vm['gmm'].append(metrics.v_measure_score(labels, gmm.predict(reduced_X)))

#plt.legend(['Train', 'Test'], loc='lower right')
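# A self-contained variant of the sweep above, assuming only scikit-learn; it uses the
# iris data in place of the original X_train_scaled/labels, which are defined elsewhere.
from collections import defaultdict

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

X, labels = load_iris(return_X_y=True)
X = StandardScaler().fit_transform(X)

km = KMeans(random_state=42)
gmm = GaussianMixture(random_state=42)
scores = defaultdict(list)

for i in [2, 3, 4]:                      # candidate PCA dimensionalities
    reduced_X = PCA(n_components=i, random_state=42).fit_transform(X)
    km.set_params(n_clusters=3)
    gmm.set_params(n_components=3)
    scores['km'].append(metrics.v_measure_score(labels, km.fit_predict(reduced_X)))
    scores['gmm'].append(metrics.v_measure_score(labels, gmm.fit_predict(reduced_X)))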
Exemplo n.º 31
0
class assignment4:
    def __init__(self):
        # data processing
        self.dataSetPath = './data_set/'
        self.dataSetName = ""
        self.csv_delimiter = ','
        self.data = None
        self.allFeatures = []
        self.allTarget = []

        # not used
        self.XTrain = None
        self.XTest = None
        self.YTrain = None
        self.YTest = None

        # k-means clustering
        self.kNum = range(1, 21)
        self.kmean = None
        self.kmeanRD = None
        # expectation maximization
        self.em = None
        self.emRD = None
        # PCA
        self.pca = None
        self.pcaDims = range(1, 21)

        # ICA
        self.icaDims = range(1, 21)
        self.ica = None

        # RP
        self.rp = None
        self.rpDims = range(1, 21)

        # TSVD
        self.tsvd = None
        self.tsvdDims = range(1, 10)

    def read_data_voice(self, dataName):
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=self.csv_delimiter)
            self.data = list(reader)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.data)))
        print('Number of attributes: {}'.format(len(self.data[0]) - 1))

    def read_data_haptX(self, dataName):
        self.data = None
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=',')
            self.data = list(reader)

        print(len(self.data))
        for elim in self.data:
            feature = []
            for i in elim:
                feature.append(i)
            self.allFeatures.append(feature)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.allFeatures)))
        print('Number of attributes: {}'.format(len(self.allFeatures[0])))

    def read_data_haptY(self, dataName):
        self.data = None
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=',')
            self.data = list(reader)
        for elim in self.data:
            self.allTarget.append(elim)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.allTarget)))
        print('Number of attributes: {}'.format(len(self.allTarget[0])))

        self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32)
        self.allTarget = np.asarray(self.allTarget, dtype=np.float32)
        self.allTarget = self.allTarget.ravel()

    def split_data_to_train_test(self, testSize=0.3):
        # in case the data set are very different in format
        sample_len = len(self.data[0])
        for elem in self.data:
            feature = elem[0:sample_len - 1]
            feature_vector = []
            for f in feature:
                feature_vector.append(float(f))
            self.allFeatures.append(feature_vector)
            if elem[-1] == '0':
                val = 0
            else:
                val = 1
            self.allTarget.append((float(val)))
        self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32)
        self.allTarget = np.asarray(self.allTarget, dtype=np.float32)
        self.XTrain, self.XTest, self.YTrain, self.YTest = train_test_split(
            self.allFeatures,
            self.allTarget,
            test_size=testSize,
            random_state=42)
        print(
            'Total X train data -> {}%'.format(
                int((len(self.XTrain) / len(self.data)) * 100)), 'Size:',
            len(self.XTrain))
        print(
            'Total X test data -> {}%'.format(
                int((len(self.XTest) / len(self.data)) * 100)), 'Size:',
            len(self.XTest))
        print(
            'Total Y train data -> {}%'.format(
                int((len(self.YTrain) / len(self.data)) * 100)), 'Size:',
            len(self.YTrain))
        print(
            'Total Y test data -> {}%'.format(
                int((len(self.YTest) / len(self.data)) * 100)), 'Size',
            len(self.YTest))

    def get_max_idx(self, input):
        maxVal = input[0]
        maxIdx = 0
        for i in range(1, len(input)):
            if input[i] > maxVal:
                maxIdx = i
                maxVal = input[i]
        return maxIdx

    def pairwiseDistCorr(self, X1, X2):
        assert X1.shape[0] == X2.shape[0]

        d1 = pairwise_distances(X1)
        d2 = pairwise_distances(X2)
        return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]

    def k_mean_cluster(self):
        print("-" * 50)
        print('{}: K-mean clustering'.format(self.dataSetName))

        dataX = StandardScaler().fit_transform(self.allFeatures)
        scores = []
        confusionMatrix = []
        self.kmean = KMeans(random_state=5, max_iter=1000)
        for i in self.kNum:
            self.kmean.set_params(n_clusters=i)
            self.kmean.fit(dataX)
            scores.append(sm.accuracy_score(self.allTarget,
                                            self.kmean.labels_))
            confusionMatrix.append(
                sm.confusion_matrix(self.allTarget, self.kmean.labels_))
        bestScoreIdx = self.get_max_idx(scores)
        print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx]))
        print("Confusion Matrix:", confusionMatrix[bestScoreIdx])

        plt.figure()
        plt.ylabel('Accuracy')
        plt.xlabel('# of Clusters')
        plt.title('K-mean Cluster ({})'.format(self.dataSetName))

        plt.style.context('seaborn-whitegrid')
        plt.xticks(self.kNum)
        plt.plot(self.kNum, scores)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_KMEAN.png'.format(self.dataSetName))
        print("-" * 50)

    def k_mean_cluster_reduced(self, n_clusters, reduced_data, name):
        print("-" * 50)
        print('{}: K-mean clustering {}'.format(self.dataSetName, name))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.kmeanRD = KMeans(n_clusters=n_clusters,
                              random_state=5,
                              max_iter=1000)
        self.kmeanRD.fit(reduced_data)

        print("Accuracy score:{0:.2f}".format(
            sm.accuracy_score(self.allTarget, self.kmeanRD.labels_)))
        print("Confusion Matrix:")
        print(sm.confusion_matrix(self.allTarget, self.kmeanRD.labels_))

        print("-" * 50)

    def expectation_maximization_reduced(self, n_components, reduced_data,
                                         name):
        print("-" * 50)
        print('{}: Expectation maximization {}'.format(self.dataSetName, name))

        self.emRD = GaussianMixture(n_components=n_components, random_state=5)
        self.emRD.fit(reduced_data)
        y_predict = self.emRD.predict(reduced_data)

        print("Accuracy score:{0:.2f}".format(
            sm.accuracy_score(self.allTarget, y_predict)))
        print("Confusion Matrix:")
        print(sm.confusion_matrix(self.allTarget, y_predict))
        print("-" * 50)

    def expectation_maximization(self):
        print("-" * 50)
        print('{}: Expectation maximization'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        scores = []
        confusionMatrix = []
        self.em = GaussianMixture(random_state=5)
        for i in self.kNum:
            self.em.set_params(n_components=i)
            self.em.fit(dataX)
            y_predict = self.em.predict(dataX)
            scores.append(sm.accuracy_score(self.allTarget, y_predict))
            confusionMatrix.append(
                sm.confusion_matrix(self.allTarget, y_predict))

        bestScoreIdx = self.get_max_idx(scores)
        print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx]))
        print("Confusion Matrix:")
        print(confusionMatrix[bestScoreIdx])

        plt.figure()
        plt.ylabel('Accuracy')
        plt.xlabel('# of Clusters')
        plt.title('Expectation Maximization Cluster ({})'.format(self.dataSetName))

        plt.style.context('seaborn-whitegrid')
        plt.xticks(self.kNum)
        plt.plot(self.kNum, scores)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_EM.png'.format(self.dataSetName))
        print("-" * 50)

    def PCA(self):
        print("-" * 50)
        print('{}: Principal component analysis '.format(self.dataSetName))

        dataX = StandardScaler().fit_transform(self.allFeatures)

        self.pca = PCA(random_state=5)
        grid = {'pca__n_components': self.pcaDims}
        mlp = MLPClassifier(max_iter=2000,
                            alpha=1e-5,
                            early_stopping=False,
                            random_state=5,
                            hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('pca', self.pca), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)

        print("Best number PCA components:", search.best_params_)

        self.pca.fit(dataX)
        var = np.cumsum(
            np.round(self.pca.explained_variance_ratio_, decimals=3) * 100)

        plt.figure()
        plt.ylabel('% Variance Explained')
        plt.xlabel('# of Features')
        plt.title('PCA Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.pcaDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(var)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_PCA_VA.png'.format(self.dataSetName))

        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('PCA Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.pcaDims)
        plt.ylim([0, 1])
        plt.style.context('seaborn-whitegrid')
        plt.plot(self.pcaDims, search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_PCA_GS.png'.format(self.dataSetName))

        print("-" * 50)

    def ICA(self):
        print("-" * 50)
        print('{}: Independent component analysis '.format(self.dataSetName))

        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.ica = FastICA(random_state=5, max_iter=6000)
        # kurtosis
        kurt = []
        for dim in self.icaDims:
            self.ica.set_params(n_components=dim)
            tmp = self.ica.fit_transform(dataX)
            tmp = pd.DataFrame(tmp)
            tmp = tmp.kurt(axis=0)
            kurt.append(tmp.abs().mean())

        # grid search
        grid = {'ica__n_components': self.icaDims}
        mlp = MLPClassifier(max_iter=2000,
                            alpha=1e-5,
                            early_stopping=False,
                            random_state=5,
                            hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('ica', self.ica), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number ICA components:", search.best_params_)

        plt.figure()
        plt.ylabel('Kurtosis')
        plt.xlabel('# of Features')
        plt.title('ICA Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.icaDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(kurt)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_kurtosis.png'.format(self.dataSetName))

        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('ICA Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.icaDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(self.icaDims, search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_ICA_GS.png'.format(self.dataSetName))
        print("-" * 50)

    def RP(self):
        print("-" * 50)
        print('{}: Random Projection'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        disCorr = []
        self.rp = SparseRandomProjection(random_state=5)
        for dim in self.rpDims:
            self.rp.set_params(n_components=dim)
            disCorr.append(
                self.pairwiseDistCorr(self.rp.fit_transform(dataX), dataX))
        print(disCorr)

        # grid search
        grid = {'rp__n_components': self.rpDims}
        mlp = MLPClassifier(max_iter=2000,
                            alpha=1e-5,
                            early_stopping=False,
                            random_state=5,
                            hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('rp', self.rp), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number RP components:", search.best_params_)

        plt.figure()
        plt.ylabel('Distance')
        plt.xlabel('# of Features')
        plt.title('RP Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.rpDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(disCorr)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_distance.png'.format(self.dataSetName))

        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('RP Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.rpDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_RP_GS.png'.format(self.dataSetName))
        print("-" * 50)

    def TSVD(self):
        print("-" * 50)
        print('{}: TruncatedSVD'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.tsvd = TruncatedSVD(random_state=5)

        # grid search
        grid = {'tsvd__n_components': self.tsvdDims}
        mlp = MLPClassifier(max_iter=2000,
                            alpha=1e-5,
                            early_stopping=False,
                            random_state=5,
                            hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('tsvd', self.tsvd), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number TSVD components:", search.best_params_)

        self.tsvd.fit(dataX)
        var = np.cumsum(
            np.round(self.tsvd.explained_variance_ratio_, decimals=3) * 100)

        plt.figure()
        plt.ylabel('% Variance Explained')
        plt.xlabel('# of Features')
        plt.title('TSVD Analysis ({})'.format(self.dataSetName))
        plt.xticks(self.tsvdDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(var)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_TSD_VA.png'.format(self.dataSetName))

        plt.figure()
        plt.ylabel('Score')
        plt.xlabel('# of Features')
        plt.title('TSVD Analysis Grid Search ({})'.format(self.dataSetName))
        plt.xticks(self.tsvdDims)
        plt.style.context('seaborn-whitegrid')
        plt.plot(search.cv_results_['mean_test_score'])
        plt.grid()
        plt.draw()
        plt.savefig('./{}_TSVD_GS.png'.format(self.dataSetName))
        print("-" * 50)
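# Note: accuracy_score above compares raw cluster ids with class labels, which is only
# meaningful once each cluster id has been mapped to a class.  A small sketch of such a
# mapping (an assumption about the intent, not part of the original class):
from collections import Counter

import numpy as np
from sklearn import metrics


def cluster_accuracy(y_true, cluster_labels):
    """Map every cluster id to its most common true label, then compute accuracy."""
    y_true = np.asarray(y_true)
    cluster_labels = np.asarray(cluster_labels)
    mapped = np.empty_like(y_true)
    for c in np.unique(cluster_labels):
        mask = cluster_labels == c
        mapped[mask] = Counter(y_true[mask].tolist()).most_common(1)[0][0]
    return metrics.accuracy_score(y_true, mapped)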
Exemplo n.º 32
0
class KMeans(ModelBase):
    """
    KMeans:

    Fits a Sklearn KMeans model to X.


    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html


    Attributes
    ----------
    n_clusters_ : int
                  The number of clusters, K

    cluster_inertia_ : float
                       Sum of squared distances of samples to their closest cluster center

    cluster_labels_ : array, [n_samples]
                      Labels indicating the membership of each point

    cluster_centers_ : array, [n_clusters, n_features]
                       Coordinates of cluster centers

    sample_labels_ : array, [n_samples]
                     Labels for each of the samples in X

    sample_distances_ : OrderedDict
                        For each cluster label, the member sample labels and their
                        distances to the cluster center, sorted in ascending order


    Constants
    ---------
    SAMPLE_CUTOFF_ : int
                     If n_samples > SAMPLE_CUTOFF_ then sample distances
                     are NOT recorded
    """

    SAMPLE_CUTOFF_ = 1000

    def __init__(self):
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    @property
    def cluster_inertia_(self):
        # Sum of squared distances of samples to their closest cluster center
        return None if self.model_ is None else \
            self.model_.inertia_

    @property
    def cluster_labels_(self):
        # Cluster membership labels for each point
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.labels_)

    @property
    def cluster_centers_(self):
        # Coordinates of the cluster centers
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.cluster_centers_)

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    def fit(self, X, K, sample_labels=None, estimator_params=None):
        """Fits a Sklearn KMeans model to X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        K : int
            The number of clusters.

        sample_labels : array-like, shape (n_samples), optional
                        Labels for each of the samples in X.

        estimator_params : dict, optional
                           The parameters to pass to the KMeans estimators.


        Returns
        -------
        self
        """
        self._reset()
        # Note: previously set n_init=50
        self.model_ = SklearnKMeans(K)
        if estimator_params is not None:
            assert isinstance(estimator_params, dict)
            self.model_.set_params(**estimator_params)

        # Compute Kmeans model
        self.model_.fit(X)
        if sample_labels is None:
            sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
        assert len(sample_labels) == X.shape[0]
        self.sample_labels_ = np.array(sample_labels)
        self.n_clusters_ = K

        # Record sample label/distance from its cluster center
        self.sample_distances_ = OrderedDict()
        for cluster_label in range(self.n_clusters_):
            assert cluster_label not in self.sample_distances_
            member_rows = X[self.cluster_labels_ == cluster_label, :]
            member_labels = self.sample_labels_[self.cluster_labels_ == cluster_label]
            centroid = np.expand_dims(self.cluster_centers_[cluster_label], axis=0)

            # "All clusters must have at least 1 member!"
            if member_rows.shape[0] == 0:
                return None

            # Calculate distance between each member row and the current cluster
            dists = np.empty(member_rows.shape[0])
            dist_labels = []
            for j, (row, label) in enumerate(zip(member_rows, member_labels)):
                dists[j] = cdist(np.expand_dims(row, axis=0), centroid, "euclidean").squeeze()
                dist_labels.append(label)

            # Sort the distances/labels in ascending order
            sort_order = np.argsort(dists)
            dists = dists[sort_order]
            dist_labels = np.array(dist_labels)[sort_order]
            self.sample_distances_[cluster_label] = {
                "sample_labels": dist_labels,
                "distances": dists,
            }
        return self

    def get_closest_samples(self):
        """Returns a list of the labels of the samples that are located closest
           to their cluster's center.


        Returns
        ----------
        closest_samples : list
                  A list of the sample labels that are located the closest to
                  their cluster's center.
        """
        if self.sample_distances_ is None:
            raise Exception("No model has been fit yet!")

        return [samples['sample_labels'][0] for samples in list(self.sample_distances_.values())]

    def get_memberships(self):
        '''
        Return the memberships in each cluster
        '''
        memberships = OrderedDict()
        for cluster_label, samples in list(self.sample_distances_.items()):
            memberships[cluster_label] = OrderedDict(
                [(l, d) for l, d in zip(samples["sample_labels"], samples["distances"])])
        return json.dumps(memberships, indent=4)
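# The per-sample distance bookkeeping above can also be derived directly from
# scikit-learn, since KMeans.transform returns the distance to every center.
# A brief sketch with illustrative data:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
km = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X)

all_dists = km.transform(X)                                 # shape (n_samples, n_clusters)
own_dists = all_dists[np.arange(X.shape[0]), km.labels_]    # distance to own center

# Index of the sample closest to each center (cf. get_closest_samples above).
closest = [np.flatnonzero(km.labels_ == c)[np.argmin(own_dists[km.labels_ == c])]
           for c in range(km.n_clusters)]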
Exemplo n.º 33
0
File: cw1.py Project: arlyon/dmml
def k_clustering(ctx, sweep_features, sweep_variance, sweep_clusters):
    """
    K-means clustering function to be run on dataset.
    Includes simple analysis of results.
    """
    save_plot = ctx.obj["save_plot"]
    show_plot = ctx.obj["show_plot"]

    print("loading data...")
    features, boolean_labels, labels = load_data(ctx.obj["data_folder"],
                                                 shuffle_seed=ctx.obj["seed"])
    n_samples, n_features = features.shape
    features_with_labels = features.copy()
    features_with_labels[n_features] = labels

    # Save seed for consistent runs for data analysis
    seed = numpy.random.get_state()

    # Run k-clustering excluding the class attribute
    model = KMeans(n_clusters=10)
    print("running k-means clustering on all features except class...")
    numpy.random.set_state(seed)
    base_predictions = model.fit_predict(features)
    score_clustering(labels, base_predictions, print_score=True)

    # Run k-clustering including the class attribute
    print("running k-means clustering on all features including class...")
    numpy.random.set_state(seed)
    score_clustering(labels,
                     model.fit_predict(features_with_labels),
                     print_score=True)

    # Perform Analytical sweeps of features, variance and clusters
    best_feature_n = None
    if sweep_features:
        best_feature_n = feature_sweep(features,
                                       boolean_labels,
                                       labels,
                                       seed,
                                       save_plot,
                                       show_plot,
                                       n_features=20)

    if sweep_variance:
        variance_sweep(features, labels, seed, save_plot, show_plot, step=500)

    best_cluster_n = None
    if sweep_clusters:
        best_cluster_n = cluster_sweep(features,
                                       labels,
                                       seed,
                                       save_plot,
                                       show_plot,
                                       n_clusters=50,
                                       step=1)

    # Plotting the contingency matrix for the base prediction
    matrix = metrics.cluster.contingency_matrix(column_or_1d(labels),
                                                base_predictions)
    plt.imshow(matrix, cmap="hot")
    plt.title("Base Prediction mapping centroids against class labels")
    plt.xlabel("Cluster Centroid Label")
    plt.ylabel("Actual Label")
    if save_plot is not None:
        path = os.path.join(save_plot, "base_prediction_matrix.png")
        plt.savefig(path)
        print("")
        print("saved figure to " + path)
    if show_plot:
        plt.show()
    plt.clf()

    # Running k-clustering with results from sweep analysis
    print("Running k-means clustering with optimal settings")
    model.set_params(n_clusters=28)
    selector = SelectKBest(k=122)
    numpy.random.set_state(seed)
    optimal_predictions = model.fit_predict(
        selector.fit_transform(features, column_or_1d(labels)))
    score_clustering(labels, optimal_predictions, print_score=True)

    # Plotting contingency matrix from optimal predictions
    matrix = metrics.cluster.contingency_matrix(column_or_1d(labels),
                                                optimal_predictions)
    plt.imshow(matrix, cmap="hot")
    plt.title("Optimal Prediction mapping centroids against class labels")
    plt.xlabel("Cluster Centroid Label")
    plt.ylabel("Actual Label")
    if save_plot is not None:
        path = os.path.join(save_plot, "optimal_prediction_matrix.png")
        plt.savefig(path)
        print("")
        print("saved figure to " + path)
    if show_plot:
        plt.show()
    plt.clf()

    # Print out optimal results from sweep analysis
    if best_feature_n:
        print(f"Ideal number of k-best features is {best_feature_n}.")
    if best_cluster_n:
        print(f"Ideal number of clusters is {best_cluster_n}.")

    print("Analysis Completed.")
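# A self-contained sketch of the SelectKBest + KMeans combination used for the
# "optimal settings" run above; it uses the breast-cancer data, so the values
# differ from the 28 clusters / 122 features found by the original sweep.
from sklearn.cluster import KMeans
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import v_measure_score

X, y = load_breast_cancer(return_X_y=True)
selector = SelectKBest(k=10)                      # keep the 10 best of 30 features
model = KMeans(n_clusters=2, n_init=10, random_state=0)

X_best = selector.fit_transform(X, y)
print(v_measure_score(y, model.fit_predict(X_best)))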
Exemplo n.º 34
0
def plot_loss_vs_cluster_number(X: numpy.ndarray,
                                k_min: int,
                                k_max: int,
                                distance_function: Callable[
                                    [numpy.ndarray, numpy.ndarray], float],
                                *,
                                algorithm_parameters: Dict[str, Any] = None,
                                ax: Optional[axes.Axes] = None,
                                **kwargs) -> axes.Axes:
    """
    k-means requires you to decide the number of clusters ``k`` beforehand. This method runs the KMeans algorithm,
    increasing the cluster number at each try. The total magnitude (the sum of point-to-centroid distances) is used as the loss.

    Right now the method only works with ``sklearn.cluster.KMeans``.

    :param X: Training instances.
    :param k_min: The minimum cluster number.
    :param k_max: The maximum cluster number.
    :param distance_function: The function used to calculate the distance between an instance to its cluster center.
            The function receives two ndarrays, one the instance and the second is the center and return a float number
            representing the distance between them.
    :param algorithm_parameters: parameters to use for the algorithm. If None, default parameters of ``KMeans`` will
            be used.
    :param ax: Axes object to draw the plot onto, otherwise uses the current Axes.
    :param kwargs: other keyword arguments

                   All other keyword arguments are passed to ``pandas.DataFrame.plot()``.
    :return: Returns the Axes object with the plot drawn onto it.
    """
    if algorithm_parameters is None:
        algorithm_parameters = KMeans().get_params()

    if "n_clusters" in algorithm_parameters:
        del algorithm_parameters["n_clusters"]

    if ax is None:
        pyplot.figure()
        ax = pyplot.gca()

    result = []

    for k in range(k_min, k_max + 1):
        estimator = KMeans(n_clusters=k)
        estimator.set_params(**algorithm_parameters)
        estimator.fit(X)
        magnitude = pandas.DataFrame(
            _extract_magnitude(X, estimator.labels_,
                               estimator.cluster_centers_, distance_function))
        result.append({"k": k, "magnitude": magnitude["distance"].sum()})

    pandas.DataFrame(result).plot("k",
                                  "magnitude",
                                  kind="scatter",
                                  ax=ax,
                                  **kwargs)
    pyplot.xticks(range(max(0, k_min - 1), k_max + 2), rotation=0)
    ax.set_xlabel("Number of clusters")
    ax.set_ylabel("Total Point-to-Centroid Distance")
    ax.set_title("Loss vs Clusters Used")

    return ax
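# A possible call of the function above (a sketch: it assumes the helper
# _extract_magnitude and the module-level imports used above are available, and
# uses scipy's euclidean distance as the distance_function).
from scipy.spatial.distance import euclidean
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=500, centers=6, random_state=0)
ax = plot_loss_vs_cluster_number(X_demo, k_min=2, k_max=12,
                                 distance_function=euclidean)
ax.figure.savefig("loss_vs_k.png")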
Exemplo n.º 35
0
class CodeBook(BaseEstimator, ClusterMixin, TransformerMixin):
    """Code Book creation and manipulation for Bag-of-(visual)-Features.

    Parameters
    ----------

    n_words : int, optional, default: 36
        The number of clusters to form as well as the number of
        words (centroids) to generate.

    cluster_core : sklearn.cluster, default: KMeans
        Clustering technique used to quantisize the feature space to
        generate the code book.
        #TODO: its default should be described by _default_clustering()

    max_iter : int, default: 300
        Maximum number of iterations of the k-means algorithm for a
        single run.

    n_init : int, default: 10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    init : {'k-means++', 'random' or an ndarray}
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.


    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).

        'auto' : do not precompute distances if n_samples * n_words > 12
        million. This corresponds to about 100MB overhead per job using
        double precision.

        True : always precompute distances

        False : never precompute distances

    tol : float, default: 1e-4
        Relative tolerance with regards to inertia to declare convergence

    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    verbose : int, default 0
        Verbosity mode.

    copy_x : boolean, default True
        When pre-computing distances it is more numerically accurate to center
        the data first.  If copy_x is True, then the original data is not
        modified.  If False, the original data is modified, and put back before
        the function returns, but small numerical differences may be introduced
        by subtracting and then adding the data mean.

    Attributes
    ----------
    cook_book_ : array, [n_words, n_features]
        Coordinates of cluster centers

    labels_ :
        Labels of each point

    inertia_ : float
        Sum of distances of samples to their closest cluster center.

    Notes
    ------
    The k-means problem is solved using Lloyd's algorithm.

    The average complexity is given by O(k n T), where n is the number of
    samples and T is the number of iteration.

    The worst case complexity is given by O(n^(k+2/p)) with
    n = n_samples, p = n_features. (D. Arthur and S. Vassilvitskii,
    'How slow is the k-means method?' SoCG2006)

    In practice, the k-means algorithm is very fast (one of the fastest
    clustering algorithms available), but it falls in local minima. That's why
    it can be useful to restart it several times.

    See also
    --------

    dictionary_code
    """
    #TODO: test n_words default = 36
    #TODO: do these parameters make sense: max_iter, n_init?
    #TODO: change the cluster_core from cluster_core=None to
    #      cluster_core=_default_cluster(), doing all the appropriate
    #      changes. Check that BaseEstimator asks for strict declaration
    #
    #      def _default_cluster(self, n_words=36,
    #                   init='k-means++', n_init=10, max_iter=300,
    #                   tol=1e-4, precompute_distances='auto',
    #                   verbose=0, random_state=None, copy_x=True, n_jobs=1):
    #          """Default space clustering strategy to determine the code book"""
    #          from sklearn.cluster import KMeans
    #          return KMeans(n_clusters=n_words, ...)
    #
    #       Then self.set_params can also be used to set up the parameters for the
    #       current classification methodology

    def __init__(self, n_words=36, cluster_core=None, init='k-means++',
                 n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True, n_jobs=1):

        if hasattr(init, '__array__'):
            n_words = init.shape[0]
            init = np.asarray(init, dtype=np.float64)

        self.n_words = n_words
        self.cluster_core_name = cluster_core
        self.init = init
        self.max_iter = max_iter
        self.tol = tol
        self.precompute_distances = precompute_distances
        self.n_init = n_init
        self.verbose = verbose
        self.random_state = random_state
        self.copy_x = copy_x
        self.n_jobs = n_jobs

        if self.cluster_core_name == 'random-words':
            self.n_init = 1
            self.max_iter = 1
            print('The number of iterations and tries has been fixed to 1.')

        if (self.cluster_core_name is None or
                self.cluster_core_name == 'random-words'):
            from sklearn.cluster import KMeans
            self.cluster_core = KMeans(n_clusters=self.n_words, init=self.init,
                                       max_iter=self.max_iter, tol=self.tol,
                                       precompute_distances=self.precompute_distances,
                                       n_init=self.n_init, verbose=self.verbose,
                                       random_state=self.random_state,
                                       copy_x=self.copy_x, n_jobs=self.n_jobs)

    def _check_fit_data(self, X):
        """Verify that the number of samples given is larger than n_words"""
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        if X.shape[0] < self.n_words:
            raise ValueError("n_samples=%d should be >= n_words=%d" % (
                X.shape[0], self.n_words))
        return X

    def _check_test_data(self, X):
        X = check_array(X, accept_sparse='csr')
        n_samples, n_features = X.shape
        expected_n_features = self.cook_book_.shape[1]
        if not n_features == expected_n_features:
            raise ValueError("Incorrect number of features. "
                             "Got %d features, expected %d" % (
                                 n_features, expected_n_features))
        if X.dtype.kind != 'f':
            warnings.warn("Got data type %s, converted to float "
                          "to avoid overflows" % X.dtype,
                          RuntimeWarning, stacklevel=2)
            X = X.astype(np.float64)

        return X

    def fit(self, X, y=None):
        """Compute the clustering of the space.
        #TODO: right now only for KMeans; a dispatcher is
               needed so that other clustering strategies can be called
               interchangeably

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
        """
        self.cluster_core = self.cluster_core.fit(X, y)
        return self

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample.

        Convenience method; equivalent to calling fit(X) followed by
        predict(X).
        """
        #return self.fit(X).labels_
        raise NotImplementedError

    def fit_transform(self, X, y=None):
        """Compute clustering and transform X to cluster-distance space.

        Equivalent to fit(X).transform(X), but more efficiently implemented.
        """
        # Currently, this just skips a copy of the data if it is not in
        # np.array or CSR format already.
        # XXX This skips _check_test_data, which may change the dtype;
        # we should refactor the input validation.
        # 
        # X = self._check_fit_data(X)
        # return self.fit(X)._transform(X)
        raise NotImplementedError

    def transform(self, X, y=None):
        """Transform X to a cluster-distance space.

        In the new space, each dimension is the distance to the cluster
        centers.  Note that even if X is sparse, the array returned by
        `transform` will typically be dense.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to transform.

        Returns
        -------
        X_new : array, shape [n_samples, k]
            X transformed in the new space.
        """
        # check_is_fitted(self, 'cook_book_')

        # X = self._check_test_data(X)
        # return self._transform(X)
        raise NotImplementedError

    def _transform(self, X):
        """guts of transform method; no input validation"""
        # return euclidean_distances(X, self.cook_book_)
        raise NotImplementedError


    def predict(self, X):
        """Predicts the index value of the closest word within the code book.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the closest word within the code book.
        """
        return self.cluster_core.predict(X)

    def get_dictionary(self):
        """Retrieves the words forming the code book

        Returns
        -------
        dictionary : array, shape [n_words, n_features]
            Code book elements (words of the dictionary) represented
            in the feature space
        """
        #TODO: check that the coodebook is fitted
        return self.cluster_core.cluster_centers_

    def get_BoF_descriptor(self, X):

        # norm = lambda x: x.astype(float)/np.linalg.norm(x)
        # return norm(np.bincount(self.predict(X)))
        return np.histogram(self.predict(X),
                            bins=range(self.n_words+1),
                            density=True)

    def get_BoF_pramide_descriptor(self, X):
        """ Split the image (or volume) in a pyramid manner and get
        a descriptor for each level (and part). Concatenate the outputs.
        TODO: build proper documentation

        """
        def split_data_by2(X):
            # TODO: rewrite this in a nice manner that uses len(X.shape)
            # TODO: this can raise an ERROR if the length of X is odd
            halves = np.split(X, 2, axis=0)
            quarters = [q for h in halves for q in np.split(h, 2, axis=1)]
            parts = [p for q in quarters for p in np.split(q, 2, axis=2)]
            return parts

        def get_occurrences(X):
            return np.histogram(X, bins=range(self.n_words+1))

        def build_piramide(X, level=2):
            if level == 0:
                return get_occurrences(X)
            else:
                return [get_occurrences(X)] + [build_piramide(Xpart, level-1)
                       for Xpart in split_data_by2(X)]

        return build_piramide(self.predict(X))

    def get_params(self, deep=True):
        return self.cluster_core.get_params()

    def set_params(self, **params):
        self.cluster_core.set_params(**params)
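# The bag-of-features descriptor above is essentially a normalised histogram of
# predicted word indices; a minimal stand-alone sketch of the same idea with plain
# scikit-learn (illustrative random data, not the class API):
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
train_descriptors = rng.normal(size=(1000, 64))   # e.g. local image features
image_descriptors = rng.normal(size=(120, 64))    # features of a single image

n_words = 36
codebook = KMeans(n_clusters=n_words, n_init=10, random_state=0).fit(train_descriptors)

words = codebook.predict(image_descriptors)
bof, _ = np.histogram(words, bins=range(n_words + 1), density=True)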
Exemplo n.º 36
0
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """
        random_state = check_random_state(self.random_state)

        # Compute the number of cluster needed
        if self.ratio == 'auto':
            num_samples = self.stats_c_[self.min_c_]
        else:
            num_samples = int(self.stats_c_[self.min_c_] / self.ratio)

        # Create the clustering object
        kmeans = KMeans(n_clusters=num_samples, random_state=random_state)
        kmeans.set_params(**self.kwargs)

        # Start with the minority class
        X_min = X[y == self.min_c_]
        y_min = y[y == self.min_c_]

        # All the minority class samples will be preserved
        X_resampled = X_min.copy()
        y_resampled = y_min.copy()

        # Loop over the other classes and under-sample each of them
        for key in self.stats_c_.keys():

            # Skip the minority class; its samples are already kept.
            if key == self.min_c_:
                continue

            # Find the centroids via k-means
            kmeans.fit(X[y == key])
            centroids = kmeans.cluster_centers_

            # Concatenate to the minority class
            X_resampled = np.concatenate((X_resampled, centroids), axis=0)
            y_resampled = np.concatenate((y_resampled, np.array([key] *
                                                                num_samples)),
                                         axis=0)

        self.logger.info('Under-sampling performed: %s', Counter(
            y_resampled))

        return X_resampled, y_resampled
Exemplo n.º 37
0
class ClusterCentroids(BaseUnderSampler):
    """Perform under-sampling by generating centroids based on
    clustering methods.

    Method that under-samples the majority class by replacing a
    cluster of majority samples with the cluster centroid of a KMeans
    algorithm.  This algorithm keeps N majority samples by fitting the
    KMeans algorithm with N clusters to the majority class and using
    the coordinates of the N cluster centroids as the new majority
    samples.

    Read more in the :ref:`User Guide <cluster_centroids>`.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    estimator : object, optional(default=KMeans())
        Pass a :class:`sklearn.cluster.KMeans` estimator.

    voting : str, optional (default='auto')
        Voting strategy to generate the new samples:

        - If ``'hard'``, the nearest-neighbors of the centroids found using the
          clustering algorithm will be used.
        - If ``'soft'``, the centroids found by the clustering algorithm will
          be used.
        - If ``'auto'``, if the input is sparse, it will default on ``'hard'``
          otherwise, ``'soft'`` will be used.

        .. versionadded:: 0.3.0

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    ratio : str, dict, or callable
        .. deprecated:: 0.4
           Use the parameter ``sampling_strategy`` instead. It will be removed
           in 0.6.

    Notes
    -----
    Supports multi-class resampling by sampling each class independently.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
ClusterCentroids # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> cc = ClusterCentroids(random_state=42)
    >>> X_res, y_res = cc.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    ... # doctest: +ELLIPSIS
    Resampled dataset shape Counter({{...}})

    """

    def __init__(self,
                 sampling_strategy='auto',
                 random_state=None,
                 estimator=None,
                 voting='auto',
                 n_jobs=1,
                 ratio=None):
        super(ClusterCentroids, self).__init__(
            sampling_strategy=sampling_strategy, ratio=ratio)
        self.random_state = random_state
        self.estimator = estimator
        self.voting = voting
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Private function to create the KMeans estimator"""
        if self.estimator is None:
            self.estimator_ = KMeans(
                random_state=self.random_state, n_jobs=self.n_jobs)
        elif isinstance(self.estimator, KMeans):
            self.estimator_ = clone(self.estimator)
        else:
            raise ValueError('`estimator` has to be a KMeans clustering.'
                             ' Got {} instead.'.format(type(self.estimator)))

    def _generate_sample(self, X, y, centroids, target_class):
        if self.voting_ == 'hard':
            nearest_neighbors = NearestNeighbors(n_neighbors=1)
            nearest_neighbors.fit(X, y)
            indices = nearest_neighbors.kneighbors(
                centroids, return_distance=False)
            X_new = safe_indexing(X, np.squeeze(indices))
        else:
            if sparse.issparse(X):
                X_new = sparse.csr_matrix(centroids, dtype=X.dtype)
            else:
                X_new = centroids
        y_new = np.array([target_class] * centroids.shape[0], dtype=y.dtype)

        return X_new, y_new

    def _fit_resample(self, X, y):
        self._validate_estimator()

        if self.voting == 'auto':
            if sparse.issparse(X):
                self.voting_ = 'hard'
            else:
                self.voting_ = 'soft'
        else:
            if self.voting in VOTING_KIND:
                self.voting_ = self.voting
            else:
                raise ValueError("'voting' needs to be one of {}. Got {}"
                                 " instead.".format(VOTING_KIND, self.voting))

        X_resampled, y_resampled = [], []
        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                n_samples = self.sampling_strategy_[target_class]
                self.estimator_.set_params(**{'n_clusters': n_samples})
                self.estimator_.fit(X[y == target_class])
                X_new, y_new = self._generate_sample(
                    X, y, self.estimator_.cluster_centers_, target_class)
                X_resampled.append(X_new)
                y_resampled.append(y_new)
            else:
                target_class_indices = np.flatnonzero(y == target_class)
                X_resampled.append(safe_indexing(X, target_class_indices))
                y_resampled.append(safe_indexing(y, target_class_indices))

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, np.array(y_resampled, dtype=y.dtype)
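# The estimator and voting parameters documented above can be combined, e.g. to
# speed up the clustering step; a short sketch assuming imbalanced-learn 0.4+
# (where fit_resample is available):
from collections import Counter

from sklearn.cluster import KMeans
from sklearn.datasets import make_classification
from imblearn.under_sampling import ClusterCentroids

X, y = make_classification(n_classes=2, weights=[0.1, 0.9], n_samples=1000,
                           random_state=10)
cc = ClusterCentroids(estimator=KMeans(n_init=1, random_state=0),
                      voting='hard', random_state=42)
X_res, y_res = cc.fit_resample(X, y)
print(Counter(y_res))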
Exemplo n.º 38
0
def Mahalanobis_stratified_CV(data, y, pca_var, nfolds):

    import numpy as np
    y = np.array(y)

    #Get numerical and categorical column indices

    def IsNumeric(data):

        data_types = data.dtypes

        index = 0

        for i in data_types:

            if (i == "int64" or i == "float64"):
                data_types[index] = True

            else:
                data_types[index] = False

            index += 1

        return data_types

    #Obtain a centered / scaled matrix with numerical values
    #n factor levels -> (0, ... , n-1)

    numerical_data = np.empty(shape=data.shape)

    data_type_index = IsNumeric(data)

    for i in range(0, data.shape[1]):

        if data_type_index[i]:

            numerical_data[:, i] = data.iloc[:, i]

        else:

            levels = data.iloc[:, i].unique()

            dummy_value = 0

            for j in levels:

                index = np.where(data.iloc[:, i] == j)

                numerical_data[index, i] = dummy_value

                dummy_value += 1

    numerical_data = (numerical_data - np.mean(
        numerical_data, axis=0)) / np.std(numerical_data, axis=0)

    #PCA decomposition

    from sklearn.decomposition import PCA

    pca = PCA(n_components=pca_var, svd_solver='full')

    numerical_data_PCA = pca.fit_transform(numerical_data)

    numerical_data_PCA = (numerical_data_PCA - np.mean(
        numerical_data_PCA, axis=0)) / np.std(numerical_data_PCA, axis=0)

    from sklearn.cluster import KMeans
    from multiprocessing.dummy import Pool
    import functools
    import multiprocessing

    #Chooses the best number of clusters based on the best silhouette score
    #minimum group size of cluster >= nfolds
    #best silhouette score

    kmeans = KMeans()

    clusters = kmeans.fit_predict(numerical_data)

    ncore = max(multiprocessing.cpu_count() - 2, 1)

    def min_cluster_size(n_clusters, kmeans, X):

        from sklearn.cluster import KMeans
        from sklearn.metrics import silhouette_score

        kmeans.set_params(n_clusters=n_clusters, n_jobs=ncore, n_init=30)
        clusters = kmeans.fit_predict(X)
        score = silhouette_score(X, clusters)

        ids = np.unique(clusters)

        min_size = np.empty(shape=len(ids))

        count = 0

        for i in ids:
            min_size[count] = np.shape(np.where(clusters == i))[1]
            count += 1

        min_size = min_size.astype(int)

        return [score, np.min(min_size)]

    results = np.empty(shape=(nfolds, 2))

    for i in range(2, nfolds + 2):
        results[(i - 2), :] = min_cluster_size(i, kmeans, numerical_data_PCA)

    threshold = int(len(y) / nfolds)

    accepted_n_clusters = np.where(results[:, 1] >= threshold)[0]

    if len(accepted_n_clusters) == 0:

        accepted_n_clusters = np.array(list(range(2, nfolds + 2)))

    best_accepted_n_clusters = int(accepted_n_clusters[np.argsort(
        -results[accepted_n_clusters, 0])[0]]) + 2

    kmeans.set_params(n_clusters=best_accepted_n_clusters, n_jobs=ncore)
    clusters = kmeans.fit_predict(numerical_data_PCA)

    #Quantile-based k-fold stratification

    def stratified_sample(y, nfolds, index):

        y = np.array(y[index])

        n = max(int(len(y) / nfolds), 1)

        q = np.linspace(0, 1, num=(n + 1))
        q = np.quantile(y, q)
        q[0] = q[0] - 1
        q[len(q) - 1] = q[len(q) - 1] + 1

        out = [[] for i in range(nfolds)]

        for i in range(0, n):

            index_temp = np.where((y >= q[i]) & (y < q[i + 1]))[0]
            index_temp = index[0][index_temp]
            np.random.shuffle(index_temp)
            folds = np.array_split(index_temp, nfolds)

            for j in range(0, nfolds):

                out[j].append(folds[j])

        for i in range(0, nfolds):

            out[i] = np.concatenate(out[i])

        return out

    #Fuse two list of indices together

    def fuse(f1, f2):

        f3 = []

        for i in range(0, len(f1)):

            f3.append(np.concatenate([f1[i], f2[i]]))

        return np.array(f3)

    #Initialize the fold list, then fill it sequentially by looping through clusters

    folds = np.array([[] for i in range(0, nfolds)])

    for i in range(0, best_accepted_n_clusters):

        index = np.where(clusters == i)
        folds = fuse(folds, stratified_sample(y, nfolds, index))
        np.random.shuffle(folds)

    for i in range(0, nfolds):
        folds[i] = folds[i].astype(int)

    y_vecs = []

    for i in folds:
        y_vecs.append(y[i])

    metrics = np.empty(shape=(len(y_vecs), 4))

    count = 0

    from scipy.stats import kurtosis
    from scipy.stats import skew

    for i in y_vecs:

        metrics[count, 0] = np.mean(np.array(i))
        metrics[count, 1] = np.std(np.array(i))
        metrics[count, 2] = skew(np.array(i))
        metrics[count, 3] = kurtosis(np.array(i))

        count += 1

    metrics_final = np.empty(shape=4)

    for i in range(0, 4):

        metrics_final[i] = np.std(metrics[:, i]) / np.mean(metrics[:, i])

    print("Coefficient of variation (MEAN): " +
          str(round(metrics_final[0], 5)))
    print("Coefficient of variation (SD): " + str(round(metrics_final[1], 5)))
    print("Coefficient of variation (SKEW): " +
          str(round(metrics_final[2], 5)))
    print("Coefficient of variation (KURT): " +
          str(round(metrics_final[3], 5)))

    return folds
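# A possible call of the function above (a sketch with made-up data; the function
# expects a pandas DataFrame mixing numerical and categorical columns, a numeric
# target, and an older scikit-learn where KMeans still accepts n_jobs):
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({
    "x1": rng.normal(size=200),
    "x2": rng.normal(size=200),
    "group": rng.choice(["a", "b", "c"], size=200),
})
y = rng.normal(size=200)

folds = Mahalanobis_stratified_CV(df, y, pca_var=0.95, nfolds=5)
for i, idx in enumerate(folds):
    print("fold", i, "size", len(idx))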
Exemplo n.º 39
0
class DataCluster():

    def __init__(self, nCluster, minDist, nQuatCluster, minQuatDist):
        print 'Init DataCluster.'
        self.set_params(nCluster, minDist, nQuatCluster, minQuatDist)

    def set_params(self, nCluster, minDist, nQuatCluster, minQuatDist):
        self.nCluster = nCluster
        self.fMinDist = minDist

        self.nQuatCluster = nQuatCluster
        self.fMinQuatDist = minQuatDist
        
        self.ml = KMeans(n_clusters=nCluster, max_iter=300, n_jobs=6)
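        # A single KMeans estimator is reused throughout; its configuration (e.g. n_clusters)
        # is updated later via set_params rather than constructing a new estimator each time.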
        
    def readData(self):
        print 'Read data manually.'
        data_start=0
        data_finish=1000 #'end'
        model = 'bed'
        subject='sub6_shaver'
        print 'Starting to convert data!'
        self.runData = dr.DataReader(subject=subject,data_start=data_start,data_finish=data_finish,model=model)      
        #dr_obs = dr.DataReader(subject=subject,data_start=data_start,data_finish=data_finish,model=model)        
        #self.runData = dr_obs.get_raw_data(self)

        
    def mat_to_pos_quat(self, raw_data):

        raw_pos  = np.zeros((len(raw_data),3)) #array
        raw_quat = np.zeros((len(raw_data),4))
        
        #-----------------------------------------------------------#
        ## Decompose data into pos,quat pairs
        for i in xrange(len(raw_data)):  
            raw_pos[i,:]  = np.array([raw_data[i][0,3],raw_data[i][1,3],raw_data[i][2,3]])
            raw_quat[i,:] = tft.quaternion_from_matrix(raw_data[i]) # order should be xyzw because ROS uses xyzw order.       

        return raw_pos, raw_quat

    def pos_clustering(self, raw_pos):
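        # Fit KMeans repeatedly, dropping n_clusters by one whenever two centroids end up
        # closer than fMinDist, until all pairwise centroid distances are acceptable.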

        while True:
            dict_params={}
            dict_params['n_clusters']=self.nCluster
            self.ml.set_params(**dict_params)
            self.ml.fit(raw_pos)

            # co-distance matrix
            bReFit = False
            co_pos_mat = np.zeros((self.nCluster,self.nCluster))
            for i in xrange(self.nCluster):

                # For refitting
                if bReFit == True: break
                
                for j in xrange(i, self.nCluster):
                    if i==j: 
                        co_pos_mat[i,j] = 1000000 # to avoid minimum check
                        continue
                    co_pos_mat[i,j] = co_pos_mat[j,i] = np.linalg.norm(self.ml.cluster_centers_[i] - self.ml.cluster_centers_[j])
                                        
                    if co_pos_mat[i,j] < self.fMinDist:
                        bReFit = True
                        break
                        
            if bReFit == True:
                self.nCluster -= 1
                print "New # of clusters: ", self.nCluster
                continue
            else:
                break
            
        raw_pos_index = self.ml.fit_predict(raw_pos)
        return raw_pos_index

    # Return a list of cluster indices (one label per sample).
    def grouping(self, raw_data):
        print 'Start clustering.'
        print raw_data.shape

        #-----------------------------------------------------------#
        ## Initialization
        raw_pos, raw_quat = self.mat_to_pos_quat(raw_data)
        
        #-----------------------------------------------------------#
        ## K-means Clustering by Position
        raw_pos_index = self.pos_clustering(raw_pos)
        
        return raw_pos_index
        
    def clustering(self, raw_data):
        print 'Start clustering.'
        print raw_data.shape

        #-----------------------------------------------------------#
        ## Initialization
        raw_pos, raw_quat = self.mat_to_pos_quat(raw_data)

        #-----------------------------------------------------------#
        ## K-means Clustering by Position
        raw_pos_index = self.pos_clustering(raw_pos)

        pos_clustered_group = []
        for i in xrange(self.nCluster):
            raw_group = []
            for j in xrange(len(raw_data)):
                if raw_pos_index[j] == i:
                    if len(raw_group) == 0:
                        raw_group = np.array([np.hstack([raw_pos[j],raw_quat[j]])])
                    else:
                        raw_group = np.vstack([raw_group, np.hstack([raw_pos[j],raw_quat[j]])])

            pos_clustered_group.append(raw_group)

        print "Number of pos groups: ", len(pos_clustered_group)
            
        #-----------------------------------------------------------#
        ## Grouping by orientation
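        # Within each position cluster, quaternions are re-clustered with the custom
        # kmeanssample routine using an angular metric; the cluster count is reduced whenever
        # two centres fall closer than fMinQuatDist, mirroring the position refit loop above.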
        clustered_group = []        
        for group in pos_clustered_group:

            # samples
            X = group[:,3:]            
            ## print "Total X: ", X.shape[0], len(X)

            # Clustering parameters
            nQuatCluster = self.nQuatCluster
            kmsample = nQuatCluster  # 0: random centres, > 0: kmeanssample
            kmdelta = .001
            kmiter = 10
            metric = "quaternion"  # "chebyshev" = max, "cityblock" L1,  Lqmetric

            # the number of clusters must not exceed the number of samples
            if nQuatCluster > len(X):
                nQuatCluster = len(X)
                kmsample = len(X)
                
            # Clustering
            while True:
                centres, xtoc, dist = km.kmeanssample( X, nQuatCluster, nsample=kmsample,
                                                    delta=kmdelta, maxiter=kmiter, metric=metric, verbose=0 )                          
        
                # co-distance matrix
                bReFit = False
                co_pos_mat = np.zeros((nQuatCluster,nQuatCluster))
                for i in xrange(nQuatCluster):

                    # For refitting
                    if bReFit == True: break
                    for j in xrange(i, nQuatCluster):
                        if i==j: 
                            co_pos_mat[i,j] = 1000000 # to avoid minimum check
                            continue
                        co_pos_mat[i,j] = co_pos_mat[j,i] = ut.quat_angle(centres[i],centres[j])                                         
                        if co_pos_mat[i,j] < self.fMinQuatDist:
                            bReFit = True
                            break

                if bReFit == True:
                    nQuatCluster -= 1
                    ## print "New # of clusters ", nQuatCluster, " in a sub group "
                    continue
                else:
                    break

            for i in xrange(nQuatCluster):
                raw_group = []
                for j in xrange(len(group)):
                    if xtoc[j] == i:
                        if len(raw_group) == 0:
                            raw_group = np.array([group[j,:]])
                        else:
                            raw_group = np.vstack([raw_group, group[j,:]])
                clustered_group.append(raw_group)

        print "Number of pos+quat groups: ", len(clustered_group)

        #-----------------------------------------------------------#
        ## Averaging
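        # Each non-empty group is collapsed to a single representative pose: positions are
        # averaged arithmetically and quaternions are averaged with qt.quat_avg.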
        avg_clustered_data = []
        num_clustered_data = []
        count = 0
        for i,g in enumerate(clustered_group):
            if len(g)==0: continue

            count += len(g)
            ## print "Number of sub samples: ", len(g)

            # Position
            pos_sum = np.array([0.,0.,0.])
            for j,s in enumerate(g):
                pos_sum += s[0:3]

                if j==0:
                    quat_array = np.array([s[3:]])
                else:
                    quat_array = np.vstack([quat_array, s[3:]])
            pos_avg = pos_sum/float(len(g))

            # Quaternion
            quat_avg = qt.quat_avg( quat_array )         
            avg_clustered_data.append([pos_avg, quat_avg])
            num_clustered_data.append([len(g)])
                
        ## print "total: ", count
                  
        # Reshape the (position, quaternion) pairs into 4x4 transformation matrices
        for i, g in enumerate(avg_clustered_data):            

            mat = tft.quaternion_matrix(g[1])
            mat[0,3] = g[0][0]
            mat[1,3] = g[0][1]
            mat[2,3] = g[0][2]

            if i==0:
                clustered_data = np.array([mat])
            else:
                clustered_data = np.vstack([clustered_data,  np.array([mat])])    

        print "Final clustered data: ", clustered_data.shape, len(num_clustered_data)
        return clustered_data, num_clustered_data, len(pos_clustered_group)
             

    # X is a set of quaternions
    def q_image_axis_angle(self, X):

        print "Number of data: ", X.shape[0]
        
        angle_array = np.zeros((X.shape[0],1))
        direc_array = np.zeros((X.shape[0],3))
        
        for i in xrange(len(X)):
            angle, direc = qt.quat_to_angle_and_axis(X[i,:])
            angle_array[i,0] = angle
            direc_array[i,:] = direc

        # Convert angles from radians to degrees
        angle_array = angle_array / np.pi * 180.0

        # matplot setup            
        fig = plt.figure(figsize=(12,12))
        plt.rc('text', usetex=True)
        plt.rc('font', family='serif')

        ax = fig.add_subplot(111, projection='3d')
        
        # Plot a sphere
        r = 0.999
        u = np.linspace(0, 2 * np.pi, 120)
        v = np.linspace(0, np.pi, 60)
        x = np.outer(np.cos(u), np.sin(v))
        y = np.outer(np.sin(u), np.sin(v))
        z = np.outer(np.ones(np.size(u)), np.cos(v))
        ax.plot_surface(x*r, y*r, z*r,  rstride=1, cstride=1, color='c', alpha = 0.4, linewidth = 0)

        # Plot quaternions
        cmap = plt.cm.hsv        
        sc = ax.scatter(direc_array[:,0],direc_array[:,1],direc_array[:,2],c=angle_array,cmap=cmap,vmin=-180.0, vmax=180.0,s=100) #edgecolor='none'
        cbar = plt.colorbar(sc, ticks=np.arange(-180,180+30,30))
        
        ax.set_aspect("equal")
        ax.set_xlim([-1.0,1.0])
        ax.set_ylim([-1.0,1.0])
        ax.set_zlim([-1.0,1.0])
               
        font_dict={'fontsize': 30, 'family': 'serif'}        
        ax.set_xlabel('x', fontdict=font_dict)
        ax.set_ylabel('y', fontdict=font_dict)
        ax.set_zlabel('z', fontdict=font_dict)
        ax.view_init(20,80)
               
        plt.ion()    
        plt.show()
        #ax.mouse_init()
        ut.get_keystroke('Hit a key to proceed next')
                    
        return

    # X is a set of quaternions
    # Y is a set of labels
    def q_image_axis_cluster(self, X, Y):

        print "Number of data: ", X.shape[0]
        
        angle_array = np.zeros((X.shape[0],1))
        direc_array = np.zeros((X.shape[0],3))
        
        for i in xrange(len(X)):
            angle, direc = qt.quat_to_angle_and_axis(X[i,:])
            angle_array[i,0] = angle
            direc_array[i,:] = direc

        # Convert angles from radians to degrees
        angle_array = angle_array / np.pi * 180.0
            
        # Normalize labels         
        max_label = float(np.max(Y))
        fY = np.zeros((len(Y),1))
        if max_label != 0:
            for i in xrange(len(Y)):
                fY[i] = float(Y[i])/max_label
            
        # matplot setup 
        fig = plt.figure(figsize=(24,12))
        plt.rc('text', usetex=True)
        plt.rc('font', family='serif')

        #-------------- matplot 1 --------------
        ax = fig.add_subplot(121, projection='3d')
        font_dict={'fontsize': 45, 'family': 'serif'}            
        ax.set_title("QuTEM distribution", fontdict=font_dict)
        
        # Plot a sphere
        r = 1.0
        u = np.linspace(0, 2 * np.pi, 120)
        v = np.linspace(0, np.pi, 60)
        x = np.outer(np.cos(u), np.sin(v))
        y = np.outer(np.sin(u), np.sin(v))
        z = np.outer(np.ones(np.size(u)), np.cos(v))
        ax.plot_surface(x*r, y*r, z*r,  rstride=1, cstride=1, color='c', alpha = 0.4, linewidth = 0)

        # Plot quaternions
        cmap = plt.cm.hsv
        sc = ax.scatter(direc_array[:,0],direc_array[:,1],direc_array[:,2],c=angle_array,cmap=cmap,vmin=-180.0, vmax=180.0,s=100) #edgecolor='none'
        cbar = plt.colorbar(sc, ticks=np.arange(-180,180+30,30))
        ## cbar.set_clim(-180.0, 180.0)
        
        ax.set_aspect("equal")
        ax.set_xlim([-1.0,1.0])
        ax.set_ylim([-1.0,1.0])
        ax.set_zlim([-1.0,1.0])

        font_dict={'fontsize': 30, 'family': 'serif'}        
        ax.set_xlabel('x', fontdict=font_dict)
        ax.set_ylabel('y', fontdict=font_dict)
        ax.set_zlabel('z', fontdict=font_dict)
        ax.view_init(20,40)

        #-------------- matplot 2 --------------
        ax = fig.add_subplot(122, projection='3d')
        font_dict={'fontsize': 45, 'family': 'serif'}            
        ax.set_title("Clustering", fontdict=font_dict)
        
        # Plot a sphere
        r = 0.92
        u = np.linspace(0, 2 * np.pi, 120)
        v = np.linspace(0, np.pi, 60)
        x = np.outer(np.cos(u), np.sin(v))
        y = np.outer(np.sin(u), np.sin(v))
        z = np.outer(np.ones(np.size(u)), np.cos(v))
        ## ax.plot_surface(x*r, y*r, z*r, rstride=1, cstride=1, color='c', alpha = 1.0, linewidth = 0)

        # Plot quaternions
        cmap = plt.cm.jet
        sc = ax.scatter(direc_array[:,0],direc_array[:,1],direc_array[:,2],c=Y,cmap=cmap,vmin=0,vmax=abs(Y).max(),s=100)
        ## plt.colorbar(sc)
        
        ax.set_aspect("equal")
        ax.set_xlim([-1.0,1.0])
        ax.set_ylim([-1.0,1.0])
        ax.set_zlim([-1.0,1.0])
        
        font_dict={'fontsize': 30, 'family': 'serif'}        
        ax.set_xlabel('x', fontdict=font_dict)
        ax.set_ylabel('y', fontdict=font_dict)
        ax.set_zlabel('z', fontdict=font_dict)
        ax.view_init(20,40)
               
        plt.ion()    
        plt.show()
        #ax.mouse_init()
        ut.get_keystroke('Hit a key to proceed next')
                    
        return
    

    def test(self, raw_data):
        print 'Start clustering.'
        print raw_data.shape

        N = 1000
        
        #-----------------------------------------------------------#
        ## Initialization
        raw_pos  = np.zeros((N,3)) #array
        raw_quat = np.zeros((N,4))
        
        #-----------------------------------------------------------#
        ## Decompose data into pos,quat pairs
        for i in xrange(N):            
            raw_pos[i,:]  = np.array([0,0,0])

        ## raw_quat = qt.quat_random( N )
        
        quat_mean = np.array([1.,0.,0.,1.5]);
        raw_quat = qt.quat_QuTem( quat_mean/np.linalg.norm(quat_mean), N, [0.03,0.3,0.3,1.0] )

        ## quat_mean = np.array([0.,1.,0.,-1.5]);
        ## raw_quat2 = qt.quat_QuTem( quat_mean/np.linalg.norm(quat_mean), N/2.0, [0.1,1.0,0.1,1.0] )
        ## raw_quat = np.vstack([raw_quat1,raw_quat2])

        ## raw_quat1 = np.array([[1.,  0.,  0.,  0.],
        ##                       [1.,  0.1, 0.,  0.],
        ##                       [1.,  0.,  0.1, 0.],
        ##                       [1.,  0.,  0.,  0.1],
        ##                       [1.,  0.2, 0.,  0.],
        ##                       [1.,  0.,  0.2, 0.],
        ##                       [1.,  0.,  0.,  0.2],
        ##                       [1.1, 0.1, 0.,  0.],
        ##                       [1.1, 0.,  0.1, 0.],
        ##                       [1.1, 0.,  0.,  0.1]])
        ## raw_quat2 = np.array([[0.,  0.,  1.,  0.],
        ##                       [0.1, 0.,  1.1, 0.],
        ##                       [0.1, 0.1, 1.,  0.],
        ##                       [0.,  1.,  0.,  0.],
        ##                       [0.1, 1.,  0.1, 0.],
        ##                       [0.1, 1.1, 0.,  0.],
        ##                       [0.1, 1.,  0.4, 0.],
        ##                       [0.1, 1.,  1.1, 0.2],
        ##                       [0.1, 1.,  1.4, 0.],
        ##                       [1.1, 1.,  0.1, 0.2],
        ##                       [1.1, 1.1,  0.1, 0.1]
        ##                       ])

        ## raw_quat = np.vstack([raw_quat1,raw_quat2])
        
        for i in xrange(len(raw_quat)):
            raw_quat[i,:] /= np.linalg.norm(raw_quat[i,:])
            
        #-----------------------------------------------------------#
        pos_clustered_group = []
        raw_group = np.hstack([raw_pos,raw_quat])
        pos_clustered_group.append(raw_group)

        print "Number of pos groups: ", len(pos_clustered_group)
            
        #-----------------------------------------------------------#
        ## Grouping by orientation
        clustered_group = []        
        for group in pos_clustered_group:

            # samples
            X = group[:,3:]            

            # Clustering parameters
            nQuatCluster = self.nQuatCluster
            kmsample = nQuatCluster  # 0: random centres, > 0: kmeanssample
            kmdelta = .001
            kmiter = 10
            metric = "quaternion"  # "chebyshev" = max, "cityblock" L1,  Lqmetric

            # the number of clusters must not exceed the number of samples
            if nQuatCluster > len(X):
                nQuatCluster = len(X)
                kmsample = len(X)
                
            # Clustering
            while True:
                centres, xtoc, dist = km.kmeanssample( X, nQuatCluster, nsample=kmsample,
                                                       delta=kmdelta, maxiter=kmiter, metric=metric, verbose=0 )                                          
                # co-distance matrix
                bReFit = False
                co_pos_mat = np.zeros((nQuatCluster,nQuatCluster))
                for i in xrange(nQuatCluster):

                    # For refitting
                    if bReFit == True: break
                    for j in xrange(i, nQuatCluster):
                        if i==j: 
                            co_pos_mat[i,j] = 1000000 # to avoid minimum check
                            continue
                        co_pos_mat[i,j] = co_pos_mat[j,i] = ut.quat_angle(centres[i],centres[j])                                         
                        if co_pos_mat[i,j] < self.fMinQuatDist:
                            bReFit = True
                            break

                if bReFit == True:
                    nQuatCluster -= 1
                    ## print "New # of clusters ", nQuatCluster, " in a sub group "
                    continue
                else:
                    break

            ## for i in xrange(nQuatCluster):
            ##     raw_group = []
            ##     for j in xrange(len(group)):
            ##         if xtoc[j] == i:
            ##             if raw_group == []:
            ##                 raw_group = np.array([group[j,:]])
            ##             else:
            ##                 raw_group = np.vstack([raw_group, group[j,:]])
            ##     clustered_group.append(raw_group)
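            # NOTE: the regrouping step above is left commented out, so clustered_group stays
            # empty in this test; only X and xtoc from the (single) group are visualized below.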

        print "Number of pos+quat groups: ", len(clustered_group)
        
        self.q_image_axis_cluster(X, xtoc)
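
    # A minimal usage sketch (assumptions: the dr, tft, qt, km and ut helper modules imported
    # by this file are available, and raw_data is a list/array of 4x4 homogeneous transforms;
    # the parameter values below are illustrative only):
    #
    #     dc = DataCluster(nCluster=20, minDist=0.05, nQuatCluster=8, minQuatDist=0.3)
    #     clustered_data, num_clustered_data, n_pos_groups = dc.clustering(raw_data)
    #     print clustered_data.shape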