Example #1
def accuracy(matrix, bin_cluster, bin_index, ground_truth):
    idx_list = []

    # loop over the 8 clusters and record the size of each one
    for i in range(8):
        idx_list.append(len(bin_cluster[i]))

    # find the index of the largest cluster
    max_idx = idx_list.index(max(idx_list))
    feature_m = []
    for i in bin_cluster[max_idx]:
        feature_m.append(matrix[i])
    bin_index_1 = bin_index[max_idx]

    # run k-means on the features of the largest cluster
    kmeans = KMeans(n_clusters = 60, random_state=0).fit(feature_m)
    label_1 = list(kmeans.labels_)

    # build the density clusters and indices from the k-means labels
    density_cluster, density_index = k_means.kmeans_model(ground_truth, label_1, 60, bin_index_1)

    #     final_bin1 = bin_index[0]
    #     final_bin2 = bin_index[1]
    #     final_bin3 = bin_index[2]
    #     final_bin4 = bin_index[3]
    #     final_bin5 = bin_index[4]
    #     final_bin6 = bin_index[5]

    # initialize every label (one per row of the matrix) to 0
    final_result = len(matrix) * [0]

    # bin indices that hold all the possible labels
    bin1 = density_index[0]
    bin2 = bin_index[1] + density_index[1]
    bin3 = bin_index[2] + density_index[2]
    bin4 = bin_index[3] + density_index[3]
    bin5 = bin_index[4] + density_index[4]
    bin6 = bin_index[5] + density_index[5]

    # create label lists (all 1s, 2s, ..., 6s), one entry per element of each bin
    bin1_l = len(bin1) * [1]
    bin2_l = len(bin2) * [2]
    bin3_l = len(bin3) * [3]
    bin4_l = len(bin4) * [4]
    bin5_l = len(bin5) * [5]
    bin6_l = len(bin6) * [6]

    # print(len(final_bin1))
    # print(len(final_bin2))
    # print(len(final_bin3))
    # print(len(final_bin4))
    # print(len(final_bin5))
    # print(len(final_bin6))

    # replace labels in final result by each of the bins
    for i in bin1:
        final_result[i] = bin1_l.pop()
    for i in bin2:
        final_result[i] = bin2_l.pop()
    for i in bin3:
        final_result[i] = bin3_l.pop()
    for i in bin4:
        final_result[i] = bin4_l.pop()
    for i in bin5:
        final_result[i] = bin5_l.pop()
    for i in bin6:
        final_result[i] = bin6_l.pop()

    sse = kmeans.inertia_
    print("SSE value :", sse)

    return final_result
    def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                                 dt.Frame, List[dt.Frame],
                                                 np.ndarray, List[np.ndarray],
                                                 pd.DataFrame, List[pd.DataFrame]]:
        if X is None:
            return []

        # check the datatype of user-defined columns
        if not isinstance(include_columns, list):
            raise ValueError("Variable: 'include_columns' should be <list>")
        if not isinstance(ignore_columns, list):
            raise ValueError("Variable: 'ignore_columns' should be <list>")
        if not isinstance(num_clusters, int):
            raise ValueError("Variable: 'num_clusters' should be <int>")

        ## validate user-inputs and override the columns given by user
        features = list(X.names)
        if len(include_columns) > 0:
            for _ in include_columns:
                if _ not in list(X.names):
                    raise ValueError("Column: '" + str(_) + "' is not present in the dataset")
            features = include_columns

        ## list to ignore specific columns given by user
        features = [_f for _f in features if _f not in ignore_columns]

        ## handle columns with missing values 
        ignore_ = []
        X_df = X.to_pandas()
        for col in features:
            # label encode categorical columns
            # refer - https://github.com/h2oai/driverlessai-recipes/pull/68#discussion_r365133392

            if X_df[col].dtype == "object":
                X_df[f"{col}_enc"] = LabelEncoder().fit_transform(X_df[col].to_numpy())
                ignore_.append(col)
                continue  # the raw categorical column is dropped, so skip missing-value handling for it

            miss_percent = X_df[col].isna().sum() / X_df.shape[0]
            if miss_percent >= 0.3:  # ignore columns having more than 30% missing values
                ignore_.append(col)
            elif miss_percent > 0.0:  # impute by mean for other columns with missing values
                X_df[col] = X_df[col].fillna(X_df[col].mean())

        features = [f for f in features if f not in ignore_]
        features += [_f for _f in X_df.columns if "_enc" in _f]
        if len(features) == 0:
            raise ValueError("Unable to cluster: No useful features available")

        X_clust = X_df[features].values

        # Apply min max scaling
        X_clust = MinMaxScaler().fit_transform(X_clust)

        # Go through possible numbers of clusters
        best_score = None
        best_n_clust = None
        best_clust_ids = None

        ## if the number of clusters is pre-defined by the user, don't search for the optimal value
        if num_clusters > 1:
            model = KMeans(n_clusters=num_clusters, n_jobs=NUM_JOBS).fit(X_clust)
            clust_ids = model.predict(X_clust)
            score = get_score(
                X_clust,
                clust_ids
            )
            best_score = score
            best_n_clust = num_clusters
            best_clust_ids = clust_ids

        else:
            for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS, CLUSTER_STEP_SIZE):
                model = KMeans(n_clusters=n_clusters, n_jobs=NUM_JOBS).fit(X_clust)
                clust_ids = model.predict(X_clust)

                X1 = X.to_pandas()
                score = get_score(
                    list(X1[target]),
                    clust_ids
                )
                improve = False
                if best_score is None:
                    improve = True
                elif best_score > score:
                    improve = True

                if improve:
                    best_score = score
                    best_n_clust = n_clusters
                    best_clust_ids = clust_ids

        if best_score is None:
            return []
        else:
            X[:, f'cluster_ids_{best_n_clust}'] = dt.Frame(best_clust_ids)
        return X
Example #3
class TestModelTypeChecking(object):
    """
    Test model type checking utilities
    """

    ##////////////////////////////////////////////////////////////////////
    ## is_estimator testing
    ##////////////////////////////////////////////////////////////////////

    def test_estimator_alias(self):
        """
        Assert isestimator aliases is_estimator
        """
        assert isestimator is is_estimator

    @pytest.mark.parametrize("model", ESTIMATORS, ids=obj_name)
    def test_is_estimator(self, model):
        """
        Test that is_estimator works for instances and classes
        """
        assert inspect.isclass(model)
        assert is_estimator(model)

        obj = model()
        assert is_estimator(obj)

    @pytest.mark.parametrize("cls", [
        list, dict, tuple, set, str, bool, int, float
    ], ids=obj_name)
    def test_not_is_estimator(self, cls):
        """
        Assert Python objects are not estimators
        """
        assert inspect.isclass(cls)
        assert not is_estimator(cls)

        obj = cls()
        assert not is_estimator(obj)

    def test_is_estimator_pipeline(self):
        """
        Test that is_estimator works for pipelines
        """
        assert is_estimator(Pipeline)
        assert is_estimator(FeatureUnion)

        model = Pipeline([
            ('reduce_dim', PCA()),
            ('linreg', LinearRegression())
        ])

        assert is_estimator(model)

    def test_is_estimator_search(self):
        """
        Test that is_estimator works for search
        """
        assert is_estimator(GridSearchCV)
        assert is_estimator(RandomizedSearchCV)

        model = GridSearchCV(SVR(), {'kernel': ['linear', 'rbf']})
        assert is_estimator(model)

    @pytest.mark.parametrize("viz,params", [
        (Visualizer, {}),
        (ScoreVisualizer, {'model': LinearRegression()}),
        (ModelVisualizer, {'model': LogisticRegression()})
    ], ids=lambda i: obj_name(i[0]))
    def test_is_estimator_visualizer(self, viz, params):
        """
        Test that is_estimator works for Visualizers
        """
        assert inspect.isclass(viz)
        assert is_estimator(viz)

        obj = viz(**params)
        assert is_estimator(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_regressor testing
    ##////////////////////////////////////////////////////////////////////

    def test_regressor_alias(self):
        """
        Assert isregressor aliases is_regressor
        """
        assert isregressor is is_regressor

    @pytest.mark.parametrize("model", REGRESSORS, ids=obj_name)
    def test_is_regressor(self, model):
        """
        Test that is_regressor works for instances and classes
        """
        assert inspect.isclass(model)
        assert is_regressor(model)

        obj = model()
        assert is_regressor(obj)

    @pytest.mark.parametrize("model",
        CLASSIFIERS+CLUSTERERS+TRANSFORMERS+DECOMPOSITIONS,
    ids=obj_name)
    def test_not_is_regressor(self, model):
        """
        Test that is_regressor does not match non-regressor estimators
        """
        assert inspect.isclass(model)
        assert not is_regressor(model)

        obj = model()
        assert not is_regressor(obj)

    def test_is_regressor_pipeline(self):
        """
        Test that is_regressor works for pipelines
        """
        assert not is_regressor(Pipeline)
        assert not is_regressor(FeatureUnion)

        model = Pipeline([
            ('reduce_dim', PCA()),
            ('linreg', LinearRegression())
        ])

        assert is_regressor(model)

    @pytest.mark.xfail(reason="grid search has no _estimator_type it seems")
    def test_is_regressor_search(self):
        """
        Test that is_regressor works for search
        """
        assert is_regressor(GridSearchCV)
        assert is_regressor(RandomizedSearchCV)

        model = GridSearchCV(SVR(), {'kernel': ['linear', 'rbf']})
        assert is_regressor(model)

    @pytest.mark.parametrize("viz,params", [
        (ScoreVisualizer, {'model': LinearRegression()}),
        (ModelVisualizer, {'model': Ridge()})
    ], ids=lambda i: obj_name(i[0]))
    def test_is_regressor_visualizer(self, viz, params):
        """
        Test that is_regressor works on visualizers
        """
        assert inspect.isclass(viz)
        assert not is_regressor(viz)

        obj = viz(**params)
        assert is_regressor(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_classifier testing
    ##////////////////////////////////////////////////////////////////////

    def test_classifier_alias(self):
        """
        Assert isclassifier aliases is_classifier
        """
        assert isclassifier is is_classifier

    @pytest.mark.parametrize("model", CLASSIFIERS, ids=obj_name)
    def test_is_classifier(self, model):
        """
        Test that is_classifier works for instances and classes
        """
        assert inspect.isclass(model)
        assert is_classifier(model)

        obj = model()
        assert is_classifier(obj)

    @pytest.mark.parametrize("model",
        REGRESSORS+CLUSTERERS+TRANSFORMERS+DECOMPOSITIONS,
    ids=obj_name)
    def test_not_is_classifier(self, model):
        """
        Test that is_classifier does not match non-classifier estimators
        """
        assert inspect.isclass(model)
        assert not is_classifier(model)

        obj = model()
        assert not is_classifier(obj)

    def test_classifier_pipeline(self):
        """
        Test that is_classifier works for pipelines
        """
        assert not is_classifier(Pipeline)
        assert not is_classifier(FeatureUnion)

        model = Pipeline([
            ('reduce_dim', PCA()),
            ('linreg', LogisticRegression())
        ])

        assert is_classifier(model)

    @pytest.mark.xfail(reason="grid search has no _estimator_type it seems")
    def test_is_classifier_search(self):
        """
        Test that is_classifier works for search
        """
        assert is_classifier(GridSearchCV)
        assert is_classifier(RandomizedSearchCV)

        model = GridSearchCV(SVC(), {'kernel': ['linear', 'rbf']})
        assert is_classifier(model)

    @pytest.mark.parametrize("viz,params", [
        (ScoreVisualizer, {'model': MultinomialNB()}),
        (ModelVisualizer, {'model': MLPClassifier()})
    ], ids=lambda i: obj_name(i[0]))
    def test_is_classifier_visualizer(self, viz, params):
        """
        Test that is_classifier works on visualizers
        """
        assert inspect.isclass(viz)
        assert not is_classifier(viz)

        obj = viz(**params)
        assert is_classifier(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_clusterer testing
    ##////////////////////////////////////////////////////////////////////

    def test_clusterer_alias(self):
        """
        Assert isclusterer aliases is_clusterer
        """
        assert isclusterer is is_clusterer

    @pytest.mark.parametrize("model", CLUSTERERS, ids=obj_name)
    def test_is_clusterer(self, model):
        """
        Test that is_clusterer works for instances and classes
        """
        assert inspect.isclass(model)
        assert is_clusterer(model)

        obj = model()
        assert is_clusterer(obj)

    @pytest.mark.parametrize("model",
        REGRESSORS+CLASSIFIERS+TRANSFORMERS+DECOMPOSITIONS,
    ids=obj_name)
    def test_not_is_clusterer(self, model):
        """
        Test that is_clusterer does not match non-clusterer estimators
        """
        assert inspect.isclass(model)
        assert not is_clusterer(model)

        obj = model()
        assert not is_clusterer(obj)

    def test_clusterer_pipeline(self):
        """
        Test that is_clusterer works for pipelines
        """
        assert not is_clusterer(Pipeline)
        assert not is_clusterer(FeatureUnion)

        model = Pipeline([
            ('reduce_dim', PCA()),
            ('kmeans', KMeans())
        ])

        assert is_clusterer(model)

    @pytest.mark.parametrize("viz,params", [
        (ModelVisualizer, {'model': KMeans()})
    ], ids=lambda i: obj_name(i[0]))
    def test_is_clusterer_visualizer(self, viz, params):
        """
        Test that is_clusterer works on visualizers
        """
        assert inspect.isclass(viz)
        assert not is_clusterer(viz)

        obj = viz(**params)
        assert is_clusterer(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_gridsearch testing
    ##////////////////////////////////////////////////////////////////////

    def test_gridsearch_alias(self):
        """
        Assert isgridsearch aliases is_gridsearch
        """
        assert isgridsearch is is_gridsearch

    @pytest.mark.parametrize("model", SEARCH, ids=obj_name)
    def test_is_gridsearch(self, model):
        """
        Test that is_gridsearch works correctly
        """
        assert inspect.isclass(model)
        assert is_gridsearch(model)

        obj = model(SVC, {"C": [0.5, 1, 10]})
        assert is_gridsearch(obj)

    @pytest.mark.parametrize("model",
        [MLPRegressor, MLPClassifier, Imputer], ids=obj_name)
    def test_not_is_gridsearch(self, model):
        """
        Test that is_gridsearch does not match non grid searches
        """
        assert inspect.isclass(model)
        assert not is_gridsearch(model)

        obj = model()
        assert not is_gridsearch(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_probabilistic testing
    ##////////////////////////////////////////////////////////////////////

    def test_probabilistic_alias(self):
        """
        Assert isprobabilistic aliases is_probabilistic
        """
        assert isprobabilistic is is_probabilistic

    @pytest.mark.parametrize("model", [
        MultinomialNB, GaussianNB, LogisticRegression, SVC,
        RandomForestClassifier, GradientBoostingClassifier, MLPClassifier,
    ], ids=obj_name)
    def test_is_probabilistic(self, model):
        """
        Test that is_probabilistic works correctly
        """
        assert inspect.isclass(model)
        assert is_probabilistic(model)

        obj = model()
        assert is_probabilistic(obj)

    @pytest.mark.parametrize("model", [
        MLPRegressor, Imputer, StandardScaler, KMeans,
        RandomForestRegressor,
    ], ids=obj_name)
    def test_not_is_probabilistic(self, model):
        """
        Test that is_probabilistic does not match non probablistic estimators
        """
        assert inspect.isclass(model)
        assert not is_probabilistic(model)

        obj = model()
        assert not is_probabilistic(obj)
Example #4
print(total)
# For total, let the threshold be 10
result = Binarizer(threshold=10)
# transformed feature
print(result.fit_transform(total))

# Assignment 5
features = np.array([[50, 50], [49, 50], [48, 49],
                     [-1.83, 3.52], [-2.76, 5.55], [-7.57, 4.90],
                     [-1.85, 3.51], [-7.587, 3.72], [-17, -15],
                     [-1.78, 3.47], [-1.98, 4.022], [-1.97, 2.34],
                     [-5.25, 3.30], [-2.35, 4.0], [2.42, 5.14],
                     [-1.61, 4.989], [-2.18, 3.33], [-20, -18],
                     [-20, -20], [-21, -19]])
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])
clusterer = KMeans(3, random_state=0)
# fit clusterer
clusterer.fit(features)
# Predict values
dataframe["group"] = clusterer.predict(features)
# View the first 20 observations
result = dataframe.head(20)
print(result)

# plot
a = []
b = []
for i in range(0, 20):
    a.append(features[i, 0])
    b.append(features[i, 1])
Example #5
           data=all_data13, order=1,line_kws={'color': 'blue'},scatter_kws={'color': 'grey'}).set(ylim=(0, 1))

palette=sns.cubehelix_palette(5, start=2, rot=0, dark=0, light=.95, reverse=False)
sns.lmplot(x='oil_price', y='share_price_scaled',hue='year', col='name',ci=None, 
           col_wrap=3, data=all_data13, order=1,palette=palette,size=4).set(ylim=(0, 1))

#==============================================================================
# Unsupervised Learning - Cluster analysis on Shell data
#==============================================================================
from sklearn.cluster import KMeans

shell=pd.DataFrame()
shell=all_data13[all_data13['name']=='RDSB.L']
# We also need to scale the oil price, so the clustering is not influenced by the relative scale of one axis.
shell['oil_price_scaled']=scaler.fit_transform(shell['oil_price'].to_frame())
shell['cluster'] = KMeans(n_clusters=6, random_state=1).fit_predict(shell[['share_price_scaled','oil_price_scaled']])

# The 954 most common RGB monitor colors https://xkcd.com/color/rgb/
colors = ['baby blue', 'amber', 'scarlet', 'grey','milk chocolate', 'windows blue']
palette=sns.xkcd_palette(colors)

sns.lmplot(x='oil_price', y='share_price_scaled',ci=None,palette=palette, hue='cluster',fit_reg=0 ,data=shell)

#==============================================================================
# Supervised learning linear regression
#==============================================================================

from sklearn import linear_model

# 1.- Data preparation
shell15=pd.DataFrame()
Example #6
def skl_clustering(cd, n_clusters=10, **kwargs):
    # cd == ndarray(words*disjuncts)
    clustering = kwa(('agglomerative', 'ward'), 'clustering', **kwargs)
    if type(clustering) is str:
        if clustering == 'kmeans':
            clustering = ('kmeans', 'k-means++', 10)
        elif clustering == 'agglomerative':
            clustering = ('agglomerative', 'ward')
        elif clustering == 'mean_shift':
            clustering = ('mean_shift', 'auto')
        elif clustering == 'group':  # TODO: call ILE clustering?
            print('Call ILE clustering from optimal_clusters?')
        elif clustering == 'random':  # TODO: call random clustering?
            print('Call random clustering from optimal_clusters?')
        else:
            clustering = ('agglomerative', 'ward')

    # linkage: ('ward', 'average', 'complete')
    cluster_criteria = kwa('silhouette', 'cluster_criteria', **kwargs)  # GL.0.6 legacy
    clustering_metric = kwa(('silhouette', 'euclidean'), 'clustering_metric', **kwargs)
    labels = np.asarray([[]])
    metrics = {'clustering': clustering}
    centroids = np.asarray([[]])

    try:  # if True:  #
        if clustering[0] == 'agglomerative':
            linkage = 'ward'
            affinity = 'euclidean'
            connectivity = None
            compute_full_tree = 'auto'
            if clustering[1] in ['average', 'complete', 'single']:
                linkage = clustering[1]
            if len(clustering) > 2:
                if clustering[2] in ['euclidean', 'cosine', 'manhattan']:
                    affinity = clustering[2]
            if len(clustering) > 3:  # connectivity
                print('skl_clustering: connectivity:', clustering[3])
                if type(clustering[3]) is int and clustering[3] > 0:
                    neighbors = clustering[3]
                    # TODO: int / dict 
                    connectivity = kneighbors_graph(cd, neighbors, include_self=False)
                    print(f'\nconnectivity: {connectivity}\n')

            if len(clustering) > 4:  # compute_full_tree
                if type(clustering[4]) is bool:
                    compute_full_tree = clustering[4]
                    print(f'compute_full_tree: {compute_full_tree}\n')

            model = AgglomerativeClustering(
                n_clusters=n_clusters, linkage=linkage, affinity=affinity,
                connectivity=connectivity, compute_full_tree=compute_full_tree)
            model.fit(cd)
            labels = model.labels_

            # TODO: centroids = ...

        elif clustering[0] in ['k-means', 'kmeans']:

            print('skl_clustering ⇒ kmeans')  # FIXME:DEL

            if clustering[1] in ['k-means++']:  # 'random' - fails?
                init = clustering[1]
            else:
                init = 'k-means++'
            if len(clustering) > 2 and type(clustering[2]) is int:
                n_init = clustering[2]
            else:
                n_init = 10
            model = KMeans(init=init, n_clusters=n_clusters, n_init=n_init)
            model.fit(cd)
            labels = model.labels_
            metrics['inertia'] = model.inertia_
            centroids = np.asarray(model.cluster_centers_[:(max(labels) + 1)])

        elif clustering[0] in ['mean shift', 'mean_shift']:

            print('skl_clustering ⇒ mean shift')  # FIXME:DEL

            if len(clustering) < 2:
                bandwidth = None
            elif type(clustering[1]) is int:
                bandwidth = clustering[1]
            else:
                bandwidth = None  # TODO: auto ⇒ estimate_bandwidth
            model = MeanShift(bandwidth=bandwidth)
            model.fit(cd)
            labels = model.labels_
            centroids = np.asarray(model.cluster_centers_[:(max(labels) + 1)])

        else:  # TODO: random clustering?
            model = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters)
            model.fit(cd)
            labels = model.labels_
        # silhouette = metrics.silhouette_score(cd, labels, metric=silhouette_metric)
        try:
            metrics['silhouette_index'] = silhouette_score(cd, labels, metric=clustering_metric[1])
        except Exception:
            metrics['silhouette_index'] = 0.0
        try:
            metrics['variance_ratio'] = calinski_harabaz_score(cd, labels)
        except Exception:
            metrics['variance_ratio'] = 0.0
        # try: metrics['davies_bouldin_score'] = davies_bouldin_score(cd, labels)
        # except: metrics['davies_bouldin_score'] = 0.0

        return labels, metrics, centroids
    except Exception:  # else:  #
        return [], {'clustering': 'skl_clustering error'}, []
Example #7

km = k_means(X, k)
km.calcul()

super_scat_it(X, km.label, dim, km.centroid)


# ### 7.2.2 Exploration of the K-means Algorithm with Scikit Learn
# 
# Once the algorithm has been coded, we are going to make our life easier and simply use the [Scikit Learn library](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) (-_-).  First, let's check that everything is running fine.

# In[ ]:


kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
super_scat_it(X, kmeans.labels_, dim, kmeans.cluster_centers_)


# ### 7.2.3 Choosing the optimal number of clusters
# 
# Until now, we knew the actual number of subpopulations (parameterized by the variable $ k $) associated with the simulated data. With real, non-simulated datasets, however, the data is only rarely labeled. It is therefore important to develop methodologies for choosing an appropriate number of clusters.

# **Questions 7.2**
# 
# 1. Find a simple way to determine the optimal number of clusters.
# 2. Implement it (a possible sketch is given after these questions).
# 3. How many clusters would you choose?

# **Questions 7.3**
# 
Example #8
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1

            
            df[column] = list(map(convert_to_int,df[column]))

    return df

df = handle_non_numeric(df)


X = np.array(df.drop(['duration_ms','time_signature','Dancebility'], axis=1).astype(float))
y = np.array(df['Manual Mood Classification'])
X = preprocessing.scale(X)
clf = KMeans(n_clusters = 3)
clf.fit(X)

correct_count = 0
for i in range(len(X)):
    predict_data = np.array(X[i].astype(float))
    predict_data = predict_data.reshape(-1, len(predict_data))
    prediction = clf.predict(predict_data)
    if prediction == y[i]:
        correct_count += 1
print(df.head())
print(correct_count/len(X))



import argparse
import pickle

from sklearn.cluster import KMeans
from numpy import size

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run KMeans on training set")
    parser.add_argument("--dataset", type=str, default="data/train_keypoints.p",
                        help="number of clusters")
    parser.add_argument("--clusters", type=int, default=500,
                        help="number of clusters")

    args = parser.parse_args()
    dataset = args.dataset
    clusters = args.clusters

    print("Loading dataset")
    train_features = pickle.load(open(dataset, "rb"))
    n_features = len(train_features)

    print("Number of feature points to run clustering on: %d" % n_features)

    # Clustering with KMeans.
    print("Running KMeans clustering")
    kmeans = KMeans(init='k-means++', n_clusters=clusters, n_init=10, n_jobs=2,
        verbose=1)
    kmeans.fit(train_features)

    # Save trained kmeans object to file.
    pickle.dump(kmeans, open("data/cb_%dclusters.p" % clusters, "wb"))
Example #10
    actual_split.remove('')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer()
X = cv.fit_transform(actual_split).toarray()
V = cv.vocabulary_
B = cv.get_feature_names()

from sklearn.decomposition import PCA
pca = PCA(n_components=50)
X = pca.fit_transform(X)

from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow graph')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.savefig('Satisfied Initial Cluster WCSS.png', dpi=500)
plt.show()

opt = 5

kmeans = KMeans(n_clusters=opt, init='k-means++', max_iter=300, n_init=10)
y_kmeans = kmeans.fit_predict(X)

copy_actual['Cluster Level 3'] = list(y_kmeans)
Example #11
def bow_kmeans(X,k):
    from sklearn.cluster import KMeans
    estimator = KMeans(init='k-means++', n_clusters=k, n_init=10, random_state= 0)
    estimator.fit(X)
    return estimator.labels_
Example #12
# print len(question_dict)

cluster_docs = [""] * 5
documents = []
charactersToRemove = ['"', "'"]
# myfile = open('readd.txt', 'r')
for line in question_dict:
    lines = question_dict[line].ques.encode('ascii', 'ignore').decode('ascii')
    lineFile = lines.translate(str.maketrans('', '', ''.join(charactersToRemove)))
    documents.append(lineFile)

# myfile.close()
true_k = 5
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
index = model.predict(X)

k = 0
for q in question_dict:
    for answers in question_dict[q].ans:
        cluster_docs[index[k]] = cluster_docs[index[k]] + answers
    k = k + 1

# print("\n\n\n\nTop terms per cluster:")
# for i in range(true_k):
#     print "Cluster %d:" % i,
Example #13
def cluster(data):
    # Get path of app directory
    path = os.path.abspath(os.path.dirname(__file__)) + '/data/'
    cluster_csv_name = 'cluster_df.csv'
    all_csv_name = 'all_types.csv'

    cluster_csv_path = path + cluster_csv_name
    all_csv_path = path + all_csv_name

    all_df = pd.read_csv(all_csv_path)
    df = pd.read_csv(cluster_csv_path)
    df = df.drop(['Unnamed: 0'], axis=1)

    # Scale features
    scale = StandardScaler()
    scale.fit(df)
    df_std = scale.transform(df)

    kmeans = KMeans(n_clusters=50)
    kmeans = kmeans.fit(df_std)
    labels = kmeans.predict(df_std)

    # Create user input features
    user_input = {
        'bedrooms': int(data['bedroom']),
        'bathrooms': float(data['bathroom']),
        'finished_SqFt': float(data['finished_sq_ft']),
        'total_rooms': float(data['total_room']),
        'Allston': 1 if data['neighborhood'] == 'Allston' else 0,
        'Back Bay': 1 if data['neighborhood'] == 'Back Bay' else 0,
        'Bay Village': 1 if data['neighborhood'] == 'Bay Village' else 0,
        'Beacon Hill': 1 if data['neighborhood'] == 'Beacon Hill' else 0,
        'Brighton': 1 if data['neighborhood'] == 'Brighton' else 0,
        'Charlestown': 1 if data['neighborhood'] == 'Charlestown' else 0,
        'Chinatown': 1 if data['neighborhood'] == 'Chinatown' else 0,
        'Downtown': 1 if data['neighborhood'] == 'Downtown' else 0,
        'Downtown Crossing': 1 if data['neighborhood'] == 'Downtown Crossing' else 0,
        'East Boston': 1 if data['neighborhood'] == 'East Boston' else 0,
        'Fenway': 1 if data['neighborhood'] == 'Fenway' else 0,
        'Hyde Park': 1 if data['neighborhood'] == 'Hyde Park' else 0,
        'Jamaica Plain': 1 if data['neighborhood'] == 'Jamaica Plain' else 0,
        'Kenmore': 1 if data['neighborhood'] == 'Kenmore' else 0,
        'Leather District': 1 if data['neighborhood'] == 'Leather District' else 0,
        'Mattapan': 1 if data['neighborhood'] == 'Mattapan' else 0,
        'Mission Hill': 1 if data['neighborhood'] == 'Mission Hill' else 0,
        'North Dorchester': 1 if data['neighborhood'] == 'North Dorchester' else 0,
        'North End': 1 if data['neighborhood'] == 'North End' else 0,
        'Roslindale': 1 if data['neighborhood'] == 'Roslindale' else 0,
        'Roxbury': 1 if data['neighborhood'] == 'Roxbury' else 0,
        'South Boston': 1 if data['neighborhood'] == 'South Boston' else 0,
        'South Dorchester': 1 if data['neighborhood'] == 'South Dorchester' else 0,
        'South End': 1 if data['neighborhood'] == 'South End' else 0,
        'West End': 1 if data['neighborhood'] == 'West End' else 0,
        'West Roxbury': 1 if data['neighborhood'] == 'West Roxbury' else 0,
        'Winthrop': 1 if data['neighborhood'] == 'Winthrop' else 0}

    user_df = pd.DataFrame(user_input, index=[0])

    # Scale features from user input
    scaled_user_df = scale.transform(user_df)
    # Get cluster for user input
    user_cluster = kmeans.predict(scaled_user_df)
    # Get distance from user input datapoint
    trans = kmeans.transform(df_std)
    # Sort distance
    closest_points = []
    argsor = np.argsort(trans[:, user_cluster[0]])
    for i, argsortidx in enumerate(argsor):
        if i == 3:
            break
        closest_points.append(argsortidx)

    zpids = []
    addresses = []
    prices = []
    sold_dates = []

    # Get details of the 3 homes closest to the user's cluster center
    for i in closest_points:
        zpid = all_df.loc[i, 'zpid']
        zpids.append(zpid)

        add = all_df.loc[i, 'address']
        addresses.append(add)

        price = 'Last sold price: $' + abbrNumber(all_df.loc[i, 'price'])
        prices.append(price)

        sold_date = 'Sold on: {}'.format(all_df.loc[i, 'readable_date_sold'])
        sold_dates.append(sold_date)

    # Get picture by zpid
    pic_urls = []
    home_urls = []

    zillow_id = app.config['ZILLOW_API_KEY']
    url = 'http://www.zillow.com/webservice/GetUpdatedPropertyDetails.htm?'
    tree = ''

    for id in zpids:
        zpid_data = {'zws-id': zillow_id, 'zpid': id}
        query_string = url + urllib.parse.urlencode(zpid_data)

        response = requests.get(query_string)

        msg = response.content
        tree = ET.fromstring(msg)

        code = tree.find('message/code')

        if code.text == '0':
            result = tree.find('response')

            homeInfo = result.find('links/homeInfo')
            images = result.find('images/image')

            home_url = homeInfo.text if homeInfo is not None else None
            pic_url = images[0].text if images is not None else 'http://source.unsplash.com/daily'

            home_urls.append(home_url)
            pic_urls.append(pic_url)
        else:
            home_urls.append(None)
            pic_urls.append('http://source.unsplash.com/daily')

    result = {
        'addresses': addresses,
        'prices': prices,
        'sold_dates': sold_dates,
        'home_urls': home_urls,
        'pic_urls': pic_urls
    }

    return result
Example #14
# Plot the scatter of solutions as (r, theta) points because they are phi symmetric
plt.scatter(Rs, Thetas)
plt.show()

# #####################CLUSTERING#################

num_clusters = 4

# Convert solns to an np array of (r, theta, phi) points
solns_as_nparray = []
for i in solns:
    solns_as_nparray.append((i.r, i.theta, i.phi))

solns_as_nparray = np.array(solns_as_nparray)

est = KMeans(n_clusters=num_clusters)
est.fit(solns_as_nparray)
labels = est.labels_


# Function to plot clusters that kmeans has estimated
def plot_Kmeans_clusters(ax, sample, label, k):
    colors = ['bo', 'ro', 'go', 'mo']
    for i in range(k):
        data = sample[label == i]
        ax.plot(data[:, 0], data[:, 1], colors[i])


# Plot clusters
fig, ax = plt.subplots(figsize=(8, 8))
plot_Kmeans_clusters(ax, solns_as_nparray.astype(float), labels, num_clusters)
feature_2 = "exercised_stock_options"
feature_3 = 'total_payments'
poi  = "poi"
features_list = [poi, feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list )
poi, finance_features = targetFeatureSplit( data )


### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2, _ in finance_features:
    plt.scatter( f1, f2 )
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2)
pred = kmeans.fit_predict(finance_features)



### rename the "name" parameter when you change the number of features
### so that the figure gets saved to a different file
try:
    Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
except NameError:
    print "no predictions object named pred found, no clusters to plot"
def do_k_means(X: np.ndarray, k):
    kmeans = KMeans(n_clusters=k, max_iter=1000, n_init=500).fit(X)
    return kmeans.inertia_, kmeans.labels_
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import KMeans
global labels
data = np.array(final_data_object['no_identifiers_data_list'])
data = preprocessing.MinMaxScaler().fit_transform(data)
model = KMeans(n_clusters=4)
clustering = model.fit(data)
labels = clustering.labels_
labels = labels.tolist()
Example #18
		main_invest_region,
		risk_beta,
		return_on_investment_3month,
		return_on_investment_6month,
		return_on_investment_1year,
		return_on_investment_3year,
		risk_return_level,
		established_scale,
		scale,
		risk_standard_deviation,
		fee
	]
	features.append(tmp_feature);

X = np.array(features);
kmeans = KMeans(n_clusters=20, random_state=0).fit(X);
X_category = kmeans.labels_;
cluster_centers = handle_cluster_center(kmeans.cluster_centers_, X_category);

with open('../data/cluster_centers.json', 'w') as f:
	json.dump(cluster_centers, f, indent=1);

new_fund_datas = [];
for i in range(len(fund_datas)):
	fund_info = fund_datas[i];
	fund_info['id'] = i;
	fund_info['cluster_id'] = int(X_category[i]);
	new_fund_datas.append(fund_info);
with open('../data/new_fund.json', 'w') as f:
	json.dump(new_fund_datas, f, indent=1, ensure_ascii=False);
from model import Preprocess
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt

print('Here we only look at the whole distribution')

from sklearn.manifold import LocallyLinearEmbedding
from model import gen_init_point


data, feature, corpus = Preprocess.preprocess_chinese()


n_topic = 20
lle = LocallyLinearEmbedding(n_components=2)
data_lle = lle.fit_transform(data)
plt.scatter(data_lle[:, 0], data_lle[:, 1])
plt.show()



km = KMeans(n_clusters=n_topic)  # assuming the intended number of clusters is n_topic

Example #20
# Set random seed for reproducibility
np.random.seed(1000)

min_nb_clusters = 2
max_nb_clusters = 20

if __name__ == '__main__':
    # Load the dataset
    digits = load_digits()
    X_train = digits['data'] / np.max(digits['data'])

    # Compute the inertias
    inertias = np.zeros(shape=(max_nb_clusters - min_nb_clusters + 1, ))

    for i in range(min_nb_clusters, max_nb_clusters + 1):
        km = KMeans(n_clusters=i, random_state=1000)
        km.fit(X_train)
        inertias[i - min_nb_clusters] = km.inertia_

    # Plot the inertias
    sns.set()

    fig, ax = plt.subplots(figsize=(12, 7))

    ax.plot(np.arange(2, max_nb_clusters + 1), inertias, "o-")
    ax.set_xlabel("Number of clusters", fontsize=18)
    ax.set_ylabel("Inertia", fontsize=18)
    ax.set_xticks(np.arange(2, max_nb_clusters + 1))
    ax.grid(True)
    plt.show()
Example #21
def build_vocabulary(image_paths, vocab_size):
    """
    This function should sample HOG descriptors from the training images,
    cluster them with kmeans, and then return the cluster centers.

    Inputs:
        image_paths: a Python list of image path strings
         vocab_size: an integer indicating the number of words desired for the
                     bag of words vocab set

    Outputs:
        a vocab_size x (z*z*9) (see below) array which contains the cluster
        centers that result from the K Means clustering.

    You'll need to generate HOG features using the skimage.feature.hog() function.
    The documentation is available here:
    http://scikit-image.org/docs/dev/api/skimage.feature.html#skimage.feature.hog

    However, the documentation is a bit confusing, so we will highlight some
    important arguments to consider:
        cells_per_block: The hog function breaks the image into evenly-sized
            blocks, which are further broken down into cells, each made of
            pixels_per_cell pixels (see below). Setting this parameter tells the
            function how many cells to include in each block. This is a tuple of
            width and height. Your SIFT implementation, which had a total of
            16 cells, was equivalent to setting this argument to (4,4).
        pixels_per_cell: This controls the width and height of each cell
            (in pixels). Like cells_per_block, it is a tuple. In your SIFT
            implementation, each cell was 4 pixels by 4 pixels, so (4,4).
        feature_vector: This argument is a boolean which tells the function
            what shape it should use for the return array. When set to True,
            it returns one long array. We recommend setting it to True and
            reshaping the result rather than working with the default value,
            as it is very confusing.

    It is up to you to choose your cells per block and pixels per cell. Choose
    values that generate reasonably-sized feature vectors and produce good
    classification results. For each cell, HOG produces a histogram (feature
    vector) of length 9. We want one feature vector per block. To do this we
    can append the histograms for each cell together. Let's say you set
    cells_per_block = (z,z). This means that the length of your feature vector
    for the block will be z*z*9.

    With feature_vector=True, hog() will return one long np array containing every
    cell histogram concatenated end to end. We want to break this up into a
    list of (z*z*9) block feature vectors. We can do this using a really nifty numpy
    function. When using np.reshape, you can set the length of one dimension to
    -1, which tells numpy to make this dimension as big as it needs to be to
    accommodate all of the data based on the other dimensions. So if
    we want to break our long np array (long_boi) into rows of z*z*9 feature
    vectors we can use small_bois = long_boi.reshape(-1, z*z*9).

    The number of feature vectors that come from this reshape is dependent on
    the size of the image you give to hog(). It will fit as many blocks as it
    can on the image. You can choose to resize (or crop) each image to a consistent size
    (therefore creating the same number of feature vectors per image), or you
    can find feature vectors in the original sized image.

    ONE MORE THING
    If we returned all the features we found as our vocabulary, we would have an
    absolutely massive vocabulary. That would make matching inefficient AND
    inaccurate! So we use K Means clustering to find a much smaller (vocab_size)
    number of representative points. We recommend using sklearn.cluster.KMeans
    to do this. Note that this can take a VERY LONG TIME to complete (upwards
    of ten minutes for large numbers of features and large max_iter), so set
    the max_iter argument to something low (we used 100) and be patient. You
    may also find success setting the "tol" argument (see documentation for
    details).
    """
    features = None
    for im_path in image_paths:
        im = rgb2grey(imread(im_path))
        cells_per_block = 3
        im_hog = hog(im, cells_per_block=(cells_per_block, cells_per_block))
        im_hog = im_hog.reshape(-1, cells_per_block * cells_per_block * 9)
        if features is None:
            features = im_hog
        else:
            features = np.vstack([features, im_hog])
    clf = KMeans(vocab_size, max_iter=100, tol=1e-3, n_jobs=-1)
    clf.fit(features)

    return clf.cluster_centers_
cust_df.info()

# In[6]:

import numpy as np
from sklearn.preprocessing import StandardScaler
X = cust_df.values[:, 1:]
X = np.nan_to_num(X)
clus_dataset = StandardScaler().fit_transform(X)
clus_dataset

# In[7]:

from sklearn.cluster import KMeans
clusternum = 4
k_means = KMeans(init="k-means++", n_clusters=clusternum, n_init=12)
k_means.fit(clus_dataset)
labels = k_means.labels_
print(labels)

# In[8]:

cust_df['Clus_km'] = labels
cust_df.head(5)

# In[10]:

cust_df.groupby('Clus_km').mean()

# In[14]:
Example #23
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

data = make_blobs(n_samples=500, n_features=2, centers=4, cluster_std=1.8)
print(data)
plt.scatter(data[0][:,0],data[0][:,1], c=data[1])
plt.show()

from sklearn.cluster import KMeans
wcss=[]
for i in range(1, 20):
        kmeans=KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10)
        kmeans.fit(data[0])
        wcss.append(kmeans.inertia_)
plt.plot(range(1,20),wcss )
plt.show()

kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10)
pred_y = kmeans.fit_predict(data[0])
plt.scatter(data[0][:,0], data[0][:,1])
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
plt.show()

from pyclustering.cluster.kmedoids import kmedoids
initial_medoids = [100,200,300,400]
k_medoids = kmedoids(data[0], initial_index_medoids=initial_medoids)  # k is implied by the 4 initial medoids
k_medoids.process()
pred_y = k_medoids.predict(data[0])

from pyclustering.cluster import cluster_visualizer
clusters = k_medoids.get_clusters()  # list of clusters
Example #24
f2 = london['Mean TemperatureC'].values

X = np.array(list(zip(f1, f2)))
#X = np.array(list(zip(f1, f2)))
#X = london.iloc[:,[8,14]].values
#pl.scatter(f1, f2, c='black', s=7)

#pl.figure()
#X = london.iloc[:,[1,7]].values
#pl.scatter(X[:,0],X[:,1], c=cluster.labels_, cmap='rainbow')

# Elbow Method

I = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++')
    kmeans.fit(X)
    I.append(kmeans.inertia_)

pl.figure()
pl.plot(range(1, 11), I)
pl.title('The Elbow Method')
pl.xlabel('Number of Clusters')
pl.ylabel('WCSS')
pl.show()

# K-means algorithm

kmeans = KMeans(n_clusters=3, init='k-means++')
y_kmeans = kmeans.fit_predict(X)
Example #25
# for i in range(2,12):
#     km=KMeans(n_clusters=i,init='k-means++', max_iter=300, n_init=10, random_state=0)
#     km.fit(reduced_data)
#     wcss.append(km.inertia_)
# plt.plot(range(2,12),wcss)
# plt.title('Elbow Method')
# plt.xlabel('Number of clusters')
# plt.ylabel('wcss')
# plt.show()


# k means determine k
distortions = []
K = range(2,12)
for k in K:
    kmeanModel = KMeans(n_clusters=k).fit(reduced_data)
    distortions.append(sum(np.min(cdist(reduced_data, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / reduced_data.shape[0])

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()



# cluster
clusterer = KMeans(n_clusters=5, random_state = RAN_STATE).fit(reduced_data)
preds = clusterer.predict(reduced_data)
Example #26
sns.FacetGrid(df, hue="Species", height=6).map(plt.scatter, "PetalLengthCm", "PetalWidthCm").add_legend()
plt.show()


# Let's find the optimal number of cluster and apply K-Means Algorithm

# In[8]:


x = df.iloc[:, [0, 1, 2, 3]].values

from sklearn.cluster import KMeans
wcss = []

for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', 
                    max_iter = 300, n_init = 10, random_state = 0)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)


# In[9]:


plt.plot(range(1, 11), wcss,'*-')
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS') 
plt.text(4,200000,"optimal number of clusters = 3")
plt.show()

Example #27
    def initialization(self):
        # Number of evaluations/epochs
        self._epochs = 0
        self._evals = 0
        self._last_EP_update_eval = -1
        self._last_EP_update_epoch = -1

        # Initialize the random seed
        if self._random_seed is not None:
            random.seed(self._random_seed)
            np.random.seed(self._random_seed)
            #print("Semilla {}".format(self._random_seed)) # DEBUG

        # External population
        self._EP = np.empty(shape=(0, self._num_objectives))
        self._EP_chromosomes = np.empty(shape=(0, self._dimensionality))

        # Population of individuals
        self._population = np.random.randint(
            0,
            self._num_clusts,
            size=(self._population_size, self._dimensionality))  # VERIFIED
        if self._kmeans_init_ratio > 0.0:
            initialized_with_kmeans = random.sample(
                range(0, self._population_size),
                floor(self._population_size * self._kmeans_init_ratio))

            for i in initialized_with_kmeans:
                self._population[i] = KMeans(n_clusters=self._num_clusts,
                                             max_iter=np.random.randint(
                                                 10, 20)).fit_predict(
                                                     self._data)  #.labels_

        # Solution vectors (f-values) of all the individuals in the population
        self._FV = np.empty((self._population_size, self._num_objectives))
        for i in range(self._population_size):
            self._FV[i] = self.f(self._population[i])

        # Reference point z (the best value obtained for each objective function)
        if self._z is None:
            if self._maximization:
                self._z = np.amax(self._FV, axis=0)
            else:
                self._z = np.amin(self._FV, axis=0)

        # Reference point z-worst: the worst value obtained for each objective function
        if self._maximization:
            self._z_worst = np.amin(self._FV, axis=0)
        else:
            self._z_worst = np.amax(self._FV, axis=0)

        # Lambda weight vectors
        # THEIR ELEMENTS MUST SUM TO 1
        self._lambdas = normalize(
            np.random.randint(low=1,
                              high=99,
                              size=(self._population_size,
                                    self._num_objectives)))

        # NEIGHBORHOOD INITIALIZATION
        # Distance matrix of the lambda vectors
        lambdas_distances = pairwise_distances(self._lambdas,
                                               Y=None,
                                               metric='euclidean')
        # Neighborhood of each lambda-i weight vector
        self._lambda_neighborhood = lambdas_distances.argsort(
            axis=1)[:, 0:self._lambda_neighborhood_size]  # VERIFIED
Example #28
def train_net(data, params):
    #
    # UNPACK DATA
    #

    x_train, y_train, x_val, y_val, x_test, y_test = data['spectral']['train_and_test']
    x_train_unlabeled, y_train_unlabeled, x_train_labeled, y_train_labeled = data['spectral']['train_unlabeled_and_labeled']
    x_val_unlabeled, y_val_unlabeled, x_val_labeled, y_val_labeled = data['spectral']['val_unlabeled_and_labeled']

    if 'siamese' in params['affinity']:
        pairs_train, dist_train, pairs_val, dist_val = data['siamese']['train_and_test']

    x = np.concatenate((x_train, x_val, x_test), axis=0)
    y = np.concatenate((y_train, y_val, y_test), axis=0)

    if len(x_train_labeled):
        y_train_labeled_onehot = OneHotEncoder().fit_transform(y_train_labeled.reshape(-1, 1)).toarray()
    else:
        y_train_labeled_onehot = np.empty((0, len(np.unique(y))))

    #
    # SET UP INPUTS
    #

    # create true y placeholder (not used in unsupervised training)
    y_true = tf.placeholder(tf.float32, shape=(None, params['n_clusters']), name='y_true')

    batch_sizes = {
        'Unlabeled': params['batch_size'],
        'Labeled': params['batch_size'],
        'Orthonorm': params.get('batch_size_orthonorm', params['batch_size']),
    }

    input_shape = x.shape[1:]

    # spectralnet has three inputs -- they are defined here
    inputs = {
        'Unlabeled': Input(shape=input_shape,name='UnlabeledInput'),
        'Labeled': Input(shape=input_shape,name='LabeledInput'),
        'Orthonorm': Input(shape=input_shape,name='OrthonormInput'),
    }

    #
    # DEFINE AND TRAIN SIAMESE NET
    #

    # run only if we are using a siamese network
    if params['affinity'] == 'siamese':
        siamese_net = networks.SiameseNet(inputs, params['arch'], params.get('siam_reg'), y_true)

        history = siamese_net.train(pairs_train, dist_train, pairs_val, dist_val,
                                    params['siam_lr'], params['siam_drop'], params['siam_patience'],
                                    params['siam_ne'], params['siam_batch_size'])

    else:
        siamese_net = None

    #
    # DEFINE AND TRAIN SPECTRALNET
    #

    spectral_net = networks.SpectralNet(inputs, params['arch'],
                                        params.get('spec_reg'), y_true, y_train_labeled_onehot,
                                        params['n_clusters'], params['affinity'], params['scale_nbr'],
                                        params['n_nbrs'], batch_sizes, siamese_net, x_train, len(x_train_labeled))

    spectral_net.train(
        x_train_unlabeled, x_train_labeled, x_val_unlabeled,
        params['spec_lr'], params['spec_drop'], params['spec_patience'],
        params['spec_ne'])

    print("finished training")

    #
    # EVALUATE
    #

    # get final embeddings
    x_spectralnet = spectral_net.predict(x)

    # get accuracy and nmi
    kmeans_assignments, km = get_cluster_sols(x_spectralnet, ClusterClass=KMeans, n_clusters=params['n_clusters'], init_args={'n_init':10})
    y_spectralnet, _ = get_y_preds(kmeans_assignments, y, params['n_clusters'])
    print_accuracy(kmeans_assignments, y, params['n_clusters'])
    from sklearn.metrics import normalized_mutual_info_score as nmi
    nmi_score = nmi(kmeans_assignments, y)
    print('NMI: ' + str(np.round(nmi_score, 3)))

    if params['generalization_metrics']:
        x_spectralnet_train = spectral_net.predict(x_train_unlabeled)
        x_spectralnet_test = spectral_net.predict(x_test)
        km_train = KMeans(n_clusters=params['n_clusters']).fit(x_spectralnet_train)
        from scipy.spatial.distance import cdist
        dist_mat = cdist(x_spectralnet_test, km_train.cluster_centers_)
        closest_cluster = np.argmin(dist_mat, axis=1)
        print_accuracy(closest_cluster, y_test, params['n_clusters'], ' generalization')
        nmi_score = nmi(closest_cluster, y_test)
        print('generalization NMI: ' + str(np.round(nmi_score, 3)))

    return spectral_net
Example #29
    print(index)
    for i in range(4):
        if(i not in index):
            newCentroid.append(centroids[i])
            newHist.append(hist[i])
    if len(centroids) > 0:
        for (percent, color) in zip(newHist, newCentroid):
                print(color)
                if(percent>max):
                    max = percent
                    clr = color


    # return the bar chart
    return clr.astype("uint8").tolist()


img = cv2.imread("2.jpeg")
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

img = img.reshape((img.shape[0] * img.shape[1],3)) #represent as row*column,channel number
clt = KMeans(n_clusters=4) #cluster number
clt.fit(img)



hist = find_histogram(clt)
bar = dominantColor(hist, clt.cluster_centers_)

print(bar)
    day_names = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'
    ]  #,'Sunday']
    grids = grids_df.grid.loc[[_ in day_names for _ in grids_df.day_name]]
    grids = np.array(list(grids))[:, :15, :15]
    smooth_grids = np.array([grid_sqavg(_, 5).flatten() for _ in grids])
    grids = np.array([_.flatten() for _ in grids])

    fa = FactorAnalysis(n_components=3).fit_transform(grids)

    aics = []
    k_vals = range(1, 25)
    for k in k_vals:
        print(k)
        km = KMeans(n_clusters=k).fit(fa)
        aics.append(kmeans_AIC(km))

    plt.plot(k_vals, aics)
    plt.xlabel('Number of Clusters (k)', size=16)
    plt.ylabel('Akaike Information Criterion', size=16)
    plt.savefig('grid_aic_fa.png')
    plt.close()

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    k_best = k_vals[int(np.argmin(aics))]  # map the index of the minimum AIC back to its k value
    km_best = KMeans(n_clusters=k_best).fit(fa)
    ax.scatter(fa[:, 0], fa[:, 1], fa[:, 2], c=km_best.labels_)
    plt.show()