def km(tx, ty, rx, ry, add="", times=10):
    """Sweep KMeans cluster counts, plot the error and train a net on cluster features.

    For every entry of a fixed tag list, ``KM`` is fitted on ``tx`` with
    ``2 .. times - 1`` clusters.  Each cluster is labelled with the rounded
    mean of ``truth`` over the test samples assigned to it, and the mean
    squared error of those labels against ``ry`` is recorded and plotted.
    The cluster ids of the last fitted model are then appended to ``tx`` and
    ``rx`` as an extra column and handed to ``nn``.

    Args:
        tx: training features.
        ty: training targets (only forwarded to ``nn``).
        rx: test features.
        ry: test targets.
        add: suffix used in the plot name and in the ``nn`` tag.
        times: exclusive upper bound of the cluster counts that are tried.
    """
    print("km")
    # eight for num speakers, eleven for num vowels
    cluster_tags = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 50, 88]
    for num_c in cluster_tags:
        # NOTE(review): the tag accumulates across iterations ("nc2",
        # "nc2nc3", ...) and num_c is not used as a cluster count -- confirm.
        add += "nc" + str(num_c)
        errs = []
        # so we do this a bunch of times
        for i in range(2, times):
            members = {x: [] for x in range(i)}
            clf = KM(n_clusters=i)
            clf.fit(tx)  # fit it to our data
            test = clf.predict(tx)
            result = clf.predict(rx)  # and test it on the testing set
            for index, val in enumerate(result):
                members[val].append(index)
            # NOTE(review): `truth` is not defined in this function; it is
            # presumably a module-level array of test labels -- confirm.
            mapper = {
                x: round(sum(truth[v] for v in members[x]) / float(len(members[x])))
                if members[x] else 0
                for x in range(i)
            }
            processed = [mapper[val] for val in result]
            # `processed` holds predictions for the test set, so it is scored
            # against the test targets (the original indexed `ty`).
            sqrd_err = [(processed[n] - ry[n]) ** 2 for n in range(len(processed))]
            # The original called sum() with no argument, which raises TypeError.
            errs.append(sum(sqrd_err) / float(len(ry)))
        plot([0, times, min(errs)-.1, max(errs)+.1],
             [range(2, times), errs, "ro"],
             "Number of Clusters", "Error Rate",
             "KMeans clustering error", "KM"+add)

        # Append the cluster id of the last model as an additional feature.
        td = np.reshape(test, (test.size, 1))
        rd = np.reshape(result, (result.size, 1))
        newtx = np.append(tx, td, 1)
        newrx = np.append(rx, rd, 1)
        nn(newtx, ty, newrx, ry, add="onKM"+add)
    print("km done" + add)
def k_means(testX, goodSample, data=None, train=False, plot=False):
    """Return the fraction of ``goodSample`` rows clustered together with ``testX``.

    Args:
        testX: a single sample (1-D vector or one-row 2-D array).
        goodSample: 2-D array of reference samples, one per row.
        data: training data; required when ``train`` or ``plot`` is set.
        train: fit a fresh 3-cluster model on ``data`` and pickle it; otherwise
            the previously pickled model is loaded.
        plot: show a 3-D scatter of ``data`` coloured by cluster label.

    Returns:
        Share of ``goodSample`` rows whose predicted cluster equals the
        cluster of ``testX``; ``0.0`` when ``goodSample`` has no rows.

    Raises:
        ValueError: if ``testX`` holds more than one sample.
    """
    model_path = 'SrcTeam/capsuleData/capsule_k_means'
    if train:
        est = KMeans(3)
        est.fit(data)
        utils.pickle(est, model_path)
    else:
        est = utils.unpickle(model_path)

    numGood = goodSample.shape[0]
    # Estimators expect 2-D input, so a lone vector is promoted to one row.
    testLabel = est.predict(np.atleast_2d(testX))
    if testLabel.shape[0] != 1:
        raise ValueError("testX must contain exactly one sample")

    if numGood:
        # One batched prediction instead of one predict() call per row.
        numMatch = float(np.count_nonzero(est.predict(goodSample) == testLabel[0]))
    else:
        numMatch = 0.0

    if plot:
        fig = pl.figure()
        pl.clf()
        ax = Axes3D(fig)
        labels = est.labels_
        # np.float was only an alias of the builtin and is gone from NumPy.
        ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=labels.astype(float))
        pl.show()

    return numMatch / numGood if numGood else 0.0
def cluster_and_learn_nn(train_data, train_target, test_data, test_target,):
    """Train the neural net on features augmented with a cluster-id column.

    The experiment runs twice: first with KMeans assignments, then with
    EM (``GMM``) assignments.  Each time the fitted model labels both the
    training and the test data, the label is appended as one extra feature
    column and ``learn_nn`` is called on the augmented data.
    """

    def run_with(title, fitted_model):
        # Label both splits with the already fitted clustering model and
        # append the assignment as the last feature column.
        train_ids = fitted_model.predict(train_data)
        test_ids = fitted_model.predict(test_data)
        train_aug = np.hstack((train_data, train_ids.reshape(-1, 1)))
        test_aug = np.hstack((test_data, test_ids.reshape(-1, 1)))
        print(title)
        learn_nn(train_aug, train_target, test_aug, test_target)

    # 2 was the best k per earlier experiments
    run_with('KMeans cluster NN',
             KMeans(n_clusters=2, random_state=1).fit(train_data))

    # repeat with EM; 4 = best c per earlier experiments
    em = GMM(n_components=4, random_state=1)
    em.fit(train_data)
    run_with('EM cluster NN', em)
class DataCreator(object):
    """Resample events so that each KMeans cluster keeps its share of the data."""

    def __init__(self):
        self.name = 'DataCreator Class'
        self.model = None
        # Fraction of the fitted data that fell into each cluster.
        self.events_per_centroid = None

    def fit(self, data, n_clusters=2):
        """Fit KMeans on ``data`` and store the fraction of rows per cluster."""
        self.model = KMeans(n_clusters=n_clusters)
        self.model.fit(data)
        output = self.model.predict(data)
        counts = np.bincount(output, minlength=n_clusters)
        self.events_per_centroid = counts / float(len(data))

    def create_events(self, data, n_events=100):
        """Draw about ``n_events`` rows of ``data`` in the fitted cluster proportions.

        Each cluster contributes ``ceil(n_events * fraction)`` rows, sampled
        with replacement from the rows of ``data`` predicted to belong to it,
        so the total can exceed ``n_events`` by at most one row per cluster.

        Returns:
            2-D array of sampled rows; an empty slice of ``data`` when no
            cluster contributes anything.
        """
        output = self.model.predict(data)
        samples = []
        for icenter, fraction in enumerate(self.events_per_centroid):
            # Scale before rounding up: ceil(fraction) alone is 1 for every
            # populated cluster.
            qtd_events = int(np.ceil(n_events * fraction))
            if qtd_events == 0:
                continue
            select_data = data[output == icenter, :]
            if select_data.shape[0] == 0:
                continue
            # randint's upper bound is exclusive, so use the row count itself;
            # "rows - 1" never picked the last row and failed for one row.
            picks = np.random.randint(0, select_data.shape[0], size=qtd_events)
            samples.append(select_data[picks, :])
        if not samples:
            return data[:0]
        # Collect every cluster instead of returning after the first one.
        return np.concatenate(samples, axis=0)
class WordCluster(object):
    """Two-cluster KMeans that tags a parameter pair as 'global' or 'local'."""

    def __init__(self):
        self.train_list = self.build_training_set()
        # Initialize Kmeans
        self.kmeans = KMeans(n_clusters=2)
        self.kmeans.fit(self.train_list)
        self.centroids = self.kmeans.cluster_centers_
        self.labels = self.kmeans.labels_
        self.word_scope = ['global', 'local']

    @staticmethod
    def build_training_set():
        """Read ``../data/local_params.dat`` and return (column 1, column 2) pairs.

        Only the text before the first ``;`` of each line is used; it is split
        on ``,`` and stripped of spaces and parentheses before conversion.
        """
        parameters = []
        with open('../data/local_params.dat', 'r') as fp:
            for line in fp:
                # rstrip drops only the newline; line[:-1] also removed a real
                # character when the last line had no trailing newline.
                fields = line.rstrip('\n').split(';')[0].split(',')
                word_params = [
                    float(x.replace(" ", "").replace(")", "").replace("(", ""))
                    for x in fields
                ]
                parameters.append(word_params)
        parameters = np.array(parameters)
        # Materialise the pairs: on Python 3 zip() is a lazy iterator that an
        # estimator cannot consume as a sample matrix.
        return list(zip(parameters[:, 1], parameters[:, 2]))

    def predict(self, params):
        """Return 'global' or 'local' for one ``(p1, p2)`` parameter pair."""
        # Cluster ids are arbitrary, so probe a fixed reference point to see
        # which id it received and orient the names accordingly.  Estimators
        # need 2-D input, hence atleast_2d.
        centroid_test = self.kmeans.predict(np.atleast_2d((0.1, 0.1)))[0]
        if centroid_test == 1:
            self.word_scope = ['local', 'global']
        return self.word_scope[self.kmeans.predict(np.atleast_2d(params))[0]]
class TextClusters:
    """Tokenizes text, and fits to a KMeans model"""

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.vectorizer = TfidfVectorizer()
        self.clf = KMeans(10)

    def tokenize(self, title):
        """Lower-case, drop punctuation tokens, stem, and re-join ``title``.

        Byte strings are decoded as latin-1; text that is already decoded is
        used as is (the unconditional ``decode`` failed on it).
        """
        if isinstance(title, bytes):
            title = title.decode('latin1')
        words = [word for word in title.lower().split() if word not in punctuation]
        return " ".join(self.stemmer.stem(word) for word in words)

    def fit(self, text):
        """Fit the vectorizer and KMeans on ``text``; return the cluster of each title."""
        features = np.array([self.tokenize(title) for title in text])
        X = self.vectorizer.fit_transform(features).toarray()
        self.clf.fit(X)
        return self.clf.predict(X)

    def predict_one(self, line):
        """Return the cluster id of a single piece of text."""
        query = self.tokenize(line)
        query_vector = self.vectorizer.transform([query]).toarray()
        return self.clf.predict(query_vector)[0]
class joshkmeans(BaseEstimator, ClusterMixin):
    """KMeans used as a classifier: each cluster predicts its majority training label."""

    def __init__(self):
        self.k_means_back = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10)

    def fit(self, X, y):
        """Cluster ``X`` and remember the cluster and label of every training row.

        ``y`` is indexed as ``y[row, 0]`` later on, i.e. it is expected to be
        a 2-D array with the label in the first column.
        """
        self.k_means_back.fit(X)
        self.result = self.k_means_back.predict(X)
        self.y_train = y
        # Returning self follows the estimator convention and allows chaining.
        return self

    def find_majority(self, k):
        """Return ``(element, count)`` for the most frequent element of ``k``.

        On ties the element that reaches the winning count first is kept;
        an empty ``k`` yields ``('', 0)``.
        """
        myMap = {}
        maximum = ('', 0)  # (occurring element, occurrences)
        for n in k:
            if n in myMap:
                myMap[n] += 1
            else:
                myMap[n] = 1
            # Keep track of maximum on the go
            if myMap[n] > maximum[1]:
                maximum = (n, myMap[n])
        return maximum

    def predict(self, X):
        """Map the cluster of every row of ``X`` to that cluster's majority label."""
        import numpy as np

        test = self.k_means_back.predict(X)
        train_clusters = np.asarray(self.result)
        train_labels = np.asarray(self.y_train)
        # The vote is computed once per cluster rather than once per sample.
        majority = {}
        for cluster_label in np.unique(test):
            members = train_labels[train_clusters == cluster_label, 0]
            majority[cluster_label] = self.find_majority(members)[0]
        # A fresh array takes the dtype of the labels; writing them into the
        # integer cluster-id array truncated or rejected non-integer labels.
        return np.array([majority.get(cluster_label, '') for cluster_label in test])
def km(tx, ty, rx, ry, add="", times=5):
    """Plot the KMeans labelling error for 2 .. times-1 clusters, then run ``nn``.

    ``ry`` is first binarised with a 2-cluster ``KM`` model to obtain
    ``truth``.  For every cluster count the model is fitted on ``tx``; each
    cluster is labelled with the rounded mean of ``truth`` over the test
    samples it received, and the mean squared difference to ``truth`` is
    recorded.  Finally the cluster ids of the last model are appended to
    ``tx`` / ``rx`` as a feature column and passed to ``nn``.
    """
    checker = KM(n_clusters=2)
    checker.fit(ry)
    truth = checker.predict(ry)
    n_test = float(len(ry))

    errs = []
    for k in range(2, times):
        clf = KM(n_clusters=k)
        clf.fit(tx)
        test = clf.predict(tx)
        result = clf.predict(rx)

        # Positions of the test samples that landed in each cluster.
        members = dict((c, []) for c in range(k))
        for position, cluster in enumerate(result):
            members[cluster].append(position)

        # Cluster id -> rounded mean truth of its members (0 when empty).
        mapper = {}
        for c in range(k):
            positions = members[c]
            if positions:
                mapper[c] = round(sum(truth[v] for v in positions) / float(len(positions)))
            else:
                mapper[c] = 0

        processed = [mapper[cluster] for cluster in result]
        errs.append(sum((processed - truth) ** 2) / n_test)

    plot([0, times, min(errs)-.1, max(errs)+.1],
         [range(2, times), errs, "ro"],
         "Number of Clusters", "Error Rate",
         "KMeans clustering error", "KM"+add)

    train_ids = test.reshape(test.size, 1)
    test_ids = result.reshape(result.size, 1)
    newtx = np.concatenate((tx, train_ids), axis=1)
    newrx = np.concatenate((rx, test_ids), axis=1)
    nn(newtx, ty, newrx, ry, add="onKM"+add)
def compare_sklearn(x_train, x_test, y_train, y_test, k):
    """
    Fit sklearn's KMeans with ``k`` clusters on the training data and score
    how well its cluster assignments agree with the true labels.

    :param x_train: training features the model is fitted on
    :param x_test: test features; may be empty
    :param y_train: labels of the training features
    :param y_test: labels of the test features
    :param k: number of clusters
    :return: result of ``eval_clustering`` on the test set, or on the
             training set if the test set is empty
    """
    model = KMeans(n_clusters=k)
    model.fit(x_train)

    if len(x_test) == 0:
        # No held-out data: evaluate on what the model was fitted on.
        return eval_clustering(y_train, model.predict(x_train))

    assignments = model.predict(x_test)
    print(assignments)
    print(y_test)
    return eval_clustering(y_test, assignments)
def KMeans_(clusters, model_data, prediction_data = None):
    """Fit KMeans on ``model_data`` and return cluster labels.

    Args:
        clusters: number of clusters.
        model_data: samples the model is fitted on.
        prediction_data: samples to label; ``model_data`` itself when omitted.

    Returns:
        Cluster label of every row of the labelled data.
    """
    t0 = time()
    kmeans = KMeans(n_clusters=clusters).fit(model_data)
    # Identity test: `== None` on an ndarray compares element-wise and the
    # resulting array has no single truth value.
    target = model_data if prediction_data is None else prediction_data
    labels = kmeans.predict(target)
    print("K Means Time: %0.3f" % (time() - t0))
    return labels
def runKmens(K):
    """Fit KMeans with ``K`` clusters on the training data and return its centres.

    The cluster assignment of every training row is printed.  The test file
    is still read and converted, although its content is not used here.
    """
    training, lable = genTrainingAndLableData()
    samples = np.array(training)
    test = np.array(readTestFile())  # loaded but not used below

    model = KMeans(n_clusters=K)
    model.fit(samples)
    print(model.predict(samples))
    return model.cluster_centers_
def test_full_vs_elkan():
    """The 'full' and 'elkan' algorithms must produce the same partition of X."""
    km1 = KMeans(algorithm='full', random_state=13)
    km2 = KMeans(algorithm='elkan', random_state=13)

    km1.fit(X)
    km2.fit(X)

    # Without `assert` the comparison was evaluated and thrown away, so the
    # test could never fail.
    assert homogeneity_score(km1.predict(X), km2.predict(X)) == 1.0
def test_predict_equal_labels():
    """``predict`` on the training data must reproduce ``labels_`` for both algorithms."""
    for algorithm in ('full', 'elkan'):
        km = KMeans(random_state=13, n_jobs=1, n_init=1, max_iter=1,
                    algorithm=algorithm)
        km.fit(X)
        assert_array_equal(km.predict(X), km.labels_)
def test_predict():
    """Check ``KMeans.predict`` on the centroids and on the training samples."""
    # `n_clusters` is the keyword used everywhere else in this file; `k` is
    # the legacy spelling of the same parameter.
    k_means = KMeans(n_clusters=n_clusters, random_state=42).fit(X)

    # sanity check: predict centroid labels
    pred = k_means.predict(k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # sanity check: re-predict labeling for training set samples
    pred = k_means.predict(X)
    assert_array_equal(pred, k_means.labels_)
def k_nearest_cluster(xs,ds,n):
    """Return one-column frames holding the KMeans cluster id of ``xs`` and ``ds``.

    The model is fitted on ``xs`` with ``n`` clusters.  Both frames use the
    column name ``"Mnew_knearest<n>"`` and carry it in their ``name``
    attribute as well.
    """
    column = "Mnew_knearest" + str(n)
    # multiparallel doesn't work :(
    model = KMeans(n_clusters=n, precompute_distances = True, n_jobs=1)
    model.fit(xs)

    def labelled_frame(points):
        # Cluster ids as a single column.
        frame = ps.DataFrame(model.predict(points).reshape(-1, 1), columns=[column])
        frame.name = column
        return frame

    frame_x = labelled_frame(xs)
    frame_d = labelled_frame(ds)
    return (frame_x, frame_d)
def kMeansClustering(train,evaluate,test):
    """Cluster the 'X'/'Y' columns into 40 groups and label all three data sets.

    The model is fitted on ``train``; ``evaluate`` and ``test`` are labelled
    with the fitted model.  Centres and labels are printed.

    Returns:
        Tuple ``(f_train, f_eval, f_test)`` of cluster labels.
    """
    coords = ['X', 'Y']
    km = KMeans(n_clusters=40)
    f_train = km.fit_predict(train[coords])
    f_eval = km.predict(evaluate[coords])
    f_test = km.predict(test[coords])

    for item in (km.cluster_centers_, f_train, f_eval, f_test):
        print(item)
    return (f_train, f_eval, f_test)
def KMeansClusering(self):
    """Print the silhouette score of KMeans on the reduced data for 2 to 5 clusters."""
    for n in range(2, 6):
        clusterer = KMeans(n_clusters=n, random_state=42)
        clusterer.fit(self.reduced_data)
        preds = clusterer.predict(self.reduced_data)
        # The sample predictions are not used; the call is kept so that it
        # still runs (and fails) exactly where it did before.
        clusterer.predict(self.pca_samples)
        score = metrics.silhouette_score(self.reduced_data, preds,
                                         metric='sqeuclidean')
        print("K Means with cluster number %d, score %0.3f"% (n, score))
class EKGAnomalyDetection(TimeSeriesAnomalyDetection):
    """Reconstructs a signal from a KMeans dictionary of windowed, normalised snippets."""

    def __init__(self, anomaly_fraction=.1, window=32, step=2, samples=200000):
        self.anomaly_fraction = anomaly_fraction
        self.window = window      # samples per snippet
        self.step = step          # offset between consecutive training snippets
        self.samples = samples    # number of training snippets
        # NOTE(review): no `super().__init__()` call, as in the original;
        # confirm the base class needs no initialisation.

    def build_model(self, x):
        """Fit a 50-cluster KMeans on ``samples`` windowed, unit-norm snippets of ``x``.

        ``x`` may be 1-D or 2-D; for 2-D input only the first column is used.
        """
        # sin^2 taper over the window.
        ramp = np.arange(self.window)
        self.window_vector = np.square(np.sin(np.pi * ramp / (self.window - 1)))

        if len(x.shape) == 2:
            x = x[:, 0]

        r = np.zeros((self.samples, self.window))
        for i in range(self.samples):
            offset = i * self.step
            row = x[offset:offset + self.window] * self.window_vector
            scale = np.linalg.norm(row)
            # A flat-zero snippet has norm 0; leave it as zeros instead of
            # dividing into NaN, matching reconstruct_signal.
            if scale > 0:
                r[i, :] = row / scale

        self.model = KMeans(n_clusters=50, max_iter=20)
        self.model.fit(r)

    def reconstruct_signal(self, x):
        """Rebuild ``x`` from the nearest cluster centre of each half-overlapping window.

        Returns:
            Column vector (``len(x)`` x 1) with the reconstructed signal.
        """
        if len(x.shape) == 2:
            x = x[:, 0]

        # Floor division keeps the slice bounds integral on every Python version.
        half = self.window // 2
        reconstructed_signal = np.zeros(len(x))

        # First half window: pad on the left with zeros.
        row = np.zeros(self.window)
        row[half:] = x[:half]
        scale = np.linalg.norm(row)
        if scale > 0:
            row /= scale
        # Estimators expect a 2-D (1, window) sample.
        ndx = self.model.predict(row.reshape(1, -1))[0]
        current = self.model.cluster_centers_[ndx, :]
        # NOTE(review): unlike the loop below, this contribution is neither
        # windowed nor multiplied by `scale` -- confirm whether intended.
        reconstructed_signal[:half] += current[half:]

        for i in range(half, len(x) - half, half):
            row = x[i - half:i + half] * self.window_vector
            scale = np.linalg.norm(row)
            if scale > 0:
                row /= scale
            else:
                row = np.zeros(self.window)
            ndx = self.model.predict(row.reshape(1, -1))[0]
            current = self.model.cluster_centers_[ndx, :]
            reconstructed_signal[i - half:i + half] += current * scale

        return reconstructed_signal[:, np.newaxis]
def clusterSignal(self):
    """Bin the normalised ``Acc_X`` signal into five KMeans clusters and print the centres."""
    acc_XNormalized = self.normalize(self.data['Acc_X'])
    km = KMeans(n_clusters=5, init='k-means++')

    # Binning Acc_X
    # `.values` replaces `as_matrix()`, which no longer exists in pandas.
    samples = acc_XNormalized.values.reshape(-1, 1)
    # The original also called predict() and discarded the result; the
    # assignments remain available as km.labels_ after fit().
    km.fit(samples)
    print(km.cluster_centers_)
def optimalClustering(self):
    """Cluster the reduced data with k=2, report the silhouette score and plot.

    The cluster centres are stored on ``self.centers`` and handed to
    ``rs.cluster_results`` together with the predictions and the PCA samples.
    """
    n = 2
    clusterer = KMeans(n_clusters=n, random_state=42)
    clusterer.fit(self.reduced_data)

    preds = clusterer.predict(self.reduced_data)
    self.centers = clusterer.cluster_centers_
    # Result unused; the call is kept so behaviour on bad samples is unchanged.
    clusterer.predict(self.pca_samples)

    score = metrics.silhouette_score(self.reduced_data, preds,
                                     metric='sqeuclidean')
    print("K Means with cluster number %d, score %0.3f"% (n, score))
    rs.cluster_results(self.reduced_data, preds, self.centers, self.pca_samples)
def find_clusters(ax,reduced_data, n_clusters = 2, color='blue',
                  cmap=plt.get_cmap('bwr'),
                  title='K-means clustering on the dataset\n'
                        'Centroids are marked with white cross'):
    """Cluster 2-D data with KMeans and draw the decision regions on ``ax``.

    Based on
    http://scikit-learn.sourceforge.net/dev/auto_examples/cluster/plot_kmeans_digits.html

    Returns:
        Cluster label predicted for every row of ``reduced_data``.
    """
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
    kmeans.fit(reduced_data)

    xs = reduced_data[:, 0]
    ys = reduced_data[:, 1]

    # Plot area: the data range widened by 1/30 of its extent on each side.
    x_min, x_max = xs.min(), xs.max()
    y_min, y_max = ys.min(), ys.max()
    dx = (x_max - x_min) / 30
    dy = (y_max - y_min) / 30
    x_min, x_max = x_min - dx, x_max + dx
    y_min, y_max = y_min - dy, y_max + dy

    # Mesh step chosen so that each axis gets about `npixels` points.
    npixels = 500
    hx = (x_max - x_min) / npixels
    hy = (y_max - y_min) / npixels
    xx, yy = np.meshgrid(np.arange(x_min, x_max, hx),
                         np.arange(y_min, y_max, hy))

    # Label every mesh point and show the labels as a coloured image.
    mesh_points = np.column_stack((xx.ravel(), yy.ravel()))
    Z = kmeans.predict(mesh_points).reshape(xx.shape)
    ax.imshow(Z, interpolation='nearest',
              extent=(xx.min(), xx.max(), yy.min(), yy.max()),
              cmap=plt.cm.Paired, aspect='auto', origin='lower')

    ax.scatter(xs, ys, s=50, c=color, cmap=cmap)

    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    ax.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169,
               linewidths=3, color='w', zorder=10)

    ax.set_title(title)
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    return kmeans.predict(reduced_data)
def run(self, styleImage, styleMask, targetImage, targetMask, colors=4,outHdf5='masks.hdf5'):
    """Quantise the style/target masks with KMeans and dump everything to HDF5.

    Args:
        styleImage: path of the style image.
        styleMask: path of the style mask.
        targetImage: path of the content image, or ``None`` if there is none.
        targetMask: path of the target mask.
        colors: number of mask colours (KMeans clusters).
        outHdf5: path of the HDF5 file to write.

    Raises:
        Exception: if an image and its mask differ in shape.
    """
    has_content = targetImage is not None

    # Load images
    img_style = scipy.misc.imread(styleImage)
    if has_content:
        img_content = scipy.misc.imread(targetImage)

    # Load masks
    mask_style = scipy.misc.imread(styleMask)
    mask_target = scipy.misc.imread(targetMask)

    # Save shapes
    style_shape = mask_style.shape
    target_shape = mask_target.shape

    if img_style.shape != style_shape:
        raise Exception('Style image and mask have different sizes!')
    if has_content and img_content.shape != target_shape:
        raise Exception('Content image and mask have different sizes!')

    # Run K-Means to get rid of possible intermediate colors
    style_flatten = mask_style.reshape(style_shape[0]*style_shape[1], -1)
    target_flatten = mask_target.reshape(target_shape[0]*target_shape[1], -1)
    kmeans = KMeans(n_clusters=colors, random_state=0).fit(style_flatten)

    # Predict masks
    labels_style = kmeans.predict(style_flatten.astype(float))
    labels_target = kmeans.predict(target_flatten.astype(float))
    style_kval = labels_style.reshape(style_shape[0], style_shape[1])
    target_kval = labels_target.reshape(target_shape[0], target_shape[1])

    # Dump; the context manager closes the file even if a write fails.
    with h5py.File(outHdf5, 'w') as f:
        for i in range(colors):
            f['style_mask_%d' % i] = (style_kval == i).astype(float)
            f['target_mask_%d' % i] = (target_kval == i).astype(float)

        # Torch style image save
        f['style_img'] = img_style.transpose(2, 0, 1).astype(float) / 255.
        if has_content:
            f['content_img'] = img_content.transpose(2, 0, 1).astype(float) / 255.
            f['has_content'] = np.array([1])
        else:
            f['has_content'] = np.array([0])

        f['n_colors'] = np.array([colors])  # Torch does not want to read just number

    print ('Done!')
def predictCustomerEngagement(df,test_df=None):
    """Cluster customers into two groups and report accuracy, counts and plots.

    The features of ``df`` (everything except ``events_plan``) are scaled,
    reduced to two PCA components and clustered with KMeans.  Cluster 0 is
    counted as "paid", cluster 1 as "free", and the cluster id is compared
    with ``events_plan`` to compute the accuracy.

    Args:
        df: training frame containing an ``events_plan`` column.
        test_df: optional second frame with the same columns; when given it
            is clustered independently and added to the scatter plot.

    Returns:
        dict with ``accuracy`` (percent), ``centroids``, ``points``,
        ``total``, ``count_paid``, ``count_free`` and the two images as
        base64 data URIs (``plot``, ``pieUri``).
    """
    X = np.array(df.drop(['events_plan'], axis=1).astype(float))
    X = preprocessing.scale(X)
    y = np.array(df['events_plan'])
    X_pca = PCA(n_components=2, whiten=True).fit_transform(X)

    clf = KMeans(n_clusters=2,max_iter=10,n_init=2,n_jobs=-1)
    clf.fit(X_pca)
    centroids = clf.cluster_centers_
    lables = clf.labels_

    # One batched prediction instead of one predict() call per row.
    predictions = clf.predict(X_pca)
    count_paid = int(np.sum(predictions == 0))
    count_free = int(np.sum(predictions == 1))
    correct = int(np.sum(predictions == y))

    for point, label in zip(X_pca, lables):
        plt.plot(point[0], point[1], colours[label], markersize=10)

    # The default test_df=None used to crash on `.drop`; the second data set
    # is now optional, as the signature advertises.
    if test_df is not None:
        X2 = np.array(test_df.drop(['events_plan'], axis=1).astype(float))
        X2 = preprocessing.scale(X2)
        X2_pca = PCA(n_components=2, whiten=True).fit_transform(X2)
        clf2 = KMeans(n_clusters=2,max_iter=10,n_init=2,n_jobs=-1)
        clf2.fit(X2_pca)
        for point, label in zip(X2_pca, clf2.labels_):
            plt.plot(point[0], point[1], colours2[label], markersize=10)

    plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", c=('g','r'),
                s=150 ,zorder=10)
    imgdata = StringIO.StringIO()
    plt.savefig(imgdata, format='png')
    imgdata.seek(0)

    fig = plt.figure()
    ax = fig.gca()
    ax.pie((count_free,count_paid),colors=('r', 'g'),radius=0.25,
           center=(0.5, 0.5), frame=True)
    pieData = StringIO.StringIO()
    plt.savefig(pieData,format='png')
    pieData.seek(0)

    # getvalue() is the public accessor; `.buf` is an implementation detail.
    pieUri = 'data:image/png;base64,' + urllib.quote(base64.b64encode(pieData.getvalue()))
    uri = 'data:image/png;base64,' + urllib.quote(base64.b64encode(imgdata.getvalue()))

    return {'accuracy':(float(correct) / len(X_pca))*100,
            'centroids':centroids,
            'points':X_pca,
            'total':count_free+count_paid,
            'count_paid':count_paid,
            'count_free':count_free,
            'plot':uri,
            'pieUri':pieUri}
def plot_pca(data):
    """Show KMeans decision regions on the 2-D PCA projection of ``data``.

    Each row of ``data.examples`` is used without its last element, scaled,
    projected to two principal components and clustered into 10 groups.
    Every centroid is drawn as the number ``clusterLabelDict`` associates
    with its cluster.
    """
    train = [row[:-1] for row in data.examples]
    scaled = scale(train)
    reduced_data = PCA(n_components=2).fit_transform(scaled)
    kmeans = KMeans(init='k-means++', n_clusters=10, n_init=10)
    kmeans.fit(reduced_data)
    cluster_label_dict = clusterLabelDict(kmeans, data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02

    # Plot the decision boundary: a colour is assigned to each point in the
    # mesh [x_min, x_max] x [y_min, y_max].
    # NOTE(review): "+ 1 / - 1" shrinks the plotted area, so outer points are
    # cut off; "- 1 / + 1" would pad it -- confirm which is wanted.
    x_min, x_max = reduced_data[:, 0].min() + 1, reduced_data[:, 0].max() - 1
    y_min, y_max = reduced_data[:, 1].min() + 1, reduced_data[:, 1].max() - 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure(1)
    pl.clf()
    pl.imshow(Z, interpolation='nearest',
              extent=(xx.min(), xx.max(), yy.min(), yy.max()),
              cmap=pl.cm.Paired, aspect='auto', origin='lower')
    pl.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)

    centroids = kmeans.cluster_centers_
    # Predict all centroids in one 2-D call; predict() does not accept a
    # single 1-D centroid.
    predicted = kmeans.predict(centroids)
    for centroid, cluster_id in zip(centroids, predicted):
        label = cluster_label_dict[cluster_id]
        pl.scatter(centroid[0], centroid[1], marker='$%d$' % label, s=169,
                   linewidths=3, color='w', zorder=10)

    pl.title('K-means clustering on digits, reduced to 2-D with PCA\n'
             'Each white number is the mode of its centroid.')
    pl.xlim(x_min, x_max)
    pl.ylim(y_min, y_max)
    pl.xticks(())
    pl.yticks(())
    pl.show()
class AdvancedModel():
    """Work-in-progress price model: target clustering plus a linear regression."""

    def __init__(self):
        self.clusters = []
        # price class regression -- one estimator per instance; as a class
        # attribute it was shared, so set_weights() on one model changed all.
        self.price_reg = LinearRegression()

    def fit(self, X_train, y_train, n_clusters=4):
        """Cluster the training targets into ``n_clusters`` groups and print the centres.

        Only step 1 is implemented; ``X_train`` is not used yet.
        """
        y_train_mat = np.array(y_train).reshape((-1,1))

        # 1. determine clusters
        # Honour the argument (the count was hard-coded to 5 before).
        self.km = KMeans(n_clusters=n_clusters)
        self.km.fit(y_train_mat)
        clusters = self.km.cluster_centers_
        cluster_indices = self.km.predict(y_train_mat)
        print(clusters)

        # 2. fit naive bayes
        #self.nb.fit(X_train, ...)
        #self

        # 3. train regression model
        #price_reg.fit

    def predict(self, X):
        """Not implemented yet."""
        pass

    def get_weights(self):
        """Return the regression coefficients followed by the intercept."""
        return np.append(self.price_reg.coef_, [self.price_reg.intercept_])

    def set_weights(self, w):
        """Set coefficients (all but the last entry of ``w``) and intercept (last entry)."""
        self.price_reg.coef_ = w[:-1]
        self.price_reg.intercept_ = w[-1]
def test_predict():
    """``predict`` must agree with the centroids, ``labels_`` and ``fit_predict``."""
    km = KMeans(n_clusters=n_clusters, random_state=42)
    km.fit(X)

    # sanity check: every centroid is assigned to its own cluster
    centroid_labels = km.predict(km.cluster_centers_)
    assert_array_equal(centroid_labels, np.arange(n_clusters))

    # sanity check: re-predicting the training samples gives labels_
    training_labels = km.predict(X)
    assert_array_equal(training_labels, km.labels_)

    # fit_predict refits and must again match labels_
    refit_labels = km.fit_predict(X)
    assert_array_equal(refit_labels, km.labels_)
def pca_k_means(self):
    """Cluster the PCA-reduced data into three groups and draw the decision regions.

    Runs ``pc_analysis`` first if no reduced data is available yet.  The
    labels are also stored on ``self.pca_labels``.

    Returns:
        dict with the pyplot module (``plt``), ``centroids``, ``labels`` and
        ``inertia`` of the fitted model.
    """
    reduced = self.pca_reduced
    # `not ndarray` raises for arrays with more than one element, so test
    # for "missing" explicitly.
    if reduced is None or reduced is False or np.size(reduced) == 0:
        self.pc_analysis()

    kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10)
    kmeans.fit(self.pca_reduced, self.player_value)

    # Mesh over the data range padded by 1 on each side, step h.
    h = .02
    x_min, x_max = self.pca_reduced[:, 0].min() - 1, self.pca_reduced[:, 0].max() + 1
    y_min, y_max = self.pca_reduced[:, 1].min() - 1, self.pca_reduced[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(self.pca_reduced[:, 0], self.pca_reduced[:, 1], 'k.', markersize=2)

    centroids = kmeans.cluster_centers_
    labels = self.pca_labels = kmeans.labels_
    inertia = kmeans.inertia_

    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169,
                linewidths=3, color='w', zorder=10)
    plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
              'Centroids are marked with white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    return {'plt': plt, 'centroids': centroids, 'labels': labels, 'inertia': inertia}
def re_classify_dict():
    """Group dictionary patches by the KMeans cluster of their low-res part and show them.

    The pickled dictionary is concatenated into one matrix whose first 144
    columns are clustered into 15 groups; the remaining columns of each group
    are collected, ordered from the smallest group to the largest, and nine
    of them (positions 5..13) are displayed on a 3x3 grid.
    """
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open("_dictionary.pickle", "rb") as dict_file:
        sc_list = cPickle.load(dict_file)
    sc_list = np.concatenate(sc_list)
    Dh_dict = sc_list[:, 144:]
    Dl_dict = sc_list[:, :144]

    k_means = KMeans(n_clusters=15)
    k_means = k_means.fit(Dl_dict)
    y_predict = k_means.predict(Dl_dict)

    # Patches per cluster id.  bincount keeps the counts aligned with the ids
    # even if some cluster receives no patch.
    num = np.bincount(y_predict, minlength=k_means.n_clusters)
    # Cluster indices sorted by the number of patches per cluster, ascending.
    rand = num.argsort()

    classified_hdict = []
    for i in rand:
        classified_hdict.append(Dh_dict[y_predict == i])
        print(len(classified_hdict[-1]))

    for i in range(9):
        # Floor division keeps the grid row integral on every Python version.
        y, x = divmod(i, 3)
        # Run one coefficient-encoding test.
        patch_show(classified_hdict[i+5][:100],
                   [0.05+x*0.31, 0.05+y*0.31, 0.3, 0.3], i)
    plt.show()
def add_kmeans_col(self, iter = 1000, n_init = 10, n = 4):
    '''Append the KMeans cluster id as a new last column of ``self.X``.

    The model is fitted on every column of ``self.X`` except the first and
    stored under ``self.models['km-col']``.
    '''
    logging.info('Adding kmeans %d clusters to X' % n)

    km = KMeans(n_clusters=n, max_iter=iter, n_init=n_init)
    km.fit(self.X[:,1:])

    # XXX: This might not be kosher as it affects all of X
    self.models['km-col'] = km
    cluster_column = km.predict(self.X[:,1:])[:, None]
    self.X = np.hstack((self.X, cluster_column))
def findColor(frame):
    """Return a dominant colour of ``frame`` as a list of ints.

    The frame is shrunk to 30 %, KMeans is fitted on a third of its shuffled
    pixels and all pixels are labelled.  Of the three most frequent clusters
    the third is returned; if there are fewer than three, the most frequent.
    """
    t = time()
    points = imresize(np.array(frame, dtype=np.float64), 0.3)
    w,h,d = points.shape
    data = np.reshape(points, (w*h, d))
    sample = shuffle(data, random_state=0)[:len(data) // 3]
    print("Reshape and shuffle in %0.3f seconds." % (time() - t))

    t = time()
    kmeans = KMeans(n_clusters=k_colors, n_jobs=jobs).fit(sample)
    labels = kmeans.predict(data)
    print("Fit and predict in %0.3f seconds." % (time() - t))

    t = time()
    colors = [[int(channel) for channel in center]
              for center in kmeans.cluster_centers_]
    print("Found in %0.3f seconds." % (time() - t))

    # Pixels per cluster, most frequent first.
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    top = [colors[cluster] for cluster, _ in ranked[:3]]

    if len(top) == 3:
        return top[2]
    return top[0]
# Heatmap of the ordered data matrix.
plt.imshow(orderedDataMatrix, aspect='auto', interpolation='nearest')
plt.grid(False)
plt.title("Heatmap of Iris characteristics")
plt.colorbar()
plt.xticks(list(range(6)), labels_t)
plt.savefig("week10_heatmap.png")  # Save the image
plt.close()

# Dendrogram of the transposed linkage.
plt.figure()
hac.dendrogram(linkageMatrix_transposed, labels=labels)
plt.savefig('dendrogram10.png')
plt.close()

# KMeans with five clusters; from here on `labels` holds the cluster ids.
kmeans = KMeans(n_clusters=5, random_state=0).fit(dataMatrixnp)
labels = kmeans.predict(dataMatrixnp)

# Attach the cluster id to every row, then order the rows by cluster.
feature_columns = ['CFU', 'poly', 'unk', 'int', 'mys', 'mid']
dataMatrix_df = pd.merge(pd.DataFrame(dataMatrixnp, columns=feature_columns),
                         pd.DataFrame(labels, columns=['cluster']),
                         left_index=True,
                         right_index=True)
k_clustered = dataMatrix_df.sort_values('cluster')[feature_columns].values

# Heatmap of the cluster-ordered rows.
plt.figure()
plt.imshow(k_clustered, aspect='auto', interpolation='nearest')
plt.grid(False)
plt.title("Heatmap of Iris characteristics")
plt.colorbar()
plt.xticks(list(range(6)), labels_t)
plt.plot(k_rng[1:], silhouette_score, 'b*-') plt.xlim([1, 15]) plt.grid(True) plt.ylabel('Silhouette Coefficient') plt.xlabel("Values of K") plt.plot(3, silhouette_score[1], 'o', markersize=12, markeredgewidth=1.5, markerfacecolor='None', markeredgecolor='r') ###plot three clusters est = KMeans(n_clusters=3, init='random') est.fit(d1) y_kmeans = est.predict(d1) colors = np.array(['r', 'b', 'g']) plt.figure() plt.scatter(d1.DAY_OF_WEEK, d1.HOUR_ARR, c=colors[y_kmeans], s=50) plt.xlim(1.5, 8.5) plt.xticks([2, 3, 4, 5, 6, 7, 8], ['Mon', 'Tue', 'Wed', "Thu", "Fri", "Sat", "Sun"]) plt.ylim(-.5, 24) plt.yticks([0, 6, 12, 18], ["12:00 AM", "6:00 AM", "12:00 PM", "6:00 PM"]) plt.title("Rush Hour Determination from K-Means Clustering") t2['cluster'] = y_kmeans commuter_hours = t2[t2.cluster == 1]["HOUR_OF_WEEK"] winter["rush"] = [ 1 if x in set(commuter_hours) else 0 for x in winter.HOUR_OF_WEEK
# import numpy as np
# import sklearn
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Synthetic data: N two-dimensional points drawn around `centers` blobs.
N = 10000
centers = 4
X, Y = make_blobs(n_samples=N, n_features=2, centers=centers, random_state=28)

# Fit KMeans with random initialisation on the full data set.
km = KMeans(n_clusters=centers, init='random', random_state=28).fit(X)

print(Y)
print(
    '---------------------------------------------------------------------------'
)

# Cluster ids of the first ten samples.
y_hat = km.predict(X[:10])
print(y_hat)
class UnsupervisedKmeansBowModel(UnsupervisedBaseModel):
    """KMeans over bag-of-words text features; the cluster id is decoded into two bits."""

    def __init__(self, task):
        super(UnsupervisedKmeansBowModel, self).__init__(task)
        self.num_clusters = 4  # combinations of social and agency
        self.text_repr_model = self.get_text_representation_model()
        self.clf_model = KMeans(init='k-means++',
                                n_clusters=self.num_clusters,
                                n_init=10,
                                random_state=self.args.random_state)

    def augment_features(self, X_text, X_all_feats):
        """Return the dense text features, optionally followed by columns 2..8.

        Columns 2..8 of ``X_all_feats`` are, in order: age, gender, married,
        parenthood, country, reflection and duration.
        """
        dense_text = X_text.toarray()
        if not self.args.use_allfeats:
            return dense_text
        extra_columns = [X_all_feats[:, col].reshape(-1, 1) for col in range(2, 9)]
        return np.concatenate([dense_text] + extra_columns, axis=1)

    def get_text_representation_model(self):
        """Build the text pipeline: a single TF vectorizer step named 'vec'."""
        vectorizer = TfidfVectorizer(ngram_range=(1, self.args.ngrams),
                                     min_df=5,
                                     max_df=0.5,
                                     stop_words="english",
                                     use_idf=False)
        return Pipeline([('vec', vectorizer)])

    def train(self, X, y=None):
        """Fit the text model and a KMeans initialised with the PCA components."""
        X, y = self.augment_instances(X, y)
        X_text = self.text_repr_model.fit_transform(X[:, self.args.TEXT_COL])
        X_all_feats = self.augment_features(X_text, X)

        pca = PCA(n_components=self.num_clusters,
                  random_state=self.args.random_state)
        pca.fit(X_all_feats)

        model = KMeans(init=pca.components_,
                       n_clusters=self.num_clusters,
                       n_init=1,
                       random_state=self.args.random_state)
        model.fit(X_all_feats)
        self.clf_model = model

    def predict(self, X):
        """Return an (n, 2) uint8 array: bit 0 and bit 1 of each predicted cluster id."""
        X_text = self.text_repr_model.transform(X[:, self.args.TEXT_COL])
        X_all_feats = self.augment_features(X_text, X)
        cluster_ids = self.clf_model.predict(X_all_feats).astype(np.uint8)
        # Least significant bit first, then the next one.
        low_bit = cluster_ids & 1
        high_bit = (cluster_ids >> 1) & 1
        return np.column_stack((low_bit, high_bit))
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

# Each line of the data set has the form "<text>:<label>".
# The context manager closes the file once it has been read.
with open('/Users/mccallmathers./Desktop/NLP/dataset.txt') as corpus_file:
    corpus = corpus_file.read()
docs = corpus.split('\n')

X = []
for doc in docs:
    if not doc.strip():
        # A trailing newline leaves an empty record that cannot be unpacked.
        continue
    i, l = doc.split(':')
    X.append(i.strip())

# Bag-of-words counts for every document.
vec = CountVectorizer()
matrix_X = vec.fit_transform(X)

# Cluster the first five documents, then assign the sixth to a cluster.
kmeans = KMeans(n_clusters=2, max_iter=300, tol=1e-4)
kmeans.fit(matrix_X[:5])
print(kmeans.labels_)
print(kmeans.predict(matrix_X[5]))

# Nearest neighbours of the fourth document.
kn = NearestNeighbors()
kn.fit(matrix_X)
print(kn.kneighbors(matrix_X[3], 2))
kn.radius_neighbors(matrix_X[3], radius=1.7)
# Within Cluster Sum of Squares for 1..10 clusters
# (elbow method to know the number of clusters).
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, max_iter=300, random_state=0).fit(z)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('the elbow method')
plt.xlabel('Number of Clusters')
plt.ylabel('Wcss')
plt.show()

# Silhouette score
# predict the cluster for each data point
# NOTE(review): `km` is not assigned in this part of the script (the loop
# above binds `kmeans`) -- confirm it is defined earlier.
y_cluster_kmeans = km.predict(z)
from sklearn import metrics
score = metrics.silhouette_score(z, y_cluster_kmeans)
print("The Silhouette score is",score)

# Standardise the features and keep the original column names.
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
X_scaled_array = scaler.fit(z).transform(z)
X_scaled = pd.DataFrame(X_scaled_array, columns=z.columns)
print("Feature Scaling",X_scaled)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2 in finance_features:
    plt.scatter(f1, f2)
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred
from sklearn.cluster import KMeans

# Two clusters, fitted and evaluated on the same feature rows.
clf = KMeans(n_clusters=2).fit(finance_features)
pred = clf.predict(finance_features)

### rename the "name" parameter when you change the number of features
### so that the figure gets saved to a different file
try:
    Draw(pred,
         finance_features,
         poi,
         mark_poi=False,
         name="clusters.pdf",
         f1_name=feature_1,
         f2_name=feature_2)
except NameError:
    print("no predictions object named pred found, no clusters to plot")
# Standardise both splits (each one on its own statistics).
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)
y = np.array(test_target["Survived"])

# PCA on data: fit on the training split, apply to both.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# KMeans
clf = KMeans(n_clusters=2).fit(X_train)
clf_pred = clf.predict(X_test)

# Calculate Score: cluster ids are arbitrary, so report the better of the
# two possible cluster-to-class assignments.
correct = sum(1 for i in range(len(clf_pred)) if clf_pred[i] == y[i])
print(max(1 - correct / len(clf_pred), correct / len(clf_pred)))

PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(clf_pred, PassengerId, columns=["Survived"])

# Write your solution to a csv file with the name KMeans.csv
my_solution.to_csv("KMeans.csv", index_label=["PassengerId"])
print("centers:", model.cluster_centers_)
print("labels", labels)
print("intertia:", model.inertia_)

# Number of texts assigned to each cluster (kept as floats, as before).
texts_per_cluster = numpy.array(
    [sum(1 for label in labels if label == i_cluster)
     for i_cluster in range(n_clusters)],
    dtype=float)

print("Top words per cluster:")
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster])),
    top_terms = ordered_words[i_cluster, :10]
    for term in top_terms:
        print("\t" + words[term])

print("\n")
print("Prediction")

# Assign one new text to a cluster and count it in.
text_to_predict = "Why batman was defeated by superman so easy?"
Y = vectorizer.transform([text_to_predict])
predicted_cluster = model.predict(Y)[0]
texts_per_cluster[predicted_cluster] += 1

print(text_to_predict)
print("Cluster:", predicted_cluster, "texts:", int(texts_per_cluster[predicted_cluster])),
top_terms = ordered_words[predicted_cluster, :10]
for term in top_terms:
    print("\t" + words[term])
for k in range(K): index = np.where(idx == k)[0] # 一个簇一个簇的分开来计算 temp = X[index, :] # ? by m # 每次先取出一个簇中的所有样本 s = np.sum(temp, axis=0) centriod[k, :] = s / np.size(index) return centriod def kmeans(X, K, max_iter=200): centroids = InitCentroids(X, K) idx = None for i in range(max_iter): idx = findClostestCentroids(X, centroids) centroids = computeCentroids(X, idx, K) return idx if __name__ == '__main__': x, y = load_data() K = len(np.unique(y)) y_pred = kmeans(x, K) nmi = normalized_mutual_info_score(y, y_pred) print("NMI by ours: ", nmi) model = KMeans(n_clusters=K) model.fit(x) y_pred = model.predict(x) nmi = normalized_mutual_info_score(y, y_pred) print("NMI by sklearn: ", nmi)
# The first entry of the feature list is the target (poi); the rest are the
# features to cluster on.
features_list = [poi, feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)

### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2, _ in finance_features:
    plt.scatter(f1, f2)
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred
model = KMeans(n_clusters=2).fit(finance_features)
pred = model.predict(finance_features)

### rename the "name" parameter when you change the number of features
### so that the figure gets saved to a different file
try:
    Draw(pred,
         finance_features,
         poi,
         mark_poi=False,
         name="clusters.pdf",
         f1_name=feature_1,
         f2_name=feature_2)
except NameError:
    print("no predictions object named pred found, no clusters to plot")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 19 14:08:13 2018

@author: suyash
"""

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the CSV as an array and transpose it.
# NOTE(review): after the transpose every original column is one sample, so
# KMeans clusters the columns rather than the rows -- confirm.
iris = np.transpose(np.array(pd.read_csv('iris.csv')))

model = KMeans(n_clusters=3).fit(iris)
labels = model.predict(iris)

# NOTE(review): the third positional argument of scatter is the marker size
# `s`, not a z coordinate; it is spelled out here without changing the plot.
plt.scatter(iris[:, 0], iris[:, 1], s=iris[:, 2], c=labels, marker='o')

centroids = model.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], marker='D')
plt.show()
# NOTE(review): `np` is not imported explicitly in this snippet; presumably it
# arrives through the star import below - confirm.
from display_network import *
from mnist import MNIST  # requires: pip install python-mnist
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

mndata = MNIST("MNIST/")  # path to your MNIST folder
mndata.load_testing()
X = mndata.test_images
# Keep the first 1000 test images and divide the pixel values by 256.
X0 = np.asarray(X)[:1000, :] / 256.0
X = X0

# Cluster the images into K groups and label each image.
K = 10
kmeans = KMeans(n_clusters=K).fit(X)
pred_label = kmeans.predict(X)

# Debug output: type and shape of the transposed centre matrix.
print(type(kmeans.cluster_centers_.T))
print(kmeans.cluster_centers_.T.shape)

# Render the K cluster centres as one image and hide both axes.
A = display_network(kmeans.cluster_centers_.T, K, 1)
f1 = plt.imshow(A, interpolation='nearest', cmap="jet")
f1.axes.get_xaxis().set_visible(False)
f1.axes.get_yaxis().set_visible(False)
plt.show()
#plt.savefig('a1.png', bbox_inches='tight')

# a colormap and a normalization instance
cmap = plt.cm.jet
norm = plt.Normalize(vmin=A.min(), vmax=A.max())
# map the normalized data to colors
salaries.append(stock) print("Maximum Value: {}".format(max(salaries))) print("Minimum Value: {}".format(min(salaries))) ### in the "clustering with 3 features" part of the mini-project, ### you'll want to change this line to ### for f1, f2, _ in finance_features: ### (as it's currently written, the line below assumes 2 features) for f1, f2, _ in finance_features: plt.scatter( f1, f2 ) plt.show() ### cluster here; create predictions of the cluster labels ### for the data and store them to a list called pred from sklearn.cluster import KMeans kmeans = KMeans(n_clusters=2, random_state=0, max_iter=100).fit(finance_features) pred = kmeans.predict(finance_features) ### rename the "name" parameter when you change the number of features ### so that the figure gets saved to a different file try: Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2) except NameError: print "no predictions object named pred found, no clusters to plot"
"Best cat photo I've ever taken.", "Climbing ninja cat.", "Impressed with google map feedback.", "Key promoter extension for Google Chrome."]''' vectorizer = TfidfVectorizer(stop_words='english') X = vectorizer.fit_transform(documents) true_k = 2 model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1) model.fit(X) print("Top terms per cluster:") order_centroids = model.cluster_centers_.argsort()[:, ::-1] terms = vectorizer.get_feature_names() for i in range(true_k): print("Cluster %d:" % i), for ind in order_centroids[i, :10]: print(' %s' % terms[ind]), print print("\n") print("Prediction") Y = vectorizer.transform(["chrome browser to open."]) prediction = model.predict(Y) print(prediction) Y = vectorizer.transform(["My cat is hungry."]) prediction = model.predict(Y) print(prediction)
""" import numpy as np import matplotlib.pyplot as plt import pandas as pd digits_train = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra',header=None) digits_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes',header=None) X_train = digits_train[np.arange(64)] y_train = digits_train[64] X_test = digits_test[np.arange(64)] y_test = digits_test[64] from sklearn.cluster import KMeans kmeans = KMeans(n_clusters = 10) kmeans.fit(X_train) y_pred = kmeans.predict(X_test) from sklearn import metrics print(metrics.adjusted_rand_score(y_test,y_pred))#ARI进行聚类性能评估 import numpy as np from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score import matplotlib.pyplot as plt plt.subplot(3,2,1)#分割出6个子图,并在1号子图作画 x1 = np.array([1,2,3,1,5,6,5,5,6,7,8,9,7,9]) x2 = np.array([1,3,2,2,8,6,7,6,7,1,2,1,1,3]) X = np.array(zip(x1,x2)).reshape(2,len(x1) plt.xlim([0,10]) plt.ylim([0,10])
class PartitionedXgbRegressor(TransformerMixin):
    """An xgboost regressor variant with implicit inverse class frequency weighting, and a few other tricks

    This is a passthrough to xgboost's standard XgbRegressor, with an added
    preprocessing stage, intended for use with the NystroemSpectralProjection
    class, and a KMeans clustering stage, which is used to compute sample
    weighting.

    The premise is that the combination, which amounts to a particular graph
    spectral clustering, represents an implicit set of categorical variables
    that are partially driving the behavior of a continuous output variable.
    We attempt to counteract this by applying an inverse-frequency weighting
    scheme based on an implicit set of classes defined by the clustering.

    Attributes
    ----------
    base_estimator : estimator
        Template regressor; a fresh clone of it is trained on every ``fit``.
    clusterer : KMeans
        Clustering stage used to derive the implicit class labels.
    estimator_ : estimator or None
        The fitted clone of ``base_estimator``; ``None`` until ``fit`` runs.
    n_augment_cols : int
        Number of trailing pass-through columns (for the purpose of
        clustering / preprocessing)
    preprocess : TransformerMixin or None
        Optional, already-fitted preprocessing stage (only ``transform`` is
        called on it).
    augments_only : bool
        Stored but currently unused.
    """

    def __init__(self, base_estimator=XGBRegressor(), n_augment_cols=1,
                 preprocess=None, n_clusters=8, augments_only=False):
        """Construct a new PartitionedXgbRegressor model

        Parameters
        ----------
        base_estimator : estimator
            Regressor template that is cloned in ``fit``. The default instance
            is shared between constructions, which is harmless because it is
            only ever cloned, never fitted directly.
        n_augment_cols : int
            Number of trailing pass-through columns (for the purpose of
            clustering / preprocessing). Must be at least 1.
        preprocess : TransformerMixin
            Preprocessing stage compatible with sklearn's TransformerMixin
            interface
        n_clusters : int
            Number of k-means clusters to use as implicit class labels
        augments_only : bool
            Stored for future use; has no effect yet.

        Raises
        ------
        ValueError
            If ``n_augment_cols`` is smaller than 1.
        """
        # A value of 0 would turn every ``X[:, :-n_augment_cols]`` slice into
        # an empty matrix, so reject it up front with a clear message.
        if n_augment_cols < 1:
            raise ValueError(
                "n_augment_cols must be >= 1, got %r" % (n_augment_cols,))
        self.base_estimator = base_estimator
        # NOTE(review): ``n_jobs`` was deprecated and later removed from
        # sklearn's KMeans; confirm the pinned scikit-learn version accepts it.
        self.clusterer = KMeans(n_clusters=n_clusters, n_jobs=-1)
        self.estimator_ = None
        self.n_augment_cols = n_augment_cols
        self.preprocess = preprocess
        self.augments_only = augments_only

    def _apply_preprocess(self, X):
        """Transform the leading columns of X, keeping the trailing augment columns as-is."""
        if self.preprocess is None:
            return X
        n_aug = self.n_augment_cols
        return np.hstack([
            self.preprocess.transform(X[:, :-n_aug]),
            X[:, -n_aug:].reshape((-1, n_aug))
        ])

    def fit(self, X, y=None, weights=None, **kwargs):
        """Fit regressor

        Note
        ----
        Provided weights are unused. Keyword arguments to support early
        stopping are currently required.

        Parameters
        ----------
        X : ndarray
            N samples x M dimensions ndarray containing the data to fit
        y : ndarray
            N element ndarray containing the target values
        weights : None
            Unused
        kwargs : dict
            Required keys: eval_set, eval_metric, early_stopping_rounds.
            Optional key: verbose (defaults to False).
            Values as specified by xgboost docs. ``eval_set`` is indexed as a
            single ``(eval_X, eval_y)`` pair.

        Returns
        -------
        self : PartitionedXgbRegressor
            The fitted instance, to allow call chaining.

        Raises
        ------
        KeyError
            If one of the required keyword arguments is missing.
        """
        # Look up the required kwargs first so a missing key fails before any
        # preprocessing or clustering work is done.
        eval_set = kwargs["eval_set"]
        eval_metric = kwargs["eval_metric"]
        early_stopping_rounds = kwargs["early_stopping_rounds"]

        X = self._apply_preprocess(X)
        eval_X = self._apply_preprocess(eval_set[0])

        # Cluster on the non-augment columns only.
        n_aug = self.n_augment_cols
        X_cats = self.clusterer.fit_predict(X[:, :-n_aug])
        eval_cats = self.clusterer.predict(eval_X[:, :-n_aug])

        # Inverse-frequency weight per cluster, clipped to [1, 1000].
        # NOTE: left end of clip should be unnecessary, right end should be max_gain parameter
        labels, counts = np.unique(X_cats, return_counts=True)
        total = float(X_cats.size)
        weight_map = {
            c: np.clip(total / n, 1.0, 1000.0)
            for c, n in zip(labels, counts)
        }
        print(weight_map)
        # Clusters never seen during training fall back to a weight of 1.0.
        W = np.asarray([weight_map.get(c, 1.0) for c in X_cats])
        eval_W = np.asarray([weight_map.get(c, 1.0) for c in eval_cats])

        # TODO: do something with the augments_only option
        eset = (eval_X, eval_set[1])
        reg = clone(self.base_estimator)
        reg.base_score = np.mean(y)
        reg.fit(X,
                y,
                W,
                eval_set=[eset],
                eval_metric=eval_metric,
                sample_weight_eval_set=[eval_W],
                early_stopping_rounds=early_stopping_rounds,
                verbose=kwargs.get("verbose", False))
        self.estimator_ = reg
        return self

    def predict(self, X):
        """Predict values for new samples

        Raises
        ------
        AssertionError
            If the model has not been fitted yet.
        """
        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged.
        if self.estimator_ is None:
            raise AssertionError("Cannot Predict: Model has not been trained")
        return self.estimator_.predict(self._apply_preprocess(X))
# -*- coding:UTF-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the data set: one sample per line, tab-separated float values.
dataMat = []
# Note: this is a relative path; make sure the script is executed from the
# MachineLearning directory.
# The context manager closes the file even if a line fails to parse.
with open("data/10.KMeans/testSet.txt") as fr:
    for line in fr:
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))  # convert every field to float
        dataMat.append(fltLine)

# Train the model
km = KMeans(n_clusters=4)  # initialise
km.fit(dataMat)  # fit
km_pred = km.predict(dataMat)  # predict the cluster of every sample
centers = km.cluster_centers_  # centroids

# Visualise the result: samples coloured by cluster, centroids in red.
# Column 1 is drawn on the x axis and column 0 on the y axis.
points = np.array(dataMat)
plt.scatter(points[:, 1], points[:, 0], c=km_pred)
plt.scatter(centers[:, 1], centers[:, 0], c="r")
plt.show()
user_unique_id = pd.merge(user_unique_id, recent_purchase[['customerid', 'Recency']], on='customerid') # print(user_unique_id) #how many clusters are required sse = {} tx_recency = user_unique_id[['Recency']] for k in range(1, 8): kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_recency) #tx_recency["clusters"] = kmeans.labels_ sse[k] = kmeans.inertia_ kmeans = KMeans(n_clusters=4) kmeans.fit(user_unique_id[['Recency']]) user_unique_id['RecencyCluster'] = kmeans.predict(user_unique_id[['Recency']]) #print(user_unique_id['RecencyCluster'].describe()) #function for ordering cluster numbers #print(user_unique_id.groupby('RecencyCluster')['Recency'].describe()) def order_cluster(cluster_field_name, target_field_name, df, ascending): new_cluster_field_name = 'new_' + cluster_field_name df_new = df.groupby( cluster_field_name)[target_field_name].mean().reset_index() df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True) df_new['index'] = df_new.index
#Save within-cluster sums of squares to the list wcss.append(kmeans.inertia_) #Display the graph print(wcss) plt.plot(range(1, 10), wcss) plt.title('the elbow method') plt.xlabel('Number of Clusters') plt.ylabel('WCSS') plt.show() #From the map, at k=3 seem like data slowly unchange => choose k=3 #Silhouette score km = KMeans(n_clusters=3) km.fit(x) y_cluster_kmeans = km.predict(x) score = metrics.silhouette_score(x, y_cluster_kmeans) print() print('Silhouette score for', 3, 'clusters', score) ########################################################################### from sklearn import preprocessing scaler = preprocessing.StandardScaler() scaler.fit(x) X_scaled_array = scaler.transform(x) X_scaled = pd.DataFrame(X_scaled_array, columns=x.columns) km = KMeans(n_clusters=3) km.fit(X_scaled) y_cluster_kmeans = km.predict(X_scaled)
def dataprep(data_in, depvar='default_time', splitvar='time', threshold=26):
    """Engineer features and build scaled train/test matrices for one target.

    Parameters
    ----------
    data_in : DataFrame
        Input panel. The code reads (among others) the columns 'time',
        'default_time', 'LTV_time', 'LTV_orig_time', 'FICO_orig_time',
        'interest_rate_time', 'balance_orig_time', 'balance_time',
        'mat_time', 'orig_time', 'first_time', 'state_orig_time' and
        'gdp_time'. The input frame itself is not modified (a copy is taken).
    depvar : str
        'default_time' or 'lgd_time'; selects which of the two preparation
        branches runs.
        NOTE(review): for any other value none of the returned names is
        assigned, so the final ``return`` fails - confirm callers only pass
        these two values.
    splitvar : str
        Column compared against ``threshold`` to split train and test.
    threshold : number
        Rows with ``splitvar < threshold + 1`` go to train, rows with
        ``splitvar > threshold`` go to test.

    Returns
    -------
    tuple
        ``(df3, data_train, data_test, X_train_scaled, X_test_scaled,
        y_train, y_test)``.
    """
    # Drop rows that miss any of the key columns; work on a copy.
    df=data_in.dropna(subset=['time', 'default_time','LTV_time', 'FICO_orig_time']).copy()

    # Economic features
    # NOTE(review): the /(100*4) factor presumably turns an annual percentage
    # rate into a quarterly decimal rate - confirm the data frequency.
    df.loc[:, 'annuity'] = ((df.loc[:,'interest_rate_time']/(100*4))*df.loc[:,'balance_orig_time'])/(1-(1+df.loc[:,'interest_rate_time']/(100*4))**(-(df.loc[:,'mat_time']-df.loc[:,'orig_time'])))
    df.loc[:,'balance_scheduled_time'] = df.loc[:,'balance_orig_time']*(1+df.loc[:,'interest_rate_time']/(100*4))**(df.loc[:,'time']-df.loc[:,'orig_time'])-df.loc[:,'annuity']*((1+df.loc[:,'interest_rate_time']/(100*4))**(df.loc[:,'time']-df.loc[:,'orig_time'])-1)/(df.loc[:,'interest_rate_time']/(100*4))
    df.loc[:,'property_orig_time'] = df.loc[:,'balance_orig_time']/(df.loc[:,'LTV_orig_time']/100)
    # Gap between scheduled and actual balance, relative to the property value.
    df.loc[:,'cep_time']= (df.loc[:,'balance_scheduled_time'] - df.loc[:,'balance_time'])/df.loc[:,'property_orig_time']
    df.loc[:,'equity_time'] = 1-(df.loc[:,'LTV_time']/100)
    df=df.dropna(subset=['time', 'cep_time', 'equity_time'])

    # Age features, capped from above (and 'age_1f' floored at 1).
    df.loc[:,'age'] = (df.loc[:,'time']-df.loc[:,'first_time']+1)
    df.loc[df['age'] >= 40, 'age'] = 40
    df.loc[:,'age_1'] = df.loc[:,'time']-df.loc[:,'first_time']
    df.loc[df['age_1'] >= 39, 'age_1'] = 39
    df.loc[:,'age_1f'] = df.loc[:,'age_1']
    df.loc[df['age_1f'] <= 1, 'age_1f'] = 1
    df.loc[:,'age2'] = df.loc[:,'age']**2

    # Vintage is the origination time clipped to [0, 30].
    df['vintage'] = df.loc[:,'orig_time']
    df.loc[df['vintage'] < 0, 'vintage'] = 0
    df.loc[df['vintage'] >= 30, 'vintage'] = 30
    df.loc[:,'vintage2'] = df.loc[:,'vintage']**2
    df.loc[:,'state_orig_time'] = pd.Categorical(df.state_orig_time, ordered=False)

    if depvar=='default_time':
        # Remove the rows of the listed states/territories one filter at a time.
        df2 = df
        df2 = df2.loc[df2['state_orig_time'] != 'AL',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'AK',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'AR',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'ND',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'SD',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'MT',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'DE',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'WV',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'VT',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'ME',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'NE',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'NH',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'MS',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'VI',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'DC',:].copy()
        df2 = df2.loc[df2['state_orig_time'] != 'PR',:].copy()
        # Compares against the literal string 'nan', not a missing value.
        df2 = df2.loc[df2['state_orig_time'] != 'nan',:].copy()

        # Splitting
        data_train = df2.loc[df2[splitvar] < threshold+1,:].copy()
        data_test = df2.loc[df2[splitvar] > threshold,:].copy()

        # PCA
        # Mean default rate per (time, state), pivoted to one column per state.
        defaultrates_states_train = data_train.groupby(['time', 'state_orig_time'])['default_time'].mean().unstack(level=1).add_prefix('defaultrate_').fillna(0).reset_index(drop=False)
        defaultrates_states = df2.groupby(['time', 'state_orig_time'])['default_time'].mean().unstack(level=1).add_prefix('defaultrate_').fillna(0).reset_index(drop=False)
        # Scaler and PCA are fitted on the training period only and then
        # applied to the full period. reset_index(drop=False) keeps 'time' as
        # a column, so it is part of the scaled/PCA input as well.
        scaler = StandardScaler().fit(defaultrates_states_train)
        defaultrates_states_train1 = scaler.transform(defaultrates_states_train)
        defaultrates_states1 = scaler.transform(defaultrates_states)
        pca = PCA()
        pca.fit(defaultrates_states_train1)
        z_train = pca.transform(defaultrates_states_train1)
        z = pca.transform(defaultrates_states1)
        # Keep the first five components.
        z_train = z_train[:,0:5]
        z = z[:,0:5]
        Z_train = pd.DataFrame(data=z_train, columns=['PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5'])
        Z = pd.DataFrame(data=z, columns=['PCA1', 'PCA2', 'PCA3', 'PCA4', 'PCA5'])
        # Shift by one row so each period carries the previous row's components.
        Z_train_1 = Z_train.shift(1).add_suffix('_1')
        Z_1 = Z.shift(1).add_suffix('_1')
        defaultrates_states_train2 = pd.concat([defaultrates_states_train['time'], Z_train_1], axis=1).dropna(subset=['PCA1_1']).copy()
        defaultrates_states2 = pd.concat([defaultrates_states['time'], Z_1], axis=1).dropna(subset=['PCA1_1']).copy()
        data_train = pd.merge(data_train, defaultrates_states_train2, on='time')
        df3 = pd.merge(df2, defaultrates_states2, on='time')
        # The test set is rebuilt from the merged full-period frame.
        data_test = df3.loc[df3[splitvar] > threshold,:].copy()

        # Scaling
        # NOTE(review): .dropna() can remove rows from X_train/X_test while
        # y_train/y_test below are taken from the unfiltered frames - confirm
        # the lengths still match.
        X_train = data_train[['cep_time', 'equity_time', 'interest_rate_time', 'FICO_orig_time', 'gdp_time', 'PCA1_1','PCA2_1', 'PCA3_1','PCA4_1','PCA5_1']].dropna()
        X_test = data_test[['cep_time', 'equity_time', 'interest_rate_time', 'FICO_orig_time', 'gdp_time', 'PCA1_1','PCA2_1', 'PCA3_1','PCA4_1','PCA5_1']].dropna()
        scaler = StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        y_train = data_train['default_time'].values.reshape(-1,)
        y_test = data_test['default_time'].values.reshape(-1,)

        # Clustering
        # Two k-means clusters fitted on the training features; the cluster id
        # is appended to both matrices as a dummy column.
        n_clusters = 2
        kmeans = KMeans(n_clusters=n_clusters, random_state=2, verbose=0)
        kmeans.fit(X_train_scaled)
        clusters_train =kmeans.predict(X_train_scaled)
        clusters_test = kmeans.predict(X_test_scaled)
        dummies_train = pd.get_dummies(clusters_train, drop_first=True, prefix='cluster')
        dummies_test = pd.get_dummies(clusters_test, drop_first=True, prefix='cluster')
        X_train_scaled = np.append(X_train_scaled, dummies_train, axis=1)
        X_test_scaled = np.append(X_test_scaled, dummies_test, axis=1)
        dummies = pd.concat([dummies_train, dummies_test], axis=0, ignore_index=True)
        # NOTE(review): `data` is not defined inside this function (the
        # parameter is `data_in`); this line relies on a module-level `data`
        # or raises NameError - confirm which frame was intended.
        dummies = dummies.reindex(data.index)
        df3 = pd.concat([df3, dummies], axis=1).dropna(subset=['id'])
        data_train = pd.concat([data_train, dummies_train], axis=1)
        dummies_test = dummies_test.reindex(data_test.index)
        data_test = pd.concat([data_test, dummies_test], axis=1)

    if depvar=='lgd_time':
        # LGD dataprep
        # Only defaulted observations are used for the LGD target.
        df2 = df.query('default_time == 1').copy()
        df3 = resolutionbias(df2,'lgd_time','res_time','time')
        # Move LGD values at or beyond 0 and 1 just inside the open interval.
        df3.loc[df3['lgd_time'] <= 0, 'lgd_time'] = 0.0001
        df3.loc[df3['lgd_time'] >= 1, 'lgd_time'] = 0.9999

        # Splitting
        data_train =df3.loc[df3[splitvar] < threshold+1,:].copy()
        data_test =df3.loc[df3[splitvar] > threshold,:].copy()
        X_train = data_train[['cep_time', 'equity_time', 'interest_rate_time', 'FICO_orig_time', 'REtype_CO_orig_time', 'REtype_PU_orig_time', 'gdp_time']]
        X_test = data_test[['cep_time', 'equity_time', 'interest_rate_time', 'FICO_orig_time', 'REtype_CO_orig_time', 'REtype_PU_orig_time', 'gdp_time']]
        y_train = data_train['lgd_time'].values.reshape(-1,)
        y_test = data_test['lgd_time'].values.reshape(-1,)

        # Scaling
        scaler = StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        # State dummies are appended unscaled after the scaled features.
        dummies_train = pd.get_dummies(data_train.state_orig_time, drop_first=True, prefix='state_orig_time')
        dummies_test = pd.get_dummies(data_test.state_orig_time, drop_first=True, prefix='state_orig_time')
        X_train_scaled = np.append(X_train_scaled, dummies_train, axis=1)
        X_test_scaled = np.append(X_test_scaled, dummies_test, axis=1)

    return df3, data_train, data_test, X_train_scaled, X_test_scaled, y_train, y_test
# Show the company-to-cluster assignment, ordered by cluster label.
df = pd.DataFrame({'labels': labels, 'companies': companies})
print(df.sort_values('labels'))

# In[81]:

# PCA Analysis using Singular value decomposition
from sklearn.decomposition import PCA
# Project `new` onto its first two principal components.
reduced_data = PCA(n_components=2).fit_transform(new)

# Run K-Means on the reduced data.
kmeans = KMeans(n_clusters=10, max_iter=1000)
kmeans.fit(reduced_data)
labels = kmeans.predict(reduced_data)
df = pd.DataFrame({'labels': labels, 'companies': companies})
print(kmeans.inertia_)
print(df.sort_values('labels'))

# In[97]:

# Step size of the mesh used to draw the decision boundary.
h = 0.01
# Build a grid covering the reduced data with a margin of 1 on every side.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
finance_features = finance_features + [200000.,1000000.] from sklearn.cluster import KMeans import numpy as np kmeans = KMeans(n_clusters=2, random_state=0).fit(X_train_std) kmeans.fit(X_train_std) kmeans.labels_ label = kmeans.labels_ pred = kmeans.predict(X_train_std) predict1 = kmeans.predict([200000.,1000000.]) print predict1 centers = kmeans.cluster_centers_ ### rename the "name" parameter when you change the number of features ### so that the figure gets saved to a different file try: Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2) except NameError:
# Toy two-dimensional data set with 19 points.
df = pd.DataFrame({
    'x': [
        12, 20, 28, 18, 29, 33, 24, 45, 45, 52, 51, 52, 55, 53, 55, 61, 64,
        69, 72
    ],
    'y': [39, 36, 30, 52, 54, 46, 55, 59, 63, 70, 66, 63, 58, 23, 14, 8, 19, 7, 24]
})

from sklearn.cluster import KMeans

# Fit three clusters and label every point.
kmeans = KMeans(n_clusters=3)
kmeans.fit(df)
labels = kmeans.predict(df)
centroids = kmeans.cluster_centers_

fig = plt.figure(figsize=(5, 5))
# Build a real list: on Python 3 `map` returns a lazy iterator, which
# matplotlib cannot use as a per-point color sequence.
# `colmap` is keyed from 1, hence the + 1 on the zero-based cluster label.
colors = [colmap[label + 1] for label in labels]
plt.scatter(df['x'], df['y'], color=colors, alpha=0.5, edgecolor='k')
# Draw each centroid in the color of its cluster.
for idx, centroid in enumerate(centroids):
    plt.scatter(*centroid, color=colmap[idx + 1])
plt.xlim(0, 80)
plt.ylim(0, 80)
plt.show()
# kmeans plt.figure() for k in range(3,20,1): clf = KMeans(n_clusters=k) s = clf.fit(x_train) print(s) # print clf.cluster_centers_ # print clf.labels_ k_labels=clf.labels_ print (clf.inertia_) k_inertia=clf.inertia_ # print clf.predict(x_test) k_pred=clf.predict(x_test) plt.plot(k,k_inertia,c='g',marker='x') plt.show() for k in range(3,9): clf = KMeans(n_clusters=k) s = clf.fit(x_train) numSamples = len(x_train) centroids = clf.labels_ # print centroids,type(centroids) print (clf.inertia_) # k_inertia = clf.inertia_ # k_pred = clf.predict(x_test) # plt.plot(k, k_inertia, c='g', marker='x')
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import mglearn
import numpy as np

## Minimal k-means example on synthetic blobs
X, y = make_blobs(random_state=1)
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
print('Cluster Membership:\n{}'.format(kmeans.labels_))
print(kmeans.predict(X))

# Samples as circles, the three cluster centres as triangles.
mglearn.discrete_scatter(X[:, 0], X[:, 1], kmeans.labels_, markers='o')
centers = kmeans.cluster_centers_
mglearn.discrete_scatter(centers[:, 0],
                         centers[:, 1],
                         [0, 1, 2],
                         markers='^',
                         markeredgewidth=2)

## Vary the number of clusters to show the labels carry no a-priori meaning
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
# Left panel uses 2 clusters, right panel uses 5.
for panel, n_groups in enumerate((2, 5)):
    kmeans = KMeans(n_clusters=n_groups)
    kmeans.fit(X)
    assignments = kmeans.labels_
    mglearn.discrete_scatter(X[:, 0], X[:, 1], assignments, ax=axes[panel])
# `sklearn.datasets.samples_generator` was removed from scikit-learn;
# make_blobs is exported from `sklearn.datasets` in old and new releases alike.
from sklearn.datasets import make_blobs
sns.set()

''' K-Means '''
# Generate random sample points
X, y = make_blobs(n_samples=300, centers=4, random_state=0, cluster_std=0.60)
plt.scatter(X[:, 0], X[:, 1], s=50)
plt.show()

# Cluster the sample data above with 4 centres
est = KMeans(4)
est.fit(X)
y_kmeans = est.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='rainbow')
plt.show()

# Section header below reads "handwritten digits application".
''' 手写数字应用 '''
# Load the data
digits = load_digits()
# Build the model
est = KMeans(n_clusters=10)
clusters = est.fit_predict(digits.data)
print(est.cluster_centers_.shape)  # (10, 64)

# Display the 10 digits
fig = plt.figure(figsize=(8, 3))
Deaths_df = pd.read_csv( r"C:\Users\NehaS\Desktop\CS418\MEASURESOFBIRTHANDDEATH.csv") Deaths_IL = Deaths_df[Deaths_df['CHSI_State_Name'] == 'Illinois'] Death_Infant = Deaths_IL[[ 'CHSI_County_Name', 'LBW', 'VLBW', 'Premature', 'Under_18', 'Over_40', 'Unmarried', 'Late_Care', 'Infant_Mortality' ]] Death_Infant = Death_Infant.replace(-1111.1, np.NaN) Death_Infant = Death_Infant[Death_Infant['Infant_Mortality'].notnull()] X = np.array(Death_Infant[['Infant_Mortality', 'LBW', 'VLBW']]) model = KMeans(n_clusters=3, random_state=1) model.fit(X) pred = model.predict(X) levels = ['2', '0', '1'] pred_val = [levels[x] for x in pred] Death_Infant['Death_Level'] = pred_val X1 = Death_Infant[[ 'LBW', 'VLBW', 'Premature', 'Under_18', 'Over_40', 'Unmarried', 'Late_Care' ]].to_numpy() y = Death_Infant['Death_Level'].to_numpy() X_opt = X1[:, [0, 1]] OLS_res = sm.OLS(endog=Death_Infant['Infant_Mortality'], exog=X_opt).fit() #print(OLS_res.summary()) X2 = Death_Infant[['LBW', 'VLBW']].to_numpy() y2 = Death_Infant['Death_Level'].to_numpy() X2_train, X2_test, y2_train, y2_test = train_test_split(X1,
# Summary statistics of the clustering variables
print(clustervar.describe())

# Hold out 30% of the rows; the split is reproducible via random_state.
clus_train, clus_test = train_test_split(clustervar,
                                         test_size=.3,
                                         random_state=123)

# k-means cluster analysis over the range of 1-10 clusters
from scipy.spatial.distance import cdist
clusters = range(1, 11)
meandist = []
for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(clus_train)
    clusassign = model.predict(clus_train)
    # Euclidean distance from every training row to every centroid; keep the
    # distance to the nearest centroid and average it over the rows.
    dist_to_centers = cdist(clus_train, model.cluster_centers_, 'euclidean')
    nearest = np.min(dist_to_centers, axis=1)
    meandist.append(sum(nearest) / clus_train.shape[0])

## Elbow method: plot the average distance to the nearest centroid against
## the number of clusters. The assignment prescribes 4 clusters, and the
## curve is reported to agree with that choice.
plt.figure()
plt.plot(clusters, meandist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method')
plt.show()
# Min-max transformation of both coordinates
maxX, maxY = df.max()
minX, minY = df.min()
pointsTransformed = []
for x, y in points:
    scaled_x = (x - minX) / (maxX - minX)
    scaled_y = (y - minY) / (maxY - minY)
    pointsTransformed.append([scaled_x, scaled_y])
dfTransformed = pd.DataFrame(pointsTransformed, columns=["x", "y"])

from sklearn.cluster import KMeans

# centers seem ok
print("Centers")
import matplotlib.pyplot as plt

# Scatter plot of the transformed data without cluster colouring.
dfTransformed.plot(x="x", y="y", kind="scatter")
plt.savefig("transformedData.png")

# One clustered scatter plot per cluster count.
for n in (5, 10, 20, 50):
    # Rebuild the frame each round so the previous 'cluster' column is gone
    # before the model is fitted.
    dfTransformed = pd.DataFrame(pointsTransformed, columns=["x", "y"])
    kmeans = KMeans(n_clusters=n).fit(dfTransformed)
    print(kmeans.cluster_centers_)
    dfTransformed['cluster'] = kmeans.predict(dfTransformed)
    dfTransformed.plot(x="x",
                       y="y",
                       c="cluster",
                       kind="scatter",
                       colormap='summer')
    plt.savefig("withClusters-" + str(n) + ".png")