def test_kprotoypes_huang_stocks(self):
    """Cluster STOCKS with 'Huang' initialisation and check the resulting split.

    Also checks that ``predict`` on an unfitted model raises
    ``AssertionError`` and that the returned labels have dtype ``uint8``.
    """
    # No random_state is passed to the estimator, so the global NumPy RNG is
    # seeded to keep the run reproducible.
    np.random.seed(42)
    kproto_huang = kprototypes.KPrototypes(n_clusters=4, n_init=1,
                                           init='Huang', verbose=2)
    # Untrained model
    with self.assertRaises(AssertionError):
        kproto_huang.predict(STOCKS, categorical=[1, 2])
    result = kproto_huang.fit_predict(STOCKS, categorical=[1, 2])
    expected = np.array([0, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1])
    assert_cluster_splits_equal(result, expected)
    # assertEqual reports both dtypes on failure; assertTrue(a == b) only
    # reports "False is not true".
    self.assertEqual(result.dtype, np.dtype(np.uint8))
def test_kprotoypes_impossible_init(self):
    """Requesting six clusters from data with only two distinct rows must raise ValueError."""
    # Three identical 'Regular' rows followed by three identical 'Slim' rows;
    # the numeric column is constant.
    rows = [[0., 'Regular']] * 3 + [[0., 'Slim']] * 3
    data = np.array(rows)
    np.random.seed(42)
    model = kprototypes.KPrototypes(n_clusters=6, init='Cao', verbose=2)
    self.assertRaises(ValueError, model.fit_predict, data, categorical=[1])
def test_kmodes_fit_predict_equality(self):
    """Check that ``fit_predict`` gives the same split as ``fit`` followed by ``predict``."""
    weights = [0.5] * STOCKS.shape[0]
    estimator = kprototypes.KPrototypes(n_clusters=3, init='Cao',
                                        random_state=42)
    # Two-step route: fit with weights, then predict on the same data.
    fitted = estimator.fit(STOCKS, categorical=[1, 2], sample_weight=weights)
    two_step_labels = fitted.predict(STOCKS, categorical=[1, 2])
    # One-step route on the same estimator object.
    one_step_labels = estimator.fit_predict(STOCKS, categorical=[1, 2],
                                            sample_weight=weights)
    assert_cluster_splits_equal(two_step_labels, one_step_labels)
def test_k_prototypes_sample_weight_all_but_one_zero(self):
    """Check that the single centroid equals the only sample with non-zero weight."""
    n_samples = 2
    kproto = kprototypes.KPrototypes(n_clusters=1, init='Cao',
                                     random_state=42)
    for weighted_row in range(n_samples):
        # All weight goes to exactly one of the first ``n_samples`` rows.
        weights = np.zeros(n_samples)
        weights[weighted_row] = 1
        fitted = kproto.fit(STOCKS[:n_samples, :], categorical=[1, 2],
                            sample_weight=weights)
        centroid = fitted.cluster_centroids_[0, :]
        np.testing.assert_array_equal(centroid, STOCKS[weighted_row, :])
def test_kprotoypes_init_stocks_ng(self):
    """Cluster STOCKS from explicit initial centroids using ``ng_dissim``.

    ``init_vals`` holds the numerical centroid values first and the
    categorical centroid values second. The resulting split is compared
    against the expected one and the label dtype is checked to be ``uint16``.
    """
    init_vals = [
        np.array([[356.975], [275.35], [738.5], [197.667]]),
        np.array([[3, 2], [0, 2], [3, 2], [2, 2]])
    ]
    kproto_init = kprototypes.KPrototypes(n_clusters=4, init=init_vals,
                                          verbose=2, cat_dissim=ng_dissim,
                                          random_state=42)
    result = kproto_init.fit_predict(STOCKS, categorical=[1, 2])
    expected = np.array([2, 0, 0, 0, 0, 1, 1, 1, 1, 3, 3, 3])
    assert_cluster_splits_equal(result, expected)
    # assertEqual shows the actual and expected dtype when the check fails.
    self.assertEqual(result.dtype, np.dtype(np.uint16))
def test_kprotoypes_missings(self):
    """A NaN in the numerical initial centroids must make fitting raise ValueError."""
    # np.nan is used instead of the np.NaN alias, which was removed in
    # NumPy 2.0; np.nan works on every NumPy version.
    init_vals = [
        np.array([[356.975], [275.35], [738.5], [np.nan]]),
        np.array([[3, 2], [0, 2], [3, 2], [2, 2]])
    ]
    kproto_init = kprototypes.KPrototypes(n_clusters=4, init=init_vals,
                                          verbose=2)
    with self.assertRaises(ValueError):
        kproto_init.fit_predict(STOCKS, categorical=[1, 2])
def test_kprotoypes_nunique_nclusters(self):
    """With two distinct rows and six requested clusters, only two centroids must remain."""
    rows = [[0., 'Regular']] * 3 + [[1., 'Slim']] * 3
    data = np.array(rows)
    model = kprototypes.KPrototypes(n_clusters=6, init='Cao', verbose=2,
                                    random_state=42)
    model.fit_predict(data, categorical=[1])
    # Check if there are only 2 clusters, in both centroid parts.
    for part in (0, 1):
        self.assertEqual(model.cluster_centroids_[part].shape, (2, 1))
def test_kprotoypes_init_stocks(self):
    """Check explicit initial centroids: wrong order is rejected, right order clusters as expected.

    First passes the categorical centroids before the numerical ones and
    expects ``AssertionError``. Then passes them in numerical/categorical
    order and checks the exact labels and their ``uint8`` dtype.
    """
    # Wrong order
    init_vals = [
        np.array([[3, 2], [0, 2], [3, 2], [2, 2]]),
        np.array([[356.975], [275.35], [738.5], [197.667]])
    ]
    kproto_init = kprototypes.KPrototypes(n_clusters=4, init=init_vals,
                                          verbose=2)
    with self.assertRaises(AssertionError):
        kproto_init.fit_predict(STOCKS, categorical=[1, 2])

    init_vals = [
        np.array([[356.975], [275.35], [738.5], [197.667]]),
        np.array([[3, 2], [0, 2], [3, 2], [2, 2]])
    ]
    np.random.seed(42)
    kproto_init = kprototypes.KPrototypes(n_clusters=4, init=init_vals,
                                          verbose=2)
    result = kproto_init.fit_predict(STOCKS, categorical=[1, 2])
    expected = np.array([2, 0, 0, 0, 0, 1, 1, 1, 1, 3, 3, 3])
    np.testing.assert_array_equal(result, expected)
    # assertEqual shows the actual and expected dtype when the check fails.
    self.assertEqual(result.dtype, np.dtype(np.uint8))
def run_kproto(X, cat_cols, init_method='Cao', n_clusters=4):
    '''
    Fit a k-prototypes model on ``X`` and return it with the cluster labels.

    The model is always built with ``n_init=10``, ``max_iter=5`` and
    ``verbose=2``.

    :param X: prepared array to cluster
    :param cat_cols: index positions of the categorical columns
    :param init_method: value passed as ``init`` to the model, default = 'Cao'
    :param n_clusters: number of clusters to form, default = 4
    :returns: tuple of (k-prototypes model, array of labels)
    '''
    settings = dict(n_clusters=n_clusters, init=init_method, n_init=10,
                    max_iter=5, verbose=2)
    model = kprototypes.KPrototypes(**settings)
    assignments = model.fit_predict(X, categorical=cat_cols)
    return model, assignments
def users_clustering(users_ids, users_bio, users_tweet, my_data,
                     n_clusters=3600):
    """Build a per-user feature matrix, cluster it and persist the result.

    Each row consists of ten profile-derived features followed by the user's
    bio vector and tweet vector. Columns 0, 1, 3 and 6 (the 0/1 flags
    ``profile_background_tile``, ``profile_use_background_image``,
    ``verified`` and ``has_extended_profile``) are treated as categorical.

    Side effects: writes ``results1_text.p`` (pickled labels),
    ``results11_text.p`` (pickled row-index -> user-id mapping) and
    ``results1_text.txt`` (one label per line) to the working directory.

    :param users_ids: sequence of user ids, used as keys into ``my_data``
    :param users_bio: per-user bio vectors, indexed like ``users_ids``;
        each item must provide ``.tolist()``
    :param users_tweet: per-user tweet vectors, indexed like ``users_ids``;
        each item must provide ``.tolist()``
    :param my_data: mapping user id -> dict with ``'profile_features'`` and
        ``'cascades_feature'`` entries
    :param n_clusters: number of clusters to form, default = 3600
    :returns: None
    """
    users_dataset = []
    data_set = {}
    for i, user in enumerate(users_ids):
        # Look the nested dict up once instead of once per feature.
        profile = my_data[user]['profile_features']
        users_dataset.append([
            1 if profile['profile_background_tile'] else 0,
            1 if profile['profile_use_background_image'] else 0,
            len(profile['screen_name']),
            1 if profile['verified'] else 0,
            profile['statuses_count'],
            profile['favourites_count'],
            1 if profile['has_extended_profile'] else 0,
            profile['friends_count'],
            profile['followers_count'],
            len(my_data[user]['cascades_feature']),
        ] + users_bio[i].tolist() + users_tweet[i].tolist())
        # Remember which user each matrix row belongs to.
        data_set[i] = user
    logging.info("making data matrix finished.")
    users_dataset = np.array(users_dataset)
    logging.info('data set created')
    kproto_init = kprototypes.KPrototypes(n_clusters=n_clusters, init="Huang",
                                          verbose=2, n_init=1)
    logging.info('go for learning clusters')
    result = kproto_init.fit_predict(users_dataset, categorical=[0, 1, 3, 6])
    logging.info("model fit-predict result:{0}".format(result))
    # Context managers guarantee the files are flushed and closed.
    with open('results1_text.p', 'wb') as labels_file:
        pickle.dump(result, labels_file)
    with open('results11_text.p', 'wb') as mapping_file:
        pickle.dump(data_set, mapping_file)
    with open('results1_text.txt', 'w') as f:
        # One label per line. Joining str(result) directly would put every
        # single character of the array's repr on its own line.
        f.write("\n".join(str(label) for label in result))
def test_kprotoypes_not_stuck_initialization(self):
    """Fitting data with few distinct rows and missing categories must finish.

    The test passes when ``fit`` returns and the model exposes
    ``cluster_centroids_``.
    """
    # np.nan is used instead of the np.NaN alias, which was removed in
    # NumPy 2.0; np.nan works on every NumPy version.
    init_problem = np.array([
        [0, 'Regular'],
        [0, 'Regular'],
        [0, 'Regular'],
        [0, np.nan],
        [-0.5, 'Regular'],
        [-0.5, 'Regular'],
        [0, np.nan],
        [0, 'Regular'],
        [0, 'Regular'],
        [0, 'Slim'],
        [0, 'Regular'],
        [0, 'Regular'],
        [0.5, 'Regular'],
        [-0.5, 'Regular'],
        [0.5, 'Regular'],
        [0.5, 'Slim'],
        [0, 'Regular'],
        [0.5, 'Regular'],
        [0, 'Regular'],
        [-0.5, 'Regular'],
        [0, np.nan],
        [0, np.nan],
        [0, 'Regular'],
        [0, 'Regular'],
        [0, 'Regular'],
    ])
    kproto_cao = kprototypes.KPrototypes(n_clusters=6, init='Cao', verbose=2,
                                         random_state=42)
    kproto_cao = kproto_cao.fit(init_problem, categorical=[1])
    self.assertTrue(hasattr(kproto_cao, 'cluster_centroids_'))
def k_prototype(self, clust_num, clustees):
    """Cluster ``self.data`` with k-prototypes and return the labels.

    Columns 4 and 21 are passed to ``self.get_cat_cols`` as the numerical
    columns; the indices it returns are used as the categorical columns.
    ``self.data`` is replaced by the output of ``self.convert_col_type``.
    Centroids, cost and iteration count are printed after fitting.

    :param clust_num: number of clusters to form
    :param clustees: not used by this method
    :returns: cluster labels returned by ``fit_predict``
    """
    print("Starting k-prototypes clustering...")
    numeric_columns = [4, 21]  # age, renta
    categorical_idx = self.get_cat_cols(self.data, numeric_columns)
    self.data = self.convert_col_type(self.data, categorical_idx)

    model = kprototypes.KPrototypes(n_clusters=clust_num, init='Cao',
                                    verbose=2)
    assignments = model.fit_predict(self.data.values,
                                    categorical=categorical_idx)

    print("cluster centroids of the trained model.")
    print(model.cluster_centroids_)
    print("training statistics")
    print(model.cost_)
    print(model.n_iter_)
    return assignments
def cluster_from_pickle(number_of_clusters=3600):
    """Cluster pickled user feature vectors and pickle them grouped by cluster.

    Loads ``users_feature.p`` from ``ROOT_DIR``, clusters its values with
    k-prototypes (columns 0, 1, 3 and 6 categorical) and writes a dict
    ``label -> list of feature vectors`` to ``users_vectprs_clustering.p``
    in the working directory.

    :param number_of_clusters: number of clusters to form, default = 3600
    :returns: None
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # with a users_feature.p file that comes from a trusted source.
    with open(os.path.join(ROOT_DIR, 'users_feature.p'), 'rb') as features_file:
        user_features = pickle.load(features_file)
    users_features_vectors = list(user_features.values())
    users_dataset = np.array(users_features_vectors)
    print(users_dataset[1])
    kproto_init = kprototypes.KPrototypes(n_clusters=number_of_clusters,
                                          init="Huang", verbose=2, n_init=1)
    result = kproto_init.fit_predict(users_dataset, categorical=[0, 1, 3, 6])

    # Group the original (pre-array) vectors by their assigned label.
    clustering_result = {}
    for label, vector in zip(result, users_features_vectors):
        clustering_result.setdefault(label, []).append(vector)

    # The context manager closes the file, so the pickle is fully flushed.
    with open('users_vectprs_clustering.p', 'wb') as file_to_write:
        pickle.dump(clustering_result, file_to_write)
def test_kprototypes_sample_weights_validation(self):
    """Invalid ``sample_weight`` inputs must raise ValueError with the matching message."""
    kproto = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2)
    # (weights, expected error pattern) pairs, checked in this order.
    invalid_cases = (
        # Only 11 weights; the other two cases supply 12.
        ([1] * 11,
         "sample_weight should be of equal size as samples."),
        # One negative weight.
        ([-1] + [1] * 11,
         "sample_weight elements should be positive."),
        # One non-numerical weight.
        ([None] + [1] * 11,
         "sample_weight elements should either be int or floats."),
    )
    for weights, pattern in invalid_cases:
        with self.assertRaisesRegex(ValueError, pattern):
            kproto.fit_predict(STOCKS, categorical=[1, 2],
                               sample_weight=weights)
def kprototype(filename, num_clusters, num_data=3000):
    """Cluster a CSV file with k-prototypes and print a summary.

    Columns 0-9 are used as categorical features and columns 11 onward as
    numerical features (L2-normalised). Column 10 is not used.

    :param filename: path of the comma-separated input file
    :param num_clusters: number of clusters to form
    :param num_data: exclusive upper row bound; rows ``1 .. num_data - 1`` of
        the file are used. Default = 3000, because the full 30000-entry
        database may take too long to compute.
    :returns: None (labels, per-cluster sample counts, centroids and cost are
        printed)
    """
    # Input data. Row 0 is skipped (presumably a header line -- TODO confirm).
    X_original = np.genfromtxt(filename, dtype=object,
                               delimiter=',')[1:num_data, :]

    # Normalise the continuous variables.
    X_categorical = X_original[:, 0:10]
    X_numerical = normalize(X_original[:, 11:], norm='l2')

    # If there are many continuous variables, PCA can be used to reduce
    # their dimensionality:
    # X_numerical = PCA(n_components=1).fit_transform(X_numerical)

    X = np.concatenate((X_categorical, X_numerical), axis=1)

    # Start training. Per the original note, the default weight is
    # 0.5 * the standard deviation of the continuous values.
    kproto = kprototypes.KPrototypes(n_clusters=num_clusters, init='Cao',
                                     verbose=2)
    kproto.fit_predict(X, categorical=list(range(10)))

    # Each print call gets a single pre-formatted string, so the output is
    # the same whether print is a statement (Python 2) or a function.
    print('\n')

    # Cluster label of every data point.
    print('Labels of each data point: \n{} \n'.format(kproto.labels_))

    # Number of samples in every cluster, counted in one vectorised pass.
    counts = np.bincount(np.asarray(kproto.labels_, dtype=np.intp),
                         minlength=num_clusters)
    for i in range(num_clusters):
        print('numbers of samples in the {} cluster:  {}'.format(i, counts[i]))
    print('\n')

    # Cluster centres.
    print('Clusters: \n{}'.format(kproto.cluster_centroids_))

    # Cost of the objective function.
    print('Cost:  {}'.format(kproto.cost_))
def cluster_data_before_classify(self, data):
    """Cluster ``data`` into 8 groups and hand shuffled per-cluster row indices on.

    The ``'target'`` column is dropped before clustering. Every column from
    position 3 onward is treated as categorical. The row positions of each
    cluster are shuffled in place and the list of index arrays is passed to
    ``self.merge_train_test_data_from_each_cluster``.

    :param data: object with ``drop``, ``columns`` and ``values``
        (presumably a pandas DataFrame -- confirm against the caller)
    :returns: None
    """
    model = kprototypes.KPrototypes(n_clusters=8)
    cluster_data = data.drop(['target'], axis=1)
    categoricals = list(range(3, len(cluster_data.columns)))

    # A KElbowVisualizer (k=(2, 8), metric='silhouette') was previously used
    # here to pick the number of clusters.
    model.fit(cluster_data.values, categorical=categoricals)

    # Row positions belonging to each cluster label.
    indices = [
        np.where(model.labels_ == i)[0] for i in range(model.n_clusters)
    ]
    for cluster_indices in indices:
        random.shuffle(cluster_indices)
    self.merge_train_test_data_from_each_cluster(indices)
0], worker_pca_result[np.where(worker_df['agglo_label'] == j)[0], 1], label=str(j) + ": " + str(len(np.where(worker_df['agglo_label'] == j)[0]))) pl.title("Workers Agglomerative Clustering") pl.legend() pl.show() # kprototypes clustering requester_norm_df = pd.DataFrame( np.hstack((requester_df.iloc[:, 1:2].values, requester_norm_features))) worker_norm_df = pd.DataFrame( np.hstack((worker_df.iloc[:, 1:2].values, worker_norm_features))) for i in range(2, 6): kproto = kp.KPrototypes(n_clusters=i) # cluster requesters data requester_label = kproto.fit_predict(requester_norm_df.iloc[:, 1:].values, categorical=[0]) requester_df['kmeans_' + str(i)] = requester_label pl.figure() for j in range(i): pl.scatter( requester_pca_result[np.where(requester_df["kmeans_" + str(i)] == j)[0], 0], requester_pca_result[np.where(requester_df["kmeans_" + str(i)] == j)[0], 1], label=str(j) + ": " + str(len(np.where(requester_df["kmeans_" + str(i)] == j)[0]))) pl.title("Requesters Clustering result k=" + str(i))
#!/usr/bin/env python
"""Cluster the stocks in ``stocks.csv`` with k-prototypes and print the result.

Column 0 of the file is the symbol, column 1 a numerical value and the
remaining columns are categorical.
"""
import numpy as np
from kmodes import kprototypes

# stocks with their market caps, sectors and countries
# The file is read once as text and split into its parts afterwards.
raw = np.genfromtxt('stocks.csv', dtype=str, delimiter=',')
syms = raw[:, 0]
xnum = np.atleast_2d(raw[:, 1].astype(float)).T
xcat = raw[:, 2:]

# KPrototypes takes a single matrix plus the indices of its categorical
# columns, so the numerical column and the categorical columns are combined
# into one object array.
X = np.empty((raw.shape[0], 1 + xcat.shape[1]), dtype=object)
X[:, :1] = xnum
X[:, 1:] = xcat
categorical = list(range(1, X.shape[1]))

kproto = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2)
clusters = kproto.fit_predict(X, categorical=categorical)

for s, c in zip(syms, clusters):
    print("Symbol: {}, cluster:{}".format(s, c))
import numpy as np
from kmodes import kprototypes

# Fit a 7-cluster k-prototypes model on the values of df_small.
kp = kprototypes.KPrototypes(n_clusters=7, init='Cao', verbose=2)
X = df_small.values
clusters = kp.fit_predict(X, categorical=[1, 2, 3, 7, 8])

# print(...) with a single argument behaves the same as a Python 2 print
# statement and is valid Python 3.
print("Cluster centroids")
print(kp.cluster_centroids_)
print("Training stats")
# The fitted model is `kp`; this snippet never defines `kproto`.
print(kp.cost_)
print(kp.n_iter_)

# getting cost
# Collect the clustering cost for 1..10 clusters.
# NOTE(review): this loop uses a different categorical column list than the
# fit above ([0, 1, 2, 3, 4, 5, 6, 8, 9] vs [1, 2, 3, 7, 8]) -- confirm which
# one matches df_small.
cost = []
for i in range(1, 11):
    kp = kprototypes.KPrototypes(n_clusters=i, init='Cao', verbose=2)
    clusters = kp.fit_predict(X, categorical=[0, 1, 2, 3, 4, 5, 6, 8, 9])
    cost.append(kp.cost_)
def get_silhouette_score(nclust):
    """Return ``silhouette_score`` for a k-prototypes clustering with ``nclust`` clusters.

    Reads the module-level ``scaled`` data and ``categoricals_indicies``.

    :param nclust: first positional argument passed to ``KPrototypes``
    :returns: value returned by ``silhouette_score(scaled, labels)``
    """
    model = kprototypes.KPrototypes(nclust)
    assignments = model.fit_predict(scaled, categorical=categoricals_indicies)
    return silhouette_score(scaled, assignments)
#np.save('X',X)
#np.save('Y',Y)
#%%
# Make training matrix.
#training_matrix = []
#for user in train_data:
#    for artist in train_data[user]:
#        training_matrix.append([user, artist] + train_data[user][artist])
#reg = KMeans(n_clusters = 10, n_init = 3, n_jobs = -1)
#reg.fit(training_matrix[:-1],training_matrix[-1])
#reg.fit(training_matrix)
#reg.fit(training_matrix[:-1],training_matrix[-1])
#%%
# Split the data and fit a k-prototypes model on the training part.
# NOTE(review): test_size=0.9 keeps only 10% of the rows for training --
# confirm this is intended.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.9)
reg = kprototypes.KPrototypes(n_clusters=8, init='Cao')
reg.fit(X_train, y=Y_train, categorical=[0, 1, 2, 4, 5])
#%% Test out vs user mean
# TODO: unfinished. The original had a bare `for i in X_val` header here with
# no colon and no body, which is a SyntaxError; it is kept only as this note
# until the comparison against the user mean is written.
# Write out test solutions.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    # Skip the header row of the test file.
    next(test_csv, None)
    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh, delimiter=',', quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])
def analyze(data):
    """Cluster hosts by IP address and most frequent response status, then log the clusters.

    ``data`` is a JSON string holding a list of records with ``'HOST'`` and
    ``'STATUS'`` keys. For every host, the dotted IP is turned into a number
    (each of the four octets zero-padded to three digits, concatenated and
    divided by 1000) and paired with the host's most frequent status. The
    pairs are clustered into four groups with k-prototypes, with the status
    column as the categorical feature.

    The contents of the smallest and the largest cluster are logged at INFO
    level and the contents of all clusters at DEBUG level.

    :param data: JSON text as described above
    :returns: None
    """
    # Convert this to python data for us to be able to run ML algorithms
    json_to_python = json.loads(data)

    # Data pre-processing: host -> list of its response statuses.
    per_user = dict()
    for record in json_to_python:
        per_user.setdefault(record['HOST'], []).append(record['STATUS'])

    log.debug("*** Printing input contents to the algorithm: ***")

    ###Analysis 1 : (ML): Run K-prototypes algorithm on IP-Response_status feature-set here:
    # Row 0 is a placeholder carried over from the original implementation;
    # it is part of the clustered input, so every host row index is shifted
    # by one.
    # NOTE(review): confirm whether the placeholder should really be clustered.
    rows = [[0.00, '0']]
    for host, statuses in per_user.items():
        # Most frequent status of this host; ties go to the one seen first.
        word_counter = {}
        for word in statuses:
            word_counter[word] = word_counter.get(word, 0) + 1
        max_status = max(word_counter, key=word_counter.get)

        # Zero-pad each of the four octets to three digits and concatenate.
        octets = host.split(".")
        ip = "".join(octets[z].zfill(3) for z in range(4))
        ip_value = float(float(ip) / 1000)
        log.debug(str(ip_value) + ": " + max_status)
        rows.append([ip_value, max_status])
    # Build the array once instead of re-allocating it with np.vstack for
    # every host.
    X = np.array(rows)

    log.info(
        "######******* Analysis #1: K-prototype for IP address-Response status: ******#######"
    )
    ##For k-proto analysis:
    ##Adjust number of clusters here
    kproto = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2)
    result = kproto.fit_predict(X, categorical=1)

    # Group row indices by cluster label; indices stay in ascending order.
    clust_content = dict()
    for X_index, label in enumerate(result):
        clust_content.setdefault(label, []).append(X_index)
    num_clust = {label: len(members)
                 for label, members in clust_content.items()}

    min_index = min(num_clust, key=num_clust.get)
    max_index = max(num_clust, key=num_clust.get)

    log.info("Cluster no. " + str(min_index) + " has the least elements: " +
             str(num_clust[min_index]))
    log.info("Check INFO.log to view its contents!")
    # This message was split across two physical lines inside the string
    # literal, which is a syntax error; it is now a single-line literal.
    log.info(
        "**** Contents of the cluster with minimum number of elements! *****"
    )
    # Prints contents of min cluster
    for row_index in clust_content[min_index]:
        log.info(X[row_index])

    log.info("Cluster no. " + str(max_index) + " has the maximum elements: " +
             str(num_clust[max_index]))
    log.info("Check INFO.log to view its contents!")
    log.info(
        "Check DEBUG.log to view contents of all clusters along with the main input X!"
    )
    log.info(
        "***** Contents of the cluster with maximum number of elements! *****")
    # Prints contents of max cluster
    for row_index in clust_content[max_index]:
        log.info(X[row_index])

    log.debug("***** Contents of all clusters! *****")
    # Prints contents of all clusters
    for k, content_arr in clust_content.items():
        log.debug("***** Contents of cluster #" + str(k) + ": *****")
        log.debug("***** This cluster has " + str(num_clust[k]) +
                  " elements! *****")
        for row_index in content_arr:
            log.debug(X[row_index])
scaled = scaler.fit_transform(dfSessions) # clustering k-prototypes (mixed numeric and categorical features) init = 'Huang' # can be 'Cao', 'Huang' or 'random' n_clusters = 12 max_iter = 100 # 15 # 1556.6108261222275 # 16 # 1435.3049147588504 kproto = kprototypes.KPrototypes(n_clusters=n_clusters, init=init, max_iter=max_iter) # k_prototypes(X, categorical, n_clusters, max_iter, num_dissim, # cat_dissim, gamma, init, n_init, verbose, random_state, n_jobs) # cluster_centroids_ : array, [n_clusters, n_features] # Categories of cluster centroids # labels_ : # Labels of each point # cost_ : float # Clustering cost, defined as the sum distance of all points to # their respective cluster centroids. # n_iter_ : int # The number of iterations the algorithm ran for. # gamma : float # The (potentially calculated) weighing factor.
def test_kprotoypes_no_categoricals(self):
    """An empty ``categorical`` list must make ``fit`` raise NotImplementedError."""
    model = kprototypes.KPrototypes(n_clusters=6, init='Cao', verbose=2,
                                    random_state=42)
    no_categoricals = []
    self.assertRaises(NotImplementedError, model.fit, STOCKS,
                      categorical=no_categoricals)
def test_pickle(self):
    """A default KPrototypes object must keep its class through a pickle round trip."""
    original = kprototypes.KPrototypes()
    restored = pickle.loads(pickle.dumps(original))
    assert_equal(type(restored), original.__class__)
def test_pickle(self):
    """An unfitted KPrototypes object must survive a pickle round trip."""
    obj = kprototypes.KPrototypes()
    serialized = pickle.dumps(obj)
    # assertIsInstance names the offending object and the expected class on
    # failure; assertTrue(isinstance(...)) only reports "False is not true".
    self.assertIsInstance(pickle.loads(serialized), obj.__class__)
def test_kprototypes_unknowninit_soybean(self):
    """An unrecognised ``init`` value must make ``fit`` raise NotImplementedError."""
    model = kprototypes.KPrototypes(n_clusters=4, init='nonsense', verbose=2)
    self.assertRaises(NotImplementedError, model.fit, STOCKS,
                      categorical=[1, 2])
def test_kprotoypes_random_stocks(self):
    """``fit`` with 'random' initialisation must return a KPrototypes instance."""
    model = kprototypes.KPrototypes(n_clusters=4, init='random', verbose=2)
    fitted = model.fit(STOCKS, categorical=[1, 2])
    self.assertIsInstance(fitted, kprototypes.KPrototypes)
def test_kprotoypes_wrong_categorical_type(self):
    """Passing ``categorical`` as a set must raise AssertionError."""
    model = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2)
    categorical_as_set = {1, 2}
    self.assertRaises(AssertionError, model.fit_predict, STOCKS,
                      categorical=categorical_as_set)
def test_pickle_fitted(self):
    """A fitted KPrototypes model must survive a pickle round trip.

    The model is fitted on the first two STOCKS columns, with column 1 as
    the only categorical feature.
    """
    kproto = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2)
    model = kproto.fit(STOCKS[:, :2], categorical=1)
    serialized = pickle.dumps(model)
    # assertIsInstance names the offending object and the expected class on
    # failure; assertTrue(isinstance(...)) only reports "False is not true".
    self.assertIsInstance(pickle.loads(serialized), model.__class__)