def test_kmodes_predict_soybean(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
    kmodes_cao = kmodes_cao.fit(SOYBEAN)
    result = kmodes_cao.predict(SOYBEAN2)
    expected = np.array([2, 1, 3, 0])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint16))
def test_kmodes_predict_soybean_ng(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2,
                        cat_dissim=ng_dissim)
    kmodes_cao = kmodes_cao.fit(SOYBEAN)
    result = kmodes_cao.predict(SOYBEAN2)
    expected = np.array([2, 1, 3, 0])
    assert_cluster_splits_equal(result, expected)
    self.assertTrue(result.dtype == np.dtype(np.uint8))
# print(housing_binary)
# print(len(housing_binary))
# print(u_housing[i_housing])
# print(len(u_housing[i_housing]))

# LIFT
km = KModes(n_clusters=5)
# kmeans = KMeans(n_clusters=5)

X = np.vstack((i_plaintiff, i_judgment_type, i_judgment_method, d_a, g_num))
X = np.transpose(X)
X = np.hstack((X, low))
# print(X)

km.fit(X)
y_km = km.predict(X)
# print(X[0, :])
# print(X[1, :])

plt.scatter(latitude, longitude, c=y_km, s=50, cmap='winter')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.show()
# print(y_km)

units = np.array(df['units'])
number = np.nonzero(units)
print(number)
number = np.array(number)
units_final = units[number]
def fit(qual_id, count):
    # More than 500 documents results in slow training
    if count >= 500:
        count = 500

    # Query for the passed qual_id with incorrect answers.
    # May take a lengthy amount of time. Recommend optimizing the query.
    # print("Querying for", qual_id)
    data = collection.find({"qual_id": qual_id, "correct": False})[:count]
    # print("Query complete.")

    # Compile a dictionary of all possible features in the given list of records
    # print("Compiling dictionary of features.")
    features = {}
    for doc in data:
        doc_features = {}
        if doc['response'] is None:
            continue
        doc_features = retrieveKeys(doc['response'], doc_features)
        features = mergeFeatures(doc_features, features)
    # print("Feature compilation complete.")

    # Count the number of features
    length = countFeatures(features)
    if length == 0:
        return

    # Reuse queried documents.
    data = data.rewind()

    # Append missing features to all records and assign a common benign value.
    # The current benign value is an empty string.
    # print("Appending features to documents.")
    student_data = np.array([])
    for doc in data:
        if doc['response'] is None:
            continue
        else:
            temp = np.array([])
            temp = addFeatures(features, temp, doc['response'])
            if len(student_data) == 0:
                student_data = np.append(student_data, temp)
                student_data = np.reshape(student_data, (-1, length))
            else:
                student_data = np.append(student_data, [temp], axis=0)
    # print("Finished appending features to documents.")

    # Perform k-modes clustering
    # print("Clustering...")
    clusters = len(student_data)
    # This k-modes implementation can't generate more than 255 centroids
    if clusters > 255:
        clusters = 255
    km = KModes(n_clusters=clusters, init='Cao', n_init=4, verbose=False)
    # print("Finished.")
    km.fit(student_data)

    # Important information from the clustering:
    # Centroids are the values common to each cluster
    centroids = km.cluster_centroids_
    # print("Centroids")
    # print(centroids)

    # Labels is a list indicating which cluster each record belongs to
    labels = km.labels_
    # print("Labels")
    # print(labels)

    # Cost is a value indicating possible error in the clusters. The ideal value
    # is 0.0. If the value is greater than 0.0, then the maximum number of
    # clusters was generated and some responses were assigned to an inexact
    # cluster. This would result in the largest cluster containing documents it
    # shouldn't. Recommend re-clustering with fewer documents or more clusters
    # if possible.
    cost = km.cost_
    # print("Cost")
    # print(cost)

    # The 5 largest cluster labels and the number of records per cluster.
    most_common = Counter(labels).most_common(5)
    # print("Most populated centroids")
    # print(most_common)

    # Generate the cluster dictionary to be inserted into centroid_db.
    # Qual_id: qual_id of the given documents.
    # Features: dictionary of all possible features in the passed documents.
    # Centroids: list of generated centroids.
    # Cluster_sizes: number of documents in each cluster.
    # Behavioral_traits: behavioral traits associated with at least one
    #   document assigned to the given centroid.
    # Screenshot_urls: a screenshot from one document within each cluster.
    # Centroids and behavioral_traits have the same length: the behavioral
    # traits at a given index of behavioral_traits are associated with the
    # same index of centroids.
    post = {
        'qual_id': qual_id,
        'features': features,
        'centroids': {},
        'cluster_sizes': {},
        'behavioral_traits': {},
        'screenshot_urls': {}
    }

    for i in Counter(labels).most_common(len(centroids)):
        if str(i[0]) not in post['cluster_sizes']:
            post['cluster_sizes'][str(i[0])] = str(i[1])

    for i in range(len(centroids.tolist())):
        if str(i) not in post['centroids']:
            post['centroids'][str(i)] = centroids.tolist()[i]

    # Reuse queried documents.
    data = data.rewind()

    label = 0
    for doc in data:
        if doc['response'] is None:
            continue
        elif str(labels[label]) not in post['screenshot_urls']:
            post['screenshot_urls'][str(labels[label])] = doc['screenshot_url']
            label += 1
        else:
            label += 1

    # Reuse queried documents.
    data = data.rewind()

    # Add associated behavioral traits to the cluster dictionary.
    for doc in data:
        if doc['response'] is None:
            continue
        else:
            temp = np.array([])
            temp = addFeatures(features, temp, doc['response'])
            temp = np.reshape(temp, (-1, length))
            label = km.predict(temp)[0]
            if str(label) not in post['behavioral_traits']:
                post['behavioral_traits'][str(label)] = doc['behavioral_traits']

    # Add the generated cluster dictionary to centroid_db.
    # If a record shares the same qual_id as the generated cluster dictionary,
    # then the stored record is overwritten.
    # print("Posting centroids to database centroids.")
    centroid_db.replace_one({'qual_id': qual_id}, post, upsert=True)
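# A hypothetical consumer of the document posted above (not part of the
# original code): fetch the stored cluster dictionary for a qual_id and
# report its largest cluster. The helper name `largest_cluster` and the use
# of pymongo's find_one are illustrative assumptions only.
def largest_cluster(qual_id):
    stored = centroid_db.find_one({'qual_id': qual_id})
    if stored is None:
        return None
    # cluster_sizes maps str(label) -> str(count), so compare as integers
    label, size = max(stored['cluster_sizes'].items(), key=lambda kv: int(kv[1]))
    return {
        'label': label,
        'size': int(size),
        'centroid': stored['centroids'].get(label),
        'behavioral_traits': stored['behavioral_traits'].get(label),
        'screenshot_url': stored['screenshot_urls'].get(label),
    }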
def test_kmodes_predict_unfitted(self):
    kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
    with self.assertRaises(AssertionError):
        kmodes_cao.predict(SOYBEAN)
    with self.assertRaises(AttributeError):
        kmodes_cao.cluster_centroids_
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame]]:
    if X is None:
        return []

    # check the datatype of the user-defined settings
    if not isinstance(include_columns, list):
        raise ValueError("Variable: 'include_columns' should be <list>")
    if not isinstance(ignore_columns, list):
        raise ValueError("Variable: 'ignore_columns' should be <list>")
    if not isinstance(num_clusters, int):
        raise ValueError("Variable: 'num_clusters' should be <int>")

    ## validate user inputs and override the columns given by the user
    features = list(X.names)
    if len(include_columns) > 0:
        for _ in include_columns:
            if _ not in list(X.names):
                raise ValueError("Column: '" + str(_) + "' is not present in the dataset")
        features = include_columns

    ## drop specific columns ignored by the user
    features = [_f for _f in features if _f not in ignore_columns]

    ## handle columns with missing values
    ignore_ = []
    X_df = X.to_pandas()
    for col in features:
        # label encode categorical columns
        # refer - https://github.com/h2oai/driverlessai-recipes/pull/68#discussion_r365133392
        if X_df[col].dtype == "object":
            X_df[f"{col}_enc"] = LabelEncoder().fit_transform(X_df[col].to_numpy())
            ignore_.append(col)
        miss_percent = X_df[col].isna().sum() / X_df.shape[0]
        if miss_percent >= 0.3:
            # ignore columns having more than 30% missing values
            ignore_.append(col)
        elif miss_percent > 0.0:
            # impute by mean for other columns with missing values
            X_df[col] = X_df[col].fillna(X_df[col].mean())

    features = [f for f in features if f not in ignore_]
    features += [_f for _f in X_df.columns if "_enc" in _f]
    if len(features) == 0:
        raise ValueError("Unable to cluster: No useful features available")

    X_clust = X_df[features].values

    # Apply min-max scaling
    X_clust = MinMaxScaler().fit_transform(X_clust)

    # Go through possible numbers of clusters
    best_score = None
    best_n_clust = None
    best_clust_ids = None

    ## if the number of clusters is pre-defined by the user, don't search for the optimum
    if num_clusters > 1:
        model = KModes(n_clusters=num_clusters, n_jobs=NUM_JOBS).fit(X_clust)
        clust_ids = model.predict(X_clust).astype(np.int32)
        score = my_davies_bouldin_score(X_clust, clust_ids)
        best_score = score
        best_n_clust = num_clusters
        best_clust_ids = clust_ids
    else:
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS, CLUSTER_STEP_SIZE):
            model = KModes(n_clusters=n_clusters, n_jobs=NUM_JOBS).fit(X_clust)
            clust_ids = model.predict(X_clust).astype(np.int32)
            score = my_davies_bouldin_score(X_clust, clust_ids)

            improve = False
            if best_score is None:
                improve = True
            elif best_score > score:
                improve = True

            if improve:
                best_score = score
                best_n_clust = n_clusters
                best_clust_ids = clust_ids

    if best_score is None:
        return []
    else:
        X[:, f'kmodes{best_n_clust}'] = dt.Frame(best_clust_ids)

    return X
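# `my_davies_bouldin_score` is referenced above but not defined in this
# excerpt. A minimal sketch of what such a helper might look like, assuming
# it simply wraps scikit-learn's davies_bouldin_score with a guard against
# degenerate labelings (lower is better, which matches the
# `best_score > score` comparison in the recipe):
from sklearn.metrics import davies_bouldin_score


def my_davies_bouldin_score(X, labels):
    # The score is undefined when fewer than two clusters are present.
    if len(np.unique(labels)) < 2:
        return float("inf")
    return davies_bouldin_score(X, labels)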
def fit(qual_id, count):
    # More than 500 documents results in slow training
    if count >= 500:
        count = 500

    # Query for the passed qual_id with incorrect answers.
    # May take a lengthy amount of time. Recommend optimizing the query.
    if FLAG_VERBOSE:
        print("Querying for", qual_id)
    data = collection.find({"qual_id": qual_id, "correct": False})[:count]
    if FLAG_VERBOSE:
        print("Query complete.")

    # Compile a dictionary of all possible features in the given list of records
    if FLAG_VERBOSE:
        print("Compiling dictionary of features.")
    num_examples = 0
    num_empty = 0
    features = {}
    for doc in data:
        doc_features = {}
        if doc['response'] is None:
            num_empty += 1
            continue
        doc_features = retrieveKeys(doc['response'], doc_features)
        features = mergeFeatures(doc_features, features, "")
        num_examples += 1
    if FLAG_VERBOSE:
        print("Feature compilation complete.")

    # Count the number of features
    num_features = countFeatures(features)
    if FLAG_VERBOSE:
        print("*** Number of features: {}".format(num_features))
        print("*** Number of non-empty records for [Q_ID:{}]: {}. (dropped {} with empty resp)"
              .format(qual_id, num_examples, num_empty))
    if num_features == 0:
        return

    # Reuse queried documents.
    data = data.rewind()

    # Append missing features to all records and assign a common benign value.
    # The current benign value is an empty string.
    # print("Appending features to documents.")
    # Faster to create a zeroed np array first, rather than appending.
    student_data = np.zeros((num_examples, num_features), dtype='<U32')
    i = 0
    for doc in data:
        if doc['response'] is None:
            continue
        else:
            temp = addFeatures(features, [], doc['response'])
            student_data[i, :] = temp
            i += 1
    if FLAG_VERBOSE:
        print("Finished appending features to documents.")
        print(student_data)

    # print("*** Features: ***")
    # pprint(interpretFeatures(features, []))
    # print feature vectors
    # print("*** FEATURE VECTOR: ***")
    # i = 0
    # for row in student_data:
    #     print("[{}]: {}".format(i, row))
    #     i += 1
    # print(repr(student_data))

    # Perform k-modes clustering
    print("Clustering...")
    clusters = NUM_CLUSTERS
    # This k-modes implementation can't generate more than 255 centroids
    if clusters > 255:
        clusters = 255
    if clusters > len(student_data):
        clusters = len(student_data)
    km = KModes(n_clusters=clusters, init='Cao', n_init=4, verbose=False)
    km.fit(student_data)
    print("Finished.")

    # Important information from the clustering:
    # Centroids are the values common to each cluster
    centroids = km.cluster_centroids_
    if FLAG_VERBOSE:
        print("*** CENTROIDS: ***")
        print(centroids)

    # Labels is a list indicating which cluster each record belongs to
    labels = km.labels_
    if FLAG_VERBOSE:
        print("*** LABELS: ***")
        print(labels)

    # Cost is a value indicating possible error in the clusters. The ideal value is 0.0.
    if FLAG_VERBOSE:
        cost = km.cost_
        print("*** COST: ***")
        print(cost)

    # Print the 5 largest cluster labels and the number of records per cluster.
    if FLAG_VERBOSE:
        most_common = Counter(labels).most_common(5)
        print("Most populated centroids")
        print(most_common)

    # Generate the cluster dictionary to be inserted into centroid_db.
    # Qual_id: qual_id of the given documents.
    # Features: dictionary of all possible features in the passed documents.
    # Centroids: list of generated centroids.
    # Behavioral_traits: behavioral traits associated with at least one
    #   document assigned to the given centroid.
    # Centroids and behavioral_traits have the same length: the behavioral
    # traits at a given index of behavioral_traits are associated with the
    # same index of centroids.
    if FLAG_USE_CENTROID_DB:
        post = {
            'qual_id': qual_id,
            'features': features,
            'centroids': centroids.tolist(),
            'behavioral_traits': {}
        }

        # Reuse queried documents.
        data = data.rewind()

        # Add associated behavioral traits to the cluster dictionary.
        for doc in data:
            if doc['response'] is None:
                continue
            else:
                temp = np.array([])
                temp = addFeatures(features, temp, doc['response'])
                temp = np.reshape(temp, (-1, num_features))
                label = km.predict(temp)[0]
                if str(label) not in post['behavioral_traits']:
                    post['behavioral_traits'][str(label)] = doc['behavioral_traits']

        # Add the generated cluster dictionary to centroid_db.
        # If a record shares the same qual_id as the generated cluster dictionary,
        # then the stored record is overwritten.
        print("Posting centroids to database centroids.")
        centroid_db.replace_one({'qual_id': qual_id}, post, upsert=True)
        print(qual_id, "complete.")
        print()

    if FLAG_DO_ANALYSIS:
        # Perform some automatic EDA on the largest clusters and save the results.
        # Collect the ids of the examples.
        data = data.rewind()
        X_ids = []
        for doc in data:
            if doc['response'] is None:
                continue
            else:
                X_ids.append(doc['_id'])

        out_dir = ANALYS_OUT_DIR
        if out_dir is None:
            out_dir = "./out/" + str(qual_id)
        analys = cluster_analyzer(collection, out_dir)
        analys.analyze(student_data, labels, centroids, X_ids, qual_id,
                       interpretFeatures(features, []))
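# A hypothetical driver for the fit() routine above (not part of the
# original): assuming `collection` is a pymongo collection, cluster every
# distinct qual_id that has at least one incorrect response.
if __name__ == "__main__":
    for qid in collection.distinct("qual_id"):
        n_docs = collection.count_documents({"qual_id": qid, "correct": False})
        if n_docs > 0:
            fit(qid, n_docs)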
print()

## e)
dataFrame['Type'] = dataFrame['Type'].astype('category')
dataFrame['Origin'] = dataFrame['Origin'].astype('category')
dataFrame['DriveTrain'] = dataFrame['DriveTrain'].astype('category')
dataFrame['Cylinders'] = dataFrame['Cylinders'].astype('category')

cat_col = dataFrame.select_dtypes(['category']).columns
df = dataFrame[cat_col].apply(lambda x: x.cat.codes)

km = KModes(n_clusters=3, init='Huang', random_state=555)
clusters = km.fit(df)
cents = km.cluster_centroids_

predict_results = km.predict(df)
unique, counts = np.unique(predict_results, return_counts=True)
num_obs_in_each_cluster = dict(zip(unique, counts))


def showResult(i):
    print("The number of observations in cluster 1: %d" % num_obs_in_each_cluster[i])
    print("The number of observations in cluster 2: %d" % num_obs_in_each_cluster[i + 1])
    print("The number of observations in cluster 3: %d" % num_obs_in_each_cluster[i + 2])


for x in range(0, 1):
    showResult(x)
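# A hedged follow-up (not in the original): the model above was fit on
# integer category codes, so the centroids in `cents` are codes as well.
# Assuming pandas is imported as pd and no column contains missing
# categories (code -1), the codes can be mapped back to the original
# labels for easier interpretation.
centroid_df = pd.DataFrame(cents, columns=cat_col)
for col in cat_col:
    categories = dataFrame[col].cat.categories
    centroid_df[col] = centroid_df[col].astype(int).map(lambda code: categories[code])
print(centroid_df)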
# train['total_miss_square'] = train['total_miss']**2
# test['total_miss_square'] = test['total_miss']**2

# for n in bincol + normcol04 + normcol59 + monthday:
##     _, test[n+'_freq'] = FreqEncode(train[n], test[n])
#     _, test[n+'_target'] = TargetEncode(train[n], test[n], target)
##     train[n+'_miss'] = train[n].isna()
##     test[n+'_miss'] = test[n].isna()

# te = ce.TargetEncoder(smoothing=0.3)
# te.fit(train, target)
# test = te.transform(test)

# ================== k-modes clustering ==================
from kmodes.kmodes import KModes

km = KModes(n_clusters=2, init="Cao", n_init=1, verbose=1, random_state=1990)
train['cluster'] = km.fit_predict(train_cluster)
test['cluster'] = km.predict(test_cluster)

## ========== test independence ==========
# import scipy.stats as scs
#
# def chi_square_of_df_cols(df, col1, col2):
#     df_col1, df_col2 = df[col1], df[col2]
#
#     result = [[sum((df_col1 == cat1) & (df_col2 == cat2))
#                for cat2 in df_col2.unique()]
#               for cat1 in df_col1.unique()]
#
#     return scs.chi2_contingency(result)
#
# chi_matrix = np.zeros([len(train_cluster.columns), len(train_cluster.columns)])
# for i, r in enumerate(train_cluster.columns):
#     for j, c in enumerate(train_cluster.columns):
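# `train_cluster` / `test_cluster` are used above but not built in this
# excerpt. A purely hypothetical sketch of how they might have been
# assembled beforehand, assuming the same categorical column lists
# referenced in the commented-out encoding loop (bincol, normcol04,
# normcol59, monthday) and treating missing values as their own category:
cluster_cols = bincol + normcol04 + normcol59 + monthday
train_cluster = train[cluster_cols].fillna('missing').astype(str)
test_cluster = test[cluster_cols].fillna('missing').astype(str)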
# wcss = []
# for i in range(1, 30):
#     kmodes = KModes(n_clusters=i, init='Huang', n_init=5, verbose=1)
#     kmodes.fit(data1)
#     wcss.append(kmodes.cluster_centroids_)
# plt.plot(range(1, 30), wcss)
# plt.title("The elbow method")
# plt.xlabel("The number of clusters")
# plt.ylabel("WCSS")
# plt.show()
# wcss

"""**K-modes Model Creation and Prediction**"""

km = KModes(n_clusters=23, init='Huang', n_init=5, verbose=1)
km = km.fit(data1)
clusters = km.predict(data1)

# Print the cluster centroids
print(km.cluster_centroids_)

"""**Storing My Prediction to a CSV File**"""

k = pd.DataFrame()
k['output'] = clusters
k.to_csv("outpt.csv")
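# The commented-out elbow-method loop above appends `cluster_centroids_`,
# which is an array rather than a scalar and cannot be plotted as "WCSS".
# A minimal sketch of the same idea using `km.cost_` (the sum of
# dissimilarities to the closest centroid, the k-modes analogue of WCSS),
# assuming `data1` and matplotlib's `plt` as above:
costs = []
for k_clusters in range(1, 30):
    elbow_km = KModes(n_clusters=k_clusters, init='Huang', n_init=5, verbose=0)
    elbow_km.fit(data1)
    costs.append(elbow_km.cost_)

plt.plot(range(1, 30), costs)
plt.title("The elbow method")
plt.xlabel("Number of clusters")
plt.ylabel("Cost")
plt.show()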