# NOTE: `model`, `x_train`, `y_train`, and the feature-name list `labels` are
# assumed to be defined earlier in the script (supervised section).
# Imports used below (they may already appear earlier in the file):
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.inspection import permutation_importance

# fit the model
model.fit(x_train, y_train)

# perform permutation importance
results = permutation_importance(model, x_train, y_train, scoring='accuracy')

# get importance
importance = results.importances_mean
plt.plot(labels, importance, label='permutation k-neighbor')
plt.legend()

### -------- UNSUPERVISED SECTION ----------- ###

# cluster the two best features with k-means (`x_best` / `best_labels` come
# from the feature-selection step; see the sketch after this section)
model = KMeans(n_clusters=2, random_state=0, n_init=50)
idx = model.fit_predict(x_best)
x = x_best[:, 0]
y = x_best[:, 1]
cdict = ['k', 'r']  # one color per class / cluster

# left panel: true labels; right panel: k-means cluster assignments
f, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True)
for g in np.unique(y_train):
    ix = np.where(y_train == g)
    ax1.scatter(x[ix], y[ix], c=cdict[g], label=g, s=8)
ax1.legend()
ax1.title.set_text('Real')
ax1.set_xlabel(best_labels[0])
ax1.set_ylabel(best_labels[1])
for g in np.unique(idx):
    # body reconstructed by analogy with the loop above; the original text
    # was truncated at this point
    ix = np.where(idx == g)
    ax2.scatter(x[ix], y[ix], c=cdict[g], label=g, s=8)
ax2.legend()
ax2.title.set_text('Clustered')
ax2.set_xlabel(best_labels[0])
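# --- Hedged sketch (not in the original script): one plausible way to build
# the `x_best` array and `best_labels` names used above, assuming `x_train`
# is array-like and `labels` lists the feature names. The names `selector`
# and `k_best` are hypothetical, introduced here for illustration.
from sklearn.feature_selection import SelectKBest, f_classif

k_best = 2
selector = SelectKBest(score_func=f_classif, k=k_best)
x_best = selector.fit_transform(x_train, y_train)   # shape: (n_samples, 2)
mask = selector.get_support()                       # boolean mask of kept columns
best_labels = [name for name, keep in zip(labels, mask) if keep]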
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN, MiniBatchKMeans, SpectralClustering
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, adjusted_rand_score,
                             completeness_score, homogeneity_score,
                             silhouette_score, v_measure_score)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer, StandardScaler


def eval_individual_device(train_data_file, dname, specified_models=None):
    """
    Assumptions: train_data_file contains only one device, with all possible
    states (tags); the models can only be one of the implemented algorithms:
    knn, kmeans, spectral, dbscan, random forest classifier (rf).
    """
    global root_feature, root_model, root_output, dir_tsne_plots
    warnings.simplefilter("ignore", category=DeprecationWarning)
    warnings.simplefilter("ignore", category=FutureWarning)

    """
    Skip trained models; return if there is nothing left to train.
    """
    list_all_models = model_list
    if specified_models is not None:
        list_all_models = specified_models
    list_models_todo = []
    for model_alg in list_all_models:
        """
        Prepare the directories and add only models that have not been trained yet.
        """
        model_dir = '%s/%s' % (root_model, model_alg)
        model_file = '%s/%s%s.model' % (model_dir, dname, model_alg)
        label_file = '%s/%s.label.txt' % (model_dir, dname)
        # check that no .model exists yet and that the training data set is available
        if not os.path.exists(model_file) and os.path.exists(train_data_file):
            list_models_todo.append(model_alg)
    if len(list_models_todo) < 1:
        print('skip %s, all models trained for alg: %s' % (dname, str(list_all_models)))
        return

    print('Training %s using algorithm(s): %s' % (dname, str(list_models_todo)))
    train_data = pd.read_csv(train_data_file)
    num_data_points = len(train_data)
    if num_data_points < 1:
        print('  Not enough data points for %s' % dname)
        return
    print('\t#Total data points: %d ' % num_data_points)

    X_feature = train_data.drop(['device', 'state', 'hosts'], axis=1).fillna(-1)
    ss = StandardScaler()
    pca = PCA(n_components=5)
    X_std = ss.fit_transform(X_feature)
    # project the standardized features onto the principal components
    X_std = pca.fit_transform(X_std)
    # save the components to a DataFrame and keep the first four as features
    X_std = pd.DataFrame(X_std)
    X_feature = X_std.iloc[:, :4]
    y_labels = np.array(train_data.state)  # e.g. on, off, change_color

    """
    Split the data set into train & test; default fraction is 30% test.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_feature, y_labels, test_size=.3, random_state=42)
    print('Train: %s' % len(X_train))
    print('Test: %s' % len(X_test))
    num_labels = len(set(y_labels))
    if num_labels < 2:
        print('\tNot enough labels for %s' % dname)
        return

    """
    One-hot encode the y labels.
    """
    lb = LabelBinarizer()
    lb.fit(y_labels)  # collect all possible labels
    y_train_bin = lb.transform(y_train)
    y_test_bin = lb.transform(y_test)
    y_test_bin_1d = np.argmax(y_test_bin, axis=1)

    """
    Train through the list of ML algorithms of interest.
    """
    ret_results = []
    for model_alg in list_models_todo:
        model_dir = os.path.join(root_model, model_alg)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir, exist_ok=True)
        model_file = os.path.join(model_dir, dname + model_alg + ".model")
        label_file = os.path.join(model_dir, dname + ".label.txt")
        single_outfile = os.path.join(model_dir, dname + ".result.csv")
        output_file = os.path.join(root_output, "result_" + model_alg + ".txt")
        _acc_score = -1
        _noise = -1
        _silhouette = -1
        """
        Three steps: 1. Train (70%)  2. Test  3. Evaluate
        """
        if model_alg == 'knn':
            print('  knn: n_neighbors=%s' % num_labels)
            trained_model = KNeighborsClassifier(n_neighbors=num_labels)
            trained_model.fit(X_train, y_train_bin)
            y_predicted = trained_model.predict(X_test)
            if y_predicted.ndim > 1:
                y_predicted_1d = np.argmax(y_predicted, axis=1)
            else:
                y_predicted_1d = y_predicted
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)
        elif model_alg == 'kmeans':
            print('  kmeans: n_clusters=%s' % num_labels)
            trained_model = MiniBatchKMeans(n_clusters=num_labels, random_state=0, batch_size=6)
            trained_model.fit(X_train)
            y_predicted_1d = trained_model.predict(X_test).round()
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)
        elif model_alg == 'spectral':
            print('  spectral clustering: n_clusters=%s' % num_labels)
            trained_model = SpectralClustering(n_clusters=num_labels,
                                               affinity='nearest_neighbors', random_state=0)
            trained_model.fit(X_train)
            # note: fit_predict re-fits on X_test; the fit on X_train is discarded
            y_predicted_1d = trained_model.fit_predict(X_test).round()
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)
        elif model_alg == 'dbscan':
            eps = 200  # the original printed eps=300 but fit with eps=200
            print('  dbscan: eps=%s' % eps)
            trained_model = DBSCAN(eps=eps, min_samples=5)
            trained_model.fit(X_train)
            # note: fit_predict re-fits on X_test; the fit on X_train is discarded
            y_predicted_1d = trained_model.fit_predict(X_test).round()
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)
            # fraction of test points labeled as noise (-1) by DBSCAN
            _noise = list(y_predicted_1d).count(-1) * 1. / num_data_points
        elif model_alg == 'rf':
            trained_model = RandomForestClassifier(n_estimators=1000, random_state=42)
            trained_model.fit(X_train, y_train_bin)
            y_predicted = trained_model.predict(X_test).round()
            if y_predicted.ndim == 1:
                y_predicted_1d = y_predicted
            else:
                y_predicted_1d = np.argmax(y_predicted, axis=1)
            try:
                _acc_score = accuracy_score(y_test_bin_1d, y_predicted_1d)
            except Exception:
                print(y_predicted_1d[0])
                _acc_score = accuracy_score(y_test_bin_1d, y_predicted_1d[0])

        """
        Evaluate clustering-based metrics.
        """
        _homogeneity = -1
        _complete = -1
        _vmeasure = -1
        _ari = -1
        _f1_score = -1  # reserved; not currently computed
        if model_alg not in ['rf']:
            """
            Metrics for clustering algorithms.
            """
            _homogeneity = homogeneity_score(y_test_bin_1d, y_predicted_1d)
            _complete = completeness_score(y_test_bin_1d, y_predicted_1d)
            _vmeasure = v_measure_score(y_test_bin_1d, y_predicted_1d)
            _ari = adjusted_rand_score(y_test_bin_1d, y_predicted_1d)

        """
        Plot the t-SNE graph.
        """
        figfile = '%s/%s/%s-%s.png' % (root_model, model_alg, model_alg, dname)
        pp = 30  # perplexity
        if num_data_points > 200:
            pp = 50
        tsne_plot(X_feature, y_labels, figfile, pp)  # tsne_plot is a project helper

        """
        Save the model.
        """
        model_dictionary = dict({
            'standard_scaler': ss,
            'pca': pca,
            'trained_model': trained_model
        })
        pickle.dump(model_dictionary, open(model_file, 'wb'))

        """
        Save the labels for the one-hot encoding.
        """
        unique_labels = lb.classes_.tolist()
        with open(label_file, 'w') as lf:
            lf.write('%s\n' % '\n'.join(unique_labels))

        """
        Save eval results.
        """
        # TODO: with multiple threads, writes to this file need coordination
        with open(single_outfile, 'a+') as off:
            off.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                dname, _acc_score, _homogeneity, _complete, _vmeasure,
                _ari, _noise, _silhouette))
            # y_test_bin_1d, y_predicted_1d
            off.write('%s\n' % ','.join(map(str, y_test_bin_1d)))
            off.write('%s\n' % ','.join(map(str, y_predicted_1d)))
        ret_results.append([output_file, dname, _acc_score, _homogeneity,
                            _complete, _vmeasure, _ari, _noise, _silhouette])

        """
        Print to terminal.
        """
        print('  model  -> %s' % model_file)
        print('  labels -> %s' % label_file)
        print('\t' + '\n\t'.join(unique_labels) + '\n')
        if model_alg not in ['rf']:
            print('    _homogeneity: %.3f' % _homogeneity)
            print('    _completeness: %.3f' % _complete)
            print('    _vmeasure: %.3f' % _vmeasure)
            print('    _ari: %.3f' % _ari)
            print('    _silhouette: %.3f' % _silhouette)
        print('    _acc_score: %.3f' % _acc_score)
        print('  measures saved to: %s' % single_outfile)
    return ret_results
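# --- Hedged usage sketch (not part of the original module): one way to drive
# eval_individual_device() for a few devices. Every path, device name, and the
# module-level globals set below are hypothetical placeholders; it also
# assumes the project's tsne_plot() helper is defined or importable.
if __name__ == '__main__':
    model_list = ['knn', 'kmeans', 'spectral', 'dbscan', 'rf']  # assumed global
    root_feature = 'features'   # hypothetical directory of per-device CSVs
    root_model = 'models'       # where .model / .label.txt files are written
    root_output = 'output'      # where result_<alg>.txt paths point
    dir_tsne_plots = 'tsne'     # declared global by the function; unused here
    os.makedirs(root_output, exist_ok=True)
    for dev in ['tplink-plug', 'amazon-echo']:  # hypothetical device names
        csv_path = os.path.join(root_feature, '%s.csv' % dev)
        results = eval_individual_device(csv_path, dev, specified_models=['knn', 'rf'])
        print(results)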