# fit the model
model.fit(x_train, y_train)
# score each feature by permutation importance on the training set
results = permutation_importance(model, x_train, y_train, scoring='accuracy')
# mean importance of each feature across permutation rounds
importance = results.importances_mean
plt.plot(labels, importance, label='permutation k-neighbor')


plt.legend()
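
# A minimal sketch of the setup this fragment assumes (the names `model`,
# `x_train`, `y_train`, and `labels` are defined elsewhere in the original
# script; the choices below are assumptions, not the original code):
#
#   import matplotlib.pyplot as plt
#   import numpy as np
#   from sklearn.inspection import permutation_importance
#   from sklearn.neighbors import KNeighborsClassifier
#
#   labels = ['f0', 'f1', 'f2']                  # hypothetical feature names
#   x_train = np.random.rand(100, len(labels))   # placeholder features
#   y_train = np.random.randint(0, 2, size=100)  # placeholder binary labels
#   model = KNeighborsClassifier()               # matches the 'k-neighbor' legend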

### -------- UNSUPERVISED SECTION -----------###

# cluster the selected features with k-means (2 clusters, 50 random restarts)
model = KMeans(n_clusters=2, random_state=0, n_init=50)
idx = model.fit_predict(x_best)  # idx: predicted cluster id per sample

# first two selected features, used as the plot axes
x = x_best[:, 0]
y = x_best[:, 1]

cdict = ['k', 'r']  # colours: class/cluster 0 -> black, 1 -> red
f, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True)

for g in np.unique(y_train):
    ix = np.where(y_train == g)
    ax1.scatter(x[ix], y[ix], c = cdict[g], label = g, s = 8)
ax1.legend()
ax1.title.set_text('Real')
ax1.set_xlabel(best_labels[0])
ax1.set_ylabel(best_labels[1])
for g in np.unique(idx):
    ix = np.where(idx == g)
    ax2.scatter(x[ix], y[ix], c = cdict[g], label = g, s = 8)
ax2.legend()
ax2.title.set_text('Predicted')
ax2.set_xlabel(best_labels[0])
ax2.set_ylabel(best_labels[1])

### -------- EXAMPLE 2: PER-DEVICE MODEL EVALUATION -----------###

def eval_individual_device(train_data_file, dname, specified_models=None):
    """
    Assumptions: the train_data_file contains only one device, with all of its
    possible states (tags); the models can only be one of those implemented:
    knn, kmeans, spectral, dbscan, random forest classifier.
    """
    global root_feature, root_model, root_output, dir_tsne_plots
    warnings.simplefilter("ignore", category=DeprecationWarning)
    warnings.simplefilter("ignore", category=FutureWarning)
    """
    Skip trained models, return if there is no model to train. 
    """
    list_all_models = model_list
    if specified_models is not None:
        list_all_models = specified_models

    list_models_todo = []
    for model_alg in list_all_models:
        """
        Prepare the directories and add only models that have not been trained yet 
        """
        model_dir = '%s/%s' % (root_model, model_alg)
        model_file = '%s/%s%s.model' % (model_dir, dname, model_alg)
        label_file = '%s/%s.label.txt' % (model_dir, dname)
        # train this model only if no trained .model file exists yet and the
        # training data set is available
        if not os.path.exists(model_file) and os.path.exists(train_data_file):
            list_models_todo.append(model_alg)

    if len(list_models_todo) < 1:
        print('skip %s, all models trained for alg: %s' %
              (dname, str(list_all_models)))
        return
    print('Training %s using algorithm(s): %s' %
          (dname, str(list_models_todo)))
    train_data = pd.read_csv(train_data_file)

    num_data_points = len(train_data)
    if num_data_points < 1:
        print('  Not enough data points for %s' % dname)
        return
    print('\t#Total data points: %d ' % num_data_points)
    X_feature = train_data.drop(['device', 'state', 'hosts'],
                                axis=1).fillna(-1)
    # standardize the features, then project onto the top 5 principal components
    ss = StandardScaler()
    pca = PCA(n_components=5)
    X_std = ss.fit_transform(X_feature)
    X_std = pca.fit_transform(X_std)
    # save the components to a DataFrame and keep the first 4 as features
    X_std = pd.DataFrame(X_std)
    X_feature = X_std.iloc[:, :4]
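    # optional diagnostic (an assumption, not in the original): check how much
    # variance the 5 retained components explain
    # print('PCA explained variance: %.3f' % pca.explained_variance_ratio_.sum())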
    y_labels = np.array(train_data.state)
    # y_labels, example: on, off, change_color
    """
    Split data set into train & test, default fraction is 30% test
    """
    X_train, X_test, y_train, y_test = train_test_split(X_feature,
                                                        y_labels,
                                                        test_size=.3,
                                                        random_state=42)
    print('Train: %s' % len(X_train))
    print('Test: %s' % len(X_test))

    num_labels = len(set(y_labels))
    if num_labels < 2:
        print('\tNot enough labels for %s' % dname)
        return
    """
    One hot encoding y labels
    """
    lb = LabelBinarizer()
    lb.fit(y_labels)  # collect all possible labels
    y_train_bin = lb.transform(y_train)
    y_test_bin = lb.transform(y_test)
    y_test_bin_1d = np.argmax(y_test_bin, axis=1)
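    # a brief illustration (the tags are assumptions): with classes
    # ['change_color', 'off', 'on'], lb.transform(['on']) gives [[0, 0, 1]],
    # and np.argmax maps it back to the class index 2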
    """
    Train through the list of interested ML algorithms
    """
    ret_results = []
    for model_alg in list_models_todo:
        model_dir = os.path.join(root_model, model_alg)
        if not os.path.exists(model_dir):
            os.system('mkdir -pv %s' % model_dir)
        model_file = os.path.join(model_dir, dname + model_alg + ".model")
        label_file = os.path.join(model_dir, dname + ".label.txt")
        single_outfile = os.path.join(model_dir, dname + ".result.csv")
        output_file = os.path.join(root_output, "result_" + model_alg + ".txt")
        _acc_score = -1
        _noise = -1
        _silhouette = -1
        """
        Two steps
            1. Train (70%)
            2. Test 
            3. Evaluate 
        """
        if model_alg == 'knn':
            print('  knn: n_neighbors=%s' % num_labels)
            trained_model = KNeighborsClassifier(n_neighbors=num_labels)
            trained_model.fit(X_train, y_train_bin)

            y_predicted = trained_model.predict(X_test)
            # collapse one-hot predictions back to a single class index
            if y_predicted.ndim == 1:
                y_predicted_1d = y_predicted
            else:
                y_predicted_1d = np.argmax(y_predicted, axis=1)

            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)

        elif model_alg == 'kmeans':
            print('  kmeans: n_clusters=%s' % num_labels)
            trained_model = MiniBatchKMeans(n_clusters=num_labels,
                                            random_state=0,
                                            batch_size=6)
            trained_model.fit(X_train)

            y_predicted_1d = trained_model.predict(X_test)
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)

        elif model_alg == 'spectral':
            print('  Spectral Clustering: n_clusters=%s' % num_labels)
            trained_model = SpectralClustering(n_clusters=num_labels,
                                               affinity='nearest_neighbors',
                                               random_state=0)
            # SpectralClustering has no separate predict(); fit_predict
            # clusters the test split directly
            y_predicted_1d = trained_model.fit_predict(X_test)
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)

        elif model_alg == 'dbscan':
            eps = 200
            print('  dbscan: eps=%s' % eps)
            trained_model = DBSCAN(eps=eps, min_samples=5)
            # DBSCAN has no separate predict(); fit_predict clusters the test split
            y_predicted_1d = trained_model.fit_predict(X_test)
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)
            # DBSCAN marks noise points with label -1; record the noise fraction
            _noise = list(y_predicted_1d).count(-1) * 1. / num_data_points

        elif model_alg == 'rf':
            trained_model = RandomForestClassifier(n_estimators=1000,
                                                   random_state=42)
            trained_model.fit(X_train, y_train_bin)
            y_predicted = trained_model.predict(X_test).round()
            # print(y_predicted)
            if y_predicted.ndim == 1:
                y_predicted_1d = y_predicted
            else:
                y_predicted_1d = np.argmax(y_predicted, axis=1)

        try:
            _acc_score = accuracy_score(y_test_bin_1d, y_predicted_1d)
        except ValueError:
            # fallback for unexpectedly shaped predictions
            print(y_predicted_1d[0])
            _acc_score = accuracy_score(y_test_bin_1d, y_predicted_1d[0])
        """
        Eval clustering based metrics
        """

        _homogeneity = -1
        _complete = -1
        _vmeasure = -1
        _ari = -1
        _f1_score = -1
        if model_alg not in ['rf']:
            """
            Metrics for clustering algorithms 
            """
            # print('y_test_bin: %s' % y_test_bin_1d)
            # print('y_predicted_1d: %s' % y_predicted_1d)
            _homogeneity = homogeneity_score(y_test_bin_1d, y_predicted_1d)
            _complete = completeness_score(y_test_bin_1d, y_predicted_1d)
            _vmeasure = v_measure_score(y_test_bin_1d, y_predicted_1d)
            _ari = adjusted_rand_score(y_test_bin_1d, y_predicted_1d)
        """
        Plot tSNE graph
        """
        figfile = '%s/%s/%s-%s.png' % (root_model, model_alg, model_alg, dname)
        pp = 30  # perplexity
        if num_data_points > 200:
            pp = 50
        tsne_plot(X_feature, y_labels, figfile, pp)
        """
        Save the model 
        """
        model_dictionary = dict({
            'standard_scaler': ss,
            'pca': pca,
            'trained_model': trained_model
        })
        pickle.dump(model_dictionary, open(model_file, 'wb'))
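        # to reuse the bundle later (a sketch; `X_raw` is hypothetical):
        #   d = pickle.load(open(model_file, 'rb'))
        #   X_new = d['pca'].transform(d['standard_scaler'].transform(X_raw))
        #   d['trained_model'].predict(X_new[:, :4])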
        """
        Save the label for onehot encoding 
        """
        # unique_labels = label_encoder.classes_.tolist()
        unique_labels = lb.classes_.tolist()
        with open(label_file, 'w') as lf:
            lf.write('%s\n' % '\n'.join(unique_labels))
        """
        Save eval results
        """
        # TODO: when run multi-threaded, concurrent appends to this file need coordination
        with open(single_outfile, 'a+') as off:
            off.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                      (dname, _acc_score, _homogeneity, _complete, _vmeasure,
                       _ari, _noise, _silhouette))
            # y_test_bin_1d, y_predicted_1d
            off.write('%s\n' % ','.join(map(str, y_test_bin_1d)))
            off.write('%s\n' % ','.join(map(str, y_predicted_1d)))

        ret_results.append([
            output_file, dname, _acc_score, _homogeneity, _complete, _vmeasure,
            _ari, _noise, _silhouette
        ])
        """
        Print to Terminal 
        """
        print('    model -> %s' % model_file)
        print('    labels -> %s' % label_file)
        print('\t' + '\n\t'.join(unique_labels) + '\n')
        if model_alg not in ['rf']:
            print('    _homogeneity: %.3f' % _homogeneity)
            print('    _completeness: %.3f' % _complete)
            print('    _vmeasure: %.3f' % _vmeasure)
            print('    _ari: %.3f' % _ari)
            print('    _silhouette: %.3f' % _silhouette)
        print('    _acc_score: %.3f' % _acc_score)
        print('    measures saved to: %s' % single_outfile)
    return ret_results
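
# A hedged usage sketch (the paths, device name, and globals below are
# assumptions; root_model and root_output must be writable directories):
#
#   root_feature, root_model, root_output = 'features', 'models', 'output'
#   results = eval_individual_device('features/plug-1.csv', 'plug-1',
#                                    specified_models=['knn', 'rf'])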