def generate_safe_unsafe_dataset_for_ssl():
    """Split the nursery SSL training data into SAFE and UNSAFE subsets.

    Pipeline:
      0. Load the tra/trs/tst CSVs and one-hot (dummy) encode x and y over
         their concatenation so every split shares the same dummy columns.
      1. Fit a primal RLS classifier (KernelRidge) on the labeled rows.
      2. Fit a dual CRC (collaborative-representation) reconstructor.
      3. Compute a per-class sigma (L2 norm of per-feature std).
      4/5. Score each unlabeled instance with ``calculate_risk`` and route
         it to the SAFE pool (risk < RISK_THRESHOLD) or the UNSAFE pool.

    Side effects: prints progress/stats, shows an ROC plot, and writes
    several CSV datasets under ``RESULT_DIR``.

    Returns:
        tuple: ``(merge_safe_y, merge_safe_x, not_ssl_safe_y,
        not_ssl_safe_x)`` — labeled+safe targets/features and the rejected
        (unsafe) targets/features, all as DataFrames.

    NOTE(review): relies on file-level names ``calculate_risk``,
    ``RISK_THRESHOLD`` and ``RESULT_DIR`` defined elsewhere in this module.
    """
    # STEP 0 - data preprocessing, dummy coding
    tra_csv = pd.read_csv('data/nursery-ssl10-10-1tra.csv')
    trs_csv = pd.read_csv('data/nursery-ssl10-10-1trs.csv')
    tst_csv = pd.read_csv('data/nursery-ssl10-10-1tst.csv')
    tra_len, trs_len, tst_len = len(tra_csv.index), len(trs_csv.index), len(
        tst_csv.index)
    # Encode over the concatenation so all three splits get identical
    # dummy columns, then slice back by the remembered lengths.
    merge_csv = pd.concat([tra_csv, trs_csv, tst_csv])
    merge_csv_dummy_y = pd.get_dummies(merge_csv['class'])
    NUM_CLASSES = len(merge_csv_dummy_y.columns)
    merge_csv.drop(['class'], axis=1, inplace=True)
    merge_csv_dummy_x = pd.get_dummies(merge_csv)
    tra_dummy_y = merge_csv_dummy_y[0:tra_len]
    trs_dummy_y = merge_csv_dummy_y[tra_len:(tra_len + trs_len)]
    tst_dummy_y = merge_csv_dummy_y[(tra_len + trs_len):]
    tra_dummy_x = merge_csv_dummy_x[0:tra_len]
    trs_dummy_x = merge_csv_dummy_x[tra_len:(tra_len + trs_len)]
    tst_dummy_x = merge_csv_dummy_x[(tra_len + trs_len):]
    # Rows whose class is 'unlabeled' in tra are the SSL candidates; trs
    # holds their real labels (used below only to evaluate the split).
    tra_dummy_y_labeled = tra_dummy_y.loc[tra_dummy_y['unlabeled'] != 1]
    tra_dummy_x_labeled = tra_dummy_x.loc[tra_dummy_x.index.isin(
        tra_dummy_y_labeled.index)]
    tra_dummy_y_unlabeled = tra_dummy_y.loc[tra_dummy_y['unlabeled'] == 1]
    tra_dummy_y_unlabeled_y = trs_dummy_y.loc[trs_dummy_y.index.isin(
        tra_dummy_y_unlabeled.index)]
    tra_dummy_x_unlabeled = tra_dummy_x.loc[tra_dummy_x.index.isin(
        tra_dummy_y_unlabeled.index)]
    # Release intermediates we no longer need before the heavy linear
    # algebra below.
    del tra_csv
    del trs_csv
    del tst_csv
    del merge_csv
    del tra_dummy_y
    del tra_dummy_x

    # STEP 1 - train primal classifier using Regularised Least Square.
    # BUGFIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is
    # the drop-in replacement (same ndarray result).
    y = tra_dummy_y_labeled.values
    X = tra_dummy_x_labeled.values
    print("Now fitting primal RLS classifier")
    clf = KernelRidge(alpha=0.05)
    clf.fit(X, y)
    # EQUATION 1 - closed-form RLS dual coefficients (kept for reference;
    # not used downstream).  NOTE: _get_kernel is private sklearn API.
    gamma = 0.05
    K = clf._get_kernel(X)
    Id = np.identity(len(X))
    alpha_star = np.matmul(inv(K + gamma * (Id)), y)

    # STEP 2 - train dual classifier using Collaborative
    # Representation-based Classification
    print("Now training dual CRC classifier")
    XT = np.transpose(X)
    XTX = np.matmul(XT, X)
    Id = np.identity(len(XTX))
    # EQUATION 2: ridge-regularised least-squares projection from class
    # scores back to feature space.
    alpha_k = np.matmul(np.matmul(inv(XTX + gamma * Id), XT), y)

    # STEP 3 - sigma per class: L2 norm of the per-feature std over the
    # instances of that class (EQUATION 3).  Falls back to the global std
    # when a class is empty / has a single member (std comes out NaN).
    sigma_list = []
    for c in tra_dummy_y_labeled.columns:
        class_x = tra_dummy_x_labeled.loc[tra_dummy_x_labeled.index.isin(
            tra_dummy_y_labeled.loc[tra_dummy_y_labeled[c] == 1].index)]
        sigma = np.linalg.norm(class_x.describe().loc['std', :].values)
        if math.isnan(sigma):
            sigma = np.linalg.norm(
                tra_dummy_x_labeled.describe().loc['std', :].values)
        sigma_list.append(sigma)

    # Accumulators for the confusion-style stats printed at the end.
    num_ssl_classify_wrong = 0
    num_ssl_classify_correct = 0
    num_ssl_rejected_wrong = 0
    num_ssl_rejected_correct = 0
    ssl_safe_x = pd.DataFrame(columns=tra_dummy_x_unlabeled.columns)
    ssl_safe_y = pd.DataFrame(columns=tra_dummy_y_unlabeled_y.columns)
    ssl_safe_unlabelled_y = pd.DataFrame(
        columns=tra_dummy_y_unlabeled_y.columns)
    ssl_safe_real_y = pd.DataFrame(columns=tra_dummy_y_unlabeled_y.columns)
    not_ssl_safe_x = pd.DataFrame(columns=tra_dummy_x_unlabeled.columns)
    not_ssl_safe_y = pd.DataFrame(columns=tra_dummy_y_unlabeled_y.columns)
    unsafe_risk_series = []
    safe_risk_series = []
    # ROC CURVE VARIABLES
    roc_curve_list = []
    # One-hot row flagging the 'unlabeled' pseudo-class.
    # BUGFIX: was hard-coded [0, 0, 0, 0, 1, 0], which only matches the
    # nursery dataset's column order; derive it from the columns instead.
    unlabeled_one_hot = [
        1 if c == 'unlabeled' else 0 for c in ssl_safe_unlabelled_y.columns
    ]

    # STEP 4 & 5 - calculate risk for each training instance and classify
    # it as SAFE or UNSAFE
    for i in tra_dummy_x_unlabeled.index:
        Xrs = tra_dummy_x_unlabeled.loc[i].values
        yrs = tra_dummy_y_unlabeled_y.loc[i].values
        prediction = clf.predict(np.reshape(Xrs, (1, len(Xrs))))
        predicted_class = np.argmax(prediction[0])
        # reconstruct back X instance of the predicted y
        reco_x = np.matmul(alpha_k, prediction[0])
        real_x = Xrs
        # predict y based on reconstructed X
        prediction_reco = clf.predict(np.reshape(reco_x, (1, len(Xrs))))
        predicted_class_reco = np.argmax(prediction_reco[0])
        sigma = sigma_list[predicted_class]
        # yXpredicted should be == yXreconstructed
        predict_correct = (predicted_class == predicted_class_reco)
        calc_risk = calculate_risk(real_x, reco_x, predict_correct, sigma)
        real_class = np.argmax(yrs)
        # over risk threshold is considered unsafe for ssl training;
        # better to add these as supervised training if possible
        if calc_risk >= RISK_THRESHOLD:
            not_ssl_safe_x.loc[i] = tra_dummy_x_unlabeled.loc[i]
            not_ssl_safe_y.loc[i] = tra_dummy_y_unlabeled_y.loc[i]
            unsafe_risk_series.append(calc_risk)
            if real_class == predicted_class:
                num_ssl_rejected_wrong += 1  # false negative
            if real_class != predicted_class:
                num_ssl_rejected_correct += 1  # true negative
        # less than risk threshold is considered safe for ssl training
        if calc_risk < RISK_THRESHOLD:
            ssl_safe_x.loc[i] = tra_dummy_x_unlabeled.loc[i]
            # predicted class one hot encoded
            one_hot_encoded = [0] * len(prediction_reco[0])
            one_hot_encoded[predicted_class_reco] = 1
            # real class one hot encoded
            one_hot_encoded2 = [0] * len(prediction_reco[0])
            one_hot_encoded2[real_class] = 1
            ssl_safe_y.loc[i] = one_hot_encoded
            ssl_safe_unlabelled_y.loc[i] = unlabeled_one_hot
            ssl_safe_real_y.loc[i] = one_hot_encoded2
            safe_risk_series.append(calc_risk)
            if real_class == predicted_class:
                num_ssl_classify_correct += 1  # true positive
            if real_class != predicted_class:
                num_ssl_classify_wrong += 1  # false positive
        roc_curve_list.append(
            [calc_risk, 1 if real_class == predicted_class else 0])

    print("calculating roc curve")
    roc_threshold = []
    roc_tpr = []
    roc_fpr = []
    plt_tp_div_fp = []
    plt_tp = []
    plt_fp = []
    # Sort by risk so each row's risk acts as a sweep threshold.
    np_roc_curve_list = np.array(roc_curve_list)
    np_roc_curve_list = np_roc_curve_list[np_roc_curve_list[:, 0].argsort()]
    for i in range(len(np_roc_curve_list)):
        t = np_roc_curve_list[i, 0]
        # Counts of correctly / incorrectly predicted instances whose risk
        # falls below the current threshold t.
        tp = np.sum(
            np.multiply(np_roc_curve_list[:, 0] < t,
                        np_roc_curve_list[:, 1] == 1))
        fp = np.sum(
            np.multiply(np_roc_curve_list[:, 0] < t,
                        np_roc_curve_list[:, 1] == 0))
        tpr = tp / np.sum(np_roc_curve_list[:, 1] == 1)
        fpr = fp / np.sum(np_roc_curve_list[:, 1] == 0)
        roc_threshold.append(t)
        roc_tpr.append(tpr)
        roc_fpr.append(fpr)
        # NOTE(review): fp is 0 at the smallest thresholds; numpy emits a
        # warning and yields inf/nan here rather than raising (original
        # behavior kept).
        tp_div_fp = tp / fp
        plt_tp.append(tp)
        plt_fp.append(fp)
        plt_tp_div_fp.append(tp_div_fp)

    # roc curve is not a good metric to evaluate our model as we will be
    # discarding instances that are considered UNSAFE
    print("finished calculating roc curve. now plotting roc curve")
    import matplotlib.pyplot as plt
    from sklearn.metrics import auc
    roc_auc = auc(roc_fpr, roc_tpr)
    plt.figure()
    # BUGFIX: axes were swapped (plt.plot(roc_tpr, roc_fpr)) relative to
    # the axis labels and to auc(roc_fpr, roc_tpr); FPR belongs on x.
    plt.plot(roc_fpr, roc_tpr, color='darkorange', lw=1,
             label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

    print("finished plotting roc curve. now analysing algorithm performance")
    dual_ssl_stats = pd.DataFrame({
        'tp': plt_tp,
        'fp': plt_fp,
        'tp_div_fp': plt_tp_div_fp,
        'threshold': roc_threshold
    })
    print("num classified safe but actually unsafe: " +
          str(num_ssl_classify_wrong))
    print("num classified safe and actually safe: " +
          str(num_ssl_classify_correct))
    print("num classified unsafe but actually safe: " +
          str(num_ssl_rejected_wrong))
    print("num classified unsafe and is really unsafe: " +
          str(num_ssl_rejected_correct))
    # Attach the per-instance risk; already-labeled data gets risk 0.
    ssl_safe_x['risk'] = safe_risk_series
    tra_dummy_x_labeled['risk'] = 0.0
    not_ssl_safe_x['risk'] = unsafe_risk_series
    merge_safe_y = pd.concat([tra_dummy_y_labeled, ssl_safe_y])
    merge_safe_x = pd.concat([tra_dummy_x_labeled, ssl_safe_x])
    data_path = os.path.join(os.getcwd(), RESULT_DIR, 'dual-ssl-stats.csv')
    dual_ssl_stats.to_csv(path_or_buf=data_path, index=False)
    # labelled training data + unlabelled training data that is safe
    merge_unlabelled_safe_y = pd.concat(
        [tra_dummy_y_labeled, ssl_safe_unlabelled_y])
    ssl_safe_unlabelled = pd.concat([merge_safe_x, merge_unlabelled_safe_y],
                                    axis=1)
    data_path = os.path.join(os.getcwd(), RESULT_DIR,
                             'dataset-ssl-safe-unlabelled.csv')
    ssl_safe_unlabelled.to_csv(path_or_buf=data_path, index=False)
    # labelled training data + unlabelled training data with real y
    merge_real_label_safe_y = pd.concat(
        [tra_dummy_y_labeled, ssl_safe_real_y])
    ssl_safe_w_real_y = pd.concat([merge_safe_x, merge_real_label_safe_y],
                                  axis=1)
    data_path = os.path.join(os.getcwd(), RESULT_DIR,
                             'dataset-ssl-safe-w-real-y.csv')
    ssl_safe_w_real_y.to_csv(path_or_buf=data_path, index=False)
    # labelled training data + unlabelled training labelled with RLS
    # prediction
    ssl_safe_w_prediction = pd.concat([merge_safe_x, merge_safe_y], axis=1)
    data_path = os.path.join(os.getcwd(), RESULT_DIR, 'dataset-ssl-safe.csv')
    ssl_safe_w_prediction.to_csv(path_or_buf=data_path, index=False)
    return merge_safe_y, merge_safe_x, not_ssl_safe_y, not_ssl_safe_x
from sklearn.kernel_ridge import KernelRidge
import numpy as np
import pickle

# Sanity script: fit KernelRidge on fixed random data and pickle the
# attributes needed to reproduce its predictions elsewhere (dual
# coefficients, kernel matrix, stored training inputs).
n_samples, n_features = 10, 5
rng = np.random.RandomState(0)  # fixed seed -> reproducible dump
y = rng.randn(n_samples)
X = rng.randn(n_samples, n_features)
clf = KernelRidge(alpha=1.0)
clf.fit(X, y)
# BUGFIX: Python 2 `print expr` statements converted to print() calls —
# the old form is a SyntaxError on Python 3 and inconsistent with the
# print() calls used elsewhere in this file.
print(clf.predict(X))
# Kernel between training points.  NOTE(review): _get_kernel is private
# sklearn API; called on the instance rather than the unbound-class form
# KernelRidge._get_kernel(clf, ...).
k = clf._get_kernel(X, clf.X_fit_)
# print(clf.dual_coef_, clf.X_fit_)
dual_coef = clf.dual_coef_
print(dual_coef)
X_fit_ = clf.X_fit_
# BUGFIX: use a context manager so the file handle is closed even if
# pickle.dump raises (the bare open() leaked it).
with open("ridge_attributes.p", "wb") as f:
    pickle.dump([dual_coef, k, X_fit_], f)
# print(np.dot(k, clf.X_fit_))