def cv(self, factor, split_val, shadow_func=None, shadow_to_val=None,
       del_freq=None):
    """Cross-validate prediction of factor 'factor'."""
    self._prepare(factor, split_val, shadow_func=shadow_func,
                  shadow_to_val=shadow_to_val, del_freq=del_freq)
    fac_ind = self.col_names.index(factor)
    self.clf = KNNC(40, algorithm='brute', metric='cosine')
    z = self._get_features_only(self.non_null_set).astype(float)
    target = np.ravel(self.non_null_set.getcol(fac_ind).todense())
    # Project the sparse feature matrix onto 51 latent dimensions via truncated SVD.
    u, s, v = linalg.svds(z, k=51)
    T = u.dot(np.diag(s))
    # 5-fold cross-validation (KFold from sklearn.model_selection; the old
    # sklearn.cross_validation module has been removed).
    kf = KFold(n_splits=5)
    for train_idx, test_idx in kf.split(T):
        # print(len(train_idx), len(test_idx))
        self.clf.fit(T[train_idx], target[train_idx])
        r = self.clf.predict(T[test_idx])
        err = np.abs(r - target[test_idx])
        print('Average error:', np.mean(err), '+/-', np.std(err))
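# A minimal, self-contained sketch of the same recipe (truncated SVD feeding a
# cosine-distance KNN, scored with 5-fold CV) using sklearn's Pipeline API.
# The synthetic data and shapes below are illustrative assumptions only.
import numpy as np
import scipy.sparse as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

rng = np.random.RandomState(0)
X = sp.random(500, 300, density=0.05, format='csr', random_state=rng)
y = rng.randint(0, 2, size=500)

pipe = make_pipeline(
    TruncatedSVD(n_components=51, random_state=0),
    KNeighborsClassifier(n_neighbors=40, algorithm='brute', metric='cosine'),
)
print(cross_val_score(pipe, X, y, cv=5))  # one accuracy score per fold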
def __init__(self,
             input_dim: int = 62,
             last_avg: int = 3,
             data_dir: str = '../data_train',
             sequence_length: int = 45,
             data_type: DataType = DataType.HIGH_PASS,
             classes_list: list = [
                 'acetone', 'isopropanol', 'orange_juice', 'pinot_noir',
                 'raisin', 'wodka'
             ],
             weights: str = 'distance',
             metric: str = 'euclidean',
             num_neighbors: int = 5):
    """
    Classifier based on the k-nearest-neighbor approach, defining training and
    prediction functions. Saturated sensor values of the same class are assumed
    to lie close together, whereas data points of different classes should be
    far apart. During inference, the classes of the num_neighbors nearest
    neighbors are used to predict the class of a new data point by performing a
    (weighted) majority vote. Our best performing model uses 5 neighbors, the
    Euclidean metric and distance weighting.

    :param input_dim: Number of dimensions of the input data.
    :param last_avg: Number of last time steps used to compute the mean of a
        saturated channel.
    :param data_dir: Path to the data directory containing the training csv
        files used to fit the model.
    :param sequence_length: Time step of a measurement sequence at which data
        points are extracted. Sensor channels should be saturated at that point.
    :param data_type: Type of data preprocessing.
    :param classes_list: List of classes to be learnt by the model.
    :param weights: Weighting scheme for the neighbors.
    :param metric: Distance metric; see the sklearn documentation for more options.
    :param num_neighbors: Number of neighbors to consider.
    """
    self.input_dim = input_dim
    self.sequence_length = sequence_length
    self.data_type = data_type
    self.last_avg = last_avg
    self.classes_list = classes_list
    self.model = KNNC(num_neighbors, weights=weights, metric=metric)
    self.data_dir = data_dir
    # Map each class name to an integer label.
    self.classes_dict = {c: i for i, c in enumerate(classes_list)}
    self.fit()
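# A minimal sketch of the (weighted) majority vote described in the docstring
# above, using sklearn directly. The toy 1-D "sensor" data is an illustrative
# assumption, not part of the original training pipeline.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier as KNNC

X_toy = np.array([[0.0], [0.1], [0.2], [1.0], [1.1]])
y_toy = np.array([0, 0, 0, 1, 1])
clf = KNNC(n_neighbors=5, weights='distance', metric='euclidean')
clf.fit(X_toy, y_toy)
# A query at 0.95 sits next to the two class-1 points. A uniform vote over all
# 5 neighbors would be 3-vs-2 for class 0, but distance weighting lets the two
# nearby class-1 points outweigh the three distant class-0 points.
print(clf.predict([[0.95]]))        # -> [1]
print(clf.predict_proba([[0.95]]))  # class-1 weight far exceeds class-0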
def predict(self, factor, split_val, shadow_func=None, shadow_to_val=None,
            del_freq=None, results_fn=None):
    self._prepare(factor, split_val, shadow_func=shadow_func,
                  shadow_to_val=shadow_to_val, del_freq=del_freq)
    fac_ind = self.col_names.index(factor)
    self.clf = KNNC(40, algorithm='brute', metric='cosine')
    z = self._get_features_only(self.non_null_set).astype(float)
    target = np.ravel(self.non_null_set.getcol(fac_ind).todense())
    u, s, v = linalg.svds(z, k=51)
    T = u.dot(np.diag(s))
    # Reduce the rows with missing factor values the same way.
    z2 = self._get_features_only(self.null_set).astype(float)
    u2, s2, v2 = linalg.svds(z2, k=51)
    T2 = u2.dot(np.diag(s2))
    results = []
    self.clf.fit(T, target)
    for row_ind in range(self.null_set.shape[0]):
        # predict expects a 2-D array, so wrap the single row.
        r = self.clf.predict(T2[row_ind].reshape(1, -1))
        results.append((int(self.null_set[row_ind, 0]), int(r[0])))
    if results_fn is not None:
        # msgpack writes binary data, so open the file in 'wb' mode.
        with open(results_fn, 'wb') as w:
            msgpack.pack(results, w)
    else:
        print(results)
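# Note on the loop in predict() above (a sketch, not the original code):
# KNeighborsClassifier.predict accepts a whole matrix at once, so the per-row
# calls could be batched, assuming null_set keeps the row ids in column 0:
#     preds = self.clf.predict(T2)
#     ids = np.ravel(self.null_set[:, 0].todense()).astype(int)
#     results = list(zip(ids, preds.astype(int)))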
tree.plot_tree(dtc)
plt.show()
pause()

rfc = RFC(criterion='gini', n_estimators=25, random_state=1, n_jobs=2)
rfc.fit(X_train, y_train)
plot_decision_regions(X, y, classifier=rfc, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.title('Random Forest Classifier')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()
pause()

knn = KNNC(n_neighbors=5, p=2, metric='minkowski')
knn.fit(X_train, y_train)
plot_decision_regions(X, y, classifier=knn, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.title('KNN')
plt.tight_layout()
plt.show()
pause()
# SageMaker parameters, like the directories for training data and saving
# models; set automatically. No need to change these.
parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
parser.add_argument('--data-dir', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
parser.add_argument('--n_neighbors', type=int, default=5)

# args holds all passed-in arguments
args = parser.parse_args()

# Read in the csv training file
training_dir = args.data_dir
train_data = pd.read_csv(os.path.join(training_dir, "train.csv"), header=None, names=None)

# Labels are in the first column
train_y = train_data.iloc[:, 0]
train_x = train_data.iloc[:, 1:]

# Define a model
model = KNNC(n_neighbors=args.n_neighbors)
print('Model defined!')

# Train the model
model.fit(train_x, train_y)
print('Fitting complete!')

# Save the trained model
joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
print('Model saved to {}'.format(os.path.join(args.model_dir, "model.joblib")))
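# For deployment, the SageMaker scikit-learn serving container calls a
# model_fn defined in this same entry script to reload the artifact saved
# above. A minimal sketch (the filename must match the joblib.dump call):
def model_fn(model_dir):
    """Load the fitted KNN model for inference."""
    return joblib.load(os.path.join(model_dir, "model.joblib"))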
Z = qda_clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
Z = Z[:, 1].reshape(xx.shape)

# The Bayes boundary for k=2 classes is the contour where P(Y=k|X=x) = 0.5
cp = ax.contour(xx, yy, Z, [0.5], linewidths=1., colors='k')
plt.clabel(cp, inline=True, fmt='Bayes Decision Boundary', fontsize=8)
ax.set_xlabel('Lag1')
ax.set_ylabel('Lag2')
ax.legend(loc='best')
plt.savefig(PATH + 'qda.png', dpi=300)
plt.close()

# K NEAREST NEIGHBORS
from sklearn.neighbors import KNeighborsClassifier as KNNC

# Build a KNN classifier
knn_1 = KNNC(n_neighbors=1)
knn_1.fit(X_train, train_df.Direction)
knn1_pred = knn_1.predict(test_df[predictors])
print(knn1_pred)
print('The model makes {0:.4f}% correct predictions'.format(
    100 * np.mean(knn1_pred == test_df.Direction)))

# Compute Test Confusion Matrix
#################################
table = pd.crosstab(knn1_pred, test_df.Direction)
print(table)

# Use 3 neighbors now
knn_3 = KNNC(n_neighbors=3)
knn_3.fit(X_train, train_df.Direction)
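# A small sketch sweeping over k, reusing the frames above, to see how the
# share of correct predictions moves with the number of neighbors (the list
# of k values is an arbitrary choice).
for k in (1, 3, 5, 10):
    knn = KNNC(n_neighbors=k)
    knn.fit(X_train, train_df.Direction)
    pred = knn.predict(test_df[predictors])
    print('k={:2d}: {:.4f}% correct'.format(
        k, 100 * np.mean(pred == test_df.Direction)))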
    # 'ERN': loader.get_ern,
    # 'SMR': lambda validation=False, subject=2: loader.get_smr(subject, validation),  # noqa
    # 'BMNIST': loader.get_bmnist11,
    # 'BMNIST_2': loader.get_bmnist2,
    # 'ThoughtViz': loader.get_thoughtviz,
    'ThoughtViz_char': loader.get_thoughtviz_char,
    'ThoughtViz_digit': loader.get_thoughtviz_digit,
    # 'SEED': loader.get_seed,
}

models_dict = {
    'CNN': CNN_Only_Model,
    'CNN_GRU': CNN_GRU_Model,
    'EEG_Net': EEGNet_model,
    'AE_rf': lambda: AutoEncoder_Model(RFC()),
    'AE_knn': lambda: AutoEncoder_Model(KNNC()),
}

if args.data == 'ALL':
    datasets = [[k, datasets_dict[k]] for k in datasets_dict]
else:
    datasets = [[args.data, datasets_dict[args.data]]]

if args.model == 'all':
    models = [[k, models_dict[k]] for k in models_dict]
else:
    models = [[args.model, models_dict[args.model]]]

for model_name, Model in models:
    model = Model()
    print('<#######@@@@@@@#######> Model <#######@@@@@@@#######>', model_name)
from sklearn.model_selection import GridSearchCV

# Ignore ConvergenceWarning
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

##################################
## 3.1 train and test models using GridSearchCV
models = {
    'DT': DTC(),
    'LR': LR(),
    'MLP': MLPC(),
    'SVC': SVC(),
    'NB': NB(),
    'KNN': KNNC(),
    'Bagging': BaggingC(),
    'RF': RFC(),
    'AdaBoost': AdaBoostC(),
    'GB': GBC(),
    'XGB': XGB(),
}

param_dict = {
    # 0.67 {'max_depth': 1, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
    'DT': {
        'max_depth': [1, 2, 3, None],
        'max_leaf_nodes': [4, 6, 8, 10, None],
        'min_samples_leaf': [1, 2, 3],
        'min_samples_split': [2, 4, 6]
    },
divorce = pd.read_csv('../data/divorce.csv', sep=';')
divorce.head()
print(divorce.shape)
divorce.Class.value_counts()

for u in divorce.columns:
    print(divorce[u].value_counts())

from sklearn.neighbors import KNeighborsClassifier as KNNC

y = divorce.Class
X = divorce.drop(columns=['Class'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

knnc = KNNC(n_neighbors=3)
knnc.fit(X_train, y_train)
y_pred = knnc.predict(X_test)

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

from sklearn.svm import SVC
svc = SVC(probability=True).fit(X_train, y_train)
y_pred = svc.predict(X_test)
y_prob = svc.predict_proba(X_test)[:, -1]
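# A likely next step given y_prob above: the ROC curve for the SVC. The plot
# styling below is a sketch/assumption, not from the original script.
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

fpr, tpr, _ = roc_curve(y_test, y_prob)
plt.plot(fpr, tpr, label='SVC (AUC = {:.3f})'.format(roc_auc_score(y_test, y_prob)))
plt.plot([0, 1], [0, 1], ls='--', color='grey')  # chance-level diagonal
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()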
plot((0, 1), ls='dashed', color='black')
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))

# ### KNN

# In[59]:

len(df4.columns)

# In[60]:

# FIT MODEL
from sklearn.neighbors import KNeighborsClassifier as KNNC

model = KNNC(n_neighbors=3, algorithm='ball_tree')
model.fit(X_train, y_train)

# In[61]:

# CONFUSION MATRIX
ypred = model.predict(X_test)
cm = confusion_matrix(y_test, ypred)
cm

# In[62]:

# ACCURACY
accuracy_score(y_test, ypred)

# In[63]: