def knnSimulate(param):
    """Simulate a 2-group data set, fit a KNN classifier on it, and report
    resubstitution and independent-test-set performance.

    param: dict with keys 'n' (samples), 'p' (features), 'effect'
    (per-feature effect size) and 'k' (neighbors).
    Returns an OrderedDict with the data, predictions, crosstabs and
    accuracy figures.
    """
    n, p, k = int(param['n']), int(param['p']), int(param['k'])
    effect = [param['effect']] * p

    trainSet = SimData.simulate2Group(n=n, p=p, effect=effect)
    knnFit = KNeighborsClassifier(n_neighbors=k)
    knnFit.fit(np.array(trainSet['x']), np.array(trainSet['y']))
    # Independent test set drawn from the same simulation settings.
    testSet = SimData.simulate2Group(n=n, p=p, effect=effect)

    out = OrderedDict()
    out['p'] = p
    out['k'] = k
    out['train'] = trainSet
    out['test'] = testSet
    out['resubPreds'] = knnFit.predict(trainSet['x'])
    out['resubProbs'] = knnFit.predict_proba(trainSet['x'])
    out['testPreds'] = knnFit.predict(testSet['x'])
    out['testProbs'] = knnFit.predict_proba(testSet['x'])

    # Confusion tables; accuracy = trace / total.
    out['resubTable'] = pd.crosstab(
        Series(out['resubPreds'], index=trainSet['y'].index),
        trainSet['y'],
    )
    out['resubAccuracy'] = (np.sum(np.diag(out['resubTable'])) /
                            (1.0 * np.sum(np.sum(out['resubTable']))))
    out['testTable'] = pd.crosstab(
        Series(out['testPreds'], index=testSet['y'].index),
        testSet['y'],
    )
    out['testAccuracy'] = (np.sum(np.diag(out['testTable'])) /
                           (1.0 * np.sum(np.sum(out['testTable']))))
    return out
def get_knn(input_list, y_train, n_neighbors=301):
    """Build KNN class-probability features without leakage.

    input_list: (X_train, X_test); sparse CSR matrices are densified.
    Training-set features come from 2-fold out-of-fold predictions;
    test-set features come from a model refit on all training data.
    Returns [train_features, test_features], each (n, n_classes).
    """
    X_train, X_test = input_list
    if type(X_train) is sparse.csr.csr_matrix:
        X_train = X_train.toarray()
        X_test = X_test.toarray()

    n_samples = X_train.shape[0]
    n_categs = len(np.unique(y_train))
    # Old sklearn StratifiedKFold API: (labels, n_folds).
    kfolds = StratifiedKFold(y_train, 2)
    X_train_features = np.zeros([n_samples, n_categs])
    knn = KNN(n_neighbors=n_neighbors)

    # Out-of-fold probabilities for the training rows.
    for train, test in kfolds:
        knn.fit(X_train[train, :], y_train[train])
        X_train_features[test, :] = knn.predict_proba(X_train[test, :])

    # Refit on the full training set for the test features.
    knn.fit(X_train, y_train)
    X_test_features = knn.predict_proba(X_test)
    return [X_train_features, X_test_features]
def knn(
    series,
    n_folds,
    clfparams,
    featureparams,
    aggregateparams,
    include,
    exclude,
    save_test_predictions,
    save_oob_predictions,
    skip_cross_validation,
    _run,
):
    """Cross-validate a KNN classifier on the Telstra data.

    Optionally writes out-of-bag predictions and (clipped, renormalised)
    test-set predictions to CSV files named from `series` and a timestamp.
    Returns the multiclass log loss of the out-of-fold predictions, or
    999.0 when cross-validation is skipped.
    """
    data = TelstraData(include=include, exclude=exclude, **featureparams)
    data.features_to_scale.append("location")
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    pred_cols = ["predict_{}".format(i) for i in range(3)]
    best_eps = 1e-15
    if skip_cross_validation:
        loss = 999.0
    else:
        y = data.get_y()
        kf = StratifiedKFold(y.values, n_folds=n_folds, shuffle=True)
        pred = pd.DataFrame(0.0, index=y.index, columns=pred_cols)
        i = 1
        _run.info["loss"] = []
        _run.info["trainloss"] = []
        feature_importances_ = 0
        for itrain, itest in kf:
            Xtr, ytr, Xte, yte = data.get_train_test_features(itrain, itest, **aggregateparams)
            clf = KNeighborsClassifier(**clfparams)
            clf.fit(Xtr, ytr)  # , weights)
            pred.iloc[itest, :] = clf.predict_proba(Xte)
            i += 1

        # Tune the probability-clipping epsilon minimising the log loss.
        def obj(x):
            return multiclass_log_loss(y.values, pred.values, eps=10.0 ** x)

        res = minimize(obj, -2.0)
        best_eps = 10 ** (res.x[0])
        loss = multiclass_log_loss(y, pred.values, eps=best_eps)
        _run.info["best_eps"] = best_eps
        _run.info["features"] = list(Xtr.columns)
    # Optionally save oob predictions
    if save_oob_predictions:
        filename = "{}_{}.csv".format(series, time)
        pred.to_csv(filename, index_label="id")
    # Optionally generate test predictions
    if save_test_predictions:
        filename = "{}_test_{}.csv".format(series, time)
        Xtr, ytr, Xte, yte = data.get_train_test_features(**aggregateparams)
        clf = KNeighborsClassifier(**clfparams)
        clf.fit(Xtr, ytr)  # ,weights)
        predtest = pd.DataFrame(clf.predict_proba(Xte), index=yte.index, columns=pred_cols)
        # BUG FIX: was np.clip(y_pred, ...) -- `y_pred` is undefined in this
        # function (NameError); the intent is to clip predtest itself.
        predtest = np.clip(predtest, best_eps, 1 - best_eps)
        # Renormalise each row so the clipped probabilities sum to 1.
        predtest /= predtest.values.sum(axis=1)[:, np.newaxis]
        predtest.to_csv(filename, index_label="id")
    return loss
class KNeighborsClassifierStep(Step):
    """Pipeline step wrapping sklearn's KNeighborsClassifier.

    fit_transform() trains on the svmlight file at self._input_path and
    writes the resubstitution class probabilities to self._output_path;
    transform()/predict() emit probabilities for new data.
    """

    def __init__(self, n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30):
        super(KNeighborsClassifierStep, self).__init__()
        self._model = None
        self._n_neighbors = n_neighbors
        self._weights = weights
        self._algorithm = algorithm
        self._leaf_size = leaf_size

    def fit_transform(self):
        """Fit the model on the training file and save its training-set
        class probabilities to the output path."""
        self._model = KNeighborsClassifier(n_neighbors=self._n_neighbors,
                                           weights=self._weights,
                                           algorithm=self._algorithm,
                                           leaf_size=self._leaf_size)
        x, y = load_svmlight(self._input_path)
        self._model.fit(x, y)
        scores = self._model.predict_proba(x)
        save_numpy_txt(scores, self._output_path)

    def transform(self, x=None):
        """Return class probabilities for `x`; when x is None, load the
        test input file, predict, and also save the result.

        BUG FIX: the original guard was `if not x:`, which is ambiguous
        (raises ValueError) for multi-element numpy arrays and sparse
        matrices; an explicit None check expresses the real intent.
        """
        if x is None:
            x, _ = load_svmlight(self._test_input_path)
            transformed_x = self._model.predict_proba(x)
            save_numpy_txt(transformed_x, self._output_path)
        else:
            transformed_x = self._model.predict_proba(x)
        return transformed_x

    def predict(self, features):
        """Return class probabilities for the given feature matrix."""
        return self._model.predict_proba(features)
def main(): """ Fit models and make predictions. We'll use one-hot encoding to transform our categorical features into binary features. y and X will be numpy array objects. """ model = KNeighborsClassifier(n=20) # the classifier we'll use # === load data in memory === # print "loading data" y, X = load_data('train.csv') y_test, X_test = load_data('test.csv', use_labels=False) # === one-hot encoding === # # we want to encode the category IDs encountered both in # the training and the test set, so we fit the encoder on both encoder = preprocessing.OneHotEncoder() encoder.fit(np.vstack((X, X_test))) X = encoder.transform(X) # Returns a sparse matrix (see numpy.sparse) X_test = encoder.transform(X_test) # if you want to create new features, you'll need to compute them # before the encoding, and append them to your dataset after # === training & metrics === # mean_auc = 0.0 n = 10 # repeat the CV procedure 10 times to get more precise results for i in range(n): # for each iteration, randomly hold out 20% of the data as CV set X_train, X_cv, y_train, y_cv = cross_validation.train_test_split( X, y, test_size=.20, random_state=i*SEED) # if you want to perform feature selection / hyperparameter # optimization, this is where you want to do it # train model and make predictions model.fit(X_train, y_train) preds = model.predict_proba(X_cv)[:, 1] # compute AUC metric for this CV fold fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds) roc_auc = metrics.auc(fpr, tpr) print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc) mean_auc += roc_auc print "Mean AUC: %f" % (mean_auc/n) # === Predictions === # # When making predictions, retrain the model on the whole training set model.fit(X, y) preds = model.predict_proba(X_test)[:, 1] filename = raw_input("Enter name for submission file: ") save_results(preds, filename + ".csv")
def knn(train_data,train_label,val_data,val_label,test_data,name = "knn_submission.csv"): print "Start training KNN Classifier..." knnClf = KNeighborsClassifier(n_neighbors=20) knnClf.fit(train_data,train_label) #evaluate on validation set val_pred_label = knnClf.predict_proba(val_data) logloss = preprocess.evaluation(val_label,val_pred_label) print "logloss of validation set:",logloss print "Start classify test set..." test_label = knnClf.predict_proba(test_data) preprocess.saveResult(test_label,filename = name)
def process_one_cell(df_cell_train, df_cell_test):
    """Classify one grid cell: drop rare places, rescale x/y, and return
    the top n_topx predicted place labels plus the test row ids."""
    # Keep only places seen at least 5 times in this cell.
    place_counts = df_cell_train.place_id.value_counts()
    keep = (place_counts[df_cell_train.place_id.values] >= 5).values
    df_cell_train = df_cell_train.loc[keep]

    row_ids = df_cell_test.index

    # Feature engineering: weight the coordinates (tuned constants;
    # ct = 5.3 scored 5.1282 per the original note).
    df_cell_train.loc[:, 'x'] *= 462.0
    df_cell_train.loc[:, 'y'] *= 975.0
    df_cell_test.loc[:, 'x'] *= 462.0
    df_cell_test.loc[:, 'y'] *= 975.0

    # Encode place ids as contiguous integers for the classifier.
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values

    # k grows with cell population: floor(sqrt(n) / 5.2).
    clf = KNeighborsClassifier(
        n_neighbors=np.floor(np.sqrt(y.size) / 5.2).astype(int),
        weights=calculate_distance, metric='manhattan', n_jobs=2)
    clf.fit(X, y)
    probs = clf.predict_proba(df_cell_test.values)
    pred_labels = le.inverse_transform(np.argsort(probs, axis=1)[:, ::-1][:, :n_topx])
    return pred_labels, row_ids
def process_one_cell(df_cell_train, df_cell_test):
    """Classify one grid cell with a fixed-k (36) Manhattan KNN and
    return the top-3 predicted place labels plus the test row ids."""
    # Keep only places seen at least 8 times in this cell.
    place_counts = df_cell_train.place_id.value_counts()
    keep = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[keep]

    row_ids = df_cell_test.index

    # Coordinate weighting (tuned constants).
    df_cell_train.loc[:, 'x'] *= 500.0
    df_cell_train.loc[:, 'y'] *= 1000.0
    df_cell_test.loc[:, 'x'] *= 500.0
    df_cell_test.loc[:, 'y'] *= 1000.0

    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    clf = KNeighborsClassifier(n_neighbors=36,
                               weights=calculate_distance,
                               metric='manhattan')
    clf.fit(X, y)
    probs = clf.predict_proba(X_test)
    # Decode back to place ids, most probable first.
    pred_labels = le.inverse_transform(np.argsort(probs, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
def main_process(): data_dict = parse_txt() x_data, y_data, places_cnt, path_int_dict = build_x_y_data(data_dict) print 'data counts', len(x_data), len(y_data) print 'zone names counts', places_cnt print 'path counts', len(path_int_dict) # start to train, change list type to numpy.array x_data = np.array(x_data) y_data = np.array(y_data) knn = KNeighborsClassifier() indices = np.random.permutation(len(x_data)) x_train = x_data y_train = y_data x_test = x_data[indices[-TEST_DATA_ROWS:]] y_test = y_data[indices[-TEST_DATA_ROWS:]] knn.fit(x_train, y_train) # work test_result = knn.predict(x_test) # test proba_test_result = knn.predict_proba(x_test) # no duplicate value, so reverse this dictionary int_path_dict = dict(zip(path_int_dict.values(), path_int_dict.keys())) print 'predict result:', test_result print [int_path_dict[x] for x in test_result] # test result
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.

    Restricts both frames to `grid_id`, drops places occurring fewer
    than `th` times, fits a distance-weighted Manhattan KNN, and returns
    the top-3 predicted place ids plus the test row index.
    """
    # Training rows for this cell, with rare places removed.
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    counts = df_cell_train.place_id.value_counts()
    frequent = (counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[frequent]

    # Test rows for this cell.
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    clf = KNeighborsClassifier(n_neighbors=conf['neighbours'],
                               weights='distance', metric='manhattan')
    clf.fit(X, y)
    probs = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(probs, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
def process_1_grid(df_train, df_test, grid, threshold):
    """Fit a 20-NN (Manhattan, distance-weighted) classifier on one grid
    cell and return the top-3 predicted place labels and test row index."""
    # Slice out the current grid cell from both frames.
    df_train_1_grid = df_train.loc[df_train.grid_num == grid]
    df_test_1_grid = df_test.loc[df_test.grid_num == grid]

    # Keep only places that occur at least `threshold` times.
    counts = df_train_1_grid.place_id.value_counts()
    frequent = (counts[df_train_1_grid.place_id.values] >= threshold).values
    df_train_1_grid = df_train_1_grid.loc[frequent]

    # Encode place ids as contiguous integer labels.
    le = LabelEncoder()
    labels = le.fit_transform(df_train_1_grid.place_id.values)

    X = df_train_1_grid.drop(['place_id', 'grid_num'], axis=1).values.astype(int)
    X_test = df_test_1_grid.drop(['grid_num'], axis=1).values.astype(int)
    row_id = df_test_1_grid.index

    clf = KNeighborsClassifier(n_neighbors=20, weights='distance', metric='manhattan')
    clf.fit(X, labels)
    prob_y = clf.predict_proba(X_test)
    # Decode back to place ids, most probable first.
    pred_labels = le.inverse_transform(np.argsort(prob_y, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_id
class PatternBasedDiagnosis:
    """
    Pattern-based diagnosis.

    NOTE(review): the original docstring said "with Decision Tree", but
    train() actually fits a k-nearest-neighbours classifier.
    """

    # Only attribute: the fitted classifier, set lazily by train().
    __slots__ = [
        "model"
    ]

    def __init__(self):
        # Nothing to initialise here; `model` is created in train().
        # NOTE(review): calling eval() before train() raises AttributeError.
        pass

    def train(self, data, labels):
        """
        Fit a 3-NN (ball-tree) classifier on the training data.
        :param data: feature matrix
        :param labels: target labels
        :return: None (the fitted model is stored on self.model)
        """
        print('Training Data: %s' % (data))
        print('Training Labels: %s' % (labels))
        self.model = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree')
        # fit() returns the estimator itself, so this reassignment is a no-op.
        self.model = self.model.fit(data, labels)

    def eval(self, obs):
        """Print the predicted label(s) and class probabilities for obs."""
        print('Testing Result: %s; %s' % (self.model.predict(obs), self.model.predict_proba(obs)))
        # print('Testing Result: %s' % self.model.predict(obs))
def test(self):
    """Hold out 60% of the data, fit a 20-NN classifier, and print CV AUC,
    a classification report, accuracy, log loss and ROC AUC.

    Reads self.dataMat / self.labelMat; prints results and returns None.
    """
    X, y = self.dataMat,self.labelMat
    # NOTE: test_size=0.6 keeps only 40% of the rows for training.
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.6, random_state=12)
    #clf = RandomForestClassifier(max_depth=6,min_samples_split=9,min_samples_leaf=15,n_estimators=5)
    #clf = DBN([X.shape[1], 24, 2],scales=0.5,learn_rates=0.02,learn_rate_decays = 0.95, learn_rate_minimums =0.001,epochs=500,l2_costs = 0.02*0.031, dropouts=0.2,verbose=0)
    #cvnum = ShuffleSplit(2013,n_iter=10,test_size=0.6,train_size=0.4,random_state=0)
    print "****************************************************************"
    #clf = GaussianNB()
    #clf = LDA()
    # NOTE(review): rbm, logistic and lasso are constructed but never used
    # (the Pipeline below is commented out); kept as experimental leftovers.
    rbm = BernoulliRBM(batch_size=0, learning_rate=0.1, n_components=12, n_iter=100,random_state=None, verbose=0)
    clf= KNeighborsClassifier(n_neighbors=20, algorithm='auto',leaf_size=30)
    #clf = linear_model.LogisticRegression(C=1e2)
    logistic = linear_model.LogisticRegression(C=100)
    lasso = linear_model.Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000,normalize=False, positive=False, precompute=False, random_state=None,selection='cyclic', tol=0.0001, warm_start=False)
    #classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
    clf.fit(X_train, y_train);
    # 3-fold cross-validated AUC on the full data set (independent of the
    # holdout split above).
    scores = cross_val_score(clf,X,y,cv=3,scoring='roc_auc')
    y_pred = clf.predict(X_test);
    y_predprob = clf.predict_proba(X_test);
    prf=precision_recall_fscore_support(y_test, y_pred, average='binary')
    print ("Accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std() * 2))
    print classification_report(y_test,y_pred)
    print 'The accuracy is: ', accuracy_score(y_test,y_pred)
    print 'The log loss is:', log_loss(y_test, y_predprob)
    print 'The ROC score is:', roc_auc_score(y_test,y_predprob[:,1])
class Predictor(object):
    """Predicts which skill to run from skill-usage data and, when the
    prediction is confident enough, triggers it via the slack bot."""

    def __init__(self, n_neighbors=8, slackbot=None):
        # Distance-weighted KNN, fitted once at construction time.
        self.knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights="distance")
        skill_data = SkillData()
        data_X, data_y = SkillDataLoader().make_data_set(skill_data.q)
        self.knn.fit(data_X, data_y)
        self.slackbot = SlackerAdapter() if slackbot is None else slackbot

    def predict_skill(self):
        """Predict a skill; run it when confidence >= 0.85, otherwise fall
        back to reminding an idea."""
        test_x = SkillDataLoader().make_X()
        predict = self.knn.predict(test_x)[0]
        confidence = max(self.knn.predict_proba(test_x)[0])

        description = " ".join(Skill.classes[predict][0])
        func_name = Skill.classes[predict][1]

        if confidence < 0.85:
            # Not confident enough to fire the skill automatically.
            functions = Functions(self.slackbot)
            functions.remind_idea()
            return

        runner = FunctionRunner()
        params = runner.filter_f_params(description, func_name)
        self.slackbot.send_message(
            text=MsgResource.PREDICT_RESULT(description=description)
        )
        runner.load_function(func_name=func_name, params=params, day_of_week=[0])
def knn_predict(X_train, y_train, X_test, k=20):
    """Return P(class == 1) for X_test from a distance-weighted,
    brute-force cosine-metric KNN fitted on (X_train, y_train)."""
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier(n_neighbors=k, algorithm='brute',
                                 weights='distance', metric='cosine')
    model.fit(X_train, y_train)
    # Column 1 holds the positive-class probability.
    return model.predict_proba(X_test)[:, 1]
def train_data(): x_data, y_data, zone_cnt, zone_int_dict = get_x_y_data() knn = KNeighborsClassifier() indices = np.random.permutation(len(x_data)) x_train = x_data y_train = y_data x_test = x_data[indices[-TEST_DATA_ROWS:]] y_test = y_data[indices[-TEST_DATA_ROWS:]] knn.fit(x_train, y_train) # start training print 'training data count:', len(indices), ' number of zones:', zone_cnt test_result = knn.predict(x_test) # test prob_test_result = knn.predict_proba(x_test) print prob_test_result # no duplicate value, so reverse this dictionary int_zone_dict = dict(zip(zone_int_dict.values(), zone_int_dict.keys())) print 'predict result:', test_result, [int_zone_dict[x] for x in test_result] # test result print 'ground truth:', y_test, [int_zone_dict[x] for x in y_test] # ground truth cnt = 0 for i in range(TEST_DATA_ROWS): if test_result[i] == y_test[i]: cnt += 1 print 'accurate rate', cnt * 1.0 / TEST_DATA_ROWS from sklearn.cross_validation import cross_val_score print cross_val_score(knn, x_train, y_train)
def process_one_cell(cell_train, cell_test, fw, th, n_neighbors):
    """Classify one cell of numpy data; return rows of (row_id, top-3 labels).

    NOTE: `fw` is unused and `n_neighbors` is overwritten below (k is
    derived from the cell population); both are kept for interface
    compatibility with the caller.
    """
    cell_train = remove_infrequent_places(cell_train, th)

    # Last column of cell_test carries the row ids.
    row_ids = cell_test[:, -1].flatten().astype(np.int32)
    cell_test = cell_test[:, :-1]

    # Last column of cell_train carries the place labels.
    y = cell_train[:, -1].flatten().astype(np.int64)
    X = cell_train[:, :-1]

    # k scales with sqrt of the cell population (tuned constant 5.8),
    # deliberately overriding the n_neighbors argument.
    cte = 5.8
    n_neighbors = int((y.size ** 0.5) / cte)
    clf = KNeighborsClassifier(n_neighbors=n_neighbors,
                               weights=calculate_distance,
                               p=1, n_jobs=2, leaf_size=15)
    clf.fit(X, y)
    probs = clf.predict_proba(cell_test)
    # Indices of the three highest-probability classes, best first.
    top3 = np.argsort(probs, axis=1)[:, :-4:-1]
    pred_labels = clf.classes_[top3]
    return np.column_stack((row_ids, pred_labels)).astype(np.int64)
def process_one_cell(df_cell_train, df_cell_test):
    """Drop rare places, rescale x/y, fit a p=1 (Manhattan) KNN, and
    return rows of (row_id, top-3 predicted place ids)."""
    # Keep places that appear at least 5 times in this cell.
    counts = df_cell_train.place_id.value_counts()
    keep = (counts[df_cell_train.place_id.values] >= 5).values
    df_cell_train = df_cell_train.loc[keep].copy()

    # Coordinate weighting (tuned constants).
    df_cell_train['x'] = df_cell_train['x'] * 22
    df_cell_train['y'] = df_cell_train['y'] * 52
    df_cell_test['x'] = df_cell_test['x'] * 22
    df_cell_test['y'] = df_cell_test['y'] * 52

    row_ids = df_cell_test.index
    y = df_cell_train.place_id.values
    X = df_cell_train.drop(['place_id'], axis=1).values

    # k = floor(sqrt(n) / 5.83), growing with cell population.
    clf = KNeighborsClassifier(
        n_neighbors=np.floor(np.sqrt(y.size) / 5.83).astype(int),
        weights=calculate_distance, p=1, n_jobs=2, leaf_size=20)
    clf.fit(X, y)
    probs = clf.predict_proba(df_cell_test.values)
    # Indices of the three highest-probability classes, best first.
    top3 = np.argsort(probs, axis=1)[:, :-4:-1]
    pred_labels = clf.classes_[top3]
    return np.column_stack((row_ids, pred_labels)).astype(np.int64)
def knnSimulate(param, nFold=5):
    """Simulate 2-group train/test data, estimate KNN accuracy by
    nFold-fold cross-validation, and report test-set performance.

    param: dict with keys 'n', 'p', 'effect', 'k'.
    Returns an OrderedDict with data, predictions and accuracies.
    """
    n, p = int(param['n']), int(param['p'])
    effect = [param['effect']] * p

    trainSet = SimData.simulate2Group(n=n, p=p, effect=effect)
    knnClass = KNeighborsClassifier(n_neighbors=int(param['k']))
    # Cross-validated accuracy estimated on the training data.
    cvAccs = cross_val_score(estimator=knnClass,
                             X=np.array(trainSet['x']),
                             y=np.array(trainSet['y']),
                             cv=nFold)
    knnClass.fit(np.array(trainSet['x']), np.array(trainSet['y']))
    # Independent test set from the same simulation settings.
    testSet = SimData.simulate2Group(n=n, p=p, effect=effect)

    out = OrderedDict()
    out['p'] = param['p']
    out['k'] = param['k']
    out['train'] = trainSet
    out['test'] = testSet
    out['testPreds'] = knnClass.predict(testSet['x'])
    out['testProbs'] = knnClass.predict_proba(testSet['x'])
    out['cvAccuracy'] = np.mean(cvAccs)
    out['testTable'] = pandas.crosstab(
        Series(out['testPreds'], index=testSet['y'].index),
        testSet['y'],
    )
    # Accuracy = trace of the confusion table / total count.
    out['testAccuracy'] = (np.sum(np.diag(out['testTable'])) /
                           (1.0 * np.sum(np.sum(out['testTable']))))
    return out
def onstartButton(self):
    """Start/stop handler for the video-file face-recognition loop.

    Trains a 20-NN (distance-weighted, Euclidean) classifier on the
    OpenFace features in openface_fea.mat, then reads frames from the
    selected file, recognises the face in each frame, and updates the
    name/time/confidence widgets and the preview label.
    """
    cap = cv2.VideoCapture(str(self.file_name))
    # Only start when a file is selected and no run is in progress.
    if self.isfileWorking == False and self.ishasFile == True:
        self.ishasFile = False
        self.startButton.setText("Close")
        # cap = cv2.VideoCapture(str(self.file_name))
        self.isfileWorking = True
        # Load precomputed OpenFace embeddings and integer identity labels.
        data=spio.loadmat("openface_fea.mat")
        X=data['feature']
        id=data['id'].astype(int)-1  # NOTE: shadows the builtin `id`
        Y=id[0,:]
        # Sorted unique names; assumed aligned with the label encoding --
        # TODO confirm this matches how `id` was assigned.
        name=list(set(data['names']))
        name.sort()
        print("***Train knn classifier***")
        knn=KNeighborsClassifier(n_neighbors=20,weights='distance',p=2)
        knn.fit(X,Y)
        success,frame = cap.read()
        while success and self.isfileWorking :
            start=time.time()
            success, frame = cap.read()
            if success:
                img=frame.copy()
                # Detect the face bounding box and its 128-d embedding.
                bb,rep=getRep(img)
                if bb is None:
                    print "Can't find any face in this picture"
                else:
                    # NOTE(review): identity comparison `rep is 0` only
                    # works for the small-int cache; presumably getRep
                    # returns the literal 0 on failure -- confirm.
                    if rep is 0:
                        print "Get rep failed..."
                    else:
                        rep=np.reshape(rep,(1,128))
                        idx=knn.predict(rep)
                        # print("label is {} ".format(idx))
                        proba=knn.predict_proba(rep)
                        # NOTE(review): idx is a length-1 array used as a
                        # list index; relies on numpy accepting it -- verify.
                        actor=name[idx]
                        self.namelineEdit.setText(actor)
                        self.timelineEdit.setText(str(round(time.time()-start,3)))
                        self.confidencelineEdit.setText(str(round(max(proba[0]),2)))
                        # print("Proba is {} ".format(proba))
                        draw_dlib_rects(frame,bb,actor,(0,255,0))
                # Convert BGR frame to a Qt pixmap for display.
                image = QtGui.QImage(frame.data, frame.shape[1], frame.shape[0], QtGui.QImage.Format_RGB888).rgbSwapped()
                pixmap = QtGui.QPixmap.fromImage(image)
                self.showlabel.setPixmap(pixmap)
                k = cv2.waitKey(5)
            else:
                # End of stream: reset UI state and release the capture.
                self.ishasFile = False
                self.startButton.setText("Start")
                self.isfileWorking = False
                cap.release()
                self.showlabel.clear()
class NearestNeighborsPredictor(PredictorBase): ''' Uses k-nearest neighbors. ''' def __init__(self, animal_type): self.animal_type = animal_type if self.animal_type == "Cat": args = {'n_neighbors': 20} elif self.animal_type == "Dog": args = {'n_neighbors': 40} else: raise RuntimeError("Incorrect animal type") self.clf = KNeighborsClassifier(**args) def fit(self, X_train, y_train): self.clf.fit(X_train, y_train) def predict(self, X_test): predictions = self.clf.predict_proba(X_test) predictions_df = self.bundle_predictions(predictions) return predictions_df def find_best_params(self): parameters = {'n_neighbors': [5, 10, 20, 40, 60]} knn = KNeighborsClassifier() clf = grid_search.GridSearchCV(knn, parameters) train_data = get_data('../data/train.csv') train_data = select_features(train_data, self.animal_type) X = train_data.drop(['OutcomeType'], axis=1) y = train_data['OutcomeType'] clf.fit(X, y) print clf.best_params_
def performance(x_train, y_train, x_test, y_test, algorithm, n_neighbors=None,
                n_estimators=None, max_features=None, kernel=None, C=None,
                gamma=None, degree=None, coef0=None):
    """Fit the requested classifier and return its test-set ROC AUC.

    algorithm: one of 'k-nn', 'SVM', 'naive-bayes', 'random-forest';
    the remaining keyword arguments parameterize the chosen model.
    Raises ValueError for an unknown algorithm name.
    """
    # fit the model
    if algorithm == 'k-nn':
        model = KNeighborsClassifier(n_neighbors=int(n_neighbors))
        model.fit(x_train, y_train)
    elif algorithm == 'SVM':
        model = train_svm(x_train, y_train, kernel, C, gamma, degree, coef0)
    elif algorithm == 'naive-bayes':
        model = GaussianNB()
        model.fit(x_train, y_train)
    elif algorithm == 'random-forest':
        model = RandomForestClassifier(n_estimators=int(n_estimators),
                                       max_features=int(max_features))
        model.fit(x_train, y_train)
    else:
        # BUG FIX: `ArgumentError` is not a builtin and was never defined,
        # so the original raised NameError here instead of a useful error.
        raise ValueError('Unknown algorithm: %s' % algorithm)

    # predict the test set: SVMs score via the decision function, the
    # others via the positive-class probability.
    if algorithm == 'SVM':
        predictions = model.decision_function(x_test)
    else:
        predictions = model.predict_proba(x_test)[:, 1]

    return optunity.metrics.roc_auc(y_test, predictions, positive=True)
def main(): # trainFeature = genfromtxt('trainF2.csv', delimiter=',') # trainLabel = genfromtxt('trainLabel100.csv', delimiter='\n') # testFeature = genfromtxt('test2Feature.csv', delimiter=',') trainFeature = genfromtxt('trainFeature.csv', delimiter=',')[0::5] trainLabel = genfromtxt('trainLabel.csv', delimiter='\n')[0::5] testFeature = genfromtxt('testFeature.csv', delimiter=',') time_start = time.clock() clf = KNeighborsClassifier(n_jobs=2) clf.fit(trainFeature, trainLabel) time_elapsed = (time.clock() - time_start) print "build model time = "+str(time_elapsed) time_start = time.clock() dec = clf.predict_proba(testFeature) time_elapsed = (time.clock() - time_start) print "predict time = "+str(time_elapsed) header = "Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS" print dec.shape[1] fmt=['%d'] + ['%1.4f'] * dec.shape[1] ind = [i for i in xrange(0,len(dec))] dec = insert(dec, 0, ind, axis=1) savetxt("predict_KNN.csv", dec, delimiter=",", header=header, fmt=fmt, comments="")
def PredictKNNRecent(day, neighbors):
    """Predict game win probabilities for `day` with a recent-form KNN
    model and store them in the NBA SQLite database."""
    conn = sqlite3.connect('SQL/NBA.db')
    c = conn.cursor()

    # Too little history in the season's first days: default to 0.5.
    skipDays = ['2015-10-27', '2015-10-28', '2015-10-29', '2015-10-30',
                '2015-10-31', '2015-11-01', '2015-11-02', '2015-11-03',
                '2015-11-04', '2015-11-05']
    if day in skipDays:
        probs = [0.5] * 15
    else:
        trainX, trainY, testX = BuildKNNRecent(day, c)
        trainX, testX = Standardize(trainX, testX)
        # Uniform-weight Euclidean KNN (minkowski with p=2).
        model = KNeighborsClassifier(n_neighbors=neighbors, weights='uniform',
                                     metric='minkowski', p=2)
        model.fit(trainX, trainY)
        probs = model.predict_proba(testX)[:, 1]

    updatePrediction = 'UPDATE Game_Preds SET KNN = ? WHERE Game_ID = ?'
    updateSchedule = 'UPDATE Game_Schedule_2015 SET KNN = ? WHERE Game_ID = ?'
    SaveProbabilities(day, probs, updatePrediction, updateSchedule, c)
    conn.commit()
    conn.close()
def process_one_cell(df_cell_train, df_cell_test, fw, th, n_neighbors):
    """Classify one cell; return rows of (row_id, top-3 place ids).

    NOTE: `fw` and `n_neighbors` are unused here (k is derived from the
    cell population); both are kept for interface compatibility.
    """
    df_cell_train = remove_infrequent_places(df_cell_train, th).copy()
    row_ids = df_cell_test.index

    y = df_cell_train.place_id.values
    X = df_cell_train.drop(['place_id'], axis=1).values

    # k = floor(sqrt(n) / 5.8); small leaf size for faster queries.
    cte = 5.8
    lsize = 12
    clf = KNeighborsClassifier(
        n_neighbors=np.floor(np.sqrt(y.size) / cte).astype(int),
        weights=calculate_distance, p=1, n_jobs=2, leaf_size=lsize)
    clf.fit(X, y)
    probs = clf.predict_proba(df_cell_test.values)
    # Indices of the three highest-probability classes, best first.
    top3 = np.argsort(probs, axis=1)[:, :-4:-1]
    pred_labels = clf.classes_[top3]
    return np.column_stack((row_ids, pred_labels)).astype(np.int64)
def DecisionTreeClassifier(TrainData): features=['Month','Date','Year'] season=['Fall','Spring','Summer','Winter'] district=['BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION','NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN'] days=['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday','Wednesday'] time=['first','second','third'] features2 = [x for x in range(0,24)] Minute=[x for x in range(100,160)] latitude=[x for x in range(948,964)] longitude=[x for x in range(2070,2083)] features=district+Minute+features2+season+time train,validation= train_test_split(TrainData, test_size=0.4) knn = KNeighborsClassifier() knn.fit(train[features],train['Category']) KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',metric_params=None, n_jobs=1, n_neighbors=5, p=2,weights='uniform',multilabel=True) predicted=np.array(knn.predict_proba(validation[features])) model=knn.predict(validation[features]) model1=knn.predict(train[features]) print "Precision is ",precision_score(validation['Category'].values.tolist(),model,average='macro') print "Recall is ",recall_score(validation['Category'].values.tolist(),model,average='macro') print "Accuracy is ", accuracy_score(validation['Category'].values.tolist(),model) print "Training Accuracy is ", accuracy_score(train['Category'].values.tolist(),model1) result=pd.DataFrame(predicted, columns=le_crime.classes_) result['Predicted']=model result.to_csv('knnProbabilities.csv', index = True, index_label = 'Id' )
def KNN(x_train,y_train,x_test, udf_kneighbors=100, do_CV=False): from sklearn.neighbors import KNeighborsClassifier from sklearn.cross_validation import train_test_split from sklearn.metrics import roc_auc_score ### variables may be in different scales, so mean standardize the variables ### ### Mean Normalize variables before regression ### from sklearn.preprocessing import StandardScaler ss=StandardScaler() x_train=ss.fit_transform(x_train) x_test=ss.fit_transform(x_test) neigh=KNeighborsClassifier(weights='distance') if do_CV: k_list=[25,125,255,387] #important to have odd numbers ### Try different parameters of K for optimal value ### ### Randomly divide training set into 80/20 split ### cv_score=list() for k in k_list: neigh.n_neighbors=k x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(x_train,y_train,test_size=0.20, random_state=42) neigh.fit(x_train_cv,y_train_cv) y_pred=neigh.predict_proba(x_test_cv)[:,1] cv_score.append(roc_auc_score(y_test_cv,y_pred)) neigh.fit(x_train,y_train) y_pred=neigh.predict_proba(x_test)[:,1] print 'Cross Validation KNN Results........' print 'Parameters, CV_Scores' for i in range(len(cv_score)): print k_list[i], cv_score[i] else: print 'Making Prediction with optimal K neighbors...' neigh.n_neighbors=udf_kneighbors neigh.fit(x_train,y_train) y_pred=neigh.predict_proba(x_test)[:,1] print 'Writing submission file....' with open('KNN_Submission.csv','wb') as testfile: w=csv.writer(testfile) w.writerow(('Id','Probability')) for i in range(len(y_pred)): w.writerow(((i+1),y_pred[i])) testfile.close() print 'File written to disk...'
def do_knn(which=''):
    """Train a default KNN on the draft split for `which` and return the
    positive-class probabilities for the test rows."""
    (trX, trY, teX) = draft(which)
    (trX, teX) = normalize(trX, teX)
    # BUG FIX: KNeighborsClassifier(probabilities=True) raised TypeError --
    # `probabilities` is not a KNeighborsClassifier parameter (that flag
    # belongs to SVC); KNN supports predict_proba unconditionally.
    clf = KNeighborsClassifier()
    clf.fit(trX, trY)
    teY = clf.predict_proba(teX)[:, 1]
    return teY
def knn_solver(train_data, train_label, validation, test, dimreduce, convertbinary):
    """Reduce dimensionality, fit a 10-NN (Minkowski p=3) classifier,
    report validation AUC, and return positive-class test probabilities.

    `convertbinary` is accepted for interface compatibility but unused
    (its call is commented out below).
    """
    logging.info('begin to train the knn classifier')
    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test = dimreduce(train_data, train_label, validation, test)
    # train_data, validation, test = convertbinary(train_data, validation, test)

    model = KNeighborsClassifier(algorithm='auto', n_neighbors=10, p=3)
    model.fit(train_data, train_label)
    # Validation AUC is logged/printed by the helper.
    tools.get_auc(model.predict_proba(validation)[:, 1])
    return model.predict_proba(test)[:, 1]
def setTrainDataAndMakeModel(X_train, Y_train, X_test):
    """Fit a 128-NN (Euclidean, uniform-weight) classifier and return the
    class-probability matrix for X_test.

    No calibration (CalibratedClassifierCV) or bagging is applied on top
    of the raw KNN.
    """
    model = KNeighborsClassifier(algorithm='auto', leaf_size=50,
                                 metric='euclidean', metric_params=None,
                                 n_jobs=1, n_neighbors=128, p=2,
                                 weights='uniform')
    model.fit(X_train, Y_train)
    return model.predict_proba(X_test)
# In[19]: df4 = pd.DataFrame({ "Prediction": predictions, "Actual": y_test }).reset_index(drop=True) # In[20]: df4.head(50) # # Passing Real Time Feature Data for Testing on the Model. # In[ ]: inputs = {'country_India': 1, 'gender_male': 1, 'activity_Agriculture': 1} test = pd.Series(index=df2.columns) for key in inputs.keys(): test[key] = inputs[key] test.fillna(0, inplace=True) # In[ ]: test1 = test.drop(['status', 'loan_amount', 'funded_amount']) predictions = knn.predict_proba(test1.values.reshape(1, -1)) print(predictions)
# Ordinary least squares of Height on Weight (no intercept column added).
simple_reg_model_y = sm.OLS(Height, Weight).fit()
simple_reg_model_y.summary()

import pandas as pd

# Small demo DataFrame; drop() returns a copy, `data` itself is unchanged.
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['a', 'b', 'c', 'd'])
print(data)
data.drop('b', axis=1)

import scikitplot as skplt
import matplotlib.pyplot as plt

# ROC curves from the fitted KNN model (knn, Xtest, ytest defined earlier).
skplt.metrics.plot_roc(ytest, knn.predict_proba(Xtest))
plt.show()

import seaborn as sns
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes baseline with light smoothing (alpha=0.1).
mnb = MultinomialNB(alpha=0.1).fit(Xtrain, ytrain)
predict = mnb.predict(Xtest)
print("Test Accuracy Score is : %f" % mnb.score(Xtest, ytest))
print(mnb)

# generate confusion_matrix to check misclassification
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest, predict)
def main():
    """k-NN demo on the 3-class iris data.

    Fits k=1/5/10 classifiers on two standardized features, plots the
    sample scatter and decision regions, prints test accuracy per k, and
    shows per-sample class probabilities as bar charts.
    """
    print("Enter main()")
    #==================================================================================
    # Non-linear separation with the k-NN rule (iris data, 3 classes)
    #==================================================================================

    #====================================================
    # Pre Process
    #====================================================
    #----------------------------------------------------
    # read & set data
    #----------------------------------------------------
    print("reading data")
    # Load the iris dataset bundled with scikit-learn.
    iris = datasets.load_iris()
    # Use only feature columns 2 and 3 (petal length / petal width).
    dat_X = iris.data[:, [2, 3]]
    # Class labels (teacher signal).
    dat_y = iris.target
    # Labels are stored as integers, as most ML libraries expect.
    print('Class labels:', numpy.unique(dat_y))
    print("finishing reading data")

    #---------------------------------------------------------------------
    # Hold out data so the trained models can be evaluated on unseen
    # samples: 70% training / 30% test split.
    #---------------------------------------------------------------------
    train_test = train_test_split(  # returns [X_train, X_test, y_train, y_test]
        dat_X, dat_y,
        test_size=0.3,    # fraction in [0.0, 1.0]
        random_state=0    # fixed seed for reproducibility
    )
    X_train = train_test[0]
    X_test = train_test[1]
    y_train = train_test[2]
    y_test = train_test[3]

    #----------------------------------------------------------------------------------------------------
    # Standardize features with scikit-learn's StandardScaler.
    #----------------------------------------------------------------------------------------------------
    stdScaler = StandardScaler()
    # Compute mean and standard deviation on the training split only.
    stdScaler.fit(X_train)
    # Apply the same transform to both splits.
    X_train_std = stdScaler.transform(X_train)
    X_test_std = stdScaler.transform(X_test)

    # Recombine the splits row-wise (used later for the region plots).
    X_combined_std = numpy.vstack((X_train_std, X_test_std))
    y_combined = numpy.hstack((y_train, y_test))

    # Normalize the full feature matrix (used for the scatter plot).
    dat_X_std = numpy.copy(dat_X)  # deep copy, not a reference
    dat_X_std[:, 0] = (dat_X[:, 0] - dat_X[:, 0].mean()) / dat_X[:, 0].std()
    dat_X_std[:, 1] = (dat_X[:, 1] - dat_X[:, 1].mean()) / dat_X[:, 1].std()

    #====================================================
    # Learning Process
    #====================================================
    # classifier1 : kNN1 (k=1)
    kNN1 = KNeighborsClassifier(
        n_neighbors=1,      # the k of the k-NN rule
        p=2,                # euclidean_distance (l2) for p = 2
        metric='minkowski'  # Minkowski distance
    )
    kNN1.fit(X_train_std, y_train)

    # classifier2 : kNN2 (k=5)
    kNN2 = KNeighborsClassifier(
        n_neighbors=5,
        p=2,
        metric='minkowski',
    )
    kNN2.fit(X_train_std, y_train)

    # classifier3 : kNN3 (k=10)
    kNN3 = KNeighborsClassifier(
        n_neighbors=10,
        p=2,
        metric='minkowski',
    )
    kNN3.fit(X_train_std, y_train)

    #====================================================
    # Generalization-performance evaluation
    #====================================================
    #-------------------------------
    # Scatter plot of the samples
    #-------------------------------
    # plt.subplot(rows, cols, index)
    plt.subplot(2, 2, 1)
    plt.grid(linestyle='-')
    # setosa samples (red squares)
    plt.scatter(dat_X_std[0:50, 0], dat_X_std[0:50, 1],
                color="red", edgecolor='black', marker="s", label="setosa")
    # virginica samples (blue crosses)
    # NOTE(review): slices 51:100 / 101:150 skip rows 50 and 100 — TODO confirm.
    plt.scatter(dat_X_std[51:100, 0], dat_X_std[51:100, 1],
                color="blue", edgecolor='black', marker="x", label="virginica")
    # versicolor samples (green pluses)
    plt.scatter(dat_X_std[101:150, 0], dat_X_std[101:150, 1],
                color="green", edgecolor='black', marker="+", label="versicolor")
    plt.title("iris data [Normalized]")
    plt.xlabel("sepal length [Normalized]")
    plt.ylabel("petal length [Normalized]")
    plt.legend(loc="upper left")
    plt.tight_layout()  # shrink subplots so labels do not overlap

    #-------------------------------
    # Decision regions and results
    #-------------------------------
    # classifier1 : kNN1 (k=1)
    plt.subplot(2, 2, 2)
    Plot2D.Plot2D.drawDiscriminantRegions(dat_X=X_combined_std, dat_y=y_combined,
                                          classifier=kNN1, list_test_idx=range(101, 150))
    plt.title("Idification Result (k=1)")
    plt.xlabel("sepal length [Normalized]")
    plt.ylabel("petal length [Normalized]")
    plt.legend(loc="upper left")
    plt.tight_layout()

    # classifier2 : kNN2 (k=5)
    plt.subplot(2, 2, 3)
    Plot2D.Plot2D.drawDiscriminantRegions(dat_X=X_combined_std, dat_y=y_combined,
                                          classifier=kNN2, list_test_idx=range(101, 150))
    plt.title("Idification Result (k=5)")
    plt.xlabel("sepal length [Normalized]")
    plt.ylabel("petal length [Normalized]")
    plt.legend(loc="upper left")
    plt.tight_layout()

    # classifier3 : kNN3 (k=10)
    plt.subplot(2, 2, 4)
    Plot2D.Plot2D.drawDiscriminantRegions(dat_X=X_combined_std, dat_y=y_combined,
                                          classifier=kNN3, list_test_idx=range(101, 150))
    plt.title("Idification Result (k=10)")
    plt.xlabel("sepal length [Normalized]")
    plt.ylabel("petal length [Normalized]")
    plt.legend(loc="upper left")
    plt.tight_layout()

    # plt.savefig("./kNN_scikit-learn_1.png", dpi=300)
    plt.show()

    #-------------------------------
    # Test-set accuracy per classifier
    #-------------------------------
    y_predict1 = kNN1.predict(X_test_std)
    y_predict2 = kNN2.predict(X_test_std)
    y_predict3 = kNN3.predict(X_test_std)

    print("<テストデータの識別結果>")
    print("classifier1 : kNN1 (k=1)")
    # number of misclassified samples
    print("誤識別数 [Misclassified samples] : %d" % (y_test != y_predict1).sum())
    # classification accuracy
    print("正解率 [Accuracy] : %.2f" % accuracy_score(y_test, y_predict1))

    print("classifier1 : kNN2 (k=5)")
    print("誤識別数 [Misclassified samples] : %d" % (y_test != y_predict2).sum())
    print("正解率 [Accuracy] : %.2f" % accuracy_score(y_test, y_predict2))

    print("classifier3 : kNN3 (k=10)")
    print("誤識別数 [Misclassified samples] : %d" % (y_test != y_predict3).sum())
    print("正解率 [Accuracy] : %.2f" % accuracy_score(y_test, y_predict3))

    #--------------------------------------------------------------------------------------------------------
    # Use predict_proba() to estimate class membership of selected samples.
    # Each row gives probabilities for Iris-Setosa, Iris-Versicolor,
    # Iris-Virginica, in that order.
    #--------------------------------------------------------------------------------------------------------
    preProb = []
    # classifier1 : kNN1 (k=1) — test samples 0, 1, 2 reshaped to 2-D rows
    preProb.append(kNN1.predict_proba(X_test_std[0, :].reshape(1, -1)))
    preProb.append(kNN1.predict_proba(X_test_std[1, :].reshape(1, -1)))
    preProb.append(kNN1.predict_proba(X_test_std[2, :].reshape(1, -1)))
    # classifier2 : kNN2 (k=5)
    preProb.append(kNN2.predict_proba(X_test_std[0, :].reshape(1, -1)))
    preProb.append(kNN2.predict_proba(X_test_std[1, :].reshape(1, -1)))
    preProb.append(kNN2.predict_proba(X_test_std[2, :].reshape(1, -1)))
    # classifier3 : kNN3 (k=10)
    preProb.append(kNN3.predict_proba(X_test_std[0, :].reshape(1, -1)))
    preProb.append(kNN3.predict_proba(X_test_std[1, :].reshape(1, -1)))
    preProb.append(kNN3.predict_proba(X_test_std[2, :].reshape(1, -1)))

    # Print the class-membership probabilities (as percentages).
    print("classifier1 : kNN1 (k=1)")
    print("サンプル 0 の所属クラス確率 [%] :", preProb[0] * 100)
    print("サンプル 1 の所属クラス確率 [%] :", preProb[1] * 100)
    print("サンプル 2 の所属クラス確率 [%] :", preProb[2] * 100)
    print("classifier2 : kNN2 (k=5)")
    print("サンプル 0 の所属クラス確率 [%] :", preProb[3] * 100)
    print("サンプル 1 の所属クラス確率 [%] :", preProb[4] * 100)
    print("サンプル 2 の所属クラス確率 [%] :", preProb[5] * 100)
    print("classifier3 : kNN3 (k=10)")
    print("サンプル 0 の所属クラス確率 [%] :", preProb[6] * 100)
    print("サンプル 1 の所属クラス確率 [%] :", preProb[7] * 100)
    print("サンプル 2 の所属クラス確率 [%] :", preProb[8] * 100)

    #------------------------------------------------------------------------
    # Bar charts of the class-membership probabilities (3x3 grid:
    # one row per classifier, one column per sample).
    #------------------------------------------------------------------------
    # Clear the current figure.
    plt.clf()
    k = 0
    for i in range(3):
        for j in range(3):
            k += 1
            print("棒グラフ生成(複数図)", i, j, k)
            plt.subplot(3, 3, k)  # plt.subplot(rows, cols, index)
            plt.title("samples[ %d ]" % j + " by classifier %d " % (i + 1))
            plt.xlabel("Varieties (Belonging class)")
            plt.ylabel("probability[%]")
            plt.ylim(0, 100)  # y-axis range 0-100%
            plt.legend(loc="upper left")
            # NOTE(review): plt.bar(left=..., height=...) is the pre-2.0
            # matplotlib signature; newer versions use the first positional
            # argument (x=) instead of left= — confirm the pinned version.
            plt.bar(left=[0, 1, 2],
                    height=preProb[k - 1][0] * 100,
                    tick_label=["Setosa", "Versicolor", "Virginica"])
    plt.tight_layout()

    # Save and show the figure.
    plt.savefig("./kNN_scikit-learn_2.png", dpi=300)
    plt.show()

    print("Finish main()")
    return
knn_10.fit(X_train, y_train) # Fit for k = 20 knn_20.fit(X_train, y_train) # Predict k = 5 y_pred = knn.predict(X_train) # Predict k = 10 y_pred_10 = knn_10.predict(X_train) # Predict k = 20 y_pred_20 = knn_20.predict(X_train) # Predict probability k = 5 y_pred_proba = knn.predict_proba(X_train) # Predict probability k = 10 y_pred_proba_10 = knn_10.predict_proba(X_train) # Predict probability k = 20 y_pred_proba_20 = knn_20.predict_proba(X_train) # Look at the score of the model, k = 5 knn.score(X_train, y_train) # Look at the score of the model, k = 10 knn_10.score(X_train, y_train) #Look at the score of the model, k = 20
those parameters, make predictions on the test set, and submit those predictions. BONUS TASK #2: Read the scikit-learn documentation for GridSearchCV to find the shortcut for accomplishing bonus task #1. '''

# MAIN TASK
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

# NOTE(review): sklearn.grid_search is the pre-0.18 module location;
# current releases provide GridSearchCV in sklearn.model_selection.
from sklearn.grid_search import GridSearchCV
neighbors_range = [20, 40, 60, 80, 100]
weight_options = ['uniform', 'distance']
param_grid = dict(n_neighbors=neighbors_range, weights=weight_options)
# 5-fold CV over the 10 parameter combinations, scored by log loss.
grid = GridSearchCV(knn, param_grid, cv=5, scoring='log_loss')
grid.fit(X, y)
grid.grid_scores_
grid.best_score_
grid.best_params_

# BONUS TASK #1
# Refit with the chosen parameters and write a submission file with the
# positive-class probability per test row.
knn = KNeighborsClassifier(n_neighbors=100, weights='uniform')
knn.fit(X, y)
y_prob = knn.predict_proba(test[feature_cols])[:, 1]
sub = pd.DataFrame({'id': test.index, 'OpenStatus': y_prob}).set_index('id')
sub.to_csv('sub.csv')

# BONUS TASK #2
# GridSearchCV refits the best estimator on all data, so it can predict
# directly.  NOTE(review): this overwrites the sub.csv from bonus task #1.
y_prob = grid.predict_proba(test[feature_cols])[:, 1]
sub = pd.DataFrame({'id': test.index, 'OpenStatus': y_prob}).set_index('id')
sub.to_csv('sub.csv')
# MPI demo: each rank trains a different classifier on the same toy data.
size = comm.size
sum = 0  # NOTE(review): shadows the builtin 'sum' and is unused below
tmp = rank

# Creating the data
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])

start_time = time.time()
#####
if comm.rank == 0:
    # SVM
    clf = SVC(gamma='auto')
    clf.fit(X, y)
    print("SVM Prediction ", clf.predict([[-0.8, -1]]))
if comm.rank == 1:
    # Random Forest
    RF = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    RF.fit(X, y)
    print("RF Feature importance ", RF.feature_importances_)
    print("RF Prediction ", RF.predict([[-0.8, -1]]))
if comm.rank == 2:
    # KNN
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X, y)
    print("KNN Prediction ", neigh.predict([[-0.8, -1]]))
    print("KNN Probability ", neigh.predict_proba([[-0.8, -1]]))
if comm.rank > 2:
    # Ranks beyond the three workers have nothing to train.
    print("done")
#label2=np.zeros((303,1))
# Binary labels: 1420 positives followed by 1514 negatives.
label1 = np.ones((1420, 1))  #Value can be changed
label2 = np.zeros((1514, 1))
label = np.append(label1, label2)

# LASSO-based dimensionality reduction of the raw feature matrix.
data_1, mask = lassodimension(shu, label)
X = data_1
y = label

sepscores = []
# Seed rows so np.vstack has a starting shape (first row is a dummy).
ytest = np.ones((1, 2)) * 0.5
yscore = np.ones((1, 2)) * 0.5

cv_clf = KNeighborsClassifier()
skf = StratifiedKFold(n_splits=5)
# 5-fold stratified CV: accumulate probabilities and per-fold metrics.
for train, test in skf.split(X, y):
    y_train = utils.to_categorical(y[train])
    hist = cv_clf.fit(X[train], y[train])
    y_score = cv_clf.predict_proba(X[test])
    yscore = np.vstack((yscore, y_score))
    y_test = utils.to_categorical(y[test])
    ytest = np.vstack((ytest, y_test))
    fpr, tpr, _ = roc_curve(y_test[:, 0], y_score[:, 0])
    roc_auc = auc(fpr, tpr)
    y_class = utils.categorical_probas_to_classes(y_score)
    y_test_tmp = y[test]
    acc, precision, npv, sensitivity, specificity, mcc, f1 = utils.calculate_performace(
        len(y_class), y_class, y_test_tmp)
    sepscores.append(
        [acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc])
    # BUGFIX: the progress line was labelled 'SVC' although the fitted
    # model is a KNeighborsClassifier.
    print(
        'KNN:acc=%f,precision=%f,npv=%f,sensitivity=%f,specificity=%f,mcc=%f,f1=%f,roc_auc=%f'
        % (acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc))
scores = np.array(sepscores)
# Use all but the last 10 shuffled samples as the training set,
# and their labels as the training targets.
iris_y_train = iris_y[indices[:-10]]
# The remaining 10 samples form the test set, with matching labels.
iris_x_test = iris_x[indices[-10:]]
iris_y_test = iris_y[indices[-10:]]

# Build a k-NN classifier (default parameters) and train it.
knn = KNeighborsClassifier()
knn.fit(iris_x_train, iris_y_train)

# Predicted labels and class-probability estimates for the test set.
iris_y_predict = knn.predict(iris_x_test)
probility = knn.predict_proba(iris_x_test)

# Indices of the 5 nearest training points to the last test sample.
# BUGFIX: kneighbors() expects a 2-D query array, so reshape the single
# 1-D row instead of passing it directly.
neighborpoint = knn.kneighbors(iris_x_test[-1].reshape(1, -1), 5, False)

# Mean accuracy on the held-out samples.
score = knn.score(iris_x_test, iris_y_test, sample_weight=None)

print('iris_y_predict = ')
print(iris_y_predict)  # predicted labels
print('iris_y_test = ')
print(iris_y_test)  # true labels, for comparison
print('Accuracy:', score)
# In[ ]: # KNN Model Fitting and Performance Metrics knn = KNeighborsClassifier(n_neighbors = 25) knn.fit(X_train, y_train) y_pred = knn.predict(X_test) knn_acc_score = round(knn.score(X_train, y_train) * 100, 2) print("***K Nearest Neighbors***") print("Accuracy Score:", knn_acc_score) print("Confusion Matrix:") print(confusion_matrix(y_test, y_pred)) print("Classification Report:") print(classification_report(y_test, y_pred)) y_pred_prob =knn.predict_proba(X_test)[:,1] print("ROC_AUC Score:") print(roc_auc_score(y_test, y_pred_prob)) # **Decision Tree** # In[ ]: # Decision Tree Hyper parameter Tuning param_grid = {'max_depth': np.arange(1, 20)} decision_tree = DecisionTreeClassifier() decision_tree_cv = GridSearchCV(decision_tree, param_grid, cv=5) decision_tree_cv.fit(X, y) print("best params", decision_tree_cv.best_params_)
# Linear SVM (hinge loss) wrapped in sigmoid calibration so it can emit
# class probabilities for the ROC comparison below.
clf = SGDClassifier(loss="hinge")
calibrated_clf = CalibratedClassifierCV(clf, cv=5, method='sigmoid')
calibrated_clf.fit(X_train, y_train)

#Decision Tree
classifiers = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifiers.fit(X_train, y_train)

#Random Forest
model = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
model.fit(X_train, y_train)

# Baseline "no-skill" scores: constant 0 for every test sample.
r_probs = [0 for _ in range(len(y_test))]

# Class-probability matrices (knn, classifier and gnb are fitted earlier).
knn_probs = knn.predict_proba(X_test)
classifier_probs = classifier.predict_proba(X_test)
classifiers_probs = classifiers.predict_proba(X_test)
calibrated_clf_probs = calibrated_clf.predict_proba(X_test)
gnb_probs = gnb.predict_proba(X_test)
model_probs = model.predict_proba(X_test)

# Keep only the positive-class column for ROC/AUC.
knn_probs = knn_probs[:, 1]
classifier_probs = classifier_probs[:, 1]
calibrated_clf_probs = calibrated_clf_probs[:, 1]
gnb_probs = gnb_probs[:, 1]
classifiers_probs = classifiers_probs[:, 1]
model_probs = model_probs[:, 1]

from sklearn.metrics import roc_curve, roc_auc_score
# AUC of the constant baseline (0.5 by construction).
r_auc = roc_auc_score(y_test, r_probs)
# Feature Scaling from sklearn.preprocessing import StandardScaler sc = StandardScaler() features_train = sc.fit_transform(features_train) features_test = sc.transform(features_test) # Fitting K-NN to the Training set from sklearn.neighbors import KNeighborsClassifier classifier = KNeighborsClassifier( n_neighbors=5, p=2 ) #When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2 classifier.fit(features_train, labels_train) #Calculate Class Probabilities probability = classifier.predict_proba(features_test) # Predicting the class labels labels_pred = classifier.predict(features_test) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(labels_test, labels_pred) print(cm) #Visualization # Plot the decision boundary. For that, we will assign a color to each x_min, x_max = features_train[:, 0].min() - 1, features_train[:, 0].max() + 1 y_min, y_max = features_train[:, 1].min() - 1, features_train[:, 1].max() + 1
def models(response_col, predicts_col):
    """Train and compare a battery of classifiers on one feature frame.

    Splits chronologically (first 70% train, last 30% test — no shuffle),
    imputes NaNs with medians, row-normalizes, then fits RF, logistic,
    SVM, KNN, decision tree, LDA, Gaussian NB and XGBoost, printing a
    classification report and drawing an ROC plot (via the external
    ``auc_plot`` helper) for each.  Finally fits a statsmodels Logit for
    coefficient output and prints a combined performance table through
    ``perf_table``.  Returns None.
    """
    # Train test split (chronological 70/30, no shuffling)
    X_train = predicts_col[:int(predicts_col.shape[0] * 0.7)]
    X_test = predicts_col[int(predicts_col.shape[0] * 0.7):]
    y_train = response_col[:int(response_col.shape[0] * 0.7)]
    y_test = response_col[int(response_col.shape[0] * 0.7):]

    # Replace nans with median
    X_train = X_train.fillna(X_train.median())
    X_test = X_test.fillna(X_test.median())
    y_train = y_train.fillna(y_train.median())
    # NOTE: test labels are imputed with the *training* median.
    y_test = y_test.fillna(y_train.median())

    # Normalize (row-wise L2).  Normalizer is stateless, so calling
    # fit_transform on the test set is equivalent to transform here.
    normalizer = preprocessing.Normalizer(norm="l2")
    X_train_norm = normalizer.fit_transform(X_train)
    X_test_norm = normalizer.fit_transform(X_test)

    # Fit random forest model
    rf_model = RandomForestClassifier(oob_score=True, random_state=1234)
    rf_model.fit(X_train_norm, y_train)
    rf_preds = rf_model.predict(X_test_norm)
    print("Random Forest:\n", classification_report(y_test, rf_preds))
    # RF ROC plot (positive-class probability column)
    model_name = "Random Forest"
    rf_probs = rf_model.predict_proba(X_test_norm)
    prob = rf_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # Logistic regression
    log_reg = LogisticRegression(max_iter=300, fit_intercept=True)
    log_reg_fit = log_reg.fit(X_train_norm, y_train)
    log_preds = log_reg_fit.predict(X_test_norm)
    print("Logistic:\n", classification_report(y_test, log_preds))
    # Logistic ROC plot
    model_name = "Logistic"
    log_probs = log_reg.predict_proba(X_test_norm)
    prob = log_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # SVM (probability=True enables predict_proba)
    svm_model = svm.SVC(probability=True)
    svm_fitted = svm_model.fit(X_train_norm, y_train)
    svm_preds = svm_fitted.predict(X_test_norm)
    print("SVM:\n", classification_report(y_test, svm_preds))
    # SVM ROC plot
    model_name = "SVM"
    svm_probs = svm_model.predict_proba(X_test_norm)
    prob = svm_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # KNN
    knn_model = KNeighborsClassifier(n_neighbors=3)
    knn_fitted = knn_model.fit(X_train_norm, y_train)
    knn_preds = knn_fitted.predict(X_test_norm)
    print("KNN:\n", classification_report(y_test, knn_preds))
    # KNN ROC plot
    model_name = "K-Nearest Neighbor"
    knn_probs = knn_model.predict_proba(X_test_norm)
    prob = knn_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # Decision tree classifier
    dtc_model = DecisionTreeClassifier(random_state=1234)
    dtc_fitted = dtc_model.fit(X_train_norm, y_train)
    dtc_preds = dtc_fitted.predict(X_test_norm)
    print("Decision tree classifier:\n", classification_report(y_test, dtc_preds))
    # Decision Tree Classifier ROC plot
    model_name = "Decision Tree Classifier"
    dtc_probs = dtc_model.predict_proba(X_test_norm)
    prob = dtc_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # Linear discriminant analysis
    lda_model = LinearDiscriminantAnalysis()
    lda_fitted = lda_model.fit(X_train_norm, y_train)
    lda_preds = lda_fitted.predict(X_test_norm)
    print("Linear discriminant analysis:\n", classification_report(y_test, lda_preds))
    # Linear Discriminant Analysis ROC plot
    model_name = "Linear Discriminant Analysis"
    lda_probs = lda_model.predict_proba(X_test_norm)
    prob = lda_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # Gaussian Naive Bayes
    gnb_model = GaussianNB()
    gnb_fitted = gnb_model.fit(X_train_norm, y_train)
    gnb_preds = gnb_fitted.predict(X_test_norm)
    print("Gaussian Naive Bayes:\n", classification_report(y_test, gnb_preds))
    # Gaussian Naive Bayes ROC plot
    model_name = "Gaussian Naive Bayes"
    gnb_probs = gnb_model.predict_proba(X_test_norm)
    prob = gnb_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # XGBoost
    xg_model = xgb.XGBClassifier(
        tree_method="approx",
        predictor="cpu_predictor",
        verbosity=1,
        eval_metric=["merror", "map", "auc"],
        objective="binary:logistic",
        eta=0.3,
        n_estimators=100,
        colsample_bytree=0.95,
        max_depth=3,
        reg_alpha=0.001,
        reg_lambda=150,
        subsample=0.8,
    )
    xgb_model = xg_model.fit(X_train_norm, y_train)
    xgb_preds = xgb_model.predict(X_test_norm)
    print("XGBoost:\n", classification_report(y_test, xgb_preds))
    # XGB ROC plot
    model_name = "XGBoost"
    xgb_probs = xgb_model.predict_proba(X_test_norm)
    prob = xgb_probs[:, 1]
    auc_plot(prob, y_test, model_name)

    # Good old linear regression to get output
    # NOTE(review): Logit is fitted on the *unnormalized* X_train and no
    # intercept is added (add_constant is commented out) — confirm intent.
    # predictor = sm.add_constant(X_train)
    predictor = X_train
    logit_model = sm.Logit(y_train, predictor)
    logit_fitted = logit_model.fit()
    # ols_model = sm.OLS(y_train, predictor)
    # ols_fitted = ols_model.fit()
    print(logit_fitted.summary())
    # print(ols_fitted.summary())
    # print(ols_fitted.mse_model)
    # print(ols_fitted.mse_resid)
    # print(ols_fitted.mse_total)

    # Create performance table
    model_names = [
        "Random Forest",
        "Logistic",
        "SVM",
        "KNN",
        "Decision Trees",
        "LDA",
        "Gaussian Naive Bayes",
        "XGBoost",
    ]
    predictions = [
        rf_preds,
        log_preds,
        svm_preds,
        knn_preds,
        dtc_preds,
        lda_preds,
        gnb_preds,
        xgb_preds,
    ]
    perf_table(model_names, predictions, y_test)
    return
# n_neighbors value with the lowest validation error (result is indexed by k).
best = result.idxmin()
print(best)
print(result[best])
neigh = KNeighborsClassifier(n_neighbors=best, weights='distance')
neigh.fit(x, y)

# Load the test objects; column 0 holds the sample id, columns 1-18 the features.
data = pd.read_table('objectMatrixTest.txt', sep=',', header=None)
xt = data[list(range(1, 19))]
print(xt)
# NOTE(review): test features are scaled independently of the training
# data here; ideally the training-set scaler would be reused.
xt = pd.DataFrame(scale(xt))
yt = neigh.predict_proba(xt)
lab = data[0]

# Write one row per sample: id followed by the 9 class probabilities.
with open("SubmissionKNC.csv", "w") as f:
    f.write(
        'Id,Prediction1,Prediction2,Prediction3,Prediction4,Prediction5,Prediction6,Prediction7,Prediction8,Prediction9\n'
    )
    for i in range(0, yt.shape[0]):
        f.write(lab[i])
        f.write(",")
        # First 8 probabilities comma-terminated, the 9th ends the line.
        for j in range(0, 8):
            f.write(str(yt[i][j]))
            f.write(",")
        f.write(str(yt[i][8]))
        f.write("\n")
def tune_knn(tree, X_train, X_train_feature, y_train, val_frac, seed, logger=None, cv=5):
    """
    Tunes KNN by choosing hyperparameters that give the best pearson
    correlation to the tree predictions.

    A fraction ``val_frac`` of the training data is sampled; for each
    candidate ``n_neighbors`` a surrogate KNN is compared (per CV fold)
    against a freshly fitted clone of ``tree`` via the Pearson correlation
    of their positive-class probabilities.  The best k is then used to fit
    the returned KNeighborsClassifier on the full training features.
    """
    n_neighbors_grid = [3, 5, 7, 9, 11, 13, 15, 31, 45, 61]

    if not val_frac:
        # No validation fraction: skip tuning and fit a default 3-NN model.
        knn_clf = KNeighborsClassifier(n_neighbors=3, weights='uniform')
        knn_clf = knn_clf.fit(X_train_feature, y_train)
        # BUGFIX: return immediately — the original fell through into the
        # tuning code below, which requires a truthy val_frac.
        return knn_clf

    tune_start = time.time()

    # select a fraction of the training data
    n_samples = int(X_train.shape[0] * val_frac)
    np.random.seed(seed)
    # NOTE(review): np.random.choice samples with replacement by default,
    # so val_indices may contain duplicates — confirm this is intended.
    val_indices = np.random.choice(np.arange(X_train.shape[0]), size=n_samples)
    X_val = X_train[val_indices]
    X_val_feature = X_train_feature[val_indices]
    y_val = y_train[val_indices]

    # result containers
    results = []
    fold = 0

    # tune n_neighbors via stratified CV on the validation sample
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    for train_index, test_index in skf.split(X_val_feature, y_val):
        fold += 1

        # obtain fold data
        X_val_train = X_val[train_index]
        X_val_test = X_val[test_index]
        X_val_feature_train = X_val_feature[train_index]
        X_val_feature_test = X_val_feature[test_index]
        y_val_train = y_val[train_index]

        # gridsearch n_neighbors
        correlations = []
        for n_neighbors in n_neighbors_grid:
            start = time.time()

            # fit a tree ensemble and surrogate model
            m1 = clone(tree).fit(X_val_train, y_val_train)
            m2 = KNeighborsClassifier(n_neighbors=n_neighbors,
                                      weights='uniform').fit(X_val_feature_train, y_val_train)

            # generate predictions (positive-class probabilities)
            m1_proba = m1.predict_proba(X_val_test)[:, 1]
            m2_proba = m2.predict_proba(X_val_feature_test)[:, 1]

            # measure correlation
            correlation = pearsonr(m1_proba, m2_proba)[0]
            correlations.append(correlation)

            if logger:
                s = '[Fold {}] n_neighbors={:<2}: {:.3f}s; corr={:.3f}'
                logger.info(s.format(fold, n_neighbors, time.time() - start, correlation))

        results.append(correlations)

    # mean correlation per candidate across folds; pick the best k
    results = np.vstack(results).mean(axis=0)
    best_ndx = np.argmax(results)
    best_n_neighbors = n_neighbors_grid[best_ndx]

    if logger:
        logger.info('chosen n_neighbors: {}'.format(best_n_neighbors))
        logger.info('total tuning time: {:.3f}s'.format(time.time() - tune_start))
        logger.info('training...')

    # refit the surrogate on the full training features with the chosen k
    train_start = time.time()
    knn_clf = KNeighborsClassifier(n_neighbors=best_n_neighbors, weights='uniform')
    knn_clf = knn_clf.fit(X_train_feature, y_train)

    if logger:
        logger.info('total training time: {:.3f}s'.format(time.time() - train_start))

    return knn_clf
# Summary table: precision / recall / F1 per model (computed earlier).
print('Naive Bayes: | ', precisionNB, ' | ', recallNB, ' | ', f1NB)
print('Support Vector |', '\n', 'Machines | ', precisionSVM, ' | ', recallSVM, ' | ', f1SVM)
print('Nearest Neigh: | ', precisionNN, ' | ', recallNN, ' | ', f1NN)
print('Nearest Neigh 2:| ', precisionNN2, ' | ', recallNN2, ' | ', f1NN2)
print('Neur Network: | ', precisionANN, ' | ', recallANN, ' | ', f1ANN)
print('Neur Network 2: | ', precisionANN2, ' | ', recallANN2, ' | ', f1ANN2)

# Class-probability estimates on the test split for the ROC curves below.
pr_y_test_pred_DT = clfDT.predict_proba(x_test)
pr_y_test_pred_SVM = grid.predict_proba(x_test)
pr_y_test_pred_NN = clfNN.predict_proba(x_test)
pr_y_test_pred_ANN2 = clfANN2.predict_proba(x_test)
pr_y_test_pred_NB = clfNB.predict_proba(x_test)
#clfSVM.predict_proba

#ROC curve (positive-class probability column of each model)
fprDT, tprDT, thresholdsDT = roc_curve(y_test, pr_y_test_pred_DT[:, 1], pos_label=None)
fprSVM, tprSVM, thresholdsSVM = roc_curve(y_test, pr_y_test_pred_SVM[:, 1], pos_label=None)
fprNN, tprNN, thresholdsNN = roc_curve(y_test, pr_y_test_pred_NN[:, 1], pos_label=None)
#!/usr/bin/env python # -*- coding=utf-8 -*- __author__ = "柯博文老師 Powen Ko, www.powenko.com" X=[[9,9],[9.2,9.2],[9.6,9.2],[9.2,9.2],[6.7,7.1],[7,7.4],[7.6,7.5], [7.2,10.3], [7.3,10.5], [7.2,9.2], [7.3,10.2], [7.2,9.7], [7.3,10.1], [7.3,10.1]] y=[1,1,1,1,1,1,1, 2,2,2,2,2,2,2] from sklearn.neighbors import KNeighborsClassifier neigh = KNeighborsClassifier(n_neighbors=3) neigh.fit(X, y) print("預測答案=",neigh.predict([[7,9]])) print("預測樣本距離=",neigh.predict_proba([[7,9]])) # 測試數據X的返回概率估計。
from sklearn.metrics import confusion_matrix
con = confusion_matrix(y_test, y_pred)
print(con)

#checking Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

#Classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

#ROC and AUC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Positive-class probabilities from the fitted model.
clf_probs = reg.predict_proba(x_test)
clf_probs = clf_probs[:, 1]
print(clf_probs)
ras = roc_auc_score(y_test, clf_probs)
print("Logistic : ROC AUC = %.3f" % (ras))

from sklearn.preprocessing import label_binarize
# Binarize the {1, 2} labels into a single 0/1 column for roc_curve.
y = label_binarize(y_test, classes=[1, 2])
n_classes = y.shape[1]
fpr, tpr, _ = roc_curve(y, clf_probs)

plt.figure()
lw = 2
# BUGFIX: the legend label was missing its closing parenthesis
# ("ROC curve (area = 0.87" -> "ROC curve (area = 0.87)").
plt.plot(fpr, tpr, color="orange", lw=lw, label="ROC curve (area = %0.2f)" % ras)
def knn(X_train, y_train, X_test):
    """Fit a 5-neighbor classifier (all CPU cores) on the training data.

    Returns a tuple ``(test_predictions, train_probabilities,
    test_probabilities)`` — the probability matrix for the *training*
    split is included for downstream stacking/feature use.
    """
    model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
    model.fit(X_train, y_train)
    test_preds = model.predict(X_test)
    train_probs = model.predict_proba(X_train)
    test_probs = model.predict_proba(X_test)
    return test_preds, train_probs, test_probs
# In[36]: classifier_kn.fit(x_train, y_train) # In[37]: guesses = classifier_kn.predict(x_test) # In[38]: guesses_prob = classifier_kn.predict_proba(x_test) guesses_prob # In[39]: print(accuracy_score(y_test, guesses)) print(recall_score(y_test, guesses)) print(precision_score(y_test, guesses)) print(f1_score(y_test, guesses)) # In[40]:
# Take the first data_size[j] samples for this experiment size.
X = np.asarray(features)[:data_size[j], :]
y = np.asarray(labels)[:data_size[j]]

# Split train and validation data set
X_input, X_validate, y_input, y_validate = train_test_split(X, y, test_size=0.1, shuffle=True)
# Split Train and Test data set
X_train, X_test, y_train, y_test = train_test_split(X_input, y_input, test_size=0.1, shuffle=True)

# Optimize classifier using train and test data (PSO over n_neighbors).
pso = PSO(knn_optimize, [classifier_neighbor_range[1]], [classifier_neighbor_range[0]],
          fitness_minimize=False, cost_function_args=(X_input, y_input),
          verbose=False, ndview=False, max_iteration=50)
knn_particles, knn_global_best, knn_best_costs = pso.run()

# Classify using test data set with the PSO-chosen neighbor count.
classifier = KNeighborsClassifier(n_neighbors=int(knn_global_best["position"][0]))
classifier.fit(X_train, y_train)
test_probs = classifier.predict_proba(X_test)
inpsize_test.append(data_size[j])

# Compute ROC curve and ROC area for each class of test data
y_test_bin = np.empty((len(y_test), len(label_map)))
for k in range(y_test_bin.shape[0]):
    arr = [0 for _ in range(len(label_map))]
    # BUGFIX: one-hot encode the k-th *test* label.  The original indexed
    # the global, unshuffled `labels` list (arr[labels[k]]), which no
    # longer lines up with y_test after the two shuffled splits above.
    arr[int(y_test[k])] = 1
    y_test_bin[k] = np.asarray(arr)
print('Test Label original shape: ' + str(np.asarray(y_test).shape))
print('Test Label binary shape: ' + str(y_test_bin.shape))
print('Test score shape:' + str(test_probs.shape))
fpr = dict()
tpr = dict()
roc_auc = dict()
leaf_size=30, p=2, metric='minkowski', metric_params=None)

# In[68]:

# Fit the k-NN model configured in the (preceding) constructor call.
knMod.fit(X_train, y_train)

# In[69]:

# Mean accuracy on the test split.
knMod.score(X_test, y_test)

# In[70]:

# Positive-class probabilities for the ROC-AUC score below.
test_labels = knMod.predict_proba(np.array(X_test.values))[:, 1]

# In[71]:

roc_auc_score(y_test, test_labels, average='macro', sample_weight=None)

# In[72]:

# Logistic-regression baseline (constructor call continues past this fragment).
glmMod = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0,
                            fit_intercept=True, intercept_scaling=1,
                            class_weight=None, random_state=None,
'time': time_list }) resultDf['trueMove'] = resultDf['trueMove'].astype(int) resultDf['equal'] = (resultDf.predictionSVM == resultDf.trueMove.astype(int)) print('--------------Plot ROC-AUC --------------') from sklearn import metrics print("NB Accuracy", metrics.accuracy_score(resultDf.trueMove, pred_list_NB)) plt.figure(figsize=(9, 7)) y_pred_proba = nb.predict_proba(X_test_array)[::, 1] fpr, tpr, _ = metrics.roc_curve(pred_list_NB, y_pred_proba) auc = metrics.roc_auc_score(pred_list_NB, y_pred_proba) plt.plot(fpr, tpr, label="NB auc=" + str('% 6.3f' % auc)) y_pred_proba2 = knn.predict_proba(X_test_array)[::, 1] fpr, tpr, _ = metrics.roc_curve(pred_list_KNN, y_pred_proba2) auc2 = metrics.roc_auc_score(pred_list_KNN, y_pred_proba2) plt.plot(fpr, tpr, label="KNN auc=" + str('% 6.3f' % auc2)) y_pred_proba3 = svm.predict_proba(X_test_array)[::, 1] fpr, tpr, _ = metrics.roc_curve(pred_list_SVM, y_pred_proba3) auc3 = metrics.roc_auc_score(pred_list_SVM, y_pred_proba3) plt.plot(fpr, tpr, label="SVM auc=" + str('% 6.3f' % auc3)) plt.xlabel("false positive") plt.ylabel("true positive") plt.legend(loc=4) plt.show()
# GP-style classifier: per-sample log conditional probabilities under the
# two scenario models, converted to class probabilities and an argmax label.
# NOTE(review): assumes X_test has exactly 200 rows — confirm upstream.
list_probs_gp = []
pred_gp = np.zeros(shape=(200, ))
for i in range(200):
    list_probs_gp.append(
        fun.probabilities_gp([
            fun.LogCP2(X_test.iloc[i].values, m_scen1_train, ld_scen1,
                       il_scen1, 0.5).value(),
            fun.LogCP2(X_test.iloc[i].values, m_scen2_train, ld_scen2,
                       il_scen2, 0.5).value()
        ]))
    pred_gp[i] = fun.classif(list_probs_gp[-1])
proba_gp = np.asarray(list_probs_gp)

# Reference classifiers trained on the same split for comparison.
classifier_knn = KNeighborsClassifier(n_neighbors=10)
classifier_knn.fit(X_train, y_train)
proba_knn = classifier_knn.predict_proba(X_test)
pred_knn = classifier_knn.predict(X_test)

classifier_rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
classifier_rf.fit(X_train, y_train)
proba_rf = classifier_rf.predict_proba(X_test)
pred_rf = classifier_rf.predict(X_test)

classifier_mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
                               hidden_layer_sizes=(20, ), random_state=1)
classifier_mlp.fit(X_train, y_train)
proba_mlp = classifier_mlp.predict_proba(X_test)
min_df=2, max_features=1000, stop_words='english') tfidf = tfidf_vectorizer.fit_transform(clean_data['tidy']) train_d = tfidf[:7000, :] test_d = tfidf[7000:, :] x_train, x_valid, y_train, y_valid = train_test_split( train_d, clean_data['score'][:7000], random_state=42, test_size=0.3) knn = KNeighborsClassifier(n_neighbors=3) knn.fit(x_train, y_train) pickle.dump(svc, open("knn_tfidf.sav", "wb")) prediction = knn.predict_proba(x_valid) prediction_int = prediction[:, 1] >= 0.3 prediction_int = prediction_int.astype(np.int) print("Accuracy= ", accuracy_score(y_valid, prediction_int)) print("Precision= ", precision_score(y_valid, prediction_int)) print("F1 score= ", f1_score(y_valid, prediction_int)) test_pred = knn.predict_proba(test_d) test_pred_int = test_pred[:, 1] >= 0.3 test_pred_int = test_pred_int.astype(np.int) x = clean_data["score"][7000:] clean_data["score"][7000:] = test_pred_int submission = clean_data[['id', 'score', 'review']][7000:] submission.to_csv('outputtfidf_KNN.csv', index=False)
class KNeighborsClassifier(BaseEstimator, ClassifierMixin):
    """k nearest neighbors classifier.

    Parameters
    ----------
    n_neighbors : int, optional (default = 1)
        Number of neighbors to use.

    weights : str or callable, optional (default = 'uniform')
        Weight function used in prediction. Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an array of
          distances, and returns an array of the same shape containing
          the weights.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors. Ignored if
        ``metric`` is either 'dtw', 'dtw_sakoechiba', 'dtw_itakura',
        'dtw_multiscale', 'dtw_fast' or 'boss' ('brute' will be used).
        Note: fitting on sparse input will override the setting of this
        parameter, using brute force.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or KDTree. This can affect the speed
        of the construction and query, as well as the memory required to
        store the tree. The optimal value depends on the nature of the
        problem.

    metric : string or DistanceMetric object (default = 'minkowski')
        The distance metric to use for the tree. The default metric is
        minkowski, and with p=2 is equivalent to the standard Euclidean
        metric. See the documentation of the DistanceMetric class from
        scikit-learn for a list of available metrics. For Dynamic Time
        Warping, the available metrics are 'dtw', 'dtw_sakoechiba',
        'dtw_itakura', 'dtw_multiscale', 'dtw_fast' and 'boss'.

    p : integer, optional (default = 2)
        Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and
        euclidean_distance (l2) for p = 2. For arbitrary p,
        minkowski_distance (l_p) is used.

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search. If
        ``n_jobs=-1``, then the number of jobs is set to the number of CPU
        cores. Doesn't affect :meth:`fit` method.

    """

    def __init__(self, n_neighbors=1, weights='uniform', algorithm='auto',
                 leaf_size=30, p=2, metric='minkowski', metric_params=None,
                 n_jobs=1, **kwargs):
        # Parameters are stored verbatim (sklearn estimator convention);
        # all interpretation happens in `fit`.
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.metric = metric
        self.metric_params = metric_params
        self.n_jobs = n_jobs
        self.kwargs = kwargs

    def fit(self, X, y):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Training vector.

        y : array-like, shape = (n_samples,)
            Class labels for each data sample.

        Returns
        -------
        self : object

        """
        X, y = check_X_y(X, y)

        # Keyword arguments common to every underlying sklearn estimator.
        shared = dict(n_neighbors=self.n_neighbors, weights=self.weights,
                      n_jobs=self.n_jobs, **self.kwargs)

        # Time-series metrics that are forwarded unchanged; all of them
        # require brute-force search.
        plain_ts_metrics = {
            'dtw': dtw,
            'dtw_classic': dtw_classic,
            'dtw_multiscale': dtw_multiscale,
            'dtw_fast': dtw_fast,
            'boss': boss,
        }

        if self.metric in plain_ts_metrics:
            self._clf = SklearnKNN(algorithm='brute',
                                   metric=plain_ts_metrics[self.metric],
                                   metric_params=self.metric_params,
                                   **shared)
        elif self.metric == 'dtw_sakoechiba':
            # Constrained DTW: precompute the Sakoe-Chiba band for this
            # series length and pass it through `dtw_region`.
            n_timestamps = X.shape[1]
            if self.metric_params is None:
                region = sakoe_chiba_band(n_timestamps)
            else:
                # Fall back to a 0.1 window when none is supplied.
                region = sakoe_chiba_band(
                    n_timestamps, self.metric_params.get('window_size', 0.1))
            self._clf = SklearnKNN(algorithm='brute', metric=dtw_region,
                                   metric_params={'region': region},
                                   **shared)
        elif self.metric == 'dtw_itakura':
            # Constrained DTW: precompute the Itakura parallelogram.
            n_timestamps = X.shape[1]
            if self.metric_params is None:
                region = itakura_parallelogram(n_timestamps)
            else:
                # Fall back to a maximum slope of 2 when none is supplied.
                region = itakura_parallelogram(
                    n_timestamps, self.metric_params.get('max_slope', 2.))
            self._clf = SklearnKNN(algorithm='brute', metric=dtw_region,
                                   metric_params={'region': region},
                                   **shared)
        else:
            # Ordinary sklearn metric: tree-based settings apply too.
            self._clf = SklearnKNN(algorithm=self.algorithm,
                                   leaf_size=self.leaf_size, p=self.p,
                                   metric=self.metric,
                                   metric_params=self.metric_params,
                                   **shared)

        self._clf.fit(X, y)
        return self

    def predict_proba(self, X):
        """Return probability estimates for the test data X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples.

        Returns
        -------
        p : array, shape = (n_samples, n_classes)
            Probability estimates.

        """
        check_is_fitted(self, '_clf')
        return self._clf.predict_proba(X)

    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Test samples.

        Returns
        -------
        y : array-like, shape = (n_samples,)
            Class labels for each data sample.

        """
        check_is_fitted(self, '_clf')
        return self._clf.predict(X)
def cross_val_default(X, y, fold_type='kfold', nr_folds=5):
    """Cross-validate several classifiers with default settings.

    Fits logistic regression, SVC, random forest, gradient boosting, KNN
    and a soft-voting ensemble on each fold and prints mean +- std ROC-AUC
    for validation and training splits.

    Parameters
    ----------
    X : pandas DataFrame of features (indexed positionally via .iloc).
    y : pandas Series of binary labels.
    fold_type : 'stratified' selects StratifiedKFold, anything else KFold.
    nr_folds : number of CV folds.

    Returns
    -------
    None — results are printed.
    """
    if fold_type == 'stratified':
        kf = StratifiedKFold(n_splits=nr_folds, shuffle=True)
    else:
        kf = KFold(n_splits=nr_folds, shuffle=True)

    def _fit_and_score(clf, xtr, ytr, xval, yval, train_scores, val_scores):
        # Fit `clf`, append validation then train ROC-AUC, return the
        # fitted classifier (for reuse in the voting ensemble).
        clf.fit(xtr, ytr)
        val_scores.append(roc_auc_score(yval, clf.predict_proba(xval)[:, 1]))
        train_scores.append(roc_auc_score(ytr, clf.predict_proba(xtr)[:, 1]))
        return clf

    lr_train, lr_vals = [], []
    sv_train, sv_vals = [], []
    rf_train, rf_vals = [], []
    gb_train, gb_vals = [], []
    kn_train, kn_vals = [], []
    ens_train, ens_vals = [], []
    for train_ind, val_ind in kf.split(X, y):
        x_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
        x_val, y_val = X.iloc[val_ind], y.iloc[val_ind]
        # Scale features for the distance/margin-based models (LR, SVC,
        # KNN); tree ensembles use the raw features.
        scaler = MinMaxScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        x_val_scaled = scaler.transform(x_val)

        lr = _fit_and_score(LogisticRegression(), x_train_scaled, y_train,
                            x_val_scaled, y_val, lr_train, lr_vals)
        sv = _fit_and_score(SVC(probability=True), x_train_scaled, y_train,
                            x_val_scaled, y_val, sv_train, sv_vals)
        rf = _fit_and_score(RandomForestClassifier(), x_train, y_train,
                            x_val, y_val, rf_train, rf_vals)
        gb = _fit_and_score(GradientBoostingClassifier(), x_train, y_train,
                            x_val, y_val, gb_train, gb_vals)
        kn = _fit_and_score(KNeighborsClassifier(), x_train_scaled, y_train,
                            x_val_scaled, y_val, kn_train, kn_vals)

        # Soft-voting ensemble; VotingClassifier clones and refits its
        # estimators, so it is trained on the *unscaled* features here
        # (matching the original behavior).
        voting_classifer = VotingClassifier(
            estimators=[('kn', kn), ('gb', gb), ('rf', rf), ('lr', lr)],
            voting='soft', n_jobs=-1)
        _fit_and_score(voting_classifer, x_train, y_train,
                       x_val, y_val, ens_train, ens_vals)

    def _report(name, val_scores, train_scores):
        # Print one model's mean +- std ROC-AUC (output format matches the
        # original byte-for-byte).
        print(
            f'{name}: val {np.mean(val_scores):.3f} +- {np.std(val_scores):.3f} | '
            f'train {np.mean(train_scores):.3f} +- {np.std(train_scores):.3f}')

    print('ROC_AUC scores: \n')
    _report('log reg', lr_vals, lr_train)
    _report('random forest', rf_vals, rf_train)
    _report('gradient boost', gb_vals, gb_train)
    _report('knn', kn_vals, kn_train)
    _report('SVC', sv_vals, sv_train)
    _report('ensemble', ens_vals, ens_train)
    return None
# NOTE(review): this chunk opens mid-statement — the line below holds the
# trailing keyword arguments of a grid-search constructor whose opening call
# sits above this excerpt; refit=False because only best_params_ is read.
cv=cv, refit=False, verbose=1)
# Select the best n_neighbors for the EDA feature set using training rows only.
kNNgsEda.fit(featuresEda[includeRowsTrain, :], labels[includeRowsTrain],
             groups[includeRowsTrain])
bestneighbors_Eda = kNNgsEda.best_params_['n_neighbors']
# Out-of-fold score buffers, one entry per sample, for each feature set
# (All / accelerometer / EDA).
knnCpredAll = np.zeros(np.shape(labels))
knnCpredAcc = np.zeros(np.shape(labels))
knnCpredEda = np.zeros(np.shape(labels))
# Cross-validated scoring: for every split, fit a fresh KNN per feature set
# (with its tuned n_neighbors) and store P(class 1) for the held-out samples.
for train, test in cv.split(featuresAll, labels, groups):
    knnCAll = KNeighborsClassifier(n_neighbors=bestneighbors_All,
                                   algorithm='auto', metric='euclidean')
    knnCAll.fit(featuresAll[train, :], labels[train])
    knnCpredAll[test] = knnCAll.predict_proba(featuresAll[test, :])[:, 1]
    knnCAcc = KNeighborsClassifier(n_neighbors=bestneighbors_Acc,
                                   algorithm='auto', metric='euclidean')
    knnCAcc.fit(featuresAcc[train, :], labels[train])
    knnCpredAcc[test] = knnCAcc.predict_proba(featuresAcc[test, :])[:, 1]
    knnCEda = KNeighborsClassifier(n_neighbors=bestneighbors_Eda,
                                   algorithm='auto', metric='euclidean')
    knnCEda.fit(featuresEda[train, :], labels[train])
    knnCpredEda[test] = knnCEda.predict_proba(featuresEda[test, :])[:, 1]
# Save the scores for further analysis
#np.save('knnCpredAllScores_UTD',knnCpredAll)
# Finish the previous plot (error-vs-k figure started above this chunk).
plt.ylabel('misclassification rate')
pml.savefig('knnClassifyErrVsK.pdf')
plt.show()
# Cross-validated model selection: 5-fold CV misclassification rate per k.
scores = []
for k in ks:
    knn = KNN(n_neighbors=k)
    score = cross_val_score(knn, x_train, y_train, cv=5)
    # cross_val_score returns accuracies; convert to error rate.
    scores.append(1 - score.mean())
plt.figure()
plt.plot(ks, scores, 'ko-')
# Vertical line marking the k with the lowest CV error.
min_k = ks[np.argmin(scores)]
plt.plot([min_k, min_k], [0, 1.0], 'b-')
plt.xlabel('k')
plt.ylabel('misclassification rate')
plt.title('5-fold cross validation, n-train = 200')
# Heat maps of the per-class predicted probability over the grid `xy`.
# NOTE(review): assumes `xy` is a 200x200 grid flattened to (40000, 2) —
# the reshape below depends on it; confirm where `xy` is built.
knn = KNN(n_neighbors=10)
knn.fit(x_train, y_train)
xy_predic = knn.predict_proba(xy)
levels = np.arange(0, 1.01, 0.1)
for i in range(3):
    plt.figure()
    plt.contourf(xy_predic[:, i].ravel().reshape(200, 200), levels)
    plt.colorbar()
    plt.title('p(y=%s | data, k=10)' % (i))
    pml.savefig('knnClassifyDemo_hotmap_%s.png' % (i))
    plt.show()
def Co_KNN_SVM(train_Y, train_X, test_Y, test_X, savepath=None):
    """Co-train a KNN and an SVM classifier.

    Each round both classifiers are trained and scored; then each one's
    most confident pseudo-labelled pool samples are moved into the OTHER
    classifier's training set, and the pool shrinks accordingly.

    Parameters
    ----------
    train_Y, train_X : initial labelled training labels / features.
    test_Y, test_X : pool used both for evaluation and as the source of
        pseudo-labels. NOTE(review): scoring on the same shrinking pool
        means accuracies across rounds are not directly comparable —
        confirm this is intended.
    savepath : unused in this function body.
    """
    # Number of pseudo-labelled samples moved to the other classifier's
    # training set per iteration.
    temp_num_svm = 55
    temp_num_knn = 55
    # Number of co-training iterations.
    loop_num = 6
    # K for the KNN classifier.
    K = 4
    # Fixed copies of the original test data/labels (never modified).
    fixed_test_X = test_X.copy()
    fixed_test_Y = test_Y.copy()
    # Per-iteration accuracies (percent) for each classifier.
    accuracy_knn_list = []
    accuracy_svm_list = []
    # (label, features) tuple lists: KNN's train/test pools.
    train_knn_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        train_Y.copy(), train_X.copy())
    test_knn_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        test_Y.copy(), test_X.copy())
    # (label, features) tuple lists: SVM's train/test pools.
    train_svm_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        train_Y.copy(), train_X.copy())
    test_svm_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        test_Y.copy(), test_X.copy())
    # Co-training loop.
    for h in range(1, loop_num + 1):
        print(len(train_knn_Y_X_tuple_list))
        print(len(test_knn_Y_X_tuple_list))
        print(len(train_svm_Y_X_tuple_list))
        print(len(test_svm_Y_X_tuple_list))
        # Unpack SVM training labels and features from the tuple list.
        train_Y_svm_from_tuple, train_X_svm_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            train_svm_Y_X_tuple_list.copy())
        # Unpack KNN training labels and features.
        train_Y_knn_from_tuple, train_X_knn_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            train_knn_Y_X_tuple_list)
        # Unpack SVM test (pool) labels and features.
        test_Y_svm_from_tuple, test_X_svm_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            test_svm_Y_X_tuple_list.copy())
        # Unpack KNN test (pool) labels and features.
        test_Y_knn_from_tuple, test_X_knn_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            test_knn_Y_X_tuple_list)
        # --- KNN accuracy for this round ---
        knn = KNeighborsClassifier(n_neighbors=K, weights='distance')
        knn.fit(train_X_knn_from_tuple, train_Y_knn_from_tuple)
        #accuracy_knn = knn.score(fixed_test_X, fixed_test_Y)
        accuracy_knn = knn.score(test_X_knn_from_tuple, test_Y_knn_from_tuple)
        accuracy_knn_list.append(accuracy_knn * 100)
        print("预测结果(KNN)")
        print(h)
        print(accuracy_knn)
        # --- SVM accuracy for this round ---
        svc = SVC(C=15, kernel='rbf', degree=3, gamma=2, probability=True)
        svc.fit(train_X_svm_from_tuple, train_Y_svm_from_tuple)
        #accuracy_svm = svc.score(fixed_test_X, fixed_test_Y)
        accuracy_svm = svc.score(test_X_svm_from_tuple, test_Y_svm_from_tuple)
        accuracy_svm_list.append(accuracy_svm * 100)
        print("预测结果(SVM)")
        print(h)
        print(accuracy_svm)
        # Final round only evaluates — skip the pseudo-label exchange.
        if h == loop_num:
            break
        # --- KNN: predict the pool and compute per-sample confidence ---
        probility_knn = knn.predict_proba(test_X_knn_from_tuple)
        confidence_knn_list = []
        for i in range(0, probility_knn.shape[0]):
            probility_knn_temp = probility_knn[i]
            confidence_knn_list.append(
                utilities.get_confidence_knn(probility_knn_temp.copy()))
        # Hard pseudo-labels from KNN.
        predict_Y_knn = knn.predict(test_X_knn_from_tuple)
        # --- SVM: predict the pool and compute per-sample confidence ---
        probility_svm = svc.predict_proba(test_X_svm_from_tuple)
        confidence_svm_list = []
        for i in range(0, probility_svm.shape[0]):
            probility_svm_temp = probility_svm[i]
            confidence_svm_list.append(
                utilities.get_confidence_svm(probility_svm_temp.copy()))
        # Hard pseudo-labels from SVM.
        predict_Y_svm = svc.predict(test_X_svm_from_tuple)
        # --- move SVM's most confident pseudo-labels into KNN's train set ---
        index_svm_label_high_confidence = utilities.get_confidence_svm_index(
            confidence_svm_list.copy(), predict_Y_svm.copy(),
            predict_Y_knn.copy(), temp_num_svm)
        temp_test_X_svm = []
        temp_test_Y_svm = []
        for i in index_svm_label_high_confidence:
            temp_test_X_svm.append(test_X_svm_from_tuple[i])
            temp_test_Y_svm.append(predict_Y_svm[i])
        temp_test_svm_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
            temp_test_Y_svm.copy(), temp_test_X_svm.copy())
        train_knn_Y_X_tuple_list.extend(temp_test_svm_Y_X_tuple_list)
        # Remaining (unmoved) samples become SVM's new test pool.
        index_all_test_svm_Y_X_tuple_list = np.arange(
            0, len(test_svm_Y_X_tuple_list))
        diff_index_test_svm_Y_X_tuple_list = np.setdiff1d(
            index_all_test_svm_Y_X_tuple_list,
            np.array(index_svm_label_high_confidence))
        diff_test_svm_Y_X_tuple_list = []
        for i in diff_index_test_svm_Y_X_tuple_list:
            diff_test_svm_Y_X_tuple_list.append(test_svm_Y_X_tuple_list[i])
        test_svm_Y_X_tuple_list = diff_test_svm_Y_X_tuple_list
        # --- move KNN's most confident pseudo-labels into SVM's train set ---
        index_knn_label_high_confidence = utilities.get_confidence_knn_index(
            confidence_knn_list.copy(), predict_Y_svm.copy(),
            predict_Y_knn.copy(), temp_num_knn)
        temp_test_X_knn = []
        temp_test_Y_knn = []
        for i in index_knn_label_high_confidence:
            temp_test_X_knn.append(test_X_knn_from_tuple[i])
            temp_test_Y_knn.append(predict_Y_knn[i])
        temp_test_knn_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
            temp_test_Y_knn.copy(), temp_test_X_knn.copy())
        train_svm_Y_X_tuple_list.extend(temp_test_knn_Y_X_tuple_list)
        # Remaining samples become KNN's new test pool.
        index_all_test_knn_Y_X_tuple_list = np.arange(
            0, len(test_knn_Y_X_tuple_list))
        diff_index_test_knn_Y_X_tuple_list = np.setdiff1d(
            index_all_test_knn_Y_X_tuple_list,
            np.array(index_knn_label_high_confidence))
        diff_test_knn_Y_X_tuple_list = []
        for i in diff_index_test_knn_Y_X_tuple_list:
            diff_test_knn_Y_X_tuple_list.append(test_knn_Y_X_tuple_list[i])
        test_knn_Y_X_tuple_list = diff_test_knn_Y_X_tuple_list
    # Summary of per-round accuracies after all iterations.
    print("KNN的准确率:")
    print(accuracy_knn_list)
    print("SVM的准确率:")
    print(accuracy_svm_list)
# Leave-one-out evaluation of a 5-NN classifier; accumulates per-sample
# probability scores and hard classes, then computes ROC-AUC and other
# performance metrics via project `utils` helpers.
num_class = 2
loo = LeaveOneOut()
sepscores = []
# Seed rows (0.5 placeholders) so np.vstack has something to stack onto;
# they are stripped with [1:] after the loop.
y_score = np.ones((1, 2)) * 0.5
y_class = np.ones((1, 1)) * 0.5
for train, test in loo.split(X):
    cv_clf = KNeighborsClassifier(n_neighbors=5)
    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]
    # NOTE(review): y_sparse is loop-invariant (recomputed every iteration
    # and again after the loop); y_train_sparse/y_test_sparse are unused.
    y_sparse = utils.to_categorical(y)
    y_train_sparse = utils.to_categorical(y_train)
    y_test_sparse = utils.to_categorical(y_test)
    # `hist` is unused — fit returns the estimator itself, not a history.
    hist = cv_clf.fit(X_train, y_train)
    y_predict_score = cv_clf.predict_proba(X_test)
    y_predict_class = utils.categorical_probas_to_classes(y_predict_score)
    y_score = np.vstack((y_score, y_predict_score))
    y_class = np.vstack((y_class, y_predict_class))
    # Drop the fitted model before the next fold.
    cv_clf = []
# Strip the placeholder seed rows.
y_class = y_class[1:]
y_score = y_score[1:]
# ROC on column 0 (score for class 0 against the class-0 indicator).
fpr, tpr, _ = roc_curve(y_sparse[:, 0], y_score[:, 0])
roc_auc = auc(fpr, tpr)
acc, precision, npv, sensitivity, specificity, mcc, f1 = utils.calculate_performace(
    len(y_class), y_class, y)
result = [acc, precision, npv, sensitivity, specificity, mcc, roc_auc]
row = y_score.shape[0]
y_sparse = utils.to_categorical(y)
# Persist the per-sample scores for later analysis.
yscore_sum = pd.DataFrame(data=y_score)
yscore_sum.to_csv('yscore_KNN_1075_knife_no.csv')