def DTclassifier(x_train, x_test, y_train, y_test):
    '''
    Apply Decision Trees classifier to the data.

    Inputs:
        x_train, x_test: feature sets used for fitting and for evaluation
        y_train, y_test: target values matching x_train and x_test

    Output:
        minidf: evaluation dataframe for target variable; one row per
            max_depth tried (1, 3, 5, 9 and None) holding that max_depth
            and the corresponding accuracy score for training data and
            accuracy score for testing data
    '''
    colnames = ("Max_depth", "Train_accuracy", "Test_accuracy")
    frames = []
    for d in [1, 3, 5, 9, None]:
        dec_tree = DecisionTreeClassifier(max_depth=d)
        dec_tree.fit(x_train, y_train)

        train_acc = accuracy(dec_tree.predict(x_train), y_train)
        test_acc = accuracy(dec_tree.predict(x_test), y_test)

        frames.append(pd.DataFrame([(d, train_acc, test_acc)],
                                   columns=colnames))

    # DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # frame on every iteration; concatenate the one-row frames once instead.
    minidf = pd.concat(frames, ignore_index=True)
    return minidf
def try_params(n_iterations, params):
    """
    Build the classifier named in params, fit it and report its scores.

    Inputs:
        n_iterations: multiplied by the module-level trees_per_iteration and
            rounded to obtain n_estimators
        params: dict of keyword arguments for the classifier constructor,
            plus a 'classifier' entry holding the constructor's name

    The training and testing data are read from the module-level names
    x_train, y_train, x_test and y_test.

    Returns a dict with keys 'loss', 'log_loss' and 'auc' computed on the
    testing data ('loss' is the log loss).
    """
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators: {}".format(n_estimators))
    pprint(params)

    # Work on a copy: deleting 'classifier' from the caller's dict made a
    # second call with the same configuration fail with KeyError.
    params = dict(params)
    classifier = params.pop('classifier')

    # NOTE(review): eval() executes whatever string is stored under
    # params['classifier']; only use this with trusted configurations.
    clf = eval(
        "{}( n_estimators = n_estimators, verbose = 0, n_jobs = -1, "
        "**params )".format(classifier))
    clf.fit(x_train, y_train)

    p = clf.predict_proba(x_train)[:, 1]
    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))
    print("\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc))

    p = clf.predict_proba(x_test)[:, 1]
    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))
    print("# testing | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc))

    return {'loss': ll, 'log_loss': ll, 'auc': auc}
def get_accuracy_table(criterion_list, splitter_list, max_depth_list,
                       x_train, x_test, y_train, y_test):
    '''
    Creates a data frame with the information of the parameter of the
    models and its accuracy.

    Inputs:
        - criterion_list (list of strings): list of different criterion to
          be used in the models.
        - splitter_list (list of strings): list of different splitters to
          be used in the models
        - max_depth_list (list): list of the different values to be used in
          the model
        - x_train (data frame): independent variables training set.
        - x_test (data frame): independent variables testing set.
        - y_train (data frame): dependent variable training set.
        - y_test (data frame): dependent variable testing set.

    Returns a data frame with one row per (criterion, splitter, max_depth)
    combination; the frame is empty (but keeps its columns) when any of the
    parameter lists is empty.
    '''
    columns = ['criterion', 'splitter', 'max_depth',
               'accuracy_train', 'accuracy_test']
    results_list = []
    for criterion in criterion_list:
        for splitter in splitter_list:
            for depth in max_depth_list:
                dec_tree = DecisionTreeClassifier(criterion=criterion,
                                                  splitter=splitter,
                                                  max_depth=depth)
                dec_tree.fit(x_train, y_train)
                results_list.append([
                    criterion, splitter, depth,
                    accuracy(dec_tree.predict(x_train), y_train),
                    accuracy(dec_tree.predict(x_test), y_test),
                ])

    # Passing the names to the constructor (instead of assigning df.columns
    # afterwards) avoids a length-mismatch error when no model was fitted.
    return pd.DataFrame(results_list, columns=columns)
def train_predict(classifier, sample_size, X_train, X_test, y_train, y_test, typ):
    """
    Fit a classifier on a slice of the training data and evaluate it.

    inputs:
        classifier: the learning algorithm to be trained and predicted on
        sample_size: the size of samples (number) to be drawn from training set
        X_train: features training set
        X_test: features testing set
        y_train: Activity_number_ID training set
        y_test: Activity_number_ID testing set
        typ: 1, 2 or 3; selects how many activity labels the confusion
            matrix covers (6, 12 or 7 respectively)

    returns:
        (results, confusion_matrix_df) where results holds 'train_time',
        'pred_time', 'acc_train' and 'acc_test', and confusion_matrix_df is
        the labelled confusion matrix of the test predictions. When
        sample_size equals len(X_train) the frame is additionally passed
        through full_confusion_matrix.

    raises:
        ValueError: if typ is not 1, 2 or 3.
    """
    # Row/column names of the confusion matrix for each data type; the
    # numeric labels are 1..len(names) in the same order.
    activity_names = {
        1: ['WK', 'WU', 'WD', 'SI', 'ST', 'LD'],
        2: ['WK', 'WU', 'WD', 'SI', 'ST', 'LD',
            'St-Si', 'Si-St', 'Si-Li', 'Li-Si', 'St-Li', 'Li-St'],
        3: ['WK', 'WU', 'WD', 'SI', 'ST', 'LD', 'PT'],
    }
    # Fail before training: previously an unknown typ only surfaced at the
    # very end as an unbound local variable.
    if typ not in activity_names:
        raise ValueError("typ must be 1, 2 or 3, got {!r}".format(typ))
    names = activity_names[typ]
    labels = list(range(1, len(names) + 1))

    # Will include all info related to training and testing.
    results = {}

    # Fit the classifier on the first 'sample_size' training rows.
    start = timer()
    classifier = classifier.fit(X_train[0:sample_size, :], y_train[0:sample_size])
    end = timer()
    results['train_time'] = end - start

    # Predict on the test set and on the first 3000 training samples.
    start = timer()
    predictions_test = classifier.predict(X_test)
    predictions_train = classifier.predict(X_train[:3000, :])
    end = timer()
    results['pred_time'] = end - start

    # Accuracy on the first 3000 training samples and on the test set.
    results['acc_train'] = accuracy(y_train[:3000], predictions_train)
    results['acc_test'] = accuracy(y_test, predictions_test)

    confusion_matrix = cm(y_test, predictions_test, labels=labels,
                          sample_weight=None)
    confusion_matrix_df = pd.DataFrame(data=confusion_matrix,
                                       columns=names, index=names)

    if sample_size == len(X_train):
        # 100% of the training set was used: extend the contingency table.
        confusion_matrix_df = confusion_matrix_df.pipe(full_confusion_matrix)

    return (results, confusion_matrix_df)
def loop_dt(param_dict, training_predictors, testing_predictors,
            training_outcome, testing_outcome):
    '''
    Loop over series of possible parameters for decision tree classifier to
    train and test models, storing accuracy scores in a data frame

    Inputs:
        param_dict: (dictionary) possible decision tree parameters; its
            values are read positionally, so the entries must be in the
            order criterion, max_depth, max_features, min_samples_split
        training_predictors: data set of predictor variables for training
        testing_predictors: data set of predictor variables for testing
        training_outcome: outcome variable for training
        testing_outcome: outcome variable for testing

    Outputs:
        accuracy_df: (data frame) model parameters and accuracy scores for
            each iteration of the model

    Attribution: adapted combinations of parameters from Moinuddin Quadri's
    suggestion for looping:
    https://stackoverflow.com/questions/42627795/i-want-to-loop-through-all-possible-combinations-of-values-of-a-dictionary
    and method for faster population of a data frame row-by-row from
    ShikharDua:
    https://stackoverflow.com/questions/10715965/add-one-row-in-a-pandas-dataframe
    '''
    # An unfinished first loop over an undefined `classifier_type` mapping
    # used to sit here; it could only fail or do nothing and was removed.
    rows_list = []
    for params in itertools.product(*param_dict.values()):
        criterion, max_depth, max_features, min_samples_split = params
        dec_tree = DecisionTreeClassifier(criterion=criterion,
                                          max_depth=max_depth,
                                          max_features=max_features,
                                          min_samples_split=min_samples_split)
        dec_tree.fit(training_predictors, training_outcome)

        train_pred = dec_tree.predict(training_predictors)
        test_pred = dec_tree.predict(testing_predictors)

        # evaluate accuracy
        rows_list.append({
            'criterion': criterion,
            'max_depth': max_depth,
            'max_features': max_features,
            'min_samples_split': min_samples_split,
            'train_acc': accuracy(train_pred, training_outcome),
            'test_acc': accuracy(test_pred, testing_outcome),
        })

    accuracy_df = pd.DataFrame(rows_list)
    return accuracy_df
def ensemble(X, y):
    """
    Split X/y 90/10, train via train(), and print accuracy on both parts.

    The held-out predictions are also handed to comparePrediction(). The
    accuracy on the held-out part is printed first, then the accuracy on
    the training part. Nothing is returned.
    """
    split = train_test_split(X, y, train_size=0.9)
    features_fit, features_held, target_fit, target_held = split

    model, final_model = train(features_fit, target_fit)
    held_pred = test(model, final_model, features_held)
    fit_pred = test(model, final_model, features_fit)

    # NOTE(review): predictions are passed first here; confirm the argument
    # order comparePrediction expects.
    comparePrediction(held_pred, target_held)

    for truth, predicted in ((target_held, held_pred), (target_fit, fit_pred)):
        print(accuracy(truth, predicted))
def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    '''
    Fit learner on the first sample_size training rows and score it.

    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set

    returns a dict with the fit time ('train_time'), the prediction time
    ('pred_time'), accuracy ('acc_train', 'acc_test') and F-beta score with
    beta=0.5 ('f_train', 'f_test'); the training scores use only the first
    300 training samples.
    '''
    print(type(sample_size))

    # Time the fit on the requested slice of the training data.
    fit_started = time()
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    fit_seconds = time() - fit_started

    # Time both prediction passes together.
    predict_started = time()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    predict_seconds = time() - predict_started

    train_truth = y_train[:300]
    results = {
        'train_time': fit_seconds,
        'pred_time': predict_seconds,
        'acc_train': accuracy(train_truth, predictions_train),
        'acc_test': accuracy(y_test, predictions_test),
        'f_train': fbeta_score(train_truth, predictions_train, beta=0.5),
        'f_test': fbeta_score(y_test, predictions_test, beta=0.5),
    }

    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
    return results
def build_dec_tree(x_train, x_test, y_train, y_test):
    """
    Fit decision trees of depth 1, 3, 5 and 7 and print their accuracy.

    One line is printed per depth with the training and testing accuracy.
    (Lab3 for reference.)
    """
    line = "Depth: {} | Train acc: {:.2f} | Test acc: {:.2f}"
    for depth in (1, 3, 5, 7):
        tree = DecisionTreeClassifier(max_depth=depth)
        tree.fit(x_train, y_train)

        fitted_on_train = tree.predict(x_train)
        fitted_on_test = tree.predict(x_test)

        print(line.format(depth,
                          accuracy(fitted_on_train, y_train),
                          accuracy(fitted_on_test, y_test)))
def splitMetrics(clf, X, y):
    """
    Fit clf on a 90% split of X/y and report on both parts of the split.

    comparePrediction() is called for the held-out part and then for the
    training part; afterwards the two accuracies are printed in the same
    order. Nothing is returned.
    """
    split = train_test_split(X, y, train_size=0.9)
    features_fit, features_held, target_fit, target_held = split

    clf.fit(features_fit, target_fit)
    held_pred = clf.predict(features_held)
    fit_pred = clf.predict(features_fit)

    pairs = ((target_held, held_pred), (target_fit, fit_pred))
    for truth, predicted in pairs:
        comparePrediction(truth, predicted)
    for truth, predicted in pairs:
        print(accuracy(truth, predicted))
def tree_classifier(input_data, features_list):
    """
    Use a decision tree.

    Increments the module-level run counter, fits a DecisionTreeClassifier
    on the split returned by prepare_data(), prints accuracy, precision,
    recall and F1 on the test part, then prints every feature with a
    positive importance from most to least important.

    Returns (important_features, clf): the names of those features and the
    fitted classifier. Feature names are looked up at features_list[i + 1]
    for importance index i.
    """
    global count
    count += 1
    print("=============%d============" % count)

    features_train, features_test, target_train, target_test = prepare_data(
        input_data, features_list)

    clf = DecisionTreeClassifier().fit(features_train, target_train)
    pred = clf.predict(features_test)

    accu = accuracy(target_test, pred)
    print("准确率%f" % accu)

    prec = precision_score(target_test, pred)
    rec = recall_score(target_test, pred)
    f1 = f1_score(target_test, pred)
    print("精度%f" % prec)
    print("召回率%f" % rec)
    print("f1 %f" % f1)

    importance = clf.feature_importances_
    # argsort is ascending; walk it backwards for a descending ranking.
    ranking = numpy.argsort(importance)[::-1]

    important_features = []
    for no, index in enumerate(ranking):
        weight = importance[index]
        if weight > 0:
            feature_name = features_list[index + 1]
            print("No.%d--属性%s的权重%f" % (no, feature_name, weight))
            important_features.append(feature_name)

    return important_features, clf
def evaluate_classifier(y_test, predicted_scores, model_name, which_temporal_set):
    """
    Score thresholded predictions at a fixed set of cut-offs.

    Inputs:
        y_test: actual labels
        predicted_scores: iterable of predicted scores, one per label
        model_name: name stored in the 'model' column and used as prefix of
            the 'modelthresh' column
        which_temporal_set: value stored in the 'which_temporal' column

    For each threshold in 0.01, 0.02, 0.05, 0.10, 0.20, 0.30, 0.50 a score
    below the threshold becomes 0, anything else becomes 1.

    Returns a data frame with one row per threshold and the columns
    'modelthresh', 'which_temporal', 'model', 'threshold', 'accuracy',
    'precision' and 'recall'. The last two are the means of the arrays
    returned by precision_recall_curve for the thresholded predictions.
    """
    cutoffs = (0.01, 0.02, 0.05, 0.10, 0.20, 0.30, 0.50)
    columns = ['modelthresh', 'which_temporal', 'model', 'threshold',
               'accuracy', 'precision', 'recall']

    rows = []
    for threshold in cutoffs:
        predicted_test = np.array(
            [0 if score < threshold else 1 for score in predicted_scores])
        test_acc = accuracy(predicted_test, y_test)
        # The third return value used to overwrite the dict of thresholds
        # being iterated; it is not needed here.
        precision, recall, _ = precision_recall_curve(y_test, predicted_test)
        rows.append([
            model_name + str(threshold), which_temporal_set, model_name,
            threshold, test_acc, np.mean(precision), np.mean(recall),
        ])

    # Build the frame once: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=columns)
def evaluate_dectree(x_train, y_train, x_test, y_test, thresh=thresh):
    '''
    generates df of predictions, criteria, thresholds, precision, recall,
    and accuracy of decision trees

    For each criterion ('entropy', 'gini') the scores returned by
    dectree_classifier() are converted to average ranks divided by the
    number of scores, then compared against every threshold in thresh with
    compare_to_threshold(). Each (criterion, threshold) pair yields one row.
    '''
    criterion = ['entropy', 'gini']
    rd = {
        'predicted': [], 'crit': [], 'threshold': [], 'precision': [],
        'recall': [], 'accuracy': [], 'class': []
    }
    for c in criterion:
        raw_scores = dectree_classifier(x_train, y_train, x_test, c)
        # Ranking does not depend on the threshold (and re-ranking ranks
        # gives the same values), so do it once per model.
        scores = list(stats.rankdata(raw_scores, 'average') / len(raw_scores))
        for t in thresh:
            preds = [compare_to_threshold(x, t) for x in scores]
            rd['predicted'].append(preds)
            rd['crit'].append(c)
            rd['threshold'].append(t)
            rd['precision'].append(precision(y_test, preds))
            rd['recall'].append(recall(y_test, preds))
            rd['accuracy'].append(accuracy(y_test, preds))
            rd['class'].append('dectree')
    return pd.DataFrame(rd)
def evaluate_rf(x_train, y_train, x_test, y_test, thresh=thresh, ntrees=[25, 100, 500], maxfeats=[1, .5, 4]):
    '''
    generates df of predictions, number of trees, max features, thresholds,
    precision, recall, and accuracy of random forests

    For each (ntrees, maxfeats) pair the scores returned by
    random_forest_classifier() are converted to average ranks divided by
    the number of scores, then compared against every threshold in thresh
    with compare_to_threshold(). The list defaults are only read, never
    modified.
    '''
    rd = {
        'predicted': [], 'ntrees': [], 'nfeats': [], 'threshold': [],
        'precision': [], 'recall': [], 'accuracy': [], 'class': []
    }
    for size in ntrees:
        for f in maxfeats:
            raw_scores = random_forest_classifier(size, f, x_train, y_train,
                                                  x_test)
            # Ranking does not depend on the threshold (and re-ranking
            # ranks gives the same values), so do it once per model.
            scores = list(stats.rankdata(raw_scores, 'average') / len(raw_scores))
            for t in thresh:
                preds = [compare_to_threshold(x, t) for x in scores]
                rd['predicted'].append(preds)
                rd['ntrees'].append(size)
                rd['nfeats'].append(f)
                rd['threshold'].append(t)
                rd['precision'].append(precision(y_test, preds))
                rd['recall'].append(recall(y_test, preds))
                rd['accuracy'].append(accuracy(y_test, preds))
                rd['class'].append('rf')
    return pd.DataFrame(rd)
def sklearn_acc(model, test_data, test_target):
    """
    Score model on the test data.

    The raw output of model.predict() is compared with the targets via
    mae(); outputs above 0.5 are then turned into label 1 (otherwise 0) for
    the accuracy and macro-averaged F1.

    Returns the list [mae, accuracy, macro F1].
    """
    raw_output = model.predict(test_data)
    hard_labels = (raw_output > 0.5).astype(int)

    error = mae(raw_output, test_target)
    hit_rate = accuracy(hard_labels, test_target)
    macro_f1 = f1_score(hard_labels, test_target, average='macro')
    return [error, hit_rate, macro_f1]
def score(self, pipeline_dic):
    """
    Fit the model described by pipeline_dic and return its test accuracy.

    pipeline_dic is read under the keys 'tfidf', 'features' and 'model';
    pipeline_dic['model']['type'] selects the builder from
    self.model_builders and the remaining 'model' entries are passed to it
    as keyword arguments.

    The tf-idf step is refreshed through self.update_tfidf() unless
    self.keep_tfidf() says it can be kept; the features are refreshed
    through self.update_features() unless the tf-idf step was kept and
    self.keep_features() agrees. The model is then fitted on
    self.X_train / self.Y_train and scored on self.X_test / self.Y_test.
    """
    # A TfidfVectorizer used to be constructed here and never used; the
    # dead local was removed.
    keep_tfidf = self.keep_tfidf(pipeline_dic['tfidf'])
    if not keep_tfidf:
        self.update_tfidf(pipeline_dic['tfidf'])

    # Features can only be kept when the tf-idf step was kept as well.
    keep_features = keep_tfidf and self.keep_features(
        pipeline_dic['features'])
    if not keep_features:
        self.update_features(pipeline_dic['features'])

    self.model_builder = self.model_builders[pipeline_dic['model']['type']]
    model_dic = {
        key: value
        for key, value in pipeline_dic['model'].items()
        if key != 'type'
    }
    self.model = self.model_builder(**model_dic)
    self.model.fit(self.X_train, self.Y_train)

    Y_pred = self.model.predict(self.X_test)
    score = accuracy(Y_pred, self.Y_test)
    print(f"Params = {pipeline_dic}, score = {round(score, 3)}. \n")
    return score
def evaluate_logreg(x_train, y_train, x_test, y_test, c_values=[.01,.1,1,10,100], thresh=thresh):
    '''
    generates df of predictions, penalties, c_values, thresholds,
    precision, recall, and accuracy of logistic regression

    For each (penalty, C) pair the scores returned by logreg_classifier()
    are converted to average ranks divided by the number of scores, then
    compared against every threshold in thresh with compare_to_threshold().
    The list default is only read, never modified.
    '''
    penalties = ['l2']
    rd = {'predicted': [], 'penalty': [], 'C': [], 'threshold': [],
          'precision': [], 'recall': [], 'accuracy': [], 'class': []}
    for p in penalties:
        for c in c_values:
            raw_scores = logreg_classifier(x_train, y_train, x_test, c, p)
            # Ranking does not depend on the threshold (and re-ranking
            # ranks gives the same values), so do it once per model.
            scores = list(stats.rankdata(raw_scores, 'average') / len(raw_scores))
            for t in thresh:
                preds = [compare_to_threshold(x, t) for x in scores]
                rd['predicted'].append(preds)
                rd['penalty'].append(p)
                rd['C'].append(c)
                rd['threshold'].append(t)
                rd['precision'].append(precision(y_test, preds))
                rd['recall'].append(recall(y_test, preds))
                rd['accuracy'].append(accuracy(y_test, preds))
                rd['class'].append('logreg')
    return pd.DataFrame(rd)
def train_evaluate():
    """
    Fit a fixed list of classifiers and print each one's test accuracy.

    The data is read from the CSV files named by the module-level
    train_file and test_file; column 'y' is the target and every other
    column is a feature.
    """
    train_frame = pd.read_csv(train_file)
    test_frame = pd.read_csv(test_file)

    x_train, y_train = train_frame.drop('y', axis=1).values, train_frame.y.values
    x_test, y_test = test_frame.drop('y', axis=1).values, test_frame.y.values

    candidates = [
        make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=300)),
        make_pipeline(StandardScaler(), LogisticRegression(C=30, max_iter=300)),
        make_pipeline(MinMaxScaler(), SVC(kernel='rbf')),
        make_pipeline(MinMaxScaler(), KNeighborsClassifier()),
        RandomForestClassifier(n_estimators=10000),
        GradientBoostingClassifier(n_estimators=1000),
        make_pipeline(MinMaxScaler(), MLPClassifier(hidden_layer_sizes=(250, 150))),
    ]

    report = "Accuracy: {:.2%} \n\n{}\n\n"
    for model in candidates:
        model.fit(x_train, y_train)
        hit_rate = accuracy(y_test, model.predict(x_test))
        print(report.format(hit_rate, model))
def evaluate(y_true, y_pred, estimator):
    '''
    Return the estimator's class name and a list of rounded metric values.

    The list holds, in this order and rounded to 6 decimals:
    accuracy, r2, log_loss, MSE, RMSE.

    You need to create the data frame yourself, for example

        ind, val = evaluate(ytrain, y_pred, lin_r)
        pd.DataFrame([val], index=[ind],
                     columns=['accuracy', 'r2', 'log_loss', 'MSE', 'RMSE'])

    or, to compare multiple models,

        ind, val = [], []
        for estimator in [lin_r, las, elas, ridg, ada, extra, gra, rnd]:
            estimator.fit(Xtrain, ytrain)
            y_pred = estimator.predict(Xtrain)
            tmp1, tmp2 = evaluate(ytrain, y_pred, estimator)
            ind.append(tmp1)
            val.append(tmp2)
        result1 = pd.DataFrame(np.array(val), index=[ind],
                               columns=['accuracy', 'r2', 'log_loss',
                                        'MSE', 'RMSE'])
        result1.sort_values(by=['RMSE', 'MSE'])
    '''
    # NOTE(review): assumes `metrics` is sklearn.metrics, which exposes
    # accuracy_score; the attribute `accuracy` used before does not exist
    # there and raised AttributeError.
    acc = metrics.accuracy_score(y_true, y_pred)
    logloss = metrics.log_loss(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    return type(estimator).__name__, [
        round(acc, 6),
        round(r2, 6),
        round(logloss, 6),
        round(mse, 6),
        round(np.sqrt(mse), 6),
    ]
def validate(args, model, dataset):
    """
    Evaluate model on dataset without updating it.

    args supplies batch_size and num_workers for generate_batches(). The
    model output for each batch is flattened and used both for the BCE
    loss and, thresholded at 0.5, as the predicted label.

    Returns (mean batch loss, accuracy, macro-averaged F1).
    """
    model.eval()
    loss_fcn = torch.nn.BCELoss()
    data_dataloader = generate_batches(dataset, args.batch_size,
                                       n_workers=args.num_workers)

    loss_list = []
    pred_list = []
    label_list = []
    with torch.no_grad():
        for batch_data, batch_label in data_dataloader:
            batch_logit = model(batch_data).view(-1)
            loss = loss_fcn(batch_logit, batch_label)
            pred = (batch_logit > 0.5).int()

            # Store plain Python numbers: extending with the tensors
            # themselves keeps one 0-d tensor per sample, which is slow to
            # score and cannot be converted when the tensors are not on
            # the CPU.
            pred_list.extend(pred.cpu().tolist())
            label_list.extend(batch_label.cpu().tolist())
            loss_list.append(loss.item())

    loss_data = np.array(loss_list).mean()
    acc = accuracy(pred_list, label_list)
    f1 = f1_score(pred_list, label_list, average='macro')
    return loss_data, acc, f1
def evaluate_knn(x_train, y_train, x_test, y_test, kays=[3, 5, 7, 9, 11], thresh=thresh):
    '''
    generates df of predictions, k values, thresholds, precision, recall,
    and accuracy to help find best model

    For each k, column 1 of the result of knn_classifier() is converted to
    average ranks divided by the number of scores, then compared against
    every threshold in thresh with compare_to_threshold(). The list default
    is only read, never modified.
    '''
    rd = {
        'predicted': [], 'k': [], 'threshold': [], 'precision': [],
        'recall': [], 'accuracy': [], 'class': []
    }
    for k in kays:
        raw_scores = knn_classifier(x_train, y_train, x_test, k)[:, 1]
        # Ranking does not depend on the threshold (and re-ranking ranks
        # gives the same values), so do it once per model.
        scores = list(stats.rankdata(raw_scores, 'average') / len(raw_scores))
        for t in thresh:
            preds = [compare_to_threshold(x, t) for x in scores]
            rd['predicted'].append(preds)
            rd['k'].append(k)
            rd['threshold'].append(t)
            rd['precision'].append(precision(y_test, preds))
            rd['recall'].append(recall(y_test, preds))
            rd['accuracy'].append(accuracy(y_test, preds))
            rd['class'].append('knn')
    return pd.DataFrame(rd)
def present_results_simp(y_test, predictions):
    """
    Tabulate the metrics of several models' predictions.

    Inputs:
        y_test: actual values
        predictions: dict mapping a model name to its predictions

    Returns a data frame with one row per model holding accuracy,
    precision, precision at the top 1/2/5/10/20/30/50 %, recall, recall at
    the same cut-offs, and F1. The frame is empty (but keeps its columns)
    when predictions is empty.
    """
    top_fractions = (0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5)
    columns = [
        'Model', 'Accuracy', 'Precision', 'Precision top 1%',
        'Precision top 2%', 'Precision top 5%', 'Precision top 10%',
        'Precision top 20%', 'Precision top 30%', 'Precision top 50%',
        'Recall', 'Recall top 1%', 'Recall top 2%', 'Recall top 5%',
        'Recall top 10%', 'Recall top 20%', 'Recall top 30%',
        'Recall top 50%', 'F 1'
    ]

    results_list = []
    for k, v in predictions.items():
        row = [k, accuracy(v, y_test), precision(v, y_test)]
        row.extend(precision_top(v, y_test, frac) for frac in top_fractions)
        row.append(recall(v, y_test))
        row.extend(recall_top(v, y_test, frac) for frac in top_fractions)
        row.append(f1(v, y_test))
        results_list.append(row)

    # Passing the names to the constructor (instead of assigning df.columns
    # afterwards) avoids a length-mismatch error for an empty dict.
    return pd.DataFrame(results_list, columns=columns)
def get_metrics(prediction, y_test):
    '''
    Computes accuracy, precision, recall, ROC-AUC and F1 for the
    predictions produced by a ML model against the actual values of the
    dependent variable.

    Inputs:
        - prediction: an array with predictions.
        - y_test: an array with actual values.

    Returns a dictionary with the metrics of the model; 'AUC' is 0 when
    computing it raises ValueError.
    '''
    metrics_dict = {
        'Accuracy': accuracy(prediction, y_test),
        'Precision': precision(prediction, y_test),
        'Recall': recall(prediction, y_test),
    }
    try:
        metrics_dict['AUC'] = roc_auc(prediction, y_test)
    except ValueError:
        metrics_dict['AUC'] = 0
    metrics_dict['F1'] = f1(prediction, y_test)
    return metrics_dict
def lookup_best_c(x_train, y_train, x_test, y_test):
    """
    Map every value in the module-level C_values to its test accuracy.

    For each C a logistic regression (lbfgs solver, l2 penalty,
    one-vs-rest, random_state 337, up to 10000 iterations) is fitted on the
    training data and its predictions on x_test are scored against y_test.

    Returns a dict {C value: accuracy}.
    """
    def accuracy_for(c_value):
        # Same fixed hyper-parameters for every run; only C varies.
        model = LR(solver='lbfgs',
                   class_weight=None,
                   multi_class='ovr',
                   dual=False,
                   penalty='l2',
                   random_state=337,
                   C=c_value,
                   max_iter=10000)
        model.fit(x_train, y_train)
        return accuracy(model.predict(x_test), y_test)

    return {value: accuracy_for(value) for value in C_values}
def log_metrics(logger, phase, epoch_num, y_hat, y):
    """
    Write per-class metrics for one epoch to logger.

    For each class, accuracy, F1, specificity, sensitivity (all at a 0.5
    threshold) and ROC-AUC are logged under '<phase>_<metric>_<class>';
    afterwards the log loss of each class column is logged under
    '<phase>_bce_<class>'.
    """
    th = 0.5
    classes = [
        'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid',
        'subdural', 'any'
    ]

    # Tags paired positionally with the per-class metric sequences below.
    tags = ('acc', 'f1', 'spec', 'sens', 'roc')
    per_class = (
        metrics.accuracy(y_hat, y, th, True),
        metrics.f1score(y_hat, y, th, True),
        metrics.specificity(y_hat, y, th, True),
        metrics.sensitivity(y_hat, y, th, True),
        metrics.roc_auc(y_hat, y),
    )

    for entry in zip(*per_class, classes):
        class_name = entry[-1]
        for tag, value in zip(tags, entry[:-1]):
            logger.add_scalar(f'{phase}_{tag}_{class_name}', value, epoch_num)

    for i, class_name in enumerate(classes):
        bce = sklearn.metrics.log_loss(y[:, i], y_hat[:, i])
        logger.add_scalar(f'{phase}_bce_{class_name}', bce, epoch_num)
def classify(filePath, name):
    """
    Classify the image at filePath with the module-level clf.

    The vector produced by img2vector() is stored in the module-level
    returnVect. The part of name before the first '_' must be 1 or 9 and
    gives the expected class.

    Returns (pred, accuracy) where pred is 9 when the classifier answers
    -1 and 1 otherwise.
    """
    global returnVect

    expected_by_digit = {1: [1], 9: [-1]}

    returnVect = img2vector(filePath)
    y_pred = clf.predict(returnVect)

    pred = 9 if y_pred == [-1] else 1
    digit = int(name.split('_')[0])
    return pred, accuracy(expected_by_digit[digit], y_pred)
def print_prediction_metrics(clf, x, y, k):
    """
    Print cross-validated accuracy plus precision and recall of class 1.

    Predictions come from cross_val_predict() over k shuffled stratified
    folds. Accuracy is rounded to 2 decimals, precision and recall (index 1
    of the per-class results) to 3.
    """
    folds = StratifiedKFold(n_splits=k, shuffle=True)
    pred = cross_val_predict(clf, x, y, cv=folds)

    overall = accuracy(y, pred)
    print("Accuracy: ", round(overall, 2))

    spam_precision = precision(y, pred, average=None)[1]
    print("Precision on spam: ", round(spam_precision, 3))

    spam_recall = recall(y, pred, average=None)[1]
    print("Recall on spam: ", round(spam_recall, 3))
def get_error(hidden_layer_sizes, X_train, y_train, X_test, y_test):
    """
    Return the test error (1 - accuracy) of an MLP with the given layers.

    The network uses the logistic activation and the module-level
    RANDOM_SEED; y_train is flattened with ravel() before fitting.
    """
    network = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                            activation='logistic',
                            random_state=RANDOM_SEED)
    network.fit(X_train, y_train.ravel())

    test_accuracy = accuracy(y_test, network.predict(X_test))
    return 1 - test_accuracy
def update_metrics(gt, pre, f1_m, p_m, r_m, acc_m):
    """
    Feed the scores of one batch into the four running meters.

    gt and pre are the ground truth and the predictions. F1, precision
    (with zero_division=0) and recall are micro-averaged; each value is
    passed to the matching meter's update() in the order F1, precision,
    recall, accuracy.
    """
    f1_m.update(f1(gt, pre, average="micro"))
    p_m.update(precision(gt, pre, average="micro", zero_division=0))
    r_m.update(recall(gt, pre, average="micro"))
    acc_m.update(accuracy(gt, pre))
def train_and_evaluate(y_train, x_train, y_val, x_val, alg):
    """
    Fit alg on the training data and score it on the validation data.

    Returns (auc, acc): the AUC of column 1 of predict_proba() and the
    accuracy of predict(), both against y_val.
    """
    alg.fit(x_train, y_train)

    val_proba = alg.predict_proba(x_val)
    val_labels = alg.predict(x_val)

    hit_rate = accuracy(y_val, val_labels)
    area = AUC(y_val, val_proba[:, 1])
    return (area, hit_rate)
def train_and_evaluate(y_train, x_train, y_val, x_val, alg):
    """
    Train alg, then return its validation (AUC, accuracy).

    AUC is computed from column 1 of predict_proba(x_val), accuracy from
    predict(x_val); both are measured against y_val.
    """
    alg.fit(x_train, y_train)

    positive_scores = alg.predict_proba(x_val)[:, 1]
    predicted_labels = alg.predict(x_val)

    validation_accuracy = accuracy(y_val, predicted_labels)
    validation_auc = AUC(y_val, positive_scores)
    return (validation_auc, validation_accuracy)
def compute_accuracy(dec_tree, x_data, y_data, threshold):
    '''
    Takes: decision tree classifier object, feature and target data, and
           prediction probability threshold

    Returns: accuracy of predictions of tree on x for y, where a
             probability (column 1 of predict_proba) below the threshold
             is predicted as 0 and anything else as 1
    '''
    positive_proba = dec_tree.predict_proba(x_data)[:, 1]
    labels = np.array(
        [0 if proba < threshold else 1 for proba in positive_proba])
    return accuracy(labels, y_data)
def train_and_evaluate(y_train, x_train, y_val, x_val):
    """
    Fit a default LR() model and score it on the validation data.

    Returns (auc, acc): the AUC of column 1 of predict_proba() and the
    accuracy of predict(), both against y_val.
    """
    model = LR()
    model.fit(x_train, y_train)

    val_proba = model.predict_proba(x_val)
    val_labels = model.predict(x_val)

    hit_rate = accuracy(y_val, val_labels)
    area = AUC(y_val, val_proba[:, 1])
    return (area, hit_rate)
def _score_and_report(clf, x, y, report_template):
    """Score clf on (x, y), print the formatted report, return (log loss, AUC)."""
    try:
        p = clf.predict_proba(x)[:, 1]    # sklearn convention
    except IndexError:
        # predict_proba returned something without a second column.
        p = clf.predict_proba(x)

    ll = log_loss(y, p)
    auc = AUC(y, p)
    acc = accuracy(y, np.round(p))
    print(report_template.format(ll, auc, acc))
    return ll, auc


def train_and_eval_sklearn_classifier(clf, data):
    """
    Fit clf and report log loss, AUC and accuracy on both data splits.

    data is a dict with the keys 'x_train', 'y_train', 'x_test' and
    'y_test'. One line is printed for the training split and one for the
    testing split.

    Returns a dict with keys 'loss', 'log_loss' and 'auc' computed on the
    testing split ('loss' is the log loss).
    """
    x_train = data['x_train']
    y_train = data['y_train']

    x_test = data['x_test']
    y_test = data['y_test']

    clf.fit(x_train, y_train)

    _score_and_report(
        clf, x_train, y_train,
        "\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}")
    ll, auc = _score_and_report(
        clf, x_test, y_test,
        "# testing | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}")

    return {'loss': ll, 'log_loss': ll, 'auc': auc}
def train_and_evaluate(y_train, x_train, y_val, x_val, clf):
    """
    Fit the model selected by clf and score it on the validation data.

    clf is 'LR' for a default LR() model or 'RF' for an RF() model with
    200 estimators (verbose, 4 jobs).

    Returns (auc, acc): the AUC of column 1 of predict_proba() and the
    accuracy of predict(), both against y_val.

    Raises ValueError for any other clf value.
    """
    if clf == 'LR':
        model = LR()
    elif clf == 'RF':
        n_trees = 200
        model = RF(n_estimators=n_trees, verbose=True, n_jobs=4)
    else:
        # Previously an unknown name fell through both branches and failed
        # later with an unbound local variable.
        raise ValueError("clf must be 'LR' or 'RF', got {!r}".format(clf))

    model.fit(x_train, y_train)
    p = model.predict_proba(x_val)
    p_bin = model.predict(x_val)

    acc = accuracy(y_val, p_bin)
    auc = AUC(y_val, p[:, 1])

    return (auc, acc)
### Fit, extrapolate, measure error
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import accuracy_score as accuracy

for clf in clfs:
    # Fit
    # The loop body used to fit and query `rf` on every iteration instead
    # of the classifier being iterated.
    start = clock()
    clf.fit(train_data, train_target)
    print("Fitted in {:.0f} seconds.".format(clock() - start))

    # Extrapolate
    start = clock()
    predict = clf.predict_proba(test_data)
    predict_bin = clf.predict(test_data)
    print("Extrapolated in {:.0f} seconds.".format(clock() - start))

    # Compute ROC AUC and accuracy
    acc = accuracy(test_target.values, predict_bin)
    auc = AUC(test_target.values, predict[:, 1])
    print("AUC: {:.2%}. Accuracy: {:.2%}.".format(auc, acc))

"""
Results
RF(n_estimators = 10, verbose = True)
Fitted in 3 seconds.
Extrapolated in 0 seconds.
AUC: 50.67%. Accuracy: 49.67%.
"""
def AccuracyErrorCalc(y, p):
    """Return one minus accuracy(y, p)."""
    hit_rate = accuracy(y, p)
    return 1 - hit_rate
# train.to_csv( 'data/train_v.csv', index = False )
# val.to_csv( 'data/test_v.csv', index = None )

# encode the categorical variable as one-hot, drop the original column afterwards

train_dummies = pd.get_dummies(train.c1)
train_num = pd.concat((train.drop('c1', axis=1), train_dummies.astype(int)), axis=1)
# train_num.to_csv( 'data/train_v_num.csv', index = False )

# Encoding the two sets independently yields different columns whenever a
# category is missing from (or only present in) the validation set, so the
# validation dummies are aligned to the training ones: missing categories
# become all-zero columns, unseen ones are dropped.
val_dummies = pd.get_dummies(val.c1).reindex(columns=train_dummies.columns,
                                             fill_value=0)
val_num = pd.concat((val.drop('c1', axis=1), val_dummies.astype(int)), axis=1)
# val_num.to_csv( 'data/test_v_num.csv', index = False )

# train, predict, evaluate

n_trees = 100

rf = RF(n_estimators=n_trees, verbose=True)
rf.fit(train_num.drop('target', axis=1), train_num.target)

p = rf.predict_proba(val_num.drop('target', axis=1))
p_bin = rf.predict(val_num.drop('target', axis=1))

acc = accuracy(val_num.target.values, p_bin)
auc = AUC(val_num.target.values, p[:, 1])
print("AUC: {:.2%}, accuracy: {:.2%}".format(auc, acc))

# AUC: 51.40%, accuracy: 51.14% / 100 trees
# AUC: 52.16%, accuracy: 51.62% / 1000 trees
def run(X=None, y=None, X_submission=None, y_submission_val=None,pred_type='prediction',train_file='',test_file='',output_file='',stacked_level='stage1',creating_next_data=False,clfs=[],n_folds=5):#or validation
    """
    Run one stacking level: per-fold base models blended by LR().

    X and y are shuffled with a fixed seed and split into n_folds
    stratified folds. Every classifier in clfs is fitted once per fold; its
    column-1 predict_proba() output on the held-out fold fills one column
    of the blended training set, and the mean of its per-fold predictions
    on X_submission fills the matching column of the blended test set. The
    original features are then prepended to both blended sets and an LR()
    model is fitted on the result.

    pred_type selects what happens with the predictions:
        - 'prediction': each model's averaged predictions and the final
          blended predictions are written to CSV files whose names start
          with output_file; when creating_next_data is true the blended
          sets are also written under stacked_datasets/.
        - anything else: the AUC of the blended predictions against
          y_submission_val is printed.

    Uses the module-level x_train (for its column names), test_num (a frame
    with a 't_id' column that receives a 'probability' column) and
    auc_stage (per-level lists of mean AUC values). train_file and
    test_file are not used in the body. Returns None.
    """
    #train_file = 'numerai_datasets/numerai_training_data.csv'
    #test_file = 'numerai_datasets/numerai_tournament_data.csv'
    #output_file = 'prediction/predictions_lr.csv'

    # x_train is only used for its column names
    global x_train, test_num, auc_stage

    ################## Stacking ###############
    # At this point, 'prediction' mode needs X, y and X_submission;
    # validation mode needs X, y, X_submission and y_submission_val.

    np.random.seed(0) # seed to shuffle the train set

    n_folds = n_folds
    # NOTE(review): `verbose` is assigned but never read below.
    verbose = True
    shuffle = True

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]
        #validation_flag = validation_flag[idx]

    # NOTE(review): the object is built from (y, n_folds) and iterated
    # directly for (train, test) index pairs - this looks like the old
    # sklearn.cross_validation API; confirm against the installed version.
    skf = list(StratifiedKFold(y, n_folds))

    print "Creating train and test sets for blending."

    #print "\nLevel 0"
    # One column per classifier: out-of-fold predictions for the training
    # rows, fold-averaged predictions for the submission rows.
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    stacked_data_columns = x_train.columns.tolist()
    for j, clf in enumerate(clfs):
        print j, clf
        # One column per fold; averaged after the fold loop.
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        acc = []
        auc = []
        for i, (train, test) in enumerate(skf):# # of n_fold
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]

            # The fit call depends on the classifier's name, taken from the
            # text of str(clf) before the first "(".
            if str(clf).split("(")[0] in ['XGBClassifier']:
                #evallist  = [(X_test,'eval'), (X_train,'train')]
                clf.fit(X_train, y_train, eval_metric='logloss',eval_set=[(X_train, y_train),(X_test, y_test)])
            elif str(clf).split("(")[0] in ['Classifier']:
                #evallist  = [(X_test,'eval'), (X_train,'train')]
                # Standardise with the training fold's statistics.
                # NOTE(review): X_submission is passed to predict_proba
                # below without this standardisation - confirm intended.
                mu = X_train.mean(0)
                stddev = X_train.std(0)
                X_train = (X_train-mu) / stddev
                X_test = (X_test-mu) / stddev
                clf.fit(X_train, y_train)
            elif 'Keras' in str(clf).split("(")[0]:
                #evallist  = [(X_test,'eval'), (X_train,'train')]
                clf.fit(X_train, y_train,validation_data=[X_test,y_test])
            else:
                clf.fit(X_train, y_train)

            y_submission = clf.predict_proba(X_test)[:,1]
            dataset_blend_train[test, j] = y_submission

            acc.append(accuracy( y_test, y_submission.round() ))
            auc.append(AUC( y_test, y_submission ))

            #if using the mean of the prediction of each n_fold
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:,1]

        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)

        #if using the prediction of all train data
        #clf.fit(X, y)
        #dataset_blend_test[:,j] = clf.predict_proba(X_submission)[:,1]

        print "clf: {}\n".format(clf)
        print
        # NOTE(review): the label says "logloss" but the values printed are
        # the mean and standard deviation of the per-fold AUC.
        print "logloss: {:.4}+{:.2}, accuracy: {:.4}+{:.2} \n".format( np.mean(auc),np.std(auc), np.mean(acc), np.std(acc) )
        auc_stage[stacked_level].append(np.mean(auc))
        print

        if pred_type == 'prediction':
            print "saving individual model"
            indi_filename = output_file + '_{}{}_{}.csv'.format(str(clf).split("(")[0], j+1,stacked_level )
            test_num['probability'] = dataset_blend_test[:,j]
            test_num.to_csv( indi_filename, columns = ( 't_id', 'probability' ), index = None )
            stacked_data_columns.append('{}{}_{}.csv'.format(str(clf).split("(")[0], j+1,stacked_level))

    # auc_stage[stacked_level] = np.mean(auc_stage[stacked_level])

    # prepend the original data to the predictions
    dataset_blend_train = np.concatenate((X,dataset_blend_train),axis=1)
    dataset_blend_test = np.concatenate((X_submission,dataset_blend_test),axis=1)

    # saving the stacked data for next stack level
    if pred_type == 'prediction' and creating_next_data == True:
        next_dataset_blend_train = pd.DataFrame(dataset_blend_train,columns=stacked_data_columns)
        next_dataset_blend_train['target'] = y
        #next_dataset_blend_train['validation'] = validation_flag
        next_dataset_blend_test = pd.DataFrame(dataset_blend_test,columns=stacked_data_columns)
        next_dataset_blend_test = pd.concat([test_num['t_id'],next_dataset_blend_test],axis=1)
        next_dataset_blend_train.to_csv('stacked_datasets/stacking_train_{}.csv'.format(stacked_level),index=None)
        next_dataset_blend_test.to_csv('stacked_datasets/stacking_test_{}.csv'.format(stacked_level),index=None)

    #print "\nLevel 1"
    print "Blending."
    clf = LR()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:,1]

    #print "Linear stretch of predictions to [0,1]"
    #y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    if pred_type != 'prediction':
        # NOTE(review): the message says "logloss" but the value printed is
        # the AUC.
        print "final logloss in validation set"
        print AUC( y_submission_val, y_submission )
    else:
        print "saving..."
        test_num['probability'] = y_submission
        output_file = output_file + '_' + stacked_level + '.csv'
        test_num.to_csv( output_file, columns = ( 't_id', 'probability' ), index = None )
# Usage: score.py <y_file> <p_file>
# y_file holds the true labels in its first column, p_file one raw score
# per line.
y_file = sys.argv[1]
p_file = sys.argv[2]

print("loading p...")

p = np.loadtxt(p_file)

# Turn the raw scores into -1/+1 labels: negative -> -1, otherwise +1.
y_predicted = np.ones((p.shape[0]))
y_predicted[p < 0] = -1

print("loading y...")

y = np.loadtxt(y_file, usecols=[0])

# Single-argument print calls give the same output under Python 2 and 3.
print("accuracy: {}".format(accuracy(y, y_predicted)))
print("precision: {}".format(precision(y, y_predicted, average='binary')))
print("recall: {}".format(recall(y, y_predicted, average='binary')))
print("AUC: {}".format(AUC(y, p)))
print("")
print("confusion matrix:")
print(confusion_matrix(y, y_predicted))

# The usage note below was an unterminated string literal; it is closed
# here so it no longer swallows whatever follows it.
"""
run score.py data/test_v.txt vw/p_v_logistic.txt

accuracy: 0.994675826535

confusion matrix:
"""
sys.path.append("../tools/")
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()


#########################################################
### your code goes here ###
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score as accuracy

clf = GaussianNB()

# The block mixed the print statement with the print function; a
# single-argument call prints the same text under Python 2 and 3 (the
# two-argument call printed a tuple under Python 2).
t0 = time()
clf.fit(features_train, labels_train)
print("training time: {} s".format(round(time() - t0, 3)))

t0 = time()
pred = clf.predict(features_test)
print("prediction time: {} s".format(round(time() - t0, 3)))

acc = accuracy(labels_test, pred)
print("Classifier accuracy: {}".format(acc))

#########################################################