def DTclassifier(x_train, x_test, y_train, y_test):
    '''
    Apply Decision Trees classifier to the data
    Output:
        minidf: evaluation dataframe with one row per max_depth tried
                and the corresponding accuracy scores on the training
                and testing data
    '''

    colnames = ("Max_depth", "Train_accuracy", "Test_accuracy")
    minidf = pd.DataFrame(columns=colnames)

    for d in [1, 3, 5, 9, None]:
        dec_tree = DecisionTreeClassifier(max_depth=d)
        dec_tree.fit(x_train, y_train)

        train_pred = dec_tree.predict(x_train)
        train_acc = accuracy(train_pred, y_train)

        test_pred = dec_tree.predict(x_test)
        test_acc = accuracy(test_pred, y_test)

        data = [(d, train_acc, test_acc)]
        df_temp = pd.DataFrame(data, columns=colnames)
        minidf = pd.concat([minidf, df_temp], ignore_index=True)

    return minidf
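A minimal usage sketch for the function above, assuming the imports it relies on (pandas, scikit-learn's DecisionTreeClassifier, and accuracy_score aliased as accuracy) plus a toy dataset:

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score as accuracy
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# toy data just to exercise the function above
X, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

results = DTclassifier(x_train, x_test, y_train, y_test)
print(results)  # one row per max_depth with its train/test accuracy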
Example #2
def try_params(n_iterations, params):

    n_estimators = int(round(n_iterations * trees_per_iteration))
    print "n_estimators:", n_estimators
    pprint(params)

    classifier = params['classifier']
    del params['classifier']

    clf = eval("{}(n_estimators=n_estimators, verbose=0, n_jobs=-1, **params)".format(classifier))
    clf.fit(x_train, y_train)

    p = clf.predict_proba(x_train)[:, 1]

    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))

    print "\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc)

    #

    p = clf.predict_proba(x_test)[:, 1]

    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))

    print "# testing  | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc)

    return {'loss': ll, 'log_loss': ll, 'auc': auc}
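The eval-based construction above builds the model from its class name string. Below is a sketch of an eval-free equivalent; the class names in the dictionary are assumptions for illustration, not taken from the original code:

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

# map the string stored under params['classifier'] to the actual class
CLASSIFIERS = {
    'RandomForestClassifier': RandomForestClassifier,
    'ExtraTreesClassifier': ExtraTreesClassifier,
}

def build_classifier(name, n_estimators, **params):
    # a KeyError for an unknown name is preferable to eval-ing an arbitrary string
    cls = CLASSIFIERS[name]
    return cls(n_estimators=n_estimators, verbose=0, n_jobs=-1, **params)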
Example #3
def get_accuracy_table(criterion_list, splitter_list, max_depth_list, x_train, x_test, y_train, y_test):
	'''
	Creates a data frame with the information of the parameter of the models 
	and its accuracy.
	Inputs:
		- criterion_list (list of strings): list of different criteria to be
		  used in the models.
		- splitter_list (list of strings): list of different splitters to be
		  used in the models.
		- max_depth_list (list): list of different max_depth values to be used
		  in the models.
		- x_train (data frame): independent variables training set.
		- x_test (data frame): independent variables testing set.
		- y_train (data frame): dependent variable training set.
		- y_test (data frame): dependent variable testing set.
	Returns a data frame
	'''
	results_list = []
	for criterion in criterion_list:
		for splitter in splitter_list:
			for depth in max_depth_list:
				dec_tree = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=depth)
				dec_tree.fit(x_train, y_train)
				results_list.append([criterion, splitter, depth, accuracy(dec_tree.predict(x_train), y_train), 
					accuracy(dec_tree.predict(x_test), y_test)])
	df = pd.DataFrame(results_list)
	df.columns = ['criterion', 'splitter', 'max_depth', 'accuracy_train', 'accuracy_test']
	return df
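A hedged example call for get_accuracy_table; the parameter lists below are arbitrary illustrations, and pd, DecisionTreeClassifier and accuracy are assumed to be imported as in the other examples:

criteria = ['gini', 'entropy']
splitters = ['best', 'random']
depths = [1, 3, 5, None]

grid_df = get_accuracy_table(criteria, splitters, depths,
                             x_train, x_test, y_train, y_test)
# sort to see which parameter combination generalizes best
print(grid_df.sort_values('accuracy_test', ascending=False).head())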
def train_predict(classifier, sample_size, X_train, X_test, y_train, y_test, typ):

    # inputs:
    #   classifier: the learning algorithm to be trained and predicted on
    #   sample_size: the size of samples (number) to be drawn from the training set
    #   X_train: features training set
    #   y_train: Activity_number_ID training set
    #   X_test: features testing set
    #   y_test: Activity_number_ID testing set
    #   typ: which label set applies (1, 2 or 3), used to pick the matching confusion-matrix labels
    # Empty dictionary will include all dataframes and info related to training and testing.
    results = {}
    
    # Fitting the classifier to the training data using slicing with 'sample_size'
    start= timer() # Get start time
    classifier = classifier.fit(X_train[0:sample_size,:],y_train[0:sample_size])# fitting the classifier
    end = timer() # Get end time
    
    # Calculate the training time
    results['train_time'] = end-start
        
    # Get the predictions on the test set(X_test),
    # then get predictions on the first 3000 training samples(X_train) using .predict()
    start = timer() # Get start time
    predictions_test = classifier.predict(X_test) # predict
    predictions_train =classifier.predict(X_train[:3000,:])
    end = timer() # Get end time
    
    # Calculate the total prediction time
    results['pred_time'] =end-start
            
    # Compute accuracy on the first 3000 training samples, which is y_train[:3000]
    results['acc_train'] = accuracy(y_train[:3000],predictions_train)
        
    # Compute accuracy on test set using accuracy_score()
    results['acc_test'] = accuracy(y_test,predictions_test)
    
    # Adapting the confusion matrix shape to the type of data used
    if typ==1:
        confusion_matrix=cm(y_test, predictions_test, labels=[1,2,3,4,5,6], sample_weight=None) # 
        columns=['WK','WU','WD','SI','ST','LD']
        index=['WK','WU','WD','SI','ST','LD']
    if typ==2:
        confusion_matrix=cm(y_test, predictions_test, labels=[1,2,3,4,5,6,7,8,9,10,11,12], sample_weight=None)
        columns=['WK','WU','WD','SI','ST','LD','St-Si','Si-St','Si-Li','Li-Si','St-Li','Li-St']
        index=  ['WK','WU','WD','SI','ST','LD','St-Si','Si-St','Si-Li','Li-Si','St-Li','Li-St'] 
    if typ==3:   
        confusion_matrix=cm(y_test, predictions_test, labels=[1,2,3,4,5,6,7], sample_weight=None)
        columns=['WK','WU','WD','SI','ST','LD','PT']
        index=['WK','WU','WD','SI','ST','LD','PT']
    
    if sample_size==len(X_train):  # if the full training set was used
        # apply the confusion matrix function to the last contingency table generated
        confusion_matrix_df=(pd.DataFrame(data=confusion_matrix,columns=columns,index=index)).pipe(full_confusion_matrix)
    else:# if not
        # create a dataframe from the contingency table
        confusion_matrix_df=pd.DataFrame(data=confusion_matrix,columns=columns,index=index)
        
    # Return the results
    return (results,confusion_matrix_df)
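A call sketch for this train_predict; it assumes timer is timeit.default_timer (or similar), cm is sklearn's confusion_matrix, accuracy is accuracy_score, and full_confusion_matrix is a project-specific helper not shown here. The label abbreviations appear to be HAR activities (WK walking, WU/WD walking up/downstairs, SI sitting, ST standing, LD laying):

from timeit import default_timer as timer
from sklearn.metrics import accuracy_score as accuracy, confusion_matrix as cm
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)
# sample_size of 3000 keeps us on the plain-DataFrame branch (no full_confusion_matrix call)
results, cm_df = train_predict(clf, 3000, X_train, X_test, y_train, y_test, typ=1)
print(results['train_time'], results['acc_test'])
print(cm_df)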
Example #5
def loop_dt(param_dict, training_predictors, testing_predictors,
                training_outcome, testing_outcome):
    '''
    Loop over series of possible parameters for decision tree classifier to
    train and test models, storing accuracy scores in a data frame

    Inputs:
        param_dict: (dictionary) possible decision tree parameters
        training_predictors: data set of predictor variables for training
        testing_predictors: data set of predictor variables for testing
        training_outcome: outcome variable for training
        testing_outcome: outcome variable for testing

    Outputs:
        accuracy_df: (data frame) model parameters and accuracy scores for
            each iteration of the model

    Attribution: adapted combinations of parameters from Moinuddin Quadri's
    suggestion for looping: https://stackoverflow.com/questions/42627795/i-want-to-loop-through-all-possible-combinations-of-values-of-a-dictionary
    and method for faster population of a data frame row-by-row from ShikharDua:
    https://stackoverflow.com/questions/10715965/add-one-row-in-a-pandas-dataframe
    '''


    rows_list = []
    for params in list(itertools.product(*param_dict.values())):
        dec_tree = DecisionTreeClassifier(criterion = params[0],
                                          max_depth = params[1],
                                          max_features = params[2],
                                          min_samples_split = params[3])
        dec_tree.fit(training_predictors, training_outcome)

        train_pred = dec_tree.predict(training_predictors)
        test_pred = dec_tree.predict(testing_predictors)

        # evaluate accuracy
        train_acc = accuracy(train_pred, training_outcome)
        test_acc = accuracy(test_pred, testing_outcome)

        acc_dict = {}
        acc_dict['criterion'], acc_dict['max_depth'], acc_dict['max_features'], acc_dict['min_samples_split'] = params
        acc_dict['train_acc'] = train_acc
        acc_dict['test_acc'] = test_acc

        rows_list.append(acc_dict)

    accuracy_df = pd.DataFrame(rows_list)

    return accuracy_df
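The param_dict the docstring refers to is expected to hold one list of candidate values per DecisionTreeClassifier argument, in the order criterion, max_depth, max_features, min_samples_split (inferred from how params is unpacked above). A sketch:

param_dict = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, None],
    'max_features': [None, 'sqrt'],
    'min_samples_split': [2, 10],
}

accuracy_df = loop_dt(param_dict, training_predictors, testing_predictors,
                      training_outcome, testing_outcome)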
Example #6
def ensemble(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9)
    clf, finalClassifier = train(X_train, y_train)
    y_test_pred = test(clf, finalClassifier, X_test)
    y_train_pred = test(clf, finalClassifier, X_train)
    comparePrediction(y_test_pred, y_test)
    # comparePrediction(y_train, y_train_pred)

    print(accuracy(y_test, y_test_pred))
    print(accuracy(y_train, y_train_pred))

    return
Example #7
def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''

    results = {}

    print(type(sample_size))

    # TODO: Fit the learner to the training data using slicing with 'sample_size' using .fit(training_features[:], training_labels[:])
    start = time()  # Get start time
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time()  # Get end time

    # TODO: Calculate the training time
    results['train_time'] = end - start

    # TODO: Get the predictions on the test set(X_test),
    #       then get predictions on the first 300 training samples(X_train) using .predict()
    start = time()  # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    end = time()  # Get end time

    # TODO: Calculate the total prediction time
    results['pred_time'] = end - start

    # TODO: Compute accuracy on the first 300 training samples which is y_train[:300]
    results['acc_train'] = accuracy(y_train[:300], predictions_train)

    # TODO: Compute accuracy on test set using accuracy_score()
    results['acc_test'] = accuracy(y_test, predictions_test)

    # TODO: Compute F-score on the the first 300 training samples using fbeta_score()
    results['f_train'] = fbeta_score(y_train[:300],
                                     predictions_train,
                                     beta=0.5)

    # TODO: Compute F-score on the test set which is y_test
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__,
                                             sample_size))

    # Return the results
    return results
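A usage sketch for this train_predict, assuming accuracy is accuracy_score and the train/test split already exists; the sample sizes are illustrative (roughly 1%, 10% and 100% of the training set):

from time import time
from sklearn.metrics import accuracy_score as accuracy, fbeta_score
from sklearn.tree import DecisionTreeClassifier

learner = DecisionTreeClassifier(random_state=0)
for sample_size in (len(y_train) // 100, len(y_train) // 10, len(y_train)):
    results = train_predict(learner, sample_size, X_train, y_train, X_test, y_test)
    print(results)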
Example #8
def build_dec_tree(x_train, x_test, y_train, y_test):
    #Lab3 for reference
    for d in [1, 3, 5, 7]:
        dec_tree = DecisionTreeClassifier(max_depth=d)
        dec_tree.fit(x_train, y_train)
        train_pred = dec_tree.predict(x_train)
        test_pred = dec_tree.predict(x_test)

        train_acc = accuracy(train_pred, y_train)
        test_acc = accuracy(test_pred, y_test)

        print("Depth: {} | Train acc: {:.2f} | Test acc: {:.2f}".format(
            d, train_acc, test_acc))
Example #9
def splitMetrics(clf, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9)

    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    y_train_pred = clf.predict(X_train)

    comparePrediction(y_test, y_test_pred)
    comparePrediction(y_train, y_train_pred)

    print(accuracy(y_test, y_test_pred))
    print(accuracy(y_train, y_train_pred))

    return
Example #10
def tree_classifier(input_data, features_list) :
    """
    Use a decision tree classifier
    """
    global count
    count+=1
    print("=============%d============" % count)
    features_train, features_test, target_train, target_test = prepare_data(input_data, features_list)
    
    clf = DecisionTreeClassifier()
    clf = clf.fit(features_train, target_train)
    pred = clf.predict(features_test)
    accu = accuracy(target_test, pred)
    print("准确率%f" % accu)
    prec = precision_score(target_test, pred)
    rec = recall_score(target_test, pred)
    f1 = f1_score(target_test, pred)
    print("精度%f" % prec)
    print("召回率%f" % rec)
    print("f1 %f" % f1)
    
    importance = clf.feature_importances_
    indices = list(numpy.argsort(importance))
    indices = reversed(indices)
    important_features = []
    for no, index in enumerate(indices):
        if importance[index]>0:
            print("No.%d--属性%s的权重%f" % (no, features_list[index+1], importance[index]))
            important_features.append(features_list[index+1])

    return important_features, clf
def evaluate_classifier(y_test, predicted_scores, model_name,
                        which_temporal_set):
    thresholds = {
        0.01: [],
        0.02: [],
        0.05: [],
        0.10: [],
        0.20: [],
        0.30: [],
        0.50: []
    }
    # threshold = 0.4
    results_df = pd.DataFrame([],
                              columns=('modelthresh', 'which_temporal',
                                       'model', 'threshold', 'accuracy',
                                       'precision', 'recall'))
    for threshold in thresholds.keys():
        calc_threshold = lambda x, y: 0 if x < y else 1
        predicted_test = np.array(
            [calc_threshold(score, threshold) for score in predicted_scores])
        test_acc = accuracy(predicted_test, y_test)
        precision, recall, _ = precision_recall_curve(
            y_test, predicted_test)
        this_result = pd.DataFrame([[
            model_name + str(threshold), which_temporal_set, model_name,
            threshold, test_acc,
            np.mean(precision),
            np.mean(recall)
        ]],
                                   columns=('modelthresh', 'which_temporal',
                                            'model', 'threshold', 'accuracy',
                                            'precision', 'recall'))
        results_df = pd.concat([results_df, this_result], ignore_index=True)

    return results_df
def evaluate_dectree(x_train, y_train, x_test, y_test, thresh=thresh):
    '''
    generates df of predictions, criteria, thresholds, precision, recall, and
    accuracy of decision tree models to help find the best one
    '''
    criterion = ['entropy', 'gini']
    rd = {
        'predicted': [],
        'crit': [],
        'threshold': [],
        'precision': [],
        'recall': [],
        'accuracy': [],
        'class': []
    }

    for c in criterion:
        scores = dectree_classifier(x_train, y_train, x_test, c)
        for t in thresh:
            scores = list(stats.rankdata(scores, 'average') / len(scores))
            preds = [compare_to_threshold(x, t) for x in list(scores)]
            rd['predicted'].append(preds)
            rd['crit'].append(c)
            rd['threshold'].append(t)
            rd['precision'].append(precision(y_test, preds))
            rd['recall'].append(recall(y_test, preds))
            rd['accuracy'].append(accuracy(y_test, preds))
            rd['class'].append('dectree')

    return pd.DataFrame(rd)
def evaluate_rf(x_train,
                y_train,
                x_test,
                y_test,
                thresh=thresh,
                ntrees=[25, 100, 500],
                maxfeats=[1, .5, 4]):
    rd = {
        'predicted': [],
        'ntrees': [],
        'nfeats': [],
        'threshold': [],
        'precision': [],
        'recall': [],
        'accuracy': [],
        'class': []
    }
    for size in ntrees:
        for f in maxfeats:
            scores = random_forest_classifier(size, f, x_train, y_train,
                                              x_test)
            for t in thresh:
                scores = list(stats.rankdata(scores, 'average') / len(scores))
                preds = [compare_to_threshold(x, t) for x in scores]
                rd['predicted'].append(preds)
                rd['ntrees'].append(size)
                rd['nfeats'].append(f)
                rd['threshold'].append(t)
                rd['precision'].append(precision(y_test, preds))
                rd['recall'].append(recall(y_test, preds))
                rd['accuracy'].append(accuracy(y_test, preds))
                rd['class'].append('rf')

    return pd.DataFrame(rd)
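evaluate_dectree and evaluate_rf above (and evaluate_logreg / evaluate_knn further down) rely on a module-level thresh list and a compare_to_threshold helper that are not shown; a minimal sketch of what they are assumed to look like:

# thresholds shared by the evaluate_* functions (illustrative values,
# mirroring the ones used in evaluate_classifier above)
thresh = [0.01, 0.02, 0.05, 0.10, 0.20, 0.30, 0.50]

def compare_to_threshold(score, threshold):
    # same convention as calc_threshold in evaluate_classifier:
    # predict 1 when the percentile-ranked score reaches the threshold
    return 0 if score < threshold else 1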
Example #14
def sklearn_acc(model, test_data, test_target):
    overall_results = model.predict(test_data)
    test_pred = (overall_results > 0.5).astype(int)
    acc_results = [mae(overall_results, test_target), accuracy(test_pred, test_target),
                   f1_score(test_pred, test_target, average='macro')]

    return acc_results
Example #15
    def score(self, pipeline_dic):
        tfidf_vectorizer = TfidfVectorizer(**pipeline_dic['tfidf'])
        keep_tfidf = self.keep_tfidf(pipeline_dic['tfidf'])

        if not keep_tfidf:
            self.update_tfidf(pipeline_dic['tfidf'])

        keep_features = keep_tfidf and self.keep_features(
            pipeline_dic['features'])

        if not keep_features:
            self.update_features(pipeline_dic['features'])

        self.model_builder = self.model_builders[pipeline_dic['model']['type']]
        model_dic = {
            key: value
            for key, value in pipeline_dic['model'].items() if key != 'type'
        }
        self.model = self.model_builder(**model_dic)
        self.model.fit(self.X_train, self.Y_train)
        Y_pred = self.model.predict(self.X_test)
        score = accuracy(Y_pred, self.Y_test)
        print(f"Params = {pipeline_dic}, score = {round(score, 3)}. \n")

        return score
Example #16
def evaluate_logreg(x_train, y_train, x_test, y_test,
                    c_values=[.01,.1,1,10,100], thresh=thresh):
    '''
    generates df of predictions, penalties, c_values, thresholds, precision, recall, and
    accuracy of logistic regression
    '''
    penalties = ['l2']
    rd = {'predicted': [], 'penalty': [], 'C': [], 'threshold': [],
          'precision': [], 'recall': [], 'accuracy':[], 'class': []}
    
    for p in penalties:
        for c in c_values:
            scores = logreg_classifier(x_train, y_train, x_test, c, p)
            for t in thresh:
                scores = list(stats.rankdata(scores, 'average')/len(scores))
                preds = [compare_to_threshold(x, t) for x in scores]
                rd['predicted'].append(preds)
                rd['penalty'].append(p)
                rd['C'].append(c)
                rd['threshold'].append(t)
                rd['precision'].append(precision(y_test, preds))
                rd['recall'].append(recall(y_test, preds))
                rd['accuracy'].append(accuracy(y_test, preds))
                rd['class'].append('logreg')

    return pd.DataFrame(rd)
def train_evaluate():
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)

    x_train = train.drop('y', axis=1).values
    y_train = train.y.values
    x_test = test.drop('y', axis=1).values
    y_test = test.y.values

    classifiers = [
        make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=300)),
        make_pipeline(StandardScaler(), LogisticRegression(C=30,
                                                           max_iter=300)),
        make_pipeline(MinMaxScaler(), SVC(kernel='rbf')),
        make_pipeline(MinMaxScaler(), KNeighborsClassifier()),
        RandomForestClassifier(n_estimators=10000),
        GradientBoostingClassifier(n_estimators=1000),
        make_pipeline(MinMaxScaler(),
                      MLPClassifier(hidden_layer_sizes=(250, 150))),
    ]

    for clf in classifiers:
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        acc = accuracy(y_test, y_pred)
        print("Accuracy: {:.2%} \n\n{}\n\n".format(acc, clf))
Example #18
    def evaluate(y_true, y_pred,estimator):
        '''
        Return the estimator name and a list of metric values;
        you then need to create the data frame yourself with the code below:

        ind, val = evaluate(ytrain, y_pred, lin_r)
        pd.DataFrame([val], index=[ind], columns=['accuracy', 'r2', 'log_loss', 'MSE', 'RMSE'])
        
        or  to compare multiple model
        
        ind, val = [],[]
        for estimator in [lin_r ,las,elas,ridg,ada ,extra ,gra ,rnd ] :  
            estimator.fit(Xtrain,ytrain)
            y_pred = estimator.predict(Xtrain)
            tmp1, tmp2  = evaluate(ytrain, y_pred,estimator)
            ind.append(tmp1)
            val.append(tmp2)
        result1 = pd.DataFrame(np.array(val), index=[ind], columns=['accuracy', 'r2', 'log_loss', 'MSE', 'RMSE'])
        result1.sort_values(by=['RMSE', 'MSE'])
        
        '''
        # Classification and regression metrics
        accuracy = metrics.accuracy_score(y_true, y_pred)
        log_loss=metrics.log_loss(y_true, y_pred) 
        mse=metrics.mean_squared_error(y_true, y_pred) 
        median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
        r2=metrics.r2_score(y_true, y_pred)

        return type(estimator).__name__,[round(accuracy,6),round(r2,6),round(log_loss,6),round(mse,6),round(np.sqrt(mse),6)]
Example #19
def validate(args, model, dataset):
    model.eval()
    loss_fcn = torch.nn.BCELoss()

    data_dataloader = generate_batches(dataset,
                                       args.batch_size,
                                       n_workers=args.num_workers)

    loss_list = []
    pred_list = []
    label_list = []
    with torch.no_grad():
        for batch_data, batch_label in data_dataloader:
            batch_logit = model(batch_data).view(-1)

            loss = loss_fcn(batch_logit, batch_label)

            pred = (batch_logit > 0.5).int()

            pred_list.extend(pred)
            label_list.extend(batch_label)

            loss_list.append(loss.item())

        loss_data = np.array(loss_list).mean()
        acc = accuracy(pred_list, label_list)
        f1 = f1_score(pred_list, label_list, average='macro')

    return loss_data, acc, f1,
def evaluate_knn(x_train,
                 y_train,
                 x_test,
                 y_test,
                 kays=[3, 5, 7, 9, 11],
                 thresh=thresh):
    '''
    generates df of predictions, penalties, k values, thresholds, precision,
    recall, and accuracy to help find best model
    '''
    rd = {
        'predicted': [],
        'k': [],
        'threshold': [],
        'precision': [],
        'recall': [],
        'accuracy': [],
        'class': []
    }
    for k in kays:
        scores = knn_classifier(x_train, y_train, x_test, k)[:, 1]
        for t in thresh:
            scores = list(stats.rankdata(scores, 'average') / len(scores))
            preds = [compare_to_threshold(x, t) for x in scores]
            rd['predicted'].append(preds)
            rd['k'].append(k)
            rd['threshold'].append(t)
            rd['precision'].append(precision(y_test, preds))
            rd['recall'].append(recall(y_test, preds))
            rd['accuracy'].append(accuracy(y_test, preds))
            rd['class'].append('knn')

    return pd.DataFrame(rd)
Example #21
def present_results_simp(y_test, predictions):
    results_list = []
    for k, v in predictions.items():
        inter_list = [
            k,
            accuracy(v, y_test),
            precision(v, y_test),
            precision_top(v, y_test, 0.01),
            precision_top(v, y_test, 0.02),
            precision_top(v, y_test, 0.05),
            precision_top(v, y_test, 0.1),
            precision_top(v, y_test, 0.2),
            precision_top(v, y_test, 0.3),
            precision_top(v, y_test, 0.5),
            recall(v, y_test),
            recall_top(v, y_test, 0.01),
            recall_top(v, y_test, 0.02),
            recall_top(v, y_test, 0.05),
            recall_top(v, y_test, 0.1),
            recall_top(v, y_test, 0.2),
            recall_top(v, y_test, 0.3),
            recall_top(v, y_test, 0.5),
            f1(v, y_test)
        ]
        results_list.append(inter_list)
    df = pd.DataFrame(results_list)
    df.columns = [
        'Model', 'Accuracy', 'Precision', 'Precision top 1%',
        'Precision top 2%', 'Precision top 5%', 'Precision top 10%',
        'Precision top 20%', 'Precision top 30%', 'Precision top 50%',
        'Recall', 'Recall top 1%', 'Recall top 2%', 'Recall top 5%',
        'Recall top 10%', 'Recall top 20%', 'Recall top 30%', 'Recall top 50%',
        'F 1'
    ]
    return df
Example #22
def get_metrics(prediction, y_test):
    '''
	Computes accuracy, precision, recall, ROC-AUC and F1 metrics for the
	predictions produced by an ML model against the actual values of the
	dependent variable.
	Inputs:
		- prediction: an array with predictions.
		- y_test: an array with actual values.
	Returns a dictionary with the metrics of the ML model.
	'''
    Accuracy = accuracy(prediction, y_test)
    Precision = precision(prediction, y_test)
    Recall = recall(prediction, y_test)
    try:
        AUC = roc_auc(prediction, y_test)
    except ValueError:
        AUC = 0
    F1 = f1(prediction, y_test)

    metrics_dict = {
        'Accuracy': Accuracy,
        'Precision': Precision,
        'Recall': Recall,
        'AUC': AUC,
        'F1': F1
    }
    return metrics_dict
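A short usage sketch, assuming accuracy, precision, recall, roc_auc and f1 are the sklearn metrics aliased as in the surrounding examples, and that model is an already fitted classifier:

from sklearn.metrics import (accuracy_score as accuracy, f1_score as f1,
                             precision_score as precision, recall_score as recall,
                             roc_auc_score as roc_auc)

prediction = model.predict(x_test)
metrics_dict = get_metrics(prediction, y_test)
print(metrics_dict)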
Example #23
def lookup_best_c(x_train, y_train, x_test, y_test):
    accuracy_results = {
    }  # empty dictionary that will map each C value to its accuracy

    for value in C_values:  # iterate through each C value
        #tuned model 1 best parameters + C variable
        #tmp_model=LR(solver='lbfgs',class_weight= None,multi_class= 'ovr',
        #          dual=False, penalty= 'l2',random_state=337,C=value)
        tmp_model = LR(solver='lbfgs',
                       class_weight=None,
                       multi_class='ovr',
                       dual=False,
                       penalty='l2',
                       random_state=337,
                       C=value,
                       max_iter=10000)
        # train the model
        tmp_model.fit(x_train, y_train)

        # predicting activity labels
        tmp_predictions = tmp_model.predict(x_test)
        # accuracy score
        tmp_accuracy = accuracy(tmp_predictions, y_test)
        # store the accuracy for this C value in the dictionary
        accuracy_results[value] = tmp_accuracy

    # after iterating through all C values
    return accuracy_results  # return results
Example #24
def log_metrics(logger, phase, epoch_num, y_hat, y):
    th = 0.5
    accuracy = metrics.accuracy(y_hat, y, th, True)
    f1_score = metrics.f1score(y_hat, y, th, True)
    specificity = metrics.specificity(y_hat, y, th, True)
    sensitivity = metrics.sensitivity(y_hat, y, th, True)
    roc_auc = metrics.roc_auc(y_hat, y)

    classes = [
        'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid',
        'subdural', 'any'
    ]
    for acc, f1, spec, sens, roc, class_name in zip(accuracy, f1_score,
                                                    specificity, sensitivity,
                                                    roc_auc, classes):
        logger.add_scalar(f'{phase}_acc_{class_name}', acc, epoch_num)
        logger.add_scalar(f'{phase}_f1_{class_name}', f1, epoch_num)
        logger.add_scalar(f'{phase}_spec_{class_name}', spec, epoch_num)
        logger.add_scalar(f'{phase}_sens_{class_name}', sens, epoch_num)
        logger.add_scalar(f'{phase}_roc_{class_name}', roc, epoch_num)

    for i, class_name in enumerate(classes):
        logger.add_scalar(f'{phase}_bce_{class_name}',
                          sklearn.metrics.log_loss(y[:, i], y_hat[:, i]),
                          epoch_num)
Example #25
def classify(filePath, name):
    labels = {1: [1], 9: [-1]}
    global returnVect
    returnVect = img2vector(filePath)
    y_pred = clf.predict(returnVect)
    pred = 1
    if y_pred == [-1]:
        pred = 9
    return pred, accuracy(labels[int(name.split('_')[0])], y_pred)
def print_prediction_metrics(clf, x, y, k):
    pred = cross_val_predict(clf,
                             x,
                             y,
                             cv=StratifiedKFold(n_splits=k, shuffle=True))
    print("Accuracy: ", round(accuracy(y, pred), 2))
    print("Precision on spam: ", round(precision(y, pred, average=None)[1], 3))
    print("Recall on spam: ", round(recall(y, pred, average=None)[1], 3))
    return
Example #27
def get_error(hidden_layer_sizes, X_train, y_train, X_test, y_test):
    clf = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        activation='logistic',
        random_state=RANDOM_SEED
    )
    clf.fit(X_train, y_train.ravel())
    error = 1 - accuracy(y_test, clf.predict(X_test))
    return error
Example #28
 def update_metrics(gt, pre, f1_m, p_m, r_m, acc_m):
     f1_value = f1(gt, pre, average="micro")
     f1_m.update(f1_value)
     p_value = precision(gt, pre, average="micro", zero_division=0)
     p_m.update(p_value)
     r_value = recall(gt, pre, average="micro")
     r_m.update(r_value)
     acc_value = accuracy(gt, pre)
     acc_m.update(acc_value)
Example #29
def train_and_evaluate(y_train, x_train, y_val, x_val, alg):
	alg.fit(x_train, y_train)

	p = alg.predict_proba(x_val)
	p_bin = alg.predict(x_val)

	acc = accuracy(y_val, p_bin)
	auc = AUC(y_val, p[:,1])
	
	return (auc, acc)
Example #31
def compute_accuracy(dec_tree, x_data, y_data, threshold):
    ''' Takes: decision tree classifier object, feature and target data, and
                prediction probability threshold
        Returns: accuracy of predictions of tree on x for y
    '''

    pred_scores = dec_tree.predict_proba(x_data)[:,1]
    calc_threshold = lambda x,y: 0 if x < y else 1 
    predicted_test = np.array( [calc_threshold(score, threshold) for score in pred_scores] )
    return accuracy(predicted_test, y_data)
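Example call for the helper above, assuming a fitted DecisionTreeClassifier and an illustrative probability cutoff of 0.5:

import numpy as np
from sklearn.metrics import accuracy_score as accuracy
from sklearn.tree import DecisionTreeClassifier

dec_tree = DecisionTreeClassifier(max_depth=5).fit(x_train, y_train)
test_acc = compute_accuracy(dec_tree, x_test, y_test, threshold=0.5)
print("Test accuracy at 0.5 cutoff: {:.3f}".format(test_acc))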
Example #32
def train_and_evaluate( y_train, x_train, y_val, x_val ):

	lr = LR()
	lr.fit( x_train, y_train )

	p = lr.predict_proba( x_val )
	p_bin = lr.predict( x_val )

	acc = accuracy( y_val, p_bin )
	auc = AUC( y_val, p[:,1] )
	
	return ( auc, acc )
Example #33
def train_and_eval_sklearn_classifier( clf, data ):
	
	x_train = data['x_train']
	y_train = data['y_train']
	
	x_test = data['x_test']
	y_test = data['y_test']	
	
	clf.fit( x_train, y_train )	
	
	try:
		p = clf.predict_proba( x_train )[:,1]	# sklearn convention
	except IndexError:
		p = clf.predict_proba( x_train )

	ll = log_loss( y_train, p )
	auc = AUC( y_train, p )
	acc = accuracy( y_train, np.round( p ))

	print "\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format( ll, auc, acc )

	#

	try:
		p = clf.predict_proba( x_test )[:,1]	# sklearn convention
	except IndexError:
		p = clf.predict_proba( x_test )

	ll = log_loss( y_test, p )
	auc = AUC( y_test, p )
	acc = accuracy( y_test, np.round( p ))

	print "# testing  | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format( ll, auc, acc )	
	
	#return { 'loss': 1 - auc, 'log_loss': ll, 'auc': auc }
	return { 'loss': ll, 'log_loss': ll, 'auc': auc }
Example #34
def train_and_evaluate( y_train, x_train, y_val, x_val, clf ):
    if clf == 'LR':
        lr = LR()
        lr.fit( x_train, y_train )

        p = lr.predict_proba( x_val )
        p_bin = lr.predict( x_val )

    elif clf == 'RF':
        n_trees = 200
        rf = RF( n_estimators = n_trees, verbose = True, n_jobs=4 )
        rf.fit( x_train, y_train )
        
        p = rf.predict_proba( x_val )
        p_bin = rf.predict( x_val )   

    acc = accuracy( y_val, p_bin )
    auc = AUC( y_val, p[:,1] )
    
    return ( auc, acc )
Example #35
### Fit, extrapolate, measure error

from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import accuracy_score as accuracy

for clf in clfs:

    # Fit
    start = clock()
    clf.fit(train_data, train_target)
    print("Fitted in {:.0f} seconds.".format(clock() - start))

    # Extrapolate
    start = clock()
    predict = clf.predict_proba(test_data)
    predict_bin = clf.predict(test_data)
    print("Extrapolated in {:.0f} seconds.".format(clock() - start))
    
    # Compute ROC AUC and accuracy
    acc = accuracy(test_target.values, predict_bin)
    auc = AUC(test_target.values, predict[:,1])
    print "AUC: {:.2%}. Accuracy: {:.2%}.".format(auc, acc)

"""
Results

RF(n_estimators = 10, verbose = True)
Fitted in 3 seconds. Extrapolated in 0 seconds.
AUC: 50.67%. Accuracy: 49.67%.
"""
Example #36
def AccuracyErrorCalc( y, p ):
    return 1 - accuracy(y, p)
Example #37
# train.to_csv( 'data/train_v.csv', index = False )
# val.to_csv( 'data/test_v.csv', index = None )

# encode the categorical variable as one-hot, drop the original column afterwards

train_dummies = pd.get_dummies( train.c1 )
train_num = pd.concat(( train.drop( 'c1', axis = 1 ), train_dummies.astype( int )), axis = 1 )
# train_num.to_csv( 'data/train_v_num.csv', index = False )

val_dummies = pd.get_dummies( val.c1 )
val_num = pd.concat(( val.drop( 'c1', axis = 1 ), val_dummies.astype(int) ), axis = 1 )
# val_num.to_csv( 'data/test_v_num.csv', index = False )

# train, predict, evaluate

n_trees = 100

rf = RF( n_estimators = n_trees, verbose = True )
rf.fit( train_num.drop( 'target', axis = 1 ), train_num.target )

p = rf.predict_proba( val_num.drop( 'target', axis = 1 ))
p_bin = rf.predict( val_num.drop( 'target', axis = 1 ))

acc = accuracy( val_num.target.values, p_bin )
auc = AUC( val_num.target.values, p[:,1] )
print "AUC: {:.2%}, accuracy: {:.2%}".format( auc, acc )
	
# AUC: 51.40%, accuracy: 51.14%	/ 100 trees
# AUC: 52.16%, accuracy: 51.62%	/ 1000 trees
	
Example #38
def run(X=None, y=None, X_submission=None, y_submission_val=None, pred_type='prediction',
        train_file='', test_file='', output_file='', stacked_level='stage1',
        creating_next_data=False, clfs=[], n_folds=5):  # pred_type: 'prediction' or 'validation'
    #train_file = 'numerai_datasets/numerai_training_data.csv'
    #test_file = 'numerai_datasets/numerai_tournament_data.csv'
    #output_file = 'prediction/predictions_lr.csv'
    # x_train is used here only for its column names
    global x_train, test_num, auc_stage

    ################## Stacking ###############
    # at this stage, 'prediction' only needs X, y and X_submission;
    # 'validation' also needs y_submission_val

    np.random.seed(0) # seed to shuffle the train set

    n_folds = n_folds
    verbose = True
    shuffle = True 
    
    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]
        #validation_flag = validation_flag[idx]


    skf = list(StratifiedKFold(y, n_folds))

    print "Creating train and test sets for blending."
    #print "\nLevel 0"

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))
    
    stacked_data_columns = x_train.columns.tolist()
    for j, clf in enumerate(clfs):
        print j, clf
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        acc = []
        auc = []
        for i, (train, test) in enumerate(skf):# # of n_fold
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            if str(clf).split("(")[0] in ['XGBClassifier']:
                #evallist  = [(X_test,'eval'), (X_train,'train')]
                clf.fit(X_train, y_train, eval_metric='logloss',eval_set=[(X_train, y_train),(X_test, y_test)])
            elif str(clf).split("(")[0] in ['Classifier']:
                #evallist  = [(X_test,'eval'), (X_train,'train')]
                mu = X_train.mean(0)
                stddev =  X_train.std(0)
                X_train = (X_train-mu) / stddev
                X_test = (X_test-mu) / stddev

                clf.fit(X_train, y_train)

            elif 'Keras' in str(clf).split("(")[0]:
                #evallist  = [(X_test,'eval'), (X_train,'train')]

                clf.fit(X_train, y_train,validation_data=[X_test,y_test])


            else:
                clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:,1]
            dataset_blend_train[test, j] = y_submission
            acc.append(accuracy( y_test, y_submission.round() ))
            auc.append(AUC( y_test, y_submission ))             
            #if using the mean of the prediction of each n_fold
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:,1]
        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)
        #if using the prediction of all train data
        #clf.fit(X, y)
        #dataset_blend_test[:,j] = clf.predict_proba(X_submission)[:,1] 
        print "clf: {}\n".format(clf)
        print 
        print "logloss: {:.4}+{:.2}, accuracy: {:.4}+{:.2} \n".format( np.mean(auc),np.std(auc), np.mean(acc), np.std(acc) )
        auc_stage[stacked_level].append(np.mean(auc))
        print
        if pred_type == 'prediction':
            print "saving individual model"
            indi_filename = output_file + '_{}{}_{}.csv'.format(str(clf).split("(")[0], j+1,stacked_level )
            test_num['probability'] = dataset_blend_test[:,j]
            test_num.to_csv( indi_filename, columns = ( 't_id', 'probability' ), index = None ) 
            stacked_data_columns.append('{}{}_{}.csv'.format(str(clf).split("(")[0], j+1,stacked_level))
    
    #
    auc_stage[stacked_level] = np.mean(auc_stage[stacked_level])

    # append the original features to the stacked predictions
    dataset_blend_train = np.concatenate((X,dataset_blend_train),axis=1)
    dataset_blend_test = np.concatenate((X_submission,dataset_blend_test),axis=1)

    # saving the stacked data for next stack level
    if pred_type == 'prediction' and creating_next_data == True:
        next_dataset_blend_train = pd.DataFrame(dataset_blend_train,columns=stacked_data_columns)
        next_dataset_blend_train['target'] = y
        #next_dataset_blend_train['validation'] = validation_flag
        next_dataset_blend_test = pd.DataFrame(dataset_blend_test,columns=stacked_data_columns)
        next_dataset_blend_test = pd.concat([test_num['t_id'],next_dataset_blend_test],axis=1)

        next_dataset_blend_train.to_csv('stacked_datasets/stacking_train_{}.csv'.format(stacked_level),index=None)
        next_dataset_blend_test.to_csv('stacked_datasets/stacking_test_{}.csv'.format(stacked_level),index=None)

    #print "\nLevel 1"
    print "Blending."
    clf = LR()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:,1]

    #print "Linear stretch of predictions to [0,1]"
    #y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
    if pred_type != 'prediction':
        print "final logloss in validation set"
        print AUC( y_submission_val, y_submission )
    else:
        print "saving..."

        test_num['probability'] = y_submission
        output_file = output_file + '_' + stacked_level + '.csv'
        test_num.to_csv( output_file, columns = ( 't_id', 'probability' ), index = None )
y_file = sys.argv[1]
p_file = sys.argv[2]

print "loading p..."

p = np.loadtxt( p_file )

y_predicted = np.ones(( p.shape[0] ))
y_predicted[p < 0] = -1

print "loading y..."

y = np.loadtxt( y_file, usecols= [0] )

print "accuracy:", accuracy( y, y_predicted )
print "precision:", precision( y, y_predicted, average='binary' )
print "recall:", recall( y, y_predicted, average='binary' )
print "AUC:", AUC( y, p )

print
print "confusion matrix:"
print confusion_matrix( y, y_predicted )


"""
run score.py data/test_v.txt vw/p_v_logistic.txt

accuracy: 0.994675826535

confusion matrix:
Example #40
import sys
from time import time

sys.path.append("../tools/")
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()




#########################################################
### your code goes here ###
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score as accuracy

clf = GaussianNB()
t0 = time()
clf.fit(features_train, labels_train)
print "training time:",round(time()-t0,3),"s"
t0 = time()
pred = clf.predict(features_test)
print "prediction time:", round(time()-t0,3),"s"
acc = accuracy(labels_test, pred)
print("Classifier accuracy: ", acc)

#########################################################