Example #1
def modelfit(alg, dtrain, predictors, dtest=None, dscore=None, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['target'].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics=['logloss'], early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['target'], eval_metric='logloss')

    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    if isinstance(dtest, pd.DataFrame):
        dtest_predprob = alg.predict_proba(dtest[predictors])[:,1]
    if isinstance(dscore, pd.DataFrame):
        dscore_predprob = alg.predict_proba(dscore[predictors])[:,1]
        np.savetxt('XGBoost_pred_raw.csv', dscore_predprob, delimiter=",")

    #Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['target'].values, dtrain_predictions)
    print "Metric Score (Train): %f" % metrics.log_loss(dtrain['target'], dtrain_predprob)
    if isinstance(dtest, pd.DataFrame):
        print "Metric Score (Test): %f" % metrics.log_loss(dtest['target'], dtest_predprob)

    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
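A minimal driver for modelfit, assuming the imports the snippet leans on (it references xgb, pd, np, metrics and plt, and an older xgboost whose cv still accepts show_progress); the file name and hyper-parameters are illustrative, not from the original:

import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn import metrics
from xgboost.sklearn import XGBClassifier

train = pd.read_csv('train.csv')  # hypothetical training file with a binary 'target' column
predictors = [c for c in train.columns if c != 'target']
alg = XGBClassifier(n_estimators=1000, learning_rate=0.1, objective='binary:logistic')
modelfit(alg, train, predictors)  # CV picks n_estimators, then the model is fit and reported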
Example #2
def Test():
    """Testing ConstrainedMultinomialRegression
    
    Compare the results with scikit-learn LogisticRegression v.15
    
    Returns
    -------
    Log Loss for Logistic Regression, ConstrainedMultinomialRegression
    Accuracy for Logistic Regression, ConstrainedMultinomialRegression
    """
    n = 1000; p = 10; k = 3
    X = np.random.randn(n, p)
    beta = np.random.binomial(1, .5, (p, k))
    log_odd = X.dot(beta)
    prob = np.exp(log_odd)/(1 + np.exp(log_odd))
    y = np.array([np.argmax(i) for i in prob])
    lb = LabelBinarizer()
    Y = lb.fit_transform(y)
    w = randn(k,p)
    cut = n/2
    train = np.arange(cut); valid = np.arange(cut,n) # Split Train and Test
    b = [(0,None)]*(p+1)*k # Constraint on Beta
    cl1 = LogisticRegression()
    cl2 = ConstrainedMultinomialClassifier(bounds = b)
    cl1.fit(X[train], y[train])
    cl2.fit(X[train], y[train])
    prob1 = cl1.predict_proba(X[valid])
    prob2 = cl2.predict_proba(X[valid])
    print log_loss(y[valid], prob1)
    print log_loss(y[valid], prob2)
    yhat1 = cl1.predict(X[valid])
    yhat2 = cl2.predict(X[valid])
    print accuracy_score(y[valid], yhat1)
    print accuracy_score(y[valid], yhat2)
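A side note on the label construction above: the per-row argmax loop is equivalent to a single vectorized call on the probability matrix:

y = prob.argmax(axis=1)  # same result as np.array([np.argmax(i) for i in prob])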
Example #3
def generic_cv_reg(X,y,model,n_folds,random_state) :
    kf = cross_validation.KFold(y.shape[0],n_folds=n_folds, shuffle=True, random_state=random_state)
    trscores, cvscores, times = [], [], []
    i = 0
    stack_train = np.zeros((len(y))) # stacked predictions
    
    threshold = 0.000001
    
    for i, (train_fold, validate) in enumerate(kf) :
        i = i + 1
        t = time()
        trscore = log_loss(y.iloc[train_fold], model.fit(X.iloc[train_fold], y.iloc[train_fold]).predict(X.iloc[train_fold]))
        
        validation_prediction = model.predict(X.iloc[validate])
        
        validation_prediction[validation_prediction>1-threshold] = 1-threshold
        validation_prediction[validation_prediction<threshold] = threshold
        
        cvscore = log_loss(y.iloc[validate], validation_prediction)
        trscores.append(trscore); cvscores.append(cvscore); times.append(time()-t)
        
        stack_train[validate] = validation_prediction
    
    print("TRAIN %.5f | TEST %.5f | TIME %.2fm (1-fold)" % (np.mean(trscores), np.mean(cvscores), np.mean(times)/60))
    print(model.get_params(deep = True))
    print("\n")
    
    return np.mean(cvscores), stack_train
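The two threshold assignments above clamp predictions away from exactly 0 and 1 so that log_loss never evaluates log(0); np.clip expresses the same guard in one call:

validation_prediction = np.clip(validation_prediction, threshold, 1 - threshold)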
Example #4
def check_lambda(dirnm, datanm_train, datanm_valid, datanm_orig_train, datanm_orig_valid, samples_per_class, Cs, num_classes):
    spct = 10*70
    tdata, tlabels = load_full(dirnm+datanm_train, spct)
    print tdata.shape, tlabels.shape

    spct = 10
    otdata, otlabels = load_full(dirnm+datanm_orig_train, spct)

    spct = 10*30
    vdata, vlabels = load_full(dirnm+datanm_valid, spct)

    spct = 10
    ovdata, ovlabels = load_full(dirnm+datanm_orig_valid, spct)

    # artif
    ans = np.zeros((len(Cs), 4))

    for i, C in enumerate(Cs):
        clf = LogisticRegression(C  =C,     penalty='l2', multi_class = 'ovr',
                                 tol=0.001, n_jobs = -1, verbose = 0, solver = 'newton-cg')
        clf.fit(tdata, tlabels)

        out_train = clf.predict_proba(tdata)
        out_valid = clf.predict_proba(vdata)
        out_train_real = clf.predict_proba(otdata)
        out_valid_real = clf.predict_proba(ovdata)

        ans[i, 0] += log_loss(tlabels, out_train)
        ans[i, 1] += log_loss(vlabels, out_valid)
        ans[i, 2] += log_loss(otlabels, out_train_real)
        ans[i, 3] += log_loss(ovlabels, out_valid_real)

    np.savez("logreg_lambda", ans= ans, Cs = Cs, num_classes = num_classes, samples_per_class = samples_per_class)
    return ans
Example #5
def simple_model(data, test):
    targets = data.target
    X, tX, y, ty = train_test_split(data.drop("target", axis=1), 
                                              targets, 
                                              test_size=0.2,
                                              random_state=2016)
                                              
    
    predictions = []
    
    print("\n\nTraining")
    # Sklearn GBM
    clf = GradientBoostingClassifier(n_estimators=2500, 
                                     learning_rate=0.026, 
                                     max_depth=2,
                                     random_state=2015)
                                     
    cal = CalibratedClassifierCV(clf, cv=5, method="isotonic")
    cal.fit(X,y)
    
    pred = cal.predict_proba(tX)[:,1]
    print("\n\tValidation for Calibrated GBM")
    print("\t", log_loss(ty, pred))
    print("\t", roc_auc_score(ty, pred))
    
    # ens["gbm"] = pred
    predictions.append(cal.predict_proba(test)[:,1])
    
    # XGBoost
    data = X.values
    label = y.values
    dtrain = xgb.DMatrix(data, label=label)
    
    datat = tX.values
    dtest = xgb.DMatrix(datat)
    
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.1
    param['max_depth'] = 8
    param['eval_metric'] = 'auc'
    param['silent'] = 1
    param['min_child_weight'] = 2
    param['subsample'] = 0.5
    param['colsample_bytree'] = 0.5
    param['nthread'] = 4
    num_round = 50
    
    bst = xgb.train(param, dtrain, num_round)
    pred = bst.predict(dtest)
    print("\n\tValidation for XGBoost")
    print("\t", log_loss(ty, pred))
    print("\t", roc_auc_score(ty, pred))
    
    # ens["xgb"] = pred
    predictions.append(bst.predict(xgb.DMatrix(test.values)))  # XGBoost predictions on the submission set
    
    predictions = sum(predictions)/len(predictions)
    
    return predictions
Example #6
def generic_cv_np(X,y,model,n_folds,random_state) :
    kf = cross_validation.KFold(y.shape[0],n_folds=n_folds, shuffle=True, random_state=random_state)
    trscores, cvscores, times = [], [], []
    i = 0
    stack_train = np.zeros((len(y))) # stacked predictions
    for i, (train_fold, validate) in enumerate(kf) :
        i = i + 1
        t = time()
        
        model.fit(X[train_fold,], y[train_fold])
        
        trscore = log_loss(y[train_fold], model.predict_proba(X[train_fold,]))
        
        validation_prediction = model.predict_proba(X[validate,])[:, 1]  # positive-class column, so it fits the 1-D stack_train
        
        cvscore = log_loss(y[validate], validation_prediction)
        trscores.append(trscore); cvscores.append(cvscore); times.append(time()-t)
        
        stack_train[validate] = validation_prediction
    
    print("TRAIN %.5f | TEST %.5f | TIME %.2fm (1-fold)" % (np.mean(trscores), np.mean(cvscores), np.mean(times)/60))
    print(model.get_params())
    print("\n")
    
    return np.mean(cvscores), stack_train
Example #7
def svm_grid_search():

	#get data
	training_input,training_target,validation_input,validation_target = prepare_input()

	#set up scorer for grid search. log-loss is error, not score, so set greater_is_better to false,
	#and log-loss requires a probability
	log_loss_scorer = make_scorer(log_loss,greater_is_better=False,needs_proba=True)

	training_input = training_input[:100000]
	training_target = training_target[:100000]

	print training_input.shape[0]
	print training_target.shape[0]

	start = time.time()
	svm = SVC(random_state=31,probability=True)
	
	
	svm_parameters = {'C':[.001,.01,.1,1,10,100],'kernel':["rbf","sigmoid"]}
	svm_grid_obj = GridSearchCV(svm,svm_parameters,log_loss_scorer,verbose=2,n_jobs=-1)
	svm_grid_obj = svm_grid_obj.fit(training_input,training_target)
	svm = svm_grid_obj.best_estimator_
	print "Best params: " + str(svm_grid_obj.best_params_)	
	svm_train_error = log_loss(training_target,svm.predict_proba(training_input))
	svm_validation_error = log_loss(validation_target,svm.predict_proba(validation_input))
	print "Best SVM training error: {:02.4f}".format(svm_train_error)
	print "Best SVM validation error: {:02.4f}".format(svm_validation_error)
	end = time.time()
	print "RF grid search took {:02.4f} seconds".format(end-start)

	return svm
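Since scikit-learn 0.18 this negated metric also ships as the built-in 'neg_log_loss' scorer, so the make_scorer call above can be replaced with a scoring string; a sketch against the same estimator and grid:

svm_grid_obj = GridSearchCV(svm, svm_parameters, scoring='neg_log_loss', verbose=2, n_jobs=-1)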
Example #8
def check_vb(dirnm, datanm_train, datanm_valid, C, num_classes):
    spct = 10*70
    tdata, tlabels = load_full(dirnm+datanm_train, spct)
    #print tdata.shape, tlabels.shape

    spct = 10*30
    vdata, vlabels = load_full(dirnm+datanm_valid, spct)

    h = np.arange(0, 310, 10)
    h[0] +=1
    # artif
    ans = np.zeros((h.size, 2))

    tind = kget(tlabels, num_classes, h[-1])
    vind = kget(vlabels, num_classes, h[-1])

    for l in xrange(0, h.size):

        clf = LogisticRegression(C  =C,     penalty='l2', multi_class = 'ovr',
                                 tol=0.001, n_jobs = -1, verbose = 0, solver = 'newton-cg')
        clf.fit(tdata[tind[:h[l]*num_classes]], tlabels[tind[:h[l]*num_classes]])

        out_train = clf.predict_proba(tdata[tind[:h[l]*num_classes]])
        out_valid = clf.predict_proba(vdata[vind[:h[l]*num_classes]])

        ans[l, 0] += log_loss(tlabels[tind[:h[l]*num_classes]], out_train)
        ans[l, 1] += log_loss(vlabels[vind[:h[l]*num_classes]], out_valid)

    np.savez("logreg_bv", ans= ans, C = C, num_classes = num_classes)
    return ans
Example #9
def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)
        z[cname2] /= N_splits

    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
Example #10
def xgb2(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 5
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
            max_depth = 3,
            learning_rate = 0.01,
            subsample = 0.7,
            #colsample_bytree = 0.8,
            objective = 'binary:logistic',
            eval_metric = 'logloss',
            seed = 1,
            silent = 1
        )
    dtest = xgb.DMatrix(test2)
    for n, (itrain, ival) in enumerate(skf.split(train2, y)):
        print('step %d of %d'%(n+1, skf.n_splits), now())
        dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
        dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
        watch = [(dtrain, 'train'), (dvalid, 'valid')]
        clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=1000)

        p = clf.predict(dvalid)
        v.loc[ival, cname] += pconvert(p)
        score = metrics.log_loss(y[ival], p)
        z[cname]  += pconvert(clf.predict(dtest))
        print(cname, 'seed %d step %d: '%(xgb_params['seed'], n+1), score, now())
        scores.append(score)

    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits
Example #11
def rf_fit():

	train_inp,valid_inp,train_target,valid_target = prepare_input()

	rf = RandomForestClassifier(random_state=31,n_jobs=-1,verbose=1,n_estimators=100,min_samples_split=5)
	start = time.time()

	rf.fit(train_inp,train_target)

	end = time.time()
	print "fitting took {:0.4} seconds".format(end-start)

	training_output = rf.predict_proba(train_inp)
	validation_output = rf.predict_proba(valid_inp)

	training_error = log_loss(train_target,training_output)
	validation_error = log_loss(valid_target,validation_output)

	print "Train error: {:02.4f}".format(training_error)
	print "Validation error: {:02.4f}".format(validation_error)


	joblib.dump(rf,rf_filename)


	return rf
Example #12
def cross_valid(X, y, params, iterations, n_folds=6, silent=True):
    print 'Running cross validation'
    pprint.pprint(params)
    print 'Iterations:', iterations
    print 'X shape', X.shape

    y_size = len(y)
    if hasattr(X, 'values'):
        X = X.values
    y = np.array(y)

    kf = cross_validation.KFold(y_size, n_folds=n_folds, shuffle=True,
                                random_state=params['seed'])
    y_pred = np.zeros((y_size, 9))

    logs = []
    for train, test in kf:
        X_train, X_test = X[train, :], X[test, :]
        y_train, y_test = y[train], y[test]

        predictions = predict(X_train, y_train, X_test, params, iterations,
                              None if silent else y_test)
        y_pred[test] = predictions

        logs.append(metrics.log_loss(y_test, predictions))
        print 'Current log_loss:', logs[-1]

    print 'Final log_loss: %s (avg: %s, stddev: %s)' % (
                                                metrics.log_loss(y, y_pred),
                                                np.mean(logs),
                                                np.std(logs))
Example #13
def xgboostcv(max_depth,
              eta,
              num_rounds,
              gamma,
              min_child_weight,
              max_delta_step,
              subsample,
              colsample_bytree,
              silent=True,
              seed=1234):
    
    print ('\nRunning XGBOOST on the cluster')
    
    # Call xgboost in distributed mode (CLI input for params)
    xgb_run = ['max_depth=%s' % int(max_depth),
               'eta=%s' % eta,
               'silent=%s' % silent,
               'gamma=%s' % gamma,
               'min_child_weight=%s' % int(min_child_weight),
               'max_delta_step=%s' % max_delta_step,
               'subsample=%s' % subsample,
               'eval_metric=logloss',
               'colsample_bytree=%s' % colsample_bytree,
               'seed=%s' % seed,
               'objective=binary:logistic',
               'eval[eval_set]=%s' % deval,
               'eval[train_set]=%s' % dtrain,
               'num_round=%s' % int(num_rounds),
               'data=%s' % dtrain,
               'model_out=%s' % model_ouput]
    argv = ['wormhole/repo/dmlc-core/tracker/dmlc_yarn.py', # Where your instance is found!!
            '-n',
            '16',
            'wormhole/bin/xgboost.dmlc', # Where your instance is found!!
            './examples/xgboost-avazu.txt'] + xgb_run
    print(' '.join(argv))
    # Cluster specific ENV VARS.
    Popen(argv,
          env = {'JAVA_HOME': '/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.45-28.b13.el6_6.x86_64/',
                 'HADOOP_HOME': '/usr/',
                 'HADOOP_HDFS_HOME': '/usr/lib/hadoop-hdfs/',
                 'PATH': os.getenv('PATH')}).communicate()

    # Export model to local filesystem
    try:
      os.remove("avazu.model")
    except OSError:
      pass
    Popen(["hadoop","fs","-copyToLocal","/tmp/avazu.model", "."]).communicate()
    # Delete stored model.
    Popen(["hadoop","fs","-rm","/tmp/avazu.model", "."]).communicate()
    
    # Load Model file
    bst = xgb.Booster(model_file='avazu.model')
    y_pred = bst.predict(dtest)
    y_valid = dtest.get_label()
    print('logloss = ', log_loss(y_valid, y_pred))
    # We are maximizing the function.
    return -log_loss(y_valid, y_pred)
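Returning the negated log loss makes xgboostcv a maximization target, which is what Bayesian-optimization drivers expect; a sketch with the bayes_opt package, assuming the globals the function reads (dtrain, deval, dtest, model_ouput) are set up, with purely illustrative bounds:

from bayes_opt import BayesianOptimization

bo = BayesianOptimization(xgboostcv, {'max_depth': (4, 12),
                                      'eta': (0.01, 0.3),
                                      'num_rounds': (50, 500),
                                      'gamma': (0, 1),
                                      'min_child_weight': (1, 10),
                                      'max_delta_step': (0, 5),
                                      'subsample': (0.5, 1),
                                      'colsample_bytree': (0.5, 1)})
bo.maximize(init_points=5, n_iter=25)  # each evaluation launches one distributed training run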
Example #14
def plot_score(test_predictions, y_test, train_predictions, y_train, color, learning_rate):
    test_loss = [log_loss(y_test, pred) for pred in test_predictions]
    train_loss = [log_loss(y_train, pred) for pred in train_predictions]

    plt.plot(test_loss, color, linewidth=2)
    plt.plot(train_loss, color+'--', linewidth=2)
    looses[learning_rate] = test_loss
Example #15
def svm_model(train_data_features, train_data_cross_validation_classwise_features, test_data_features, labels, labels_cross_validation_classwise, using_cross_validation2, kf, settings):
    if using_cross_validation2:
        C_base = 4.5
        C_step = 0.5#0.005
        C = C_base
        _results = []
        if(len(train_data_cross_validation_classwise_features) > 0):
            """train_all = np.append(train_data_features, train_data_cross_validation_classwise_features, axis=0)
            labels_all = np.append(labels, labels_cross_validation_classwise)
            kf_all = KFold(len(train_all)-1, n_folds=int(settings['Data']['CrossValidation2']), shuffle=True)
            for train, test in kf_all:
                svc = SVC(kernel="linear", C=C, probability=True)
                model = svc.fit(train_all[train], labels_all[train])
                predicted_classes = model.predict(train_all[test])
                predicted_classes_train = model.predict(train_all[train])
                class_probabilities = model.predict_proba(train_all[test])
                print("C: ",C," n points:", len(predicted_classes), " percentage: ",(labels_all[test] != predicted_classes).sum()*100/len(predicted_classes),"% percentage_train: ", (labels_all[train] != predicted_classes_train).sum()*100/len(predicted_classes_train),"%")
                _results.append((labels_all[test] != predicted_classes).sum())
                C += C_step"""
            for c in pl.frange(C_base,9, C_step):
                svc = SVC(kernel="linear", C=c, probability=True)
                model = svc.fit(train_data_features, labels)
                predicted_classes = model.predict(train_data_cross_validation_classwise_features)
                class_probabilities = model.predict_proba(train_data_cross_validation_classwise_features)
                print("C: ",c," N points:", len(predicted_classes), " percentage: ",(labels_cross_validation_classwise != predicted_classes).sum()*100/len(predicted_classes),"%")
                print("Log_loss: ", log_loss(labels_cross_validation_classwise, class_probabilities))
            for c in pl.frange(1,3, 1):
                svc = SVC(kernel="linear", C=c, probability=True)
                model = svc.fit(train_data_features, labels)
                predicted_classes = model.predict(train_data_cross_validation_classwise_features)
                class_probabilities = model.predict_proba(train_data_cross_validation_classwise_features)
                print("C: ",c," N points:", len(predicted_classes), " percentage: ",(labels_cross_validation_classwise != predicted_classes).sum()*100/len(predicted_classes),"%")
                print("Log_loss: ", log_loss(labels_cross_validation_classwise, class_probabilities))
        else:
            for train, test in kf:
                svc = SVC(kernel="linear", C=C, probability=True)
                model = svc.fit(train_data_features[train], labels[train])
                predicted_classes = model.predict(train_data_features[test])
                predicted_classes_train = model.predict(train_data_features[train])
                class_probabilities = model.predict_proba(train_data_features[test])
                print("C: ",C," n points:", len(predicted_classes), " percentage: ",(labels[test] != predicted_classes).sum()*100/len(predicted_classes),"% percentage_train: ", (labels[train] != predicted_classes_train).sum()*100/len(predicted_classes_train),"%")
                _results.append((labels[test] != predicted_classes).sum())
                C += C_step

        C = C_base + C_step * _results.index(min(_results))
        print("C: ", C)
        if(len(train_data_cross_validation_classwise_features) > 0):
            svc = SVC(kernel="linear", C=C, probability=True)
            model = svc.fit(train_data_features, labels)
            predicted_classes = model.predict(train_data_cross_validation_classwise_features)
            class_probabilities = model.predict_proba(train_data_cross_validation_classwise_features)
            print("C: ",C," N points:", len(predicted_classes), " percentage: ",(labels_cross_validation_classwise != predicted_classes).sum()*100/len(predicted_classes),"%")
            print("Log_loss: ", log_loss(labels_cross_validation_classwise, class_probabilities))
        svc = SVC(kernel="linear", C=C, probability=True)
        model = svc.fit(train_data_features, labels)
        return model.predict_proba(test_data_features), model.predict(test_data_features), model
    else:
        svc = SVC(kernel="linear", C=8, probability=True)
        model = svc.fit(train_data_features, labels)
        return model.predict_proba(test_data_features), model.predict(test_data_features), model
Example #16
    def go_by_category_2(category):
        input, targets, scaler = TrainingFactory.get_training_data_by_category(category,10000)
        input_train, input_test, target_train, target_test = train_test_split(input, targets, test_size=0.1)

        test_data_sparse = TestingFactory.get_test_data(limit=1000)
        test_data_scaled = scaler.transform(test_data_sparse)
        test_data = csr_matrix(test_data_scaled)

        classif = SVC(kernel='rbf',C=0.1, tol=0.001, probability=True)
        classif.fit(input_train, target_train)

        output_targets_proba = classif.predict_proba(input_test)

        outputs_predicted_proba = [item[1] for item in output_targets_proba]
        output_targets = classif.predict(input_test)

        # print output_targets.tolist()
        # print outputs_predicted_proba
        # print target_test

        print log_loss(target_test, outputs_predicted_proba)  # score the predicted probabilities, not the hard 0/1 labels
        accuracy = accuracy_score(target_test, output_targets)
        print accuracy
        print confusion_matrix(target_test, output_targets)


        testing_output = classif.predict_proba(test_data)
        testing_output_proba = [item[1] for item in testing_output]
        print testing_output_proba

        return accuracy, output_targets, testing_output_proba
Example #17
def train_model_with_feature(config_name, clf_name, fill_na_opt, PCA_n_comp, clf, X, X_test, y):
    if PCA_n_comp!=-1:
        pca = PCA(PCA_n_comp) #PCA dimension reduction
        logger.info('PCA fit on count matrix')
        # rescale num to (0,1)
        X_all = pca.fit_transform( minmax_scale(np.vstack([X, X_test])) )
        X, X_test = X_all[:X.shape[0], :], X_all[X.shape[0]:, :]
        logger.info('PCA fit done')

    logger.info('start training')
    print 'training size', X.shape, 'test size', X_test.shape
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.9)
    if clf_name=='xgb':
        clf.fit(X_train,y_train,eval_metric='mlogloss')
    else:
        clf.fit(X_train,y_train)
    logger.info(clf_name+'-'+fill_na_opt+'-pca('+str(PCA_n_comp)+') train log-loss='\
            +str(log_loss(y_train, clf.predict_proba(X_train))))
    logger.info(clf_name+'-'+fill_na_opt+'-pca('+str(PCA_n_comp)+') validate log-loss='\
            +str(log_loss(y_val, clf.predict_proba(X_val))))

    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    df_test[group_list] = y_pred
    logger.info('finish training')
    # , 'phone_brand_en', 'device_model_en'
    df_test.to_csv('output/'+config_name+'-'+clf_name+'-'+fill_na_opt+'-pca'+\
            str(PCA_n_comp)+'-'+str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M'))\
            +'.csv', columns=['device_id']+group_list, index=False)
    logger.info('finish outputing result')
Example #18
def ctr_gbdt(model='sklearn-clicklog', from_cache=False, train_dataset_length=100000, test_dataset_length=100000):
    TRAIN_FILE, TEST_FILE = create_dataset(model, from_cache, train_dataset_length, test_dataset_length)

    prediction_model = GradientBoostingClassifier(
        loss='deviance',
        learning_rate=0.1,
        n_estimators=30,
        subsample=1.0,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_depth=5,
    )

    x_train, y_train = clean_data(TRAIN_FILE)
    x_test, y_test = clean_data(TEST_FILE)

    with Timer('fit model'):
        prediction_model.fit(x_train, y_train)

    with Timer('evaluate model'):
        y_prediction_train = prediction_model.predict_proba(x_train)
        y_prediction_test = prediction_model.predict_proba(x_test)

        loss_train = log_loss(y_train, y_prediction_train)
        loss_test = log_loss(y_test, y_prediction_test)

    print 'loss_train: %s' % loss_train
    print 'loss_test: %s' % loss_test
Example #19
def main(job_id, params):
    print job_id, params
    params = get_params(params)
    print job_id, params

    crimes = np.load(DATA_FILE)

    model = RandomForestClassifier(n_estimators=params['n_estimators'],
                                   criterion=params['criterion'],
                                   max_depth=None if params['max_depth'] < 1 else params['max_depth'],
                                   min_samples_split=params['min_samples_split'],
                                   min_samples_leaf=params['min_samples_leaf'],
                                   max_features=params['max_features'],
                                   min_weight_fraction_leaf=0.0,
                                   max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=4,
                                   random_state=42, verbose=0, warm_start=False, class_weight=None)
    model.fit(crimes['features_train'], crimes['labels_train'])
    loss_train = log_loss(crimes['labels_train'], model.predict_proba(crimes['features_train']))
    loss_val = log_loss(crimes['labels_val'], model.predict_proba(crimes['features_val']))
    loss_all = log_loss(crimes['labels'], model.predict_proba(crimes['features']))
    print 'loss_all: ', loss_all
    print 'loss_train: ', loss_train
    print 'loss_val: ', loss_val

    return loss_val
Example #20
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []

    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True, random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname]  += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv=np.array(scores)
    print(cv, cv.mean(), cv.std())
Example #21
def main():
    X, Y, encoder, scale = load_train_data('train.csv')
    estimators = 500
    X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=0)
    X_train_real, X_test_real, Y_train_real, Y_test_real = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)
    log.info('Loaded training file')
    X_test, _ = load_csv_file('test.csv', cut_end=False)
    log.info('Loaded test file')

    #Classifier Setup
    tree_clf = ExtraTreesClassifier(n_estimators=estimators, n_jobs=-1,
                                    random_state=42, max_depth=55, min_samples_split=1)

    clf = make_pipeline(TfidfTransformer(), DenseTransformer(), tree_clf)
    log.info('Fitting ExtraTrees pipeline')
    clf.fit(X_train_real, Y_train_real)
    clf_probs = clf.predict_proba(X_test_real)
    score = log_loss(Y_test_real, clf_probs)
    log.info('Log Loss score un-trained = %f' % score)
    # Calibrate Classifier using ground truth in X,Y_valid
    sig_clf = CalibratedClassifierCV(clf, method="isotonic", cv="prefit")
    log.info('Fitting CalibratedClassifierCV')
    sig_clf.fit(X_valid, Y_valid)
    sig_clf_probs = sig_clf.predict_proba(X_test_real)
    sig_score = log_loss(Y_test_real, sig_clf_probs)
    log.info('Log loss score trained = %f' % sig_score)

    # Ok lets predict the test data with our funky new classifier
    sig_submission_probs = sig_clf.predict_proba(X_test)

    write_out_submission(sig_submission_probs, 'submission.csv')
Example #22
def check_lambda(datanm, samples_per_class,depv, num_classes, criterion, num_iter = 100):
    data, labels = load_full(datanm, samples_per_class)
    slo = StratifiedShuffleSplit(labels, n_iter=num_iter, test_size=0.3, train_size=0.7, random_state=None)
    ans = np.zeros((len(depv), 4))
    for train_index, test_index in slo:
        train_data = [data[train_index, :], labels[train_index]]
        valid_data = [data[test_index , :], labels[test_index ]]

        for i, d in enumerate(depv):
            clf = DecisionTreeClassifier(criterion=criterion, splitter='best',
                                         max_depth=d, min_samples_split=2,
                                         min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                         max_features=None, random_state=None,
                                         max_leaf_nodes=None, class_weight=None, presort=False)
            clf.fit(train_data[0], train_data[1])

            out_train = clf.predict_proba(train_data[0])
            out_valid = clf.predict_proba(valid_data[0])

            ans[i, 0] += log_loss(train_data[1], out_train)
            ans[i, 1] += log_loss(valid_data[1], out_valid)
            ans[i, 2] += brier(train_data[1], out_train, num_classes)
            ans[i, 3] += brier(valid_data[1], out_valid, num_classes)

    ans[:, :] /= num_iter

    np.savez("rand_forest_lambda_" + criterion, ans= ans, depv = depv, num_iter = num_iter, num_classes = num_classes, samples_per_class = samples_per_class)
    return ans
Example #23
def train_model(estimator, xtr, xcv, ytr, ycv):
    model_list = get_model_name_list()
    #for rfc, rfr, etc, etr
    if type(estimator) in model_list[:4]:        
        estimator.fit(xtr, ytr)
        #for rfc, rtc
        if hasattr(estimator, 'predict_proba'):
            train_predict = estimator.predict_proba(xtr)
            cv_predict = estimator.predict_proba(xcv)
        #for rfr, etr
        else:
            train_predict = estimator.predict(xtr)
            cv_predict = estimator.predict(xcv)
        best_iter = 0
    #for xgbc, xgbr 
    elif type(estimator) in model_list[4:]:
        estimator.fit(xtr, ytr, early_stopping_rounds=35, eval_metric='logloss', 
                      eval_set=[(xcv, ycv)], verbose=True)
        best_iter = estimator.best_iteration
        #for xgbc
        if hasattr(estimator, 'predict_proba'):
            train_predict = estimator.predict_proba(xtr, ntree_limit=best_iter)
            cv_predict = estimator.predict_proba(xcv, ntree_limit=best_iter)
        #for xgbr
        else:
            train_predict = estimator.predict(xtr, ntree_limit=best_iter)
            cv_predict = estimator.predict(xcv, ntree_limit=best_iter)
    train_loss = log_loss(ytr, train_predict)
    cv_loss = log_loss(ycv, cv_predict)
    return train_loss, cv_loss, best_iter
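A sketch of calling train_model through its XGBoost branch, assuming get_model_name_list() places the XGBoost classes in the second half of its list as the branches above imply; the data names are illustrative:

from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier

xtr, xcv, ytr, ycv = train_test_split(X, y, test_size=0.2)  # X, y assumed already loaded
train_loss, cv_loss, best_iter = train_model(XGBClassifier(n_estimators=1000), xtr, xcv, ytr, ycv)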
Example #24
def gb_get_min_loss(clf, verbose=False):
    j = 0
    min_loss_test = 1

    print()
    for i, quality_train, quality_test in zip(
            range(1, 250 + 1),
            clf.staged_predict_proba(X_train),
            clf.staged_predict_proba(X_test)
    ):
        loss_train = log_loss(y_train, quality_train)
        loss_test = log_loss(y_test, quality_test)

        if min_loss_test > loss_test:
            min_loss_test = loss_test
            j = i

            if (verbose):
                print(
                    'Iteration:', i, '  ',
                    'Train:', '{0:.3f}'.format(loss_train), '  ',
                    'Test:', '{0:.3f}'.format(loss_test), '  ',
                    '-' if min_loss_test == loss_test else '+'
                )

    return min_loss_test, j
Example #25
def fit_model_and_test(params):
    crimes = np.load(DATA_FILE)
    features_train = crimes['features_train']
    all_labels = sorted(list(set(np.unique(crimes['labels_train'])) | set(np.unique(crimes['labels_val']))))
    hidden_units = int(params['hidden_units'])
    batch_size = 64

    labels_train = create_labels(crimes['labels_train'], all_labels)
    labels_vals = create_labels(crimes['labels_val'], all_labels)
    labels_full = create_labels(crimes['labels'], all_labels)

    labels_train = np_utils.to_categorical(labels_train)
    labels_vals = np_utils.to_categorical(labels_vals)
    labels_full = np_utils.to_categorical(labels_full)

    model = create_model_and_fit(features_train,labels_train, hidden_units, len(all_labels),  params['layers'],
                                 params['input_dropout'], params['hidden_dropout'],
                                 batch_size, crimes['features_val'], labels_vals)

    loss_train = log_loss(labels_train, model.predict_proba(crimes['features_train']))
    loss_val = log_loss(labels_vals, model.predict_proba(crimes['features_val']))
    loss_all = log_loss(labels_full, model.predict_proba(crimes['features']))

    print 'loss_all: ', loss_all
    print 'loss_train: ', loss_train
    print 'loss_val: ', loss_val
    sys.stdout.flush()
    return loss_val, model, crimes, all_labels
Example #26
def check_vb(datanm, samples_per_class, Cs, num_classes, num_iter = 100):
    data, labels = load_full(datanm, samples_per_class)
    slo = StratifiedShuffleSplit(labels, n_iter=num_iter, test_size=0.5, train_size=0.5, random_state=None)
    ans = np.zeros((len(Cs), samples_per_class/2, 2))
    for train_index, test_index in slo:
        train_data = [data[train_index, :], labels[train_index]]
        valid_data = [data[test_index , :], labels[test_index ]]

        for l in xrange(samples_per_class/2):
            ind_train = []
            ind_valid = []
            for k in xrange(num_classes):
                ind_train = ind_train + np.where(train_data[1] == k)[0].tolist()[:l+1]
                ind_valid = ind_valid + np.where(valid_data[1] == k)[0].tolist()[:l+1]

            ctrain_data = [ train_data[0][ind_train], train_data[1][ind_train] ]
            cvalid_data = [ valid_data[0][ind_valid], valid_data[1][ind_valid] ]

            for i, C in enumerate(Cs):
                clf = LogisticRegression(C  =C   , penalty='l2', multi_class = 'ovr',
                                         tol=0.001, n_jobs = -1 , verbose = 0)#, solver = 'newton-cg')
                clf.fit(ctrain_data[0], ctrain_data[1])

                out_train = clf.predict_proba(ctrain_data[0])
                out_valid = clf.predict_proba(cvalid_data[0])

                ans[i, l, 0] += log_loss(ctrain_data[1], out_train)
                ans[i, l, 1] += log_loss(cvalid_data[1], out_valid)

    ans /= num_iter

    np.savez("logreg_bv", ans= ans, Cs = Cs, num_iter = num_iter, num_classes = num_classes, samples_per_class = samples_per_class)
    return ans
Example #27
def learn(learning_rate, X_train, y_train, X_test, y_test):
	model = GradientBoostingClassifier(
		n_estimators=250,
		verbose=True,
		random_state=241,
		learning_rate=learning_rate
		)
	model.fit(X_train, y_train)
	
	# plot scores
	test_score = list(range(250))
	train_score = list(range(250))

	for i, predictions in enumerate(model.staged_decision_function(X_test)):
		predictions = [x[0] for x in predictions.tolist()] # unpack this stupid format
		predictions = [1/(1 + math.exp(-x)) for x in predictions]
		test_score[i] = log_loss(y_test, predictions)

	for i, predictions in enumerate(model.staged_decision_function(X_train)):
		predictions = [x[0] for x in predictions.tolist()] # unpack this stupid format
		predictions = [1/(1 + math.exp(-x)) for x in predictions]
		train_score[i] = log_loss(y_train, predictions)

	plt.figure()
	plt.plot(test_score, 'r', linewidth=2)
	plt.plot(train_score, 'g', linewidth=2)
	plt.legend(['test', 'train'])
	plt.show()
	
	return train_score, test_score
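The list-comprehension sigmoid above is the logistic function scipy already provides; an equivalent loop body, assuming the (n, 1) shape that the x[0] indexing implies:

from scipy.special import expit
predictions = expit(predictions[:, 0])  # same transform as the two comprehensions, vectorized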
Example #28
def rf_grid_search():

	train_inp,valid_inp,train_target,valid_target = prepare_input()
	#set up scorer for grid search. log-loss is error, not score, so set greater_is_better to false,
	#and log-loss requires a probability
	log_loss_scorer = make_scorer(log_loss,greater_is_better=False,needs_proba=True)

	train_inp = train_inp[:100000]
	train_target = train_target[:100000]

	start = time.time()
	random_forest = RandomForestClassifier(random_state=31)
	# r_forest_parameters = {'n_estimators' : [120,300,500,800,1200],'max_depth':[5,8,15,25,30,None],'max_features':['log2','sqrt',None],
	# 'min_samples_split':[1,2,5,10,15,100],'min_samples_leaf':[1,2,5,10]}
	
	#75.1 minutes to run with these paramters - 72 fits

	r_forest_parameters = {'min_samples_split':[2,5,10,20,50,100],'min_samples_leaf':[1,2,5,10,50,100]}
	#grid search too slow to not use all cores, and wayyyy too slow to have no output.
	r_forest_grid_obj = GridSearchCV(random_forest,r_forest_parameters,log_loss_scorer,verbose=2,n_jobs=-1)
	r_forest_grid_obj = r_forest_grid_obj.fit(train_inp,train_target)
	random_forest = r_forest_grid_obj.best_estimator_
	print "Best params: " + str(r_forest_grid_obj.best_params_)	
	random_forest_train_error = log_loss(train_target,random_forest.predict_proba(train_inp))
	random_forest_validation_error = log_loss(valid_target,random_forest.predict_proba(valid_inp))
	print "Best random forest training error: {:02.4f}".format(random_forest_train_error)
	print "Best random forest validation error: {:02.4f}".format(random_forest_validation_error)
	end = time.time()
	print "RF grid search took {:02.4f} seconds".format(end-start)

	return random_forest
Example #29
def run_cross_validation(nfolds=10):
    img_rows, img_cols = 32, 32
    batch_size = 32
    nb_epoch = 100
    random_state = 51

    train_data, train_target, train_id = read_and_normalize_train_data(img_rows, img_cols)
    test_data, test_id = read_and_normalize_test_data(img_rows, img_cols)

    yfull_train = dict()
    yfull_test = []
    kf = KFold(len(train_data), n_folds=nfolds, shuffle=True, random_state=random_state)
    num_fold = 0
    sum_score = 0
    for train_index, test_index in kf:
        model = create_model(img_rows, img_cols)
        X_train, X_valid = train_data[train_index], train_data[test_index]
        Y_train, Y_valid = train_target[train_index], train_target[test_index]

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=5, verbose=0),
        ]
        model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              shuffle=True, verbose=2, validation_data=(X_valid, Y_valid),
              callbacks=callbacks)

        predictions_valid = model.predict(X_valid, batch_size=batch_size, verbose=1)
        score = log_loss(Y_valid, predictions_valid)
        print('Score log_loss: ', score)

        # Store valid predictions
        for i in range(len(test_index)):
            yfull_train[test_index[i]] = predictions_valid[i]

        # Store test predictions
        test_prediction = model.predict(test_data, batch_size=batch_size, verbose=2)
        yfull_test.append(test_prediction)

    predictions_valid = get_validation_predictions(train_data, yfull_train)
    score = log_loss(train_target, predictions_valid)
    print("Log_loss train independent avg: ", score)

    print('Final log_loss: {}, rows: {} cols: {} nfolds: {} epoch: {}'.format(score, img_rows, img_cols, nfolds, nb_epoch))
    perc = getPredScorePercent(train_target, train_id, predictions_valid)
    print('Percent success: {}'.format(perc))

    info_string = 'loss_' + str(score) \
                    + '_r_' + str(img_rows) \
                    + '_c_' + str(img_cols) \
                    + '_folds_' + str(nfolds) \
                    + '_ep_' + str(nb_epoch)

    test_res = merge_several_folds_mean(yfull_test, nfolds)
    create_submission(test_res, test_id, info_string)
Example #30
def log_res(train_data_features, train_data_cross_validation_classwise_features, test_data_features, labels, labels_cross_validation_classwise, using_cross_validation2, kf, settings):
    if using_cross_validation2:
        logres_C = 1
        logres_results = []
        if(len(train_data_cross_validation_classwise_features) > 0):
            """train_all = np.append(train_data_features, train_data_cross_validation_classwise_features, axis=0)
            labels_all = np.append(labels, labels_cross_validation_classwise)
            kf_all = KFold(len(train_all)-1, n_folds=int(settings['Data']['CrossValidation2']), shuffle=True)
            for train, test in kf_all:
                C = logres_C
                p = 'l1'
                clf_l1_LR = LogisticRegression(C=C, penalty=p, tol=0.01)
                model = clf_l1_LR.fit(train_all[train], labels_all[train])
                predicted_classes = model.predict(train_all[test])
                predicted_classes_train = model.predict(train_all[train])
                print("N points:", len(predicted_classes), " percentage: ",(labels_all[test] != predicted_classes).sum()*100/len(predicted_classes),"%, percentage_train: ", (labels_all[train] != predicted_classes_train).sum()*100/len(predicted_classes_train))
                logres_results.append((labels_all[test] != predicted_classes).sum())
                logres_C += 1"""
            for c in pl.frange(logres_C,15, 1):
                clf_l1_LR = LogisticRegression(C=c, solver='lbfgs', penalty='l2', tol=0.01)
                model = clf_l1_LR.fit(train_data_features, labels)
                predicted_classes = model.predict(train_data_cross_validation_classwise_features)
                predicted_classes_train = model.predict(train_data_features)
                class_probabilities = model.predict_proba(train_data_cross_validation_classwise_features)
                logres_results.append(log_loss(labels_cross_validation_classwise, class_probabilities))
                print("N points:", len(predicted_classes), " percentage: ",(labels_cross_validation_classwise != predicted_classes).sum()*100/len(predicted_classes),
                      "%, percentage_train: ", (labels != predicted_classes_train).sum()*100/len(predicted_classes_train))
                print("Log_loss: ", log_loss(labels_cross_validation_classwise, class_probabilities))
        else:
            for train, test in kf:
                C = logres_C
                p = 'l1'
                clf_l1_LR = LogisticRegression(C=C, penalty=p, tol=0.01)
                model = clf_l1_LR.fit(train_data_features[train], labels[train])
                predicted_classes = model.predict(train_data_features[test])
                predicted_classes_train = model.predict(train_data_features[train])
                print("N points:", len(predicted_classes), " percentage: ",(labels[test] != predicted_classes).sum()*100/len(predicted_classes),"%, percentage_train: ", (labels[train] != predicted_classes_train).sum()*100/len(predicted_classes_train))
                logres_results.append((labels[test] != predicted_classes).sum())
                logres_C += 1
        print(logres_results)
        logres_C = logres_results.index(min(logres_results)) + 1
        print("Log Res C: ", logres_C)
        if(len(train_data_cross_validation_classwise_features) > 0):
            clf_l1_LR = LogisticRegression(C=logres_C, penalty='l2', tol=0.01)
            model = clf_l1_LR.fit(train_data_features, labels)
            predicted_classes = model.predict(train_data_cross_validation_classwise_features)
            predicted_classes_train = model.predict(train_data_features)
            class_probabilities = model.predict_proba(train_data_cross_validation_classwise_features)
            print("N points:", len(predicted_classes), " percentage: ",(labels_cross_validation_classwise != predicted_classes).sum()*100/len(predicted_classes),"%, percentage_train: ", (labels != predicted_classes_train).sum()*100/len(predicted_classes_train))
            print("Log_loss: ", log_loss(labels_cross_validation_classwise, class_probabilities))
        clf_l1_LR = LogisticRegression(C=logres_C, penalty='l1', tol=0.01)
        model = clf_l1_LR.fit(train_data_features, labels)
        return model.predict_proba(test_data_features), model.predict(test_data_features), model
    else:
        C = 1
        p = 'l1'
        clf_l1_LR = LogisticRegression(C=C, penalty=p, tol=0.01)
        model = clf_l1_LR.fit(train_data_features, labels)
        return model.predict_proba(test_data_features), model.predict(test_data_features), model
Example #31
def error(p, x, y):
    preds = blended(p, x)
    err = log_loss(y, preds)  # apparently a 1-D vector can be scored directly against a matrix with log_loss
    return err
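error has the (params, *args) signature scipy optimizers expect, with p as the blend weights; a minimal sketch of fitting them, assuming blended(p, x) returns the weighted ensemble probabilities and that x_valid, y_valid and n_models are defined (all hypothetical names):

from scipy.optimize import minimize

p0 = np.ones(n_models) / n_models  # start from equal weights
res = minimize(error, p0, args=(x_valid, y_valid), method='Nelder-Mead')
best_weights = res.x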
Example #32
# model = pickle.load(open('E:/nmb/nmb_data/cp/m03_mels_CatBoostClassifier.data', 'rb'))  # rb : read
# time >>  

# evaluate
y_pred = model.predict(x_test)
# print(y_pred[:100])
# print(y_pred[100:])

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

hamm_loss = hamming_loss(y_test, y_pred)
hinge_loss = hinge_loss(y_test, y_pred)
log_loss = log_loss(y_test, y_pred)

print("accuracy : \t", accuracy)
print("recall : \t", recall)
print("precision : \t", precision)
print("f1 : \t", f1)

print("hamming_loss : \t", hamm_loss)
print("hinge_loss : \t", hinge_loss)                    # SVM에 적합한 cross-entropy
print("log_loss : \t", log_loss)                        # Cross-entropy loss와 유사한 개념

# predict 데이터
pred_pathAudio = 'E:/nmb/nmb_data/pred_voice/'
files = librosa.util.find_files(pred_pathAudio, ext=['wav'])
files = np.asarray(files)
for file in files:   
Example #33
def stacking(clf, train_x, train_y, test_x, clf_name, class_num=3):
    train=np.zeros((train_x.shape[0],class_num))
    test=np.zeros((test_x.shape[0],class_num))
    test_pre=np.empty((folds,test_x.shape[0],class_num))
    cv_scores=[]
    for i,(train_index,test_index) in enumerate(kf):
        tr_x=train_x[train_index]
        tr_y=train_y[train_index]
        te_x=train_x[test_index]
        te_y = train_y[test_index]
        if clf_name in ["rf","ada","gb","et","lr","knn","mnb","ovr","gnb"]:
            clf.fit(tr_x,tr_y)
            pre=clf.predict_proba(te_x)
            train[test_index]=pre
            test_pre[i,:]=clf.predict_proba(test_x)
            cv_scores.append(log_loss(te_y, pre))
        elif clf_name in ["lsvc"]:
            clf.fit(tr_x,tr_y)
            pre=clf.decision_function(te_x)
            train[test_index]=pre
            test_pre[i,:]=clf.decision_function(test_x)
            cv_scores.append(log_loss(te_y, pre))
        elif clf_name in ["xgb"]:
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x, label=te_y, missing=-1)
            params = {'booster': 'gbtree',
                      'objective': 'multi:softprob',
                      'eval_metric': 'mlogloss',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 4,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      'nthread': 12,
                      "num_class": class_num
                      }

            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'),
                         (test_matrix, 'eval')
                         ]
            if test_matrix:
                model = clf.train(params, train_matrix, num_boost_round=num_round,evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre= model.predict(test_matrix,ntree_limit=model.best_ntree_limit)
                train[test_index]=pre
                test_pre[i, :]= model.predict(z, ntree_limit=model.best_ntree_limit)
                cv_scores.append(log_loss(te_y, pre))
        elif clf_name in ["lgb"]:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            #z = clf.Dataset(test_x, label=te_y)
            #z=test_x
            params = {
                      'boosting_type': 'gbdt',
                      #'boosting_type': 'dart',
                      'objective': 'multiclass',
                      'metric': 'multi_logloss',
                      'min_child_weight': 1.5,
                      'num_leaves': 2**4,
                      'lambda_l2': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'learning_rate': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      'nthread': 12,
                      "num_class": class_num,
                      'silent': True,
                      }
            num_round = 10000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(params, train_matrix,num_round,valid_sets=test_matrix,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre= model.predict(te_x,num_iteration=model.best_iteration)
                train[test_index]=pre
                test_pre[i, :]= model.predict(test_x, num_iteration=model.best_iteration)
                cv_scores.append(log_loss(te_y, pre))
        elif clf_name in ["nn"]:
            from keras.layers import Dense, Dropout, BatchNormalization,SReLU
            from keras.optimizers import SGD,RMSprop
            from keras.callbacks import EarlyStopping, ReduceLROnPlateau
            from keras.utils import np_utils
            from keras.regularizers import l2
            from keras.models import Sequential
            clf = Sequential()
            clf.add(Dense(64, input_dim=tr_x.shape[1],activation="relu", W_regularizer=l2()))
            #clf.add(SReLU())
            #clf.add(Dropout(0.2))
            clf.add(Dense(64,activation="relu",W_regularizer=l2()))
            #clf.add(SReLU())
            #clf.add(Dense(64, activation="relu", W_regularizer=l2()))
            # model.add(Dropout(0.2))
            clf.add(Dense(class_num, activation="softmax"))
            clf.summary()
            early_stopping = EarlyStopping(monitor='val_loss', patience=20)
            reduce = ReduceLROnPlateau(min_lr=0.0002,factor=0.05)
            clf.compile(optimizer="rmsprop", loss="categorical_crossentropy")
            clf.fit(tr_x, tr_y,
                      batch_size=2560,
                      nb_epoch=1000,
                      validation_data=[te_x, te_y],
                      callbacks=[early_stopping,reduce])
            pre=clf.predict_proba(te_x)
            train[test_index]=pre
            test_pre[i,:]=clf.predict_proba(test_x)
            cv_scores.append(log_loss(te_y, pre))
        else:
            raise IOError("Please add new clf.")
        print "%s now score is:"%clf_name,cv_scores
        with open("score.txt","a") as f:
            f.write("%s now score is:"%clf_name+str(cv_scores)+"\n")
    test[:]=test_pre.mean(axis=0)
    print "%s_score_list:"%clf_name,cv_scores
    print "%s_score_mean:"%clf_name,np.mean(cv_scores)
    with open("score.txt", "a") as f:
        f.write("%s_score_mean:"%clf_name+str(np.mean(cv_scores))+"\n")
    return train.reshape(-1,class_num),test.reshape(-1,class_num)
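The pair stacking returns is the out-of-fold level-1 training matrix plus the fold-averaged test matrix, ready for a second-level learner; a minimal sketch of that step, assuming the globals the function reads (kf, folds) are defined and using two base models from its clf_name list:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

lr_tr, lr_te = stacking(LogisticRegression(), train_x, train_y, test_x, "lr")
gnb_tr, gnb_te = stacking(GaussianNB(), train_x, train_y, test_x, "gnb")
meta = LogisticRegression()
meta.fit(np.hstack([lr_tr, gnb_tr]), train_y)
final_pred = meta.predict_proba(np.hstack([lr_te, gnb_te]))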
Example #34
def run_cross_validation_create_models(num_fold=5):
    #Input image dimensions
    batch_size = 4
    nb_epoch = 50

    restore_from_last_checkpoint = 1

    data, target = preprocess_data()
    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.3,
                                                        random_state=42)
    t1 = DenseNet(classes=4,
                  input_shape=(300, 300, 3),
                  depth=40,
                  growth_rate=12,
                  bottleneck=True,
                  reduction=0.5)
    #    model = Model(input=img_input, output = output_3)
    #    model.compile(optimizer = 'adam', loss  = 'categorical_crossentropy', metrics = ['accuracy'])
    #    model.summary()
    #    model = base_model()
    top_model = Sequential()
    top_model.add(t1)
    top_model.add(Flatten(input_shape=t1.output_shape[1:]))
    top_model.add(Dense(256, activation='relu'))
    top_model.add(Dropout(0.5))
    top_model.add(Dense(4, activation='softmax'))

    # note that it is necessary to start with a fully-trained
    # classifier, including the top classifier,
    # in order to successfully do fine-tuning

    # add the model on top of the convolutional base
    model = Sequential()
    model.add(top_model)
    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer=SGD(lr=1e-2, momentum=0.9),
                  metrics=['accuracy'])

    num_fold += 1
    print('Start KFold number {} from {}'.format(num_fold, num_fold))
    print('Split train:', len(X_train), len(y_train))
    print('Split test:', len(X_test), len(y_test))
    kfold_weights_path = os.path.join(
        'cache', 'weights_kfold_vgg16_' + str(num_fold) + '.h5')
    #    if not os.path.isfile(kfold_weights_path) or restore_from_last_checkpoint == 0:
    callbacks = [
        #                EarlyStoppingbyLossVal(monitor = 'val_loss', value = 0.00001, verbose = 1),
        #                EarlyStopping(monitor = 'val_loss', patience = 5, verbose = 1),
        ModelCheckpoint(kfold_weights_path,
                        monitor='val_loss',
                        save_best_only=True,
                        verbose=0),
        TensorBoard(log_dir='./LogsForAUC', write_images=True)
    ]
    cnn = model.fit(X_train,
                    y_train,
                    batch_size=batch_size,
                    epochs=nb_epoch,
                    shuffle=True,
                    verbose=1,
                    validation_data=(X_test, y_test),
                    callbacks=callbacks)

    if os.path.isfile(kfold_weights_path):
        model.load_weights(kfold_weights_path)
    score1 = model.evaluate(X_test, y_test, verbose=0)  # show_accuracy was removed in Keras 1.0
    print('Score on test was : ', score1)
    # predict on the held-out split so the log loss below is computed against y_test
    predictions = model.predict(X_test.astype('float32'),
                                batch_size=batch_size,
                                verbose=1)
    score = log_loss(y_test, predictions)
    print('Score log_loss on test is', score)

    plt.plot(cnn.history['acc'])
    plt.plot(cnn.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(cnn.history['loss'])
    plt.plot(cnn.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

    pd.DataFrame(cnn.history).to_csv("historyAUC.csv")  # "/historyAUC.csv" pointed at the filesystem root
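
# The fine-tuning note in the example above usually pairs with freezing the
# convolutional base before compiling. A self-contained tf.keras sketch of that
# pattern (MobileNetV2, the input shape, and weights=None are illustrative
# stand-ins, not the original setup):
import tensorflow as tf

base = tf.keras.applications.MobileNetV2(include_top=False, pooling='avg',
                                         input_shape=(96, 96, 3), weights=None)
base.trainable = False                    # train only the new top classifier first
clf = tf.keras.Sequential([
    base,
    tf.keras.layers.Dense(4, activation='softmax'),
])
clf.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3, momentum=0.9),
            loss='categorical_crossentropy', metrics=['accuracy'])
# ...fit the top, then unfreeze part of the base and recompile with a lower LR.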
Exemplo n.º 35
0
 def __call__(self, y_true_proba, y_proba):
     score = log_loss(y_true_proba, y_proba)
     return score
        X.append(xi)
        y.append(yi)

    X = np.asarray(X)
    y = np.asarray(y)
    Test_ind = np.asarray(Test_ind)

    X_train, y_train = X[:int(datanum*0.6)], y[:int(datanum*0.6)]
    X_valid, y_valid = X[int(datanum*0.6):int(datanum*0.8)], y[int(datanum*0.6):int(datanum*0.8)]
    X_train_valid, y_train_valid = X[:int(datanum*0.8)], y[:int(datanum*0.8)]
    X_test, y_test = X[int(datanum*0.8):], y[int(datanum*0.8):]

    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train_valid, y_train_valid)
    clf_probs = clf.predict_proba(X_test)
    score = log_loss(y_test, clf_probs)
    print clf.score(X_test, y_test)
    x_pro = clf.predict_proba(X)
    Test_res = np.c_[Test_ind, x_pro]
    val_res = np.c_[X_test, y_test]
    val_res = np.c_[val_res, clf_probs]


    
    Real_ind = []
    rdata = []
    infile = open(realdata,'rb')
    for line in infile:
        line = line.strip('\n')
        sent = line.split('\t')
        Real_ind.append(sent)
    log = pd.DataFrame(columns=log_cols)

    for clf in classifiers:
        clf.fit(X_train, y_train)
        name = clf.__class__.__name__

        # print("=" * 30)
        # print(name)

        # print('****Results****')
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)  # from sklearn
        # print("Accuracy: {:.4%}".format(acc))

        train_predictions = clf.predict_proba(X_test)
        ll = log_loss(y_test, train_predictions)
        # print("Log Loss: {}".format(ll))

        log_entry = pd.DataFrame([[name, acc * 100, ll]], columns=log_cols)
        log = pd.concat([log, log_entry], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
    # print("=" * 30)
    print log
    # LogisticRegression 59.090909 4.173216

######################
# I would like to choose
# LogisticRegression
# for future improvements by tuning their hyper-parameters
######################
if log_reg:
    # We initialise the Exhaustive Grid Search, we leave the scoring as the default function of
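
# The block above is cut off mid-setup; a minimal, self-contained sketch of an
# exhaustive grid search over LogisticRegression's C (the grid values and the
# synthetic data are assumptions, not the original experiment):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
grid = GridSearchCV(LogisticRegression(solver='liblinear'),
                    param_grid={'C': [0.01, 0.1, 1, 10]},
                    scoring='neg_log_loss', cv=5)
grid.fit(X_demo, y_demo)
print(grid.best_params_, grid.best_score_)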
Exemplo n.º 38
0
y_pred = rfc_model.predict(X_test)



precScore = precision_score(Y_test, y_pred, average="macro")

print "precision score", precScore



predProb = rfc_model.predict_proba(X_test)

print "y predicted", set(y_pred)

ll = log_loss(Y_test, predProb)  # renamed so sklearn's log_loss is not shadowed

print "log loss", ll



acc = accuracy_score(Y_test, y_pred)

print "Accuracy is : ", acc





gnb = GaussianNB()
print("DT F1-score: ", f1_score_dt)

# Logistic Regression


# predicted y
yhat_lg = LR.predict(x_test)
yhat_lg_prob = LR.predict_proba(x_test)

# jaccard (jaccard_similarity_score was removed in scikit-learn 0.23;
# on newer versions use jaccard_score with an appropriate `average`)
jaccard_lg = jaccard_similarity_score(y_test, yhat_lg)
print("LR Jaccard index: ", jaccard_lg)

# f1_score
f1_score_lg = f1_score(y_test, yhat_lg, average='weighted')
print("LR F1-score: ", f1_score_lg)

# logloss
logloss_lg = log_loss(y_test, yhat_lg_prob)
print("LR log loss: ", logloss_lg)

# # Report
# You should be able to report the accuracy of the built model using different evaluation metrics:

# | Algorithm          | Jaccard | F1-score | LogLoss |
# |--------------------|---------|----------|---------|
# | KNN                | 0.56302       | 0.547122        | NA      |
# | Decision Tree      | 0.56285       | 0.534773        | NA      |
# | LogisticRegression | 0.52435       | 0.509146        | 0.68563     |
Exemplo n.º 40
0
def computeloss_lib(p, labels):
    return metrics.log_loss(labels, p)
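
# For reference, a hand-rolled sketch of what metrics.log_loss computes for
# binary labels, with clipping to avoid log(0); this mirrors the definition,
# not the library's exact implementation:
import numpy as np
from sklearn import metrics

def computeloss_manual(p, labels, eps=1e-15):
    p = np.clip(np.asarray(p, dtype=float), eps, 1 - eps)
    labels = np.asarray(labels, dtype=float)
    return float(-np.mean(labels * np.log(p) + (1 - labels) * np.log(1 - p)))

_labels = np.array([0, 1, 1, 0])
_p = np.array([0.1, 0.8, 0.65, 0.3])
print(computeloss_manual(_p, _labels))   # agrees with the library value below
print(metrics.log_loss(_labels, _p))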
Exemplo n.º 41
0
 def calculateLoss(self, y_true, pred):
     """ This functions calculates the cross entropy for analytical purposes"""
     return log_loss(y_true, pred)
Exemplo n.º 42
0
def train_linear_classifier_model(learning_rate, steps, batch_size,
                                  training_examples, training_targets,
                                  validation_examples, validation_targets):
    """Trains a linear classification model.

    In addition to training, this function also prints training progress information,
    as well as a plot of the training and validation loss over time.

    Args:
      learning_rate: A `float`, the learning rate.
      steps: A non-zero `int`, the total number of training steps. A training step
        consists of a forward and backward pass using a single batch.
      batch_size: A non-zero `int`, the batch size.
      training_examples: A `DataFrame` containing one or more columns from
        `california_housing_dataframe` to use as input features for training.
      training_targets: A `DataFrame` containing exactly one column from
        `california_housing_dataframe` to use as target for training.
      validation_examples: A `DataFrame` containing one or more columns from
        `california_housing_dataframe` to use as input features for validation.
      validation_targets: A `DataFrame` containing exactly one column from
        `california_housing_dataframe` to use as target for validation.

    Returns:
      A `LinearClassifier` object trained on the training data.
    """

    periods = 10
    steps_per_period = steps / periods

    # Create a linear classifier object.
    my_optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
        my_optimizer, 5.0)
    linear_classifier = tf.estimator.LinearClassifier(
        feature_columns=construct_feature_columns(training_examples),
        optimizer=my_optimizer)

    # Create input functions.
    training_input_fn = lambda: my_input_fn(training_examples,
                                            training_targets[
                                                "median_house_value_is_high"],
                                            batch_size=batch_size)
    predict_training_input_fn = lambda: my_input_fn(
        training_examples,
        training_targets["median_house_value_is_high"],
        num_epochs=1,
        shuffle=False)
    predict_validation_input_fn = lambda: my_input_fn(
        validation_examples,
        validation_targets["median_house_value_is_high"],
        num_epochs=1,
        shuffle=False)

    # Train the model, but do so inside a loop so that we can periodically assess
    # loss metrics.
    print "Training model..."
    print "LogLoss (on training data):"
    training_log_losses = []
    validation_log_losses = []
    for period in range(0, periods):
        # Train the model, starting from the prior state.
        linear_classifier.train(input_fn=training_input_fn,
                                steps=steps_per_period)
        # Take a break and compute predictions.
        training_probabilities = linear_classifier.predict(
            input_fn=predict_training_input_fn)
        training_probabilities = np.array(
            [item['probabilities'] for item in training_probabilities])

        validation_probabilities = linear_classifier.predict(
            input_fn=predict_validation_input_fn)
        validation_probabilities = np.array(
            [item['probabilities'] for item in validation_probabilities])

        training_log_loss = metrics.log_loss(training_targets,
                                             training_probabilities)
        validation_log_loss = metrics.log_loss(validation_targets,
                                               validation_probabilities)
        # Occasionally print the current loss.
        print "  period %02d : %0.2f" % (period, training_log_loss)
        # Add the loss metrics from this period to our list.
        training_log_losses.append(training_log_loss)
        validation_log_losses.append(validation_log_loss)
    print "Model training finished."

    # Output a graph of loss metrics over periods.
    plt.ylabel("LogLoss")
    plt.xlabel("Periods")
    plt.title("LogLoss vs. Periods")
    plt.tight_layout()
    plt.plot(training_log_losses, label="training")
    plt.plot(validation_log_losses, label="validation")
    plt.legend()
    plt.show()

    return linear_classifier
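
# Side note, sketched with toy numbers: metrics.log_loss accepts either the
# full (n_samples, n_classes) probability matrix used above or just the
# positive-class column; for binary targets both give the same value.
import numpy as np
from sklearn import metrics

_y = np.array([0, 1, 1])
_probs = np.array([[0.8, 0.2], [0.3, 0.7], [0.4, 0.6]])
print(metrics.log_loss(_y, _probs))        # full probability matrix
print(metrics.log_loss(_y, _probs[:, 1]))  # positive-class column, same value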
Exemplo n.º 43
0
def run_cross_validation_train_models(train_data,
                                      train_target,
                                      model_struc,
                                      nfolds=10,
                                      nb_epoch=200):
    # input image dimensions
    batch_size = 600

    yfull_train = dict()
    # kf = KFold(n_folds=nfolds, shuffle=True, random_state=random_state)
    skf = StratifiedKFold(n_splits=nfolds, shuffle=True)
    num_fold = 0
    sum_score = 0
    models = []
    for train_index, test_index in skf.split(train_data, train_target):

        X_train = train_data[train_index]
        Y_train = train_target[train_index]
        X_valid = train_data[test_index]
        Y_valid = train_target[test_index]

        num_fold += 1
        print('-- Start KFold train number {} from {}'.format(
            num_fold, nfolds))
        print('---- Split train: ', len(X_train), len(Y_train))
        print('---- Split valid: ', len(X_valid), len(Y_valid))

        model = make_cnn(model_struc, train_data.shape[1:], verbose=False)
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        model.fit(X_train,
                  Y_train,
                  batch_size=batch_size,
                  epochs=nb_epoch,
                  shuffle=True,
                  verbose=1,
                  validation_data=(X_valid, Y_valid))

        predictions_valid, summary = DL_mcc(model, X_valid, Y_valid)

        score = log_loss(Y_valid, predictions_valid)
        print('-- Score log_loss: ', score)
        sum_score += score * len(test_index)

        # Store valid predictions
        for i in range(len(test_index)):
            yfull_train[test_index[i]] = predictions_valid[i]

        models.append(model)

    score = sum_score / len(train_data)
    print("-- Train: Log_loss independent avg: ", score)
    ytrue = train_target[list(yfull_train.keys())]
    pred = np.array(list(yfull_train.values()))
    binary_y = score_to_binary(pred.reshape(-1))
    # print(ytrue)
    summary = PNmetrics2(ytrue, binary_y)

    info_string = '-- loss_' + str(score) + '_folds_' + str(
        nfolds) + '_ep_' + str(nb_epoch)
    return (info_string, models, summary)
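
# The running total above weights each fold's log loss by the fold size; a tiny
# worked example of that average (numbers are made up):
fold_scores = [0.40, 0.55, 0.35]
fold_sizes = [100, 100, 50]
weighted = sum(s * n for s, n in zip(fold_scores, fold_sizes)) / sum(fold_sizes)
print(weighted)   # (40 + 55 + 17.5) / 250 = 0.45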
Exemplo n.º 44
0
    'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN'
]  # use only day-of-week and police district as input features for the classifiers
# The next line adds the hour attribute; just uncomment it to enable
# features = features + [x for x in range(0, 24)]  # add the hour of the crime as a feature
training, validation = train_test_split(trainData,
                                        test_size=.40)  # split into training (3/5) and test (2/5) sets
##################################################################
## Naive Bayes model, compute log_loss
model = BernoulliNB()
nbStart = time.time()
model.fit(training[features], training['crime'])  # this is fast
nbCostTime = time.time() - nbStart
predicted = np.array(model.predict_proba(validation[features]))
print("Naive Bayes modeling took %f seconds" % (nbCostTime))
print("Naive Bayes log loss: %f" %
      (log_loss(validation['crime'],
                predicted)))  # 2.617892 / 2.582167; the latter is with hour added to the features above
##################################################################
## Logistic regression model, compute log_loss
model = LogisticRegression(C=.01)
lrStart = time.time()
model.fit(training[features], training['crime'])  # this one is much slower
lrCostTime = time.time() - lrStart
predicted = np.array(model.predict_proba(validation[features]))
print("Logistic regression modeling took %f seconds" % (lrCostTime))  # close to 2 min
print(
    "Logistic regression log loss: %f" %
    (log_loss(validation['crime'],
              predicted)))  # 2.624773 / 2.592119; still not as good as NB; the latter is with hour added

# With these three categorical features, Naive Bayes still holds an edge over logistic regression (lower log loss) while training in a fraction of the time, which means that although the model is simple, it remains powerful
# Incidentally, the Naive Bayes model trained in 1.13s already lands in the top 35% of the Kaggle leaderboard; with some tuning,
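
# Sketch of how indicator features like the weekday/district columns above are
# typically built with pd.get_dummies (the column names follow the SF crime
# dataset used here; the two rows are illustrative):
import pandas as pd

_df = pd.DataFrame({'DayOfWeek': ['Monday', 'Friday'],
                    'PdDistrict': ['BAYVIEW', 'TARAVAL']})
_features = pd.concat([pd.get_dummies(_df['DayOfWeek']),
                       pd.get_dummies(_df['PdDistrict'])], axis=1)
print(_features.columns.tolist())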
Exemplo n.º 45
0
for i,img_id in tqdm(enumerate(labels['id'])):
    img = read_img(img_id,'train',(INPUT_SIZE,INPUT_SIZE))
    x = xception.preprocess_input(np.expand_dims(img.copy(),axis=0))
    x_train[i] = x
print 'Train Image shape: {} size: {:,}'.format(x_train.shape,x_train.size)

x_test = np.zeros((len(sample_submission),INPUT_SIZE,INPUT_SIZE,3),dtype='float32')
for i,img_id in tqdm(enumerate(sample_submission['id'])):
    img = read_img(img_id,'test',(INPUT_SIZE,INPUT_SIZE))
    x = xception.preprocess_input(np.expand_dims(img.copy(),axis=0))
    x_test[i] = x
print 'Test Image shape: {} size: {:,}'.format(x_test.shape,x_test.size)

print x_train.shape
xception_bottleneck = xception.Xception(weights='imagenet',include_top=False,pooling=POOLING)
train_x_bf = xception_bottleneck.predict(x_train,batch_size=32,verbose=1)
test_x_bf = xception_bottleneck.predict(x_test,batch_size=32,verbose=1)
print 'Xception train bottleneck features shape: {} size: {:,}'.format(train_x_bf.shape,train_x_bf.size)

logreg = LogisticRegression(multi_class='multinomial',solver='lbfgs',random_state=SEED)
logreg.fit(train_x_bf,(y_train*range(NUM_CLASSES)).sum(axis=1))
train_probs = logreg.predict_proba(train_x_bf)
train_preds = logreg.predict(train_x_bf)
print 'Xception train loss: {}'.format(log_loss(y_train,train_probs))
print 'Xception train accuracy: {}'.format(accuracy_score((y_train*range(NUM_CLASSES)).sum(axis=1),train_preds))

test_probs = logreg.predict_proba(test_x_bf)
test_preds = logreg.predict(test_x_bf)

result = pd.DataFrame(data=test_probs,index=sample_submission.id,columns=unique_breeds,dtype='float32',copy=True)
result.to_csv('my_submission.csv')
Exemplo n.º 46
0
def gbdt_lr_predict(data, category_feature, continuous_feature):  # 0.43616
    # one-hot encode the categorical features
    print('Starting one-hot encoding...')
    for col in category_feature:
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    print('One-hot encoding done')

    train = data[data['flag'] != -1]
    target = train.pop('flag')
    test = data[data['flag'] == -1]
    test.drop(['flag'], axis=1, inplace=True)

    # split the dataset
    print('Splitting the dataset...')
    x_train, x_val, y_train, y_val = train_test_split(train,
                                                      target,
                                                      test_size=0.2,
                                                      random_state=2018)

    print('Training GBDT...')
    gbm = lgb.LGBMRegressor(
        objective='binary',
        subsample=0.8,
        min_child_weight=0.5,
        colsample_bytree=0.7,
        num_leaves=100,
        max_depth=12,
        learning_rate=0.05,
        n_estimators=10,
    )

    gbm.fit(
        x_train,
        y_train,
        eval_set=[(x_train, y_train), (x_val, y_val)],
        eval_names=['train', 'val'],
        eval_metric='binary_logloss',
        # early_stopping_rounds = 100,
    )
    model = gbm.booster_
    print('Extracting leaf indices from the trained trees')
    gbdt_feats_train = model.predict(train, pred_leaf=True)
    gbdt_feats_test = model.predict(test, pred_leaf=True)
    gbdt_feats_name = [
        'gbdt_leaf_' + str(i) for i in range(gbdt_feats_train.shape[1])
    ]
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train,
                                       columns=gbdt_feats_name)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns=gbdt_feats_name)

    print('Building the new dataset...')
    train = pd.concat([train, df_train_gbdt_feats], axis=1)
    test = pd.concat([test, df_test_gbdt_feats], axis=1)
    train_len = train.shape[0]
    data = pd.concat([train, test])
    del train
    del test
    gc.collect()

    # # normalize continuous features
    # print('Starting normalization...')
    # scaler = MinMaxScaler()
    # for col in continuous_feature:
    #     data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))
    # print('Normalization done')

    # one-hot encode the leaf indices
    print('Starting one-hot encoding...')
    for col in gbdt_feats_name:
        print('this is feature:', col)
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    print('One-hot encoding done')

    train = data[:train_len]
    test = data[train_len:]
    del data
    gc.collect()

    x_train, x_val, y_train, y_val = train_test_split(train,
                                                      target,
                                                      test_size=0.3,
                                                      random_state=2018)
    # lr
    print('Training LR...')
    lr = LogisticRegression()

    #lbl = preprocessing.LabelEncoder()
    #x_train['hav_car_grp_ind'] = lbl.fit_transform(x_train['hav_car_grp_ind'].astype(str))
    #x_train['hav_hou_grp_ind'] = lbl.fit_transform(x_train['hav_hou_grp_ind'].astype(str))
    #x_train['job_year'] = lbl.fit_transform(x_train['job_year'].astype(str))
    lr.fit(x_train, y_train)
    tr_logloss = log_loss(y_train, lr.predict_proba(x_train)[:, 1])
    print('tr-logloss: ', tr_logloss)
    val_logloss = log_loss(y_val, lr.predict_proba(x_val)[:, 1])
    print('val-logloss: ', val_logloss)
    print('Predicting...')
    y_pred = lr.predict_proba(test)[:, 1]
    print('Writing results...')
    for i in y_pred:
        print(i)
    #res = pd.read_csv('data/test.csv')
    #submission = pd.DataFrame({'id': res['id'], 'flag': y_pred})
    #submission.to_csv('submission/submission_gbdt+lr_trlogloss_%s_vallogloss_%s.csv' % (tr_logloss, val_logloss), index = False)
    print('Done')
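
# A more compact sketch of the leaf-index one-hot step above, using sklearn's
# OneHotEncoder on the pred_leaf output instead of per-column get_dummies
# (the arrays below are illustrative):
import numpy as np
from sklearn.preprocessing import OneHotEncoder

_leaves = np.array([[3, 7], [1, 7], [3, 2]])   # shape (n_samples, n_trees)
_enc = OneHotEncoder(handle_unknown='ignore')
_X_lr = _enc.fit_transform(_leaves)            # sparse one-hot features for the LR
print(_X_lr.shape)                             # (3, 4) for these toy leaves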
Exemplo n.º 47
0
 def evaluate(self, feat_index, feat_val, label):
     y_pred = self.predict(feat_index, feat_val)
     if self.metric_type == 'auc':
         return roc_auc_score(label, y_pred)
     elif self.metric_type == 'logloss':
         return log_loss(label, y_pred)
def check_all_variants(
    start_config={
        "dataset":
        "Superstore",
        "file_name_list": [
            "Superstore", "Superstore_train", "Superstore_test",
            "Superstore_valid"
        ],
        "col_name_list": ["Kundenname", "Produktname"],
        "show_progress":
        False,
        "use_user_info":
        False,
        "user_features": ["Segment", "Kategorie"],
        "user_features_prep_types": ["one_hot", "one_hot"],
        "n_info_cols":
        0,
        "approach":
        "binary",
        "model_type":
        "complement",
        "count":
        True,
        "split_ratio": [0.7, 0.2, 0.1],
        "split":
        "clients",
        "alpha":
        1.0,
        "fit_prior":
        True,
        "train_batch_size":
        5000,
        "pred_batch_size":
        5000,
        "n_Produkte":
        1915,
        "n_Kunden":
        784,
        "info_string":
        "",
        "fit_set":
        "train",
        "pred_set":
        "test",
    },
    param_dict={
        "approach": ["multi", "binary"],
        "model_type": ["multinomial", "complement", "bernoulli"],
        "split": ["clients", "orders"],
        "use_user_info": [False, True],
        "count": [True, False],
        "alpha": [1.0, 0.9, 0.8, 0.7],
        "fit_prior": [True, False],
        "user_features": [["Segment"], ["Kategorie"], ["Segment",
                                                       "Kategorie"]],
        "user_features_prep_types": [["one_hot"], ["one_hot"],
                                     ["one_hot", "one_hot"]]
    }):
    top_n_list = [10, 20, 50, 100, 200, 500]

    full_out_dict = {
        "approach": [],
        "model_type": [],
        "split": [],
        "use_user_info": [],
        "threshold": [],
        "count": [],
        "info_str": [],
        "filename": [],
        "mse": [],
        "neg_log_loss": [],
        "Accuracy": [],
        "Precision": [],
        "Recall": [],
        "F1": [],
        "tn": [],
        "fp": [],
        "fn": [],
        "tp": []
    }

    for top_n in top_n_list:
        full_out_dict["top_" + str(top_n) + "_score"] = []

    dataset = start_config["dataset"]
    #configure.do(dataset)

    with open(dataset + "/json_files/config.json", "w") as fp:
        json.dump(start_config, fp, indent=5)

    for approach in param_dict["approach"]:
        for model_type in param_dict["model_type"]:
            for split in param_dict["split"]:
                for count in param_dict["count"]:
                    for fit_prior in param_dict["fit_prior"]:
                        for alpha in param_dict["alpha"]:
                            for use_user_info in param_dict["use_user_info"]:
                                for user_features, user_features_prep_types in zip(
                                        param_dict["user_features"],
                                        param_dict["user_features_prep_types"]
                                ):

                                    if not use_user_info and not user_features == param_dict[
                                            "user_features"][-1]:
                                        continue

                                    update_config(dataset, approach,
                                                  model_type, split,
                                                  use_user_info, user_features,
                                                  user_features_prep_types,
                                                  count, fit_prior, alpha)

                                    if use_user_info:
                                        info_str = str(user_features)
                                    else:
                                        info_str = ""
                                    print()
                                    print("Precess with new config:")
                                    print("approach", "model_type", "split",
                                          "use_user_info", "info_str", "count",
                                          "fit_prior", "alpha")
                                    print(approach, model_type, split,
                                          use_user_info, info_str, count,
                                          fit_prior, alpha)
                                    print()

                                    serve_data.do(dataset)
                                    NaiveBayes.do(dataset)

                                    with open(
                                            dataset +
                                            "/json_files/config.json",
                                            "r") as fp:
                                        config = json.load(fp)

                                    title = dataset + "_predictions_" + \
                                            "fit" + config["fit_set"] + \
                                            "_pred" + config["pred_set"] + \
                                            "_" + config["model_type"] + \
                                            "_approach" + str(config["approach"]) + \
                                            "_split" + config["split"] + \
                                            "_count" + str(config["count"]) + \
                                            "_info" + str(config["use_user_info"]) + config["info_string"]

                                    pred_file = dataset + "/npy_files/" + title + ".npy"

                                    if split == "orders":
                                        KPM = np.sign(
                                            np.load(dataset +
                                                    "/npy_files/test_KPM.npy"))
                                    elif split == "clients":
                                        KPM = np.sign(
                                            np.load(dataset +
                                                    "/npy_files/full_KPM.npy")
                                            [np.load(
                                                dataset +
                                                "/npy_files/test_index.npy")])

                                    if approach == "binary":
                                        threshold = 0.5
                                    elif approach == "multi":
                                        threshold = 1 / config["n_Produkte"]

                                    n_orders = np.sum(KPM, axis=None)
                                    predictions = np.load(pred_file)
                                    y_prop = predictions.flatten()
                                    y_soll = KPM.flatten()
                                    y_pred = y_prop > threshold

                                    top_n_score_list = []
                                    for top_n in top_n_list:
                                        n_hits = 0
                                        for client_index in range(
                                                len(predictions)):
                                            bought_items = np.argwhere(
                                                KPM[client_index] == 1)[:, 0]
                                            for item_index in bought_items:
                                                if item_index in np.array(
                                                        sorted(zip(
                                                            predictions[
                                                                client_index],
                                                            np.arange(
                                                                len(predictions[
                                                                    client_index]
                                                                    ))),
                                                               reverse=True)
                                                )[:, 1][:top_n]:
                                                    n_hits += 1
                                        top_n_score_list.append(n_hits /
                                                                n_orders)
                                    cmat = metrics.confusion_matrix(
                                        y_soll, y_pred)
                                    [[tn, fp], [fn, tp]] = cmat
                                    out_dict = {
                                        "filename":
                                        str(pred_file),
                                        "mse":
                                        float(
                                            metrics.mean_squared_error(
                                                y_soll, y_prop)),
                                        "neg_log_loss":
                                        float(metrics.log_loss(y_soll,
                                                               y_prop)),
                                        "Accuracy":
                                        float(
                                            metrics.accuracy_score(
                                                y_soll, y_pred)),
                                        "Precision":
                                        float(
                                            metrics.precision_score(
                                                y_soll, y_pred)),
                                        "Recall":
                                        float(
                                            metrics.recall_score(
                                                y_soll, y_pred)),
                                        "F1":
                                        float(metrics.f1_score(y_soll,
                                                               y_pred)),
                                        "tn":
                                        int(tn),
                                        "fp":
                                        int(fp),
                                        "fn":
                                        int(fn),
                                        "tp":
                                        int(tp)
                                    }

                                    for top_n, score in zip(
                                            top_n_list, top_n_score_list):
                                        full_out_dict["top_" + str(top_n) +
                                                      "_score"].append(
                                                          float(score))

                                    print(pred_file + ":")
                                    print("MSE", out_dict["mse"])
                                    print("neg_log_loss",
                                          out_dict["neg_log_loss"])
                                    print("Accuracy", out_dict["Accuracy"])
                                    print("Precision", out_dict["Precision"])
                                    print("Recall", out_dict["Recall"])
                                    print("F1", out_dict["F1"])

                                    print("Confusion Matrix (tn,fp,fn,tp)")
                                    print(cmat)

                                    for top_n, score in zip(
                                            top_n_list, top_n_score_list):
                                        print(
                                            str(score * 100) +
                                            "%\tof completed purchases are in the top",
                                            top_n, "product recommendations")

                                    full_out_dict["filename"].append(
                                        str(pred_file))
                                    full_out_dict["approach"].append(
                                        str(approach))
                                    full_out_dict["model_type"].append(
                                        str(model_type))
                                    full_out_dict["split"].append(str(split))
                                    full_out_dict["count"].append(str(count))
                                    full_out_dict["use_user_info"].append(
                                        str(use_user_info))
                                    full_out_dict["threshold"].append(
                                        float(threshold))
                                    full_out_dict["info_str"].append(
                                        str(info_str))
                                    full_out_dict["fit_prior"].append(
                                        fit_prior)
                                    full_out_dict["alpha"].append(float(alpha))
                                    full_out_dict["mse"].append(
                                        float(out_dict["mse"]))
                                    full_out_dict["neg_log_loss"].append(
                                        float(out_dict["neg_log_loss"]))
                                    full_out_dict["Accuracy"].append(
                                        float(out_dict["Accuracy"]))
                                    full_out_dict["Precision"].append(
                                        float(out_dict["Precision"]))
                                    full_out_dict["Recall"].append(
                                        float(out_dict["Recall"]))
                                    full_out_dict["F1"].append(
                                        float(out_dict["F1"]))
                                    full_out_dict["tn"].append(
                                        int(out_dict["tn"]))
                                    full_out_dict["fp"].append(
                                        int(out_dict["fp"]))
                                    full_out_dict["fn"].append(
                                        int(out_dict["fn"]))
                                    full_out_dict["tp"].append(
                                        int(out_dict["tp"]))

                                    pd.DataFrame(full_out_dict).to_csv(
                                        dataset +
                                        "/csv_files/variant_check.csv",
                                        index_label="row_index",
                                        sep=";")
                                    print("-" * 100)
    def eval_model(cls,
                   master_gpu_id,
                   model,
                   eval_dataset,
                   eval_batch_size=1,
                   use_cuda=False,
                   num_workers=1):
        model.eval()

        eval_dataloader = DataLoader(dataset=eval_dataset,
                                     pin_memory=use_cuda,
                                     batch_size=eval_batch_size,
                                     num_workers=num_workers,
                                     shuffle=False)

        predicted_probs = []
        true_labels = []

        batch_count = 1
        for batch in tqdm(eval_dataloader,
                          unit="batch",
                          ncols=100,
                          desc="Evaluating process: "):
            labels = batch["label"].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch["label"]

            tokens = batch["tokens"].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch["tokens"]
            segment_ids = batch["segment_ids"].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch[
                "segment_ids"]
            attention_mask = batch["attention_mask"].cuda(
                master_gpu_id
            ) if use_cuda and master_gpu_id is not None else batch[
                "attention_mask"]

            with torch.no_grad():
                main_output, asr_output = model(tokens, segment_ids,
                                                attention_mask)

                # convert the model output to a list
                main_output = torch.softmax(main_output, dim=1).cpu().tolist()
                # take the positive-class probabilities
                prob = np.array(main_output)[:, 1]
                # append this batch's positive-class predictions to the global list
                predicted_probs.extend(prob.tolist())

                # append this batch's true labels to the global list
                true_labels.extend(labels.tolist())

                LoggerHelper.info("Batch: " + str(batch_count))
                batch_count += 1

        predicted_probs = [round(prob, 2) for prob in predicted_probs]
        precision, recall, _thresholds = precision_recall_curve(
            true_labels, predicted_probs)
        auc = roc_auc_score(true_labels, predicted_probs)
        logloss = log_loss(true_labels, predicted_probs)
        for i in range(len(_thresholds)):
            # F1 as the harmonic mean of precision and recall; sklearn's
            # f1_score expects label arrays, not two scalars
            f1 = 2 * precision[i] * recall[i] / (precision[i] + recall[i] + 1e-12)
            log_str_th = 'VAL => Thresholds: {0:>2}, Precision: {1:>7.2%}, Recall: {2:>7.2%}, F1: {3:>7.2%}'.format(
                _thresholds[i], precision[i], recall[i], f1)
            LoggerHelper.info(log_str_th)

        LoggerHelper.info("AUC: " + str(auc))
        LoggerHelper.info("Logloss: " + str(logloss))

        return
from sklearn.metrics import accuracy_score 
print ("Accuracy : ", accuracy_score(y_test,predictions)*100)

#MAE L1 loss function - Should be close to 0
from sklearn.metrics import mean_absolute_error  
mean_absolute_error(y_test,predictions) #y_target, y_pred

#MSE L2 loss function - Should be close to 0
from sklearn.metrics import mean_squared_error  
mean_squared_error(y_test,predictions) #y_target, y_pred

# Log Loss  - Should be close to 0 - Only for classification models
# (log_loss expects predicted probabilities, e.g. from predict_proba, rather than hard class labels)
from sklearn.metrics import log_loss
log_loss(y_test,predictions)

# Get ROC curve for Logistic Regression

get_roc(y_test,predictions)

get_prec_recall(y_test,predictions)

"""Logistic Regression Model evaluation based on K-fold cross-validation using cross_validate() function"""

from sklearn.model_selection import cross_validate

scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

results = cross_validate(logmodel, X, y, cv=10, scoring=list(scoring.values()), 
                         return_train_score=False)
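
# Sketch of reading the result: when `scoring` is passed as a list of scorer
# names, cross_validate keys each score array as 'test_<scorer>' (this reuses
# the `scoring` and `results` objects defined just above):
import numpy as np

for name, scorer in scoring.items():
    print(name, np.mean(results['test_' + scorer]))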
Exemplo n.º 51
0
        # optimizer = Nadam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
        # optimizer = Lookahead(optimizer=optimizer, k=10, alpha=0.5)
        scheduler = None
        # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
        #                                                 max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(train_loader))
        # loss_fn = nn.BCEWithLogitsLoss()
        loss_fn = SmoothBCEwLogits(smoothing=0.005)

        model_weights = f"{CACHE_PATH}/online_model{_fold}.pth"
        es = EarlyStopping(patience=EARLYSTOP_NUM, mode="max")
        for epoch in range(EPOCHS):
            train_loss = train_fn(model, optimizer, scheduler, loss_fn, train_loader, device)

            valid_pred = inference_fn(model, valid_loader, device)
            valid_auc = roc_auc_score(valid[target_cols].values, valid_pred)
            valid_logloss = log_loss(valid[target_cols].values, valid_pred)
            valid_pred = np.median(valid_pred, axis=1)
            valid_pred = np.where(valid_pred >= 0.5, 1, 0).astype(int)
            valid_u_score = utility_score_bincount(date=valid.date.values, weight=valid.weight.values,
                                                   resp=valid.resp.values, action=valid_pred)
            print(f"FOLD{_fold} EPOCH:{epoch:3} train_loss={train_loss:.5f} "
                      f"valid_u_score={valid_u_score:.5f} valid_auc={valid_auc:.5f} "
                      f"time: {(time.time() - start_time) / 60:.2f}min")
            es(valid_auc, model, model_path=model_weights)  # should probably stop on train_loss or u_score rather than AUC
            if es.early_stop:
                print("Early stopping")
                break
        # torch.save(model.state_dict(), model_weights)
    if True:
        valid_pred = np.zeros((len(valid), len(target_cols)))
        for _fold in range(NFOLDS):
Exemplo n.º 52
0
def compare_models(X_train, y_train) :
    # Split between train and cross-validation sets
    X_train, X_cv, y_train, y_cv = train_test_split( X_train, y_train, test_size=0.2, random_state=4)
    
    ### KNN
    K = 10
    accuracy = np.zeros((K-1))
    
    for n in range(1,K):
        #Train model
        KNN = KNeighborsClassifier(n_neighbors = n)
        KNN.fit(X_train,y_train)
        
        # Predict
        yhat_KNN = KNN.predict(X_cv)
        accuracy[n-1] = metrics.accuracy_score(y_cv, yhat_KNN)
    
    # Display results
#    plt.plot(range(1,K),accuracy,'g')
#    plt.legend('Accuracy')
#    plt.ylabel('Accuracy ')
#    plt.xlabel('Number of Neighbours (K)')
#    plt.tight_layout()
#    plt.show()
#    print( "KNearestNeighbour's accuracy (with k =", accuracy.argmax()+1, ") :", accuracy.max())
    
    # Train model with the best k
    k_KNN = accuracy.argmax()+1
    KNN = KNeighborsClassifier(n_neighbors = k_KNN)
    KNN.fit(X_train,y_train)
    yhat_KNN = KNN.predict(X_test)
    Jaccard_KNN = metrics.jaccard_score(y_test, yhat_KNN, pos_label='PAIDOFF')
    F1Score_KNN = f1_score(y_test, yhat_KNN, average='weighted')
    KNN_validity = sum(yhat_KNN != 'PAIDOFF')/len(yhat_KNN)
    if KNN_validity < 0.1 :
        KNN_validity = False
    else :
        KNN_validity = True
    print("KNN\n", (classification_report(y_test, yhat_KNN)))
    
    
    
    ### Decision Tree
    # Train model
    K = 20
    accuracy = np.zeros((K-3))
    
    for depth in range(3,K) :
        # Train model
        LoanTree = DecisionTreeClassifier(criterion="entropy", max_depth = depth)
        LoanTree.fit(X_train,y_train)
        
        # Predict
        yhat_Tree = LoanTree.predict(X_cv)
        accuracy[depth-3] = metrics.accuracy_score(y_cv, yhat_Tree)
    
    # Display results
#    plt.plot(range(3,K),accuracy,'g')
#    plt.legend('Accuracy')
#    plt.ylabel('Accuracy ')
#    plt.xlabel('Max depth')
#    plt.tight_layout()
#    plt.show()
#    print( "Decision Tree's accuracy (with max depth =", accuracy.argmax()+3, ") :", accuracy.max())
    
    # Train model with the best max_depth
    max_depth = accuracy.argmax()+3  # depths in the loop above start at 3
    LoanTree = DecisionTreeClassifier(criterion="entropy", max_depth = max_depth)
    LoanTree.fit(X_train,y_train)
    yhat_Tree = LoanTree.predict(X_test)
    Jaccard_Tree = metrics.jaccard_score(y_test, yhat_Tree, pos_label='PAIDOFF')
    F1Score_Tree = f1_score(y_test, yhat_Tree, average='weighted')
    Tree_validity = sum(yhat_Tree != 'PAIDOFF')/len(yhat_Tree)
    if Tree_validity < 0.1 :
        Tree_validity = False
    else :
        Tree_validity = True
    print("Decision Tree\n", classification_report(y_test, yhat_Tree))
    
    
    
    ### SVM
    # Train model
    SVM = svm.SVC()
    param_grid_SVM = [{'C': [0.01, 0.1, 0.3, 1, 10], 'gamma': [0.001], 'kernel': ['linear', 'rbf', 'sigmoid']}]
    
    gridSVM=GridSearchCV(SVM, param_grid=param_grid_SVM, cv=5)
    gridSVM.fit(X_train,y_train)
    #print('Accuracy :', gridSVM.best_score_)
    SVM_params = gridSVM.best_params_
    #print('Best parameters :', gridSVM.best_params_)
    
    # Train model with best parameters
    SVM = svm.SVC(C = SVM_params['C'], gamma = SVM_params['gamma'], kernel = SVM_params['kernel'])
    SVM.fit(X_train, y_train)
    yhat_SVM = SVM.predict(X_test)
    Jaccard_SVM = jaccard_score(y_test, yhat_SVM, pos_label='PAIDOFF')
    F1Score_SVM = f1_score(y_test, yhat_SVM, average='weighted')
    SVM_validity = sum(yhat_SVM != 'PAIDOFF')/len(yhat_SVM)
    if SVM_validity < 0.1 :
        SVM_validity = False
    else :
        SVM_validity = True
    print("SVM\n", classification_report(y_test, yhat_SVM))
    
    
    
    ### Logistic regression
    LR = LogisticRegression(C=0.01, solver='liblinear')
    LR.fit(X_train,y_train)
    
    reg_parameter = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1]
    accuracy = np.zeros(len(reg_parameter))
    
    for i,c in enumerate(reg_parameter) :
        # Train model
        LR = LogisticRegression(C = c, solver='liblinear')
        LR.fit(X_train,y_train)
        
        # Predict
        yhat_LR = LR.predict(X_cv)
        accuracy[i] = metrics.accuracy_score(y_cv, yhat_LR)
    
    # Display results
#    plt.semilogx(reg_parameter,accuracy,'g')
#    plt.legend('Accuracy')
#    plt.ylabel('Accuracy ')
#    plt.xlabel('C')
#    plt.tight_layout()
#    plt.show()
#    print( "Logistic regression's accuracy (with C =", reg_parameter[accuracy.argmax()], ") :", accuracy.max())
    
    # Train model with the best C
    C = reg_parameter[accuracy.argmax()]
    LR = LogisticRegression(C = C, solver='liblinear')
    LR.fit(X_train,y_train)
    yhat_LR = LR.predict(X_test)
    yhat_prob = LR.predict_proba(X_test)
    Jaccard_LR = jaccard_score(y_test, yhat_LR, pos_label='PAIDOFF')
    F1Score_LR = f1_score(y_test, yhat_LR, average='weighted')
    Log_LR = log_loss(y_test, yhat_prob)
    LR_validity = sum(yhat_LR != 'PAIDOFF')/len(yhat_LR)
    if LR_validity < 0.1 :
        LR_validity = False
    else :
        LR_validity = True
    print("Logistic regression\n", classification_report(y_test, yhat_LR))
    
    
    
    #####################
    ### Final results ###
    #####################
    Table = pd.DataFrame()
    Table['Algorithm'] = ["KNN", "Decision Tree", "SVM", "Logistic Regression"]
    Table['Jaccard'] = [Jaccard_KNN, Jaccard_Tree, Jaccard_SVM, Jaccard_LR]
    Table['F1-score'] = [F1Score_KNN, F1Score_Tree, F1Score_SVM, F1Score_LR]
    Table['Log Loss'] = ["NA", "NA", "NA", Log_LR]
    Table['Valid'] = [KNN_validity, Tree_validity, SVM_validity, LR_validity]
    
    return print("Results table\n", Table.head())
    # offline scoring
    pre_score = model.predict(test_all, num_iteration=model.best_iteration)
    #pre_score = model.predict(test_all)
    score_data = test_all.copy()
    score_data['label'] = pre_score
    score_data = score_data[["order_id", "product_id", "label"]].copy()
    # save these predictions for later stacking
    if st == 0:
        stacking_data = score_data
    else:
        stacking_data = pd.concat([stacking_data, score_data])  # DataFrame.append was removed in pandas 2.0
    st += 1
    #test_all[:10000].to_csv("save_little_2_%s.csv"%st)
    #pd.DataFrame(y_test).to_csv("save_y_2_%s.csv"%st)
    logloss = log_loss(y_test, pre_score)
    logloss_list.append(logloss)
    print(logloss_list)

    pred = model.predict(test_data, num_iteration=model.best_iteration)
    preds[:, j] = pred
    j += 1

    del model
    gc.collect()

stacking_data.to_csv("stacking_data_v12_shuff_10000.csv", index=None)

with open("score_note.txt", "a") as f:
    f.write(
        str(train_x.shape[1]) + "\n" + str(score_list) + "=====>" +
    def train(self,
              data,
              labels,
              epochs=30,
              cv_split_num=None,
              validation_data=None,
              savebest=False,
              filepath=None):
        """
        train network on given data

        parameters:
          - data: numpy array
            2d numpy array (doc x word ids) of input data
          - labels: numpy array
            2d numpy array of one-hot-encoded labels
          - epochs: int (default: 30)
            number of epochs to train for
          - cv_split_num: int (optional)
            index of the current cross-validation split, used when logging results
          - validation_data: tuple (optional)
            tuple of numpy arrays (X,y) representing validation data
          - savebest: boolean (default: False)
            set to True to save the best model based on validation score per epoch
          - filepath: string (optional)
            path to save model if savebest is set to True

        outputs:
            None
        """
        if savebest and filepath is None:
            raise Exception("Please enter a path to save the network")

        if validation_data:
            validation_size = len(validation_data[0])
        else:
            validation_size = len(data)

        print('training network on %i documents, validating on %i documents' \
              % (len(data), validation_size))

        # Removing: with self.sess as sess:
        #with self.sess as sess:
        # Output directory for models and summaries
        timestamp = str(int(time.time()))

        # Track best model for saving.
        prevbest = 0
        for i in range(epochs):
            # TODO FEATURE Add gathering of stats for confusion matrix.
            correct = 0
            y_pred = []
            y_true = []
            start = time.time()

            # Train.
            counter = 0
            for doc in range(len(data)):
                counter += 1

                inputval = self._list_to_numpy(data[doc])
                feed_dict = {
                    self.doc_input: inputval,
                    self.labels: labels[doc],
                    self.dropout: self.dropout_keep
                }
                pred, cost, _ = self.sess.run(
                    [self.prediction, self.loss, self.optimizer],
                    feed_dict=feed_dict)

                # Collect raw stats for calculating metrics.
                if np.argmax(pred) == np.argmax(labels[doc]):
                    correct += 1

                # Collect predictions for calculating metrics with sklearn.
                # Build array of y_pred.
                # Insert each prediction at the same index of its label
                # in the y_true array.
                y_pred.insert(doc, np.argmax(pred))
                y_true.insert(doc, np.argmax(labels[doc]))

                sys.stdout.write(
                    "epoch %i, sample %i of %i, loss: %f      \r" %
                    (i + 1, doc + 1, len(data), cost))
                sys.stdout.flush()

                if (doc + 1) % 50000 == 0:
                    score = self.score(validation_data[0], validation_data[1])
                    print("iteration %i validation accuracy: %.4f%%" %
                          (doc + 1, score * 100))

            print()
            # print("training time: %.2f" % (time.time()-start))
            trainscore = correct / len(data)
            print("epoch %i (Gao's) training accuracy: %.4f" %
                  (i + 1, trainscore))

            # Log metrics per epoch.
            # TODO Print a clean, well-organized report.
            # logging.debug(print(...)) logged None; pass values to the logger directly
            logging.debug('correct: %s', correct)
            logging.debug('total: %s', counter)
            logging.debug(confusion_matrix(y_true, y_pred))

            # Get results from confusion matrix.
            conf_matrix_arr = confusion_matrix(y_true, y_pred)
            TP = conf_matrix_arr[1][1]
            FP = conf_matrix_arr[0][1]
            TN = conf_matrix_arr[0][0]
            FN = conf_matrix_arr[1][0]

            logging.debug(classification_report(y_true, y_pred))
            logging.debug('accuracy: %s', accuracy_score(y_true, y_pred))
            logging.debug('precision: %s', precision_score(y_true, y_pred))
            logging.debug('recall: %s', recall_score(y_true, y_pred))
            logging.debug('f1: %s', f1_score(y_true, y_pred))
            logging.debug('log loss: %s', log_loss(y_true, y_pred))

            # Validate.
            if validation_data:
                score = self.score(validation_data[0], validation_data[1])
                print("epoch %i validation accuracy: %.4f%%" % (i + 1, score))

                # Write results to file.

                results_dir = Path('results') / str(self.run_id)
                # Create directory for run within results directory.
                try:
                    os.makedirs(results_dir)
                except FileExistsError:
                    logging.info('Run directory already exists.')

                # Build path to which to write results.
                results_filename = str(self.run_id) + '.csv'
                results_path = results_dir / results_filename

                # Check for existence of csv to determine if header is needed.
                results_file_exists = os.path.isfile(results_path)

                # Open file, write/append results.
                with open(str(results_path), mode='a') as csv_file:
                    fieldnames = [
                        'cv_split', 'epoch', 'num_recs', 'tp', 'fp', 'tn',
                        'fn', 'skl_acc', 'skl_prec', 'skl_recall', 'skl_f1',
                        'skl_f1_micro_avg', 'skl_f1_macro_avg',
                        'skl_f1_weighted_avg', 'skl_log_loss', 'skl_auc',
                        'gao_train_acc', 'gao_val_acc'
                    ]
                    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

                    # Write header only if results csv did not exist at beginning
                    # of this trip through.
                    if not results_file_exists:
                        writer.writeheader()

                    # Write row for each epoch.
                    csv_row = {
                        'cv_split':
                        cv_split_num,
                        'epoch':
                        i + 1,
                        'num_recs':
                        counter,
                        'tp':
                        TP,
                        'fp':
                        FP,
                        'tn':
                        TN,
                        'fn':
                        FN,
                        'skl_acc':
                        accuracy_score(y_true, y_pred),
                        'skl_prec':
                        precision_score(y_true, y_pred),
                        'skl_recall':
                        recall_score(y_true, y_pred),
                        'skl_f1':
                        f1_score(y_true, y_pred),
                        'skl_f1_micro_avg':
                        f1_score(y_true, y_pred, average='micro'),
                        'skl_f1_macro_avg':
                        f1_score(y_true, y_pred, average='macro'),
                        'skl_f1_weighted_avg':
                        f1_score(y_true, y_pred, average='weighted'),
                        'skl_log_loss':
                        log_loss(y_true, y_pred),
                        'skl_auc':
                        roc_auc_score(y_true, y_pred),
                        'gao_train_acc':
                        trainscore,
                        'gao_val_acc':
                        score
                    }
                    writer.writerow(csv_row)

                # Plot ROC Curve and AUC score for last epoch.
                if i == epochs - 1:
                    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
                    auc = roc_auc_score(y_true, y_pred)
                    plt.clf()
                    plt.plot(fpr, tpr, 'r-', label='CNN: %.3f' % auc)
                    plt.plot([0, 1], [0, 1], 'k-', label='Random')
                    plt.plot([0, 0, 1, 1], [0, 1, 1, 1], 'g-', label='Perfect')
                    plt.legend()
                    plt.xlabel('False Positive Rate')
                    plt.ylabel('True Positive Rate')
                    fig_filename = 'ROC_CV_Split_' + str(cv_split_num)
                    fig_path = results_dir / fig_filename
                    plt.savefig(fig_path)

            # Save if performance better than previous best.
            if savebest and score >= prevbest:
                prevbest = score
                self.save(filepath)
def train_linear_classifier(learning_rate, steps, batch_size, df_train,
                            df_validate, features, target, threshold):
    """Trains a dnn classification model.

    In addition to training, this function also prints training progress information,
    a plot of the training and validation loss over time, and a confusion
    matrix.

    Args:
    learning_rate: An `int`, the learning rate to use.
    steps: A non-zero `int`, the total number of training steps. A training step
      consists of a forward and backward pass using a single batch.
    batch_size: A non-zero `int`, the batch size.
    df_train: A `DataFrame` containing the training features and labels.
    df_validate: A `DataFrame` containing the validation features and labels.
    features: A list of feature column names.
    target: The name of the target column.
    threshold: A `float`, the probability threshold used for the error metric.
    Returns:
    The trained `LinearClassifier` object.
    """

    periods = 10
    steps_per_period = steps / periods

    # prepare features and targets
    train_features = df_train[features]
    train_targets = df_train[target]
    validate_features = df_validate[features]
    validate_targets = df_validate[target]
    # create the input functions.
    train_fn = lambda: train_input_fn(features=train_features,
                                      targets=train_targets,
                                      batch_size=batch_size,
                                      shuffle=True,
                                      num_epochs=None)
    train_pred_fn = lambda: train_input_fn(features=train_features,
                                           targets=train_targets,
                                           batch_size=1,
                                           shuffle=False,
                                           num_epochs=1)
    validate_pred_fn = lambda: train_input_fn(features=validate_features,
                                              targets=validate_targets,
                                              batch_size=1,
                                              shuffle=False,
                                              num_epochs=1)

    # Create a LinearClassifier object.
    my_optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
        my_optimizer, 5.0)
    classifier = tf.estimator.LinearClassifier(
        feature_columns=construct_feature_columns(train_features),
        optimizer=my_optimizer,
        config=tf.estimator.RunConfig(keep_checkpoint_max=1))

    # Train the model, but do so inside a loop so that we can periodically assess
    # loss metrics.
    print("Training model...")
    train_validate_metrics = pd.DataFrame()
    for period in range(0, periods):
        # Train the model, starting from the prior state.
        classifier.train(input_fn=train_fn, steps=steps_per_period)

        # Take a break and compute probabilities.
        train_pred = list(classifier.predict(input_fn=train_pred_fn))
        train_prob = np.array([item['probabilities'] for item in train_pred])

        validate_pred = list(classifier.predict(input_fn=validate_pred_fn))
        validate_prob = np.array(
            [item['probabilities'] for item in validate_pred])
        # Compute training and validation errors.
        train_metrics = {
            'train-logloss': [metrics.log_loss(train_targets, train_prob)],
            'test-logloss':
            [metrics.log_loss(validate_targets, validate_prob)],
            'train-error': [
                calc_err_at_threshold([p[1] for p in train_prob],
                                      train_targets, threshold)
            ],
            'test-error': [
                calc_err_at_threshold([p[1] for p in validate_prob],
                                      validate_targets, threshold)
            ],
        }
        # Occasionally print the current loss.
        print(
            "  period %02d (%d samples): LogLoss: %0.2f/%0.2f, Error: %0.2f/%0.2f"
            %
            (period, (period + 1) * steps_per_period * batch_size,
             train_metrics['train-logloss'][0],
             train_metrics['test-logloss'][0], train_metrics['train-error'][0],
             train_metrics['test-error'][0]))
        # Add the loss metrics from this period to our list (DataFrame.append
        # was removed in pandas 2.x; use pd.concat instead).
        train_validate_metrics = pd.concat(
            [train_validate_metrics, pd.DataFrame(train_metrics)],
            ignore_index=True)

    print("Model training finished.")
    # Remove event files to save disk space (iterate explicitly: `map` is
    # lazy in Python 3 and would otherwise never run).
    for event_file in glob.glob(
            os.path.join(classifier.model_dir, 'events.out.tfevents*')):
        os.remove(event_file)
    # Output a graph of loss metrics over periods.
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.ylabel("LogLoss")
    plt.xlabel("Periods")
    plt.title("LogLoss vs. Periods")
    plt.plot(list(train_validate_metrics['train-logloss']), label="training")
    plt.plot(list(train_validate_metrics['test-logloss']), label="validation")
    plt.legend()
    # Output a graph of error metrics over periods.
    plt.subplot(1, 2, 2)
    plt.ylabel("Error")
    plt.xlabel("Periods")
    plt.title("Error vs. Periods")
    plt.plot(list(train_validate_metrics['train-error']), label="training")
    plt.plot(list(train_validate_metrics['test-error']), label="validation")
    plt.legend()
    plt.tight_layout()

    return classifier
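

# `train_input_fn` and `construct_feature_columns` are defined elsewhere in
# this repo. Below is a minimal sketch of an input function compatible with
# the calls above, assuming the standard tf.data pattern for TF 1.x
# estimators. This is an assumption, not the original implementation, and it
# relies on the numpy/tensorflow imports already in scope.
def train_input_fn(features, targets, batch_size=1, shuffle=True,
                   num_epochs=None):
    # Convert the features DataFrame into a dict of numpy arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}
    # Build, batch and (optionally) shuffle the dataset.
    ds = tf.data.Dataset.from_tensor_slices((features, targets))
    ds = ds.batch(batch_size).repeat(num_epochs)
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)
    # Return the next batch of (features, labels) as TF 1.x tensors.
    return ds.make_one_shot_iterator().get_next()


# Hypothetical usage (file, column and split choices are placeholders):
# df = pd.read_csv('data.csv')
# df_train, df_validate = df.iloc[:8000], df.iloc[8000:]
# clf = train_linear_classifier(learning_rate=0.05, steps=1000, batch_size=20,
#                               df_train=df_train, df_validate=df_validate,
#                               features=['f1', 'f2'], target='label',
#                               threshold=0.5)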
Exemplo n.º 56
0
# Let's look at what Hydra sees
y_valid_preds_df = hydra_model_df.predict(prepare_input_data(
    hydra_model_df, X_valid),
                                          verbose=0)

base_class = np.argmax(y_valid, axis=1)
preds = np.argmax(y_valid_preds_df, axis=1)

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
sns.heatmap(pd.DataFrame(confusion_matrix(base_class, preds)),
            annot=True,
            linewidths=.5,
            fmt="d")
print("f1: {:0.6f} log loss: {:0.6f}".format(
    f1_score(base_class, preds, average='macro'),
    log_loss(y_valid, y_valud_preds_df)))
print(timer(start_time))

sub_preds_df = hydra_model_df.predict(prepare_input_data(hydra_model_df, test),
                                      verbose=0)
predictions_df = pd.DataFrame(
    sub_preds_df, columns=["Class_1", "Class_2", "Class_3", "Class_4"])

blend_l1 = pd.read_csv(
    "/kaggle/input/tps05blender-v2/tps05-remek-blender_v2.csv")

output = predictions_df.copy()
output["Class_1"] = (predictions_df.Class_1 * 0.3 + blend_l1.Class_1 * 0.7)
output["Class_2"] = (predictions_df.Class_2 * 0.3 + blend_l1.Class_2 * 0.7)
output["Class_3"] = (predictions_df.Class_3 * 0.3 + blend_l1.Class_3 * 0.7)
output["Class_4"] = (predictions_df.Class_4 * 0.3 + blend_l1.Class_4 * 0.7)
Exemplo n.º 57
0
'''
    if (log < min):
        min = log
        a = (n, learning_rate)
    test_loss = np.hstack((test_loss, np.array([log])))

y_pred_gen = gbc.staged_decision_function(X_train)
for i in y_pred_gen:
    #print(i)
    y_pred = 1 / (1 + np.exp(-i))
    train_loss = np.hstack((train_loss, np.array([log_loss(y_train, y_pred)])))






import matplotlib.pyplot as plt
#%matplotlib inline
plt.figure()
plt.plot(test_loss, 'r', linewidth=2)
plt.plot(train_loss, 'g', linewidth=2)
plt.legend(['test', 'train'])
'''

clf = RandomForestClassifier(n_estimators=37, random_state=241)

clf.fit(X_train, y_train)
# predict_proba already returns class probabilities; no sigmoid transform is
# needed here.
predictions = clf.predict_proba(X_test)
print(log_loss(y_test, predictions))
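
# For reference, the commented-out block above tracks log loss over boosting
# stages. A self-contained sketch of that idea (estimator settings and the
# sigmoid link are assumptions based on that block):
# from sklearn.ensemble import GradientBoostingClassifier
# gbc = GradientBoostingClassifier(n_estimators=250, learning_rate=0.2,
#                                  random_state=241)
# gbc.fit(X_train, y_train)
# test_loss = [log_loss(y_test, 1.0 / (1.0 + np.exp(-np.ravel(s))))
#              for s in gbc.staged_decision_function(X_test)]
# best_n = int(np.argmin(test_loss)) + 1  # best number of boosting stages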
Exemplo n.º 58
0
def buildKB15():
    ## data
    # read the training/test data
    print('Importing Data')
    xtrain = pd.read_csv('../input/train.csv')
    xtest = pd.read_csv('../input/test.csv')

    xtrain.fillna(-1, inplace=True)
    xtest.fillna(-1, inplace=True)

    # separate
    id_train = xtrain.ID
    xtrain.drop('ID', axis=1, inplace=True)
    ytrain = xtrain.target
    xtrain.drop('target', axis=1, inplace=True)
    id_test = xtest.ID
    xtest.drop('ID', axis=1, inplace=True)

    # drop v22 - categorical with 18211 possible values
    xtrain.drop('v22', axis=1, inplace=True)
    xtest.drop('v22', axis=1, inplace=True)

    # folds for cv
    xfolds = pd.read_csv('../input/xfolds.csv')
    fold_index = xfolds.fold5
    fold_index = np.array(fold_index) - 1
    n_folds = len(np.unique(fold_index))

    ## processing
    # identify columns classes
    categorical_cols = [
        f for f in xtrain.columns
        if xtrain[f].dtype not in ['float64', 'int64']
    ]
    numerical_cols = [
        f for f in xtrain.columns if xtrain[f].dtype in ['float64']
    ]

    # number of unique values
    # headcounts = [len(np.unique(xtrain[f])) for f in categorical_cols]

    # convert all categoricals: expand into binary indicators, use as features
    # fed into NaiveBayes, drop the original
    for col in categorical_cols:
        print(col)
        newname = 'nb_' + col
        # transform the joint set into dummies
        xloc = pd.concat((xtrain[col], xtest[col]), axis=0, ignore_index=True)
        xloc = pd.get_dummies(xloc)
        # separate back into training and test
        xtr = xloc.iloc[:xtrain.shape[0]]
        xte = xloc.iloc[xtrain.shape[0]:]
        # storage vector for the new features (train and test)
        newvar_train = np.zeros((xtrain.shape[0]))
        # build a stacked version along the training set
        for j in range(0, n_folds):
            idx0 = np.where(fold_index != j)[0]
            idx1 = np.where(fold_index == j)[0]
            x0 = xtr.values[idx0, :]
            x1 = xtr.values[idx1, :]
            y0 = np.array(ytrain)[idx0]
            y1 = np.array(ytrain)[idx1]
            nb = BernoulliNB()
            nb.fit(x0, y0)
            newvar_train[idx1] = nb.predict_proba(x1)[:, 1]
            print(log_loss(y1, newvar_train[idx1]))
        # build a stacked version along the test set
        nb.fit(xtr, ytrain)
        newvar_test = nb.predict_proba(xte)[:, 1]
        # park into training and test sets
        xtrain[newname] = newvar_train
        xtest[newname] = newvar_test
        xtrain.drop(col, axis=1, inplace=True)
        xtest.drop(col, axis=1, inplace=True)

    ## store the results
    # add indices etc
    xtrain = pd.DataFrame(xtrain)
    xtrain['ID'] = id_train
    xtrain['target'] = ytrain
    #
    xtest = pd.DataFrame(xtest)
    xtest['ID'] = id_test
    #
    #
    #    # save the files
    xtrain.to_csv('../input/xtrain_kb15.csv', index=False, header=True)
    xtest.to_csv('../input/xtest_kb15.csv', index=False, header=True)

    return
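
# Usage sketch (assumes ../input/train.csv, test.csv and xfolds.csv exist):
# buildKB15() writes xtrain_kb15.csv / xtest_kb15.csv, in which every
# categorical column has been replaced by an out-of-fold BernoulliNB
# probability feature (a stacked form of target encoding).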
Exemplo n.º 59
0
# sklearn.externals.joblib was removed in newer scikit-learn; import joblib
# directly instead.
import joblib


def load_train_data():
    train = pd.read_csv('train.csv')
    labels = train.target.values
    lbl_enc = preprocessing.LabelEncoder()
    labels = lbl_enc.fit_transform(labels)

    train = train.drop('id', axis=1)
    train = train.drop('target', axis=1)
    return train.values, labels.astype('int32')


def train_model_random_forest(train, labels):
    # train a random forest classifier
    model = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000)
    model.fit(train, labels)
    joblib.dump(model, 'rf_model2.model')
    return model


X, y = load_train_data()
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=3253)
model = train_model_random_forest(X_train, y_train)
preds = model.predict_proba(X_test)
print "MLogloss: ", log_loss(y_test, preds)
Exemplo n.º 60
0
            y_train, y_valid = y[train_index], y[valid_index]
            if count == 0:
                actual = y_valid
            else:
                actual = np.append(actual, y_valid, axis=0)
            
            list_result = Parallel(n_jobs=6)(delayed(train_class)(X_train, y_train, X_valid, i) for i in range(num_classes))
            preds_fold = pd.concat(list_result, axis = 1)
              
            if count == 0:
                preds_epoch = preds_fold.copy()
            else:
                preds_epoch = pd.concat([preds_epoch, preds_fold],
                                        ignore_index=True)

            count += 1
            print "logloss", log_loss(actual, preds_epoch.as_matrix())
    if cv == 0:
        preds_epoch['id'] = ids_test.astype(float).astype(int)
        preds_epoch.to_csv('../data/output-py/test_raw/' + os.path.splitext(pred_file)[0] + '.epoch' + str(e) + '.csv', index=False)
        preds_epoch = preds_epoch.drop('id', axis=1)
    else:
        preds_epoch['id'] = ids_train_folds.astype(float).astype(int)
        preds_epoch.to_csv('../data/output-py/train_raw/' + os.path.splitext(pred_file)[0] + '.epoch' + str(e) + '.csv', index=False)
        preds_epoch = preds_epoch.drop('id', axis=1)
    
    if e == 0:
        preds = preds_epoch.copy()
    else:
        preds = preds.add(preds_epoch, fill_value=0)
    if cv == 1:
        preds_epoch = preds.copy()