Example #1
def try_params(n_iterations, params):

    n_estimators = int(round(n_iterations * trees_per_iteration))
    print "n_estimators:", n_estimators
    pprint(params)

    classifier = params['classifier']
    del params['classifier']

    clf = eval("{}( n_estimators = n_estimators, verbose = 0, n_jobs = -1, \
		**params )".format(classifier))
    clf.fit(x_train, y_train)

    p = clf.predict_proba(x_train)[:, 1]

    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))

    print "\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc)

    #

    p = clf.predict_proba(x_test)[:, 1]

    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))

    print "# testing  | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
        ll, auc, acc)

    return {'loss': ll, 'log_loss': ll, 'auc': auc}
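
For reference, a minimal way this helper could be driven (a sketch, not from the source: it assumes try_params lives in the same script as these globals, and that the classifier string names an estimator imported into scope; the data and parameter values are made up for illustration):

import numpy as np
from pprint import pprint
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, roc_auc_score as AUC, accuracy_score as accuracy
from sklearn.model_selection import train_test_split

X_all, y_all = make_classification(n_samples=1000, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X_all, y_all, random_state=0)
trees_per_iteration = 10  # assumed global: trees added per search iteration

result = try_params(5, {'classifier': 'RandomForestClassifier',
                        'max_depth': 8,
                        'min_samples_leaf': 2})
print(result)  # {'loss': ..., 'log_loss': ..., 'auc': ...}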
Example #2
    def score(self, submission_text: str):
        # seed each list with one matched positive and negative dummy point so
        # AUC stays defined even if a split ends up with a single class
        public_preds, private_preds = [1, 0], [1, 0]
        public_actuals, private_actuals = [1, 0], [1, 0]
        tmp_file = StringIO(submission_text)
        csv_reader = csv.DictReader(tmp_file)
        public_event_ids_pkl_name = '{}/public_validation_event_ids.pkl'.format(
            cur_dir_path)

        with open(public_event_ids_pkl_name,
                  'rb') as public_validation_event_ids_file:
            public_validation_event_ids = pickle.load(
                public_validation_event_ids_file)

        for row in csv_reader:
            if row['event_id'] in public_validation_event_ids:
                public_preds.append(float(row['conversion_probability']))
            else:
                private_preds.append(float(row['conversion_probability']))

        with open('{}/all_validation_labels.txt'.format(cur_dir_path),
                  'r') as all_validation_labels_file:
            for line in all_validation_labels_file:
                event_id, event_label = line.rstrip().split(' ')
                if event_id in public_validation_event_ids:
                    public_actuals.append(float(event_label))
                else:
                    private_actuals.append(float(event_label))

            public_score = AUC(public_actuals, public_preds)
            private_score = AUC(private_actuals, private_preds)

        return public_score, private_score, None
Example #3
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_stop=100,
                  retrain=True):

    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    model = AutoLGB(objective='binary', metric='auc', n_random_col=0)
    model.tune(pd.DataFrame(X), pd.Series(y))

    params = model.params
    n_est = model.n_best

    logging.info(f'params: {params}')
    logging.info(f'n_best: {n_est}')

    p = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    n_bests = []
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        trn_lgb = lgb.Dataset(X[i_trn], label=y[i_trn])
        val_lgb = lgb.Dataset(X[i_val], label=y[i_val])

        logging.info('Training with early stopping')
        clf = lgb.train(params,
                        trn_lgb,
                        n_est,
                        val_lgb,
                        early_stopping_rounds=n_stop,
                        verbose_eval=100)
        n_best = clf.best_iteration
        n_bests.append(n_best)
        logging.info('best iteration={}'.format(n_best))

        p[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))

        p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #4
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  C, n_fold=5):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='lr_{}_{}.log'.format(C, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    clf = SVC(C=C, class_weight='auto', random_state=2015, probability=True)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training CV #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.4f}'.format(AUC(y[i_trn], clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.4f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.4f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #5
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, lrate=.1, l1=.0, l2=.0, n_fold=5):

    dir_feature = os.path.dirname(train_file)
    dir_val = os.path.dirname(predict_valid_file)

    feature_name = os.path.basename(train_file)[:-4]
    algo_name = 'xgl_{}_{}_{}_{}'.format(n_est, lrate, l1, l2)
    model_name = '{}_{}'.format(algo_name, feature_name)
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    param = {'eta': lrate,
             'objective': 'binary:logistic',
             'colsample_bytree': .7,
             'subsample': .5,
             'eval_metric': 'auc',
             'seed': 2015,
             'booster': 'gblinear',
             'alpha': l1,
             'lambda': l2}

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        dtrain = xgb.DMatrix(X[i_trn], label=y[i_trn])
        dvalid = xgb.DMatrix(X[i_val], label=y[i_val])
        watchlist = [(dvalid, 'eval'), (dtrain, 'train')]

        clf = xgb.train(param, dtrain, n_est, watchlist)

        p_val[i_val] = clf.predict(dvalid)
        logging.info('AUC TRN = {:.6f}'.format(AUC(y[i_trn], clf.predict(dtrain))))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    dtrain = xgb.DMatrix(X, label=y)
    dtest = xgb.DMatrix(X_tst)  # use the test matrix already loaded above
    watchlist = [(dtrain, 'train')]

    clf = xgb.train(param, dtrain, n_est, watchlist)
    p_tst = clf.predict(dtest)

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #6
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, batch_size=1024, retrain=True):

    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    dims = X.shape[1]
    logging.info('{} dims'.format(dims))

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros_like(y)
    p_tst = np.zeros((X_tst.shape[0],))
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        clf = nn_model(dims)
        clf.fit_generator(generator=batch_generator(X[i_trn],
                                                    y[i_trn],
                                                    batch_size,
                                                    True),
                          nb_epoch=n_est,
                          samples_per_epoch=X[i_trn].shape[0],
                          verbose=0)

        p[i_val] = clf.predict_generator(generator=batch_generatorp(X[i_val], batch_size, False),
                                         val_samples=X[i_val].shape[0])[:, 0]
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))

        if not retrain:
            p_tst += clf.predict_generator(generator=batch_generatorp(X_tst, batch_size, False),
                                           val_samples=X_tst.shape[0])[:, 0] / N_FOLD

    logging.info('Saving validation predictions...')
    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = nn_model(dims)
        clf.fit_generator(generator=batch_generator(X, y, batch_size, True),
                          nb_epoch=n_est,
                          samples_per_epoch=X.shape[0])
        p_tst = clf.predict_generator(generator=batch_generatorp(X_tst, batch_size, False),
                                      val_samples=X_tst.shape[0])[:, 0]

    logging.info('Saving normalized test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #7
def eval_pred(y_true, y_pred, eval_type=eval_type):
    if eval_type == 'logloss':  # add new eval_types here
        loss = ll(y_true, y_pred)
        print("logloss: ", loss)
        return loss

    elif eval_type == 'auc':
        loss = AUC(y_true, y_pred)
        print("AUC: ", loss)
        return loss

    elif eval_type == 'rmse':
        loss = np.sqrt(mean_squared_error(y_true, y_pred))
        print("rmse: ", loss)
        return loss
Example #8
File: ml.py  Project: davidrs/ml-kiva
def trainModels():
    # SVM looks much better in validation

    print "training SVM..."

    # although one needs to choose these hyperparams
    C = 20  #173
    gamma = 0.001  #1.31e-5
    shrinking = True

    probability = True
    verbose = True

    svc = SVC(C=C,
              gamma=gamma,
              shrinking=shrinking,
              probability=probability,
              verbose=verbose)
    svc.fit(x_train, y_train)
    p = svc.predict_proba(x_test)

    print(x_test[12])
    print(svc.predict_proba(x_test[12:13]))  # predict_proba expects a 2-D array
    print(svc.predict_proba(x_test[13:14]))
    print(svc.predict_proba(x_test[14:15]))

    auc = AUC(y_test, p[:, 1])
    print "SVM AUC", auc

    print "training random forest..."

    n_trees = 100
    max_features = int(round(sqrt(x_train.shape[1]) *
                             2))  # try more features at each split
    max_features = 'auto'
    verbose = 1
    n_jobs = 1

    rf = RF(n_estimators=n_trees,
            max_features=max_features,
            verbose=verbose,
            n_jobs=n_jobs)
    rf.fit(x_train, y_train)

    p = rf.predict_proba(x_test)

    print(x_test[12])
    print(rf.predict_proba(x_test[12:13]))
    print(rf.predict_proba(x_test[13:14]))
    print(rf.predict_proba(x_test[14:15]))

    auc = AUC(y_test, p[:, 1])
    print "RF AUC", auc
Example #9
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est=100,
                  depth=4,
                  lrate=.1,
                  n_fold=5,
                  n_bag=50,
                  subrow=.5,
                  subcol=.8):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='xg_bag{}_{}_{}_{}_{}_{}_{}.log'.format(
                            n_bag, n_est, depth, lrate, subrow, subcol,
                            feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    xg = xgb.XGBClassifier(max_depth=depth,
                           learning_rate=lrate,
                           n_estimators=n_est,
                           colsample_bytree=.8,
                           subsample=.5,
                           nthread=4)

    clf = BG(xg, n_estimators=n_bag, max_samples=subrow, max_features=subcol)
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.6f}'.format(
            AUC(y[i_trn],
                clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #10
def calculate_statistics(y_pred, y_true):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    print('Bayes model results:')
    print('True Positives: ' + str(tp))
    print('True Negatives: ' + str(tn))
    print('False Positives: ' + str(fp))
    print('False Negatives: ' + str(fn))
    fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)
    auc_score = AUC(fpr, tpr)
    acc = float(tp + tn) / (tp + tn + fp + fn) * 100
    print('AUC ' + str(auc_score))
    print('Accuracy: ' + str(acc) + '%')
    return tn, fp, fn, tp, auc_score, str(acc)
Example #11

    def SuperVisedMakeModel(self,
                            model_name,
                            train_f,
                            train_t,
                            test_f,
                            test_t,
                            is_valid=0,
                            do_cv=2):
        """Training Model"""
        t1 = time.time()
        self._ModelSetting(model_name)
        print('------------------------------------')
        print('Model            : %15s' % model_name)
        print('Parameters       : %15s' % self.model_p)

        self.clf.fit(train_f, train_t)
        acc1 = float(self.clf.score(train_f, train_t))
        #self.clf.fit(test_f, test_t)
        acc2 = float(self.clf.score(test_f, test_t))
        t2 = time.time()
        print('Training         time: %7s seconds' % str(np.round(t2 - t1, 0)))
        """Predicting"""
        train_pred1 = self.clf.predict(train_f)
        #train_pred2 = self.clf.predict_proba(train_f)
        test_pred1 = self.clf.predict(test_f)
        #test_pred2 = self.clf.predict_proba(test_f)
        t3 = time.time()
        print('Predicting       time: %7s seconds' % str(np.round(t3 - t2, 0)))
        """CV"""
        if do_cv > 0:
            cv_scores = CV.cross_val_score(self.clf,
                                           train_f,
                                           train_t,
                                           cv=do_cv,
                                           scoring='accuracy')
            #cv_scores2 = CV.cross_val_score(self.clf, train_f, train_t, cv = 2, scoring = 'roc_auc', n_jobs = -1)
            #cv_scores = CV.cross_val_score(self.clf, train_f, train_t, cv = 5, scoring = 'log_loss', n_jobs = -1)
            t4 = time.time()
            print('Cross Validation time: %7s seconds' %
                  str(np.round(t4 - t3, 0)))
            print('CV Accuracy      : %.9f (+/- %0.5f)' %
                  (cv_scores.mean(), cv_scores.std() * 2))
            print(cv_scores)
            self.models_dict["%s_score" % model_name] = cv_scores.mean()
        """Scoring"""
        print('Training Accuracy: %.9f' % acc1)
        print('Training AUC     : %.9f' % AUC(train_t, train_pred1))
        #print('Training LogLoss : %.9f' % LOGLOSS(train_t, train_pred2, eps = 1e-15))
        if is_valid:
            print('Valid Accuracy   : %.9f' % acc2)
            print('Valid AUC        : %.9f' % AUC(test_t, test_pred1))
Example #12

def train_predict(train_file, test_file, predict_valid_file,
                  predict_test_file, retrain=True):

    model_name = os.path.splitext(os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    n_bests = []
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        glm = linear_model.LogisticRegression(solver='lbfgs',
                                              max_iter=2020,
                                              fit_intercept=True,
                                              penalty='none',
                                              verbose=0)
        glm.fit(X[i_trn], y[i_trn])
        p[i_val] = glm.predict_proba(X[i_val])[:, 1]
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))

        if not retrain:
            p_tst += glm.predict_proba(X_tst)[:,1] / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        
        glm = linear_model.LogisticRegression(random_state=1,
                                              solver='lbfgs',
                                              max_iter=2020,
                                              fit_intercept=True,
                                              penalty='none',
                                              verbose=0)
        glm = glm.fit(X, y)
        p_tst = glm.predict_proba(X_tst)[:, 1]

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Example #13
    def testNB(self):
        self.clf = GaussianNB()
        self.clf.fit(self.X_train, self.y)
        self.NB_p = self.clf.predict_proba(self.X_test)
        self.aucNB = AUC(self.y_test.values, self.NB_p[:, 1])
        print(self.aucNB)
        return self.NB_p
Example #14

def drift_detector(S, T, threshold=0.75):
    T = pd.DataFrame(T)
    S = pd.DataFrame(S)
    # Give slack variable in_target which is 1 for old and 0 for new
    T['in_target'] = 0  # in target set
    S['in_target'] = 1  # in source set
    # Combine source and target with new slack variable
    ST = pd.concat([T, S], ignore_index=True, axis=0)
    labels = ST['in_target'].values
    ST = ST.drop('in_target', axis=1).values
    # You can use any classifier for this step. We advise a simple one, since
    # the goal is to see whether source and target differ, not to classify them.
    clf = LogisticRegression(solver='liblinear')
    predictions = np.zeros(labels.shape)
    # Divide ST into two equal chunks
    # Train LR on a chunk and classify the other chunk
    # Calculate AUC for original labels (in_target) and predicted ones
    skf = StratifiedKFold(n_splits=2, shuffle=True)
    for train_idx, test_idx in skf.split(ST, labels):
        X_train, X_test = ST[train_idx], ST[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]
        clf.fit(X_train, y_train)
        probs = clf.predict_proba(X_test)[:, 1]
        predictions[test_idx] = probs
    auc_score = AUC(labels, predictions)
    # Signal drift if AUC is larger than the threshold
    if auc_score > threshold:
        return True
    else:
        return False
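
A quick self-contained sanity check of drift_detector (synthetic data, arbitrary sizes; assumes the function and its imports are in scope): an identically distributed sample should score an AUC near 0.5 and not be flagged, while a mean-shifted one should be.

import numpy as np

rng = np.random.RandomState(0)
source = rng.normal(0.0, 1.0, size=(500, 3))        # reference sample S
target_same = rng.normal(0.0, 1.0, size=(500, 3))   # same distribution as S
target_shift = rng.normal(2.0, 1.0, size=(500, 3))  # mean-shifted sample

print(drift_detector(source, target_same))   # expected: False (AUC near 0.5)
print(drift_detector(source, target_shift))  # expected: True  (AUC near 1.0)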
Example #15

def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est, depth, n_fold=5):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='rf_{}_{}_{}.log'.format(n_est,
                                                          depth,
                                                          feature_name))


    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    clf = RF(n_estimators=n_est, max_depth=depth, random_state=2015)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

    logging.info('AUC = {:.4f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #16
    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict_proba(self.X_val, verbose=0)
            score = AUC(self.y_val, y_pred)
            #logging.info("interval evaluation - epoch: {:d} - score: {:.6f}".format(epoch, score))
            print("interval evaluation - epoch: {:d} - score: {:.6f}".format(
                epoch, score))
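
on_epoch_end above is one method of a Keras callback; the enclosing class is not shown. A sketch of what it presumably looks like (the class name and constructor are assumptions, not from the source):

from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score as AUC

class IntervalEvaluation(Callback):  # hypothetical name for the enclosing class
    def __init__(self, validation_data=(), interval=1):
        super(IntervalEvaluation, self).__init__()
        self.X_val, self.y_val = validation_data
        self.interval = interval

    # on_epoch_end as defined above goes here

# usage: model.fit(X, y, callbacks=[IntervalEvaluation((X_val, y_val), interval=10)])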
Example #17
def train_predict(train_file,
                  test_file,
                  predict_valid_file,
                  predict_test_file,
                  n_est,
                  depth,
                  n_fold=5,
                  n_bag=50):

    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='et_bag{}_{}_{}_{}.log'.format(
                            n_bag, n_est, depth, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    et = ET(n_estimators=n_est,
            max_depth=depth,
            random_state=2015,
            class_weight='auto',
            bootstrap=True)

    clf = BG(et, n_estimators=n_bag, max_samples=.8, max_features=.9)
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.6f}'.format(
            AUC(y[i_trn],
                clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')
Example #18

def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_iter=100, dim=4, lrate=.1, n_fold=5):

    dir_feature = os.path.dirname(train_file)
    dir_val = os.path.dirname(predict_valid_file)

    feature_name = os.path.basename(train_file)[:-8]
    algo_name = 'libfm_{}_{}_{}'.format(n_iter, dim, lrate)
    model_name = '{}_{}'.format(algo_name, feature_name)

    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG, filename='{}.log'.format(model_name))

    logging.info('Loading training data')
    X, y = load_data(train_file)
    n_tst = sum(1 for line in open(test_file))

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        valid_train_file = os.path.join(dir_feature, '{}.trn{}.sps'.format(feature_name, i))
        valid_test_file = os.path.join(dir_feature, '{}.val{}.sps'.format(feature_name, i))
        valid_predict_file = os.path.join(dir_val, '{}.val{}.yht'.format(model_name, i))

        # if there is no CV training or validation file, then generate them
        # first.
        if (not os.path.isfile(valid_train_file) or not os.path.isfile(valid_test_file)):
            dump_svmlight_file(X[i_trn], y[i_trn], valid_train_file,
                               zero_based=False)
            dump_svmlight_file(X[i_val], y[i_val], valid_test_file,
                               zero_based=False)

        subprocess.call(["libFM",
                         "-task", "c",
                         '-dim', '1,1,{}'.format(dim),
                         '-init_stdev', str(lrate),
                         '-iter', str(n_iter),
                         '-train', valid_train_file,
                         '-test', valid_test_file,
                         '-out', valid_predict_file])

        p_val[i_val] = np.loadtxt(valid_predict_file)
        os.remove(valid_predict_file)

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')

    logging.info('Retraining with 100% data...')
    subprocess.call(["libFM",
                     "-task", "c",
                     '-dim', '1,1,{}'.format(dim),
                     '-init_stdev', str(lrate),
                     '-iter', str(n_iter),
                     '-train', train_file,
                     '-test', test_file,
                     '-out', predict_test_file])
Example #19
    def testRF(self):
        forest = RandomForestClassifier(n_estimators=1000,
                                        n_jobs=-1,
                                        verbose=1)
        self.forest = forest.fit(self.X_train, self.y)
        self.rf_p = self.forest.predict_proba(self.X_test)
        self.aucRF = AUC(self.y_test.values, self.rf_p[:, 1])
        print(self.aucRF)
        return self.rf_p
Example #20
def train_and_eval_auc(train_x, train_y, test_x, test_y, model=LR()):
    model.fit(train_x, train_y)
    p = model.predict_proba(test_x)
    print(p)
    # hack
    p = p[:, 1] if p.shape[1] > 1 else p[:, 0]

    auc = AUC(test_y, p)
    print "AUC:", auc
Example #21
def train_and_evaluate(y_train, x_train, y_val, x_val, alg):
    alg.fit(x_train, y_train)

    p = alg.predict_proba(x_val)
    p_bin = alg.predict(x_val)

    acc = accuracy(y_val, p_bin)
    auc = AUC(y_val, p[:, 1])

    return (auc, acc)
Example #22

def crossValidate(clf, x, y, folds=5, runs=5):
    '''
    Function for doing K-fold cross validation.
    clf = classifier
    x = training data, numpy ndarray
    y = labels, numpy 1-D array
    folds = number of partitions to be made of the training data
    runs = number of times to repeat the cross validation, each time with a
    different random partition.

    folds=5 and runs=10 will do a 5-fold cross validation 10 times on the
    dataset and calculate the AUC deviation across these 50 instances.
    '''
    ypred = np.zeros((len(y), runs))
    fold_auc = np.zeros((runs, folds))
    r = 0
    score = np.zeros(runs)
    for run in range(runs):
        i = 0
        x, y = shuffle(x, y, random_state=19 * (run + 3))  # unique seed per run
        kf = KFold(len(y), n_folds=folds, random_state=18 * (run + 93))
        print('Cross Validating...')
        for train_ind, test_ind in kf:
            print('CV Fold ' + str(i + 1) + ' out of ' + str(folds))
            xtrain, ytrain = x[train_ind, :], y[train_ind]
            xtest, ytest = x[test_ind, :], y[test_ind]
            clf.fit(xtrain, ytrain)
            #a = 100 * clf.feature_importances_
            #print(["%0.3f" % f for f in a])
            fold_pred = clf.predict_proba(xtest)[:, 1]
            fold_pred[xtest[:, 1] < 23] = 0  # dataset-specific rule
            fold_auc[r, i] = AUC(ytest, fold_pred)
            ypred[test_ind, r] = fold_pred
            i = i + 1
        score[r] = AUC(y, ypred[:, r])
        r = r + 1
    print('Fold AUC: ' + str(fold_auc))
    print('Mean: ' + str(np.mean(fold_auc)))
    print('Deviation: ' + str(np.std(fold_auc)))

    print('\nOverall AUC: ' + str(score))
    print('Mean: ' + str(np.mean(score)))
    print('Deviation: ' + str(np.std(score)))
    return score
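
A usage sketch for crossValidate (assuming its module-level imports, shuffle, the old sklearn KFold, and AUC = roc_auc_score, are in scope). Note the hard-coded rule inside that zeroes predictions where column 1 is below 23; the offset below keeps it from firing on generic data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

x, y = make_classification(n_samples=1000, n_features=10, random_state=0)
x[:, 1] += 100  # stay above the snippet's hard-coded threshold of 23

clf = GradientBoostingClassifier(random_state=0)
scores = crossValidate(clf, x, y, folds=5, runs=3)
print('mean AUC over runs:', np.mean(scores), '+/-', np.std(scores))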
Example #23
def findBetterValidation(df, dft):
    traindf = df
    dft['Y'] = -1
    testdf = dft
    traindf['target'] = 0
    testdf['target'] = 1
    datadf = pd.concat((traindf, testdf))
    datadf = datadf.iloc[np.random.permutation(len(datadf))]
    datadf.reset_index(drop=True, inplace=True)
    x = datadf.drop(['target', 'Y'], axis=1)
    y = datadf.target
    n_estimators = 100
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 n_jobs=16,
                                 random_state=0)
    scores = cross_val_score(clf, x, y, scoring='roc_auc', cv=5)
    print('old val scores', scores)
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 n_jobs=16,
                                 random_state=0)
    predictions = np.zeros(y.shape)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=5678)
    cv.get_n_splits(x, y)
    for f, (train_i, test_i) in enumerate(cv.split(x, y)):
        x_train = x.iloc[train_i]
        x_test = x.iloc[test_i]
        y_train = y.iloc[train_i]
        y_test = y.iloc[test_i]
        clf.fit(x_train, y_train)
        p = clf.predict_proba(x_test)[:, 1]
        auc = AUC(y_test, p)
        print("# AUC: {:.2%}\n".format(auc), auc)
        print('p', p)
        predictions[test_i] = np.abs(p - 0.5)
    x['p'] = predictions
    x['target'] = datadf.target.copy()
    x['Y'] = datadf.Y.copy()
    index = predictions.argsort()
    train_sorted = x.iloc[index]
    vallen = int(len(train_sorted) * 0.7)

    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 n_jobs=16,
                                 random_state=0)
    scores = cross_val_score(clf,
                             train_sorted.drop(['target', 'Y'],
                                               axis=1).iloc[:vallen],
                             train_sorted.target.iloc[:vallen],
                             scoring='roc_auc',
                             cv=5)
    print('new val scores', scores)

    train_sorted = train_sorted[train_sorted.target == 0]
    return train_sorted.drop(['target'], axis=1)
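
findBetterValidation is an adversarial-validation helper: it trains a classifier to separate training rows from test rows and returns the training rows ordered by how hard that classifier finds them to tell apart from the test set, so the most test-like rows can seed a validation split. A sketch on made-up frames (column names are hypothetical; df must carry the 'Y' target used by the code above, and the function's imports are assumed in scope):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(500, 4)), columns=list('abcd'))
df['Y'] = rng.randint(0, 2, size=500)  # training labels
dft = pd.DataFrame(rng.normal(loc=0.5, size=(500, 4)), columns=list('abcd'))

val_pool = findBetterValidation(df, dft)  # hardest-to-distinguish rows first
print(val_pool.head())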
Example #24
def train_and_evaluate(y_train, x_train, y_val, x_val):

    lr = LR()
    lr.fit(x_train, y_train)

    p = lr.predict_proba(x_val)
    p_bin = lr.predict(x_val)

    acc = accuracy(y_val, p_bin)
    auc = AUC(y_val, p[:, 1])

    return (auc, acc)
Example #25
def train_and_eval_sklearn_classifier(clf, data):

    x_train = data['x_train']
    y_train = data['y_train']

    x_test = data['x_test']
    y_test = data['y_test']

    clf.fit(x_train, y_train)

    try:
        p = clf.predict_proba(x_train)[:, 1]  # sklearn convention
    except IndexError:
        p = clf.predict_proba(x_train)

    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))

    print("\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".
          format(ll, auc, acc))

    #

    try:
        p = clf.predict_proba(x_test)[:, 1]  # sklearn convention
    except IndexError:
        p = clf.predict_proba(x_test)

    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))

    print(
        "# testing  | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(
            ll, auc, acc))

    #return { 'loss': 1 - auc, 'log_loss': ll, 'auc': auc }
    return {'loss': ll, 'log_loss': ll, 'auc': auc}
Example #26
def evaluate(data, model, ver, cuda_flag, time_fn):
    from sklearn.metrics import roc_auc_score as AUC
    from sklearn.metrics import average_precision_score as AP

    correct_list = []
    score_list = []
    for (X, y) in data:
        outputs, targets = calculate(X, y, model, ver, cuda_flag, time_fn)
        correct_list.extend(y)
        score_list.extend(outputs.data.tolist())
    auc = AUC(correct_list, score_list)
    ap = AP(correct_list, score_list)
    return correct_list, score_list, auc, ap
Example #27
def test_metric(testset):
    frame_labels = []
    frame_preds = []
    video_labels = []
    video_preds = []
    for i in testset:
        frame_preds += i[2]
        frame_labels += [i[1]] * len(i[2])
        video_preds.append(i[3])
        video_labels.append(i[1])
    video_thres, video_acc = acc_eval(video_labels, video_preds)
    frame_thres, frame_acc = acc_eval(frame_labels, frame_preds)
    video_auc = AUC(video_labels, video_preds)
    frame_auc = AUC(frame_labels, frame_preds)
    rs = {
        'video_acc': video_acc,
        'video_threshold': video_thres,
        'video_auc': video_auc,
        'frame_acc': frame_acc,
        'frame_threshold': frame_thres,
        'frame_auc': frame_auc
    }
    return rs
Example #28
def eval_pred(y_true, y_pred, eval_type):
    if eval_type == 'logloss':
        loss = ll(y_true, y_pred)
        print("logloss: ", loss)
        return loss

    elif eval_type == 'auc':
        loss = AUC(y_true, y_pred, multi_class='ovo')
        print("AUC: ", loss)
        return loss

    elif eval_type == 'rmse':
        loss = np.sqrt(mean_squared_error(y_true, y_pred))
        print("rmse: ", loss)
        return loss
Example #29
def eval_pred(y_true, y_pred, eval_type):
    if eval_type == 'logloss':  # add new eval_types here
        loss = ll(y_true, y_pred)
        print("logloss: ", loss)
        return loss

    elif eval_type == 'auc':
        loss = AUC(y_true, y_pred)
        print("AUC: ", loss)
        return loss

    elif eval_type == 'rmse':
        loss = np.sqrt(mean_squared_error(y_true, y_pred))
        print("rmse: ", loss)
        return loss
Example #30

def metrics(label_list, pred_list, pos_prob_list):

    metric_dict = dict()
    for m in config['metric']:
        if m == 'fbs':
            metric_dict[m] = FBS(label_list, pred_list, beta=1)
        elif m == 'acc':
            metric_dict[m] = ACC(label_list, pred_list)
        elif m == 'auc':
            metric_dict[m] = AUC(label_list, pos_prob_list)
        else:
            raise NotImplementedError('No such metric. Implement it.')

    return metric_dict
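
metrics dispatches on a module-level config dict that is not shown here; a minimal sketch of what it presumably contains, with the metric aliases spelled out (the alias mapping is an assumption consistent with the other examples on this page):

from sklearn.metrics import accuracy_score as ACC
from sklearn.metrics import fbeta_score as FBS
from sklearn.metrics import roc_auc_score as AUC

config = {'metric': ['fbs', 'acc', 'auc']}  # assumed shape of the config

labels = [0, 1, 1, 0, 1]
preds = [0, 1, 0, 0, 1]
probs = [0.2, 0.9, 0.4, 0.1, 0.8]
print(metrics(labels, preds, probs))  # e.g. {'fbs': ..., 'acc': ..., 'auc': ...}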