Пример #1
0
def main(submit=0):
    """Cross-validate, or train and submit, a scaled LogisticRegression pipeline.

    The combined frame and the labels come from
    ``prepare.Prepare_0(model=4).load(...)``. The first ``len(y)`` rows of the
    frame are used as the training set and the remaining rows as the test
    set. The pipeline is ``Transformer(names=...)`` ->
    ``StandardScaler(with_mean=False)`` -> the selected estimator.

    Args:
        submit: Falsy (default) runs ``cv_run`` on the training rows and
            returns. Truthy fits the pipeline on the training rows, predicts
            the test rows and hands column 1 of ``predict_proba`` to
            ``submit.do_submit``.

    Returns:
        None.
    """
    X_all_df, y = prepare.Prepare_0(model=4).load(preproc=0, update=False)
    names = list(X_all_df.columns)
    # Debug shortcut kept from the original:
    # X_all_df,y = X_all_df.iloc[:500,:],y[:300]
    lentrain = len(y)
    Xtrain_df = X_all_df.iloc[:lentrain, :]

    # Candidate estimators. Only the one assigned to ``clf`` below ends up in
    # the pipeline; the others are kept so they can be swapped in quickly.
    clf1 = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                                 C=1, fit_intercept=True, intercept_scaling=1.0,
                                 class_weight=None, random_state=random_state)
    clf2 = RandomForestClassifier(n_estimators=200, max_depth=24,
                                  n_jobs=-1, random_state=random_state, verbose=0)
    clf3 = GradientBoostingClassifier(n_estimators=42, max_depth=24,
                                      random_state=random_state, verbose=2,
                                      subsample=0.9)
    clf4 = svm.SVC(probability=True)
    clf5 = KNeighborsClassifier(n_neighbors=5)
    clf6 = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
                         fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
                         loss='hinge', n_iter=50, n_jobs=1, penalty='elasticnet',
                         power_t=0.5, random_state=random_state, rho=None,
                         shuffle=False, verbose=0, warm_start=False)

    clf = clf1

    # Manual toggle: flip to 1 to wrap the estimator in RFECVp.
    if 0:
        clf = RFECVp(clf, clf, step=4, cv=4, scoring="roc_auc", verbose=2)

    rd = Pipeline([
        ("trans", Transformer(names=names)),

        #("scaler",StandardScaler()),
        ("scaler", StandardScaler(with_mean=False)),

        # Alternative intermediate steps, currently disabled:
        #("selector", SelectPercentile(chi2, percentile=50)),
        #("selector", SelectPercentile(f_classif, percentile=50)),
        #("selector", lm.RandomizedLogisticRegression(C=1, random_state=random_state, verbose=1)),
        #("pca", PCA(n_components='mle')),
        #("pca", PCA(n_components=500)),
        #("svd", TruncatedSVD(n_components=50, random_state=random_state )),
        #("lasso",svm.LinearSVC(C=0.5, penalty="l1", dual=False)),
        ("est", clf),
    ])

    if not submit:
        cv_run(rd, Xtrain_df, y)
        return

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Prepare submission..")
    print("training on full data")
    rd.fit(Xtrain_df, y)
    Xtest_df = X_all_df.iloc[lentrain:, :]
    pred = rd.predict_proba(Xtest_df)[:, 1]
    # Imported locally under an alias: inside this function the ``submit``
    # parameter hides any module-level binding of the same name, and a plain
    # ``import submit`` would silently rebind the parameter.
    import submit as submit_module
    submit_module.do_submit(pred)
Пример #2
0
def main(submit=0):
    """Cross-validate, or train and submit, a Transformer + estimator pipeline.

    The combined frame and the labels come from
    ``prepare.Prepare_0().load(...)``. The first ``len(y)`` rows of the frame
    are used as the training set and the remaining rows as the test set. The
    pipeline is ``Transformer()`` -> the selected estimator.

    Args:
        submit: Falsy (default) runs ``cv_run`` on the training rows and
            returns. Truthy fits the pipeline on the training rows, predicts
            the test rows and hands column 1 of ``predict_proba`` to
            ``submit.do_submit``.

    Returns:
        None.
    """
    Xall_df, y = prepare.Prepare_0().load(preproc=0, update=False)
    # Debug shortcut kept from the original:
    # Xall_df,y = Xall_df.iloc[:500,:],y[:300]
    lentrain = len(y)
    Xtrain_df = Xall_df.iloc[:lentrain, :]

    # Candidate estimators. Only the one assigned to ``clf`` below ends up in
    # the pipeline; the others are kept so they can be swapped in quickly.
    clf1 = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                                 C=1, fit_intercept=True, intercept_scaling=1.0,
                                 class_weight=None, random_state=random_state)
    clf2 = RandomForestClassifier(n_estimators=200, max_depth=24,
                                  n_jobs=-1, random_state=random_state, verbose=0)
    clf3 = GradientBoostingClassifier(n_estimators=42, max_depth=24,
                                      random_state=random_state, verbose=2,
                                      subsample=0.9)
    clf4 = svm.SVC(probability=True)
    clf5 = KNeighborsClassifier(n_neighbors=5)
    clf6 = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
                         fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
                         loss='hinge', n_iter=50, n_jobs=1, penalty='elasticnet',
                         power_t=0.5, random_state=random_state, rho=None,
                         shuffle=False, verbose=0, warm_start=False)

    clf = clf1

    # Manual toggle: flip to 1 to pre-fit an RFECVp selector on the
    # transformed training rows and use it as the final pipeline step.
    if 0:
        selector = RFECVp(clf, clf, step=10, cv=4, scoring="roc_auc", verbose=2)
        selector = selector.fit(Transformer().fit_transform(Xtrain_df, y), y)
        clf = selector

    rd = Pipeline([
        ("trans", Transformer()),
        # Alternative intermediate steps, currently disabled:
        #("selector", SelectPercentile(chi2, percentile=90)),
        #("selector", SelectPercentile(f_classif, percentile=50)),
        #("selector", lm.RandomizedLogisticRegression(C=1, random_state=random_state, verbose=1)),
        #("pca", PCA(n_components='mle')),
        #("pca", PCA(n_components=500)),
        #("svd", TruncatedSVD(n_components=200, random_state=random_state )),
        #("lasso",svm.LinearSVC(C=0.5, penalty="l1", dual=False)),
        ("est", clf),
    ])

    if not submit:
        cv_run(rd, Xtrain_df, y)
        return

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Prepare submission..")
    print("training on full data")
    rd.fit(Xtrain_df, y)
    Xtest_df = Xall_df.iloc[lentrain:, :]
    pred = rd.predict_proba(Xtest_df)[:, 1]
    # Imported locally under an alias: inside this function the ``submit``
    # parameter hides any module-level binding of the same name, and a plain
    # ``import submit`` would silently rebind the parameter.
    import submit as submit_module
    submit_module.do_submit(pred)
Пример #3
0
def main(n_iter, n_folds, smodels, n_jobs=None, stack=0, use_vote=0, gnrl='KNC', 
        modsel=0, rfe=0, psearch=0,
        starter=0, verbose=0, submit=0):
    """Build a ``ModelStack`` from numbered models, then score or submit it.

    Args:
        n_iter: Forwarded to ``find_params`` and ``cv_run``.
        n_folds: Forwarded to ``cv_run``.
        smodels: '+'-separated model numbers, e.g. ``"2+12"``; each number
            ``N`` selects the module-level class ``Model%02d % N``.
        n_jobs: Forwarded to ``find_params`` and ``cv_run``.
        stack, use_vote, gnrl, modsel, rfe: Forwarded unchanged to the
            ``ModelStack`` constructor.
        psearch: Forwarded to ``find_params``. When greater than 1 and exactly
            one model was requested, the mean cross-validation score is
            recorded with ``update_params_best_score`` and the function
            returns; this branch takes precedence over ``submit``.
        starter: Truthy calls ``rd.starter()`` before scoring or fitting.
        verbose: Values above 1 call ``plot_errors`` after cross-validation.
        submit: Falsy runs cross-validation and dumps the residuals; truthy
            fits on the training rows and submits predictions for the rest.

    Returns:
        None.

    Raises:
        ValueError: If a piece of ``smodels`` is not an integer.
        NameError: If a requested ``ModelNN`` class is not defined in this
            module.
    """
    y, colnames, n_train, n_test, n_all = prepare.Prepare_0().load_y_colnames()
    # Samples are represented by their row index, not by feature values.
    # NOTE(review): presumably each model loads its own features by index.
    X_all = np.arange(n_all)
    X = X_all[:n_train]

    # Resolve the class by name instead of eval()-ing generated source text.
    # NameError is kept for unknown models so the failure mode is unchanged.
    models = []
    for m in smodels.split('+'):
        model_name = 'Model%02d' % int(m)
        try:
            model_cls = globals()[model_name]
        except KeyError:
            raise NameError("name '%s' is not defined" % model_name)
        models.append(model_cls())
    #models = (Model02(),Model12(),Model10(),) # ***
    logger.debug("models:%s", models)

    logger.info('Find params for models')
    for model in models:
        best_params = find_params(model, X, y, scoring='roc_auc', n_iter=n_iter,
                                  n_jobs=n_jobs, random_state=random_state+1,
                                  psearch=psearch)
        model.set_params(**best_params)

    rd = ModelStack(models, gnrl=gnrl, stack=stack, use_vote=use_vote,
                    modsel=modsel, rfe=rfe)
    if starter:
        logger.info('Starters start')
        rd.starter()

    if psearch > 1 and len(models) == 1:  # update current model best score
        y_pred, scores = cv_run(rd, X, y, n_folds=n_folds, n_iter=n_iter,
                                n_jobs=n_jobs, random_state=random_state+2)
        update_params_best_score(models[0], np.mean(scores))
        return

    if not submit:
        logger.debug('Cross validation starts')
        y_pred, scores = cv_run(rd, X, y, n_folds=n_folds, n_iter=n_iter,
                                n_jobs=n_jobs, random_state=random_state)
        prepare.Prepare_0().dump_ypred_residuals(y, y_pred)
        if verbose > 1:
            plot_errors(X, y, y_pred)
        if stack:
            logger.info("Mean Coefs: %s", rd.mean_coefs())
        return

    logger.info("Prepare submission..")
    logger.info("training on full data")
    rd.fit(X, y)
    Xtest = X_all[n_train:]
    pred = rd.predict_proba(Xtest)[:, 1]
    # Imported locally under an alias: inside this function the ``submit``
    # parameter hides any module-level binding of the same name, and a plain
    # ``import submit`` would silently rebind the parameter.
    import submit as submit_module
    submit_module.do_submit(pred)
Пример #4
0
def main(submit=0):
    """Cross-validate ``Model14``, or train it and submit its predictions.

    Args:
        submit: Falsy (default) runs ``cv_run`` on the training indices and
            returns. Truthy fits the model on the training indices, predicts
            the remaining indices and hands column 1 of ``predict_proba`` to
            ``submit.do_submit``.

    Returns:
        None.
    """
    y, colnames, n_train, n_test, n_all = prepare.Prepare_0().load_y_colnames()
    # Samples are represented by their row index, not by feature values.
    # NOTE(review): presumably the model loads its own features by index.
    X_all = np.arange(n_all)
    X = X_all[:n_train]

    rd = Model14()
    rd.starter()

    if not submit:
        cv_run(rd, X, y)
        return

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Prepare submission..")
    print("training on full data")
    rd.fit(X, y)
    Xtest = X_all[n_train:]
    pred = rd.predict_proba(Xtest)[:, 1]
    # Imported locally under an alias: inside this function the ``submit``
    # parameter hides any module-level binding of the same name, and a plain
    # ``import submit`` would silently rebind the parameter.
    import submit as submit_module
    submit_module.do_submit(pred)
Пример #5
0
def main(submit=0):
    """Cross-validate ``Model09``, or train it and submit its predictions.

    Args:
        submit: Falsy (default) runs ``cv_run`` on the training indices and
            returns. Truthy fits the model on the training indices, predicts
            the remaining indices and hands column 1 of ``predict_proba`` to
            ``submit.do_submit``.

    Returns:
        None.
    """
    y, colnames, n_train, n_test, n_all = prepare.Prepare_0().load_y_colnames()
    # Samples are represented by their row index, not by feature values.
    # NOTE(review): presumably the model loads its own features by index.
    X_all = np.arange(n_all)
    X = X_all[:n_train]

    rd = Model09()
    rd.starter()

    if not submit:
        cv_run(rd, X, y)
        return

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Prepare submission..")
    print("training on full data")
    rd.fit(X, y)
    Xtest = X_all[n_train:]
    pred = rd.predict_proba(Xtest)[:, 1]
    # Imported locally under an alias: inside this function the ``submit``
    # parameter hides any module-level binding of the same name, and a plain
    # ``import submit`` would silently rebind the parameter.
    import submit as submit_module
    submit_module.do_submit(pred)