Пример #1
0
def main(n_iter, n_folds, smodels, n_jobs=None, stack=0, use_vote=0, gnrl='KNC', 
        modsel=0, rfe=0, psearch=0,
        starter=0, verbose=0, submit=0):
    """Build a stack of sub-models, tune them, then cross-validate or submit.

    Parameters
    ----------
    n_iter, n_folds : search-iteration and CV-fold counts for find_params/cv_run.
    smodels : str like "2+12+10" selecting ModelNN classes by number.
    n_jobs : parallelism passed through to search and CV.
    stack, use_vote, gnrl, modsel, rfe : ModelStack configuration flags.
    psearch : when > 1 with a single model, only its stored best score is updated.
    starter : if truthy, run the stack's starter() precomputation.
    verbose : when > 1, plot CV errors.
    submit : if truthy, fit on all training rows and write a submission.
    """
    y, colnames, n_train, n_test, n_all = prepare.Prepare_0().load_y_colnames()
    # Rows are addressed by integer index masks into the prepared features.
    X_all = np.arange(n_all)

    # Resolve model classes by name instead of eval(): same result, but no
    # code-string evaluation and a clear KeyError on an unknown model number.
    models = [globals()['Model%02d' % int(m)]() for m in smodels.split('+')]
    logger.debug("models:%s", models)
    X = X_all[:n_train]

    logger.info('Find params for models')
    for model in models:
        model.set_params(**find_params(model, X, y, scoring='roc_auc', n_iter=n_iter,
            n_jobs=n_jobs, random_state=random_state+1, psearch=psearch)
        )

    rd = ModelStack(models, gnrl=gnrl, stack=stack, use_vote=use_vote,
        modsel=modsel, rfe=rfe)
    if starter:
        logger.info('Starters start')
        rd.starter()

    if psearch > 1 and len(models) == 1:  # update current model best score
        y_pred, scores = cv_run(rd, X, y, n_folds=n_folds, n_iter=n_iter,
            n_jobs=n_jobs, random_state=random_state+2)
        update_params_best_score(models[0], np.mean(scores))
        return
    elif not submit:
        logger.debug('Cross validation starts')
        y_pred, scores = cv_run(rd, X, y, n_folds=n_folds, n_iter=n_iter,
            n_jobs=n_jobs, random_state=random_state)
        prepare.Prepare_0().dump_ypred_residuals(y, y_pred)
        if verbose > 1:
            plot_errors(X, y, y_pred)
        if stack:
            logger.info("Mean Coefs: %s", rd.mean_coefs())
        return
    else:
        logger.info("Prepare submission..")

    logger.info("training on full data")
    rd.fit(X_all[:n_train], y)
    Xtest = X_all[n_train:]
    pred = rd.predict_proba(Xtest)[:, 1]
    # NOTE: the local module import shadows the `submit` flag from here on.
    import submit
    submit.do_submit(pred)
Пример #2
0
    def fit(self, Xmask, y):
        """Fit the pipeline on model-13 features: transform -> scale -> L1 logistic.

        Xmask is presumably a row mask consumed by Transformer — confirm there.
        Returns self.
        """
        featset = prepare.Prepare_0(model=13,
                                    n_components=128,
                                    min_df=3,
                                    preproc=0,
                                    use_svd=True,
                                    tfidf=2,
                                    stemmer=0)
        frame, _, bp, _params = featset.load_transform(update=False)
        self.names = list(frame.columns)
        self.X_all = np.asarray(frame)

        estimator = lm.LogisticRegression(penalty='l1',
                                          dual=False,
                                          tol=0.00001,
                                          C=0.05,
                                          fit_intercept=True,
                                          intercept_scaling=1.0,
                                          class_weight=None,
                                          random_state=random_state)

        steps = [
            ("trans", Transformer(names=self.names, X_all=self.X_all, BP=bp)),
            ("scaler", StandardScaler(with_mean=True)),
            ("est", estimator),
        ]
        self.rd = Pipeline(steps)
        self.rd.fit(Xmask, np.asarray(y))
        return self
Пример #3
0
    def fit(self, Xmask, y):
        """Fit a Naive Bayes pipeline on model-10 features (no SVD).

        Three NB variants are constructed; the multinomial one is active.
        Returns self.
        """
        featset = prepare.Prepare_0(model=10,
                                    preproc=1,
                                    min_df=1,
                                    use_svd=False,
                                    tfidf=2,
                                    stemmer=0)
        frame, _, bp, _params = featset.load_transform(update=False)
        self.names = list(frame.columns)
        self.X_all = np.asarray(frame)

        candidates = {
            'gauss': GaussianNB(),
            'multi': MultinomialNB(alpha=0.8),
            'bern': BernoulliNB(alpha=1, binarize=0.01),
        }
        chosen = candidates['multi']

        self.rd = Pipeline([
            ("trans", Transformer(names=self.names, X_all=self.X_all, BP=bp)),
            ("est", chosen),
        ])
        self.rd.fit(Xmask, np.asarray(y))
        return self
Пример #4
0
 def _get_featureset(self):
     """Return the feature set for this model: model-14 tfidf, 512-component SVD."""
     settings = dict(model=14,
                     n_components=512,
                     preproc=1,
                     min_df=1,
                     use_svd=True,
                     tfidf=2,
                     stemmer=0)
     return prepare.Prepare_0(**settings)
Пример #5
0
def main(submit=0):
    Xall_df,y = prepare.Prepare_0().load(preproc=0, update=False)
    #Xall_df,y = Xall_df.iloc[:500,:],y[:300]
    lentrain = len(y)
    Xtrain_df = Xall_df.iloc[:lentrain,:]

    clf1 = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, 
                             C=1, fit_intercept=True, intercept_scaling=1.0, 
                             class_weight=None, random_state=random_state)
    clf2 = RandomForestClassifier(n_estimators=200, max_depth=24,
            n_jobs=-1, random_state=random_state, verbose=0)

    clf3 = GradientBoostingClassifier(n_estimators=42, max_depth=24,
            random_state=random_state, verbose=2, subsample=0.9)

    clf4 = svm.SVC(probability=True)

    clf5 = KNeighborsClassifier(n_neighbors=5)

    clf6 = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
           fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
           loss='hinge', n_iter=50, n_jobs=1, penalty='elasticnet', power_t=0.5,
           random_state=random_state, rho=None, shuffle=False, verbose=0,
           warm_start=False)

    clf = clf1
   
    if 0:
        selector = RFECVp(clf,clf, step=10, cv=4, scoring="roc_auc", verbose=2)
        selector = selector.fit( Transformer().fit_transform(Xtrain_df, y), y)
        clf = selector

    rd = Pipeline([
        ("trans", Transformer()),
        #("selector", SelectPercentile(chi2, percentile=90)),
        #("selector", SelectPercentile(f_classif, percentile=50)),
        #("selector", lm.RandomizedLogisticRegression(C=1, random_state=random_state, verbose=1)),
        #("pca", PCA(n_components='mle')),
        #("pca", PCA(n_components=500)),
        #("svd", TruncatedSVD(n_components=200, random_state=random_state )),
        #("lasso",svm.LinearSVC(C=0.5, penalty="l1", dual=False)),
        ("est", clf)
        ])

    
    if not submit:
        cv_run(rd, Xtrain_df, y)
        return
    else:
        print "Prepare submission.."

    print "training on full data"
    rd.fit(Xtrain_df,y)
    Xtest_df = Xall_df.iloc[lentrain:,:]
    pred = rd.predict_proba(Xtest_df)[:,1]
    import submit
    submit.do_submit(pred)
Пример #6
0
    def fit(self, Xmask, y):
        """Fit the active estimator on model-14 features (512-component SVD).

        Xmask is presumably a row mask consumed by Transformer — confirm
        against Transformer's implementation.  Returns self.
        """
        pr = prepare.Prepare_0(model=14,
                               n_components=512,
                               preproc=1,
                               min_df=1,
                               use_svd=True,
                               tfidf=2,
                               stemmer=0)
        (X_all_df, _, BP, params) = pr.load_transform(update=False)
        names = list(X_all_df.columns)
        X_all = np.asarray(X_all_df)
        self.X_all, self.names = X_all, names

        clf1 = lm.LogisticRegression(penalty='l2',
                                     dual=True,
                                     tol=0.00001,
                                     C=1,
                                     fit_intercept=True,
                                     intercept_scaling=1.0,
                                     class_weight=None,
                                     random_state=random_state)

        # Regression estimators adapted to the classifier interface: map the
        # real-valued prediction through a logistic centred at 0.5 and return
        # a two-column array [1 - p, p].
        class LassoCV_proba(lm.LassoCV):
            def predict_proba(self, X):
                print 'alpha_:', self.alpha_
                y = self.predict(X)
                y = 1. / (1 + np.exp(-(y - 0.5)))
                return np.vstack((1 - y, y)).T

        class RidgeCV_proba(lm.RidgeCV):
            def predict_proba(self, X):
                print 'alpha_:', self.alpha_
                y = self.predict(X)
                if 0:  # disabled alternative: min-max rescaling to [0, 1]
                    y_min, y_max = y.min(), y.max()
                    if y_max > y_min:
                        y = (y - y_min) / (y_max - y_min)
                else:
                    y = 1. / (1 + np.exp(-(y - 0.5)))
                return np.vstack((1 - y, y)).T

        clf2 = RidgeCV_proba(alphas=np.linspace(0, 10), cv=4)
        clf3 = LassoCV_proba(alphas=None, cv=4)
        clf4 = svm.SVR(C=3, kernel='linear')

        # Active estimator; the alternatives above are kept for experiments.
        clf = clf1

        self.rd = Pipeline([
            ("trans", Transformer(names=self.names, X_all=X_all, BP=BP)),
            #("scaler",StandardScaler(with_mean=False)),
            #("filter",lm.LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1.0, class_weight=None, random_state=random_state)),
            ("est", clf)
        ])

        self.rd.fit(Xmask, np.asarray(y))
        return self
Пример #7
0
def test():
    y, colnames, n_train, n_test, n_all = prepare.Prepare_0().load_y_colnames()
    X_all = np.arange(n_all)
    X = X_all[:200]
    y = y[:200]
    rd = Model09()
    rd.starter()
    cv_run(rd, X, y)

    print "tests ok"
Пример #8
0
def get_model04_data():
    fname = '../data/model04_data'
    print "get %s" % fname
    try:
        (X_all, names) = joblib.load(fname)
    except:
        X_all_df, y = prepare.Prepare_0().load(preproc=0, update=False)
        names = list(X_all_df.columns)
        X_all = np.asarray(X_all_df)
        joblib.dump((X_all, names), fname)
    return X_all, names
Пример #9
0
def test():
    Xall_df,y = prepare.Prepare_0().load()
    Xall_df,y = Xall_df.iloc[:400,:],y[:200]
    lentrain = len(y)
    
    clf1 = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, 
                             C=1, fit_intercept=True, intercept_scaling=1.0, 
                             class_weight=None, random_state=random_state)
    clf5 = KNeighborsClassifier(n_neighbors=5)
    
    clf= clf5

    rd = Pipeline([
        ("trans", Transformer()),
        ("est", clf)
        ])
    cv_run(rd, Xall_df.iloc[:lentrain,:], y)
    
    print "tests ok"
Пример #10
0
 def _get_featureset(self):
     """Assemble this model's Prepare_0 feature set; vectorizer extras are
     passed through as a JSON string in `extra`."""
     extras = {'ngram_max': self.ngram_max,
               'max_df': self.max_df,
               'binary': self.binary,
               'max_features': self.max_features,
               'use_idf': self.use_idf,
               'smooth_idf': self.smooth_idf,
               'sublinear_tf': self.sublinear_tf,
               'norm': self.norm,
               'token_min': self.token_min,
               'do_remove_stopwords': self.do_remove_stopwords}
     return prepare.Prepare_0(n_components=self.n_components,
                              preproc=self.preproc,
                              min_df=self.min_df,
                              use_svd=self.use_svd,
                              tfidf=self.tfidf,
                              stemmer=self.stemmer,
                              fit_area=self.fit_area,
                              extra=json.dumps(extras))
Пример #11
0
def main(submit=0):
    y, colnames, n_train, n_test, n_all = prepare.Prepare_0().load_y_colnames()
    X_all = np.arange(n_all)
    X = X_all[:n_train]

    rd = Model09()
    rd.starter()

    if not submit:
        cv_run(rd, X, y)
        return
    else:
        print "Prepare submission.."

    print "training on full data"
    rd.fit(X_all[:n_train], y)
    Xtest = X_all[n_train:]
    pred = rd.predict_proba(Xtest)[:, 1]
    import submit
    submit.do_submit(pred)
Пример #12
0
    def fit(self, Xmask, y):
        """Fit an L2 logistic-regression pipeline on model-4 features.

        Returns self.
        """
        frame, _ = prepare.Prepare_0(model=4).load(preproc=0, update=False)
        self.names = list(frame.columns)
        self.X_all = np.asarray(frame)

        estimator = lm.LogisticRegression(penalty='l2',
                                          dual=True,
                                          tol=0.0001,
                                          C=1,
                                          fit_intercept=True,
                                          intercept_scaling=1.0,
                                          class_weight=None,
                                          random_state=random_state)

        self.rd = Pipeline([("trans", Transformer(names=self.names)),
                            ("scaler", StandardScaler(with_mean=False)),
                            ("est", estimator)])

        # m() presumably selects the Xmask rows out of X_all — confirm
        # against its definition elsewhere in the project.
        self.rd.fit(m(self.X_all, Xmask), np.asarray(y))
        return self