def make_mf_regression(X, y, clf, qid, X_test, n_round=3):
    """
    Build a regression metafeature via repeated out-of-fold prediction.

    @clf is assumed to be a regressor. For each of n_round repetitions the
    training rows are split into 2 stratified folds on qid; out-of-fold
    predictions accumulate into mf_tr, and test-set predictions (two folds
    per round, hence the 0.5 factor) accumulate into mf_te.

    Returns (train_metafeature, test_metafeature), each averaged over rounds.
    """
    # converted from Python 2 print statements for consistency with the
    # Python 3 functions in this file
    print(clf)
    mf_tr = np.zeros(X.shape[0])
    mf_te = np.zeros(X_test.shape[0])
    for i in range(n_round):
        # deterministic but distinct split per round
        skf = StratifiedKFold(qid, n_folds=2, shuffle=True, random_state=42 + i * 1000)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]

            y_tr = y[ind_tr]
            y_te = y[ind_te]

            clf.fit(X_tr, y_tr)
            # each train row is predicted exactly once per round
            mf_tr[ind_te] += clf.predict(X_te)
            # two folds per round -> each fold contributes half
            mf_te += clf.predict(X_test) * 0.5

            y_pred = np.round(clf.predict(X_te))
            kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred)
            acc = np.mean(y_te == y_pred)
            print('pred[{}] kappa:{}, acc:{}'.format(i, kappa, acc))
    return (mf_tr / n_round, mf_te / n_round)
# Example #2
def LogR(C=1):
    """Train a logistic-regression baseline on the module-level split,
    report test kappa/accuracy and train accuracy, and return the
    train-set predictions."""
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression(
        C=C,
        penalty="l2",
        dual=False,
        tol=1e-5,
        fit_intercept=True,
        intercept_scaling=1.0,
        class_weight='balanced',
        n_jobs=-1,
        max_iter=10000,
        solver="lbfgs",
    )
    model.fit(x_train, y_train)

    # test-set evaluation
    test_pred = list(model.predict(x_test))
    qwk = quadratic_weighted_kappa(y_test, test_pred)
    print("kappa", qwk)
    print("Log  验证集 Acc = ", calcAcc(test_pred, y_test))
    #################################################
    # train-set accuracy; these predictions are also the return value
    y_pred = list(model.predict(x_train))
    print("Log  训练集 Acc = ", calcAcc(y_pred, y_train))

    return y_pred
# Example #3
def RF():
    """Fit a RandomForest classifier on the module-level train split and
    print test-set kappa and accuracy."""
    from sklearn.ensemble import RandomForestClassifier

    # 15000 trees, unlimited depth, all cores
    clf = RandomForestClassifier(
        n_estimators=15000,
        max_features="auto",
        max_depth=None,
        n_jobs=-1,
    ).fit(x_train, y_train)

    preds = list(clf.predict(x_test))

    qwk = quadratic_weighted_kappa(y_test, preds)
    print("kappa", qwk)
    print("RF Acc = ", calcAcc(preds, y_test))
def make_mf_classification2(X ,y, clf, qid, X_test, n_round=3):
    """
    Build a 2-class metafeature via repeated out-of-fold prediction.

    @clf is assumed to be a binary classifier. Out-of-fold class
    probabilities fill both columns of mf_tr/mf_te; classifiers without
    predict_proba fall back to decision_function, stored in column 0 only.

    Returns (train_metafeature, test_metafeature), averaged over rounds.
    """
    print(clf)
    mf_tr = np.zeros((X.shape[0], 2))
    mf_te = np.zeros((X_test.shape[0], 2))
    for i in range(n_round):
        skf = StratifiedKFold(qid, n_folds=2, shuffle=True, random_state=42+i*1000)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]

            y_tr = y[ind_tr]
            y_te = y[ind_te]

            clf.fit(X_tr, y_tr)
            # narrowed from a bare except: only the missing-predict_proba
            # case should trigger the decision_function fallback
            try:
                mf_tr[ind_te] += clf.predict_proba(X_te)
                mf_te += clf.predict_proba(X_test)*0.5
            except AttributeError:
                mf_tr[ind_te, 0] += clf.decision_function(X_te)
                mf_te[:, 0] += clf.decision_function(X_test)*0.5

            y_pred = np.round(clf.predict(X_te))
            kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred)
            acc = np.mean(y_te == y_pred)
            print('prob[{}] kappa:{}, acc:{}'.format(i, kappa, acc))
    print()
    return (mf_tr / n_round, mf_te / n_round)
def make_mf_classification4(X ,y, clf, qid, X_test, n_round=3):
    """
    Build a multiclass (4-class) metafeature via repeated out-of-fold
    prediction. Columns 0-3 hold per-class probabilities (or
    decision_function margins when predict_proba is unavailable);
    column 4 holds the raw predicted label.

    Returns (train_metafeature, test_metafeature), averaged over rounds.
    """
    print(clf)
    mf_tr = np.zeros((X.shape[0], 5))
    mf_te = np.zeros((X_test.shape[0], 5))
    for i in range(n_round):
        skf = StratifiedKFold(qid, n_folds=2, shuffle=True, random_state=42+i*1000)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]

            y_tr = y[ind_tr]
            y_te = y[ind_te]

            clf.fit(X_tr, y_tr)
            # column 4: raw label prediction (two folds per round -> *0.5 on test)
            mf_tr[ind_te, 4] += clf.predict(X_te)
            mf_te[:, 4] += clf.predict(X_test)*0.5
            # narrowed from a bare except: only the missing-predict_proba
            # case should trigger the decision_function fallback
            try:
                mf_tr[ind_te, :4] += clf.predict_proba(X_te)
                mf_te[:, :4] += clf.predict_proba(X_test)*0.5
            except AttributeError:
                mf_tr[ind_te, :4] += clf.decision_function(X_te)
                mf_te[:,:4] += clf.decision_function(X_test)*0.5
            y_pred = np.round(clf.predict(X_te))
            kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred)
            acc = np.mean(y_te == y_pred)
            print('prob[{}] kappa:{}, acc:{}'.format(i, kappa, acc))
    print()
    return (mf_tr / n_round, mf_te / n_round)
# Example #6
def make_mf_classification2(X, y, clf, qid, X_test, n_round=3):
    """
    Build a 2-class metafeature via repeated out-of-fold prediction.

    @clf is assumed to be a binary classifier. Out-of-fold class
    probabilities fill both columns of mf_tr/mf_te; classifiers without
    predict_proba fall back to decision_function, stored in column 0 only.

    Returns (train_metafeature, test_metafeature), averaged over rounds.
    """
    print(clf)
    mf_tr = np.zeros((X.shape[0], 2))
    mf_te = np.zeros((X_test.shape[0], 2))
    for i in range(n_round):
        skf = StratifiedKFold(qid,
                              n_folds=2,
                              shuffle=True,
                              random_state=42 + i * 1000)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]

            y_tr = y[ind_tr]
            y_te = y[ind_te]

            clf.fit(X_tr, y_tr)
            # narrowed from a bare except: only the missing-predict_proba
            # case should trigger the decision_function fallback
            try:
                mf_tr[ind_te] += clf.predict_proba(X_te)
                mf_te += clf.predict_proba(X_test) * 0.5
            except AttributeError:
                mf_tr[ind_te, 0] += clf.decision_function(X_te)
                mf_te[:, 0] += clf.decision_function(X_test) * 0.5

            y_pred = np.round(clf.predict(X_te))
            kappa = pykappa.quadratic_weighted_kappa(y_te, y_pred)
            acc = np.mean(y_te == y_pred)
            print('prob[{}] kappa:{}, acc:{}'.format(i, kappa, acc))
    print()
    return (mf_tr / n_round, mf_te / n_round)
def make_mf_sliced_classification(subset_tr, subset_te, clf, n_round=3, target_col='median_relevance'):
    """
    Per-query sliced metafeature. For each distinct query_stem, build a
    bag-of-words over the slice's titles keeping only terms present in
    BOTH train and test slices, append the global feat_list columns, then
    fit @clf with repeated 2-fold CV to produce out-of-fold train
    predictions and averaged test predictions.

    Returns (train_metafeature, test_metafeature) aligned with the inputs.
    """
    # converted from Python 2 print statements for consistency with the
    # Python 3 functions in this file
    print('\n [make_mf_slice]')
    print(clf)
    mf_tr = np.zeros(len(subset_tr))
    mf_te = np.zeros(len(subset_te))

    #query-slice
    for cur_query in subset_tr.query_stem.value_counts().index:
        mask_tr = subset_tr.query_stem == cur_query
        mask_te = subset_te.query_stem == cur_query

        # build BoW on the union of train+test titles for this query
        vect = CountVectorizer(min_df=1, ngram_range=(1,2))

        txts = (list((subset_tr[mask_tr]['title_stem']).values) + 
                list((subset_te[mask_te]['title_stem']).values))
        vect.fit(txts)

        X_loc_base = vect.transform(list((subset_tr[mask_tr]['title_stem']).values)).todense()
        X_loc_hold = vect.transform(list((subset_te[mask_te]['title_stem']).values)).todense()
        y_loc_train = subset_tr[mask_tr][target_col].values
        # intersect terms: keep only those occurring in both slices
        feat_counts = np.array(np.sum(X_loc_base, axis=0))[0] * np.array(np.sum(X_loc_hold, axis=0))[0]
        feat_mask = np.where(feat_counts>0)[0]
        # final feature matrices: intersected BoW + global feat_list columns
        X_loc_base = np.hstack((X_loc_base[:, feat_mask], subset_tr[mask_tr][feat_list]))
        X_loc_hold = np.hstack((X_loc_hold[:, feat_mask], subset_te[mask_te][feat_list]))

        # per-slice metafeature accumulators
        tmp_tr = np.zeros(sum(mask_tr))
        tmp_te = np.zeros(sum(mask_te))

        for i in range(n_round):
            kf = KFold(len(y_loc_train), n_folds=2, shuffle=True, random_state=42+i*1000)
            for ind_tr, ind_te in kf:
                X_tr = X_loc_base[ind_tr]
                X_te = X_loc_base[ind_te]
                y_tr = y_loc_train[ind_tr]
                y_te = y_loc_train[ind_te]

                clf.fit(X_tr, y_tr)
                tmp_tr[ind_te] += clf.predict(X_te)
                # two folds per round -> average the hold-out prediction
                tmp_te += clf.predict(X_loc_hold)*0.5
        mf_tr[mask_tr.values] = tmp_tr / n_round
        mf_te[mask_te.values] = tmp_te / n_round

    y_valid = subset_tr[target_col].values
    kappa = pykappa.quadratic_weighted_kappa(y_valid, np.round(mf_tr))
    acc = np.mean(y_valid == np.round(mf_tr))
    # NOTE(review): i here is simply the last round index from the final slice
    print('[{}] kappa:{}, acc:{}'.format(i, kappa, acc))
    return (mf_tr, mf_te)
# Example #8
def svr():
    """Fit an RBF-kernel SVR on the module-level train split (sample
    weighted), round predictions via the CDF scheme, and print kappa."""
    model = SVR(C=4.0, gamma=0.2, cache_size=2048, kernel='rbf')
    model.fit(x_train, y_train, sample_weight=getWeights())

    raw = list(model.predict(x_test))
    rounded = rounding_cdf(raw)

    qwk = quadratic_weighted_kappa(y_test, rounded)
    print("SVR Kappa:", qwk)
# Example #9
def ridge(alpha=1.0):
    """Fit a Ridge regressor (sample weighted), round predictions via the
    CDF scheme, and print the kappa score."""
    from sklearn.linear_model import Ridge

    model = Ridge(alpha=alpha, normalize=False)
    model.fit(x_train, y_train, sample_weight=getWeights())

    raw = list(model.predict(x_test))
    rounded = rounding_cdf(raw)

    qwk = quadratic_weighted_kappa(y_test, rounded)
    print("RidgeRegression Kappa:", qwk)
# Example #10
def LinearR():
    """Fit an ordinary least-squares regressor (sample weighted), round
    predictions via the CDF scheme, and print the kappa score."""
    from sklearn.linear_model import LinearRegression

    model = LinearRegression(n_jobs=-1)
    model.fit(x_train, y_train, sample_weight=getWeights())

    raw = list(model.predict(x_test))
    rounded = rounding_cdf(raw)

    qwk = quadratic_weighted_kappa(y_test, rounded)
    print("LinearRegresion Kappa:", qwk)
# Example #11
def Lasso(alpha=1.0):
    """Fit a Lasso regressor, round predictions via the CDF scheme, and
    print kappa and accuracy on the test split."""
    # aliased to avoid shadowing this function's own name inside the body
    from sklearn.linear_model import Lasso as LassoModel

    model = LassoModel(alpha=alpha, normalize=False)
    model.fit(x_train, y_train)

    raw = list(model.predict(x_test))
    rounded = rounding_cdf(raw)

    qwk = quadratic_weighted_kappa(y_test, rounded)
    print("kappa", qwk)
    print("LASSO rounding cdf Acc = ", calcAcc(rounded, y_test))
# Example #12
def validate(n_epochs, n_models, n_steps=5, activations=False):
    """Train and evaluate an ensemble of n_models neural networks.

    Each model is trained in n_steps-epoch chunks up to n_epochs, printing
    per-chunk test accuracy/kappa; model outputs are cumulatively blended
    and the blend's accuracy, kappa and confusion matrix are reported
    after each model. With activations=True, intermediate activations for
    train and test are dumped to per-model HDF5 feature files.

    Relies on module-level globals: net_type ('softmax' or 'regression'),
    constants, features_NN_prefix, model_factory, preprocess_labels,
    pick_activations, fit2distribution.
    """
    with h5py.File(constants.train_features_scaled_strat_file, "r") as fi:
        # only the first 60k rows are used for training
        labels_train = fi.get("y_train")[:60000]
        X_train = fi.get("X_train")[:60000]
        y_train, _ = preprocess_labels(labels_train,
                                       categorical=(net_type == 'softmax'))

        labels_test = fi.get("y_test")[()]
        X_test = fi.get("X_test")[()]
        y_test, _ = preprocess_labels(labels_test,
                                      categorical=(net_type == 'softmax'))

        # squash labels 0..4 into [0.5, 0.9]; inverted below via floor
        y_train = y_train / 5.0 / 2 + 0.5
        y_test = y_test / 5.0 / 2 + 0.5

        # NOTE(review): n_classes is unbound if net_type is neither value
        if net_type == 'softmax':
            n_classes = y_train.shape[1]
        elif net_type == 'regression':
            n_classes = 1
        print(n_classes, 'classes')

        n_dims = X_train.shape[1]
        print(n_dims, 'dims')

        cum_blend = 0
        models = range(1, n_models + 1)
        for i in models:
            print("\n-------------- Model %d --------------\n" % i)

            model = model_factory(n_classes, n_dims, net_type)
            for n in range(0, n_epochs, n_steps):
                model.fit(X_train,
                          y_train,
                          nb_epoch=n_steps,
                          batch_size=128,
                          show_accuracy=False,
                          verbose=2)  #, validation_data=(X_test, y_test))

                # validate individual net
                if net_type == 'softmax':
                    y_pred = model.predict_classes(X_test, verbose=0)
                elif net_type == 'regression':
                    # invert the [0.5, 0.9] scaling and clip to labels 0..4
                    y_pred = model.predict(X_test, verbose=0)
                    y_pred = np.floor((y_pred - 0.5) * 2 * 5.0).flatten()
                    y_pred[y_pred < 0] = 0
                    y_pred[y_pred > 4] = 4

                print('Epoch: %d. Accuracy: %0.2f%%. Kappa: %0.2f' %
                      (n + n_steps, 100 * accuracy_score(labels_test, y_pred),
                       quadratic_weighted_kappa(labels_test, y_pred)))

            # validate ensemble: cumulative blend over the i models so far
            if net_type == 'softmax':
                cum_blend += model.predict_proba(X_test, verbose=0)
                y_pred = np.argmax(cum_blend, axis=1)
            elif net_type == 'regression':
                cum_blend += model.predict(X_test, verbose=0)
                y_pred = np.floor((cum_blend / i - 0.5) * 2 * 5.0).flatten()
                y_pred[y_pred < 0] = 0
                y_pred[y_pred > 4] = 4

            print('\nBlend %d. Accuracy: %0.2f%%. Kappa: %0.2f' %
                  (i, 100 * accuracy_score(labels_test, y_pred),
                   quadratic_weighted_kappa(labels_test, y_pred)))
            print('Confusion matrix:\n', confusion_matrix(labels_test, y_pred))

            # distribution-matched rounding of the blended scores
            fitted = fit2distribution(labels_test, cum_blend)
            print('\nFitted. Accuracy: %0.2f%%. Kappa: %0.2f' %
                  (100 * accuracy_score(labels_test, fitted),
                   quadratic_weighted_kappa(labels_test, fitted)))
            print('Confusion matrix:\n', confusion_matrix(labels_test, fitted))

            if activations:
                F_train = pick_activations(model, X_train, net_type)
                F_test = pick_activations(model, X_test, net_type)

                fout = os.path.join(
                    constants.features_NN_dir,
                    features_NN_prefix + format(i, '02d') + '.hd5')
                with h5py.File(fout, "w") as fo:
                    fo.create_dataset("X_train", data=F_train)
                    fo.create_dataset("y_train", data=labels_train)
                    fo.create_dataset("X_test", data=F_test)
                    fo.create_dataset("y_test", data=labels_test)

                # read-back sanity check of the freshly written feature file
                with h5py.File(fout, "r") as fi:
                    X = fi.get("X_train")
                    y = fi.get("y_train")
                    XX = fi.get("X_test")
                    yy = fi.get("y_test")
                    print(X.shape, y.shape, XX.shape, yy.shape)
# Example #13
def func(data):
    """Quadratic weighted kappa between the first two columns of data."""
    return quadratic_weighted_kappa(data[:, 0], data[:, 1])
# Example #14
def validate(n_epochs, n_models, n_steps=5, activations=False):
    """Train and evaluate an ensemble of n_models neural networks.

    Duplicate of the validate() defined earlier in this file (unformatted
    variant). Each model is trained in n_steps-epoch chunks up to
    n_epochs, printing per-chunk test accuracy/kappa; model outputs are
    cumulatively blended and the blend's metrics printed after each model.
    With activations=True, intermediate activations are dumped to
    per-model HDF5 feature files.

    Relies on module-level globals: net_type ('softmax' or 'regression'),
    constants, features_NN_prefix, model_factory, preprocess_labels,
    pick_activations, fit2distribution.
    """
    with h5py.File(constants.train_features_scaled_strat_file, "r") as fi:
        # only the first 60k rows are used for training
        labels_train = fi.get("y_train")[:60000]
        X_train = fi.get("X_train")[:60000]
        y_train, _ = preprocess_labels(labels_train, categorical=(net_type=='softmax'))
        
        labels_test = fi.get("y_test")[()]
        X_test = fi.get("X_test")[()]
        y_test, _ = preprocess_labels(labels_test, categorical=(net_type=='softmax'))
    
        # squash labels 0..4 into [0.5, 0.9]; inverted below via floor
        y_train = y_train/5.0/2+0.5
        y_test = y_test/5.0/2+0.5
    
        # NOTE(review): n_classes is unbound if net_type is neither value
        if net_type == 'softmax':
            n_classes = y_train.shape[1]
        elif net_type == 'regression':
            n_classes = 1
        print(n_classes, 'classes')
        
        n_dims = X_train.shape[1]
        print(n_dims, 'dims')
    
        cum_blend = 0
        models = range(1, n_models+1)
        for i in models:
            print("\n-------------- Model %d --------------\n" % i)
        
            model = model_factory(n_classes, n_dims, net_type)
            for n in range(0, n_epochs, n_steps):
                model.fit(X_train, y_train, nb_epoch=n_steps, batch_size=128,
                          show_accuracy=False, verbose=2)#, validation_data=(X_test, y_test))
    
                # validate individual net
                if net_type == 'softmax':
                    y_pred = model.predict_classes(X_test, verbose=0)
                elif net_type == 'regression':
                    # invert the [0.5, 0.9] scaling and clip to labels 0..4
                    y_pred = model.predict(X_test, verbose=0)
                    y_pred = np.floor((y_pred-0.5)*2*5.0).flatten()
                    y_pred[y_pred<0] = 0
                    y_pred[y_pred>4] = 4
    
                print('Epoch: %d. Accuracy: %0.2f%%. Kappa: %0.2f' %
                (n+n_steps,
                 100 * accuracy_score(labels_test, y_pred),
                 quadratic_weighted_kappa(labels_test, y_pred)))
            
    
            # validate ensemble: cumulative blend over the i models so far
            if net_type == 'softmax':
                cum_blend += model.predict_proba(X_test, verbose=0)
                y_pred = np.argmax(cum_blend, axis=1)
            elif net_type == 'regression':
                cum_blend += model.predict(X_test, verbose=0)
                y_pred = np.floor((cum_blend/i-0.5)*2*5.0).flatten()
                y_pred[y_pred<0] = 0
                y_pred[y_pred>4] = 4
    
            print('\nBlend %d. Accuracy: %0.2f%%. Kappa: %0.2f' %
            (i, 100 * accuracy_score(labels_test, y_pred),
             quadratic_weighted_kappa(labels_test, y_pred)))
            print('Confusion matrix:\n', confusion_matrix(labels_test, y_pred))
            
            # distribution-matched rounding of the blended scores
            fitted = fit2distribution(labels_test, cum_blend)
            print('\nFitted. Accuracy: %0.2f%%. Kappa: %0.2f' %
            (100 * accuracy_score(labels_test, fitted),
            quadratic_weighted_kappa(labels_test, fitted)))
            print('Confusion matrix:\n', confusion_matrix(labels_test, fitted))
            
            if activations:
                F_train = pick_activations(model, X_train, net_type)
                F_test = pick_activations(model, X_test, net_type)
                
                fout = os.path.join(constants.features_NN_dir,
                                    features_NN_prefix + format(i,'02d') +'.hd5')
                with h5py.File(fout, "w") as fo:
                    fo.create_dataset("X_train", data=F_train)
                    fo.create_dataset("y_train", data=labels_train)
                    fo.create_dataset("X_test", data=F_test)
                    fo.create_dataset("y_test", data=labels_test)
                    
                # read-back sanity check of the freshly written feature file
                with h5py.File(fout, "r") as fi:
                    X = fi.get("X_train")
                    y = fi.get("y_train")
                    XX = fi.get("X_test")
                    yy = fi.get("y_test")
                    print(X.shape, y.shape, XX.shape, yy.shape)
# Example #15
def make_mf_sliced_classification(subset_tr,
                                  subset_te,
                                  clf,
                                  n_round=3,
                                  target_col='median_relevance'):
    """
    Per-query sliced metafeature (variant of the earlier definition that
    uses the 'title_ext' column). For each distinct query_stem, build a
    bag-of-words over the slice's extended titles keeping only terms
    present in BOTH train and test slices, append the global feat_list
    columns, then fit @clf with repeated 2-fold CV to produce out-of-fold
    train predictions and averaged test predictions.

    Returns (train_metafeature, test_metafeature) aligned with the inputs.
    """
    # converted from Python 2 print statements for consistency with the
    # Python 3 functions in this file
    print('\n [make_mf_slice]')
    print(clf)
    mf_tr = np.zeros(len(subset_tr))
    mf_te = np.zeros(len(subset_te))

    #query-slice
    for cur_query in subset_tr.query_stem.value_counts().index:
        mask_tr = subset_tr.query_stem == cur_query
        mask_te = subset_te.query_stem == cur_query

        # build BoW on the union of train+test titles for this query
        vect = CountVectorizer(min_df=1, ngram_range=(1, 2))

        txts = (list((subset_tr[mask_tr]['title_ext']).values) + list(
            (subset_te[mask_te]['title_ext']).values))
        vect.fit(txts)

        X_loc_base = vect.transform(
            list((subset_tr[mask_tr]['title_ext']).values)).todense()
        X_loc_hold = vect.transform(
            list((subset_te[mask_te]['title_ext']).values)).todense()
        y_loc_train = subset_tr[mask_tr][target_col].values
        # intersect terms: keep only those occurring in both slices
        feat_counts = np.array(np.sum(X_loc_base, axis=0))[0] * np.array(
            np.sum(X_loc_hold, axis=0))[0]
        feat_mask = np.where(feat_counts > 0)[0]
        # final feature matrices: intersected BoW + global feat_list columns
        X_loc_base = np.hstack(
            (X_loc_base[:, feat_mask], subset_tr[mask_tr][feat_list]))
        X_loc_hold = np.hstack(
            (X_loc_hold[:, feat_mask], subset_te[mask_te][feat_list]))

        # per-slice metafeature accumulators
        tmp_tr = np.zeros(sum(mask_tr))
        tmp_te = np.zeros(sum(mask_te))

        for i in range(n_round):
            kf = KFold(len(y_loc_train),
                       n_folds=2,
                       shuffle=True,
                       random_state=42 + i * 1000)
            for ind_tr, ind_te in kf:
                X_tr = X_loc_base[ind_tr]
                X_te = X_loc_base[ind_te]
                y_tr = y_loc_train[ind_tr]
                y_te = y_loc_train[ind_te]

                clf.fit(X_tr, y_tr)
                tmp_tr[ind_te] += clf.predict(X_te)
                # two folds per round -> average the hold-out prediction
                tmp_te += clf.predict(X_loc_hold) * 0.5
        mf_tr[mask_tr.values] = tmp_tr / n_round
        mf_te[mask_te.values] = tmp_te / n_round

    y_valid = subset_tr[target_col].values
    kappa = pykappa.quadratic_weighted_kappa(y_valid, np.round(mf_tr))
    acc = np.mean(y_valid == np.round(mf_tr))
    # NOTE(review): i here is simply the last round index from the final slice
    print('[{}] kappa:{}, acc:{}'.format(i, kappa, acc))
    return (mf_tr, mf_te)