Example #1
File: main.py Project: DeathCoil/kaggle
def make_predictions(load_list=[]):
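    # Blend two seed-averaged XGBoost ensembles (the second isotonically
    # calibrated) with an ExtraTrees model, all trained on the raw features
    # stacked with level-1 meta-features.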
    
    seed_list = [1234, 2345, 3456, 4567, 5678, 6789, 7890, 8901, 9012, 123]
    mcw_list = [1, 1, 1, 1, 1, 8, 8, 8, 8, 8]
    seed_list2 = [1242, 5432]
    mcw_list2 = [1, 8]
    
    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1)
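    # Stack the level-1 out-of-fold predictions (Example #7) onto the raw features.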
    train_meta, test_meta = train_level1(load_list=["xgb1", "xgb2", "knn", "ext1", "ext2", "rf1", "rf2"])
    train = np.column_stack((train, train_meta))
    test = np.column_stack((test, test_meta))
    
    print "Training model1..."
    pred1 = np.zeros(len(test))
    for i in range(10):
        if "xgb1" not in load_list:
            model1 = xgb.XGBClassifier(n_estimators=550, learning_rate=0.01, max_depth=6, colsample_bytree=0.95,
                                       subsample=1, min_child_weight=mcw_list[i], seed=seed_list[i])
            model1.fit(train, target)
            pickle.dump(model1, open("final_models/xgb/1/xgb_n_"+str(i)+".pkl", "wb"))
        else:
            model1 = pickle.load(open("final_models/xgb/1/xgb_n_"+str(i)+".pkl", "rb"))
        pred1 += model1.predict_proba(test)[:, 1]
    pred1 /= len(seed_list)

    print "Training model2..."
    pred2 = np.zeros(len(test))
    for i in range(2):
        if "xgb2" not in load_list:
            model2 = xgb.XGBClassifier(n_estimators=550, learning_rate=0.01, max_depth=6, colsample_bytree=0.95,
                                       subsample=1, min_child_weight=mcw_list2[i], seed=seed_list2[i])
            model2 = sklearn.calibration.CalibratedClassifierCV(model2, method="isotonic", cv=10)
            model2.fit(train, target)
            pickle.dump(model2, open("final_models/xgb/2/xgb_n_"+str(i)+".pkl", "wb"))
        else:
            model2 = pickle.load(open("final_models/xgb/2/xgb_n_"+str(i)+".pkl", "rb"))
        pred2 += model2.predict_proba(test)[:, 1]
    pred2 /= len(seed_list2)

    print "Training model4..."
    pred4 = np.zeros(len(test))
    if "ext1" not in load_list:
        model4 = sklearn.ensemble.ExtraTreesClassifier(n_estimators=1000, max_features=50, criterion='entropy',
                                                       min_samples_split=4, max_depth=35, min_samples_leaf=2,
                                                       n_jobs=-1, random_state=1234)
        model4.fit(train, target)
        pred4 = model4.predict_proba(test)[:, 1]        
        pred4.dump("final_models/ext/ext1_pred")
    else:
        pred4 = np.load("final_models/ext/ext1_pred")
            
    result = 0.7*np.sqrt(pred1*pred2) + 0.3*pred4
    
    return result
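
A note on the final blend: result mixes the geometric mean of the two XGBoost ensembles with the ExtraTrees prediction. A minimal sketch of just that step (the probability values below are made up for illustration; the 0.7/0.3 weights come from the code above):

import numpy as np

pred1 = np.array([0.9, 0.2, 0.6])  # illustrative: seed-averaged xgb1 probabilities
pred2 = np.array([0.8, 0.3, 0.5])  # illustrative: calibrated xgb2 probabilities
pred4 = np.array([0.7, 0.1, 0.4])  # illustrative: ExtraTrees probabilities

# The geometric mean pulls the two XGBoost scores toward agreement before
# the linear mix with the ExtraTrees score.
result = 0.7*np.sqrt(pred1*pred2) + 0.3*pred4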
Example #2
File: main.py Project: DeathCoil/kaggle
def main():
    
    
    train, test, target, test_index = io.load_data()  
    train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1)

    #tuning.parametr_tuning(train, target, param_grid={})
    #tuning.ensemble_tuning(train, test, target, load_list=["xgb1"])
    
    result = make_predictions(load_list=["xgb1", "xgb2", "ext1"])
    io.save_result(test_index, result)
Example #3
File: main.py Project: DeathCoil/kaggle
def main():
    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target)

    #tuning.tune_xgboost(train, target, load_list=[])
    #tuning.parametr_tuning(train, target, param_grid={})
    #tuning.ensemble_tuning(train, target, load_list=[])

    model = sklearn.ensemble.RandomForestClassifier(n_estimators=2000, max_depth=8, criterion="entropy", bootstrap=False,
                                                    min_samples_leaf=4, min_samples_split=2, random_state=1234)

    model.fit(train, target)
    result = model.predict_proba(test)[:, 1]

    """
    result = make_predictions(train, target, test, load_list=["rf_entropy", "xgb"])
    """
    io.save_result(test_index, result)
Example #4
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD
from input_output import load_data
import numpy as np
from keras.utils import np_utils

DATAFILE = "SEQC_NB_batchCorr_tr_FAV_TopFeats.txt"
LABELSFILE = "SEQC_NB_batchCorr_tr_FAV.lab"
sample_names_tr, var_names_tr, X_train = load_data(DATAFILE)
y_tr = np.loadtxt(LABELSFILE, dtype=np.int)
n_classes = np.max(y_tr) + 1
y_train = np_utils.to_categorical(y_tr, n_classes)

model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], init='uniform'))
model.add(Activation('tanh'))
model.add(Dense(64, init='uniform'))
model.add(Activation('tanh'))
model.add(Dense(2, init='uniform'))
model.add(Activation('softmax'))

sgd = SGD(lr=0.003)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
model.fit(X_train, y_train, nb_epoch=200, batch_size=16)
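
To turn the fitted network's outputs back into class labels (not part of the original snippet; X_new is a placeholder for held-out data with the same columns as X_train):

proba = model.predict(X_new)    # softmax class probabilities, shape (n, 2)
y_pred = proba.argmax(axis=-1)  # undo the one-hot encoding of y_train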
Example #5
RANK_METHOD = 'randomForest'

# number of CV folds
#CV_K = 5
# number of CV cycles
#CV_N = 10
# number of Montecarlo CV cycles (for SVM tuning)
TUN_CV_K = 10
# percentage of the dataset to hold out as the test split (for SVM tuning)
TUN_CV_P = 50
# list of C values for SVM tuning
TUN_SVM_C = [10**k for k in np.arange(-7, 5)]
# maximum count of trying k-fold data selection
KFOLD_TRY = 100

sample_names, var_names, x = load_data(DATAFILE)
y = np.loadtxt(LABELSFILE, dtype=np.int)

# build FSTEPS according to dataset size
nfeat = x.shape[1]
n_ord = np.int(np.log10(nfeat))  # order of magnitude; renamed from 'ord', which shadows the builtin
fs = np.empty(0, dtype=np.int)
for p in range(n_ord + 1):
    fs = np.concatenate((fs, 10**p * np.arange(10)))
fs = np.unique(fs)[1:]
# cap FSTEPS at 10000 features, if applicable
FLIM = 10000 if nfeat>10000 else nfeat
FSTEPS = fs[ fs <= FLIM ].tolist() + [nfeat]
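# Worked example: nfeat = 1234 -> FSTEPS = [1, ..., 9, 10, 20, ..., 90, 100, ..., 900, 1000, 1234]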

### FSTEPS = range(1,10) + range(10, 100, 10) + range(100, 1000, 100) + range(1000, 10000, 1000) + [10000] + [x.shape[1]]
Example #6
File: main.py Project: DeathCoil/kaggle
                               update_momentum=0.9, eval_size=0.01, verbose=0,
                               max_epochs=100, use_label_encoder=True)

            model3.fit(train, target)
            pickle.dump(model3, open("final_models/nn/nn_n_"+str(i)+".pkl", "wb"))
        else:
            model3 = pickle.load(open("final_models/nn/nn_n_"+str(i)+".pkl", "rb"))
        pred3 += model3.predict_proba(test)[:, 1]
    pred3 /= 10


    if ranking:
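        # Rank-transforming makes the weighted blend depend only on each
        # model's ordering of the samples, not on its calibration.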
        pred1 = scipy.stats.rankdata(pred1)
        pred2 = scipy.stats.rankdata(pred2)
        pred3 = scipy.stats.rankdata(pred3)

    result = 0.21*pred1 + 0.47*pred2 + 0.32*pred3

    return result


train, test, target, test_index = io.load_data()
train, test, target = fe.preprocess_data(train, test, target)

tuning.parametr_tuning(train, target, param_grid={"alpha": [0.01]})
#tuning.ensemble_tuning(train, target, ranking=True, load_list=["linear", "xgb"])

"""
result = make_predictions(train, target, test, ranking=True, load_list=["linear", "xgb", "nn"])
io.save_result(test_index, result)
"""
Example #7
File: main.py Project: DeathCoil/kaggle
def train_level1(load_list=[]):
    print("Training xgb1...")
    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1)

    N_FOLDS = 10
    cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234)
    train_xgb1 = np.zeros(train.shape[0])
    test_xgb1 = np.zeros(test.shape[0])
    for fold, (train_index, test_index) in enumerate(cv):
        if "xgb1" not in load_list:
            model = xgb.XGBClassifier(n_estimators=180, learning_rate=0.05, max_depth=11, colsample_bytree=0.8,
                                      subsample=0.96, min_child_weight=4, seed=1234)
            model.fit(train[train_index], target[train_index])
            pickle.dump(model, open("level1/xgb/1/xgb_1_fold_"+str(fold)+".pkl", "wb"))
        else:
            model = pickle.load(open("level1/xgb/1/xgb_1_fold_"+str(fold)+".pkl", "rb"))
        train_xgb1[test_index] = model.predict_proba(train[test_index])[:, 1]
        test_xgb1 += model.predict_proba(test)[:, 1]/N_FOLDS

    train_meta = train_xgb1.reshape((train_xgb1.shape[0], 1))
    test_meta = test_xgb1.reshape((test_xgb1.shape[0], 1)) 
    
    print "Training xgb2..."
    train, test, target, test_index = io.load_data()
    train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1)

    N_FOLDS = 10
    cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234)
    train_xgb2 = np.zeros(train.shape[0])
    test_xgb2 = np.zeros(test.shape[0])
    for fold, (train_index, test_index) in enumerate(cv):
        if "xgb2" not in load_list:
            model = xgb.XGBRegressor(n_estimators=600, learning_rate=0.02, max_depth=9, colsample_bytree=1,
                                     subsample=1, min_child_weight=1, seed=1234)
            model.fit(train[train_index], target[train_index])
            pickle.dump(model, open("level1/xgb/2/xgb_1_fold_"+str(fold)+".pkl", "wb"))
        else:
            model = pickle.load(open("level1/xgb/2/xgb_1_fold_"+str(fold)+".pkl", "rb"))
        train_xgb2[test_index] = model.predict(train[test_index])
        test_xgb2 += model.predict(test)/N_FOLDS

    train_meta = np.column_stack((train_meta, train_xgb2))
    test_meta = np.column_stack((test_meta, test_xgb2))

    
    print "Training knn..."
    if "knn" not in load_list:
        train, test, target, test_index = io.load_data()
        train, test, target = fe.preprocess_data(train, test, target, preprocess_type=4)
    
        N_FOLDS = 10
        cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234)
        train_knn = np.zeros(train.shape[0])
        test_knn = np.zeros(test.shape[0])
        for fold, (train_index, test_index) in enumerate(cv):
            model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=128, metric="minkowski", weights="distance", n_jobs=-1)
            model.fit(train[train_index], target[train_index])
            train_knn[test_index] = model.predict_proba(train[test_index])[:, 1]
        model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=128, metric="minkowski", weights="distance", n_jobs=-1)
        model.fit(train, target)
        test_knn = model.predict_proba(test)[:, 1]
        train_knn.dump("level1/knn/train_knn")
        test_knn.dump("level1/knn/test_knn")
    else:
        train_knn = np.load("level1/knn/train_knn")
        test_knn = np.load("level1/knn/test_knn")
     
    train_meta = np.column_stack((train_meta, train_knn))
    test_meta = np.column_stack((test_meta, test_knn))
    
    print "Training ext1..."
    if "ext1" not in load_list:
        train, test, target, test_index = io.load_data()
        train, test, target = fe.preprocess_data(train, test, target, preprocess_type=3)
    
        N_FOLDS = 10
        cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234)
        train_ext1 = np.zeros(train.shape[0])
        test_ext1 = np.zeros(test.shape[0])
        for fold, (train_index, test_index) in enumerate(cv):
            model = sklearn.ensemble.ExtraTreesClassifier(n_estimators=1000, max_features=50, criterion='entropy',
                                                          min_samples_split=4, max_depth=35, min_samples_leaf=2,
                                                          n_jobs=-1, random_state=1234)
            model.fit(train[train_index], target[train_index])
            train_ext1[test_index] = model.predict_proba(train[test_index])[:, 1]
            test_ext1 += model.predict_proba(test)[:, 1]/N_FOLDS
        train_ext1.dump("level1/ext/1/train_ext")
        test_ext1.dump("level1/ext/1/test_ext")
    else:
        train_ext1 = np.load("level1/ext/1/train_ext")
        test_ext1 = np.load("level1/ext/1/test_ext")

    train_meta = np.column_stack((train_meta, train_ext1))
    test_meta = np.column_stack((test_meta, test_ext1))

    
    print "Training ext2..."
    if "ext2" not in load_list:
        train, test, target, test_index = io.load_data()
        train, test, target = fe.preprocess_data(train, test, target, preprocess_type=5)
    
        N_FOLDS = 10
        cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234)
        train_ext2 = np.zeros(train.shape[0])
        test_ext2 = np.zeros(test.shape[0])
        for fold, (train_index, test_index) in enumerate(cv):
            model = sklearn.ensemble.ExtraTreesRegressor(n_estimators=1000, max_features=50, min_samples_split=4,
                                                         max_depth=35, min_samples_leaf=2, n_jobs=-1, random_state=1234)
            model.fit(train[train_index], target[train_index])
            train_ext2[test_index] = model.predict(train[test_index])
            test_ext2 += model.predict(test)/N_FOLDS
        train_ext2.dump("level1/ext/2/train_ext")
        test_ext2.dump("level1/ext/2/test_ext")
    else:
        train_ext2 = np.load("level1/ext/2/train_ext")
        test_ext2 = np.load("level1/ext/2/test_ext")
    
    train_meta = np.column_stack((train_meta, train_ext2))    
    test_meta = np.column_stack((test_meta, test_ext2))
    
    
    print "Training rf1..."
    if "rf1" not in load_list:
        train, test, target, test_index = io.load_data()
        train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1)
    
        N_FOLDS = 10
        cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234)
        train_rf1 = np.zeros(train.shape[0])
        test_rf1 = np.zeros(test.shape[0])
        for fold, (train_index, test_index) in enumerate(cv):
            model = sklearn.ensemble.RandomForestClassifier(n_estimators=500, criterion="entropy", max_depth=32, min_samples_leaf=4, n_jobs=-1, random_state=1234)
            model.fit(train[train_index], target[train_index])
            train_rf1[test_index] = model.predict_proba(train[test_index])[:, 1]
            test_rf1 += model.predict_proba(test)[:, 1]/N_FOLDS
        train_rf1.dump("level1/rf/1/train_rf")
        test_rf1.dump("level1/rf/1/test_rf")
    else:
        train_rf1 = np.load("level1/rf/1/train_rf")
        test_rf1 = np.load("level1/rf/1/test_rf")

    train_meta = np.column_stack((train_meta, train_rf1))
    test_meta = np.column_stack((test_meta, test_rf1))

    
    print "Training rf2..."
    if "rf2" not in load_list:
        train, test, target, test_index = io.load_data()
        train, test, target = fe.preprocess_data(train, test, target, preprocess_type=1)
    
        N_FOLDS = 10
        cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234)
        train_rf2 = np.zeros(train.shape[0])
        test_rf2 = np.zeros(test.shape[0])
        for fold, (train_index, test_index) in enumerate(cv):
            model = sklearn.ensemble.RandomForestRegressor(n_estimators=500, max_depth=32, min_samples_leaf=4, n_jobs=-1, random_state=1234)
            model.fit(train[train_index], target[train_index])
            train_rf2[test_index] = model.predict(train[test_index])
            test_rf2 += model.predict(test)/N_FOLDS
        train_rf2.dump("level1/rf/2/train_rf")
        test_rf2.dump("level1/rf/2/test_rf")
    else:
        train_rf2 = np.load("level1/rf/2/train_rf")
        test_rf2 = np.load("level1/rf/2/test_rf")    

    train_meta = np.column_stack((train_meta, train_rf2))
    test_meta = np.column_stack((test_meta, test_rf2))
    
    
    return train_meta, test_meta
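
Every block in train_level1 repeats the same out-of-fold recipe: fit on nine folds, predict the held-out fold to fill the train-side meta-feature, and fold-average the test-set predictions. A condensed sketch of that pattern (oof_feature and model_factory are placeholder names, not part of the project):

import numpy as np

def oof_feature(model_factory, train, target, test, cv, n_folds):
    train_meta = np.zeros(train.shape[0])
    test_meta = np.zeros(test.shape[0])
    for train_index, val_index in cv:
        model = model_factory()
        model.fit(train[train_index], target[train_index])
        # Held-out predictions keep the training labels from leaking into level 2.
        train_meta[val_index] = model.predict_proba(train[val_index])[:, 1]
        test_meta += model.predict_proba(test)[:, 1] / n_folds
    return train_meta, test_meta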
Example #8
def preprocess_data(train, test, target, preprocess_type=1):
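    # Each preprocess_type is a feature view tailored to a particular model;
    # the level-1 models in Example #7 use types 1, 3, 4 and 5.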
    if preprocess_type == 1:
        train = add_features(train)
        test = add_features(test)

        train, test = label_cats(train, test, nan_value=-999)

        train = add_features(train)
        test = add_features(test)
        train = drop_features(train)
        test = drop_features(test)

        train, test = input_missing(train, test, input_type="-999")

    elif preprocess_type == 2:
        train, test = label_cats(train, test, nan_value=10000)
        train_cat, test_cat = do_one_hot(train, test, drop_cat=True)

        train = add_features(train)
        test = add_features(test)
        train = drop_features(train)
        test = drop_features(test)

        train, test = input_missing(train, test, input_type="mean")

        train = scipy.sparse.hstack((train, train_cat), format="csr")
        test = scipy.sparse.hstack((test, test_cat), format="csr")
    
    elif preprocess_type == 3:
        train = add_features(train)
        test = add_features(test)
        train, test = label_cats(train, test, nan_value=-999)

        train, test = MungeData(train, target, test)

        train = drop_features(train)
        test = drop_features(test)

        train, test = input_missing(train, test, input_type="-999")      
        
    elif preprocess_type == 4:
        train = add_features(train)
        test = add_features(test)

        train, test = MungeData(train, target, test)

        train = drop_features(train)
        test = drop_features(test)

        train, test = input_missing(train, test, input_type="-1") 

        scaler = sklearn.preprocessing.StandardScaler()
        train = scaler.fit_transform(train)
        test = scaler.transform(test)
    
    elif preprocess_type == 5:

        train, test, target, test_index = io.load_data(drop="ext")        
   
        train, test = label_cats(train, test, nan_value=-999)

        # Expand the first four characters of the categorical v22 into ordinal
        # features, left-padding short values with '@'.
        pad4 = lambda x: '@'*(4 - len(str(x))) + str(x)
        for k in range(4):
            train['v22-%d' % (k + 1)] = train['v22'].fillna('@@@@').apply(pad4).apply(lambda x, k=k: ord(x[k]))
            test['v22-%d' % (k + 1)] = test['v22'].fillna('@@@@').apply(pad4).apply(lambda x, k=k: ord(x[k]))

        train = add_features(train)
        test = add_features(test)
        train = drop_features(train)
        test = drop_features(test)

        train, test = input_missing(train, test, input_type="-999")        

        rnd = 12
        n_ft = 20
        max_elts = 3

        a = addNearestNeighbourLinearFeatures(n_neighbours=n_ft, max_elts=max_elts, verbose=True, random_state=rnd)
        a.fit(train, target)

        train = a.transform(train)
        test = a.transform(test)

    elif preprocess_type == 6:
        train = add_features(train)
        test = add_features(test)

        train, test = label_cats(train, test, nan_value=-999)

        train = add_features(train)
        test = add_features(test)
        train = drop_features(train)
        test = drop_features(test)

        train.drop(["v50"], axis=1, inplace=True)
        test.drop(["v50"], axis=1, inplace=True)
        
        train, test = input_missing(train, test, input_type="-999")

    if type(train) is pd.DataFrame:
        train = train.values
        test = test.values

    return train, test, target
Example #9
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

import input_output
import models
import preprocess
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split  # used below; lives in sklearn.cross_validation on older releases

# Define constants
TEST_SIZE_SAMPLE = 0.0001
RANDOM_STATE_SPLIT = 38
NUMBER_OF_ENTRIES = 300000

rawData = input_output.load_data("train.csv")
processedData = preprocess.prepare_data(rawData)

# Separate features and output + scale data
# pandas has some weird column counting
scaler = StandardScaler()
multiBinarizer = MultiLabelBinarizer()

trainData = scaler.fit_transform(processedData[:, 0:2])

# Convert output to binarized array
numbers = np.reshape((processedData[:, 2]), (len(processedData[:, 2]), 1))
predOutput = multiBinarizer.fit_transform(numbers)

X_train, X_test, y_train, y_test = train_test_split(trainData, predOutput, test_size=TEST_SIZE_SAMPLE,
                                                    random_state=RANDOM_STATE_SPLIT)
Example #10
File: tuning.py Project: DeathCoil/kaggle
def ensemble_tuning(train, test, target, load_list=[]):
    N_FOLDS = 10
    seed_list = [1234, 2345, 6789, 7890]
    mcw_list = [1, 1, 8, 8]
    
    train, test, target, test_index = io.load_data()
    X1, _, target = fe.preprocess_data(train, test, target, preprocess_type=1)
    train_meta, test_meta = main.train_level1(load_list=["xgb1", "xgb2", "knn", "ext1", "ext2", "rf1", "rf2"])
    X1 = np.column_stack((X1, train_meta))

    # Build the folds from the reloaded target, not the stale argument;
    # scores[fold][a] will hold the fold's log loss for blend weight a/100.
    cv = sklearn.cross_validation.StratifiedKFold(target, n_folds=N_FOLDS, shuffle=True, random_state=1234)
    scores = np.zeros((N_FOLDS, 101), dtype=np.float32)
    
    for fold, (train_index, test_index) in enumerate(cv):

        print("Training model1...")
        pred1 = np.zeros(len(X1[test_index]))
                
        for i in range(4):
            if "xgb1" not in load_list:
                model1 = xgb.XGBClassifier(n_estimators=550, learning_rate=0.01, max_depth=6, colsample_bytree=0.95,
                                       subsample=1, min_child_weight=mcw_list[i], seed=seed_list[i])
                model1.fit(X1[train_index], target[train_index])
                pickle.dump(model1, open("cv/xgb/1/xgb_1_fold_"+str(fold)+"_n_"+str(i)+".pkl", "wb"))
            else:
                model1 = pickle.load(open("cv/xgb/1/xgb_1_fold_"+str(fold)+"_n_"+str(i)+".pkl", "rb"))
            pred1 += model1.predict_proba(X1[test_index])[:, 1]
        pred1 /= len(seed_list)

        #print("Training model2...")
        #pred2 = np.zeros(len(X1[test_index]))
        
        """
        for i in range(6):
            if "xgb2" not in load_list:
                model2 = xgb.XGBRegressor(n_estimators=550, learning_rate=0.01, max_depth=6, colsample_bytree=0.95,
                                          subsample=1, min_child_weight=mcw_list[i], seed=seed_list[i])
                model2.fit(X1[train_index], target[train_index])
                pickle.dump(model2, open("cv/xgb/2/xgb_1_fold_"+str(fold)+"_n_"+str(i)+".pkl", "wb"))
            else:
                model2 = pickle.load(open("cv/xgb/2/xgb_1_fold_"+str(fold)+"_n_"+str(i)+".pkl", "rb"))
            pred2 += model2.predict(X1[test_index])
        pred2 /= len(seed_list)
        
        pred2[pred2 >= 0.99] = 0.99
        pred2[pred2 <= 0.01] = 0.01
        """
        
        print("Training model3...")
        if "ext1" not in load_list:
            model3 = sklearn.ensemble.ExtraTreesClassifier(n_estimators=1000,max_features=50,criterion='entropy',min_samples_split=4,
                                                           max_depth=35, min_samples_leaf=2, n_jobs =-1, random_state=1234)
            model3.fit(X1[train_index], target[train_index])
            pred3 = model3.predict_proba(X1[test_index])[:, 1]
            pred3.dump("cv/rf/pred_fold_"+str(fold))
        else:
            pred3 = np.load("cv/rf/pred_fold_"+str(fold))
        
        print("Calculating scores...")
        for alpha in np.ndindex(101):
            w = 0.01*alpha[0]
            # np.max(x, 0) would treat the 0 as an axis argument; use the
            # builtin max() to clamp the second weight at zero.
            scores[fold][alpha] = sklearn.metrics.log_loss(target[test_index], w*pred1 + max(1 - w, 0)*pred3)
        print("Current fold:", np.min(scores[fold]), np.unravel_index(scores[fold].argmin(), scores[fold].shape), scores[fold][100], scores[fold][0])
        sc1 = np.mean(scores, axis=0) * 1.0 / (fold+1) * N_FOLDS
        print("Accumulated:", np.min(sc1), np.unravel_index(sc1.argmin(), sc1.shape), sc1[100], sc1[0])

    scores1 = np.mean(scores, axis=0)
    print(np.min(scores1), np.unravel_index(scores1.argmin(), scores1.shape), scores1[100], scores1[0])

    return scores
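
For reference, each printed tuple reads: best mean log loss, index of the best blend weight in hundredths, the loss at weight 1.0 (pred1 alone), and the loss at weight 0.0 (pred3 alone).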
Example #11
File: main.py Project: DeathCoil/kaggle
    X_test = scaler.transform(X_test)


    X_train = scipy.sparse.hstack((text_train_tfidf, X_train), format="csr")
    X_test = scipy.sparse.hstack((text_test_tfidf, X_test), format="csr")

    model = sklearn.linear_model.LogisticRegression(C=0.7, penalty="l2")

    result = make_predictions(model, X_train, target, X_test)
    io.save_result(test["PostId"], result)

    return result

def make_dirs(dir_names):
    for name in dir_names:
        if not os.path.exists(name):
            os.makedirs(name)

dir_names = ["input", "output", "w2v", "metafeatures"]
make_dirs(dir_names)

train, target, test = io.load_data()
text_train_tfidf, text_test_tfidf = get_tfidf(train, test)
preds1 = rf_model(train, target, test, text_train_tfidf,
                  text_test_tfidf)
preds2 = linear_model(train, target, test, text_train_tfidf,
                      text_test_tfidf)

result = 0.7*preds1 + 0.3*preds2
io.save_result(test["PostId"], result)