Example No. 1
import numpy as np
from utils import read_variable


def get_votes(rows_range):
    """Load the first-level vote matrix and labels for the given row ids."""
    votes = np.zeros([len(rows_range), 104])
    Y = np.zeros(len(rows_range))
    for i, row_id in enumerate(rows_range):
        votes[i, :] = read_variable('final/tr_votes_1L/' + str(row_id))
        Y[i] = read_variable('data/train_y_rows/' + str(row_id) + '.pkl')
    return votes, Y
Example No. 2
import numpy as np
import progressbar
from utils import read_variable


def get_votes_large(rows_range):
    """Same as get_votes, but with a progress bar for large row ranges."""
    votes = np.zeros([len(rows_range), 104])
    Y = np.zeros(len(rows_range))
    bar = progressbar.ProgressBar()
    for i, row_id in enumerate(bar(rows_range)):
        votes[i, :] = read_variable('final/tr_votes_1L/' + str(row_id))
        Y[i] = read_variable('data/train_y_rows/' + str(row_id) + '.pkl')
    return votes, Y
def load_tr_XY(group_id):
    """Build a shuffled training set: all 1s plus one group of 0s."""
    gp_0s_idx = tr_0s_groups[group_id]

    gp_idx = np.concatenate([tr_1s_idx, gp_0s_idx])
    gp_idx = np.random.permutation(gp_idx)

    gp_X = np.zeros([len(gp_idx), col_numeric_nu])
    gp_Y = np.zeros(len(gp_idx))
    bar = progressbar.ProgressBar()
    for i, row_id in enumerate(bar(gp_idx)):
        gp_X[i, :] = read_variable('data/train_numeric_rows/' + str(row_id) +
                                   '.pkl')
        gp_Y[i] = read_variable('data/train_y_rows/' + str(row_id) + '.pkl')

    return gp_X, gp_Y
Example No. 4
import os

import matplotlib.pyplot as plt
import progressbar
from utils import read_variable

model_folders = [
    'final/1L_tree', 'final/1L_ada', 'final/1L_gb', 'final/1L_mlp'
]

for model_folder in model_folders:
    print('Processing model cluster:', model_folder)

    model_ids = []
    model_stds = []
    model_means = []

    for root, dirs, files in os.walk(model_folder):
        bar = progressbar.ProgressBar()
        for model_idx in bar(files):
            model_path = os.path.join(root, model_idx)
            model = read_variable(model_path)
            #print(model_idx,'std:',model.val_std,',mean:',model.val_mean)
            model_stds.append(model.val_std)
            model_means.append(model.val_mean)
            model_ids.append(model_idx)

    #%
    plt.figure()
    plt.errorbar(model_ids, model_means, yerr=model_stds, fmt='o')
    plt.title(model_folder)

#%%
model_folders = [
    'final/1L_tree', 'final/1L_ada', 'final/1L_gb', 'final/1L_mlp'
]
Example No. 5
import utils
import pandas as pd
import numpy as np
import progressbar

max_chunk_size = 1000

train_rows_nu = 1183747
chunk_nu = 1184

#%
column_names = utils.read_variable('outputs/column_names.pkl')
cols_date = column_names['date']
#%%
"""
process date data
"""

from sklearn.neighbors import KernelDensity
import os.path

responses = utils.read_variable('model_stats/responses.pkl').astype(int)

while True:
    try:
        for col_name in cols_date:
            print('processing date col:', col_name)

            file_path = 'model_stats/date/' + col_name + '.pkl'
            if os.path.exists(file_path):
                print('already exists.')
Example No. 6
import numpy as np
import pandas as pd
import utils
from sklearn.neural_network import MLPRegressor

#%%

chunk_i = 45
tr_chunk = utils.read_variable('output/tr_chunks/' + str(chunk_i))

test_chunk = utils.read_variable('output/test_data')

tr_Y = tr_chunk['y']

#%%
'''
result: training R scores stay around or below 0 regardless of adjusting the
following configs:
    1. alpha
    2. hidden_layer_sizes
    3. activation
    4. tol
    5. solver
'''
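# A minimal sketch of the kind of sweep behind the note above (the grid
# values are illustrative assumptions, not the exact configs that were
# tried); preprocessing mirrors the cells below:
#
#     from sklearn.preprocessing import Imputer, StandardScaler
#     from sklearn.neural_network import MLPRegressor
#
#     X_imp = Imputer(strategy='median').fit_transform(
#         tr_chunk.drop(['id', 'y'], 1).as_matrix())
#     X_nrm = StandardScaler().fit_transform(X_imp)
#     for alpha in [1e-5, 1e-3, 1e-1]:
#         for hidden in [(10,), (50, 10)]:
#             mlp = MLPRegressor(alpha=alpha, hidden_layer_sizes=hidden)
#             mlp.fit(X_nrm, tr_Y.values)
#             print(alpha, hidden, 'tr R^2:', mlp.score(X_nrm, tr_Y.values))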

from sklearn.preprocessing import Imputer
from sklearn import preprocessing
import time
from sklearn.model_selection import KFold

Y = tr_chunk['y'].values
X = tr_chunk.drop(['id', 'y'], 1).as_matrix()
Example No. 7
import utils
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import time
import sklearn.metrics as mx
#%%
people_act_train = utils.read_variable('outputs/people_act_train')
column_names = utils.read_variable('outputs/column_names')
people_act_train_category_dummys = utils.read_variable(
    'outputs/people_act_train_category_dummys')
pc_number = 70
pca = utils.read_variable('outputs/people_act_train_category_pca_' +
                          str(pc_number) + 'pc')
selected_col_ids = utils.read_variable(
    'outputs/people_act_train_category_selected_col_ids_pvalue_0.05')

from scipy.sparse import csr_matrix

# transforming all the data in one process leads to a memory leak, so do it in batches

length = people_act_train_category_dummys.shape[0]

#%%########################
#% Use two IPython consoles and run the PCA asc and desc halves separately,
#  as a poor man's multi-threading solution
##########################
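# A sketch of the batched-transform idea (the batch size is a hypothetical
# value; assumes the usual scikit-learn pca.transform() API):
#
#     from scipy.sparse import vstack
#     batch = 50000
#     parts = []
#     for start in range(0, length, batch):
#         block = people_act_train_category_dummys[start:start + batch, :]
#         parts.append(csr_matrix(pca.transform(block.toarray())))
#     people_act_train_category_pcs = vstack(parts)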

#%% DSC####################
##########################
# transforming all the data in one process leads to a memory leak, so do it in batches
people_act_train_category_pcs_dsc = csr_matrix(
Example No. 8
import utils
import pandas as pd
import numpy as np
import progressbar
from collections import defaultdict
max_chunk_size = 1000

train_rows_nu = 1183747
chunk_nu = 10  #1184

#%
column_names = utils.read_variable('outputs/column_names.pkl')
cols_numeric = column_names['numeric']

#%%
file_name = 'data/train_numeric.csv'
cols = cols_numeric
nan_counts = np.zeros(cols.size)
col_nan_counts = defaultdict(int)

col_index = 0
bar = progressbar.ProgressBar()
for col_name in bar(cols_numeric):
    #print('processing (',file_name,') col:',col_name)

    chunks = pd.read_csv(file_name,
                         usecols=[col_name],
                         chunksize=max_chunk_size,
                         low_memory=False,
                         iterator=True)
Example No. 9
import utils

import numpy as np
import time
import pandas as pd

#%%

print('loading col_stats_cate...')
col_stats_cate = utils.read_variable('model_stats/col_stats_cate.pkl')
print('loading col_stats_date...')
col_stats_date = utils.read_variable('model_stats/col_stats_date.pkl')
print('loading col_stats_num...')
col_stats_num = utils.read_variable('model_stats/col_stats_num.pkl')
#%% 
'''
calculate the probability of response 0 for a categorical column
'''


def cal_0_proba_by_cate(col_name, value):
    stat_0 = col_stats_cate[col_name][0]
    stat_1 = col_stats_cate[col_name][1]
    response0_proba = 0
    if value != value:
        # NaN is the only value that does not equal itself
        response0_proba = stat_0['nan'] / (stat_0['nan'] + stat_1['nan'])
    elif value in stat_0 and value not in stat_1:
        response0_proba = 1
    elif value not in stat_0 and value in stat_1:
        response0_proba = 0
    elif value in stat_0 and value in stat_1:
        response0_proba = stat_0[value] / (stat_0[value] + stat_1[value])
    return response0_proba
Example No. 10
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
import utils
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
#%%
kbest_i = [0, 88, 91, 99, 100]
for chunk_i in range(100):
    print(chunk_i, end='-->')
    chunk = utils.read_variable('output/tr_chunks/' + str(chunk_i))
    X = chunk.drop(['id', 'y'], 1).as_matrix()
    X = X[:, kbest_i]
    Y = chunk['y'].values

    X_imputed = Imputer(missing_values='NaN', strategy='median',
                        axis=0).fit_transform(X)
    X_norm = preprocessing.StandardScaler().fit_transform(X_imputed)

    kf_i = 0
    skf = KFold(n_splits=5, shuffle=True, random_state=13)
    for tr_idx, val_idx in skf.split(X_norm):
        print('kf', kf_i, end='-->')
        kf_i += 1
        tr_X, tr_Y = X_norm[tr_idx, :], Y[tr_idx]
        val_X, val_Y = X_norm[val_idx, :], Y[val_idx]

        model = LinearRegression()
        model.fit(tr_X, tr_Y)
        tr_Y_pred = model.predict(tr_X)
        tr_r = utils.cal_r(tr_Y, tr_Y_pred)
Example No. 11
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import utils
import time

#%
column_names = utils.read_variable('outputs/column_names')
people_act_train = utils.read_variable('outputs/people_act_train')

#% convert category columns to integers
#-----------------------------------
people_act_train_category2int = pd.DataFrame()
for name in column_names['category']:
    people_act_train_category2int[name] = people_act_train[name].str.replace(
        r'(type|group)\s', '')

people_act_train_category2int = people_act_train_category2int.fillna(value=0)
del name
#% convert integers to one-hot codes
one_hot_encoder = OneHotEncoder(n_values='auto', sparse=True)
one_hot_encoder.fit(people_act_train_category2int)

#% this variable is not shown in the variable explorer, and calling
#  toarray() on it leads to a MemoryError
people_act_train_category_dummys = one_hot_encoder.transform(
    people_act_train_category2int)
print(people_act_train_category_dummys.shape)
print(type(people_act_train_category_dummys))

del people_act_train_category2int
Example No. 12
import utils
import progressbar



column_names = utils.read_variable('outputs/column_names.pkl')
cols_categorical = column_names['categorical']
cols_date = column_names['date']
cols_numeric = column_names['numeric']

col_len = cols_categorical.size + cols_date.size + cols_numeric.size

del column_names
#%%
'''
load the per-column statistics used to compute the probability of
response 0: categorical columns
'''
col_stats_cate = {}
print('importing col statistics:', 'categorical columns')
bar = progressbar.ProgressBar()
for col_name in bar(cols_categorical):
    col_stats_cate[col_name] = utils.read_variable('model_stats/cate/' +
                                                   col_name + '.pkl')

utils.save_variable(col_stats_cate, 'model_stats/col_stats_cate.pkl')

'''
load the per-column statistics used to compute the probability of
response 0: date columns
'''
col_stats_date = {}
print('importing col statistics:', 'date columns')
Example No. 13
import matplotlib.pyplot as plt
import utils
import pandas as pd

#%%
column_names = utils.read_variable('column_names')
#%%
#a_row = people[2:3]
#a_col = people['people_id']
#people_unique_ids= people.people_id.unique()
#a_cell = people[2:3]['people_id']

#%% join people and act tables
# right join: use act_train keys only
people_act_train = pd.merge(people,
                            act_train,
                            how='right',
                            on='people_id',
                            suffixes=('_p', '_a'))
people_act_test = pd.merge(people,
                           act_test,
                           how='right',
                           on='people_id',
                           suffixes=('_p', '_a'))

#%%
people_act_activity_category_type1 = people_act_train.loc[(
    people_act_train['activity_category'] == 'type 1')][0:1]
people_act_char_10_a_notnull = people_act_train.loc[
    people_act_train['char_10_a'].notnull()][0:1]
x_cate_num = np.zeros(
    (x_cate_num_len, col_cate_nu + col_numeric_nu + col_date_nu))
y_cate_num = np.zeros(x_cate_num_len)
bar = progressbar.ProgressBar()
print('loading cate proba and raw num, and date...')

chunks_num = pd.read_csv('data/train_numeric.csv',
                         chunksize=max_chunk_size,
                         low_memory=False,
                         iterator=True)
chunks_date = pd.read_csv('data/train_date.csv',
                          chunksize=max_chunk_size,
                          low_memory=False,
                          iterator=True)
for chunk_id in bar(range(0, chunk_nu, 1)):

    chunk_cate = utils.read_variable('model_stats/train_cate_proba/' +
                                     str(chunk_id) + '.pkl')
    chunk_num = chunks_num.get_chunk()
    chunk_date = chunks_date.get_chunk()
    row_range = range(chunk_id * max_chunk_size,
                      chunk_id * max_chunk_size + chunk_cate.shape[0], 1)
    x_cate_num[row_range, :col_cate_nu] = chunk_cate
    x_cate_num[row_range, col_cate_nu:col_cate_nu +
               col_numeric_nu] = chunk_num.drop(['Response'], axis=1)
    x_cate_num[row_range, col_cate_nu + col_numeric_nu:] = chunk_date
    y_cate_num[row_range] = chunk_num['Response']

del chunk_id, bar, chunk_num, chunk_cate, row_range

#%%
'''
remove low density col
'''
#%%

# used to find the best SGD model during the training
sgd_val_chunk_nu = 10
sgd_val_chunk_ids = range(0, sgd_val_chunk_nu, 1)
sgd_val_nu = sgd_val_chunk_nu * max_chunk_size
# NaN-initialise so unfilled rows stay detectable; this needs a float dtype,
# since assigning NaN to an int array raises a ValueError
sgd_val_y = np.full(sgd_val_nu, np.nan)
sgd_val_x = np.full((sgd_val_nu, col_num_date + col_cate), np.nan)

print('loading model_val dataset.', 'chunk [', sgd_val_chunk_ids, ')')
i = 0
for chunk_id in sgd_val_chunk_ids:
    chunk = utils.read_variable(
        'chunk_tree_votes/models/train_y_votes_prob/chunk_' + str(chunk_id) +
        '.pkl')
    row_range = range(i * max_chunk_size, i * max_chunk_size + chunk.shape[0],
                      1)
    sgd_val_y[row_range] = chunk['Response']
    # num and date
    votes_date_nu = chunk.drop(['Response'], axis=1)
    sgd_val_x[row_range, 0:col_num_date] = votes_date_nu
    # cate
    chunk = utils.read_variable('model_stats/train_cate_proba/' +
                                str(chunk_id) + '.pkl')
    sgd_val_x[row_range, col_num_date:] = chunk

    i += 1

del i, chunk_id, row_range, chunk, votes_date_nu
#%%
print('-----------------init first Model_2nd----------------')
SUPER_start_timestamp = 10
SUPER_window_size = SUPER_start_timestamp
SUPER_kf_k = 5
SUPER_seed = 13
SUPER_alpha = 1e-8
skf = KFold(n_splits=SUPER_kf_k, shuffle=True, random_state=SUPER_seed)
'''
load lag_models (FIFO)
'''
lag_models = []
for lag_model_timestamp in range(SUPER_start_timestamp - SUPER_window_size,
                                 SUPER_start_timestamp, 1):
    lag_model = utils.read_variable(
        'E:/two-sigma/output/timeseries/model_1L/' + str(lag_model_timestamp))
    lag_models.append(lag_model)
'''
Init Model_2L
'''
timestamp = SUPER_start_timestamp

print('processing', timestamp)

tr_chunk = utils.read_variable('E:/two-sigma/output/timeseries/tr_chunks/' +
                               str(timestamp))
'''
CV
'''
kf_i = 0
best_model_2L_r = 0
from utils import read_variable

all_chunk_idx = range(1184)
tr_chunk_idx = read_variable('final/tr_chunk_idx')
val_chunk_idx = read_variable('final/val_chunk_idx')

#%%
'''
check consistency between chunk indices
'''
for chunk_id in all_chunk_idx:
    chunk_y = read_variable('data/train_y_chunks/' + str(chunk_id) + '.pkl')
    chunk_num = read_variable('data/train_numeric_chunks/' + str(chunk_id) +
                              '.pkl')
    chunk_date = read_variable('data/train_date_chunks/' + str(chunk_id) +
                               '.pkl')
    chunk_cate = read_variable('data/train_categorical_chunks/' +
                               str(chunk_id) + '.pkl')

    print(chunk_id, end=',')
    diff = chunk_num.index.difference(chunk_date.index)
    if diff.size != 0:
        print('X', end=',')
    else:
        print('v', end=',')
    diff = chunk_num.index.difference(chunk_cate.index)
    if diff.size != 0:
        print('X', end=',')
    else:
        print('v', end=',')
    diff = chunk_num.index.difference(chunk_y.index)
Example No. 18
import numpy as np
import progressbar
import utils
from sklearn.metrics import matthews_corrcoef

#%%

print('reading training num and date col votes matrix...')
# neither a DataFrame nor two separate numpy arrays (votes + Response) fit
# in memory, so pack Response into column 0 of a single votes matrix
votes = np.zeros((1183748, 1185))

max_chunk_size = 1000

#responses = np.zeros((1183748, 1))
bar = progressbar.ProgressBar()
for chunk_id in bar(range(0, 1184, 1)):
    chunk = utils.read_variable(
        'chunk_tree_votes/models/train_y_votes_prob/chunk_' + str(chunk_id) +
        '.pkl')
    votes[chunk_id * max_chunk_size:chunk_id * max_chunk_size + chunk.shape[0],
          1:] = chunk.drop(['Response'], axis=1)
    votes[chunk_id * max_chunk_size:chunk_id * max_chunk_size + chunk.shape[0],
          0] = chunk['Response']
del chunk_id, bar, chunk

#%

x_tr = votes[:900000, 1:]
y_tr = votes[:900000, 0]

# used to find best SGD in training
x_val = votes[900000:1000000, 1:]
y_val = votes[900000:1000000, 0]
Example No. 19
test_ratio = 0.01
for name, group in data_gp:
    print('T' + str(name), end=',')
    gp_idx = set(data_gp.groups[name])
    idx_pool = gp_idx.intersection(remained_sample_ids)
    sel_ids_len = int(len(gp_idx) * test_ratio)
    sel_ids = set(random.sample(idx_pool, sel_ids_len))
    print('sel', len(sel_ids), 'samples from', len(gp_idx))
    sel_sample_ids = sel_sample_ids | sel_ids
    remained_sample_ids = remained_sample_ids - sel_ids

utils.save_variable(sel_sample_ids, 'output/test_ids')
print('testing samples:', len(sel_sample_ids))
del sel_sample_ids
#%%
test_ids = list(utils.read_variable('output/test_ids'))
'''
check whether the testing dataset is well distributed among the different
obj ids
'''
data_gp = data.groupby('id')
overall_stats_by_id = {}
for name, group in data_gp:
    print(name, 'has', group.shape[0], 'samples')
    overall_stats_by_id[name] = group.shape[0]

test_data = data.ix[test_ids]
data_gp = test_data.groupby('id')
test_ratio_per_obj_ids = []
for name, group in data_gp:
    print(name, 'has', group.shape[0], 'testing samples out of',
          overall_stats_by_id[name])
    ratio = group.shape[0] / overall_stats_by_id[name]
Example No. 20
#ids = range(400000)
#ids_shuffled = np.random.permutation(ids)
#tr_val_ids = []
#bar = progressbar.ProgressBar()
#for i in bar(ids_shuffled):
#    if i in all_test_ids:
#        test_rows_range.append(i)
#    else:
#        tr_val_ids.append(i)

#%% Read the whole training dataset
all_test_ids = []
tr_val_ids = []
bar = progressbar.ProgressBar()
for gp_idx in bar(range(119)):
    row_group = read_variable('final/row_groups/' + str(gp_idx))
    test_row_ids = row_group['test']
    all_test_ids.extend(test_row_ids)
    tr_row_ids = row_group['train']
    tr_val_ids.extend(tr_row_ids)

#%
tr_rows_range = []
val_rows_range = []
test_rows_range = all_test_ids

ids = range(len(tr_val_ids))
ids_shuffled = np.random.permutation(ids)

#%
tr_rows_range = tr_val_ids[:int(len(tr_val_ids) * 0.9)]
import numpy as np

from utils import read_variable

test_Y = np.zeros(1183748)
row_start = 0
row_end = 0
for chunk_id in range(1184):

    path = 'final/test_votes_1L/' + str(chunk_id)
    votes = read_variable(path)

    chunk_Y_pred = (np.sum(votes, axis=1) >= 1).astype(np.int)
    print(chunk_id, '1s:', sum(chunk_Y_pred))

    row_end = row_start + len(chunk_Y_pred)

    test_Y[row_start:row_end] = chunk_Y_pred
    row_start = row_end

print('FINAL 1s:', sum(test_Y))
#%%
import pandas as pd
# saving to CSV
test_ids = read_variable('outputs/test_ids.pkl')
test_y_ids = pd.DataFrame(test_ids, columns=['Id'])
test_y_y = pd.DataFrame(test_Y, columns=['Response'])
test_y = pd.concat([test_y_ids, test_y_y], axis=1)
test_y = test_y.set_index('Id')
test_y.to_csv('submissions/submission_1130.csv', float_format='%.0f')
Example No. 22
    print(max_depth, '-->', col_range, end='')
    print(',tr:', matthews_corrcoef(Y, y_pred), end='')
    #print('tr 1s:real',sum(Y),',pred',sum(y_pred))
    #utils.save_variable(tree_votes_0,'models/tree_votes_0.pkl')
    print(',val:', end='')
    X, Y = val_X[:, col_range], val_Y
    y_pred = model.predict(X)
    val_mcc = matthews_corrcoef(Y, y_pred)
    best_val_mcc = val_mcc
    print(val_mcc, end='')
    print(',1s:', sum(y_pred), '/', sum(Y))
#%%
from sklearn.tree import DecisionTreeClassifier
class_weight = {0: 1000, 1: 10}

#%%

from progressbar import ProgressBar

col_trees = []
bar = ProgressBar()
for col_set_idx in bar(range(78)):
    model = read_variable('vert/tree_' + str(col_set_idx) + '.pkl')
    col_trees.append(model)
#%%
for col_set_idx, model in enumerate(col_trees):
    col_range = range(10 * col_set_idx, 10 * col_set_idx + 10, 1)

    pred_y = model.predict(val_X[:, col_range])
    print(col_set_idx, matthews_corrcoef(val_Y, pred_y), sum(pred_y))
Example No. 23
import time
from utils import load_training_subset_1110, read_variable, save_variable
import numpy as np
from sklearn.metrics import matthews_corrcoef

#%%
val_X, val_Y = load_training_subset_1110(range(1000, 1010, 1))

tr_X_1s = read_variable('model_stats/tr_pip_data_1s_1110.pkl')

#%%
'''
Model: SGD
'''
from sklearn.linear_model import SGDClassifier

len_1s = tr_X_1s.shape[0]

for set_id in range(0, 166, 1):

    chunk_range = range(set_id, 1000, 166)
    t_X, t_Y = load_training_subset_1110(chunk_range)
    tr_X = np.concatenate([t_X, tr_X_1s])
    tr_Y = np.concatenate([t_Y, np.ones(len_1s)])

    alpha = 1e-4  # default
    # penalty options: 'none', 'l2', 'l1', or 'elasticnet'
    penalty = 'l1'
    model = SGDClassifier(alpha=alpha, shuffle=True, n_jobs=3, penalty=penalty)
    t0 = time.time()
    model = model.fit(tr_X, tr_Y)
Example No. 24
'''
Decision tree learners create biased trees if some classes dominate. It is
therefore recommended to balance the dataset prior to fitting with the
decision tree.
ref: http://scikit-learn.org/stable/modules/tree.html
'''
'''
StratifiedKFold is a variation of k-fold which returns stratified folds:
each set contains approximately the same percentage of samples of each target
class as the complete set.
'''
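# A self-contained sketch of both ideas above, on illustrative random data
# (not the competition chunks): stratified folds preserve the 0/1 ratio in
# every split, and class_weight='balanced' counters the dominant class.
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

X_demo = np.random.rand(1000, 5)
y_demo = (np.random.rand(1000) < 0.05).astype(int)  # ~5% positives
for tr_i, val_i in StratifiedKFold(n_splits=4).split(X_demo, y_demo):
    demo_tree = DecisionTreeClassifier(class_weight='balanced')
    demo_tree.fit(X_demo[tr_i], y_demo[tr_i])
    print('positives in validation fold:', y_demo[val_i].sum())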
import os

for root, dirs, files in os.walk('final/tr_groups'):
    for chunk_idx in files:
        print('chunk', chunk_idx, end='...')
        chunk_path = os.path.join(root, chunk_idx)
        tr_chunk = read_variable(chunk_path)
        model_path = 'final/1L_tree/' + str(chunk_idx)
        if os.path.isfile(model_path):
            print('model exists')
        else:
            save_variable({}, model_path)
            print('processing...')

            chunk_X, chunk_Y = load_pipped_tr_chunk([chunk_idx])

            # based on experiment, k=4 gives the smallest STD
            chunk_model = ForestChunkClassifierWithKFolds(k=4,
                                                          seeds=[13, 11, 193])
            chunk_model.fit(chunk_X, chunk_Y, test_X, test_Y)
            chunk_Y_pred = chunk_model.predict(chunk_X)
            chunk_mcc = matthews_corrcoef(chunk_Y, chunk_Y_pred)
Example No. 25
import progressbar
import numpy as np
from sklearn.metrics import matthews_corrcoef
from utils import read_variable, save_variable
import time

tr_chunk_idx = read_variable('final/tr_chunk_idx')

tr_Y = np.zeros([0])
tr_votes = np.zeros([0, 301])
bar = progressbar.ProgressBar()
for chunk_id in bar(tr_chunk_idx):
    chunk_votes = read_variable('final/tr_votes/' + str(chunk_id) + '.pkl')
    tr_votes = np.concatenate([tr_votes, chunk_votes])
    chunk_Y = read_variable('data/train_y_chunks/' + str(chunk_id) + '.pkl')
    tr_Y = np.concatenate([tr_Y, chunk_Y])

val_chunk_idx = read_variable('final/val_chunk_idx')
val_Y = np.zeros([0])
val_votes = np.zeros([0, 301])
bar = progressbar.ProgressBar()
for chunk_id in bar(val_chunk_idx):
    chunk_votes = read_variable('final/tr_votes/' + str(chunk_id) + '.pkl')
    val_votes = np.concatenate([val_votes, chunk_votes])
    chunk_Y = read_variable('data/train_y_chunks/' + str(chunk_id) + '.pkl')
    val_Y = np.concatenate([val_Y, chunk_Y])

#%%
from sklearn.ensemble import RandomForestClassifier

tr_row_nu = tr_Y.shape[0]
Example No. 26
import progressbar
import numpy as np
from sklearn.metrics import matthews_corrcoef
import time, os
from utils import load_training_subset_1110, read_variable

#%
val_X, val_Y = load_training_subset_1110(range(1000, 1184, 1))
#%%

print('loading trees...')
model_forest = []
bar = progressbar.ProgressBar()
for set_id in bar(range(0, 300, 1)):
    model = read_variable('9/forest_' + str(set_id) + '.pkl')
    model_forest.append(model)

#%%
print('loading logic...')
model_logic = []
bar = progressbar.ProgressBar()
for set_id in bar(range(0, 166, 1)):
    model = read_variable('7/logic_' + str(set_id) + '.pkl')
    model_logic.append(model)

#%%
print('loading boost...')
model_boost = []
bar = progressbar.ProgressBar()
for set_id in bar(range(0, 166, 1)):
    model = read_variable('7/boost_' + str(set_id) + '.pkl')
Example No. 27
#import col_stats_utils
import utils
import pandas as pd
import time
import numpy as np
'''
=======================================================
'''
print('loading col_stats_cate...')
col_stats_cate = utils.read_variable('model_stats/col_stats_cate.pkl')
column_names = utils.read_variable('outputs/column_names.pkl')
cols_cate = column_names['categorical']
#%%
'''
calculate probability of response 0 with categorical col
'''


def cal_0_proba_by_cate(col_name, value):
    stat_0 = col_stats_cate[col_name][0]
    stat_1 = col_stats_cate[col_name][1]
    response0_proba = 0
    if value != value:
        # nan
        response0_proba = stat_0['nan'] / (stat_0['nan'] + stat_1['nan'])
    elif value in stat_0 and value not in stat_1:
        response0_proba = 1
    elif value not in stat_0 and value in stat_1:
        response0_proba = 0
    elif value in stat_0 and value in stat_1:
        response0_proba = stat_0[value] / (stat_0[value] + stat_1[value])
    return response0_proba
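# hypothetical usage (the column name is illustrative, not from the data):
# print(cal_0_proba_by_cate('L0_S0_F0', float('nan')))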
Example No. 28
import utils
from sklearn.ensemble import RandomForestClassifier
import time
#%%
tr = utils.read_variable('outputs/tr')
val = utils.read_variable('outputs/val')
estimator_nu = 2

print('###########################')
print('Char 38, Tree Nu:', estimator_nu)
print('---------------------------')

X_col_exl = ['outcome', 'char_38']

#X = tr.drop(X_col_exl, axis=1)
X = tr['char_38'].values.reshape(-1, 1)
Y = tr['outcome']
# single-feature model on col char_38
model = RandomForestClassifier(n_estimators=estimator_nu, verbose=0, n_jobs=-1)

startTime = time.time()
model = model.fit(X, Y)
print('Training took', int(time.time() - startTime), 'sec')

f = model.predict(X)
print('------TRAINING-------')
(tr_f1, tr_auc, tr_confusion) = utils.validate_prediction(f, Y)

del X, Y, f

#%
Example No. 29
#%%
#val_X,val_Y = utils.load_training_subset(range(1000,1184,1))

#%%
'''
Model: SVC
WARNING: the implementation is based on libsvm. The fit time complexity is
more than quadratic with the number of samples, which makes it hard to scale
to datasets with more than a couple of 10000 samples.
HENCE, the training dataset is split into 10 blocks
'''
from sklearn.svm import SVC
import os

tr_X_1s = utils.read_variable('model_stats/tr_pip_data_1s.pkl')

len_1s = tr_X_1s.shape[0]

for set_id in range(6, 1000, 6):
    # 6 chunks give about the same 0s (tiny amount of 1s is ignored) as the 1s

    file_path = '7/svc_' + str(set_id) + '.pkl'
    if os.path.exists(file_path):
        print('already exists:', file_path)
    else:
        chunk_range = range(set_id - 6, set_id, 1)
        t_X, t_Y = utils.load_training_subset(chunk_range)
        tr_X = np.concatenate([t_X, tr_X_1s])
        tr_Y = np.concatenate([t_Y, np.ones(len_1s)])
        model = SVC(kernel='rbf', C=1)
Example No. 30
import time
from utils import load_pipped_tr_chunks, read_variable, save_variable
import numpy as np
from sklearn.metrics import matthews_corrcoef

#%%
tr_chunk_idx = read_variable('final/tr_chunk_idx')

#%%
'''
Model: Tree
'''
'''
Decision tree learners create biased trees if some classes dominate. It is
therefore recommended to balance the dataset prior to fitting with the
decision tree.
ref: http://scikit-learn.org/stable/modules/tree.html
'''
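# One way to act on the advice above (an assumption about intent, not
# necessarily what this script does further down): let the forest reweight
# the classes itself instead of resampling:
#
#     model = RandomForestClassifier(n_estimators=50,
#                                    class_weight='balanced', n_jobs=-1)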
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
import os

good_model_val_mcc = 0.2

from sklearn.neighbors import KNeighborsClassifier
for model_idx in range(100):

    print(model_idx, end='...')
    file_path = 'final/good_models_onlynum_2/' + str(model_idx)
    if os.path.isfile(file_path):
        print('exists')
    else:
        print('processing...')