Example #1
import pandas as pd

import utils

act_train = pd.read_csv('data/act_train.csv')
act_test = pd.read_csv('data/act_test.csv')
people = pd.read_csv('data/people.csv')

#%%
print(people.columns.values)
print(act_train.columns.values)

#%%
column_names = {}
column_names['category'] = [
    'char_1_p', 'group_1', 'char_2_p', 'char_3_p', 'char_4_p', 'char_5_p',
    'char_6_p', 'char_7_p', 'char_8_p', 'char_9_p', 'activity_category',
    'char_1_a', 'char_2_a', 'char_3_a', 'char_4_a', 'char_5_a', 'char_6_a',
    'char_7_a', 'char_8_a', 'char_9_a', 'char_10_a'
]
column_names['date'] = ['date_p', 'date_a']
column_names['ignore'] = ['people_id', 'activity_id']
column_names['y'] = 'outcome'
column_names['bool'] = [
    'char_10_p', 'char_11', 'char_12', 'char_13', 'char_14', 'char_15',
    'char_16', 'char_17', 'char_18', 'char_19', 'char_20', 'char_21',
    'char_22', 'char_23', 'char_24', 'char_25', 'char_26', 'char_27',
    'char_28', 'char_29', 'char_30', 'char_31', 'char_32', 'char_33',
    'char_34', 'char_35', 'char_36', 'char_37'
]
column_names['nu'] = 'char_38'
utils.save_variable('column_names', column_names)
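#%% utils is the author's helper module and is not shown anywhere in these
# snippets; a minimal pickle-based sketch of what save_variable/read_variable
# might look like (an assumption -- note the snippets are inconsistent about
# the argument order):
import os
import pickle


def save_variable(variable, path):
    # hypothetical helper: pickle `variable` to `path`
    folder = os.path.dirname(path)
    if folder and not os.path.exists(folder):
        os.makedirs(folder)
    with open(path, 'wb') as f:
        pickle.dump(variable, f)


def read_variable(path):
    # hypothetical counterpart: load a pickled variable from `path`
    with open(path, 'rb') as f:
        return pickle.load(f)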
Example #2
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef
from utils import save_variable

# train 10 second-level forests, each on a disjoint, interleaved tenth of the rows
for forest_idx in range(10):
    tr_range = range(forest_idx, tr_row_nu, 10)
    t_X, t_Y = tr_votes[tr_range, :], tr_Y[tr_range]

    t0 = time.time()
    print('forest', forest_idx, end='-->')
    forest_2nd = RandomForestClassifier(max_depth=max_depth_best,
                                        n_estimators=11,
                                        random_state=12)
    forest_2nd.fit(t_X, t_Y)
    y_pred = forest_2nd.predict(t_X)
    print('tr:', matthews_corrcoef(t_Y, y_pred), end=',')
    val_y_pred = forest_2nd.predict(val_votes)
    print('val:', matthews_corrcoef(val_Y, val_y_pred), end='')
    print(',cost', int(time.time() - t0))
    save_variable(forest_2nd, 'final/2nd_level_models/' + str(forest_idx))
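#%% sanity check (illustrative, not part of the original run): the strided
# ranges above partition the rows into 10 disjoint, interleaved subsets that
# together cover every row exactly once
subsets = [list(range(i, 25, 10)) for i in range(10)]
assert sorted(x for s in subsets for x in s) == list(range(25))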

#%%
'''
forest 0-->tr: 0.853781308398,val: 0.547872937046,cost 84
forest 1-->tr: 0.839596341509,val: 0.550393783491,cost 89
forest 2-->tr: 0.868869382821,val: 0.544121676669,cost 110
forest 3-->tr: 0.831067645144,val: 0.556898396227,cost 111
forest 4-->tr: 0.866979504134,val: 0.572140934661,cost 108
forest 5-->tr: 0.865444728731,val: 0.565962549302,cost 105
forest 6-->tr: 0.847071811478,val: 0.550088494652,cost 105
forest 7-->tr: 0.867795690777,val: 0.55928161695,cost 110
forest 8-->tr: 0.849556952293,val: 0.54308323331,cost 111
forest 9-->tr: 0.852024718779,val: 0.558287108709,cost 105
'''
#%%
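#%% reference check (illustrative data): every example in this collection
# scores with matthews_corrcoef; MCC is the correlation between predicted
# and true binary labels:
# mcc = (tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn))
import numpy as np
from sklearn.metrics import matthews_corrcoef

y_true = np.array([1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 0, 1, 1])
tp = np.sum((y_true == 1) & (y_pred == 1))
tn = np.sum((y_true == 0) & (y_pred == 0))
fp = np.sum((y_true == 0) & (y_pred == 1))
fn = np.sum((y_true == 1) & (y_pred == 0))
mcc = (tp * tn - fp * fn) / np.sqrt(
    float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
print(mcc, matthews_corrcoef(y_true, y_pred))  # the two values match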
Example #3
'''
StratifiedKFold is a variation of k-fold which returns stratified folds:
each set contains approximately the same percentage of samples of each target
class as the complete set.
'''
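#%% a minimal StratifiedKFold illustration (synthetic data; assumes a recent
# sklearn where StratifiedKFold lives in sklearn.model_selection)
import numpy as np
from sklearn.model_selection import StratifiedKFold

toy_X = np.arange(12).reshape(-1, 1)
toy_y = np.array([0, 0, 1] * 4)  # 2:1 class ratio
for tr_idx, te_idx in StratifiedKFold(n_splits=4).split(toy_X, toy_y):
    # every fold keeps roughly the same class ratio as the whole set
    print(toy_y[tr_idx].mean(), toy_y[te_idx].mean())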
import os

from sklearn.metrics import matthews_corrcoef
from utils import load_pipped_tr_chunk, read_variable, save_variable

for root, dirs, files in os.walk('final/tr_groups'):
    for chunk_idx in files:
        print('chunk', chunk_idx, end='...')
        chunk_path = os.path.join(root, chunk_idx)
        tr_chunk = read_variable(chunk_path)
        model_path = 'final/1L_tree/' + str(chunk_idx)
        if os.path.isfile(model_path):
            print('model exist')
        else:
            save_variable({}, model_path)
            print('processing...')

            chunk_X, chunk_Y = load_pipped_tr_chunk([chunk_idx])

            # based on experiment, k=4 gives the smallest STD
            chunk_model = ForestChunkClassifierWithKFolds(k=4,
                                                          seeds=[13, 11, 193])
            chunk_model.fit(chunk_X, chunk_Y, test_X, test_Y)
            chunk_Y_pred = chunk_model.predict(chunk_X)
            chunk_mcc = matthews_corrcoef(chunk_Y, chunk_Y_pred)
            print('OVERALL tr:', chunk_mcc, end='')
            test_Y_pred = chunk_model.predict(test_X)
            test_mcc = matthews_corrcoef(test_Y, test_Y_pred)
            print(',test:', test_mcc, end='-->')
Example #4
from sklearn.feature_selection import SelectKBest, chi2

# k = X.shape[1] keeps every column: we only want the p-values here
category_dummys_feature_filter = SelectKBest(chi2, k=X.shape[1])
category_dummys_feature_filter.fit(X, y)

#%
category_dummys_pvalues_significance_level = 0.05
# the larger the p-value, the more similar the feature distribution is to y
# normally set the p-value threshold to 0.01 or 0.05
selected_col_ids = []
for id, p in enumerate(category_dummys_feature_filter.pvalues_):
    if p > category_dummys_pvalues_significance_level:
        selected_col_ids.append(id)
print(len(selected_col_ids))

utils.save_variable(
    'outputs/people_act_train_category_selected_col_ids_pvalue_' +
    str(category_dummys_pvalues_significance_level), selected_col_ids)

del X, y, id, p

#%% filter on rows; with 16GB RAM, only about 2K rows can be processed in a reasonable time
Y = people_act_train[column_names['y']]

row_total_0 = 1000
row_total_1 = 1000
row_count_0 = 0
row_count_1 = 0
selected_row_ids = []
for i, y in enumerate(Y):
    if y == 0 and row_count_0 < row_total_0:
        selected_row_ids.append(i)
        row_count_0 += 1
    elif y == 1 and row_count_1 < row_total_1:
        selected_row_ids.append(i)
        row_count_1 += 1
    if row_count_0 >= row_total_0 and row_count_1 >= row_total_1:
        break
Example #5
import pandas as pd

import utils
import progressbar
max_chunk_size = 1


chunks_num = pd.read_csv('data/train_numeric.csv',
                         index_col='Id',
                         chunksize=max_chunk_size,
                         low_memory=False,
                         iterator=True)

bar = progressbar.ProgressBar()

for chunk_id in bar(range(1183747)):
    # chunk has to be read one by one in sequence
    chunk_num_response = chunks_num.get_chunk()
    chunk_num = chunk_num_response.drop(['Response'], axis=1)
    chunk_y = chunk_num_response['Response']
    
    utils.save_variable(chunk_y, 'data/train_y_rows/' + str(chunk_id) + '.pkl')
    utils.save_variable(chunk_num,
                        'data/train_numeric_rows/' + str(chunk_id) + '.pkl')

    
#%%

chunks_num = pd.read_csv('data/test_numeric.csv',
                         index_col='Id',
                         chunksize=max_chunk_size,
                         low_memory=False,
                         iterator=True)


bar = progressbar.ProgressBar()

for chunk_id in bar(range(1183748)):
    # chunk has to be read one by one in sequence
    chunk_num = chunks_num.get_chunk()
    
    utils.save_variable(chunk_num,
                        'data/test_numeric_rows/' + str(chunk_id) + '.pkl')
Example #6
import time

import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import matthews_corrcoef
from utils import read_variable, save_variable

tr_X_1s = read_variable('model_stats/tr_pip_data_1s_1108.pkl')

len_1s = tr_X_1s.shape[0]

for set_id in range(166):
    # 6 chunks give about the same number of 0s (the tiny amount of 1s in
    # them is ignored) as there are 1s in tr_X_1s
    chunk_range = range(set_id, 1000, 166)
    t_X, t_Y = load_training_subset_1108(chunk_range)
    tr_X = np.concatenate([t_X, tr_X_1s])
    tr_Y = np.concatenate([t_Y, np.ones(len_1s)])
    X, Y = tr_X, tr_Y
    model = AdaBoostClassifier(n_estimators=100)
    t0 = time.time()
    model = model.fit(X, Y)
    y_pred = model.predict(X)
    print(set_id, 'boost:', ',tr:', matthews_corrcoef(Y, y_pred), end='')
    #print('tr 1s:real',sum(Y),',pred',sum(y_pred))
    #utils.save_variable(tree_votes_0,'models/tree_votes_0.pkl')
    print(',val:', end='')
    X, Y = val_X, val_Y
    y_pred = model.predict(X)
    print(matthews_corrcoef(Y, y_pred), end='')
    print(',cost:', int(time.time() - t0), 'sec')

    # NOTE: this break stops the loop after the first set, so the save below
    # is never reached
    break
    save_variable(model, '7/boost_' + str(set_id) + '.pkl')
Example #7
                for chunk_id in bar(range(0, chunk_nu)):

                    col = chunks.get_chunk()[col_name]
                    ys = responses[chunk_id *
                                   max_chunk_size:chunk_id * max_chunk_size +
                                   col.shape[0]]
                    for i in range(col.shape[0]):
                        value = col.iloc[i]
                        y = ys[i]
                        if value != value:
                            # NaN is the only value that is not equal to itself
                            cnts[y]['nan'] += 1
                        else:
                            cnts[y]['nu'].append(value)

                cnts[0]['nu'] = np.asarray(cnts[0]['nu']).reshape(-1, 1)
                cnts[1]['nu'] = np.asarray(cnts[1]['nu']).reshape(-1, 1)
                print('cal kde for 0...')
                if cnts[0]['nu'].size > 0:
                    cnts[0]['kde'] = KernelDensity(kernel='gaussian').fit(
                        cnts[0]['nu'])
                print('cal kde for 1...')
                if cnts[1]['nu'].size > 0:
                    cnts[1]['kde'] = KernelDensity(kernel='gaussian').fit(
                        cnts[1]['nu'])
                utils.save_variable(cnts, file_path)
        break
    except ValueError:
        print('get ValueError. Restart again.')

#%%
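#%% the per-class KDEs fitted above are presumably consumed by a scorer like
# cal_0_proba_by_cate in a later snippet; a hedged sketch of a numeric
# counterpart (hypothetical name; class priors and NaN counts ignored):
import numpy as np


def cal_0_proba_by_nu(cnts, value):
    # ratio of class-conditional densities estimated by the fitted KDEs
    d0 = np.exp(cnts[0]['kde'].score_samples([[value]]))[0]
    d1 = np.exp(cnts[1]['kde'].score_samples([[value]]))[0]
    return d0 / (d0 + d1)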
Example #8
#%% convert one date variable to three variables: year, month, and day
people_act_train_date_dummys = pd.DataFrame()
for name in column_names['date']:
    dates = pd.to_datetime(people_act_train[name])
    people_act_train_date_dummys[name + '_y'] = dates.apply(lambda x: x.year)
    people_act_train_date_dummys[name + '_m'] = dates.apply(lambda x: x.month)
    people_act_train_date_dummys[name + '_d'] = dates.apply(lambda x: x.day)
del name
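#%% a vectorized alternative (assumption: a pandas version with the .dt
# accessor); avoids the per-element lambda calls above
for name in column_names['date']:
    dates = pd.to_datetime(people_act_train[name])
    people_act_train_date_dummys[name + '_y'] = dates.dt.year
    people_act_train_date_dummys[name + '_m'] = dates.dt.month
    people_act_train_date_dummys[name + '_d'] = dates.dt.day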
#%% convert category column to integer
#-----------------------------------
people_act_train_category2int = pd.DataFrame()
for name in column_names['category']:
    people_act_train_category2int[name] = people_act_train[name].str.replace(
        r'((type)|(group))\s', '')

people_act_train_category2int = people_act_train_category2int.fillna(value=0)
del name
#%% convert integer to one hot codes
one_hot_encoder = OneHotEncoder(n_values='auto', sparse=True)
one_hot_encoder.fit(people_act_train_category2int)

#%% this variable will not be shown in the variable explorer; do NOT call
# toarray() on it, which would lead to a MemoryError
people_act_train_category_dummys = one_hot_encoder.transform(people_act_train_category2int)
print(people_act_train_category_dummys.shape)
print(type(people_act_train_category_dummys))

del people_act_train_category2int
#%%
utils.save_variable('outputs/people_act_train_date_dummys',people_act_train_date_dummys)
utils.save_variable('outputs/people_act_train_category_dummys',people_act_train_category_dummys)


Example #9
#%
#X = val.drop(X_col_exl, axis=1)
X = val['char_38'].values.reshape(-1, 1)
Y = val['outcome']
f = model.predict(X)
print('------VALIDATION-------')
(val_f1, val_auc, val_confusion) = utils.validate_prediction(f, Y)


print('============================')
print('FINAL ==',val_auc)
print('============================')

model_char38 = model
utils.save_variable('models/model_char38',model_char38)
del model

del X,Y,f

print('###########################')
print('without Char 38, Tree Nu:',estimator_nu)
print('---------------------------')

X_col_exl = ['outcome','char_38']
X = tr.drop(X_col_exl, axis=1)
Y = tr['outcome']
# col char_38
model = RandomForestClassifier(n_estimators=estimator_nu, verbose=0, n_jobs=-1)

startTime = time.time()
Example #10
remained_sample_ids = set(data.index.tolist())
test_ids = set([])

data_gp = data.groupby('timestamp')
test_ratio = 0.01
print('create test dataset', end='')


unique_timestamp = data["timestamp"].unique()
n = len(unique_timestamp)
test_start_i = int(n * (1 - test_ratio))
timesplit = unique_timestamp[test_start_i]

print('timesplit:', timesplit)
train_data = data[data.timestamp < timesplit]
test_data = data[data.timestamp >= timesplit]

utils.save_variable(test_data,'E:/two-sigma/output/timeseries/test_data')

#%%
'''
generate timeseries chunks
'''
ts_groups = train_data.groupby('timestamp')
for key in ts_groups.groups.keys():
    print('Timestamp:', key, end=',')
    row_ids = ts_groups.groups[key]
    gp_data = train_data.loc[row_ids]
    utils.save_variable(
        gp_data, 'E:/two-sigma/output/timeseries/tr_chunks/' + str(key))
    print('SIZE:', gp_data.shape)
Example #11
max_chunk_size = 1000
max_chunk_nu = 1184

chunks_num = pd.read_csv('data/train_numeric.csv',
                         index_col='Id',
                         chunksize=max_chunk_size,
                         low_memory=False,
                         iterator=True)
chunks_date = pd.read_csv('data/train_date.csv',
                          index_col='Id',
                          chunksize=max_chunk_size,
                          low_memory=False,
                          iterator=True)
chunks_cate = pd.read_csv('data/train_categorical.csv',
                          index_col='Id',
                          chunksize=max_chunk_size,
                          low_memory=False,
                          iterator=True)

bar = progressbar.ProgressBar()

for chunk_id in bar(range(max_chunk_nu)):
    # chunk has to be read one by one in sequence
    chunk_num_response = chunks_num.get_chunk()
    chunk_num = chunk_num_response.drop(['Response'], axis=1)
    chunk_y = chunk_num_response['Response']
    chunk_date = chunks_date.get_chunk()
    chunk_cate = chunks_cate.get_chunk()

    utils.save_variable(chunk_y,
                        'data/train_y_chunks/' + str(chunk_id) + '.pkl')
    utils.save_variable(chunk_num,
                        'data/train_numeric_chunks/' + str(chunk_id) + '.pkl')
    utils.save_variable(chunk_date,
                        'data/train_date_chunks/' + str(chunk_id) + '.pkl')
    utils.save_variable(
        chunk_cate, 'data/train_categorical_chunks/' + str(chunk_id) + '.pkl')

#%%

chunks_num = pd.read_csv('data/test_numeric.csv',
                         index_col='Id',
                         chunksize=max_chunk_size,
                         low_memory=False,
                         iterator=True)
chunks_date = pd.read_csv('data/test_date.csv',
                          index_col='Id',
                          chunksize=max_chunk_size,
                          low_memory=False,
                          iterator=True)
Example #12
#%%
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef
from utils import read_variable, save_variable

t_X, t_Y = votes[:90000, :], val_Y[:90000]
v_X, v_Y = votes[90000:, :], val_Y[90000:]

max_depth = 59
print(max_depth, end='-->')
forest_2nd = RandomForestClassifier(max_depth=max_depth,
                                    n_estimators=11,
                                    random_state=12)
forest_2nd.fit(t_X, t_Y)
y_pred = forest_2nd.predict(t_X)
print('tr:', matthews_corrcoef(t_Y, y_pred), end=',')
y_pred = forest_2nd.predict(v_X)
print('val:', matthews_corrcoef(v_Y, y_pred))

save_variable(forest_2nd, 'forest_2nd_14.pkl')


#%%    
"""
 produce testing result
"""

max_chunk_size = 1000
col_cate_nu = 2140
col_numeric_nu = 969
col_date_nu = 1157


pip = read_variable('model_stats/pip_1110.pkl')
Example #13
        for model_idx in bar(files):
            model_path = os.path.join(root, model_idx)
            model = read_variable(model_path)

            pack = {}
            pack['root'] = root
            pack['chunk_id'] = model_idx
            pack['model'] = model

            packs.append(pack)

print('model loaded:', len(packs))
#%%
from utils import load_pipped_tr_chunk, save_variable
from sklearn.metrics import matthews_corrcoef
'''
WARNING: 41 chunks cost about 20 hrs to finish
'''
for chunk_idx in range(41):
    print('chunk', chunk_idx, end='...')
    chunk_X, chunk_Y = load_pipped_tr_chunk([chunk_idx])

    for pack in packs:
        model = pack['model']
        chunk_Y_pred = model.predict(chunk_X)
        file_path = pack['root'] + '_tr_y_pred/' + str(
            pack['chunk_id']) + '_' + str(chunk_idx)
        save_variable(chunk_Y_pred, file_path)
        print(file_path, '-->', matthews_corrcoef(chunk_Y, chunk_Y_pred),
              ',1s:', str(sum(chunk_Y_pred)))
Example #14
model_forest = []
bar = progressbar.ProgressBar()
for model_id in bar(range(301)):
    model = read_variable('final/good_models/' + str(model_id))
    model_forest.append(model)

#%%

for chunk_id in range(1184):

    path = 'final/test_votes/' + str(chunk_id) + '.pkl'
    print('checking chunk:', chunk_id, end='...')
    if os.path.isfile(path):
        print('exist')
    else:
        save_variable({}, path)
        print('processing', end='...')
        chunk_X = load_pipped_test_chunks([chunk_id])

        # predict
        votes = np.zeros([chunk_X.shape[0], len(model_forest)])
        bar = progressbar.ProgressBar()
        model_id = 0
        for model in bar(model_forest):
            t0 = time.time()
            pred_Y = model.predict_proba(chunk_X)
            pred_Y_0 = pred_Y[:, 0]
            votes[:, model_id] = pred_Y_0
            model_id += 1
        save_variable(votes, path)
        print('saved to', path)
Example #15
        round_set += 1
        n_estimators += 10

    X, Y = tr_X, tr_Y

    y_pred = best_model.predict(X)

    print('tree BEST:', set_id, ',tr:', matthews_corrcoef(Y, y_pred), end='')

    print(',val:', end='')
    X, Y = val_X, val_Y
    y_pred = best_model.predict(X)
    val_mcc = matthews_corrcoef(Y, y_pred)
    print(val_mcc)
    print('#####################################')
    # save the best model found for this set
    save_variable(best_model, '8/forest_' + str(set_id) + '.pkl')

#%%
from sklearn.ensemble import RandomForestClassifier

len_1s = tr_X_1s.shape[0]

set_id = 0

chunk_range = range(set_id, 1000, 166)
t_X, t_Y = load_training_subset_1110(chunk_range)

tr_X = np.concatenate([t_X, tr_X_1s])
tr_Y = np.concatenate([t_Y, np.ones(len_1s)])
Example #16
#%% tolil() is more efficient than csr_matrix when values need to be modified
people_act_train_category_pcs_dsc_lil = people_act_train_category_pcs_dsc.tolil()
startTime = time.time()
for i in range(2174877, 1160000, -1):
    people_act_train_category_pcs_dsc_lil[i] = pca.transform(
        people_act_train_category_dummys[i, selected_col_ids].toarray())
    print(i, '/', length)
print('PCA Translation took', int(time.time() - startTime), 'sec')

print(people_act_train_category_dummys.shape, '-PCA->',
      people_act_train_category_pcs_dsc_lil.shape)

#%
utils.save_variable(
    'outputs/people_act_train_category_' + str(pc_number) + 'pcs_dsc_lil',
    people_act_train_category_pcs_dsc_lil)
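#%% why tolil()? LIL supports cheap row assignment while CSR does not; the
# usual pattern is csr -> lil -> edit rows -> csr (illustrative sketch,
# synthetic matrix):
from scipy.sparse import csr_matrix

demo = csr_matrix((3, 4)).tolil()
demo[0, :] = [1.0, 0.0, 2.0, 0.0]  # row assignment is efficient on LIL
demo = demo.tocsr()  # convert back to CSR for fast arithmetic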

#%% ASC####################
##########################
# can NOT transform all the data in one pass, which would lead to a memory leak
people_act_train_category_pcs = csr_matrix(
    (people_act_train_category_dummys.shape[0], pc_number))
people_act_train_category_pcs_asc = people_act_train_category_pcs
#%%
people_act_train_category_pcs_asc_lil = people_act_train_category_pcs_asc.tolil()
startTime = time.time()
for i in range(1160001):
    people_act_train_category_pcs_asc_lil[i] = pca.transform(
        people_act_train_category_dummys[i, selected_col_ids].toarray())
Example #17
        mcc = matthews_corrcoef(y_val, y_pred)
        print(mcc, end='')

        if mcc > best_tree_mcc:
            best_tree = tre
            best_tree_mcc = mcc
            print('(best)', end='')
        print()
    print('tree', tree_id, '-->best mcc:', best_tree_mcc)
    forest.append(best_tree)
    if id_end == x_tr.shape[0]:
        break
    tree_id += 1
    id_start = id_end

utils.save_variable(forest, 'model_stats/forest.pkl')

del x, y, id_start, id_end, tree_id, best_tree_mcc, best_tree
#%% validation based on three second-level trees
x = x_val_final
y = y_val_final

y_sum = np.zeros(y.shape[0])
threshold = 0

for tre in forest:
    y_sum += tre.predict(x)

# with threshold 0, a single positive tree vote is enough to predict 1
y_pred = (y_sum > threshold).astype(int)
Example #18
            print('(best)', end='')
        print()
        round_i += 1
    forest.append(best_model)
    tree_id += 1

    tr_chunk_start_index = tr_chunk_end_index
    tr_chunk_end_index = tr_chunk_start_index + tr_chunk_nu
    if tr_chunk_end_index > tr_chunk_end:
        tr_chunk_end_index = tr_chunk_end
    del tr_nu, tr_y, tr_x, best_model_mcc, best_model, x, y, votes_date_nu
    if tr_chunk_start_index == tr_chunk_end:
        del tr_chunk_start_index, tr_chunk_end
        break

utils.save_variable(forest, 'model_stats/forest_nu_date_cate.pkl')

del sgd_val_x, sgd_val_y
#%%
'''
validate overall model MCC on the validation dataset
'''
import progressbar
import numpy as np
from sklearn.metrics import matthews_corrcoef
import utils

forest = utils.read_variable('model_stats/forest_nu_date_cate.pkl')

val_chunk_ids = range(tr_chunk_end, chunk_nu, 1)
val_nu = 183747
Example #19
            pack['model'] = model

            packs.append(pack)

print('model loaded:', len(packs))

#%%

test_X, test_Y = load_pipped_test_chunk()

real_test_Y_1s_count = str(sum(test_Y))
for pack in packs:
    model = pack['model']
    test_Y_pred = model.predict(test_X)
    file_path = pack['root'] + '_test_y_pred/' + str(pack['chunk_id'])
    save_variable(test_Y_pred, file_path)
    print(file_path, '-->', matthews_corrcoef(test_Y, test_Y_pred), ',1s:',
          str(sum(test_Y_pred)) + '/' + real_test_Y_1s_count)

#%%
model_folders = [
    'final/1L_tree', 'final/1L_ada', 'final/1L_gb', 'final/1L_mlp'
]

packs = []
for model_folder in model_folders:
    print('Processing model cluster:', model_folder)
    for root, dirs, files in os.walk(model_folder):
        bar = progressbar.ProgressBar()
        for model_idx in bar(files):
            model_path = os.path.join(root, model_idx)
Example #20
feature_engine_pvalues = feature_engine.pvalues_
kbest_cols = []

# pvalue 0.05 or 0.1
pvalue_threshold = 0.05
for idx, pv in enumerate(feature_engine_pvalues):
    if pv > pvalue_threshold:
        kbest_cols.append(idx)

print('selected col:', len(kbest_cols))

pip['2_kbest_cols'] = kbest_cols

#%%

utils.save_variable(pip, 'model_stats/pip.pkl')
#%%
'''
modeling: search for the best max_depth
'''
from sklearn.feature_selection import SelectKBest, f_classif

X = tr_X[:, kbest_cols]
Y = tr_Y
# use random_state to produce repeatable result.
model = tree.DecisionTreeClassifier(random_state=0)
model = model.fit(X, Y)
y_pred = model.predict(X)

print('tree depth:MAX,tr:', matthews_corrcoef(Y, y_pred), end='')
#utils.save_variable(tree_votes_0,'models/tree_votes_0.pkl')
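#%% the docstring above announces a max_depth search, but only the unbounded
# tree is shown; a minimal sketch of such a sweep (illustrative, mirroring
# the pattern used in a later example; the depth grid is an assumption):
for max_depth in [10, 20, 30, 40, 50]:
    model = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=0)
    model = model.fit(X, Y)
    y_pred = model.predict(X)
    print('tree depth:', max_depth, ',tr:', matthews_corrcoef(Y, y_pred))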
Example #21
import random

import utils

remained_sample_ids = set(data.index.tolist())
sel_sample_ids = set([])

data_gp = data.groupby('timestamp')
test_ratio = 0.01
for name, group in data_gp:
    print('T' + str(name), end=',')
    gp_idx = set(data_gp.groups[name])
    idx_pool = gp_idx.intersection(remained_sample_ids)
    sel_ids_len = int(len(gp_idx) * test_ratio)
    # random.sample needs a sequence, so materialize the set as a list
    sel_ids = set(random.sample(list(idx_pool), sel_ids_len))
    print('sel', len(sel_ids), 'samples from', len(gp_idx))
    sel_sample_ids = sel_sample_ids | sel_ids
    remained_sample_ids = remained_sample_ids - sel_ids

utils.save_variable(sel_sample_ids, 'output/test_ids')
print('testing samples:', len(sel_sample_ids))
del sel_sample_ids
#%%
test_ids = list(utils.read_variable('output/test_ids'))
'''
check whether the testing dataset is well distributed among the different object ids
'''
data_gp = data.groupby('id')
overall_stats_by_id = {}
for name, group in data_gp:
    print(name, 'has samples:', group.shape[0])
    overall_stats_by_id[name] = group.shape[0]

test_data = data.loc[test_ids]
data_gp = test_data.groupby('id')
Example #22
cols_numeric = column_names['numeric']

col_len = cols_categorical.size + cols_date.size + cols_numeric.size

del column_names
#%% 
'''
calculate probability of response 0 with categorical col
'''
col_stats_cate = {}
print('import col statistic:','categorical columns')
bar = progressbar.ProgressBar()
for col_name in bar(cols_categorical):
    col_stats_cate[col_name] = utils.read_variable('model_stats/cate/'+col_name+'.pkl')

utils.save_variable(col_stats_cate,'model_stats/col_stats_cate.pkl')

      
'''
calculate probability of response 0 with date col
'''
col_stats_date = {}
print('import col statistic:','date columns')
bar = progressbar.ProgressBar()
for col_name in bar(cols_date):
    stat = utils.read_variable('model_stats/date/'+col_name+'.pkl')
    # remove the 'nu' lists to save memory
    del stat[0]['nu'], stat[1]['nu']
    col_stats_date[col_name] = stat

utils.save_variable(col_stats_date,'model_stats/col_stats_date.pkl')
Example #23
    nan_count = 0
    for chunk_id in range(0, chunk_nu):
        chunk = chunks.get_chunk()
        nan_count += np.sum(chunk.isnull())
    nan_counts[col_index] = nan_count
    col_nan_counts[col_name] = nan_count

    #print('\nnan values:',nan_counts[col_index]/train_rows_nu)
    col_index += 1

del file_name, cols, col_index, chunks, bar, nan_count, chunk_id

#%%

# fraction of NaN values per column
a = nan_counts / (max_chunk_size * chunk_nu)

#%%

nan_threshold_percent = 0.2

threshold = nan_threshold_percent * max_chunk_size * chunk_nu
col_selected = []
bar = progressbar.ProgressBar()
for key, value in bar(col_nan_counts.items()):
    if int(value) < threshold:
        col_selected.append(key)

print(len(col_selected), 'cols selected.')
file_path = 'model_stats/col_nu_selected_2percent_Nan.pkl'
utils.save_variable(col_selected, file_path)
print('result is saved to:', file_path)
Example #24
                    best_model = model
                    print('-->BEST')
                else:
                    print()
                print(',test:', end='')
                final_y_pred = model.predict(final_val_X)
                final_val_mcc = matthews_corrcoef(final_val_Y, final_y_pred)
                print(final_val_mcc)
                print('---------------------------------------')

        print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')

        print('tree:', gp_idx, 'BEST,val:', best_val_mcc, end='-->')

        if best_val_mcc > good_model_val_mcc:
            save_variable(best_model, file_path)
            print('SAVED' + '(' + file_path + ')')
        else:
            print('DISCARD')
        print('#####################################')

#%%
'''
TEST
'''

from sklearn.ensemble import RandomForestClassifier
import os

good_model_val_mcc = 0.2
Example #25
            best_model = model
            best_tr_mcc = tr_mcc
            print('<---best')
        else:
            print()
        if best_tr_mcc > 0.9:
            break
    tr_Y_pred = best_model.predict(tr_X)
    best_tr_mcc = matthews_corrcoef(tr_Y, tr_Y_pred)
    print(set_id, 'best sgd:', ',tr:', best_tr_mcc, end='')
    print(',val:', end='')
    val_Y_pred = best_model.predict(val_X)
    print(matthews_corrcoef(val_Y, val_Y_pred))

    # NOTE: this break stops the loop after the first set, so the save below
    # is never reached
    break
    save_variable(best_model, '7/sgd_' + str(set_id) + '.pkl')

#%%
'''

No matter what the config, SGD gives really bad predictions even on training.

0 / 1 sgd: ,tr: 0.0130409009077,val:-0.00894843344066 ( 3325 )
0 / 2 sgd: ,tr: -0.0126166788456,val:-0.00494400796572 ( 3414 )
0 / 3 sgd: ,tr: 0.0210647425353,val:-0.00608883260096 ( 2121 )
0 / 4 sgd: ,tr: 0.0409474491906,val:0.00673896161817 ( 1048 )
0 / 5 sgd: ,tr: 0.0178420591587,val:-0.00513487374047 ( 2921 )
0 / 6 sgd: ,tr: 0.0812392407412,val:0.0155743265752 ( 4745 )<---best
0 / 7 sgd: ,tr: 0.0667857466617,val:0.0115874549738 ( 1881 )
0 / 8 sgd: ,tr: 0.0161787956102,val:-0.00146447562307 ( 2706 )
0 / 9 sgd: ,tr: 0.0362361124868,val:-0.0127457101921 ( 2143 )
Example #26
    people_act_train['activity_category'] == 'type 1')][0:1]
people_act_char_10_a_notnull = people_act_train.loc[
    people_act_train['char_10_a'].notnull()][0:1]

#%%
people_act_grouped = people_act_train.groupby('outcome')
#%% outcome 0 VS. outcome 1
people_act_outcome0 = people_act_grouped.get_group(0)
people_act_outcome1 = people_act_grouped.get_group(1)

plt.pie([people_act_outcome0.shape[0], people_act_outcome1.shape[0]],
        labels=['0', '1'],
        autopct='%1.4f%%')
plt.show()

#%% review category variables
category_column_names_stat = pd.DataFrame(index=column_names['category'],
                                          columns=['unique_types'])
for name in column_names['category']:
    shape = people_act_train[name].unique().shape
    category_column_names_stat.loc[name, 'unique_types'] = shape[0]
print(category_column_names_stat)

#%%
people_act_train['group_1'].unique()
people_act_train['char_10_a'].unique()

#%%
utils.save_variable('people_act_train', people_act_train)
utils.save_variable('people_act_test', people_act_test)
Example #27
chunks_cate = pd.read_csv('data/test_categorical.csv',
                          index_col='Id',
                          chunksize=max_chunk_size,
                          low_memory=False,
                          iterator=True)

for chunk_id in range(chunk_nu):
    print('processing chunk:', chunk_id)
    chunk = chunks_cate.get_chunk()

    file_path = 'model_stats/test_cate_proba/' + str(chunk_id) + '.pkl'
    if os.path.exists(file_path):
        print('already exist.')
    else:
        ids = chunk.index
        result = np.zeros(chunk.shape)
        time1 = time.time()

        r = 0
        c = 0
        for col_name in cols_cate:
            col = chunk[col_name]
            time2 = time.time()
            r = 0
            for index, value in col.iteritems():
                proba = cal_0_proba_by_cate(col_name, value)
                result[r, c] = proba
                r += 1
            c += 1
        print('per chunk:', time.time() - time1)
        df = pd.DataFrame(data=result, index=ids.tolist(), columns=cols_cate)
        utils.save_variable(df, file_path)
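#%% cal_0_proba_by_cate is not shown in these snippets; a hedged
# reconstruction, assuming col_stats_cate (see an earlier example) holds
# per-class value counts (hypothetical -- the real helper may differ):
def cal_0_proba_by_cate(col_name, value):
    # relative frequency of response 0 among rows with `value` in this column
    stat = col_stats_cate[col_name]
    n0 = stat[0].get(value, 0)
    n1 = stat[1].get(value, 0)
    if n0 + n1 == 0:
        return 0.5  # unseen value: fall back to an uninformative score
    return n0 / float(n0 + n1)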
Example #28
        tr_X = t_X
        tr_Y = t_Y
        '''
        based on experiment, a smaller tol gives a better fit on the training dataset
         tol : float, default: 1e-4
        Tolerance for stopping criteria.
        '''

        best_model = KNeighborsClassifier(n_jobs=3)
        t0 = time.time()
        best_model = best_model.fit(tr_X, tr_Y)
        tr_Y_pred = best_model.predict(tr_X)
        best_tr_mcc = matthews_corrcoef(tr_Y, tr_Y_pred)
        print(set_id, '- i', '(', tol, ')', 'logic:', ',tr:', best_tr_mcc,
              end='')

        #        print(',val:',end='')
        #        val_Y_pred = best_model.predict(val_X)
        #        print(matthews_corrcoef(val_Y, val_Y_pred),end='')
        #        print(',cost:',int(time.time()-t0),'sec')
        #print('val 1s:real',sum(val_Y),',pred',sum(val_Y_pred))

        save_variable(best_model, file_path)
Example #29
tr_X_1s = utils.read_variable('model_stats/tr_pip_data_1s.pkl')

len_1s = tr_X_1s.shape[0]

for set_id in range(6, 1000, 6):
    # 6 chunks give about the same number of 0s (the tiny amount of 1s in
    # them is ignored) as there are 1s in tr_X_1s

    file_path = '7/svc_' + str(set_id) + '.pkl'
    if os.path.exists(file_path):
        print('already exist.', file_path)
    else:
        chunk_range = range(set_id - 6, set_id, 1)
        t_X, t_Y = utils.load_training_subset(chunk_range)
        tr_X = np.concatenate([t_X, tr_X_1s])
        tr_Y = np.concatenate([t_Y, np.ones(len_1s)])
        model = SVC(kernel='rbf', C=1)
        t0 = time.time()
        model = model.fit(tr_X, tr_Y)
        y_pred = model.predict(tr_X)
        print(set_id, 'svc:', ',tr:', matthews_corrcoef(tr_Y, y_pred))

        # skip validation here, which would take too long
        #    print(',val:',end='')
        #    X, Y = val_X, val_Y
        #    y_pred = model.predict(X)
        #    print(matthews_corrcoef(Y, y_pred),end='')
        #    print(',cost:',int(time.time()-t0),'sec')

        utils.save_variable(model, file_path)
Example #30
import os

import numpy as np
import progressbar
from utils import read_variable, save_variable

row_total = 1183747
#%%
'''
get 1% samples as testing dataset for the final testing

NOTE: use a permutation so the split draws information from across the whole
training dataset, avoiding bias between samples at different sequential positions
'''
idx = np.random.permutation(row_total)
tr_test_split = int(row_total * 0.01)
test_row_idx = idx[:tr_test_split]
tr_row_idx = idx[tr_test_split:]

save_variable(tr_row_idx, 'final/tr_row_idx')
save_variable(test_row_idx, 'final/test_row_idx')

# check that the test dataset has the same distribution of outcomes as training (about 0.5% 1s)
count_1 = 0
bar = progressbar.ProgressBar()
for row_id in bar(test_row_idx):
    row_y = read_variable('data/train_y_rows/' + str(row_id) + '.pkl')
    count_1 += row_y.values

print()
print('test 1s:', count_1, '(', count_1 / len(test_row_idx), ')')
#%%
'''
rebalance 1s and 0s in the rest of the dataset
'''
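#%% the snippet ends here; a minimal sketch of one way the rebalancing could
# proceed, undersampling the 0s to match the 1s (illustrative only -- the
# output path is an assumption):
ones_idx = [i for i in tr_row_idx
            if read_variable('data/train_y_rows/' + str(i) + '.pkl').values[0] == 1]
ones_set = set(ones_idx)
zeros_idx = [i for i in tr_row_idx if i not in ones_set]
balanced_idx = np.concatenate(
    [ones_idx, np.random.choice(zeros_idx, len(ones_idx), replace=False)])
save_variable(balanced_idx, 'final/balanced_tr_row_idx')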