def split_csv(basefname='train_numeric.csv'):
    basefname_part_template = ub.data_fname_to_partial_data_fname_template(basefname)
    input_fname = os.path.join(ub.data_dir, basefname)
    output_fname_template = os.path.join(ub.data_dir, basefname_part_template)
    ub.log('reading {}'.format(input_fname))
    sw = ub.StopWatch('read ' + basefname)
    reader = pd.read_csv(input_fname, chunksize=50000, low_memory=False)
    for i, chunk in enumerate(reader):
        print i
        chunk.to_csv(output_fname_template.format(i), index=False)
    ub.log(sw.stop())
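
# Note: with chunksize set, pd.read_csv returns an iterator of DataFrames (~50k
# rows each here) rather than one big frame, so the full csv never has to fit in
# memory; each chunk is simply written back out as its own part file.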
def split_csv_old(basefname='train_numeric.csv'):
    basefname_part_template = ub.data_fname_to_partial_data_fname_template(basefname)
    input_fname = os.path.join(ub.data_dir, basefname)
    output_fname_template = os.path.join(ub.data_dir, basefname_part_template)
    ub.log('reading {}'.format(input_fname))
    nrows = None
    sw = ub.StopWatch('read ' + basefname)
    df_train = pd.read_csv(input_fname, nrows=nrows, low_memory=False)
    print 'df shape: {}'.format(df_train.shape)
    ub.log(sw.stop())

    # split original csv
    ub.log('splitting {}'.format(basefname))
    N_split = ub.N_split
    ub.log('outputting {}'.format(output_fname_template))
    n_rows = int(df_train.shape[0] / N_split)
    widgets = ['splitting', ': ', Percentage(), ' ', Bar(), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets, maxval=N_split).start()
    for i in range(N_split):
        output_fname = output_fname_template.format(i)
        if 0:  # debug printout, disabled
            if i < N_split - 1:
                print 'writing rows:', i * n_rows, (i + 1) * n_rows - 1
            else:
                print 'writing rows:', i * n_rows, df_train.shape[0]
        # let the last part take the remainder rows so none are dropped
        end_row = (i + 1) * n_rows if i < N_split - 1 else df_train.shape[0]
        df_train[i * n_rows:end_row].to_csv(output_fname, index=False)
        pbar.update(i)
    pbar.finish()
def split_data(df_to_split, output_fname_template=None):
    # output_fname_template example:
    #   os.path.join(ub.data_dir, 'df_train_preprocessed_part{}.csv')
    ub.log('splitting {}'.format(output_fname_template))
    N_split = ub.N_split
    ub.log('outputting {}'.format(output_fname_template))
    n_rows = int(df_to_split.shape[0] / N_split)
    widgets = ['splitting', ': ', Percentage(), ' ', Bar(), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets, maxval=N_split).start()
    for i in range(N_split):
        output_fname = output_fname_template.format(i)
        if 0:  # debug printout, disabled
            if i < N_split - 1:
                print 'writing rows:', i * n_rows, (i + 1) * n_rows - 1
            else:
                print 'writing rows:', i * n_rows, df_to_split.shape[0]
        # let the last part take the remainder rows so none are dropped
        end_row = (i + 1) * n_rows if i < N_split - 1 else df_to_split.shape[0]
        df_to_split[i * n_rows:end_row].to_csv(output_fname, index=False)
        pbar.update(i)
    pbar.finish()
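
# A minimal usage sketch (df_train_preprocessed is a hypothetical frame name; the
# template mirrors the example in the comment above):
if 0:
    split_data(df_train_preprocessed,
               output_fname_template=os.path.join(ub.data_dir, 'df_train_preprocessed_part{}.csv'))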
# for c in cols:
#     # print c
#     existing_cols = list(df_tmp_unique.columns)
#     # print existing_cols
#
#     already_have_it = False
#     for ec in existing_cols:
#         if df_tmp_unique[ec].equals(df_date[c]):
#             ub.log('Skipping ' + c)
#             already_have_it = True
#
#     if not already_have_it:
#         ub.log('Processing date csv, adding ' + c)
#         df_tmp_unique[c] = df_date[c]

#%%
# import sys
# sys.exit(0)

ub.log('Done processing date csv')

df_train_date = df_date[df_date['is_train'] == 1].drop(['is_train', 'Response'], axis=1)
df_test_date = df_date[df_date['is_train'] == 0].drop(['is_train', 'Response'], axis=1)

df_train_date.to_csv('../data_processed/df_train_date_{}.csv'.format(tag), index=False)
df_test_date.to_csv('../data_processed/df_test_date_{}.csv'.format(tag), index=False)

print df_train_date.shape
print df_test_date.shape

del df_train_date
del df_test_date
    ]))
print TS_list
print len(TS_list)
S_list = [x.split('_')[1] for x in TS_list]
print S_list
print len(S_list)

col_list = list(df_num.columns)
df_tmp_unique = df_num[['Id', 'is_train']].copy()  # to host additional columns
# within each line/station prefix, keep only one copy of element-wise identical columns
for ts in TS_list:
    # print ts
    cols = [x for x in col_list if ts in x]
    for c in cols:
        # print c
        existing_cols = [x for x in df_tmp_unique.columns if ts in x]
        # print existing_cols
        already_have_it = False
        for ec in existing_cols:
            if df_tmp_unique[ec].equals(df_num[c]):
                ub.log('Skipping ' + c)
                already_have_it = True
        if not already_have_it:
            ub.log('Processing numeric csv, adding ' + c)
            df_tmp_unique[c] = df_num[c]

print df_tmp_unique.shape
with open('check_num.txt', 'w') as f:
    f.write('\n'.join(list(df_tmp_unique.columns)))
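
#%%
if 0:
    # Toy demo (not part of the pipeline) of the Series.equals dedup used above;
    # assumes pd/np are imported at the top of this script. Two columns that are
    # element-wise identical (NaNs included) are detected and only one is kept.
    toy = pd.DataFrame({'a': [1.0, np.nan], 'b': [1.0, np.nan], 'c': [1.0, 2.0]})
    print toy['a'].equals(toy['b'])  # True  -> 'b' is a duplicate, skipped
    print toy['a'].equals(toy['c'])  # False -> 'c' carries new information, kept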
def load_data(load_test=False, original_cols_only=False, N_start=None, N_read=5, N_split=24,
              shuffle=False, feature_list_file=None, load_date_csv=True,
              load_numerical_csv=True, load_categorical_csv=True):
    assert load_categorical_csv or load_numerical_csv or load_date_csv
    if load_test:
        ub.log('load test files')
    else:
        ub.log('load train files')
    print 'N_start: {}'.format(N_start),
    if N_start is None:
        print ', i.e. random start'
    else:
        print ''
    print 'N_read: {}'.format(N_read)
    print 'N_splits: {}'.format(N_split)
    print 'shuffle: {}'.format(shuffle)
    print 'original_cols_only: {}'.format(original_cols_only)
    print 'feature_list_file: {}'.format(feature_list_file)

    if shuffle:
        N_list = range(N_split)
        random.shuffle(N_list)
        file_ids = N_list[:N_read]
    else:
        if N_start is None:
            N_start = random.randint(0, N_split - N_read)
        N_list = range(N_start, N_read + N_start)
        file_ids = N_list
    print file_ids,

    basefname_list = ub.train_files
    if load_test:
        basefname_list = ub.test_files
    print basefname_list

    original_date_cols = []
    original_num_cols = []
    original_cat_cols = []

    # widgets = ['reading', ': ', Percentage(), ' ', Bar(), ' ', ETA()]
    # pbar = ProgressBar(widgets=widgets, maxval=N_read).start()
    counter = 0
    df_output = None
    for id in file_ids:
        # print id
        counter += 1
        df_chunk = None
        for basefname in basefname_list:
            basefname_part_template = ub.data_fname_to_partial_data_fname_template(basefname)
            input_fname_template = os.path.join(ub.data_dir, basefname_part_template)
            input_fname = input_fname_template.format(id)
            ub.log('reading {}'.format(input_fname))
            if 'date' in basefname:
                if not load_date_csv:
                    print 'skip date table...'
                    continue
                else:
                    if os.path.exists(input_fname + '.pkl'):
                        # pickle.load needs a file object, not a path
                        with open(input_fname + '.pkl', 'rb') as fpkl:
                            df_tmp = pickle.load(fpkl)
                    else:
                        df_tmp = pd.read_csv(input_fname, low_memory=False)
                    if not original_date_cols:
                        original_date_cols = list(df_tmp.columns)
                    df_tmp_add = df_tmp[['Id']].copy()  # to host additional columns
                    if not original_cols_only:
                        df_tmp_add['null_col_count'] = df_tmp.isnull().sum(axis=1)
                        df_tmp_add['start_date'] = df_tmp.drop('Id', axis=1).min(axis=1)
                        df_tmp_add['end_date'] = df_tmp.drop('Id', axis=1).max(axis=1)
                        df_tmp_add['time_span'] = df_tmp_add.end_date - df_tmp_add.start_date
                        TS_list = list(set(['_'.join(x.split('_')[:2])
                                            for x in list(df_tmp.columns) if x.startswith('L')]))
                        # print TS_list
                        col_list = list(df_tmp.columns)
                        df_tmp_add['active_ts_cnt'] = 0
                        for ts in TS_list:
                            cols = [x for x in col_list if ts in x]
                            df_tmp_add[ts + '_start_date'] = df_tmp[cols].min(axis=1)
                            df_tmp_add[ts + '_end_date'] = df_tmp[cols].max(axis=1)
                            df_tmp_add[ts + '_time_span'] = (df_tmp_add[ts + '_end_date'] -
                                                             df_tmp_add[ts + '_start_date'])
                            # add TS active flag
                            df_tmp_add[ts + '_active'] = df_tmp_add[ts + '_start_date'].notnull().astype(int)
                            df_tmp_add['active_ts_cnt'] = df_tmp_add['active_ts_cnt'] + df_tmp_add[ts + '_active']
                        # add TS ordering columns
                        # ub.log('Adding TS ordering columns')
                        col_list = list(df_tmp_add.columns)
                        # with open(os.path.join(ub.code_dir, 'df_tmp_add_debug_1029.pkl'), 'w') as fpickle:
                        #     pickle.dump(df_tmp_add, fpickle)
                        for t in ['_start_date', '_end_date', '_time_span']:
                            # print t
                            cols = [x for x in col_list if t in x]
                            # print cols
                            # double argsort turns the raw values into per-row ranks
                            df_tmp_order = np.argsort(np.argsort(df_tmp_add[cols]))
                            df_tmp_order = pd.DataFrame(df_tmp_order)
                            df_tmp_order['Id'] = df_tmp_add['Id']
                            df_tmp_add = pd.merge(df_tmp_add, df_tmp_order, on='Id', suffixes=['', '_rank'])
            elif 'numeric' in basefname:
                if not load_numerical_csv:
                    if not load_test:
                        print 'skip numerical table... but will still read Response column'
                        df_tmp = pd.read_csv(input_fname, usecols=['Id', 'Response'])
                        df_tmp_add = df_tmp[['Id']].copy()  # to host additional columns
                    else:
                        print 'skip numerical table...'
                        continue
                else:
                    if os.path.exists(input_fname + '.pkl'):
                        with open(input_fname + '.pkl', 'rb') as fpkl:
                            df_tmp = pickle.load(fpkl)
                    else:
                        df_tmp = pd.read_csv(input_fname, low_memory=False)
                    if not original_num_cols:
                        original_num_cols = list(df_tmp.columns)
                    df_tmp_add = df_tmp[['Id']].copy()  # to host additional columns
                    df_tmp_add['num_null_col_count'] = df_tmp.isnull().sum(axis=1)
                    df_tmp_add['num_active_ts_cnt'] = 0
                    TS_list = list(set(['_'.join(x.split('_')[:2])
                                        for x in list(df_tmp.columns) if x.startswith('L')]))
                    col_list = list(df_tmp.columns)
                    for ts in TS_list:
                        cols = [x for x in col_list if ts in x]
                        # print cols
                        df_tmp_add[ts + '_num_active'] = (df_tmp[cols].notnull().sum(axis=1) / len(cols)).astype(int)
                        df_tmp_add['num_active_ts_cnt'] = (df_tmp_add['num_active_ts_cnt'] +
                                                           df_tmp_add[ts + '_num_active'])
            elif 'categorical' in basefname:
                if not load_categorical_csv:
                    print 'skip categorical table...'
                    continue
                else:
                    if os.path.exists(input_fname + '.pkl'):
                        with open(input_fname + '.pkl', 'rb') as fpkl:
                            df_tmp = pickle.load(fpkl)
                    else:
                        df_tmp = pd.read_csv(input_fname, low_memory=False)
                    if not original_cat_cols:
                        original_cat_cols = list(df_tmp.columns)
                    df_tmp_add = df_tmp[['Id']].copy()  # to host additional columns
                    df_tmp_add['cat_null_col_count'] = df_tmp.isnull().sum(axis=1)
                    df_tmp_add['cat_active_ts_cnt'] = 0
                    TS_list = list(set(['_'.join(x.split('_')[:2])
                                        for x in list(df_tmp.columns) if x.startswith('L')]))
                    col_list = list(df_tmp.columns)
                    for ts in TS_list:
                        cols = [x for x in col_list if ts in x]
                        # print cols
                        df_tmp_add[ts + '_cat_active'] = (df_tmp[cols].notnull().sum(axis=1) / len(cols)).astype(int)
                        df_tmp_add['cat_active_ts_cnt'] = (df_tmp_add['cat_active_ts_cnt'] +
                                                           df_tmp_add[ts + '_cat_active'])
            else:
                assert 0

            print 'df_tmp shape:', df_tmp.shape

            selected_features = []
            if feature_list_file is not None:
                # cols_selected = ub.cols[keyword]
                feature_list_file_full_path = os.path.join(ub.code_dir, os.path.basename(feature_list_file))
                ub.log('Using feature list: {}'.format(feature_list_file_full_path), 'highlight')
                with open(feature_list_file_full_path, 'r') as f_feature:
                    selected_features = [x.strip() for x in f_feature.readlines()]
                if 'Id' not in selected_features:
                    selected_features.append('Id')
                if 'Response' not in selected_features:
                    selected_features.append('Response')
                # ub.log('Down selecting features with N_features={}'.format(N_features))
                # if feature_print_flag == 0:
                #     # N_features = 500
                #     ub.log('Down selecting features with N_features={}'.format(N_features))
                #     df_feature = pd.read_csv(os.path.join(ub.output_dir, 'feature_importance_xgb_accumu_list_df.csv'))
                #     target_features = list(set(df_feature.sort_values(by=['fscore'])['feature'].values[:N_features]))
                #
                #     f_test = 'load_data_record_col_names_Test_2016-10-25 20:31:38.txt'
                #     f_train = 'load_data_record_col_names_Train_2016-10-25 20:30:30.txt'
                #     with open(os.path.join(ub.output_dir, f_train), 'r') as f:
                #         col_list_all = [x.strip() for x in f.readlines()]
                #     # print len(target_features)
                #     # print len(col_list_all)
                #
                #     actual_features = []
                #     for feature in target_features:
                #         if 'id_diff' in feature:
                #             # print feature
                #             actual_features.append(
                #                 feature.replace('_id_diff', '').replace('_reverse', '').replace('_magic', ''))
                #             # these are created after loading in data
                #         else:
                #             actual_features.append(feature)
                #     # print actual_features
                #     # print len(actual_features)
                #     # print len(set(actual_features))
                #     actual_features = list(set(actual_features))
                #
                #     selected_features = []
                #     for x in actual_features:
                #         if x in col_list_all:
                #             selected_features.append(x)
                #         else:
                #             ub.log('Found feature: {}, not in col_list_all, removing it'.format(x), 'error')
                #
                #     selected_features.append('Id')
                #     if not load_test:
                #         selected_features.append('Response')
                #
                #     ub.log('Features selected ({}):'.format(len(selected_features)), 'highlight')
                #     # print '\n'.join(selected_features)
                #     # print selected_features
                #     feature_print_flag += 1

                # print df_tmp.columns
                df_tmp = df_tmp[list(set(df_tmp.columns) & set(selected_features))]
                print 'df_tmp shape (after selection):', df_tmp.shape
                # print df_tmp.shape
                # print df_tmp.columns

            print 'df_tmp_add shape:', df_tmp_add.shape
            if len(set(df_tmp.columns) & set(df_tmp_add.columns)) > 1:
                # the only column the two frames should share is Id
                print set(df_tmp.columns) & set(df_tmp_add.columns)
                assert 0
            df_tmp = df_tmp.merge(df_tmp_add, on='Id', copy=False)
            print 'df_tmp shape (merged):', df_tmp.shape
            if selected_features:
                df_tmp = df_tmp[list(set(df_tmp.columns) & set(selected_features))]
                print 'df_tmp shape (merged, after selection):', df_tmp.shape
                # print df_tmp.shape
                # print df_tmp.columns

            if df_chunk is None:
                df_chunk = df_tmp
            else:
                df_chunk = df_chunk.merge(df_tmp, on='Id', copy=False)
            print 'df_chunk shape (merged):', df_chunk.shape
            gc.collect()

        if not original_cols_only:
            if (not selected_features) or (('time_per_TS_num' in selected_features)
                                           and ('time_per_TS_cat' in selected_features)
                                           and ('time_span' in selected_features)
                                           and ('num_active_ts_cnt' in selected_features)
                                           and ('cat_active_ts_cnt' in selected_features)):
                if 'num_active_ts_cnt' in df_chunk.columns:
                    df_chunk['time_per_TS_num'] = df_chunk.time_span / (1e-9 + df_chunk['num_active_ts_cnt'])
                if 'cat_active_ts_cnt' in df_chunk.columns:
                    df_chunk['time_per_TS_cat'] = df_chunk.time_span / (1e-9 + df_chunk['cat_active_ts_cnt'])
                if 'active_ts_cnt' in df_chunk.columns:
                    df_chunk['time_per_TS'] = df_chunk.time_span / (1e-9 + df_chunk['active_ts_cnt'])

        # print 'df_chunk shape: {}'.format(df_chunk.shape)
        if df_output is None:
            df_output = df_chunk
        else:
            df_output = pd.concat([df_output, df_chunk], ignore_index=True)
        # pbar.update(counter)
        gc.collect()
    # pbar.finish()

    print 'df_output shape: {}'.format(df_output.shape)
    # dates_cols = [x for x in list(df_output.columns) if 'start_date' in x or 'end_date' in x]
    # df_output[dates_cols].head(n=1000).to_csv(os.path.join(ub.data_dir, 'df_output_debug.csv'))

    datetime_str2 = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    col_list_fname = os.path.join(ub.output_dir, 'load_data_record_col_names_Train_{}.txt'.format(datetime_str2))
    if load_test:
        col_list_fname = os.path.join(ub.output_dir, 'load_data_record_col_names_Test_{}.txt'.format(datetime_str2))
    with open(col_list_fname, 'w') as fp:
        fp.write('\n'.join(list(df_output.columns)))

    return df_output, N_start, list(set(original_num_cols + original_cat_cols + original_date_cols))
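
if 0:
    # Toy demo (not part of the pipeline) of the double-argsort ranking used in
    # load_data: applied row-wise, it converts raw station dates into the rank
    # each station takes within its own row (NaNs sort last).
    toy = np.array([[3.0, 1.0, 2.0],
                    [10.0, 20.0, 5.0]])
    print np.argsort(np.argsort(toy))
    # [[2 0 1]
    #  [1 2 0]]  i.e. each value's rank within its row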
def main(run_info_fname=None,
         compile_data=False,
         train_model=False,
         make_submission=False,
         N_start=None,
         N_files_train=1,
         N_files_test=1,
         original_cols_only=False,
         disable_id_diff_cols=False,
         feature_list_file=None,
         analyze_feature_importance=False,
         cv=False,  # if True, run cross-validation; if False, run a single training session and importance analysis
         early_stop_rounds=10,
         N_rounds=1000,
         testsize=0.3,
         xgb_params=None,
         skip_date_csv=False,
         skip_num_csv=False,
         skip_cat_csv=False):
    datetime_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    if compile_data:
        run_info = dict()
        N_splits = ub.N_split
        if N_files_train > N_splits:
            N_files_train = N_splits
        if N_files_test > N_splits:
            N_files_test = N_splits
        # if analyze_feature_importance and (feature_list_file is not None):
        #     assert 0
        run_info['compile_data'] = compile_data
        run_info['N_splits'] = N_splits
        run_info['N_start'] = N_start
        run_info['N_files_train'] = N_files_train
        run_info['N_files_test'] = N_files_test
        run_info['original_cols_only'] = original_cols_only
        run_info['disable_id_diff_cols'] = disable_id_diff_cols
        run_info['features_list_file'] = feature_list_file
        run_info['skip_date_csv'] = skip_date_csv
        run_info['skip_num_csv'] = skip_num_csv
        run_info['skip_cat_csv'] = skip_cat_csv

        df_train, n_start, orig_cols = load_data(load_test=False, N_start=N_start, N_read=N_files_train,
                                                 N_split=N_splits, original_cols_only=original_cols_only,
                                                 feature_list_file=feature_list_file,
                                                 load_categorical_csv=(not skip_cat_csv),
                                                 load_date_csv=(not skip_date_csv),
                                                 load_numerical_csv=(not skip_num_csv))
        df_test, _1, _2 = load_data(load_test=True, N_start=n_start, N_read=N_files_test, N_split=N_splits,
                                    original_cols_only=original_cols_only,
                                    feature_list_file=feature_list_file,
                                    load_categorical_csv=(not skip_cat_csv),
                                    load_date_csv=(not skip_date_csv),
                                    load_numerical_csv=(not skip_num_csv))

        if not disable_id_diff_cols:
            diff_period = 1
            ub.log('generating id diff columns based on various dates columns: diff_period = {}'.format(diff_period))
            dates_cols = [x for x in list(df_train.columns)
                          if ('start_date' in x or 'end_date' in x) and ('rank' not in x)]
            # print dates_cols
            df_datesort = pd.concat([df_train[['Id'] + dates_cols], df_test[['Id'] + dates_cols]],
                                    ignore_index=True)
            gc.collect()
            for c in dates_cols:
                # print c
                # sort train+test together by date, then measure the Id gap to neighboring records
                df_datesort.sort_values(by=[c, 'Id'], inplace=True)
                df_datesort[c + '_id_diff'] = df_datesort['Id'].diff(diff_period).fillna(999999).astype(int)
                df_datesort[c + '_id_diff_reverse'] = df_datesort['Id'].iloc[::-1].diff().fillna(999999).astype(int)
                df_datesort[c + '_id_diff_magic'] = \
                    1 + 2 * (df_datesort[c + '_id_diff'] > 1) + 1 * (df_datesort[c + '_id_diff_reverse'] < -1)
                df_datesort.drop([c], axis=1, inplace=True)
            df_datesort.head(n=N_DEBUG_LINES).to_csv(os.path.join(ub.data_dir, 'df_datesort_debug.csv'), index=False)
            gc.collect()
            df_train = df_train.merge(df_datesort, on='Id')
            gc.collect()
            df_test = df_test.merge(df_datesort, on='Id')

        df_test['Response'] = 0

        df_train.head(n=N_DEBUG_LINES).to_csv(os.path.join(ub.data_dir, 'df_train_debug.csv'), index=False)
        o = set(orig_cols)
        c = set(df_train.columns)
        c.difference_update(o)
        new_cols = list(c)
        df_train[new_cols].head(n=N_DEBUG_LINES).to_csv(os.path.join(ub.data_dir, 'df_train_debug_new_cols_only.csv'),
                                                        index=False)

        df_test.head(n=N_DEBUG_LINES).to_csv(os.path.join(ub.data_dir, 'df_test_debug.csv'), index=False)
        o = set(orig_cols)
        c = set(df_test.columns)
        c.difference_update(o)
        new_cols = list(c)
        df_test[new_cols].head(n=N_DEBUG_LINES).to_csv(os.path.join(ub.data_dir, 'df_test_debug_new_cols_only.csv'),
                                                       index=False)
        print df_train.shape
        print df_test.shape
        gc.collect()

        # if N_files_train == N_splits:
        #     split_data(df_train,
        #                output_fname_template=os.path.join(ub.processed_data_dir, 'df_train_preprocessed_part{}.csv'))
        # if N_files_test == N_splits:
        #     split_data(df_test,
        #                output_fname_template=os.path.join(ub.processed_data_dir, 'df_test_preprocessed_part{}.csv'))

        fillna = True
        run_info['fillna'] = fillna
        if fillna:
            ub.log('Filling na...')
            for df in [df_train, df_test]:
                cols_full_flag = df.isnull().any()
                non_full_cols = list(cols_full_flag[cols_full_flag].index)
                print 'Non-full columns: {}'.format(len(non_full_cols))
                # print non_full_cols
                if 1:
                    df.fillna(-999999, inplace=True)
                else:
                    # print df.PersonalField7.unique()
                    for c in non_full_cols:
                        if len(df[c].unique()) > 2:
                            most_frequent_items = df[c].value_counts().idxmax()
                            print c, most_frequent_items
                            df[c].fillna(value=most_frequent_items, inplace=True)
                        else:
                            # if it is only a pair of values [something, nan] then fill in "missing"
                            df[c].fillna(value='missing', inplace=True)
                            print c, df[c].unique()
                cols_full_flag = df.isnull().any()
                non_full_cols = list(cols_full_flag[cols_full_flag].index)
                print 'Non-full columns: {}'.format(len(non_full_cols))

                le = LabelEncoder()
                obj_cols = df.select_dtypes(include=['object']).columns
                # print 'Obj columns: ', list(obj_cols)
                for col in obj_cols:
                    df[col] = le.fit_transform(df[col])

        df_train.head(n=1000).to_csv(os.path.join(ub.data_dir, 'df_train_cleanup_debug.csv'), index=False)
        # write the test debug dump to its own file instead of overwriting the train one
        df_test.head(n=1000).to_csv(os.path.join(ub.data_dir, 'df_test_cleanup_debug.csv'), index=False)

        ub.log('Dropping Id and Response columns...')
        columns_to_drop = ['Id', 'Response']
        shuffle_col = df_train[['Id']].copy()
        shuffle_col['Id'] = np.random.rand(len(shuffle_col))
        y_total_df = df_train['Response']
        y_total = df_train['Response'].values
        df_train.drop(columns_to_drop, axis=1, inplace=True)
        df_test.drop(columns_to_drop, axis=1, inplace=True)
        print df_train.shape
        print df_test.shape

        prior = np.sum(y_total) / (1. * len(y_total))
        print 'prior: {}'.format(prior)
        run_info['prior'] = prior
        gc.collect()

        feature_imp_fname_template = os.path.join(ub.output_dir, 'feature_importance_xgb_{}')
        run_info['feature_imp_fname_template'] = feature_imp_fname_template
        top_features_fname = feature_imp_fname_template.format('accumu_list.txt')
        run_info['top_features_fname'] = top_features_fname

        # if feature_down_select:
        #     ub.log('Feature down selected based on {}...'.format(top_features_fname))
        #     # todo: may need to set a maxN for the number of features to use
        #     with open(top_features_fname, 'r') as tf:
        #         selected_cols = [x.strip() for x in tf.readlines()]
        #     df_train = df_train[selected_cols]
        #     df_test = df_test[selected_cols]
        #     print df_train.shape
        #     print df_test.shape
        #     print df_train.columns

        feature_names = list(df_train.columns)
        postfix_train = '{}_{}of{}'.format(datetime_str, N_files_train, N_splits)
        postfix_test = '{}_{}of{}'.format(datetime_str, N_files_test, N_splits)
        run_info['postfix_train'] = postfix_train
        run_info['postfix_test'] = postfix_test
        run_info['testsize'] = testsize

        train_test_split_method = 1
        ub.log('Train/val split using testsize={}, split_method={}'.format(testsize, train_test_split_method))
        if train_test_split_method == 1:
            # shuffle_col['Id'] holds uniform random numbers, so thresholding at
            # testsize gives a random train/val split without copying the data
            train_idx = shuffle_col[shuffle_col['Id'] > testsize].index
            val_idx = shuffle_col[shuffle_col['Id'] <= testsize].index
            ub.log('Done shuffling...')
            print 'len of train_idx', len(train_idx)
            print 'len of val_idx', len(val_idx)
            y_train = y_total_df.loc[train_idx].values
            y_val = y_total_df.loc[val_idx].values
            xgtrain = xgb.DMatrix(df_train.loc[train_idx].values, y_train, feature_names=feature_names)
            ub.log('Assembled xgtrain')
            xgval = xgb.DMatrix(df_train.loc[val_idx].values, y_val, feature_names=feature_names)
            ub.log('Assembled xgval')
            del df_train
            ub.log('Deleted df_train')
            gc.collect()
        else:
            x_train, x_val, y_train, y_val = train_test_split(df_train.values, y_total, test_size=testsize)
            ub.log('Done shuffling...')
            print x_train.shape
            print x_val.shape
            del df_train
            gc.collect()
            ub.log('Deleted df_train')
            xgtrain = xgb.DMatrix(x_train, y_train, feature_names=feature_names)
            ub.log('Assembled xgtrain')
            xgval = xgb.DMatrix(x_val, y_val, feature_names=feature_names)
            ub.log('Assembled xgval')
            del x_train
            del x_val
            gc.collect()

        fname_xgtrain = os.path.join(ub.processed_data_dir, 'xgtrain_{}.buffer'.format(postfix_train))
        xgtrain.save_binary(fname_xgtrain)
        ub.log('Saved {}'.format(fname_xgtrain))
        fname_xgval = os.path.join(ub.processed_data_dir, 'xgval_{}.buffer'.format(postfix_train))
        xgval.save_binary(fname_xgval)
        ub.log('Saved {}'.format(fname_xgval))

        xgtest = xgb.DMatrix(df_test.values, feature_names=feature_names)
        ub.log('Assembled xgtest')
        fname_xgtest = os.path.join(ub.processed_data_dir, 'xgtest_{}.buffer'.format(postfix_test))
        xgtest.save_binary(fname_xgtest)
        ub.log('Saved {}'.format(fname_xgtest))
        del df_test
        gc.collect()
        ub.log('Deleted df_test')

        print 'train and val set sizes'
        print xgtrain.num_row(), xgtrain.num_col()
        print xgval.num_row(), xgval.num_col()
        run_info['xgtrain_nrows'] = xgtrain.num_row()
        run_info['xgval_nrows'] = xgval.num_row()
        run_info['fname_xgtrain'] = fname_xgtrain
        run_info['fname_xgval'] = fname_xgval
        run_info['fname_xgtest'] = fname_xgtest

        fname_ytrain = os.path.join(ub.processed_data_dir, 'ytrain_{}.npy'.format(postfix_train))
        fname_yval = os.path.join(ub.processed_data_dir, 'yval_{}.npy'.format(postfix_train))
        np.save(fname_ytrain, y_train)
        ub.log('Saved ' + fname_ytrain)
        np.save(fname_yval, y_val)
        ub.log('Saved ' + fname_yval)
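        # Note: the .buffer files saved above can be reloaded directly with
        # xgb.DMatrix(fname_xgtrain), which is what the train_model branch below
        # does via run_info; this avoids rebuilding the DMatrix from csv.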
        run_info['fname_ytrain'] = fname_ytrain
        run_info['fname_yval'] = fname_yval

    if train_model:
        assert compile_data or (run_info_fname is not None)
        if not compile_data:
            ub.log('(train_model) Loading run info from {} ...'.format(run_info_fname))
            with open(run_info_fname, 'r') as fp:
                run_info = eval(fp.read())
            print json.dumps(run_info, indent=2)
            run_info_fname = run_info_fname.replace('.txt', '_{}.txt'.format(datetime_str))
            logged_home_dir = None
            if ub.home_dir not in run_info['fname_xgtrain']:
                for i in ub.possible_home_dirs:
                    if i in run_info['fname_xgtrain']:
                        logged_home_dir = i
                for k in ['fname_xgtrain', 'fname_xgval', 'fname_ytrain', 'fname_yval']:
                    run_info[k] = run_info[k].replace(logged_home_dir, ub.home_dir)
                if analyze_feature_importance:
                    # key name matches what the compile_data branch stores in run_info
                    for k in ['feature_imp_fname_template', 'top_features_fname']:
                        run_info[k] = run_info[k].replace(logged_home_dir, ub.home_dir)
            ub.log('Loading xgtrain data {} ...'.format(run_info['fname_xgtrain']))
            xgtrain = xgb.DMatrix(run_info['fname_xgtrain'])
            ub.log('Loading xgval data {} ...'.format(run_info['fname_xgval']))
            xgval = xgb.DMatrix(run_info['fname_xgval'])
            ub.log('Loading ytrain data {} ...'.format(run_info['fname_ytrain']))
            y_train = np.load(run_info['fname_ytrain'])
            ub.log('Loading yval data {} ...'.format(run_info['fname_yval']))
            y_val = np.load(run_info['fname_yval'])
            prior = run_info['prior']
            postfix_train = run_info['postfix_train']
        # record these after run_info is in place so the loaded dict cannot clobber them
        run_info['cv'] = cv
        run_info['analyze_feature_importance'] = analyze_feature_importance
        run_info['early_stop_rounds'] = early_stop_rounds

        if xgb_params is None:
            xgb_params = get_params(bases_core=prior)
        xgb_params['base_score'] = prior  # n_positive / n_total
        # xgb_params['scale_pos_weight'] = (1.0 - prior) / prior
        run_info['xgb_params'] = xgb_params
        ub.log('Get xgb_params')
        print xgb_params
        xgb_num_rounds = N_rounds
        run_info['xgb_num_rounds'] = xgb_num_rounds
        print 'xgb_num_rounds', xgb_num_rounds

        if cv:
            ub.log('Running cross validation...')
            eval_hist = xgb.cv(xgb_params, xgtrain, num_boost_round=xgb_num_rounds,
                               early_stopping_rounds=early_stop_rounds, feval=ub.mcc_eval, maximize=True,
                               verbose_eval=1, show_stdv=True, nfold=3, seed=0, stratified=True)
            print eval_hist
            eval_hist_fname = os.path.join(ub.output_dir, 'cv_eval_history_{}.csv'.format(postfix_train))
            if not compile_data:
                eval_hist_fname = eval_hist_fname.replace('.csv', '_{}.csv'.format(datetime_str))
            run_info['eval_hist_fname'] = eval_hist_fname
            eval_hist.to_csv(eval_hist_fname)
            run_info['cv_score_test'] = eval_hist['test-MCC-mean'].max()
            run_info['cv_score_train'] = eval_hist['train-MCC-mean'].max()

        if 1:
            ub.log('Running training...')
            watchlist = [(xgtrain, 'train'), (xgval, 'eval')]
            model = xgb.train(xgb_params, xgtrain, num_boost_round=xgb_num_rounds,
                              early_stopping_rounds=early_stop_rounds, feval=ub.mcc_eval, maximize=True,
                              evals=watchlist, verbose_eval=True)
            model_fname = os.path.join(ub.output_dir, 'xgb_{}.model'.format(postfix_train))
            if not compile_data:
                model_fname = model_fname.replace('.model', '_{}.model'.format(datetime_str))
            ub.log('Saving model: {}...'.format(model_fname))
            model.save_model(model_fname)
            model.dump_model(model_fname + '.raw.txt')
            run_info['model_fname'] = model_fname

            ntree_limit = model.best_iteration + 1
            ub.log('Predictions on xgtrain...', 'highlight')
            predictions = model.predict(xgtrain, ntree_limit=ntree_limit)
            best_proba, best_mcc, y_pred = ub.eval_mcc(y_train, predictions, True)
            mcc_official = matthews_corrcoef(y_train, y_pred)
            print 'ntree limit:', ntree_limit
            print 'best_mcc:', best_mcc
            print 'best_proba:', best_proba
            print 'matthews_corrcoef', mcc_official
            run_info['ntree_limit_train'] = ntree_limit
            run_info['best_mcc_train'] = best_mcc
            run_info['best_proba_train'] = best_proba
            run_info['mcc_official_train'] = mcc_official

            ub.log('Predictions on xgval...', 'highlight')
            predictions = model.predict(xgval, ntree_limit=ntree_limit)
            best_proba, best_mcc, y_pred = ub.eval_mcc(y_val, predictions, True)
            mcc_official = matthews_corrcoef(y_val, y_pred)
            print 'ntree limit:', ntree_limit
            print 'best_mcc:', best_mcc
            print 'best_proba:', best_proba
            print 'matthews_corrcoef', mcc_official
            run_info['ntree_limit_val'] = ntree_limit
            run_info['best_mcc_val'] = best_mcc
            run_info['best_proba_val'] = best_proba
            run_info['mcc_official_val'] = mcc_official

            if analyze_feature_importance:
                ub.log('Analyzing feature importance...')
                feature_imp_fname_template = run_info['feature_imp_fname_template']
                top_features_fname = run_info['top_features_fname']
                feature_imp_fname = feature_imp_fname_template.format(postfix_train)
                imp = model.get_fscore()
                imp = sorted(imp.items(), key=operator.itemgetter(1))
                imp_df = pd.DataFrame(imp, columns=['feature', 'fscore'])
                imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
                ub.log('Output result csv to {}...'.format(feature_imp_fname + '.csv'))
                imp_df.to_csv(feature_imp_fname + '.csv')

                plt.figure()
                imp_df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
                plt.title('XGBoost Feature Importance @ {}'.format(postfix_train))
                plt.xlabel('relative importance')
                plt.gcf().savefig(feature_imp_fname + '.png', bbox_inches='tight')

                feature_lists = glob.glob(feature_imp_fname_template.replace('{}', '*.csv'))
                ub.log('Aggregating previous analysis results...')
                print feature_lists
                features_df = None
                if feature_lists:
                    for f_l in feature_lists:
                        tmp_df = pd.read_csv(f_l, index_col=0)
                        if features_df is None:
                            features_df = tmp_df
                        else:
                            features_df = pd.concat([features_df, tmp_df], ignore_index=True)
                    f_df = features_df.groupby(['feature']).mean().reset_index()
                    f_df['overall'] = True
                    imp_df['overall'] = False
                    merged_df = pd.concat([imp_df, f_df]).sort_values(by=['overall', 'fscore'], ascending=False)
                    sns_plot = sns.factorplot(y='feature', x='fscore', data=merged_df, hue='overall',
                                              kind='bar', hue_order=[True, False], size=20, aspect=0.5)
                    sns_plot.savefig(feature_imp_fname + '_overall.png', bbox_inches='tight')
                    ub.log('Output overall result csv to {}...'.format(top_features_fname))
                    with open(top_features_fname, 'w') as tf:
                        tf.write('\n'.join(list(set(merged_df.feature.values))))
                    merged_df.to_csv(top_features_fname.replace('.txt', '_df.csv'), index=False)

        # json has trouble serializing np.float32
        # with open(run_info_fname, 'w') as fp:
        #     json.dump(run_info, fp)

    if make_submission:
        if not train_model and not compile_data:
            assert (run_info_fname is not None)
            ub.log('(make_submission) Loading run info from {} ...'.format(run_info_fname))
            with open(run_info_fname, 'r') as fp:
                run_info = eval(fp.read())
            print json.dumps(run_info, indent=2)
            if ub.home_dir not in run_info['model_fname']:
                for i in ub.possible_home_dirs:
                    if i in run_info['model_fname']:
                        logged_home_dir = i
            for k in ['fname_xgtest', 'model_fname']:
                if ub.home_dir not in run_info[k]:
                    for i in ub.possible_home_dirs:
                        if i in run_info[k]:
                            run_info[k] = run_info[k].replace(i, ub.home_dir)
        if not train_model:
            model = xgb.Booster()
            ub.log('Loading model {} ...'.format(run_info['model_fname']))
            model.load_model(run_info['model_fname'])
        if not compile_data:
            ub.log('Loading xgtest data {} ...'.format(run_info['fname_xgtest']))
            xgtest = xgb.DMatrix(run_info['fname_xgtest'])
        ub.log('XGB making predictions...')
        postfix_train = run_info['postfix_train']
        ypred = model.predict(xgtest, ntree_limit=run_info['ntree_limit_train'])
        nrows = len(ypred)
        sample = pd.read_csv(os.path.join(ub.data_dir, 'sample_submission.csv'), nrows=nrows)

        sample['Response'] = ypred
        fname_output = os.path.join(ub.output_dir, "sub_xgboost_{}_prob.csv".format(postfix_train))
        if not compile_data:
            fname_output = fname_output.replace('.csv', '_{}.csv'.format(datetime_str))
        ub.log('Writing output file (raw proba) {} ...'.format(fname_output))
        sample.to_csv(fname_output, index=False)

        # binarize with the average of the best thresholds found on train and val
        best_proba = (run_info['best_proba_train'] + run_info['best_proba_val']) / 2.0
        ub.log('Using threshold: best_proba == {}'.format(best_proba))
        sample['Response'] = (ypred > best_proba).astype(int)
        fname_output = os.path.join(ub.output_dir, "sub_xgboost_{}.csv".format(postfix_train))
        if not compile_data:
            fname_output = fname_output.replace('.csv', '_{}.csv'.format(datetime_str))
        ub.log('Writing output file {} ...'.format(fname_output))
        sample.to_csv(fname_output, index=False)

    if compile_data or train_model:
        if compile_data:
            if run_info_fname is not None:
                ub.log('Ignore input run_info_fname {}'.format(run_info_fname))
            run_info_fname = os.path.join(ub.output_dir, 'run_info_{}.txt'.format(postfix_train))
        # else run_info_fname is an input parameter
        ub.log('Saving run_info into {}'.format(run_info_fname))
        print pd.Series(run_info)
        with open(run_info_fname, 'w') as fp:
            fp.write(str(run_info))

    return run_info_fname
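
if 0:
    # Toy demo (not part of the pipeline) of the id-diff "magic" features built in
    # main(): after sorting by a date column, neighboring Ids tend to come from the
    # same production run, so the Id gap to the previous/next record carries signal.
    toy = pd.DataFrame({'Id': [7, 3, 12, 5], 'start_date': [82.2, 82.2, 90.1, 101.0]})
    toy.sort_values(by=['start_date', 'Id'], inplace=True)
    toy['id_diff'] = toy['Id'].diff(1).fillna(999999).astype(int)
    toy['id_diff_reverse'] = toy['Id'].iloc[::-1].diff().fillna(999999).astype(int)
    toy['magic'] = 1 + 2 * (toy['id_diff'] > 1) + 1 * (toy['id_diff_reverse'] < -1)
    print toy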
parser.add_argument('-run_info', action='store', type=str, default=None, dest='run_info_fname',
                    help='>> Specifies run_info_fname')

par = parser.parse_args()
if par.run_info_fname is not None:
    par.run_info_fname = os.path.join(ub.output_dir, os.path.basename(par.run_info_fname))

new_xgb_params = get_params()
new_xgb_params['eta'] = par.xgb_eta
new_xgb_params['max_depth'] = par.xgb_md
new_xgb_params['min_child_weight'] = par.xgb_mcw
new_xgb_params['subsample'] = par.xgb_ss
new_xgb_params['colsample_bytree'] = par.xgb_cs

ub.log('Input parameters:', 'info')
# print pd.Series(par.__dict__)
par_dict = dict(
    compile_data=par.compile_data,
    N_files_train=par.N_files_train,
    N_files_test=par.N_files_test,
    N_start=par.N_start,
    feature_list_file=par.feature_list_file,
    original_cols_only=par.original_cols_only,
    disable_id_diff_cols=par.disable_id_diff_cols,
    skip_date_csv=par.skip_date_csv,
    skip_num_csv=par.skip_num_csv,
    skip_cat_csv=par.skip_cat_csv,
    train_model=par.train_model,
                 left_index=True)['Response']
    if useLOO:
        # leave-one-out adjustment: remove each row's own outcome from the encoding
        x = ((x * x.shape[0]) - outcomes) / (x.shape[0] - 1)
        # x = x + np.random.normal(0, .01, x.shape[0])
    return x.fillna(x.mean())

# %%
if use_buffer:
    xgtrain_fname = '../data_processed/xgtrain{}_{}.buffer'.format(sub_str, tag)
    with open('../data_processed/xgb_features{}_{}.txt'.format(sub_str, tag), 'r') as ff:
        feature_names = [x.strip() for x in ff.readlines()]
    ub.log('Loading xgtrain DMatrix...{}'.format(xgtrain_fname))
    xgtrain = xgb.DMatrix(xgtrain_fname, feature_names=feature_names)
else:
    df_train_fname = '../data_processed/df_train_overall_{}.csv'.format(tag)
    # df_train_fname = '../data/train_numeric.csv'
    df_train2 = pd.read_csv(df_train_fname, nrows=1)

    feature_list_fname = 'importance_ordered_list_1107.txt'
    # feature_list_fname = 'feature_select_1102.txt'
    with open(feature_list_fname, 'r') as ff:
        selected_features = [fe.strip() for fe in ff.readlines()]

    OOFA_encode_features = [
        'start_date',
        'start_date_id_diff',
        'start_date_id_diff_reverse',
        'L1_S24_F1559',
        'L3_S32_F3851',
        'L1_S24_F1827',
        'L1_S24_F1582',
        'L3_S32_F3854',
        'L1_S24_F1510',
        'L1_S24_F1525',
        'L3_S30_start_date',
@author: dingran
"""
import pandas as pd
import util_bosch as ub
import gc
import numpy as np
import sys
import seaborn as sns
import os
import matplotlib.pyplot as plt
# import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

do_plot = True
ub.log('Starting')

N_max = 24
tag = 'v1'
testsize = 0.25
print 'N_max', N_max
print 'tag', tag
N = range(N_max)

if 0:
    ub.log('Reading date csv')
    if N_max < 24:
        dfs = []
        for i in N:
            print i,
if 'Response' in num_cols:
    num_cols.remove('Response')
df2 = pd.read_csv('../data/test_numeric.csv', usecols=num_cols, nrows=NROWS)
df1['is_train'] = 1
df2['is_train'] = 0
df_num = pd.concat([df1, df2], ignore_index=True)
del df1
del df2

import sys
import util_bosch as ub
from sklearn.preprocessing import LabelEncoder

#%%
na_fill_val = -1
ub.log('Processing categorical csv, fillna {}'.format(na_fill_val))
df_cat.fillna(na_fill_val, inplace=True)

ub.log('LabelEncoder running')
le = LabelEncoder()
obj_cols = df_cat.select_dtypes(include=['object']).columns
print len(obj_cols)
# print 'Obj columns: ', list(obj_cols)
counter = 0
for col in obj_cols:
    counter += 1
    print '{}/{}'.format(counter, len(obj_cols)),
    sys.stdout.flush()
    df_cat[col] = le.fit_transform(df_cat[col])
ub.log('Done processing categorical csv')
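
#%%
if 0:
    # Toy demo (not part of the pipeline): LabelEncoder maps each distinct value
    # in a column, including the -1 fill value, to an integer code.
    toy = pd.Series(['T32', 'T8', -1, 'T32']).astype(str)
    print LabelEncoder().fit_transform(toy)  # [1 2 0 1]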
if __name__ == '__main__':
    f_list = [
        'train_numeric.csv',
        'train_categorical.csv',
        'train_date.csv',
        'test_numeric.csv',
        'test_categorical.csv',
        'test_date.csv',
    ]
    for f in f_list:
        ub.log('*' * 50 + f, 'highlight')
        split_csv(f)
import glob
import os
import pandas as pd
import re
import seaborn as sns
import datetime
import util_bosch as ub

do_plot = False
datetime_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# feature_imp_fname_template = os.path.join(ub.output_dir, 'feature_importance_xgb_{}')
feature_imp_fname_template = os.path.join(ub.code_dir, 'feature_importance_xgb_{}')
feature_lists = glob.glob(feature_imp_fname_template.replace('{}', '*.csv'))
ub.log('Aggregating previous analysis results...')
print feature_lists

features_df = None
find_info = re.compile(r'feature_importance_xgb_(.*)_(\d+)of24\.csv')
if feature_lists:
    for f_l in feature_lists:
        if 'accumu' in f_l:
            print 'skip ' + f_l
            continue
        tmp_df = pd.read_csv(f_l, index_col=0)
        results = find_info.search(f_l)
        # datetime_info = results.group(1)
        # n_datasets = results.group(2)
        fname = os.path.basename(f_l)
        # id_info = '{}_{}sets'.format(datetime_info, n_datasets)
        # tmp_df = tmp_df.rename(columns={'fscore': })