def main():
    """Discretize features on the combined train/test set and save the result.

    Loads the data through ``data_utils.load_data``, sets the ``id`` and
    ``price_doc`` columns aside, stacks train on top of test so that
    ``feature_discretization`` processes both at once, splits the frame back
    by row position, restores the columns that were set aside and hands
    everything to ``data_utils.save_data``.

    Progress messages are written with single-argument ``print(...)`` calls,
    which produce the same text on Python 2 and Python 3.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed back to save_data
    # unchanged, so it gets a real name rather than the throwaway ``_``.
    train, test, passthrough = data_utils.load_data()
    print('train: %s , test: %s' % (train.shape, test.shape))

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the training and test sets: train rows first, test column order.
    n_train = train.shape[0]
    combined = pd.concat([train[test.columns.values], test])
    combined.columns = test.columns.values

    # feature_distribute_scale(combined) is an alternative transform that is
    # currently disabled.
    combined = feature_discretization(combined)

    # .copy() turns each half into an independent frame, so adding columns
    # below does not raise pandas' SettingWithCopyWarning.
    train = combined.iloc[:n_train, :].copy()
    test = combined.iloc[n_train:, :].copy()

    train['id'] = train_id
    train['price_doc'] = train_price_doc
    test['id'] = test_id
    print('train: %s , test: %s' % (train.shape, test.shape))

    print("Save data...")
    data_utils.save_data(train, test, passthrough)
def main():
    """Add time-window features computed over the combined train/test set.

    Loads the data through ``data_utils.load_data``, sets ``price_doc``
    aside, stacks train on top of test with a fresh 0..N-1 index, runs
    ``perform_time_window`` and ``perform_groupby_time_window`` with the same
    list of window sizes, splits the frame back by row position, restores
    ``price_doc`` on the train part and saves through
    ``data_utils.save_data``.

    Progress messages are written with single-argument ``print(...)`` calls,
    which produce the same text on Python 2 and Python 3.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed back to save_data
    # unchanged, so it gets a real name rather than the throwaway ``_``.
    train, test, passthrough = data_utils.load_data()
    print('train: %s , test: %s' % (train.shape, test.shape))

    train_price_doc = train['price_doc']
    train.drop(['price_doc'], axis=1, inplace=True)

    # Merge the training and test sets: train rows first, test column order.
    n_train = train.shape[0]
    combined = pd.concat([train[test.columns.values], test])
    combined.columns = test.columns.values
    # Replace the duplicated per-frame labels with one running index.
    combined.index = range(combined.shape[0])

    # Time-window sizes (days, going by the variable name).
    timewindow_days = [30 * 6, 30 * 4, 30 * 2, 30, 20, 10]
    combined = perform_time_window(combined, timewindow_days)
    combined = perform_groupby_time_window(combined, timewindow_days)

    # .copy() turns each half into an independent frame, so adding a column
    # below does not raise pandas' SettingWithCopyWarning.
    train = combined.iloc[:n_train, :].copy()
    test = combined.iloc[n_train:, :].copy()

    train['price_doc'] = train_price_doc
    print('train: %s , test: %s' % (train.shape, test.shape))

    print("Save data...")
    data_utils.save_data(train, test, passthrough)
def main():
    """Keep only the numerical features chosen by ``feature_select``.

    Every non-object column of train except ``id`` and ``timestamp`` is
    passed to ``feature_select`` with ``keep_top=0.98``. Train and test are
    then reduced to the returned features plus ``id`` and ``timestamp``,
    ``price_doc`` is written back onto train and the result is saved through
    ``data_utils.save_data``.

    Progress messages are written with single-argument ``print(...)`` calls,
    which produce the same text on Python 2 and Python 3.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed back to save_data
    # unchanged, so it gets a real name rather than the throwaway ``_``.
    train, test, passthrough = data_utils.load_data()
    print('train: %s , test: %s' % (train.shape, test.shape))

    train_price_doc = train['price_doc']

    num_columns = train.select_dtypes(exclude=['object']).columns.values
    num_columns = num_columns.tolist()
    # id and timestamp are always kept, so they are not candidates.
    num_columns.remove('id')
    num_columns.remove('timestamp')

    print('perform feature selection in %d numerical features...'
          % train[num_columns].shape[1])
    # NOTE(review): feature_select must return a list, since it is appended
    # to below - confirm against its definition.
    keep_features = feature_select(train[num_columns], keep_top=0.98)
    print('after feature selection numerical features %d'
          % len(keep_features))

    keep_features.append('id')
    keep_features.append('timestamp')

    # .copy() makes train an independent frame, so assigning price_doc below
    # does not raise pandas' SettingWithCopyWarning.
    train = train[keep_features].copy()
    test = test[keep_features]

    train['price_doc'] = train_price_doc
    print('train: %s , test: %s' % (train.shape, test.shape))

    print("Save data...")
    data_utils.save_data(train, test, passthrough)
def main():
    """Generate neighbourhood features on the combined train/test set.

    Loads the data through ``data_utils.load_data``, sets the ``id`` and
    ``price_doc`` columns aside, stacks train on top of test, runs the area,
    school, hospital, population, population-age and build feature
    generators in that order, splits the frame back by row position,
    restores the columns that were set aside and saves through
    ``data_utils.save_data``.

    Progress messages are written with single-argument ``print(...)`` calls,
    which produce the same text on Python 2 and Python 3.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed back to save_data
    # unchanged, so it gets a real name rather than the throwaway ``_``.
    train, test, passthrough = data_utils.load_data()
    print('train: %s , test: %s' % (train.shape, test.shape))

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the training and test sets: train rows first, test column order.
    n_train = train.shape[0]
    combined = pd.concat([train[test.columns.values], test])
    combined.columns = test.columns.values

    # Each generator takes the combined frame and returns the updated one;
    # the order is the order of the original calls.
    feature_steps = (
        gen_area_features,
        gen_school_features,
        generate_hospital_features,
        generate_population_features,
        generate_population_age_features,
        generate_build_features,
    )
    for step in feature_steps:
        combined = step(combined)

    # .copy() turns each half into an independent frame, so adding columns
    # below does not raise pandas' SettingWithCopyWarning.
    train = combined.iloc[:n_train, :].copy()
    test = combined.iloc[n_train:, :].copy()

    train['id'] = train_id
    train['price_doc'] = train_price_doc
    test['id'] = test_id
    print('train: %s , test: %s' % (train.shape, test.shape))

    print("Save data...")
    data_utils.save_data(train, test, passthrough)
def main():
    """Generate distance features from longitude/latitude data and save them.

    Loads the data through ``data_utils.load_data`` and
    ``data_utils.load_longitude_latitude_data``, sets the ``id`` and
    ``price_doc`` columns aside, stacks train on top of test, runs
    ``generate_distance_features`` on the combined frame, splits it back by
    row position, restores the columns that were set aside and saves through
    ``data_utils.save_data``.

    Progress messages are written with single-argument ``print(...)`` calls,
    which produce the same text on Python 2 and Python 3.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed back to save_data
    # unchanged, so it gets a real name rather than the throwaway ``_``.
    train, test, passthrough = data_utils.load_data()
    longitude_latitude = data_utils.load_longitude_latitude_data()
    print('train: %s , test: %s' % (train.shape, test.shape))

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the training and test sets: train rows first, test column order.
    n_train = train.shape[0]
    combined = pd.concat([train[test.columns.values], test])
    combined.columns = test.columns.values

    combined = generate_distance_features(combined, longitude_latitude)

    # .copy() turns each half into an independent frame, so adding columns
    # below does not raise pandas' SettingWithCopyWarning.
    train = combined.iloc[:n_train, :].copy()
    test = combined.iloc[n_train:, :].copy()

    train['id'] = train_id
    train['price_doc'] = train_price_doc
    # Test ids are attached by position (.values) rather than by index label,
    # exactly as in the original code.
    test['id'] = test_id.values
    print('train: %s , test: %s' % (train.shape, test.shape))

    print("Save data...")
    data_utils.save_data(train, test, passthrough)
def main():
    """Append PCA components to the train and test sets and save them.

    Loads the data through ``data_utils.load_data``, sets the ``id`` and
    ``price_doc`` columns aside, stacks train on top of test and calls
    ``generate_pca_components`` on the combined frame with
    ``keep_component=0.01``. The component rows are split into a train part
    and a test part, tagged with the matching ids and left-merged onto the
    train/test frames on ``id``. ``price_doc`` is restored on train and the
    result is saved through ``data_utils.save_data``.

    Fix over the previous version: the test components were taken from
    ``iloc[:n_train]``, i.e. the *train* rows, so test was merged with the
    train set's components. They are now taken from ``iloc[n_train:]``.

    Progress messages are written with single-argument ``print(...)`` calls,
    which produce the same text on Python 2 and Python 3.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed back to save_data
    # unchanged, so it gets a real name rather than the throwaway ``_``.
    train, test, passthrough = data_utils.load_data()
    print('train: %s , test: %s' % (train.shape, test.shape))

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the training and test sets: train rows first, test column order.
    n_train = train.shape[0]
    combined = pd.concat([train[test.columns.values], test])
    combined.columns = test.columns.values

    # Assumes generate_pca_components returns one row per input row, in
    # input order (the original positional slicing relied on the same
    # assumption) - TODO confirm against its definition.
    pca_components = generate_pca_components(combined, keep_component=0.01)

    # Rows [0, n_train) are train, the remainder is test. The ids are
    # attached by position (.values) because the index of pca_components is
    # not known here; label alignment could silently produce NaN ids.
    pca_train = pca_components.iloc[:n_train, :].copy()
    pca_train['id'] = train_id.values
    pca_test = pca_components.iloc[n_train:, :].copy()
    pca_test['id'] = test_id.values

    # .copy() turns each half into an independent frame, so adding the id
    # column does not raise pandas' SettingWithCopyWarning.
    train = combined.iloc[:n_train, :].copy()
    train['id'] = train_id
    test = combined.iloc[n_train:, :].copy()
    test['id'] = test_id

    train = pd.merge(train, pca_train, how='left', on='id')
    test = pd.merge(test, pca_test, how='left', on='id')

    train['price_doc'] = train_price_doc
    print('train: %s , test: %s' % (train.shape, test.shape))

    print("Save data...")
    data_utils.save_data(train, test, passthrough)
def main():
    """Subsample the imputed training set and save it.

    Loads the data through ``data_utils.load_imputed_data``, passes train
    through ``subsample_train``, renumbers the remaining rows from zero and
    saves through ``data_utils.save_data``. Test is saved as loaded.

    Progress messages are written with single-argument ``print(...)`` calls,
    which produce the same text on Python 2 and Python 3.
    """
    print('loading train datas...')
    # The third value returned by load_imputed_data is passed back to
    # save_data unchanged, so it gets a real name rather than ``_``.
    train, test, passthrough = data_utils.load_imputed_data()
    # shape is a tuple, so it is wrapped in a 1-tuple for %-formatting.
    print('train: %s' % (train.shape,))

    train = subsample_train(train)
    # drop=True discards the old index directly. The previous
    # reset_index() + del train['index'] pair only worked when the index was
    # unnamed, because a named index is restored under its own name.
    train = train.reset_index(drop=True)
    print('train: %s' % (train.shape,))

    print("Save data...")
    data_utils.save_data(train, test, passthrough)
def build_hand_classes(self, params):
    """
    Build and save categorical hand-class targets for the train and val sets.

    Reference counts per hand class:

    |Hand Value     |Unique  |Distinct|
    |Straight Flush |40      |10      |
    |Four of a Kind |624     |156     |
    |Full Houses    |3744    |156     |
    |Flush          |5108    |1277    |
    |Straight       |10200   |10      |
    |Three of a Kind|54912   |858     |
    |Two Pair       |123552  |858     |
    |One Pair       |1098240 |2860    |
    |High Card      |1302540 |1277    |
    |TOTAL          |2598960 |7462    |

    For each of ``'train'`` and ``'val'`` the method generates
    ``params[dt.Globals.INPUT_SET_DICT[dataset]] // 9`` samples for every key
    of ``dt.Globals.HAND_TYPE_DICT``, stacks inputs and labels with
    ``np.stack`` and writes them with ``save_data`` to
    ``<save_dir>/<dataset>/<dataset>X`` and ``<save_dir>/<dataset>/<dataset>Y``.

    Args:
        params: mapping read for ``'save_dir'``, ``'datatype'`` and the
            per-dataset size keys named by ``dt.Globals.INPUT_SET_DICT``.

    Raises:
        ValueError: if ``params['datatype']`` is neither
            ``dt.DataTypes.NINECARD`` nor ``dt.DataTypes.FIVECARD``.
    """

    def ninecard_sample(category):
        # Shuffle hand and board independently, then join them into one input.
        hand, board = self.create_ninecard_handtypes(category)
        mixed_hand, mixed_board = CardDataset.shuffle_hand_board(hand, board)
        return np.concatenate([mixed_hand, mixed_board], axis=0)

    for dataset in ('train', 'val'):
        target_dir = os.path.join(params['save_dir'], dataset)
        xpath = f"{os.path.join(target_dir, dataset)}X"
        ypath = f"{os.path.join(target_dir, dataset)}Y"

        per_category = params[dt.Globals.INPUT_SET_DICT[dataset]] // 9

        # Pick the sample factory for the requested data type.
        if params['datatype'] == dt.DataTypes.NINECARD:
            make_sample = ninecard_sample
        elif params['datatype'] == dt.DataTypes.FIVECARD:
            make_sample = self.create_handtypes
        else:
            raise ValueError(
                f"{params['datatype']} datatype not understood")

        inputs = []
        labels = []
        for category in dt.Globals.HAND_TYPE_DICT.keys():
            print('category', category)
            for _ in range(per_category):
                inputs.append(make_sample(category))
                labels.append(category)

        # Stack both arrays before writing either, as the original did.
        stacked_inputs = np.stack(inputs)
        stacked_labels = np.stack(labels)
        save_data(stacked_inputs, xpath)
        save_data(stacked_labels, ypath)
def main():
    """Drop unwanted features from train and test and save the result.

    Loads the data through ``data_utils.load_data``, passes both frames
    through ``delete_some_features`` and saves whatever it returns through
    ``data_utils.save_data``.

    Progress messages are written with single-argument ``print(...)`` calls,
    which produce the same text on Python 2 and Python 3.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed back to save_data
    # unchanged, so it gets a real name rather than the throwaway ``_``.
    train, test, passthrough = data_utils.load_data()
    print('train: %s , test: %s' % (train.shape, test.shape))

    # Two earlier strategies are disabled and were removed from this body:
    # deleting the last 20 features returned by
    # delete_some_non_important_features(train, test), and deleting the
    # features returned by get_low_corr_features(train, min_corr=0.00).
    train, test = delete_some_features(train, test)

    print('train: %s , test: %s' % (train.shape, test.shape))
    print("Save data...")
    data_utils.save_data(train, test, passthrough)
def main():
    """Clean the data, add basic engineered features and save the result.

    Loads the data through ``data_utils.load_data`` and runs the area,
    floor, state, material, build-year, room-count and product-type steps on
    the (train, test) pair in that order. It then sets ``id`` and
    ``price_doc`` aside, stacks train on top of test, runs
    ``perform_timestamp_features`` on the combined frame, splits it back by
    row position, restores the columns that were set aside and saves through
    ``data_utils.save_data``.

    Progress messages are written with single-argument ``print(...)`` calls,
    which produce the same text on Python 2 and Python 3.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed back to save_data
    # unchanged, so it gets a real name rather than the throwaway ``_``.
    train, test, passthrough = data_utils.load_data()
    print('train: %s , test: %s' % (train.shape, test.shape))

    print('perform data cleaning and basic feature engineering')
    # Each step takes and returns the (train, test) pair; the order is the
    # order of the original calls.
    pair_steps = (
        perform_area_features,
        perform_floor_features,
        perform_state_features,
        perform_material_features,
        perform_build_year_features,
        perform_num_room_features,
        perform_product_type_features,
    )
    for step in pair_steps:
        train, test = step(train, test)

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the training and test sets: train rows first, test column order.
    n_train = train.shape[0]
    combined = pd.concat([train[test.columns.values], test])
    combined.columns = test.columns.values

    combined = perform_timestamp_features(combined)

    # .copy() turns each half into an independent frame, so adding columns
    # below does not raise pandas' SettingWithCopyWarning.
    train = combined.iloc[:n_train, :].copy()
    test = combined.iloc[n_train:, :].copy()

    train['id'] = train_id
    train['price_doc'] = train_price_doc
    test['id'] = test_id
    print('train: %s , test: %s' % (train.shape, test.shape))

    print("Save data...")
    data_utils.save_data(train, test, passthrough)