예제 #1
0
def main():
    """Discretize features over the combined train/test data and save it.

    Loads train and test through ``data_utils.load_data``, sets the ``id``
    and ``price_doc`` columns aside, runs ``feature_discretization`` on the
    concatenated frame, splits the result back by row position, restores the
    columns that were set aside and saves through ``data_utils.save_data``.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed to save_data unchanged.
    train, test, extra_data = data_utils.load_data()
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('train: %s , test: %s' % (train.shape, test.shape))

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the training and test sets (train columns reordered to match test).
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.columns = test.columns.values

    # NOTE: feature_distribute_scale(conbined_data) was left disabled in the
    # original; only discretization is applied.
    conbined_data = feature_discretization(conbined_data)

    # Split back by position. Copy the slices so the column assignments below
    # act on independent frames instead of views of conbined_data.
    n_train = train.shape[0]
    train = conbined_data.iloc[:n_train, :].copy()
    test = conbined_data.iloc[n_train:, :].copy()

    train['id'] = train_id
    train['price_doc'] = train_price_doc
    test['id'] = test_id
    print('train: %s , test: %s' % (train.shape, test.shape))
    print("Save data...")
    data_utils.save_data(train, test, extra_data)
def main():
    """Add time-window features to the combined train/test data and save it.

    Loads train and test through ``data_utils.load_data``, sets ``price_doc``
    aside, concatenates both frames under a fresh 0..N-1 index, applies
    ``perform_time_window`` and ``perform_groupby_time_window`` with a list
    of window sizes, splits the result back by row position, restores
    ``price_doc`` on train and saves through ``data_utils.save_data``.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed to save_data unchanged.
    train, test, extra_data = data_utils.load_data()
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('train: %s , test: %s' % (train.shape, test.shape))

    train_price_doc = train['price_doc']
    train.drop(['price_doc'], axis=1, inplace=True)

    # Merge the training and test sets (train columns reordered to match test).
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.columns = test.columns.values
    conbined_data.index = range(conbined_data.shape[0])

    # Time window sizes (presumably in days, per the variable name).
    timewindow_days = [30 * 6, 30 * 4, 30 * 2, 30, 20, 10]
    conbined_data = perform_time_window(conbined_data, timewindow_days)
    conbined_data = perform_groupby_time_window(conbined_data, timewindow_days)

    # Split back by position. Train is copied because a column is assigned to
    # it below; assigning on a bare slice triggers SettingWithCopyWarning.
    n_train = train.shape[0]
    train = conbined_data.iloc[:n_train, :].copy()
    test = conbined_data.iloc[n_train:, :]

    train['price_doc'] = train_price_doc
    print('train: %s , test: %s' % (train.shape, test.shape))
    print("Save data...")
    data_utils.save_data(train, test, extra_data)
def main():
    """Keep only the numerical features chosen by ``feature_select`` and save.

    Loads train and test through ``data_utils.load_data``, collects the
    non-object columns of train (minus ``id`` and ``timestamp``), passes them
    to ``feature_select`` with ``keep_top=0.98``, restricts both frames to the
    returned features plus ``id`` and ``timestamp``, restores ``price_doc`` on
    train and saves through ``data_utils.save_data``.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed to save_data unchanged.
    train, test, extra_data = data_utils.load_data()
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('train: %s , test: %s' % (train.shape, test.shape))

    train_price_doc = train['price_doc']

    # NOTE(review): price_doc is still a column of train here, so it is part
    # of num_columns if it is numeric; presumably feature_select uses it as
    # the target and does not return it -- confirm.
    num_columns = train.select_dtypes(exclude=['object']).columns.values
    num_columns = num_columns.tolist()
    num_columns.remove('id')
    num_columns.remove('timestamp')

    print('perform feature selection in %d numerical features...' % train[num_columns].shape[1])
    keep_features = feature_select(train[num_columns], keep_top=0.98)
    print('after feature selection numerical features %s' % len(keep_features))
    keep_features.append('id')
    keep_features.append('timestamp')

    # Copy the train subset so the price_doc assignment below acts on an
    # independent frame (avoids SettingWithCopyWarning).
    train = train[keep_features].copy()
    test = test[keep_features]

    train['price_doc'] = train_price_doc

    print('train: %s , test: %s' % (train.shape, test.shape))
    print("Save data...")
    data_utils.save_data(train, test, extra_data)
def main():
    """Generate several groups of derived features on the combined data.

    Loads train and test through ``data_utils.load_data``, sets the ``id``
    and ``price_doc`` columns aside, concatenates both frames, applies the
    area, school, hospital, population, population-age and build feature
    generators in that order, splits the result back by row position,
    restores the columns that were set aside and saves through
    ``data_utils.save_data``.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed to save_data unchanged.
    train, test, extra_data = data_utils.load_data()
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('train: %s , test: %s' % (train.shape, test.shape))

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the training and test sets (train columns reordered to match test).
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.columns = test.columns.values

    # Each generator takes the combined frame and returns the updated frame;
    # the order is the same as in the original sequence of calls.
    feature_generators = (
        gen_area_features,
        gen_school_features,
        generate_hospital_features,
        generate_population_features,
        generate_population_age_features,
        generate_build_features,
    )
    for generate in feature_generators:
        conbined_data = generate(conbined_data)

    # Split back by position. Copy the slices so the column assignments below
    # act on independent frames instead of views of conbined_data.
    n_train = train.shape[0]
    train = conbined_data.iloc[:n_train, :].copy()
    test = conbined_data.iloc[n_train:, :].copy()

    train['id'] = train_id
    train['price_doc'] = train_price_doc
    test['id'] = test_id
    print('train: %s , test: %s' % (train.shape, test.shape))
    print("Save data...")
    data_utils.save_data(train, test, extra_data)
def main():
    """Add distance features based on longitude/latitude data and save.

    Loads train and test through ``data_utils.load_data`` and the
    longitude/latitude table through
    ``data_utils.load_longitude_latitude_data``, sets the ``id`` and
    ``price_doc`` columns aside, runs ``generate_distance_features`` on the
    concatenated frame, splits the result back by row position, restores the
    columns that were set aside and saves through ``data_utils.save_data``.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed to save_data unchanged.
    train, test, extra_data = data_utils.load_data()
    longitude_latitude = data_utils.load_longitude_latitude_data()
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('train: %s , test: %s' % (train.shape, test.shape))

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the training and test sets (train columns reordered to match test).
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.columns = test.columns.values

    conbined_data = generate_distance_features(conbined_data, longitude_latitude)

    # Split back by position. Copy the slices so the column assignments below
    # act on independent frames instead of views of conbined_data.
    n_train = train.shape[0]
    train = conbined_data.iloc[:n_train, :].copy()
    test = conbined_data.iloc[n_train:, :].copy()

    train['id'] = train_id
    train['price_doc'] = train_price_doc
    # Test ids are assigned by position (.values), not by index label, as in
    # the original; presumably generate_distance_features changes the index
    # of the combined frame -- confirm.
    test['id'] = test_id.values
    print('train: %s , test: %s' % (train.shape, test.shape))
    print("Save data...")
    data_utils.save_data(train, test, extra_data)
def main():
    """Attach PCA components to train and test by ``id`` and save the result.

    Loads train and test through ``data_utils.load_data``, sets the ``id``
    and ``price_doc`` columns aside, computes components with
    ``generate_pca_components`` on the concatenated frame, splits both the
    components and the combined data back into train/test parts by row
    position, merges each part with its components on ``id``, restores
    ``price_doc`` on train and saves through ``data_utils.save_data``.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed to save_data unchanged.
    train, test, extra_data = data_utils.load_data()
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('train: %s , test: %s' % (train.shape, test.shape))

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the training and test sets (train columns reordered to match test).
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.columns = test.columns.values

    # Assumes one component row per row of conbined_data, in the same order
    # (the positional split below relies on it) -- confirm against
    # generate_pca_components.
    pca_components = generate_pca_components(conbined_data, keep_component=0.01)

    n_train = train.shape[0]

    # Ids are attached by position (.values) because the index of
    # pca_components is not known here; label alignment could yield NaN ids.
    pca_train = pca_components.iloc[:n_train, :].copy()
    pca_train['id'] = train_id.values
    # Bug fix: the test components are the rows AFTER the train rows. The
    # original sliced [:n_train] again and paired train components with
    # test ids.
    pca_test = pca_components.iloc[n_train:, :].copy()
    pca_test['id'] = test_id.values

    # These slices keep the index labels of the original train/test frames
    # (pd.concat preserved them), so label-aligned assignment is safe here.
    train = conbined_data.iloc[:n_train, :].copy()
    train['id'] = train_id
    test = conbined_data.iloc[n_train:, :].copy()
    test['id'] = test_id

    train = pd.merge(train, pca_train, how='left', on='id')
    test = pd.merge(test, pca_test, how='left', on='id')

    train['price_doc'] = train_price_doc
    print('train: %s , test: %s' % (train.shape, test.shape))
    print("Save data...")
    data_utils.save_data(train, test, extra_data)
def main():
    """Subsample the imputed training data and save it with a fresh index.

    Loads the data through ``data_utils.load_imputed_data``, replaces train
    with the result of ``subsample_train``, renumbers its rows and saves
    through ``data_utils.save_data``. Test is passed through unchanged.
    """
    print('loading train datas...')
    # The third value returned by load_imputed_data is passed to save_data
    # unchanged.
    train, test, extra_data = data_utils.load_imputed_data()
    # train.shape is itself a tuple, so it is wrapped in a 1-tuple to be
    # formatted as a single value; the output matches the Python 2 statement.
    print('train: %s' % (train.shape,))

    train = subsample_train(train)
    # drop=True discards the old index directly instead of materializing it
    # as an 'index' column and deleting that column afterwards.
    train = train.reset_index(drop=True)

    print('train: %s' % (train.shape,))
    print("Save data...")
    data_utils.save_data(train, test, extra_data)
예제 #8
0
    def build_hand_classes(self, params):
        """
        Generate and save hand-class samples for the 'train' and 'val' sets.

        For each dataset, every category key of ``dt.Globals.HAND_TYPE_DICT``
        receives ``params[dt.Globals.INPUT_SET_DICT[dataset]] // 9`` generated
        samples, labelled with the category key itself. Inputs and labels are
        stacked with ``np.stack`` and written with ``save_data`` to
        ``<save_dir>/<dataset>/<dataset>X`` and ``<save_dir>/<dataset>/<dataset>Y``.

        Reference table of hand counts:

        |Hand Value|Unique|Distinct|
        |Straight Flush |40      |10|
        |Four of a Kind |624     |156|
        |Full Houses    |3744    |156|
        |Flush          |5108    |1277|
        |Straight       |10200   |10|
        |Three of a Kind|54912   |858|
        |Two Pair       |123552  |858|
        |One Pair       |1098240 |2860|
        |High Card      |1302540 |1277|
        |TOTAL          |2598960 |7462|

        Args:
            params: mapping read for 'save_dir', 'datatype' and the key
                ``dt.Globals.INPUT_SET_DICT[dataset]`` of each dataset.

        Raises:
            ValueError: if ``params['datatype']`` is neither
                ``dt.DataTypes.NINECARD`` nor ``dt.DataTypes.FIVECARD``.
        """

        def ninecard_sample(category):
            # Build a hand/board pair for the category, pass both through
            # CardDataset.shuffle_hand_board and join them along axis 0.
            hand, board = self.create_ninecard_handtypes(category)
            mixed_hand, mixed_board = CardDataset.shuffle_hand_board(
                hand, board)
            return np.concatenate([mixed_hand, mixed_board], axis=0)

        for dataset in ('train', 'val'):
            save_path = os.path.join(params['save_dir'], dataset)
            prefix = os.path.join(save_path, dataset)
            xpath = f"{prefix}X"
            ypath = f"{prefix}Y"
            # Samples per category; presumably 9 is the number of hand
            # categories -- confirm against HAND_TYPE_DICT.
            num_hands = params[dt.Globals.INPUT_SET_DICT[dataset]] // 9

            # Choose how a single sample is produced for this datatype.
            if params['datatype'] == dt.DataTypes.NINECARD:
                make_sample = ninecard_sample
            elif params['datatype'] == dt.DataTypes.FIVECARD:
                make_sample = self.create_handtypes
            else:
                raise ValueError(
                    f"{params['datatype']} datatype not understood")

            inputs = []
            labels = []
            for category in dt.Globals.HAND_TYPE_DICT.keys():
                print('category', category)
                for _ in range(num_hands):
                    inputs.append(make_sample(category))
                    labels.append(category)

            save_data(np.stack(inputs), xpath)
            save_data(np.stack(labels), ypath)
예제 #9
0
def main():
    """Drop a set of features from train and test and save the result.

    Loads train and test through ``data_utils.load_data``, replaces both with
    the frames returned by ``delete_some_features`` and saves through
    ``data_utils.save_data``.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed to save_data unchanged.
    train, test, extra_data = data_utils.load_data()
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('train: %s , test: %s' % (train.shape, test.shape))

    # NOTE: two earlier variants were left commented out in the original and
    # are not applied: deleting the last 20 features returned by
    # delete_some_non_important_features(train, test), and deleting the
    # features returned by get_low_corr_features(train, min_corr=0.00).
    train, test = delete_some_features(train, test)
    print('train: %s , test: %s' % (train.shape, test.shape))
    print("Save data...")
    data_utils.save_data(train, test, extra_data)
예제 #10
0
def main():
    """Run data cleaning and basic feature engineering, then save the result.

    Loads train and test through ``data_utils.load_data``, applies the
    per-column ``perform_*_features`` steps to the (train, test) pair, sets
    the ``id`` and ``price_doc`` columns aside, runs
    ``perform_timestamp_features`` on the concatenated frame, splits the
    result back by row position, restores the columns that were set aside and
    saves through ``data_utils.save_data``.
    """
    print('loading train and test datas...')
    # The third value returned by load_data is passed to save_data unchanged.
    train, test, extra_data = data_utils.load_data()
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print('train: %s , test: %s' % (train.shape, test.shape))

    print('perform data cleaning and basic feature engineering')

    # Each step takes and returns the (train, test) pair; the order is the
    # same as in the original sequence of calls.
    pair_steps = (
        perform_area_features,
        perform_floor_features,
        perform_state_features,
        perform_material_features,
        perform_build_year_features,
        perform_num_room_features,
        perform_product_type_features,
    )
    for step in pair_steps:
        train, test = step(train, test)

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the training and test sets (train columns reordered to match test).
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.columns = test.columns.values

    conbined_data = perform_timestamp_features(conbined_data)

    # Split back by position. Copy the slices so the column assignments below
    # act on independent frames instead of views of conbined_data.
    n_train = train.shape[0]
    train = conbined_data.iloc[:n_train, :].copy()
    test = conbined_data.iloc[n_train:, :].copy()

    train['id'] = train_id
    train['price_doc'] = train_price_doc
    test['id'] = test_id
    print('train: %s , test: %s' % (train.shape, test.shape))
    print("Save data...")
    data_utils.save_data(train, test, extra_data)