# Order the training rows by their 'id' column. sort_values returns a new
# frame, which is rebound to the same name.
type_train = type_train.sort_values('id')

        # Cast every categorical column that is actually present in the frame
        # to the pandas 'category' dtype; missing columns are silently skipped.
        for col in categorical_cols:
            if col in type_train.columns:
                type_train[col] = type_train[col].astype('category')

        # Regression target for this subset of the training data.
        type_y = type_train['scalar_coupling_constant']
        # Drop the reference to the intermediate list and force a collection
        # pass before further frames are built.
        del folds_data_list
        gc.collect()

        # Select the matching rows of the test set and of the out-of-fold
        # feature frames.
        # NOTE(review): train_mask / test_mask are defined outside this
        # fragment; presumably boolean row masks for the current coupling
        # type -- confirm.
        type_test = full_test[test_mask]

        type_oof_train = oof_train[train_mask]
        type_oof_test = oof_test[test_mask]

        # Attach the OOF features to the train/test frames.
        # NOTE(review): concat_stupidly is a project helper not visible here;
        # presumably a column-wise concatenation that relies on row order --
        # verify before reordering any of these frames.
        type_train = train_utils.concat_stupidly(type_train, type_oof_train)
        type_test = train_utils.concat_stupidly(type_test, type_oof_test)

        print('reading distance based features...')
        # NOTE(review): `type` is bound outside this fragment and shadows the
        # builtin of the same name; consider renaming it at its definition.
        type_dist_train = pd.read_csv(
            f'../data/{type}_train_distance_based_feats.csv')
        # The train CSV carries the target column; remove it so it is not
        # treated as a feature.
        type_dist_train.drop('scalar_coupling_constant', axis=1, inplace=True)
        type_dist_test = pd.read_csv(
            f'../data/{type}_test_distance_based_feats.csv')
        # Columns atom_2 .. atom_9 are converted to 'category' dtype in both
        # distance-feature frames.
        atoms_categorical_cols = [f'atom_{i}' for i in range(2, 10)]
        for df in [type_dist_test, type_dist_train]:
            df[atoms_categorical_cols] = df[atoms_categorical_cols].astype(
                'category')

        # Extend the feature list with the distance-feature column names only
        # when type_num is 0 -- presumably so the names are appended once
        # rather than on every iteration of an enclosing per-type loop.
        if type_num == 0:
            best_features = best_features + list(type_dist_train.columns)
Пример #2
0
    # Load the test-set out-of-fold features, reading at most len(test) rows.
    oof_test = pd.read_csv('../data/test_oof_features.csv', nrows=len(test))
    # Columns removed from both OOF frames before they are joined onto the
    # main train/test frames.
    oof_drop_cols = [
        'id',
        'molecule_name',
        'atom_index_0',
        'atom_index_1',
        'scalar_coupling_constant_oof',
    ]

    # Drop in place so no second copy of either frame is created.
    for df in [oof_train, oof_test]:
        df.drop(oof_drop_cols, axis=1, inplace=True)
    gc.collect()

    # Every remaining OOF column is added to the list of selected features.
    best_features = best_features + list(oof_train.columns)

    # NOTE(review): concat_stupidly is a project helper not visible here;
    # presumably a column-wise concatenation that relies on matching row
    # order between the two frames -- confirm.
    train = train_utils.concat_stupidly(train, oof_train)
    test = train_utils.concat_stupidly(test, oof_test)

    gc.collect()

    # Shuffled K-fold splitter with a fixed seed (random_state=0).
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=0)

    params = {
        'num_leaves': 128,
        'objective': 'regression',
        'learning_rate': 0.04,
        "boosting_type": "gbdt",
        "subsample_freq": 1,
        "subsample": 0.75,
        "bagging_seed": 11,
        "metric": 'mae',
Пример #3
0
            # Load one feature file described by `mapping` and fold it into
            # `fold_data`, either by concatenation or by a per-atom merge.
            print(f'reading {mapping["train_features"]}')
            train_path = mapping['train_features']

            # Only the columns listed in mapping['use_cols'] are read, and at
            # most `nrows` rows.
            train_data = pd.read_csv(train_path,
                                     nrows=nrows,
                                     usecols=mapping['use_cols'])

            # Optional column removal; skipped when 'drop_columns' is falsy
            # (e.g. None or an empty list).
            if mapping['drop_columns']:
                train_data.drop(mapping['drop_columns'], axis=1, inplace=True)
                gc.collect()

            if mapping['concat_type'] == 'concat':
                print('concating...')
                # Restrict the loaded rows with `mask` before joining.
                # NOTE(review): `mask` is defined outside this fragment;
                # presumably a boolean row mask -- confirm.
                train_data = train_data[mask]
                gc.collect()
                fold_data = train_utils.concat_stupidly(fold_data, train_data)
                # The joined frame must have exactly as many rows as the
                # filtered feature frame.
                assert len(fold_data) == len(train_data)

            elif mapping['concat_type'] == 'merge':
                print('merging...')
                # map_atom_info is called twice, with i = 0 and i = 1.
                # NOTE(review): presumably once per atom index of the pair;
                # the helper is not visible here -- verify.
                for i in range(2):
                    fold_data = train_utils.map_atom_info(
                        fold_data, train_data, i)

            else:
                # Any other concat_type is rejected; the KeyError carries no
                # message.
                raise KeyError()

            # Release the loaded feature frame before the next one is read.
            del train_data
            gc.collect()
        # Sanity check: fold_data must have exactly mask.sum() rows
        # (presumably the number of rows selected by a boolean mask --
        # confirm against where `mask` is built).
        assert len(fold_data) == mask.sum()
        print('saving...')