# NOTE(review): this chunk appears to run once per coupling `type` inside a
# larger loop -- `type` shadows the builtin but is bound outside this view,
# so it is left as-is.
type_train = type_train.sort_values('id')
# Cast the known categorical columns so a LightGBM-style model can treat
# them natively (only columns actually present are converted).
for col in categorical_cols:
    if col in type_train.columns:
        type_train[col] = type_train[col].astype('category')
type_y = type_train['scalar_coupling_constant']
del folds_data_list  # release per-fold frames before the next big allocation
gc.collect()
# Restrict the full test / OOF frames to the rows of the current type.
type_test = full_test[test_mask]
type_oof_train = oof_train[train_mask]
type_oof_test = oof_test[test_mask]
# Column-wise append of the OOF feature frames (helper from train_utils;
# presumably positional -- assumes row order of both frames matches even
# though type_train was just re-sorted by 'id'. TODO confirm).
type_train = train_utils.concat_stupidly(type_train, type_oof_train)
type_test = train_utils.concat_stupidly(type_test, type_oof_test)
print('reading distance based features...')
# Precomputed distance-based engineered features, one file pair per type.
type_dist_train = pd.read_csv(
    f'../data/{type}_train_distance_based_feats.csv')
# Target column must not leak into the feature frame.
type_dist_train.drop('scalar_coupling_constant', axis=1, inplace=True)
type_dist_test = pd.read_csv(
    f'../data/{type}_test_distance_based_feats.csv')
# atom_2 .. atom_9 look like element labels -> categorical dtype.
atoms_categorical_cols = [f'atom_{i}' for i in range(2, 10)]
for df in [type_dist_test, type_dist_train]:
    df[atoms_categorical_cols] = df[atoms_categorical_cols].astype(
        'category')
# Register the distance-feature column names only on the first type so the
# shared feature list is not extended on every iteration.
if type_num == 0:
    best_features = best_features + list(type_dist_train.columns)
# Read the precomputed out-of-fold features for the test set; nrows=len(test)
# keeps the frame the same length as `test` (assumes identical row order --
# TODO confirm against how the OOF file was written).
oof_test = pd.read_csv('../data/test_oof_features.csv', nrows=len(test))
# Identifier / target-leak columns that must not become model features.
oof_drop_cols = [
    'id',
    'molecule_name',
    'atom_index_0',
    'atom_index_1',
    'scalar_coupling_constant_oof',
]
for df in [oof_train, oof_test]:
    df.drop(oof_drop_cols, axis=1, inplace=True)
gc.collect()
# Whatever survives the drop is added to the feature list.
best_features = best_features + list(oof_train.columns)
# Column-wise append of the OOF features (helper from train_utils; presumably
# positional -- assumes row alignment with train/test. TODO confirm).
train = train_utils.concat_stupidly(train, oof_train)
test = train_utils.concat_stupidly(test, oof_test)
gc.collect()
# Fixed random_state so the fold split is reproducible across runs.
folds = KFold(n_splits=n_folds, shuffle=True, random_state=0)
# LightGBM-style hyperparameters (the dict continues past this chunk).
params = {
    'num_leaves': 128,
    'objective': 'regression',
    'learning_rate': 0.04,
    "boosting_type": "gbdt",
    "subsample_freq": 1,
    "subsample": 0.75,
    "bagging_seed": 11,
    "metric": 'mae',
# Load one feature file described by `mapping` and attach it to `fold_data`,
# either by positional column concat or by a per-atom merge.
print(f'reading {mapping["train_features"]}')
train_path = mapping['train_features']
train_data = pd.read_csv(train_path, nrows=nrows,
                         usecols=mapping['use_cols'])
if mapping['drop_columns']:
    train_data.drop(mapping['drop_columns'], axis=1, inplace=True)
gc.collect()
if mapping['concat_type'] == 'concat':
    print('concating...')
    # Positional concat: restrict to the current fold's rows first so both
    # frames line up 1:1.
    train_data = train_data[mask]
    gc.collect()
    fold_data = train_utils.concat_stupidly(fold_data, train_data)
    assert len(fold_data) == len(train_data)
elif mapping['concat_type'] == 'merge':
    print('merging...')
    # Merge once per endpoint atom (i = 0 and 1).
    for i in range(2):
        fold_data = train_utils.map_atom_info(
            fold_data, train_data, i)
else:
    # Fail loudly with the offending value instead of a bare KeyError(),
    # so a typo in the mapping config is diagnosable from the traceback.
    raise KeyError(
        f"unsupported concat_type: {mapping['concat_type']!r}")
del train_data  # release the raw frame before the next mapping
gc.collect()
# Sanity check: fold_data must still cover exactly the masked rows.
assert len(fold_data) == mask.sum()
print('saving...')