def get_prev_train_test_data(prev_data_version=None, prev_trial_no=None):
    """Load the train/test frames pickled by a previous trial.

    Files are read from
    ``../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}``.
    When the module-level ``DEBUG`` flag is set, a cached ``*_sampled.pkl``
    pair is used if both files exist; otherwise the full pickles are loaded,
    sub-sampled, and the sample is written back for the next debug run.

    :param prev_data_version: data version label used in the folder/file names
    :param prev_trial_no: trial number label used in the folder/file names
    :return: ``(train, test)`` as returned by ``unpickle``
    :raises AssertionError: if the full train frame has no
        ``scalar_coupling_constant`` column
    """
    # NOTE: the former `pd.read_csv('../input/train.csv')` was removed: its
    # result was unconditionally overwritten by one of the `unpickle` calls
    # below, so it only cost time and memory.
    sample_loaded = False

    prev_folder = f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"
    if DEBUG:
        # v003_033
        train_path = Path(
            f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J_sampled.pkl"
        )
        test_path = Path(
            f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J_sampled.pkl"
        )
        # Only use the cached sample when BOTH halves are present.
        if train_path.exists() and test_path.exists():
            print("sample loading")
            train = unpickle(train_path)
            test = unpickle(test_path)
            sample_loaded = True
            print("sample load finish")

    if not sample_loaded:
        print(f"loading previous dataest")
        print("train loading")
        train: pd.DataFrame = unpickle(
            f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J.pkl",
        )
        assert "scalar_coupling_constant" in train.columns
        print("test loading")
        test: pd.DataFrame = unpickle(
            f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J.pkl",
        )
        print(f"loading finished")

    if DEBUG and not sample_loaded:
        n_sample = 5000
        print(f"sampling {n_sample} rows.")
        # Cap at the frame length: DataFrame.sample raises ValueError when
        # n exceeds the number of rows (replace=False).
        train = train.sample(n=min(n_sample, len(train)))
        test = test.sample(n=min(n_sample, len(test)))
        Path(prev_folder).mkdir(parents=True, exist_ok=True)
        to_pickle(
            f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J_sampled.pkl",
            train)
        to_pickle(
            f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J_sampled.pkl",
            test)
        print("saved.")

    ###########################################################################
    # add additional feature for trying

    return train, test
def feat(data, num=5):
    """Build per-atom distance features for a batch of molecules and save them.

    ``get_dist_mat`` is called once per molecule; its three return slots are
    stacked into molecule names, atom indices and a distance matrix whose
    columns are named ``dist_{atom}_{n}`` for atom in H, C, N, O, F and
    n in ``range(num)``. The molecule list is saved with ``np.save`` and the
    assembled frame with ``to_pickle`` under ``coulomb_feat``.

    :param data: mapping with keys ``"number"`` (chunk id used in the file
        name), ``"df"`` (frame with a ``molecule_name`` column) and ``"mols"``
        (iterable of molecule names)
    :param num: number of distance columns per atom type
    :raises KeyError: if ``data`` lacks one of the required keys
    """
    # Unpack before the try block: the handler below prints `mols` and `df`,
    # so they must be bound or the handler itself would fail and hide the
    # original error.
    number = data["number"]
    df = data["df"]
    mols = data["mols"]

    try:
        start = time.time()
        mp_data = [{"mol": mol, "df": df} for mol in mols]
        dist_mats = [get_dist_mat(d) for d in mp_data]
        #dist_mats = Parallel(n_jobs=NCORES)(delayed(get_dist_mat)(data) for data in mp_data)
        molecule_names = np.hstack([x[0] for x in dist_mats])
        atoms_idx = np.hstack([x[1] for x in dist_mats])
        dist_mat = np.vstack([x[2] for x in dist_mats])

        atoms = ['H', 'C', 'N', 'O', 'F']
        col_name_list = [
            'dist_{}_{}'.format(a, n) for a in atoms for n in range(num)
        ]

        se_mole = pd.Series(molecule_names, name='molecule_name')
        se_atom_idx = pd.Series(atoms_idx, name='atom_index')
        df_dist = pd.DataFrame(dist_mat, columns=col_name_list)
        df_distance = pd.concat([se_mole, se_atom_idx, df_dist], axis=1)

        elapsed_time = time.time() - start
        print("elapsed_time:{0:.2f}".format(elapsed_time) + "[sec]")

        # Output files are tagged with the first/last molecule of the chunk.
        first_mol_name = df.molecule_name.iloc[0].replace("dsgdb9nsd_", "")
        last_mol_name = df.molecule_name.iloc[-1].replace("dsgdb9nsd_", "")
        np.save(f"mols_{first_mol_name}_{last_mol_name}.npy", mols)
        to_pickle(
            coulomb_feat /
            f"coulomb_train_{number}_{first_mol_name}_{last_mol_name}.pkl",
            df_distance)
    except Exception as e:
        # Dump enough context to identify the failing chunk, then re-raise
        # with the original traceback intact.
        print(e)
        print(mols[:5])
        print(mols[-5:])
        display(df.head())
        display(df.tail())
        raise
# Standardize every column after the first two, then run each decomposition /
# clustering transformer on the scaled matrix and collect the outputs.
ss = StandardScaler()
df_ss = pd.DataFrame(ss.fit_transform(df.iloc[:, 2:]), columns=df.columns[2:])
decomp_cols = []
comp_results = []
comp_names = ["fa", "pca", "tsvd", "ica", "grp", "srp", "mbkm"]  #, "tsne"] # removing tsne
# zip() stops at the shorter sequence, so the trailing `tsne` transformer is
# never run while "tsne" is absent from comp_names.
for name, transform in zip(comp_names,
                           [fa, pca, tsvd, ica, grp, srp, mbkm, tsne]):
    print(current_time(), "{} converting...".format(name), flush=True)
    # Number of output columns expected from this transformer.
    n_components = N_COMP
    if name == 'mbkm':
        n_components = num_clusters2
    elif name == "tsne":
        n_components = 2
    df_results = pd.DataFrame(transform.fit_transform(df_ss))
    # Column names look like "pca_00", "pca_01", ...
    decomp_col = ["{0}_{1:02d}".format(name, i) for i in range(n_components)]
    df_results.columns = decomp_col
    decomp_cols.extend(decomp_col)
    df_results.reset_index(inplace=True)
    del df_results['index']
    comp_results.append(df_results)
comp_results_df = pd.concat(comp_results, axis=1)
# Re-attach the first two columns of `df` next to the transformed features.
comp_results_df = pd.concat([
    df.iloc[:, :2].reset_index(drop=True),
    comp_results_df.reset_index(drop=True)
],
                            axis=1)
to_pickle(f"../processed/v003/comp_results_df_{N_COMP}.pkl", comp_results_df)
train = train.merge(rdkit_train, on="id", how="left") test = test.merge(rdkit_test, on="id", how="left") ob_charges = pd.read_csv("../processed/v003/ob_charges.csv", index_col=0) train = map_ob_charges(train, 0) train = map_ob_charges(train, 1) test = map_ob_charges(test, 0) test = map_ob_charges(test, 1) train = reduce_mem_usage(train) test = reduce_mem_usage(test) Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True, exist_ok=True) to_pickle( save_path / f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", train) to_pickle( save_path / f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", test) else: train = unpickle(save_path / f"v003_010/train_concat_v003_010.pkl", ) test = unpickle(save_path / f"v003_010/test_concat_v003_010.pkl", ) if AUGMENT: augmented_train = unpickle(save_path / f"augmented_train_v001.pkl", ).drop( ["Unnamed: 0_x", "Unnamed: 0_y"], axis=1) augmented_train['id'] += train["id"].max() + 1 train = pd.concat([train, augmented_train], axis=0).reset_index(drop=True)
def train_main(seed, type_):
    """Train the two-layer LightGBM pipeline for one coupling type and seed.

    Layer 1 fits ``fc`` on the rows of the module-level ``X`` whose ``type``
    equals ``type_`` and stores its out-of-fold / test predictions as the
    ``oof_fc`` column. Layer 2 fits the coupling constant on the same rows
    (now including ``oof_fc``) in one of three modes selected by the
    module-level flags ``TRAIN_ALL_DATA`` / ``CV_FOLD`` (hold-out otherwise).
    Predictions, models and logs are written under ``submit_path``,
    ``model_path`` and ``log_path``.

    Reads module-level ``X``, ``X_test``, ``y``, ``y_fc``, ``mol_name``,
    ``folds``, ``train`` and ``type_name``.

    :param seed: base random seed; offsets 0..5 are used for the LightGBM seeds
    :param type_: value of the ``type`` column selecting the rows to train on
    """
    print(f"==================== seed: {seed} ====================")
    params = {
        #'num_leaves': 128,
        'min_child_samples': 79,
        'objective': 'regression',
        'max_depth': -1,  #9,
        'learning_rate': 0.2,
        "boosting_type": "gbdt",
        "subsample_freq": 1,
        "subsample": 0.9,
        "metric": 'mae',
        "verbosity": -1,
        'reg_alpha': 0.1,
        'reg_lambda': 0.3,
        'colsample_bytree': 1.0,
        'num_threads': -1,
    }
    params["seed"] = seed
    params["bagging_seed"] = seed + 1
    params["feature_fraction_seed"] = seed + 2
    n_estimators = 5  #10000
    params["num_leaves"] = 256
    if DEBUG:
        n_estimators = 5

    # Compact frames that collect oof / test predictions across types.
    X_short = pd.DataFrame({
        'ind': list(X.index),
        'type': X['type'].values,
        'oof': [0] * len(X),
        'target': y.values,
        'fc': y_fc.values
    })
    X_short_test = pd.DataFrame({
        'ind': list(X_test.index),
        'type': X_test['type'].values,
        'prediction': [0] * len(X_test)
    })

    print(f'{current_time()} Training of type {type_} / {X["type"].unique()}')
    # Explicit copies: the `oof_fc` column is added to these frames below and
    # must not leak into (or be rejected by) the module-level X / X_test.
    X_t = X.loc[X['type'] == type_].copy()
    X_test_t = X_test.loc[X_test['type'] == type_].copy()
    y_fc_t = X_short.loc[X_short['type'] == type_, 'fc']
    y_t = X_short.loc[X_short['type'] == type_, 'target']
    mol_name_t = mol_name.loc[X['type'] == type_][
        X_t.index] if GROUP_K_FOLD else None
    print(
        f"X_t.shape: {X_t.shape}, X_test_t.shape: {X_test_t.shape}, y_t.shape: {y_t.shape}"
    )

    ###########################################################################
    # fc
    print("=" * 30 + " fc " + "=" * 30)
    # fold_group must be the type-filtered groups: the unfiltered
    # `mol_name.values` has a different length from X_t whenever X holds more
    # than one type.
    result_dict_lgb1 = train_model_regression(X=X_t,
                                              X_test=X_test_t,
                                              y=y_fc_t,
                                              params=params,
                                              folds=folds,
                                              model_type='lgb',
                                              eval_metric='group_mae',
                                              plot_feature_importance=False,
                                              verbose=1000,
                                              early_stopping_rounds=200,
                                              n_estimators=n_estimators,
                                              fold_group=mol_name_t)

    # Attach the layer-1 predictions to the per-type frames so layer 2
    # actually receives them as a feature.
    X_t['oof_fc'] = result_dict_lgb1['oof']
    X_test_t['oof_fc'] = result_dict_lgb1['prediction']

    to_pickle(
        submit_path /
        f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        X_t['oof_fc'])
    to_pickle(
        submit_path /
        f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        X_test_t['oof_fc'])
    to_pickle(
        model_path /
        f"first_model_list_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        result_dict_lgb1["models"])

    ###########################################################################
    # 2nd layer model
    params["seed"] = seed + 3
    params["bagging_seed"] = seed + 4
    params["feature_fraction_seed"] = seed + 5
    params["num_leaves"] = 256  # num_leaves_dict[t]
    start_time = current_time()
    bairitsu = 256 / params["num_leaves"]  # scale factor for the disabled formula below
    n_estimators = 5  #int(15000 * bairitsu)
    if DEBUG:
        n_estimators = 5

    if TRAIN_ALL_DATA:
        print("============= 2nd layer TRIAN ALL DATA ================")
        result_dict = train_lgb_regression_alldata(
            X=X_t,
            X_test=X_test_t,
            y=y_t,
            params=params,
            eval_metric='group_mae',
            plot_feature_importance=True,
            verbose=5000,
            n_estimators=int(n_estimators * 1.6),
            mol_type=type_)
        X_short_test.loc[X_short_test['type'] == type_,
                         'prediction'] = result_dict['prediction']
        X_short_test.to_csv(
            submit_path / f"sub_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")
    elif CV_FOLD:
        print("============= 2nd layer CV ================")
        result_dict = train_model_regression(
            X_t,
            X_test_t,
            y_t,
            params,
            folds,
            model_type='lgb',
            eval_metric='mae',
            columns=None,
            plot_feature_importance=True,
            model=None,
            verbose=1000,
            early_stopping_rounds=200,
            n_estimators=n_estimators,
            mol_type=-1,
            fold_group=mol_name_t)
        result_dict["start_time"] = start_time
        result_dict["n_estimator"] = n_estimators
        result_dict["X_t_len"] = X_t.shape[0]
        result_dict["type"] = type_
        result_dict["type_name"] = type_name[type_]
        X_short.loc[X_short['type'] == type_, 'oof'] = result_dict['oof']
        X_short.to_csv(submit_path /
                       f"oof_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")
        X_short_test.loc[X_short_test['type'] == type_,
                         'prediction'] = result_dict['prediction']
        X_short_test.to_csv(
            submit_path / f"sub_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")
    else:
        print("============= 2nd layer hold out ================")
        result_dict = hold_out_lgb_validation(X=X_t,
                                              y=y_t,
                                              params=params,
                                              eval_metric='mae',
                                              plot_feature_importance=True,
                                              verbose=5000,
                                              early_stopping_rounds=200,
                                              n_estimators=n_estimators)
        result_dict["start_time"] = start_time
        result_dict["n_estimator"] = n_estimators
        result_dict["X_t_len"] = X_t.shape[0]
        result_dict["type"] = type_
        result_dict["type_name"] = type_name[type_]

        # Persist the per-iteration validation L1 curve.
        eval_result: list = result_dict["eval_result"]["valid_1"]["l1"]
        training_log_df: pd.DataFrame = pd.DataFrame(
            eval_result, index=np.arange(len(eval_result)) + 1)
        training_log_df.columns = ["l1"]
        training_log_df.index.name = "iter"
        training_log_df.to_csv(
            log_path / f"train_log_{DATA_VERSION}_{TRIAL_NO}_{type_}.csv")
        to_pickle(
            model_path /
            f"hold_out_model_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
            result_dict["model"])

    if TRAIN_ALL_DATA or CV_FOLD:
        #######################################################################
        # create oof & submission file.
        sub = pd.read_csv(f'../input/sample_submission.csv')
        sub['scalar_coupling_constant'] = X_short_test['prediction']
        sub.to_csv(submit_path /
                   f'submission_t_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                   index=False)
        print(sub.head())
        send_message(f"finish all_data train_{DATA_VERSION}_{TRIAL_NO}_{seed}")

    if CV_FOLD:
        oof_log_mae = group_mean_log_mae(X_short['target'],
                                         X_short['oof'],
                                         X_short['type'],
                                         floor=1e-9)
        print(f"oof_log_mae: {oof_log_mae}")
        df_oof = pd.DataFrame(index=train.id)
        df_oof["scalar_coupling_constant"] = X_short['oof']
        df_oof.to_csv(submit_path /
                      f'oof_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                      index=True)
        send_message(
            f"finish train_{DATA_VERSION}_{TRIAL_NO}_{seed}, oof_log_mae: {oof_log_mae}"
        )
# L1 (Manhattan) distance between the two atom position arrays (ord=1).
train['abs_dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1, ord=1)
dist12('dist_xy', 'x', 'y')
dist12('dist_xz', 'x', 'z')
dist12('dist_yz', 'y', 'z')

# Count of each atom symbol per molecule, one column per symbol
# (combinations that do not occur are filled with 0).
atom_count = structures.groupby(['molecule_name',
                                 'atom']).size().unstack(fill_value=0)
train = pd.merge(train,
                 atom_count,
                 how='left',
                 left_on='molecule_name',
                 right_on='molecule_name')

train = create_features(train)

# Left-join the precomputed feature tables on the row id.
angle_df_train = angle_feature_conv()
train = train.merge(angle_df_train, on="id", how="left")
train = train.merge(train_add, on="id", how="left")
train = train.merge(babel_train, on="id", how="left")
train = train.merge(rdkit_train, on="id", how="left")

# NOTE(review): `ob_charges` is not passed to map_ob_charges below;
# presumably the helper reads it as a module-level name — confirm.
ob_charges = pd.read_csv("../processed/v003/ob_charges_augmented.csv",
                         index_col=0)
train = map_ob_charges(train, 0)
train = map_ob_charges(train, 1)

train = reduce_mem_usage(train)
to_pickle(save_path / f"augmented_train_v001.pkl", train)
print("finished.")
X=X, X_test=X_test, y=mullkan_0, params=params, folds=folds, model_type='lgb', eval_metric='group_mae', plot_feature_importance=False, verbose=500, early_stopping_rounds=200, n_estimators=n_estimators, fold_group=mol_name.values) oof_mullkan_0_train = result_dict_lgb_m0['oof'] oof_mullkan_0_test = result_dict_lgb_m0['prediction'] to_pickle( submit_path / f"train_oof_mullkan_0_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl", oof_mullkan_0_train) to_pickle( submit_path / f"test_oof_mullkan_0_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl", oof_mullkan_0_test) to_pickle( model_path / f"first_model_list_mullkan_0_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl", result_dict_lgb_m0["models"]) ######################################################################################################## # mullken 1 print("=" * 30 + " mullken 1 " + "=" * 30) result_dict_lgb_m1 = train_model_regression( X=X,
categorical_feature=categorical, folds=folds, num_boost_round=30000, verbose_eval = 500, early_stopping_rounds=200, callbacks=callbacks, ) df_ret = pd.DataFrame(ret) display(df_ret) print("finish fitting.") # Retrieving booster and training information. proxy = extraction_cb.boosters_proxy boosters = extraction_cb.raw_boosters best_iteration = extraction_cb.best_iteration to_pickle(model_path/'extraction_cb.pkl', extraction_cb) # Create oof prediction result print("create oof preds.") fold_iter = folds.split(train, y) oof_preds = np.zeros_like(y) for n_fold, ((trn_idx, val_idx), booster) in enumerate(zip(fold_iter, boosters)): print(val_idx) valid = train.iloc[val_idx] oof_preds[val_idx] = booster.predict(valid, num_iteration=best_iteration) print(f"mae on oof preds: {mean_absolute_error(y, oof_preds)}") np.save(submit_path/'oof.npy', oof_preds) # Averaging prediction result for test data. y_pred_proba_list = proxy.predict(test, num_iteration=best_iteration) y_pred_proba_avg = np.array(y_pred_proba_list).mean(axis=0)
# train = train.merge(train_angle_add, on="id", how="left") # test = test.merge(test_angle_add, on="id", how="left") train = train.merge(train_add, on="id", how="left") test = test.merge(test_add, on="id", how="left") train = reduce_mem_usage(train) test = reduce_mem_usage(test) for f in ['atom_1', 'type_0', 'type']: if f in good_columns: lbl = LabelEncoder() lbl.fit(list(train[f].values) + list(test[f].values)) train[f] = lbl.transform(list(train[f].values)) test[f] = lbl.transform(list(test[f].values)) to_pickle(save_path / f"train_v003_{DATA_VERSION}_{TRIAL_NO}.pkl", train) to_pickle(save_path / f"test_v003_{DATA_VERSION}_{TRIAL_NO}.pkl", test) X = train[good_columns].copy() y = train['scalar_coupling_constant'] y_fc = train['fc'] X_test = test[good_columns].copy() # export colnames pd.DataFrame({ "columns": X.columns.tolist() }).to_csv(log_path / f"use_cols.csv") #################################################################################################### # Model Fitting n_fold = 5
folds=folds, model_type='lgb', eval_metric='group_mae', plot_feature_importance=True, verbose=500, early_stopping_rounds=200, n_estimators=n_estimators, fold_group=mol_name_t) importance_path = log_path / f'importance_fc_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.csv' result_dict_lgb1["importance"].to_csv(importance_path, index=True) X_t['oof_fc'] = result_dict_lgb1['oof'] X_test_t['oof_fc'] = result_dict_lgb1['prediction'] to_pickle( submit_path / f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.pkl", X_t['oof_fc']) to_pickle( submit_path / f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.pkl", X_test_t['oof_fc']) params["num_leaves"] = 256 # num_leaves_dict[t] start_time = current_time() bairitsu = 256 / params["num_leaves"] n_estimators = int(15000 * bairitsu) if DEBUG: n_estimators = 5 if TRAIN_ALL_DATA:
# result_dict["start_time"] = start_time # result_dict["n_estimator"] = n_estimators # result_dict["X_t_len"] = X_t.shape[0] # result_dict["type"] = t # result_dict["type_name"] = type_name[t] # score_list += [result_dict["score"]] # # eval_result: list = result_dict["eval_result"]["valid_1"]["l1"] # training_log_df: pd.DataFrame = pd.DataFrame(eval_result, index=np.arange(len(eval_result)) + 1) # training_log_df.columns = ["l1"] # training_log_df.index.name = "iter" # training_log_df.to_csv(log_path / f"train_log_{DATA_VERSION}_{TRIAL_NO}_{t}.csv") # # to_pickle(model_path / f"hold_out_model_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.pkl", result_dict["model"]) to_pickle(log_path / f"result_dict_{t}_{seed}.pkl", result_dict) importance_path = log_path / f'importance_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.csv' result_dict["importance"].to_csv(importance_path, index=True) for t, s in zip(X['type'].unique(), score_list): print(f"type {t}, score: {s:0.5f}") if TRAIN_ALL_DATA or CV_FOLD: ######################################################################################################### # create oof & submission file. sub = pd.read_csv(f'../input/sample_submission.csv') sub['scalar_coupling_constant'] = X_short_test['prediction'] submit_file_path = str( submit_path / f'submission_t_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv') sub.to_csv(submit_file_path, index=False)
rgs = RankGaussScalar() rgs.fit(X_fillna) X_rgs = rgs.transform(X_fillna) del X gc.collect() X_test_rgs = rgs.transform(X_test.fillna(0)) del X_test gc.collect() X_rgs.fillna(0, inplace=True) X_test_rgs.fillna(0, inplace=True) X_rgs["type"] = type_train X_test_rgs["type"] = type_test to_pickle(train_data_path, X_rgs) to_pickle(test_data_path, X_test_rgs) to_pickle(y_data_path, y) to_pickle(mol_name_data_path, mol_name) print("saved files for model train.") #################################################################################################### # Model Fitting print("start fitting") n_fold = 5 if DEBUG: n_fold = 3 if GROUP_K_FOLD: folds = GroupKFold(n_splits=n_fold) else:
folds=folds, num_boost_round=30000, verbose_eval = 500, early_stopping_rounds=500, callbacks=callbacks, ) df_ret = pd.DataFrame(ret) display(df_ret.tail()) print("finish fitting.") # Retrieving booster and training information. proxy = extraction_cb.boosters_proxy boosters = extraction_cb.raw_boosters best_iteration = extraction_cb.best_iteration print(f"best_iteration: {best_iteration}") to_pickle(model_path/f'extraction_cb_{mol_type}.pkl', extraction_cb) to_pickle(model_path/f'boosters.pkl_{mol_type}', boosters) to_pickle(model_path/f'proxy.pkl_{mol_type}', proxy) # Create oof prediction result print("create oof preds.") fold_iter = folds.split(train, y) oof_preds = np.zeros_like(y) for n_fold, ((trn_idx, val_idx), booster) in enumerate(zip(fold_iter, boosters)): print(val_idx) valid = train.iloc[val_idx] oof_preds[val_idx] = booster.predict(valid, num_iteration=best_iteration) print(f"mae on oof preds: {mean_absolute_error(y, oof_preds)}") df_oof = pd.DataFrame(index=train.index) df_oof["scalar_coupling_constant"] = oof_preds
axis=1) ret_list.append(acsf_df) return pd.concat(ret_list, axis=0) print("loading structures") structures = pd.read_csv("../input/structures.csv") molecule_names = np.sort(structures.molecule_name.unique()) gb_structure = structures.groupby("molecule_name") n_split = mp.cpu_count() unit = np.ceil(len(molecule_names) / n_split).astype(int) indexer = [[unit * (i), unit * (i + 1)] for i in range(n_split)] split_mol_names = [] for idx in indexer: split_mol_names.append(molecule_names[idx[0]:idx[1]]) mp_data = [{"mol_names": m} for m in split_mol_names] print("start multiprocessing") num_workers = mp.cpu_count() with mp.Pool(num_workers) as executor: features_chunk = executor.map(get_scsf, mp_data) df = pd.concat(features_chunk) to_pickle("../processed/v003/acsf_feat.pkl", df) #df.to_csv("../processed/v003/acsf_feat.csv") print("finished.")
'molecule_atom_index_1_dist_std_diff', 'molecule_atom_index_0_dist_mean_diff', 'molecule_atom_index_1_dist_mean_div', 'molecule_atom_index_1_dist_min_diff', 'rc_A', 'rc_B', 'rc_C', 'mu', 'alpha', 'h**o', 'lumo', 'gap', 'zpve', 'Cv', 'freqs_min', 'freqs_max', 'freqs_mean', 'mulliken_min', 'mulliken_max', 'mulliken_atom_0', 'mulliken_atom_1', 'dist_C_0_x', 'dist_C_1_x', 'dist_C_2_x', 'dist_C_3_x', 'dist_C_4_x', 'dist_F_0_x', 'dist_F_1_x', 'dist_F_2_x', 'dist_H_0_x', 'dist_H_1_x', 'dist_H_2_x', 'dist_H_3_x', 'dist_H_4_x', 'dist_N_0_x', 'dist_N_1_x', 'dist_N_2_x', 'dist_N_3_x', 'dist_N_4_x', 'dist_O_0_x', 'dist_O_1_x', 'dist_O_2_x', 'dist_O_3_x', 'dist_O_4_x', 'dist_C_0_y', 'dist_C_1_y', 'dist_C_2_y', 'dist_C_3_y', 'dist_C_4_y', 'dist_F_0_y', 'dist_F_1_y', 'dist_F_2_y', 'dist_F_3_y', 'dist_F_4_y', 'dist_H_0_y', 'dist_H_1_y', 'dist_H_2_y', 'dist_H_3_y', 'dist_H_4_y', 'dist_N_0_y', 'dist_N_1_y', 'dist_N_2_y', 'dist_N_3_y', 'dist_N_4_y', 'dist_O_0_y', 'dist_O_1_y', 'dist_O_2_y', 'dist_O_3_y', 'dist_O_4_y', 'distance_closest_0', 'distance_closest_1', 'distance_farthest_0', 'distance_farthest_1', 'cos_c0_c1', 'cos_f0_f1', 'cos_c0_f0', 'cos_c1_f1', 'cos_center0_center1', 'cos_c0', 'cos_c1', 'cos_f0', 'cos_f1', 'cos_center0', 'cos_center1' ] + giba_columns cat_features = ['atom_y'] to_pickle("../processed/v003/train_kernel_plus_more_feats.pkl", train[all_features]) to_pickle("../processed/v003/test_kernel_plus_more_feats.pkl", test[all_features]) print("finished.")
g = unpickle(graph_list[j]) node_df = pd.concat([structure[structure.molecule_name==graph_name][["molecule_name", "atom_index"]].reset_index(drop=True), pd.DataFrame(np.concatenate(g.node, -1), columns=[f"node_{i}" for i in range(13)])], axis=1) node_list += [node_df] return node_list structure = pd.read_csv("../input/structures.csv") graph_list = glob("../input/graph/*.pickle") print(len(graph_list)) n_split = mp.cpu_count() unit = np.ceil(len(graph_list) / n_split).astype(int) indexer = [[unit * (i), unit * (i + 1)] for i in range(n_split)] split_graph_list = [] for idx in indexer: split_graph_list.append(graph_list[idx[0]:idx[1]]) mp_data = [{"graph_list": m} for m in split_graph_list] num_workers = mp.cpu_count() with mp.Pool(num_workers) as executor: features_chunk = executor.map(func, mp_data) concat_list = [] for i in range(len(features_chunk)): concat_list += features_chunk[i] node_df = pd.concat(concat_list, axis=0) to_pickle("../processed/v003/node_df.pkl", node_df)
test = map_ob_charges(test, 1) train = reduce_mem_usage(train) test = reduce_mem_usage(test) for f in ['atom_1', 'type_0', 'type']: if f in use_cols.good_columns: lbl = LabelEncoder() lbl.fit(list(train[f].values) + list(test[f].values)) train[f] = lbl.transform(list(train[f].values)) test[f] = lbl.transform(list(test[f].values)) Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True, exist_ok=True) to_pickle( save_path / f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", train) to_pickle( save_path / f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", test) else: train = unpickle(save_path / f"v003_024/train_concat_v003_024.pkl", ) test = unpickle(save_path / f"v003_024/test_concat_v003_024.pkl", ) ################################################################################################### # add additional feature for trying ################################################################################################### # final data preparation for train X = train[use_cols.good_columns].copy()
axis=1, inplace=True) print("test 1") test = map_atom_info(test, 0) print("test 2") test = map_atom_info(test, 1) print("test 3") test.rename( { "mass_x": "f006:mass_0", "mass_y": "f006:mass_1", "dist_from_origin_x": "f006:dist_from_origin_0", "dist_from_origin_y": "f006:dist_from_origin_1", }, axis=1, inplace=True) Path("../processed/v003").mkdir(parents=True, exist_ok=True) to_pickle( "../processed/v003/train_augmented_006.df.pkl", train[[ "id", "f006:dist_origin_mean", "f006:mass_0", "f006:mass_1", "f006:dist_from_origin_0", "f006:dist_from_origin_1" ]]) # to_pickle("../processed/v004/aug_test_006.df.pkl",test[["id", # "f006:dist_origin_mean", "f006:mass_0", "f006:mass_1", # "f006:dist_from_origin_0", "f006:dist_from_origin_1"]]) print("finished")
plot_feature_importance=True, verbose=500, early_stopping_rounds=200, n_estimators=n_estimators, fold_group=mol_name.values, phase_mark="_fc") importance_path = log_path / f'importance_fc_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv' result_dict_lgb1["importance"].to_csv(importance_path, index=True) ######################################################################################################## X['oof_fc'] = result_dict_lgb1['oof'] X_test['oof_fc'] = result_dict_lgb1['prediction'] to_pickle( submit_path / f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl", X['oof_fc']) to_pickle( submit_path / f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl", X_test['oof_fc']) to_pickle( model_path / f"first_model_list_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl", result_dict_lgb1["models"]) else: X['oof_fc'] = unpickle( submit_path / f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{current_seed}.pkl") X_test['oof_fc'] = unpickle( submit_path / f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{current_seed}.pkl")
folds=folds, num_boost_round=30000, verbose_eval=500, early_stopping_rounds=500, callbacks=callbacks, ) df_ret = pd.DataFrame(ret) display(df_ret.tail()) print("finish fitting.") # Retrieving booster and training information. proxy = extraction_cb.boosters_proxy boosters = extraction_cb.raw_boosters best_iteration = extraction_cb.best_iteration print(f"best_iteration: {best_iteration}") to_pickle(model_path / 'extraction_cb.pkl', extraction_cb) to_pickle(model_path / 'boosters.pkl', boosters) to_pickle(model_path / 'proxy.pkl', proxy) # Create oof prediction result print("create oof preds.") fold_iter = folds.split(train, y) oof_preds = np.zeros_like(y) for n_fold, ((trn_idx, val_idx), booster) in enumerate(zip(fold_iter, boosters)): print(val_idx) valid = train.iloc[val_idx] oof_preds[val_idx] = booster.predict(valid, num_iteration=best_iteration) print(f"mae on oof preds: {mean_absolute_error(y, oof_preds)}") df_oof = pd.DataFrame(index=train.index)
# Apply map_ob_charges for atom slot 0 and slot 1 of each frame.
train = map_ob_charges(train, 0)
train = map_ob_charges(train, 1)
test = map_ob_charges(test, 0)
test = map_ob_charges(test, 1)

train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# Label-encode the categorical columns that are actually used as features.
# The encoder is fitted on train + test values together so both frames share
# one mapping.
for f in ['atom_1', 'type_0', 'type']:
    if f in good_columns:
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

# Persist the fully merged frames for this data version / trial.
to_pickle(save_path / f"train_concat_v003_{DATA_VERSION}_{TRIAL_NO}.pkl",
          train)
to_pickle(save_path / f"test_concat_v003_{DATA_VERSION}_{TRIAL_NO}.pkl", test)

# Feature matrices and the two targets (coupling constant and `fc`).
X = train[good_columns].copy()
y = train['scalar_coupling_constant']
y_fc = train['fc']
X_test = test[good_columns].copy()

# export colnames
pd.DataFrame({
    "columns": X.columns.tolist()
}).to_csv(log_path / f"use_cols.csv")

###############################################################################
# Model Fitting
n_fold = 5
def train_model_regression(X,
                           X_test,
                           y,
                           params,
                           folds,
                           model_type='lgb',
                           eval_metric='mae',
                           columns=None,
                           plot_feature_importance=False,
                           model=None,
                           verbose=10000,
                           early_stopping_rounds=200,
                           n_estimators=50000,
                           mol_type=-1,
                           fold_group=None,
                           skip_folds=None,
                           phase_mark="",
                           skipped_mark=()):
    """
    A function to train a variety of regression models with cross-validation.

    Each fold's oof array, test prediction, model and importance frame are
    checkpointed under ``mid_path`` so that a later run can skip that fold
    and reload the checkpoint instead of retraining.

    :params: X - training data (pd.DataFrame)
    :params: X_test - test data (pd.DataFrame)
    :params: y - target
    :params: params - model hyper-parameters; not modified by this function
    :params: folds - splitter exposing ``split(X, groups=...)`` and ``n_splits``
    :params: model_type - one of 'lgb', 'xgb', 'sklearn', 'cat'
    :params: eval_metric - one of 'mae', 'group_mae', 'mse'
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to collect LGB feature importance
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - logging period passed to the boosting libraries
    :params: early_stopping_rounds - early stopping patience for LGB
    :params: n_estimators - number of boosting rounds for LGB
    :params: mol_type - unused here; kept for call compatibility
    :params: fold_group - group labels forwarded to ``folds.split``
    :params: skip_folds - list of 0-based fold numbers to reload, or None
    :params: phase_mark - suffix that distinguishes checkpoint files per phase
    :params: skipped_mark - phase marks for which ``skip_folds`` applies
    :returns: dict with keys 'models', 'oof', 'prediction', 'scores' and,
        for LGB with ``plot_feature_importance``, 'importance'
    """
    assert isinstance(skip_folds, list) or skip_folds is None
    print(f"skip_folds :{skip_folds}")

    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {
        'mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'sklearn_scoring_function': metrics.mean_absolute_error
        },
        'group_mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'scoring_function': group_mean_log_mae
        },
        'mse': {
            'lgb_metric_name': 'mse',
            'catboost_metric_name': 'MSE',
            'sklearn_scoring_function': metrics.mean_squared_error
        }
    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on test data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    model_list = []

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(
            folds.split(X, groups=fold_group)):

        if (skip_folds is not None and fold_n in skip_folds
                and phase_mark in skipped_mark):
            # Resume from the checkpoint written by an earlier run. The saved
            # oof array is cumulative up to that fold, so it replaces `oof`.
            print(f'Fold {fold_n + 1} is skipped!!! at {time.ctime()}')
            oof = unpickle(mid_path / f"oof_cv{phase_mark}_{fold_n}.pkl", )
            y_pred = unpickle(
                mid_path / f"prediction_cv{phase_mark}_{fold_n}.pkl", )
            model = unpickle(mid_path / f"model_cv{phase_mark}_{fold_n}.pkl", )
            fold_importance = unpickle(
                mid_path / f"importance_cv{phase_mark}_{fold_n}.pkl", )
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
            prediction += y_pred
            model_list += [model]
            continue

        print(f'Fold {fold_n + 1} started at {time.ctime()}')

        # Always bound, so the checkpoint below can be written even when
        # importance is not collected for this fold.
        fold_importance = pd.DataFrame()

        if isinstance(X, np.ndarray):
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[
                valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params,
                                      n_estimators=n_estimators,
                                      n_jobs=-1,
                                      importance_type='gain')
            print(model)
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train,
                                     label=y_train,
                                     feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid,
                                     label=y_valid,
                                     feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            # Work on a copy so the caller's params dict is left untouched.
            xgb_params = dict(params)
            xgb_params["objective"] = "reg:linear"
            xgb_params["eval_metric"] = metrics_dict[eval_metric][
                'lgb_metric_name']
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=200,
                              verbose_eval=verbose,
                              params=xgb_params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid,
                                                     feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test,
                                               feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](
                y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            model = CatBoostRegressor(
                iterations=20000,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
                loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        if eval_metric != 'group_mae':
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](
                y_valid, y_pred_valid))
        else:
            # The grouped metric is evaluated per value of the 'type' column.
            scores.append(metrics_dict[eval_metric]['scoring_function'](
                y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1

            try:
                fold_importance.to_csv(mid_path /
                                       f"importance_cv_{fold_n}.csv")
            except Exception as e:
                print("failed to save importance...")
                print(e)

            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

        model_list += [model]

        # Best-effort checkpoint: a failed save must not abort training.
        try:
            to_pickle(mid_path / f"oof_cv{phase_mark}_{fold_n}.pkl", oof)
            to_pickle(mid_path / f"prediction_cv{phase_mark}_{fold_n}.pkl",
                      y_pred)
            to_pickle(mid_path / f"model_cv{phase_mark}_{fold_n}.pkl", model)
            to_pickle(mid_path / f"importance_cv{phase_mark}_{fold_n}.pkl",
                      fold_importance)
        except Exception as e:
            print("failed to save intermediate data...")
            print(e)

    if model_type == 'lgb' and plot_feature_importance:
        result_dict['importance'] = feature_importance

    prediction /= folds.n_splits

    # Best-effort reporting: notification problems must not lose the results.
    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO}' + ' CV mean score: {0:.4f}, std: {1:.4f}.'.format(
            np.mean(scores), np.std(scores))
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)

    result_dict["models"] = model_list
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    return result_dict
def _add_pair_distance_features(df):
    """Add atom-pair distance columns to *df* in place and return it.

    Reads the coordinate columns ``x_0, y_0, z_0, x_1, y_1, z_1`` and the
    ``type`` column, and writes ``dist``, ``dist_x``, ``dist_y``, ``dist_z``,
    ``type_0`` and ``abs_dist`` (in that order).
    """
    p_0 = df[['x_0', 'y_0', 'z_0']].values
    p_1 = df[['x_1', 'y_1', 'z_1']].values

    df['dist'] = np.linalg.norm(p_0 - p_1, axis=1)
    for axis in ('x', 'y', 'z'):
        # Squared per-axis offset (no square root is taken).
        df[f'dist_{axis}'] = (df[f'{axis}_0'] - df[f'{axis}_1'])**2
    # First element of each ``type`` value.
    df['type_0'] = df['type'].apply(lambda x: x[0])
    # ord=1 -> sum of absolute coordinate differences.
    df['abs_dist'] = np.linalg.norm(p_0 - p_1, axis=1, ord=1)
    return df


def _load_prev_concat_data(prev_data_version, prev_trial_no):
    """Load the ``*_basic`` train/test pickles written by an earlier trial.

    When the module-level ``DEBUG`` flag is truthy, a cached pair of sampled
    pickles is returned if both files exist. Otherwise the full pickles are
    loaded and, under ``DEBUG``, down-sampled and cached next to them.

    Returns:
        tuple: ``(train, test)``.
    """
    prev_folder = f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"
    stem = f"concat_{prev_data_version}_{prev_trial_no}_basic"
    train_sampled = f"{prev_folder}/train_{stem}_sampled.pkl"
    test_sampled = f"{prev_folder}/test_{stem}_sampled.pkl"

    if DEBUG:
        train_path = Path(train_sampled)
        test_path = Path(test_sampled)
        if train_path.exists() and test_path.exists():
            print("sample loading")
            train = unpickle(train_path)
            test = unpickle(test_path)
            print("sample load finish")
            return train, test

    print("loading previous dataest")
    print("train loading")
    train = unpickle(f"{prev_folder}/train_{stem}.pkl")
    assert "scalar_coupling_constant" in train.columns
    print("test loading")
    test = unpickle(f"{prev_folder}/test_{stem}.pkl")
    print("loading finished")

    if DEBUG:
        n_sample = 5000
        print(f"sampling {n_sample} rows.")
        # DataFrame.sample raises when n exceeds the frame length (without
        # replacement), so cap the request for small frames.
        train = train.sample(n=min(n_sample, len(train)))
        test = test.sample(n=min(n_sample, len(test)))
        Path(prev_folder).mkdir(parents=True, exist_ok=True)
        to_pickle(train_sampled, train)
        to_pickle(test_sampled, test)
        print("saved.")

    return train, test


def get_train_test_data(use_prev=False, prev_data_version=None, prev_trial_no=None):
    """Build, or reload, the merged train/test feature frames.

    With ``use_prev=False`` the raw CSVs under ``../input`` are read, the
    precomputed feature files under ``save_path`` are merged in, selected
    categorical columns are label-encoded, and both frames are pickled to
    ``save_path / f"{DATA_VERSION}_{TRIAL_NO}"``.

    With ``use_prev=True`` the frames pickled by a previous trial are loaded
    instead; ``train.csv`` is not read on this path.

    Args:
        use_prev: Reload a previous trial's pickles instead of rebuilding.
        prev_data_version: Data version of the previous trial; required when
            ``use_prev`` is true.
        prev_trial_no: Trial number of the previous trial; required when
            ``use_prev`` is true.

    Returns:
        tuple: ``(train, test)``.

    Raises:
        AssertionError: If ``use_prev`` is true and either ``prev_*`` argument
            is ``None``, or the reloaded train frame lacks
            ``scalar_coupling_constant``.

    Side effects:
        On the rebuild path, ``use_cols.good_columns`` is extended with the
        non-``id`` rdkit columns. Columns already present are skipped, so
        repeated calls do not duplicate feature names.
    """
    if use_prev:
        assert prev_data_version is not None, \
            "prev_data_version is required when use_prev=True"
        assert prev_trial_no is not None, \
            "prev_trial_no is required when use_prev=True"
        return _load_prev_concat_data(prev_data_version, prev_trial_no)

    # ------------------------------------------------------------------
    # Load every input up front so a missing file fails before the
    # expensive feature engineering starts.
    file_folder = '../input'
    train = pd.read_csv(f'{file_folder}/train.csv')
    test = pd.read_csv(f'{file_folder}/test.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(
        f'{file_folder}/scalar_coupling_contributions.csv')

    train_add = unpickle(save_path / "train_006.df.pkl")
    test_add = unpickle(save_path / "test_006.df.pkl")

    babel_train = pd.read_csv(save_path / "babel_train.csv", usecols=use_cols.babel_cols)
    babel_test = pd.read_csv(save_path / "babel_test.csv", usecols=use_cols.babel_cols)

    # Register the rdkit feature columns once; skipping names that are
    # already listed keeps this call idempotent.
    known = set(use_cols.good_columns)
    use_cols.good_columns += [
        c for c in use_cols.rdkit_cols if c != 'id' and c not in known
    ]
    rdkit_train = pd.read_csv(save_path / "rdkit_train.csv", usecols=use_cols.rdkit_cols)
    rdkit_test = pd.read_csv(save_path / "rdkit_test.csv", usecols=use_cols.rdkit_cols)

    coulomb_train = pd.read_csv(save_path / "coulomb_interaction_train.csv")
    coulomb_test = pd.read_csv(save_path / "coulomb_interaction_test.csv")

    bond_calc_train = unpickle(save_path / "bond_calc_feat_train.pkl")
    bond_calc_test = unpickle(save_path / "bond_calc_feat_test.pkl")

    ob_charges = pd.read_csv(save_path / "ob_charges.csv", index_col=0)
    tda_radius_df = pd.read_csv(save_path / "tda_radius_df.csv", index_col=0)
    tda_radius_df_03 = pd.read_csv(save_path / "tda_radius_df_v003.csv", index_col=0)
    pca_feat = unpickle(save_path / "pca_feat_df.pkl")

    ####################################################################################################
    # Feature Engineering
    pair_keys = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type']
    train = pd.merge(train, scalar_coupling_contributions, how='left',
                     left_on=pair_keys, right_on=pair_keys)

    train = map_atom_info(train, 0, structures)
    train = map_atom_info(train, 1, structures)
    test = map_atom_info(test, 0, structures)
    test = map_atom_info(test, 1, structures)

    train = _add_pair_distance_features(train)
    test = _add_pair_distance_features(test)

    # NOTE(review): dist12 is given neither frame. Confirm it updates the
    # `train`/`test` built in this function and not module-level globals.
    dist12('dist_xy', 'x', 'y')
    dist12('dist_xz', 'x', 'z')
    dist12('dist_yz', 'y', 'z')

    # Per-molecule count of each atom symbol, one column per symbol.
    atom_count = structures.groupby(['molecule_name', 'atom']).size().unstack(fill_value=0)
    train = pd.merge(train, atom_count, how='left',
                     left_on='molecule_name', right_on='molecule_name')
    test = pd.merge(test, atom_count, how='left',
                    left_on='molecule_name', right_on='molecule_name')

    train = create_features(train)
    test = create_features(test)

    angle_df_train, angle_df_test = angle_feature_conv(structures)

    # (train-side frame, test-side frame, join key), applied in this order.
    merge_plan = [
        (angle_df_train, angle_df_test, "id"),
        (train_add, test_add, "id"),
        (babel_train, babel_test, "id"),
        (rdkit_train, rdkit_test, "id"),
        (coulomb_train, coulomb_test, "id"),
        (bond_calc_train, bond_calc_test, "id"),
        (tda_radius_df, tda_radius_df, "molecule_name"),
        (tda_radius_df_03, tda_radius_df_03, "molecule_name"),
        (pca_feat, pca_feat, "molecule_name"),
    ]
    for train_extra, test_extra, key in merge_plan:
        train = train.merge(train_extra, on=key, how="left")
        test = test.merge(test_extra, on=key, how="left")

    train = map_ob_charges(train, ob_charges, 0)
    train = map_ob_charges(train, ob_charges, 1)
    test = map_ob_charges(test, ob_charges, 0)
    test = map_ob_charges(test, ob_charges, 1)

    train = reduce_mem_usage(train)
    test = reduce_mem_usage(test)

    # Label-encode only the categorical columns used as features. Each
    # encoder is fitted on train and test values together so both frames
    # share one mapping.
    for col in ('atom_1', 'type_0', 'type'):
        if col not in use_cols.good_columns:
            continue
        train_values = list(train[col].values)
        test_values = list(test[col].values)
        encoder = LabelEncoder()
        encoder.fit(train_values + test_values)
        train[col] = encoder.transform(train_values)
        test[col] = encoder.transform(test_values)

    # Persist the merged frames for later trials.
    out_dir = Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}")
    out_dir.mkdir(parents=True, exist_ok=True)
    to_pickle(out_dir / f"train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", train)
    to_pickle(out_dir / f"test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", test)

    return train, test
# Join the angle features and the cosine features onto both frames by "id".
angle_df_train, angle_df_test = angle_feature_conv()
train = train.merge(angle_df_train, on="id", how="left")
train = train.merge(train_cos, on="id", how="left")
test = test.merge(angle_df_test, on="id", how="left")
test = test.merge(test_cos, on="id", how="left")

train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# Label-encode the categorical columns that are used as features. Each
# encoder is fitted on train and test values together so both frames share
# one mapping.
for f in ('atom_1', 'type_0', 'type'):
    if f not in good_columns:
        continue
    lbl = LabelEncoder()
    lbl.fit(list(train[f].values) + list(test[f].values))
    train[f] = lbl.transform(list(train[f].values))
    test[f] = lbl.transform(list(test[f].values))

# Persist the merged frames.
to_pickle(save_path / "train_v003.pkl", train)
to_pickle(save_path / "test_v003.pkl", test)

# Feature matrices and targets.
X = train[good_columns].copy()
y = train['scalar_coupling_constant']
y_fc = train['fc']
X_test = test[good_columns].copy()

# export colnames
pd.DataFrame({"columns": X.columns.tolist()}).to_csv(log_path / "use_cols.csv")

####################################################################################################
# Model Fitting
n_fold = 5
folds = KFold(
    n_splits=n_fold,
    shuffle=True,
    random_state=11,
)