Exemplo n.º 1
0
def get_prev_train_test_data(prev_data_version=None, prev_trial_no=None):
    """Load the train/test frames written by a previous feature trial.

    When the module-level ``DEBUG`` flag is set, a cached down-sampled copy
    is loaded if both sampled pickles exist; otherwise the full frames are
    loaded, down-sampled, and the sample is cached for the next debug run.

    :param prev_data_version: data version tag used in the folder/file names.
    :param prev_trial_no: trial number tag used in the folder/file names.
    :return: tuple ``(train, test)``.
    :raises AssertionError: if the full train frame lacks the
        ``scalar_coupling_constant`` column.
    """
    tag = f"{prev_data_version}_{prev_trial_no}"
    prev_folder = f"../processed/{prev_data_version}/{tag}"

    # Build every path once so the load and save sides cannot drift apart.
    train_full = f"{prev_folder}/train_concat_{tag}_yiemon_123J.pkl"
    test_full = f"{prev_folder}/test_concat_{tag}_yiemon_123J.pkl"
    train_sampled = f"{prev_folder}/train_concat_{tag}_yiemon_123J_sampled.pkl"
    test_sampled = f"{prev_folder}/test_concat_{tag}_yiemon_123J_sampled.pkl"

    if DEBUG:
        train_path = Path(train_sampled)
        test_path = Path(test_sampled)
        if train_path.exists() and test_path.exists():
            print("sample loading")
            train = unpickle(train_path)
            test = unpickle(test_path)
            print("sample load finish")
            return train, test

    # NOTE: the raw ``../input/train.csv`` used to be read here and was then
    # always overwritten by the pickled frame; that dead read was removed.
    print(f"loading previous dataest")
    print("train loading")
    train: pd.DataFrame = unpickle(train_full)
    assert "scalar_coupling_constant" in train.columns
    print("test loading")
    test: pd.DataFrame = unpickle(test_full)
    print(f"loading finished")

    if DEBUG:
        n_sample = 5000
        print(f"sampling {n_sample} rows.")
        # Cap at the frame length: DataFrame.sample raises when n exceeds it.
        train = train.sample(n=min(n_sample, len(train)))
        test = test.sample(n=min(n_sample, len(test)))
        Path(prev_folder).mkdir(parents=True, exist_ok=True)
        to_pickle(train_sampled, train)
        to_pickle(test_sampled, test)
        print("saved.")

    return train, test
Exemplo n.º 2
0
def feat(data, num=5):
    """Build per-atom distance features for a batch of molecules and save them.

    For every molecule in ``data["mols"]`` the helper ``get_dist_mat`` is
    called; its results are stacked into one frame with the columns
    ``molecule_name``, ``atom_index`` and ``dist_<atom>_<k>`` for each atom in
    H, C, N, O, F and ``k`` in ``range(num)``. The frame is pickled under
    ``coulomb_feat`` and the molecule list is saved as ``.npy`` in the
    current working directory. Nothing is returned.

    :param data: dict with keys ``"number"`` (batch id used in the output file
        name), ``"df"`` (frame with a ``molecule_name`` column) and ``"mols"``
        (molecule names to process).
    :param num: number of distance columns generated per atom type.
    :raises KeyError: if ``data`` lacks one of the required keys.
    """
    # Unpack before the try block so the diagnostics in the handler can always
    # reference ``mols`` and ``df`` without masking the real error.
    number = data["number"]
    df = data["df"]
    mols = data["mols"]
    try:
        start = time.time()

        mp_data = [{"mol": mol, "df": df} for mol in mols]
        dist_mats = [get_dist_mat(d) for d in mp_data]

        # Each result is indexed as (molecule names, atom indices, distances).
        molecule_names = np.hstack([x[0] for x in dist_mats])
        atoms_idx = np.hstack([x[1] for x in dist_mats])
        dist_mat = np.vstack([x[2] for x in dist_mats])

        atoms = ['H', 'C', 'N', 'O', 'F']
        col_name_list = [
            'dist_{}_{}'.format(a, n) for a in atoms for n in range(num)
        ]

        se_mole = pd.Series(molecule_names, name='molecule_name')
        se_atom_idx = pd.Series(atoms_idx, name='atom_index')
        df_dist = pd.DataFrame(dist_mat, columns=col_name_list)
        df_distance = pd.concat([se_mole, se_atom_idx, df_dist], axis=1)

        elapsed_time = time.time() - start
        print("elapsed_time:{0:.2f}".format(elapsed_time) + "[sec]")

        # The first/last molecule ids of the batch make the file names unique.
        first_mol_name = df.molecule_name.iloc[0].replace("dsgdb9nsd_", "")
        last_mol_name = df.molecule_name.iloc[-1].replace("dsgdb9nsd_", "")

        np.save(f"mols_{first_mol_name}_{last_mol_name}.npy", mols)

        to_pickle(
            coulomb_feat /
            f"coulomb_train_{number}_{first_mol_name}_{last_mol_name}.pkl",
            df_distance)
    except Exception as e:
        # Dump enough context to identify the failing batch, then re-raise
        # with the original traceback.
        print(e)
        print(mols[:5])
        print(mols[-5:])
        display(df.head())
        display(df.tail())
        raise
Exemplo n.º 3
0
# Standardize every column after the first two; the first two columns of
# ``df`` are re-attached unchanged to the result at the end.
ss = StandardScaler()
df_ss = pd.DataFrame(ss.fit_transform(df.iloc[:, 2:]), columns=df.columns[2:])

decomp_cols = []
comp_results = []
# t-SNE is excluded on purpose. The names and the transformers are kept the
# same length so that ``zip`` no longer has to drop a trailing entry silently.
comp_names = ["fa", "pca", "tsvd", "ica", "grp", "srp", "mbkm"]
transformers = [fa, pca, tsvd, ica, grp, srp, mbkm]
for name, transform in zip(comp_names, transformers):
    print(current_time(), "{} converting...".format(name), flush=True)
    # The clustering transformer has its own output width.
    n_components = num_clusters2 if name == 'mbkm' else N_COMP
    df_results = pd.DataFrame(transform.fit_transform(df_ss))
    decomp_col = ["{0}_{1:02d}".format(name, i) for i in range(n_components)]
    # Raises if the transformer output width differs from n_components.
    df_results.columns = decomp_col
    decomp_cols.extend(decomp_col)
    comp_results.append(df_results)

comp_results_df = pd.concat(comp_results, axis=1)
comp_results_df = pd.concat(
    [
        df.iloc[:, :2].reset_index(drop=True),
        comp_results_df.reset_index(drop=True),
    ],
    axis=1)
to_pickle(f"../processed/v003/comp_results_df_{N_COMP}.pkl", comp_results_df)
Exemplo n.º 4
0
    # Attach the RDKit feature frames by pair id.
    train = train.merge(rdkit_train, on="id", how="left")
    test = test.merge(rdkit_test, on="id", how="left")

    # NOTE(review): ``ob_charges`` is not passed explicitly; presumably
    # ``map_ob_charges`` reads it as a module-level name — confirm.
    ob_charges = pd.read_csv("../processed/v003/ob_charges.csv", index_col=0)
    train = map_ob_charges(train, 0)
    train = map_ob_charges(train, 1)
    test = map_ob_charges(test, 0)
    test = map_ob_charges(test, 1)

    train = reduce_mem_usage(train)
    test = reduce_mem_usage(test)

    Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True,
                                                         exist_ok=True)
    # The train frame used to be written to the ``test_concat_...`` path and
    # was then overwritten by the test frame, so it was never persisted.
    to_pickle(
        save_path /
        f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
        train)
    to_pickle(
        save_path /
        f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
        test)
else:
    # Reuse the frames pickled under the fixed trial tag v003_010.
    train = unpickle(save_path / f"v003_010/train_concat_v003_010.pkl", )
    test = unpickle(save_path / f"v003_010/test_concat_v003_010.pkl", )

if AUGMENT:
    # Append the augmented rows to the train frame. The two "Unnamed: 0_*"
    # columns are dropped first.
    augmented_train = unpickle(save_path / f"augmented_train_v001.pkl", ).drop(
        ["Unnamed: 0_x", "Unnamed: 0_y"], axis=1)
    # Shift the augmented ids past the largest existing id so they stay unique.
    augmented_train['id'] += train["id"].max() + 1
    train = pd.concat([train, augmented_train], axis=0).reset_index(drop=True)
Exemplo n.º 5
0
def train_main(seed, type_):
    """Train the two-stage LightGBM pipeline for one coupling type and seed.

    Stage 1 fits a cross-validated model on the ``fc`` target and stores its
    out-of-fold / test predictions as the ``oof_fc`` column. Stage 2 fits the
    model for ``scalar_coupling_constant`` in one of three modes selected by
    module-level flags: ``TRAIN_ALL_DATA``, ``CV_FOLD``, or (otherwise) a
    hold-out validation.

    The function reads module-level data (``X``, ``X_test``, ``y``, ``y_fc``,
    ``mol_name``, ``folds``, ``train``, ``type_name``), flags (``DEBUG``,
    ``GROUP_K_FOLD``, ``TRAIN_ALL_DATA``, ``CV_FOLD``) and output locations
    (``submit_path``, ``model_path``, ``log_path``). All results are written
    to disk; nothing is returned.

    :param seed: base random seed; ``seed`` .. ``seed + 5`` are used for the
        model, bagging and feature-fraction seeds of the two stages.
    :param type_: value of the ``type`` column selecting the rows to train on.
    """
    print(f"==================== seed: {seed} ====================")
    params = { #'num_leaves': 128,
              'min_child_samples': 79,
              'objective': 'regression',
              'max_depth': -1, #9,
              'learning_rate': 0.2,
              "boosting_type": "gbdt",
              "subsample_freq": 1,
              "subsample": 0.9,
              "metric": 'mae',
              "verbosity": -1,
              'reg_alpha': 0.1,
              'reg_lambda': 0.3,
              'colsample_bytree': 1.0,
              'num_threads' : -1,
             }

    params["seed"] = seed
    params["bagging_seed"] = seed + 1
    params["feature_fraction_seed"] = seed + 2

    n_estimators = 5  #10000
    params["num_leaves"] = 256
    if DEBUG:
        n_estimators = 5

    # Compact frames carrying the targets / predictions for all rows.
    X_short = pd.DataFrame({
        'ind': list(X.index),
        'type': X['type'].values,
        'oof': [0] * len(X),
        'target': y.values,
        'fc': y_fc.values
    })

    X_short_test = pd.DataFrame({
        'ind': list(X_test.index),
        'type': X_test['type'].values,
        'prediction': [0] * len(X_test)
    })

    print(f'{current_time()} Training of type {type_} / {X["type"].unique()}')
    # Copies, so that adding the ``oof_fc`` column below neither touches the
    # shared frames nor triggers pandas' chained-assignment warning.
    X_t = X.loc[X['type'] == type_].copy()
    X_test_t = X_test.loc[X_test['type'] == type_].copy()
    y_fc_t = X_short.loc[X_short['type'] == type_, 'fc']
    y_t = X_short.loc[X_short['type'] == type_, 'target']
    mol_name_t = mol_name.loc[X['type'] == type_][
        X_t.index] if GROUP_K_FOLD else None
    print(
        f"X_t.shape: {X_t.shape}, X_test_t.shape: {X_test_t.shape}, y_t.shape: {y_t.shape}"
    )

    ########################################################################################################
    # fc
    print("=" * 30 + " fc " + "=" * 30)
    # The fold groups must line up with the per-type rows of ``X_t``; the
    # full-length ``mol_name.values`` does not when several types are present.
    result_dict_lgb1 = train_model_regression(X=X_t,
                                              X_test=X_test_t,
                                              y=y_fc_t,
                                              params=params,
                                              folds=folds,
                                              model_type='lgb',
                                              eval_metric='group_mae',
                                              plot_feature_importance=False,
                                              verbose=1000,
                                              early_stopping_rounds=200,
                                              n_estimators=n_estimators,
                                              fold_group=mol_name_t)

    # Stage-1 predictions only cover the rows of this type, so they go onto
    # the per-type frames that stage 2 trains on, not onto ``X`` / ``X_test``.
    X_t['oof_fc'] = result_dict_lgb1['oof']
    X_test_t['oof_fc'] = result_dict_lgb1['prediction']

    to_pickle(
        submit_path /
        f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        X_t['oof_fc'])
    to_pickle(
        submit_path /
        f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        X_test_t['oof_fc'])
    to_pickle(
        model_path /
        f"first_model_list_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        result_dict_lgb1["models"])

    #########################################################################################################
    # 2nd layer model
    params["seed"] = seed + 3
    params["bagging_seed"] = seed + 4
    params["feature_fraction_seed"] = seed + 5
    params["num_leaves"] = 256  # num_leaves_dict[t]
    start_time = current_time()
    bairitsu = 256 / params["num_leaves"]
    n_estimators = 5  #int(15000 * bairitsu)

    if DEBUG:
        n_estimators = 5

    if TRAIN_ALL_DATA:
        print("============= 2nd layer TRIAN ALL DATA ================")
        result_dict = train_lgb_regression_alldata(
            X=X_t,
            X_test=X_test_t,
            y=y_t,
            params=params,
            eval_metric='group_mae',
            plot_feature_importance=True,
            verbose=5000,
            n_estimators=int(n_estimators * 1.6),
            mol_type=type_)

        X_short_test.loc[X_short_test['type'] == type_,
                         'prediction'] = result_dict['prediction']
        X_short_test.to_csv(
            submit_path / f"sub_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")

    elif CV_FOLD:
        print("============= 2nd layer CV ================")
        result_dict = train_model_regression(X_t,
                                             X_test_t,
                                             y_t,
                                             params,
                                             folds,
                                             model_type='lgb',
                                             eval_metric='mae',
                                             columns=None,
                                             plot_feature_importance=True,
                                             model=None,
                                             verbose=1000,
                                             early_stopping_rounds=200,
                                             n_estimators=n_estimators,
                                             mol_type=-1,
                                             fold_group=mol_name_t)

        result_dict["start_time"] = start_time
        result_dict["n_estimator"] = n_estimators
        result_dict["X_t_len"] = X_t.shape[0]
        result_dict["type"] = type_
        result_dict["type_name"] = type_name[type_]

        X_short.loc[X_short['type'] == type_, 'oof'] = result_dict['oof']
        X_short.to_csv(submit_path /
                       f"oof_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")

        X_short_test.loc[X_short_test['type'] == type_,
                         'prediction'] = result_dict['prediction']
        X_short_test.to_csv(
            submit_path / f"sub_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")

    else:
        print("============= 2nd layer hold out ================")
        result_dict = hold_out_lgb_validation(X=X_t,
                                              y=y_t,
                                              params=params,
                                              eval_metric='mae',
                                              plot_feature_importance=True,
                                              verbose=5000,
                                              early_stopping_rounds=200,
                                              n_estimators=n_estimators)

        result_dict["start_time"] = start_time
        result_dict["n_estimator"] = n_estimators
        result_dict["X_t_len"] = X_t.shape[0]
        result_dict["type"] = type_
        result_dict["type_name"] = type_name[type_]

        # Persist the per-iteration validation L1 curve (1-based iteration).
        eval_result: list = result_dict["eval_result"]["valid_1"]["l1"]
        training_log_df: pd.DataFrame = pd.DataFrame(
            eval_result, index=np.arange(len(eval_result)) + 1)
        training_log_df.columns = ["l1"]
        training_log_df.index.name = "iter"
        training_log_df.to_csv(
            log_path / f"train_log_{DATA_VERSION}_{TRIAL_NO}_{type_}.csv")

        to_pickle(
            model_path /
            f"hold_out_model_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
            result_dict["model"])

    if TRAIN_ALL_DATA or CV_FOLD:
        #########################################################################################################
        # create oof & submission file.
        sub = pd.read_csv(f'../input/sample_submission.csv')
        sub['scalar_coupling_constant'] = X_short_test['prediction']
        sub.to_csv(submit_path /
                   f'submission_t_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                   index=False)
        print(sub.head())
        send_message(f"finish all_data train_{DATA_VERSION}_{TRIAL_NO}_{seed}")

    if CV_FOLD:
        oof_log_mae = group_mean_log_mae(X_short['target'],
                                         X_short['oof'],
                                         X_short['type'],
                                         floor=1e-9)
        print(f"oof_log_mae: {oof_log_mae}")

        # NOTE(review): this assignment aligns on index labels (``train.id``
        # versus the default integer index of ``X_short``) — confirm the ids
        # match those labels.
        df_oof = pd.DataFrame(index=train.id)
        df_oof["scalar_coupling_constant"] = X_short['oof']
        df_oof.to_csv(submit_path /
                      f'oof_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                      index=True)
        send_message(
            f"finish train_{DATA_VERSION}_{TRIAL_NO}_{seed}, oof_log_mae: {oof_log_mae}"
        )
Exemplo n.º 6
0
# Row-wise 1-norm (sum of absolute differences) between the two positions.
train['abs_dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1, ord=1)

# Pairwise-axis distance columns.
for dist_col, first_axis, second_axis in (
        ('dist_xy', 'x', 'y'),
        ('dist_xz', 'x', 'z'),
        ('dist_yz', 'y', 'z'),
):
    dist12(dist_col, first_axis, second_axis)

# Number of atoms of each element per molecule (one column per element).
atom_count = (
    structures
    .groupby(['molecule_name', 'atom'])
    .size()
    .unstack(fill_value=0)
)
train = train.merge(atom_count,
                    how='left',
                    left_on='molecule_name',
                    right_on='molecule_name')

train = create_features(train)

# Left-join every additional feature frame on the pair id, in fixed order.
angle_df_train = angle_feature_conv()
for extra_features in (angle_df_train, train_add, babel_train, rdkit_train):
    train = train.merge(extra_features, on="id", how="left")

ob_charges = pd.read_csv("../processed/v003/ob_charges_augmented.csv",
                         index_col=0)
for atom_position in (0, 1):
    train = map_ob_charges(train, atom_position)

train = reduce_mem_usage(train)

to_pickle(save_path / f"augmented_train_v001.pkl", train)

print("finished.")
Exemplo n.º 7
0
                X=X,
                X_test=X_test,
                y=mullkan_0,
                params=params,
                folds=folds,
                model_type='lgb',
                eval_metric='group_mae',
                plot_feature_importance=False,
                verbose=500,
                early_stopping_rounds=200,
                n_estimators=n_estimators,
                fold_group=mol_name.values)
            # Pull the out-of-fold (train) and test predictions of the
            # mullkan_0 model out of the result dict.
            oof_mullkan_0_train = result_dict_lgb_m0['oof']
            oof_mullkan_0_test = result_dict_lgb_m0['prediction']
            # Persist predictions and fitted models; file names are keyed by
            # data version, trial number and seed.
            to_pickle(
                submit_path /
                f"train_oof_mullkan_0_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl",
                oof_mullkan_0_train)
            to_pickle(
                submit_path /
                f"test_oof_mullkan_0_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl",
                oof_mullkan_0_test)
            to_pickle(
                model_path /
                f"first_model_list_mullkan_0_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl",
                result_dict_lgb_m0["models"])

            ########################################################################################################
            # mullken 1
            print("=" * 30 + " mullken 1 " + "=" * 30)
            result_dict_lgb_m1 = train_model_regression(
                X=X,
Exemplo n.º 8
0
               categorical_feature=categorical,
               folds=folds,
               num_boost_round=30000,
               verbose_eval = 500,
               early_stopping_rounds=200,
               callbacks=callbacks,
               )
# Show the training history returned by the fit call above.
df_ret = pd.DataFrame(ret)
display(df_ret)
print("finish fitting.")

# Retrieving booster and training information.
proxy = extraction_cb.boosters_proxy
boosters = extraction_cb.raw_boosters
best_iteration = extraction_cb.best_iteration
to_pickle(model_path/'extraction_cb.pkl', extraction_cb)

# Create oof prediction result
print("create oof preds.")
# NOTE(review): this assumes ``folds.split`` reproduces the splits used during
# fitting and that ``boosters`` is in the same fold order — confirm.
fold_iter = folds.split(train, y)
# NOTE(review): ``zeros_like`` inherits the dtype of ``y``; predictions would
# be truncated if ``y`` were an integer array — confirm ``y`` is float.
oof_preds = np.zeros_like(y)
# NOTE: the loop rebinds the module-level name ``n_fold`` to the fold index.
for n_fold, ((trn_idx, val_idx), booster) in enumerate(zip(fold_iter, boosters)):
    print(val_idx)
    valid = train.iloc[val_idx]
    oof_preds[val_idx] = booster.predict(valid, num_iteration=best_iteration)
print(f"mae on oof preds: {mean_absolute_error(y, oof_preds)}")
np.save(submit_path/'oof.npy', oof_preds)

# Averaging prediction result for test data.
# The proxy output is stacked and averaged over its first axis (presumably one
# prediction array per fold booster).
y_pred_proba_list = proxy.predict(test, num_iteration=best_iteration)
y_pred_proba_avg = np.array(y_pred_proba_list).mean(axis=0)
Exemplo n.º 9
0
# train = train.merge(train_angle_add, on="id", how="left")
# test = test.merge(test_angle_add, on="id", how="left")
# Attach the additional feature frames by pair id.
train = train.merge(train_add, how="left", on="id")
test = test.merge(test_add, how="left", on="id")

train, test = reduce_mem_usage(train), reduce_mem_usage(test)

# Label-encode the categorical columns that are actually used as features.
# The encoder is fitted on train and test together so both share one mapping.
for f in ('atom_1', 'type_0', 'type'):
    if f not in good_columns:
        continue
    lbl = LabelEncoder()
    lbl.fit(list(train[f].values) + list(test[f].values))
    train[f] = lbl.transform(list(train[f].values))
    test[f] = lbl.transform(list(test[f].values))

to_pickle(save_path / f"train_v003_{DATA_VERSION}_{TRIAL_NO}.pkl", train)
to_pickle(save_path / f"test_v003_{DATA_VERSION}_{TRIAL_NO}.pkl", test)

# Feature matrices and targets.
X = train[good_columns].copy()
X_test = test[good_columns].copy()
y = train['scalar_coupling_constant']
y_fc = train['fc']

# export colnames
used_columns_df = pd.DataFrame({"columns": X.columns.tolist()})
used_columns_df.to_csv(log_path / f"use_cols.csv")

####################################################################################################
# Model Fitting
n_fold = 5
Exemplo n.º 10
0
                                                  folds=folds,
                                                  model_type='lgb',
                                                  eval_metric='group_mae',
                                                  plot_feature_importance=True,
                                                  verbose=500,
                                                  early_stopping_rounds=200,
                                                  n_estimators=n_estimators,
                                                  fold_group=mol_name_t)

        # Save the feature importance of the fc model for this type and seed.
        importance_path = log_path / f'importance_fc_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.csv'
        result_dict_lgb1["importance"].to_csv(importance_path, index=True)

        # Add the fc predictions as the ``oof_fc`` column of the per-type
        # frames and persist them.
        # NOTE(review): if ``X_t`` / ``X_test_t`` are slices of larger frames,
        # pandas may emit a chained-assignment warning here — confirm.
        X_t['oof_fc'] = result_dict_lgb1['oof']
        X_test_t['oof_fc'] = result_dict_lgb1['prediction']
        to_pickle(
            submit_path /
            f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.pkl",
            X_t['oof_fc'])
        to_pickle(
            submit_path /
            f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.pkl",
            X_test_t['oof_fc'])

        # Settings for the next model. ``bairitsu`` scales the estimator count
        # by 256 / num_leaves, which is 1.0 with the value set here.
        params["num_leaves"] = 256  # num_leaves_dict[t]
        start_time = current_time()
        bairitsu = 256 / params["num_leaves"]
        n_estimators = int(15000 * bairitsu)

        # Debug runs use only a handful of boosting rounds.
        if DEBUG:
            n_estimators = 5

        if TRAIN_ALL_DATA:
Exemplo n.º 11
0
            # result_dict["start_time"] = start_time
            # result_dict["n_estimator"] = n_estimators
            # result_dict["X_t_len"] = X_t.shape[0]
            # result_dict["type"] = t
            # result_dict["type_name"] = type_name[t]
            # score_list += [result_dict["score"]]
            #
            # eval_result: list = result_dict["eval_result"]["valid_1"]["l1"]
            # training_log_df: pd.DataFrame = pd.DataFrame(eval_result, index=np.arange(len(eval_result)) + 1)
            # training_log_df.columns = ["l1"]
            # training_log_df.index.name = "iter"
            # training_log_df.to_csv(log_path / f"train_log_{DATA_VERSION}_{TRIAL_NO}_{t}.csv")
            #
            # to_pickle(model_path / f"hold_out_model_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.pkl", result_dict["model"])

        # Persist the full result dict and its feature-importance table for
        # this type and seed.
        to_pickle(log_path / f"result_dict_{t}_{seed}.pkl", result_dict)
        importance_path = log_path / f'importance_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.csv'
        result_dict["importance"].to_csv(importance_path, index=True)

    # Print one score per coupling type.
    # NOTE(review): ``zip`` stops at the shorter input, so nothing is printed
    # if ``score_list`` is empty — confirm it is filled in every mode.
    for t, s in zip(X['type'].unique(), score_list):
        print(f"type {t}, score: {s:0.5f}")

    if TRAIN_ALL_DATA or CV_FOLD:
        #########################################################################################################
        # create oof & submission file.
        # Fill the sample submission with the collected test predictions.
        sub = pd.read_csv(f'../input/sample_submission.csv')
        sub['scalar_coupling_constant'] = X_short_test['prediction']
        submit_file_path = str(
            submit_path / f'submission_t_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv')
        sub.to_csv(submit_file_path, index=False)
Exemplo n.º 12
0
    # Fit the rank-gauss scaler on the NaN-filled train features and apply the
    # same fitted scaler to train and test.
    rgs = RankGaussScalar()
    rgs.fit(X_fillna)
    X_rgs = rgs.transform(X_fillna)
    # Drop the raw frames as soon as they are no longer needed to free memory.
    del X
    gc.collect()
    X_test_rgs = rgs.transform(X_test.fillna(0))
    del X_test
    gc.collect()

    # Replace any NaN left after the transform with 0.
    X_rgs.fillna(0, inplace=True)
    X_test_rgs.fillna(0, inplace=True)

    # Set the ``type`` column from the separately held values.
    X_rgs["type"] = type_train
    X_test_rgs["type"] = type_test

    # Save everything the model-training step needs.
    to_pickle(train_data_path, X_rgs)
    to_pickle(test_data_path, X_test_rgs)
    to_pickle(y_data_path, y)
    to_pickle(mol_name_data_path, mol_name)
    print("saved files for model train.")

####################################################################################################
# Model Fitting
print("start fitting")
# Debug runs use 3 folds instead of the regular 5.
n_fold = 3 if DEBUG else 5

if GROUP_K_FOLD:
    folds = GroupKFold(n_splits=n_fold)
else:
Exemplo n.º 13
0
               folds=folds,
               num_boost_round=30000,
               verbose_eval = 500,
               early_stopping_rounds=500,
               callbacks=callbacks,
               )
# Show the tail of the training history returned by the fit call above.
df_ret = pd.DataFrame(ret)
display(df_ret.tail())
print("finish fitting.")

# Retrieving booster and training information.
proxy = extraction_cb.boosters_proxy
boosters = extraction_cb.raw_boosters
best_iteration = extraction_cb.best_iteration
print(f"best_iteration: {best_iteration}")
# NOTE(review): the last two file names put ``mol_type`` after the ``.pkl``
# extension, unlike the first — confirm before relying on the names.
to_pickle(model_path/f'extraction_cb_{mol_type}.pkl', extraction_cb)
to_pickle(model_path/f'boosters.pkl_{mol_type}', boosters)
to_pickle(model_path/f'proxy.pkl_{mol_type}', proxy)

# Create oof prediction result
print("create oof preds.")
# NOTE(review): this assumes ``folds.split`` reproduces the splits used during
# fitting and that ``boosters`` is in the same fold order — confirm.
fold_iter = folds.split(train, y)
# NOTE(review): ``zeros_like`` inherits the dtype of ``y`` — confirm float.
oof_preds = np.zeros_like(y)

# NOTE: the loop rebinds the module-level name ``n_fold`` to the fold index.
for n_fold, ((trn_idx, val_idx), booster) in enumerate(zip(fold_iter, boosters)):
    print(val_idx)
    valid = train.iloc[val_idx]
    oof_preds[val_idx] = booster.predict(valid, num_iteration=best_iteration)
print(f"mae on oof preds: {mean_absolute_error(y, oof_preds)}")
# Wrap the out-of-fold predictions in a frame indexed like ``train``.
df_oof = pd.DataFrame(index=train.index)
df_oof["scalar_coupling_constant"] = oof_preds
Exemplo n.º 14
0
                            axis=1)
        ret_list.append(acsf_df)
    return pd.concat(ret_list, axis=0)


print("loading structures")
structures = pd.read_csv("../input/structures.csv")

molecule_names = np.sort(structures.molecule_name.unique())
gb_structure = structures.groupby("molecule_name")

# Cut the sorted molecule names into one contiguous slice per CPU core.
n_split = mp.cpu_count()
unit = np.ceil(len(molecule_names) / n_split).astype(int)
indexer = [[unit * i, unit * (i + 1)] for i in range(n_split)]

split_mol_names = [molecule_names[start:stop] for start, stop in indexer]
mp_data = [{"mol_names": names} for names in split_mol_names]

print("start multiprocessing")
num_workers = mp.cpu_count()
with mp.Pool(num_workers) as executor:
    features_chunk = executor.map(get_scsf, mp_data)

# Stitch the per-worker frames back together and persist the result.
df = pd.concat(features_chunk)

to_pickle("../processed/v003/acsf_feat.pkl", df)
#df.to_csv("../processed/v003/acsf_feat.csv")
print("finished.")
Exemplo n.º 15
0
    'molecule_atom_index_1_dist_std_diff',
    'molecule_atom_index_0_dist_mean_diff',
    'molecule_atom_index_1_dist_mean_div',
    'molecule_atom_index_1_dist_min_diff', 'rc_A', 'rc_B', 'rc_C', 'mu',
    'alpha', 'h**o', 'lumo', 'gap', 'zpve', 'Cv', 'freqs_min', 'freqs_max',
    'freqs_mean', 'mulliken_min', 'mulliken_max', 'mulliken_atom_0',
    'mulliken_atom_1', 'dist_C_0_x', 'dist_C_1_x', 'dist_C_2_x', 'dist_C_3_x',
    'dist_C_4_x', 'dist_F_0_x', 'dist_F_1_x', 'dist_F_2_x', 'dist_H_0_x',
    'dist_H_1_x', 'dist_H_2_x', 'dist_H_3_x', 'dist_H_4_x', 'dist_N_0_x',
    'dist_N_1_x', 'dist_N_2_x', 'dist_N_3_x', 'dist_N_4_x', 'dist_O_0_x',
    'dist_O_1_x', 'dist_O_2_x', 'dist_O_3_x', 'dist_O_4_x', 'dist_C_0_y',
    'dist_C_1_y', 'dist_C_2_y', 'dist_C_3_y', 'dist_C_4_y', 'dist_F_0_y',
    'dist_F_1_y', 'dist_F_2_y', 'dist_F_3_y', 'dist_F_4_y', 'dist_H_0_y',
    'dist_H_1_y', 'dist_H_2_y', 'dist_H_3_y', 'dist_H_4_y', 'dist_N_0_y',
    'dist_N_1_y', 'dist_N_2_y', 'dist_N_3_y', 'dist_N_4_y', 'dist_O_0_y',
    'dist_O_1_y', 'dist_O_2_y', 'dist_O_3_y', 'dist_O_4_y',
    'distance_closest_0', 'distance_closest_1', 'distance_farthest_0',
    'distance_farthest_1', 'cos_c0_c1', 'cos_f0_f1', 'cos_c0_f0', 'cos_c1_f1',
    'cos_center0_center1', 'cos_c0', 'cos_c1', 'cos_f0', 'cos_f1',
    'cos_center0', 'cos_center1'
] + giba_columns

# NOTE(review): ``cat_features`` is not used in the visible code — presumably
# read by a later step; confirm.
cat_features = ['atom_y']

# Persist only the selected feature columns of the train and test frames.
to_pickle("../processed/v003/train_kernel_plus_more_feats.pkl",
          train[all_features])
to_pickle("../processed/v003/test_kernel_plus_more_feats.pkl",
          test[all_features])

print("finished.")
Exemplo n.º 16
0
        g = unpickle(graph_list[j])
        node_df = pd.concat([structure[structure.molecule_name==graph_name][["molecule_name", "atom_index"]].reset_index(drop=True), 
                   pd.DataFrame(np.concatenate(g.node, -1), columns=[f"node_{i}" for i in range(13)])], axis=1)
        node_list += [node_df]
    return node_list

structure = pd.read_csv("../input/structures.csv")
graph_list = glob("../input/graph/*.pickle")
print(len(graph_list))

# Cut the list of graph files into one contiguous slice per CPU core.
n_split = mp.cpu_count()
unit = np.ceil(len(graph_list) / n_split).astype(int)
indexer = [[unit * i, unit * (i + 1)] for i in range(n_split)]

split_graph_list = [graph_list[start:stop] for start, stop in indexer]
mp_data = [{"graph_list": paths} for paths in split_graph_list]

num_workers = mp.cpu_count()
with mp.Pool(num_workers) as executor:
    features_chunk = executor.map(func, mp_data)

# Every worker returns a list of frames; flatten them into a single list.
concat_list = []
for worker_frames in features_chunk:
    concat_list += worker_frames

node_df = pd.concat(concat_list, axis=0)

to_pickle("../processed/v003/node_df.pkl", node_df)
Exemplo n.º 17
0
    test = map_ob_charges(test, 1)

    train = reduce_mem_usage(train)
    test = reduce_mem_usage(test)

    # Label-encode the categorical columns that are used as features. The
    # encoder is fitted on train and test together so both share one mapping.
    for f in ['atom_1', 'type_0', 'type']:
        if f in use_cols.good_columns:
            lbl = LabelEncoder()
            lbl.fit(list(train[f].values) + list(test[f].values))
            train[f] = lbl.transform(list(train[f].values))
            test[f] = lbl.transform(list(test[f].values))

    # Cache the prepared frames under "<DATA_VERSION>_<TRIAL_NO>/".
    Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True,
                                                         exist_ok=True)
    to_pickle(
        save_path /
        f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
        train)
    to_pickle(
        save_path /
        f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
        test)
else:
    # Reuse the frames pickled under the fixed trial tag v003_024.
    train = unpickle(save_path / f"v003_024/train_concat_v003_024.pkl", )
    test = unpickle(save_path / f"v003_024/test_concat_v003_024.pkl", )

###################################################################################################
# add additional feature for trying

###################################################################################################
# final data preparation for train
# Copy the selected feature columns so later edits do not touch ``train``.
X = train[use_cols.good_columns].copy()
Exemplo n.º 18
0
    axis=1,
    inplace=True)

# Map the per-atom information onto the test pairs, once for each atom
# position (0 and 1); the prints are progress markers.
print("test 1")
test = map_atom_info(test, 0)
print("test 2")
test = map_atom_info(test, 1)
print("test 3")
# Rename the "_x"/"_y" suffixed columns to explicit per-atom names under the
# "f006:" prefix.
test.rename(
    {
        "mass_x": "f006:mass_0",
        "mass_y": "f006:mass_1",
        "dist_from_origin_x": "f006:dist_from_origin_0",
        "dist_from_origin_y": "f006:dist_from_origin_1",
    },
    axis=1,
    inplace=True)

# Persist the id plus the f006 feature columns of the train frame.
Path("../processed/v003").mkdir(parents=True, exist_ok=True)
to_pickle(
    "../processed/v003/train_augmented_006.df.pkl", train[[
        "id", "f006:dist_origin_mean", "f006:mass_0", "f006:mass_1",
        "f006:dist_from_origin_0", "f006:dist_from_origin_1"
    ]])

# NOTE(review): the test frame is prepared above but its save is disabled
# (and points at a different folder) — confirm this is intended.
# to_pickle("../processed/v004/aug_test_006.df.pkl",test[["id",
#                                            "f006:dist_origin_mean", "f006:mass_0", "f006:mass_1",
#                                            "f006:dist_from_origin_0", "f006:dist_from_origin_1"]])

print("finished")
Exemplo n.º 19
0
                                                  plot_feature_importance=True,
                                                  verbose=500,
                                                  early_stopping_rounds=200,
                                                  n_estimators=n_estimators,
                                                  fold_group=mol_name.values,
                                                  phase_mark="_fc")

        # Save the feature importance of the fc model for this seed.
        importance_path = log_path / f'importance_fc_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv'
        result_dict_lgb1["importance"].to_csv(importance_path, index=True)

        ########################################################################################################
        # Add the fc predictions as the ``oof_fc`` column of the train and
        # test feature frames.
        X['oof_fc'] = result_dict_lgb1['oof']
        X_test['oof_fc'] = result_dict_lgb1['prediction']

        # Persist the predictions and the fitted models, keyed by seed.
        to_pickle(
            submit_path / f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl",
            X['oof_fc'])
        to_pickle(
            submit_path / f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl",
            X_test['oof_fc'])
        to_pickle(
            model_path /
            f"first_model_list_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl",
            result_dict_lgb1["models"])
    else:
        # Reload previously saved fc predictions instead of retraining.
        # NOTE(review): the file name is keyed by ``current_seed`` here, while
        # the save side of this fragment uses ``seed`` — confirm they agree.
        X['oof_fc'] = unpickle(
            submit_path /
            f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{current_seed}.pkl")
        X_test['oof_fc'] = unpickle(
            submit_path /
            f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{current_seed}.pkl")
Exemplo n.º 20
0
    folds=folds,
    num_boost_round=30000,
    verbose_eval=500,
    early_stopping_rounds=500,
    callbacks=callbacks,
)
# Show the tail of the training history returned by the fit call above.
df_ret = pd.DataFrame(ret)
display(df_ret.tail())
print("finish fitting.")

# Retrieving booster and training information.
proxy = extraction_cb.boosters_proxy
boosters = extraction_cb.raw_boosters
best_iteration = extraction_cb.best_iteration
print(f"best_iteration: {best_iteration}")
to_pickle(model_path / 'extraction_cb.pkl', extraction_cb)
to_pickle(model_path / 'boosters.pkl', boosters)
to_pickle(model_path / 'proxy.pkl', proxy)

# Create oof prediction result
print("create oof preds.")
# NOTE(review): this assumes ``folds.split`` reproduces the splits used during
# fitting and that ``boosters`` is in the same fold order — confirm.
fold_iter = folds.split(train, y)
# NOTE(review): ``zeros_like`` inherits the dtype of ``y`` — confirm float.
oof_preds = np.zeros_like(y)

# NOTE: the loop rebinds the module-level name ``n_fold`` to the fold index.
for n_fold, ((trn_idx, val_idx),
             booster) in enumerate(zip(fold_iter, boosters)):
    print(val_idx)
    valid = train.iloc[val_idx]
    oof_preds[val_idx] = booster.predict(valid, num_iteration=best_iteration)
print(f"mae on oof preds: {mean_absolute_error(y, oof_preds)}")
# Empty frame indexed like ``train``.
df_oof = pd.DataFrame(index=train.index)
Exemplo n.º 21
0
# Map the ob_charges values onto both atom positions (0 and 1) of each pair.
train = map_ob_charges(train, 0)
train = map_ob_charges(train, 1)
test = map_ob_charges(test, 0)
test = map_ob_charges(test, 1)

train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# Label-encode the categorical columns that are used as features. The encoder
# is fitted on train and test together so both share one mapping.
for f in ['atom_1', 'type_0', 'type']:
    if f in good_columns:
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

# Cache the fully prepared frames.
to_pickle(save_path / f"train_concat_v003_{DATA_VERSION}_{TRIAL_NO}.pkl",
          train)
to_pickle(save_path / f"test_concat_v003_{DATA_VERSION}_{TRIAL_NO}.pkl", test)

# Feature matrices and targets: ``y`` is the final target, ``y_fc`` the
# ``fc`` column.
X = train[good_columns].copy()
y = train['scalar_coupling_constant']
y_fc = train['fc']
X_test = test[good_columns].copy()

# export colnames
pd.DataFrame({
    "columns": X.columns.tolist()
}).to_csv(log_path / f"use_cols.csv")

####################################################################################################
# Model Fitting
n_fold = 5
Exemplo n.º 22
0
def train_model_regression(X, X_test, y, params, folds, model_type='lgb', eval_metric='mae', columns=None,
                           plot_feature_importance=False, model=None,
                           verbose=10000, early_stopping_rounds=200, n_estimators=50000, mol_type=-1,
                           fold_group=None, skip_folds=None, phase_mark="", skipped_mark=None):
    """
    Train a regression model with cross-validation and collect its outputs.

    For every split yielded by ``folds.split`` the selected model is fitted on the
    training part, scored on the validation part and used to predict ``X_test``.
    After each trained fold the current oof array, the fold's test prediction, the
    model and the fold importance are pickled under ``mid_path`` so that a later
    call can reload them through ``skip_folds`` instead of retraining.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: params - model parameters; passed to the lgb / cat constructors or to xgb.train
    :params: folds - splitter providing ``split(X, groups=...)`` and ``n_splits``
    :params: model_type - one of 'lgb', 'xgb', 'sklearn', 'cat'
    :params: eval_metric - key of the metric table: 'mae', 'group_mae' or 'mse'
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to collect LGB feature importances
             (they are returned and saved; nothing is plotted here)
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - logging period forwarded to lgb ``fit`` / xgb ``verbose_eval``
    :params: early_stopping_rounds - early stopping patience, used by 'lgb' only
    :params: n_estimators - maximum number of trees, used by 'lgb' only
    :params: mol_type - not used by this function
    :params: fold_group - group labels forwarded to ``folds.split``
    :params: skip_folds - list of fold numbers to reload from ``mid_path``, or None
    :params: phase_mark - tag embedded in the checkpoint file names
    :params: skipped_mark - phase marks for which ``skip_folds`` applies
             (None means "no phase", i.e. nothing is skipped)

    :returns: dict with 'models', 'oof', 'prediction' (sum of fold predictions
              divided by ``folds.n_splits``), 'scores' (trained folds only) and,
              for 'lgb' with plot_feature_importance, 'importance'
    :raises: ValueError for an unknown model_type or model_type 'sklearn' without a model
    """
    assert isinstance(skip_folds, list) or skip_folds is None
    print(f"skip_folds :{skip_folds}")

    # None stands in for the former mutable ``[]`` default.
    skipped_mark = [] if skipped_mark is None else skipped_mark

    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                            'catboost_metric_name': 'MAE',
                            'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                                  'catboost_metric_name': 'MAE',
                                  'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                            'catboost_metric_name': 'MSE',
                            'sklearn_scoring_function': metrics.mean_squared_error}
                    }

    def _score_fold(y_true, y_hat, X_val):
        # 'group_mae' has no sklearn-style scorer; it needs the 'type' column.
        if eval_metric != 'group_mae':
            return metrics_dict[eval_metric]['sklearn_scoring_function'](y_true, y_hat)
        return metrics_dict[eval_metric]['scoring_function'](y_true, y_hat, X_val['type'])

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # accumulated (later averaged) predictions on test data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    model_list = []

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, groups=fold_group)):

        if skip_folds is not None and fold_n in skip_folds and phase_mark in skipped_mark:
            print(f'Fold {fold_n + 1} is skipped!!! at {time.ctime()}')
            # The checkpointed oof is the whole array as it stood after this
            # fold, so it replaces the local one instead of being merged.
            oof = unpickle(mid_path / f"oof_cv{phase_mark}_{fold_n}.pkl", )
            y_pred = unpickle(mid_path / f"prediction_cv{phase_mark}_{fold_n}.pkl", )
            model = unpickle(mid_path / f"model_cv{phase_mark}_{fold_n}.pkl", )
            fold_importance = unpickle(mid_path / f"importance_cv{phase_mark}_{fold_n}.pkl", )

            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
            prediction += y_pred
            model_list += [model]
            # NOTE: no score is recorded for a reloaded fold.
            continue

        print(f'Fold {fold_n + 1} started at {time.ctime()}')

        # Always bound, so the checkpoint below never fails on an undefined name
        # and never re-saves the importance of a previously reloaded fold.
        fold_importance = pd.DataFrame()

        if isinstance(X, np.ndarray):
            # NOTE(review): X[columns] selects rows on an ndarray and X.columns
            # does not exist there — the ndarray path looks unfinished; confirm.
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators=n_estimators, n_jobs=-1, importance_type='gain')
            print(model)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose, early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        elif model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            # Work on a copy so the caller's params dict is left untouched.
            xgb_params = dict(params)
            xgb_params["objective"] = "reg:linear"
            xgb_params["eval_metric"] = metrics_dict[eval_metric]['lgb_metric_name']
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200,
                              verbose_eval=verbose, params=xgb_params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)

        elif model_type == 'sklearn':
            if model is None:
                raise ValueError("model_type='sklearn' requires the `model` argument")
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = _score_fold(y_valid, y_pred_valid, X_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1, )

        elif model_type == 'cat':
            model = CatBoostRegressor(iterations=20000, eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                                      **params,
                                      loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        else:
            raise ValueError(f"unknown model_type: {model_type!r}")

        oof[valid_index] = y_pred_valid.reshape(-1, )
        scores.append(_score_fold(y_valid, y_pred_valid, X_valid))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1

            # Best effort: a failed CSV export must not abort training.
            try:
                fold_importance.to_csv(mid_path / f"importance_cv_{fold_n}.csv")
            except Exception as e:
                print("failed to save importance...")
                print(e)

            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
        model_list += [model]

        # Checkpoint this fold so it can be reloaded through skip_folds.
        # Best effort: a failed write is reported but does not stop the run.
        try:
            to_pickle(mid_path / f"oof_cv{phase_mark}_{fold_n}.pkl", oof)
            to_pickle(mid_path / f"prediction_cv{phase_mark}_{fold_n}.pkl", y_pred)
            to_pickle(mid_path / f"model_cv{phase_mark}_{fold_n}.pkl", model)
            to_pickle(mid_path / f"importance_cv{phase_mark}_{fold_n}.pkl", fold_importance)
        except Exception as e:
            print("failed to save intermediate data...")
            print(e)

    if model_type == 'lgb' and plot_feature_importance:
        result_dict['importance'] = feature_importance

    prediction /= folds.n_splits
    # Reporting is best effort: a notification failure must not lose the results.
    try:
        cv_score_msg = (f'{DATA_VERSION}_{TRIAL_NO} CV mean score: '
                        f'{np.mean(scores):.4f}, std: {np.std(scores):.4f}.')
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)

    result_dict["models"] = model_list
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    return result_dict
Exemplo n.º 23
0
def get_train_test_data(use_prev=False,
                        prev_data_version=None,
                        prev_trial_no=None):
    """
    Build, or reload, the train and test feature tables.

    With ``use_prev=False`` the raw CSV files under ``../input`` are read, joined
    with the precomputed feature files under ``save_path``, label-encoded and
    pickled to ``save_path / f"{DATA_VERSION}_{TRIAL_NO}"``.

    With ``use_prev=True`` the ``*_basic.pkl`` tables of an earlier trial are
    loaded from ``../processed/<version>/<version>_<trial>``. When ``DEBUG`` is
    set, a cached 5000-row sample is used instead if it exists; otherwise the
    full tables are loaded, sampled, and the sample is cached for next time.

    :params: use_prev - reload a previous trial instead of rebuilding the features
    :params: prev_data_version - data version of the previous trial (required with use_prev)
    :params: prev_trial_no - trial number of the previous trial (required with use_prev)

    :returns: tuple ``(train, test)``
    :raises: AssertionError if use_prev is set without both prev_* arguments, or
             if the reloaded train table lacks 'scalar_coupling_constant'
    """
    if use_prev:
        assert prev_data_version is not None
        assert prev_trial_no is not None

    file_folder = '../input'
    if not use_prev:
        # train.csv is only needed here; the use_prev branch always replaces
        # `train` with an unpickled table, so reading it there was wasted I/O.
        train = pd.read_csv(f'{file_folder}/train.csv')
        test = pd.read_csv(f'{file_folder}/test.csv')
        structures = pd.read_csv(f'{file_folder}/structures.csv')
        scalar_coupling_contributions = pd.read_csv(
            f'{file_folder}/scalar_coupling_contributions.csv')

        # train_cos = unpickle(save_path / "train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
        # test_cos = unpickle(save_path / "test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]

        train_add = unpickle(save_path / "train_006.df.pkl", )
        test_add = unpickle(save_path / "test_006.df.pkl", )

        babel_train = pd.read_csv(save_path / "babel_train.csv",
                                  usecols=use_cols.babel_cols)
        babel_test = pd.read_csv(save_path / "babel_test.csv",
                                 usecols=use_cols.babel_cols)

        # Register the rdkit columns as features. Only names that are not
        # listed yet are appended, so calling this function again does not
        # produce duplicated column names in the module-level list.
        use_cols.good_columns += [
            c for c in use_cols.rdkit_cols
            if c != 'id' and c not in use_cols.good_columns
        ]
        rdkit_train = pd.read_csv(save_path / "rdkit_train.csv",
                                  usecols=use_cols.rdkit_cols)
        rdkit_test = pd.read_csv(save_path / "rdkit_test.csv",
                                 usecols=use_cols.rdkit_cols)

        coulomb_train = pd.read_csv(save_path /
                                    "coulomb_interaction_train.csv")
        coulomb_test = pd.read_csv(save_path / "coulomb_interaction_test.csv")

        bond_calc_train = unpickle(save_path / "bond_calc_feat_train.pkl")
        bond_calc_test = unpickle(save_path / "bond_calc_feat_test.pkl")

        ob_charges = pd.read_csv(save_path / "ob_charges.csv", index_col=0)

        tda_radius_df = pd.read_csv(save_path / "tda_radius_df.csv",
                                    index_col=0)

        tda_radius_df_03 = pd.read_csv(save_path / "tda_radius_df_v003.csv",
                                       index_col=0)

        pca_feat = unpickle(save_path / "pca_feat_df.pkl")

        ####################################################################################################
        # Feature Engineering

        # Attach the per-pair coupling contributions to the train rows.
        train = pd.merge(
            train,
            scalar_coupling_contributions,
            how='left',
            left_on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'],
            right_on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

        train = map_atom_info(train, 0, structures)
        train = map_atom_info(train, 1, structures)
        test = map_atom_info(test, 0, structures)
        test = map_atom_info(test, 1, structures)

        # Coordinates of the two atoms of every pair.
        train_p_0 = train[['x_0', 'y_0', 'z_0']].values
        train_p_1 = train[['x_1', 'y_1', 'z_1']].values
        test_p_0 = test[['x_0', 'y_0', 'z_0']].values
        test_p_1 = test[['x_1', 'y_1', 'z_1']].values

        # Euclidean distance and squared per-axis differences.
        train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
        test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
        train['dist_x'] = (train['x_0'] - train['x_1'])**2
        test['dist_x'] = (test['x_0'] - test['x_1'])**2
        train['dist_y'] = (train['y_0'] - train['y_1'])**2
        test['dist_y'] = (test['y_0'] - test['y_1'])**2
        train['dist_z'] = (train['z_0'] - train['z_1'])**2
        test['dist_z'] = (test['z_0'] - test['z_1'])**2

        # First character of the coupling type.
        train['type_0'] = train['type'].apply(lambda x: x[0])
        test['type_0'] = test['type'].apply(lambda x: x[0])

        # L1 (Manhattan) distance between the two atoms.
        train['abs_dist'] = np.linalg.norm(train_p_0 - train_p_1,
                                           axis=1,
                                           ord=1)
        test['abs_dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1, ord=1)
        # NOTE(review): dist12 receives neither frame, so it cannot see the
        # local train/test; presumably it works on module-level data — confirm.
        dist12('dist_xy', 'x', 'y')
        dist12('dist_xz', 'x', 'z')
        dist12('dist_yz', 'y', 'z')

        # Number of atoms of each element per molecule, one column per element.
        atom_count = structures.groupby(['molecule_name',
                                         'atom']).size().unstack(fill_value=0)
        train = pd.merge(train,
                         atom_count,
                         how='left',
                         left_on='molecule_name',
                         right_on='molecule_name')
        test = pd.merge(test,
                        atom_count,
                        how='left',
                        left_on='molecule_name',
                        right_on='molecule_name')

        train = create_features(train)
        test = create_features(test)

        angle_df_train, angle_df_test = angle_feature_conv(structures)
        train = train.merge(angle_df_train, on="id", how="left")
        test = test.merge(angle_df_test, on="id", how="left")

        train = train.merge(train_add, on="id", how="left")
        test = test.merge(test_add, on="id", how="left")

        # train = train.merge(train_cos, on="id", how="left")
        # test = test.merge(test_cos, on="id", how="left")

        train = train.merge(babel_train, on="id", how="left")
        test = test.merge(babel_test, on="id", how="left")

        train = train.merge(rdkit_train, on="id", how="left")
        test = test.merge(rdkit_test, on="id", how="left")

        train = train.merge(coulomb_train, on="id", how="left")
        test = test.merge(coulomb_test, on="id", how="left")

        train = train.merge(bond_calc_train, on="id", how="left")
        test = test.merge(bond_calc_test, on="id", how="left")

        # Molecule-level features are joined on the molecule name.
        train = train.merge(tda_radius_df, on="molecule_name", how="left")
        test = test.merge(tda_radius_df, on="molecule_name", how="left")

        train = train.merge(tda_radius_df_03, on="molecule_name", how="left")
        test = test.merge(tda_radius_df_03, on="molecule_name", how="left")

        train = train.merge(pca_feat, on="molecule_name", how="left")
        test = test.merge(pca_feat, on="molecule_name", how="left")

        train = map_ob_charges(train, ob_charges, 0)
        train = map_ob_charges(train, ob_charges, 1)
        test = map_ob_charges(test, ob_charges, 0)
        test = map_ob_charges(test, ob_charges, 1)

        train = reduce_mem_usage(train)
        test = reduce_mem_usage(test)

        # Fit each encoder on train and test values together so both frames
        # share one mapping and transform() never meets an unseen label.
        for f in ['atom_1', 'type_0', 'type']:
            if f in use_cols.good_columns:
                lbl = LabelEncoder()
                lbl.fit(list(train[f].values) + list(test[f].values))
                train[f] = lbl.transform(list(train[f].values))
                test[f] = lbl.transform(list(test[f].values))

        Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True,
                                                             exist_ok=True)
        to_pickle(
            save_path /
            f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
            train)
        to_pickle(
            save_path /
            f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
            test)
    else:
        sample_loaded = False
        prev_folder = f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"
        if DEBUG:
            # v003_033
            train_path = Path(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl"
            )
            test_path = Path(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl"
            )

            # Use the cached sample only when both halves are present.
            if train_path.exists() and test_path.exists():
                print("sample loading")
                train = unpickle(train_path)
                test = unpickle(test_path)
                sample_loaded = True
                print("sample load finish")

        if not sample_loaded:
            print("loading previous dataest")
            print("train loading")
            train: pd.DataFrame = unpickle(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic.pkl",
            )
            assert "scalar_coupling_constant" in train.columns
            print("test loading")
            test: pd.DataFrame = unpickle(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic.pkl",
            )
            print("loading finished")

        if DEBUG and not sample_loaded:
            # Draw a small sample and cache it for the next DEBUG run.
            n_sample = 5000
            print(f"sampling {n_sample} rows.")
            train = train.sample(n=n_sample)
            test = test.sample(n=n_sample)
            Path(prev_folder).mkdir(parents=True, exist_ok=True)
            to_pickle(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl",
                train)
            to_pickle(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl",
                test)
            print("saved.")

        ###################################################################################################
        # add additional feature for trying

        # Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True, exist_ok=True)
        # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", train)
        # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", test)
    return train, test
Exemplo n.º 24
0
# Join the angle features and the cosine features onto both frames by row id.
# NOTE(review): angle_feature_conv, train_cos and test_cos are defined outside
# this fragment; their exact contents are not visible here — confirm.
angle_df_train, angle_df_test = angle_feature_conv()
train = train.merge(angle_df_train, on="id", how="left").merge(train_cos,  on="id", how="left")
test = test.merge(angle_df_test, on="id", how="left").merge(test_cos,  on="id", how="left")


# NOTE(review): reduce_mem_usage is defined elsewhere; presumably it downcasts
# column dtypes — confirm.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# Label-encode the categorical columns that are actually used as features.
# The encoder is fitted on train and test values together so both frames share
# one mapping and transform() never meets an unseen label.
for f in ['atom_1', 'type_0', 'type']:
    if f in good_columns:
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

# Persist the fully prepared frames.
to_pickle(save_path/"train_v003.pkl", train)
to_pickle(save_path/"test_v003.pkl", test)

# Feature matrices (copies, so later edits do not touch train/test) and targets.
X = train[good_columns].copy()
y = train['scalar_coupling_constant']
y_fc = train['fc']
X_test = test[good_columns].copy()

# export colnames
pd.DataFrame({"columns": X.columns.tolist()}).to_csv(log_path/f"use_cols.csv")

####################################################################################################
# Model Fitting
# Shuffled K-fold splitter with a fixed seed, so the splits are reproducible.
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)