Example #1
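# --- Assumed setup (a sketch; not part of the original snippet) ---
# The code below uses fa, pca, tsvd, ica, grp, N_COMP, SEED, num_clusters2,
# and a feature frame df without defining them. One plausible definition,
# using standard scikit-learn estimators; the constant values here are
# hypothetical:
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA, FactorAnalysis, FastICA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.random_projection import (GaussianRandomProjection,
                                       SparseRandomProjection)

N_COMP = 12         # components per decomposer (assumed)
SEED = 42           # (assumed)
num_clusters2 = 12  # MiniBatchKMeans cluster count (assumed)

fa = FactorAnalysis(n_components=N_COMP, random_state=SEED)
pca = PCA(n_components=N_COMP, random_state=SEED)
tsvd = TruncatedSVD(n_components=N_COMP, random_state=SEED)
ica = FastICA(n_components=N_COMP, random_state=SEED)
grp = GaussianRandomProjection(n_components=N_COMP, random_state=SEED)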
srp = SparseRandomProjection(n_components=N_COMP,
                             dense_output=True,
                             random_state=SEED)
mbkm = MiniBatchKMeans(n_clusters=num_clusters2, random_state=SEED)
tsne = TSNE(n_components=3, random_state=SEED)

ss = StandardScaler()
# Standardize all feature columns (everything after the first two columns).
df_ss = pd.DataFrame(ss.fit_transform(df.iloc[:, 2:]), columns=df.columns[2:])

decomp_cols = []
comp_results = []
comp_names = ["fa", "pca", "tsvd", "ica", "grp", "srp",
              "mbkm"]  # "tsne" removed
# zip() stops at the shorter comp_names list, so tsne is silently skipped
# even though it is still passed in the transform list below.
for name, transform in zip(comp_names,
                           [fa, pca, tsvd, ica, grp, srp, mbkm, tsne]):
    print(current_time(), "{} converting...".format(name), flush=True)
    n_components = N_COMP
    if name == 'mbkm':
        # MiniBatchKMeans.fit_transform returns distances to each cluster
        # center, hence num_clusters2 output columns.
        n_components = num_clusters2
    elif name == "tsne":
        # Dead branch while tsne is excluded above; note it also disagrees
        # with the TSNE defined earlier, which uses n_components=3.
        n_components = 2
    df_results = pd.DataFrame(transform.fit_transform(df_ss))
    decomp_col = ["{0}_{1:02d}".format(name, i) for i in range(n_components)]
    df_results.columns = decomp_col
    decomp_cols.extend(decomp_col)
    df_results.reset_index(drop=True, inplace=True)
    comp_results.append(df_results)

comp_results_df = pd.concat(comp_results, axis=1)
comp_results_df = pd.concat([
Example #2
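# Assumed context: this loop body comes from a per-type training routine.
# X, X_test, X_short, mol_name, params, GROUP_K_FOLD, DEBUG, and
# TRAIN_ALL_DATA are defined in the enclosing scope; Example #3 below shows
# the same names in their full setting.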
    print(f"X['type'].unique(): {X['type'].unique()}")
    for t in X['type'].unique():
        #if seed==current_seed and t in [0, 3, 1, 4] : continue   # [0, 3, 1, 4, 2, 6]

        print(f'{current_time()} Training of type {t} / {X["type"].unique()}')
        X_t = X.loc[X['type'] == t]
        X_test_t = X_test.loc[X_test['type'] == t]
        y_t = X_short.loc[X_short['type'] == t, 'target']
        mol_name_t = mol_name.loc[X['type'] == t] if GROUP_K_FOLD else None
        print(
            f"X_t.shape: {X_t.shape}, X_test_t.shape: {X_test_t.shape}, y_t.shape: {y_t.shape}"
        )

        params["num_leaves"] = 256  # num_leaves_dict[t]
        start_time = current_time()
        bairitsu = 256 / params["num_leaves"]
        n_estimators = int(15000 * bairitsu)

        if DEBUG:
            n_estimators = 5

        if TRAIN_ALL_DATA:
            print("============= 2nd layer TRIAN ALL DATA ================")
            if t == 0:
                print("if t==0, then not using mullkan feat.")
                X_t = X_t.drop(["oof_mullkan_0", "oof_mullkan_1"], axis=1)
                X_test_t = X_test_t.drop(["oof_mullkan_0", "oof_mullkan_1"],
                                         axis=1)

                result_dict = train_lgb_regression_alldata(
Example #3
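# Assumed context: train_main relies on module-level globals -- X, X_test,
# y, y_fc, mol_name, folds, train, type_name, DEBUG, GROUP_K_FOLD,
# TRAIN_ALL_DATA, CV_FOLD, DATA_VERSION, TRIAL_NO, submit_path, model_path,
# log_path -- and on helpers such as current_time, to_pickle, send_message,
# group_mean_log_mae, and the train_* functions defined elsewhere.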
def train_main(seed, type_):
    print(f"==================== seed: {seed} ====================")
    params = { #'num_leaves': 128,
              'min_child_samples': 79,
              'objective': 'regression',
              'max_depth': -1, #9,
              'learning_rate': 0.2,
              "boosting_type": "gbdt",
              "subsample_freq": 1,
              "subsample": 0.9,
              "metric": 'mae',
              "verbosity": -1,
              'reg_alpha': 0.1,
              'reg_lambda': 0.3,
              'colsample_bytree': 1.0,
              'num_threads': -1,
             }

    params["seed"] = seed
    params["bagging_seed"] = seed + 1
    params["feature_fraction_seed"] = seed + 2

    n_estimators = 5  # full-run value: 10000
    params["num_leaves"] = 256
    if DEBUG:
        n_estimators = 5
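    # Bookkeeping frames: one row per train/test sample, holding the type,
    # the target, and slots for the OOF / test predictions filled in below.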

    X_short = pd.DataFrame({
        'ind': list(X.index),
        'type': X['type'].values,
        'oof': [0] * len(X),
        'target': y.values,
        'fc': y_fc.values
    })

    X_short_test = pd.DataFrame({
        'ind': list(X_test.index),
        'type': X_test['type'].values,
        'prediction': [0] * len(X_test)
    })

    print(f'{current_time()} Training of type {type_} / {X["type"].unique()}')
    X_t = X.loc[X['type'] == type_]
    X_test_t = X_test.loc[X_test['type'] == type_]
    y_fc_t = X_short.loc[X_short['type'] == type_, 'fc']
    y_t = X_short.loc[X_short['type'] == type_, 'target']
    mol_name_t = mol_name.loc[X['type'] == type_] if GROUP_K_FOLD else None
    print(
        f"X_t.shape: {X_t.shape}, X_test_t.shape: {X_test_t.shape}, y_t.shape: {y_t.shape}"
    )

    ########################################################################################################
    # fc
    print("=" * 30 + " fc " + "=" * 30)
    result_dict_lgb1 = train_model_regression(X=X_t,
                                              X_test=X_test_t,
                                              y=y_fc_t,
                                              params=params,
                                              folds=folds,
                                              model_type='lgb',
                                              eval_metric='group_mae',
                                              plot_feature_importance=False,
                                              verbose=1000,
                                              early_stopping_rounds=200,
                                              n_estimators=n_estimators,
                                              # note: full-length mol_name
                                              # here, not mol_name_t
                                              fold_group=mol_name.values)

    # Attach the first-layer fc out-of-fold predictions to the full frames.
    X['oof_fc'] = result_dict_lgb1['oof']
    X_test['oof_fc'] = result_dict_lgb1['prediction']

    to_pickle(
        submit_path /
        f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        X['oof_fc'])
    to_pickle(
        submit_path /
        f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        X_test['oof_fc'])
    to_pickle(
        model_path /
        f"first_model_list_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        result_dict_lgb1["models"])

    #########################################################################################################
    # 2nd layer model
    params["seed"] = seed + 3
    params["bagging_seed"] = seed + 4
    params["feature_fraction_seed"] = seed + 5
    params["num_leaves"] = 256  # num_leaves_dict[t]
    start_time = current_time()
    bairitsu = 256 / params["num_leaves"]
    n_estimators = 5  #int(15000 * bairitsu)

    if DEBUG:
        n_estimators = 5

    if TRAIN_ALL_DATA:
        print("============= 2nd layer TRIAN ALL DATA ================")
        result_dict = train_lgb_regression_alldata(
            X=X_t,
            X_test=X_test_t,
            y=y_t,
            params=params,
            eval_metric='group_mae',
            plot_feature_importance=True,
            verbose=5000,
            n_estimators=int(n_estimators * 1.6),
            mol_type=type_)

        X_short_test.loc[X_short_test['type'] == type_,
                         'prediction'] = result_dict['prediction']
        X_short_test.to_csv(
            submit_path / f"sub_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")

    elif CV_FOLD:
        print("============= 2nd layer CV ================")
        result_dict = train_model_regression(X_t,
                                             X_test_t,
                                             y_t,
                                             params,
                                             folds,
                                             model_type='lgb',
                                             eval_metric='mae',
                                             columns=None,
                                             plot_feature_importance=True,
                                             model=None,
                                             verbose=1000,
                                             early_stopping_rounds=200,
                                             n_estimators=n_estimators,
                                             mol_type=-1,
                                             fold_group=mol_name_t)

        result_dict["start_time"] = start_time
        result_dict["n_estimator"] = n_estimators
        result_dict["X_t_len"] = X_t.shape[0]
        result_dict["type"] = type_
        result_dict["type_name"] = type_name[type_]

        X_short.loc[X_short['type'] == type_, 'oof'] = result_dict['oof']
        X_short.to_csv(submit_path /
                       f"oof_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")

        X_short_test.loc[X_short_test['type'] == type_,
                         'prediction'] = result_dict['prediction']
        X_short_test.to_csv(
            submit_path / f"sub_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")

    else:
        print("============= 2nd layer hold out ================")
        result_dict = hold_out_lgb_validation(X=X_t,
                                              y=y_t,
                                              params=params,
                                              eval_metric='mae',
                                              plot_feature_importance=True,
                                              verbose=5000,
                                              early_stopping_rounds=200,
                                              n_estimators=n_estimators)

        result_dict["start_time"] = start_time
        result_dict["n_estimator"] = n_estimators
        result_dict["X_t_len"] = X_t.shape[0]
        result_dict["type"] = type_
        result_dict["type_name"] = type_name[type_]

        eval_result: list = result_dict["eval_result"]["valid_1"]["l1"]
        training_log_df: pd.DataFrame = pd.DataFrame(
            eval_result, index=np.arange(len(eval_result)) + 1)
        training_log_df.columns = ["l1"]
        training_log_df.index.name = "iter"
        training_log_df.to_csv(
            log_path / f"train_log_{DATA_VERSION}_{TRIAL_NO}_{type_}.csv")

        to_pickle(
            model_path /
            f"hold_out_model_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
            result_dict["model"])
    #
    #
    #     to_pickle(log_path / f"result_dict_{type_}_{seed}.pkl", result_dict)
    #     importance_path = log_path / f'importance_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv'
    #     result_dict["importance"].to_csv(importance_path, index=True)
    #
    # for type_, s in zip(X['type'].unique(), score_list):
    #     print(f"type {type_}, score: {s:0.5f}")

    if TRAIN_ALL_DATA or CV_FOLD:
        #########################################################################################################
        # create oof & submission file.
        sub = pd.read_csv('../input/sample_submission.csv')
        sub['scalar_coupling_constant'] = X_short_test['prediction']
        sub.to_csv(submit_path /
                   f'submission_t_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                   index=False)
        print(sub.head())
        send_message(f"finish all_data train_{DATA_VERSION}_{TRIAL_NO}_{seed}")

    if CV_FOLD:
        oof_log_mae = group_mean_log_mae(X_short['target'],
                                         X_short['oof'],
                                         X_short['type'],
                                         floor=1e-9)
        print(f"oof_log_mae: {oof_log_mae}")

        df_oof = pd.DataFrame(index=train.id)
        df_oof["scalar_coupling_constant"] = X_short['oof']
        df_oof.to_csv(submit_path /
                      f'oof_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                      index=True)
        send_message(
            f"finish train_{DATA_VERSION}_{TRIAL_NO}_{seed}, oof_log_mae: {oof_log_mae}"
        )
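# For reference, a minimal sketch of what group_mean_log_mae is assumed to
# compute: the mean over coupling types of the log of the per-type MAE,
# floored so log() stays finite. This is an assumption about the helper,
# not the source's implementation.
import numpy as np

def group_mean_log_mae_sketch(y_true, y_pred, types, floor=1e-9):
    # y_true, y_pred, types: index-aligned pandas Series.
    maes = (y_true - y_pred).abs().groupby(types).mean()
    return np.log(maes.clip(lower=floor)).mean()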