Example #1
def load_from_run_id(run_id: str, to_rank: bool = False):
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if to_rank:
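        # rankdata converts predictions to ranks; dividing by the length scales them into (0, 1],
        # a common trick for blending models whose outputs are on different scales.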
        oof = rankdata(oof) / len(oof)
        pred = rankdata(pred) / len(pred)
    return (oof, pred)
Example #2
 def load(self):
     X_train = Data.load(
         join(self.output_dir, f"X_train_{self.run_name}.pkl"))
     y_train = Data.load(
         join(self.output_dir, f"y_train_{self.run_name}.pkl"))
     X_test = Data.load(join(self.output_dir,
                             f"X_test_{self.run_name}.pkl"))
     return X_train, X_test, y_train
Example #3
def load_from_run_id(run_id: str, to_rank: bool = False):
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
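    # run015 apparently stores its predictions as a 2-D array; flatten to 1-D before further processing.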
    if run_id in ('run015',):
        oof = oof.reshape(-1, )
        pred = pred.reshape(-1, )
    if to_rank:
        oof = rankdata(oof) / len(oof)
        pred = rankdata(pred) / len(pred)
    return (oof, pred)
Example #4
 def submission(self) -> None:
     pred = Data.load(f"../output/pred/{self.run_name}-test.pkl")
     sub = pd.read_csv(self.sample_submission)
     if self.evaluation_metric == "log_loss":
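          # pred holds per-class probabilities here; submit the index of the most probable class.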
         sub[self.cols_definition["target_col"]] = np.argmax(pred, axis=1)
     else:
         oof = Data.load(f"../output/pred/{self.run_name}-train.pkl")
         oof = np.array([convert(v) for v in oof])
         pred = np.array([convert(v) for v in pred])
         sub[self.cols_definition["target_col"]] = pred
     sub[self.cols_definition["target_col"]] = sub[
         self.cols_definition["target_col"]].astype(float)
     sub.to_csv(f"../output/submissions/submission_{self.run_name}.csv",
                index=False)
Example #5
def load_oof_from_run_id(run_id: str, to_rank: bool = False):
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    if run_id in ('run091', 'run092', 'run097'):
        oof = oof.reshape(-1, )
    if to_rank:
        oof = rankdata(oof) / len(oof)
    return oof
Example #6
def load_pred_from_run_id(run_id: str, to_rank: bool = False):
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if run_id in ('run091', 'run092', 'run097'):
        pred = pred.reshape(-1, )
    if to_rank:
        pred = rankdata(pred) / len(pred)
    return pred
Example #7
 def submission(self):
     pred = Data.load(f'../output/pred/{self.run_name}-test.pkl')
     sub = pd.read_csv(self.sample_submission)
     if self.advanced and 'predict_exp' in self.advanced:
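          # 'predict_exp' presumably indicates the target was trained on a log1p scale; expm1 inverts that transform.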
         sub[self.cols_definition['target_col']] = np.expm1(pred)
     else:
         sub[self.cols_definition['target_col']] = pred
     sub.to_csv(f'../output/submissions/submission_{self.run_name}.csv', index=False)
Example #8
 def submission(self) -> None:
     pred = Data.load(f"../output/pred/{self.run_name}-test.pkl")
     sub = pd.read_csv(self.sample_submission)
     if self.advanced and "predict_exp" in self.advanced:
         sub[self.cols_definition["target_col"]] = np.exp(pred)
     else:
         sub[self.cols_definition["target_col"]] = pred
     sub.to_csv(f"../output/submissions/submission_{self.run_name}.csv",
                index=False)
Example #9
    def __init__(self, configs: dict, cv):
        self.exp_name = configs['exp_name']
        self.run_name = configs['run_name']
        self.run_id = None
        self.fe_name = configs['fe_name']
        self.X_train = Data.load(f"../input/X_train_{configs['fe_name']}.pkl")
        self.y_train = Data.load(f"../input/y_train_{configs['fe_name']}.pkl")
        self.X_test = Data.load(f"../input/X_test_{configs['fe_name']}.pkl")
        self.evaluation_metric = configs['evaluation_metric']
        self.params = configs['params']
        self.cols_definition = configs['cols_definition']
        self.cv = cv
        self.sample_submission = configs['data']['sample_submission']
        self.description = configs['description']
        self.advanced = configs['advanced'] if 'advanced' in configs else None

        if configs['model_name'] in models_map:
            self.model_cls = models_map[configs['model_name']]
        else:
            raise ValueError(f"unknown model_name: {configs['model_name']}")
Example #10
    def __init__(self, configs: Dict, cv) -> None:  # type: ignore
        self.exp_name = configs["exp_name"]
        self.run_name = configs["run_name"]
        self.run_id = None
        self.fe_name = configs["fe_name"]
        self.X_train = Data.load(f"../input/X_train_{configs['fe_name']}.pkl")
        self.y_train = Data.load(f"../input/y_train_{configs['fe_name']}.pkl")
        self.X_test = Data.load(f"../input/X_test_{configs['fe_name']}.pkl")
        self.evaluation_metric = configs["evaluation_metric"]
        self.params = configs["params"]
        self.cols_definition = configs["cols_definition"]
        self.cv = cv
        self.sample_submission = configs["data"]["sample_submission"]
        self.description = configs["description"]
        self.advanced = configs["advanced"] if "advanced" in configs else None

        if configs["model_name"] in models_map.keys():
            self.model_cls = models_map[configs["model_name"]]
        else:
            raise ValueError
Example #11
import japanize_matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

from ayniy.utils import Data

if __name__ == '__main__':
    oof = Data.load('../output/pred/run003-train.pkl')

    train_player = pd.read_csv('../input/train_player.csv')
    train_pitch = pd.read_csv('../input/train_pitch.csv')
    train_pitch = train_pitch[train_pitch['試合種別詳細'] != 'パ・リーグ公式戦'].reset_index(
        drop=True)

    # Join pitcher information
    train = pd.merge(train_pitch,
                     train_player,
                     left_on=['年度', '投手ID'],
                     right_on=['年度', '選手ID'],
                     how='inner')

    # Join batter information
    train = pd.merge(train,
                     train_player,
                     left_on=['年度', '打者ID'],
                     right_on=['年度', '選手ID'],
                     how='inner',
                     suffixes=('_p', '_b'))

    X_train, _, _, _ = train_test_split(train.drop('試合種別詳細', axis=1),
Example #12
import numpy as np
import pandas as pd
import yaml

from ayniy.model.model_cat import ModelCatRegressor
from ayniy.model.runner import Runner
from ayniy.utils import Data


X_train = Data.load('../input/X_train_00.pkl')
y_train = Data.load('../input/y_train.pkl')
X_test = Data.load('../input/X_test_00.pkl')

X_train.drop(['fiscal_year'], axis=1, inplace=True)
X_test.drop(['fiscal_year'], axis=1, inplace=True)
y_train = np.log(np.sqrt(y_train))
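# np.log(np.sqrt(y)) equals 0.5 * np.log(y); the model is trained on this log-compressed target.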

f = open("configs/fe_00.yml", "r+")
configs = yaml.load(f)
categorical_cols = configs['cols_definition']['categorical_col']

params_cat = {
    'depth': 6,
    'learning_rate': 0.1,
    'iterations': 10000,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 777,
    'allow_writing_files': False,
    'task_type': "CPU",
    'early_stopping_rounds': 50
}
Example #13
from sklearn.metrics import roc_auc_score, confusion_matrix

from ayniy.utils import Data

if __name__ == '__main__':
    pred = Data.load('../output/pred/run003-test.pkl')
    y_test = Data.load('../input/y_test_fe003.pkl')
    print(roc_auc_score(y_test, pred))
    print(confusion_matrix(y_test, (pred > 0.5).astype(int)))
Example #14
        else:
            pred += d[1] * (1 - sum(weights))
    Data.dump(pred, f'../output/pred/{run_name}-test.pkl')
    return pred


def make_submission(pred, run_name: str):
    sub = pd.read_csv('../input/solafune-light/UploadFileTemplate.csv')
    sub['LandPrice'] = np.expm1(pred)
    sub.to_csv(f'../output/submissions/submission_{run_name}.csv', index=False)


run_ids = [
    'run004',
    'run005',
]
run_name = 'weight001'

if __name__ == '__main__':
    y_train = Data.load('../input/pickle/y_train_fe000.pkl')
    data = [load_from_run_id(ri, to_rank=False) for ri in run_ids]

    for d in data:
        print(np.sqrt(mean_squared_error(y_train, d[0])))

    init_state = [round(1 / len(data), 3) for _ in range(len(data) - 1)]
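    # Only n-1 blend weights are optimized; the last weight is implied as 1 - sum(weights),
    # as in the weighted sum at the top of this snippet.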
    result = minimize(f, init_state, method='Nelder-Mead')
    print('optimized CV: ', result['fun'])
    print('w: ', result['x'])
    make_submission(make_predictions(data, result['x']), run_name)
Example #15
 def load_model(self):
     model_path = os.path.join('../output/model',
                               f'{self.run_fold_name}.model')
     self.model = Data.load(model_path)
Example #16
            "../input/feather/count_encoding_interact.ftr",
            "../input/feather/aggregation.ftr",
            "../input/feather/target_encoding.ftr",
        ],
        target_col=target_col,
    )

    X_train_u = features.X_train
    y_train = features.y_train
    X_test_u = features.X_test

    fe_id_u = 'fe006'
    run_id = 'run021'
    N_FEATURES = 300

    X_train_u = Data.load(f'../input/pickle/X_train_{fe_id_u}.pkl')
    X_test_u = Data.load(f'../input/pickle/X_test_{fe_id_u}.pkl')
    fi = pd.read_csv(f'../output/importance/{run_id}-fi.csv')['Feature'][:N_FEATURES]
    X_train_u = X_train_u[fi]
    X_test_u = X_test_u[fi].reset_index(drop=True)
    X_train_u.columns = [f'u_{c}' for c in fi]
    X_test_u.columns = [f'u_{c}' for c in fi]

    fe_id = 'fe_siavrez'
    X_train = Data.load(f'../input/pickle/X_train_{fe_id}.pkl')
    X_test = Data.load(f'../input/pickle/X_test_{fe_id}.pkl')

    print(X_train.shape, X_train_u.shape)
    print(X_test.shape, X_test_u.shape)

    X_train = pd.concat([X_train, X_train_u], axis=1)
Example #17
def load_oof_from_run_id(run_id: str, to_rank: bool = False):
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    if to_rank:
        oof = rankdata(oof) / len(oof)
    return oof
Example #18
def load_pred_from_run_id(run_id: str, to_rank: bool = False):
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if to_rank:
        pred = rankdata(pred) / len(pred)
    return pred
Example #19
def load_pred_from_run_id(run_id: str):
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if run_id in ('run013', 'run014', 'run015'):
        pred = pred.reshape(-1, )
    return pred
Example #20
 def load_model(self) -> None:
     model_path = os.path.join("../output/model", f"{self.run_fold_name}.model")
     self.model = Data.load(model_path)
Example #21
    def train_fold(self, i_fold: int) -> Tuple[Any, Any, Any, Any]:
        """クロスバリデーションでのfoldを指定して学習・評価を行う

        他のメソッドから呼び出すほか、単体でも確認やパラメータ調整に用いる

        :param i_fold: foldの番号(すべてのときには'all'とする)
        :return: (モデルのインスタンス、レコードのインデックス、予測値、評価によるスコア)のタプル
        """
        # 学習データの読込
        X_train = self.X_train
        y_train = self.y_train

        # Set up the residuals
        if self.advanced and "ResRunner" in self.advanced:
            oof = Data.load(self.advanced["ResRunner"]["oof"])
            X_train["res"] = (y_train - oof).abs()

        # Set the training and validation data
        tr_idx, va_idx = self.load_index_fold(i_fold)
        X_tr, y_tr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
        X_val, y_val = X_train.iloc[va_idx], y_train.iloc[va_idx]

        # Downsample by residual: keep only rows whose residual is below the threshold
        if self.advanced and "ResRunner" in self.advanced:
            res_mask = (
                X_tr["res"] <
                self.advanced["ResRunner"]["res_threshold"]).values
            X_tr = X_tr.loc[res_mask]
            y_tr = y_tr.loc[res_mask]
            print(X_tr.shape)
            X_tr.drop("res", axis=1, inplace=True)
            X_val.drop("res", axis=1, inplace=True)

        # Pseudo Labeling: add confidently predicted test rows to the training data
        if self.advanced and "PseudoRunner" in self.advanced:
            y_test_pred = Data.load(
                self.advanced["PseudoRunner"]["y_test_pred"])
            if "pl_threshold" in self.advanced["PseudoRunner"]:
                X_add = self.X_test.loc[
                    (y_test_pred < self.
                     advanced["PseudoRunner"]["pl_threshold"])
                    | (y_test_pred > 1 -
                       self.advanced["PseudoRunner"]["pl_threshold"])]
                y_add = pd.DataFrame(y_test_pred).loc[
                    (y_test_pred < self.
                     advanced["PseudoRunner"]["pl_threshold"])
                    | (y_test_pred > 1 -
                       self.advanced["PseudoRunner"]["pl_threshold"])]
                y_add = pd.DataFrame(
                    ([1 if ya > 0.5 else 0 for ya in y_add[0]]))
            elif "pl_threshold_neg" in self.advanced["PseudoRunner"]:
                X_add = self.X_test.loc[
                    (y_test_pred < self.
                     advanced["PseudoRunner"]["pl_threshold_neg"])
                    | (y_test_pred > self.
                       advanced["PseudoRunner"]["pl_threshold_pos"])]
                y_add = pd.DataFrame(y_test_pred).loc[
                    (y_test_pred < self.
                     advanced["PseudoRunner"]["pl_threshold_neg"])
                    | (y_test_pred > self.
                       advanced["PseudoRunner"]["pl_threshold_pos"])]
                y_add = pd.DataFrame(
                    ([1 if ya > 0.5 else 0 for ya in y_add[0]]))
            else:
                X_add = self.X_test
                y_add = pd.DataFrame(y_test_pred)
            print(f"added X_test: {len(X_add)}")
            X_tr = pd.concat([X_tr, X_add])
            y_tr = pd.concat([y_tr, y_add])

        # Train the model
        model = self.build_model(i_fold)
        model.train(X_tr, y_tr, X_val, y_val, self.X_test)  # type: ignore

        # Predict and evaluate on the validation data
        pred_val = model.predict(X_val)

        if self.evaluation_metric == "log_loss":
            score = log_loss(y_val, pred_val, eps=1e-15, normalize=True)
        elif self.evaluation_metric == "mean_absolute_error":
            score = mean_absolute_error(y_val, pred_val)
        elif self.evaluation_metric == "rmse":
            score = np.sqrt(mean_squared_error(y_val, pred_val))
        elif self.evaluation_metric == "auc":
            score = roc_auc_score(y_val, pred_val)
        elif self.evaluation_metric == "prauc":
            score = average_precision_score(y_val, pred_val)

        # Return the model, validation indices, predictions, and score
        return model, va_idx, pred_val, score
Example #22
# sub = pd.read_csv('../input/atmaCup5__sample_submission.csv')
# train = pd.read_csv('../input/train.csv')
# test = pd.read_csv('../input/test.csv')
# fitting = pd.read_csv('../input/fitting.csv')

# train = pd.merge(train, fitting, on='spectrum_id', how='inner')
# test = pd.merge(test, fitting, on='spectrum_id', how='inner')

# train.to_csv('../input/train_fitting.csv', index=False)
# test.to_csv('../input/test_fitting.csv', index=False)

add_tr = pd.read_csv('../input/additional_features_train.csv')
add_te = pd.read_csv('../input/additional_features_test.csv')

fe005_tr = Data.load('../input/X_train_fe005.pkl')
fe005_te = Data.load('../input/X_test_fe005.pkl')

# fe001_tr = Data.load('../input/X_train_fe001.pkl')
# fe001_te = Data.load('../input/X_test_fe001.pkl')
# top10_tr = Data.load('../input/X_train_fe004_top10.pkl')
# top10_te = Data.load('../input/X_test_fe004_top10.pkl')

# top10_tr, top10_te = standerize(top10_tr, top10_te, {'encode_col': top10_tr.columns})
# print(top10_tr.head())

train_fitting_ef_add = pd.concat([fe005_tr, add_tr], axis=1)
test_fitting_ef_add = pd.concat([fe005_te, add_te], axis=1)

fe_name = 'fe005_add'
Data.dump(train_fitting_ef_add, f'../input/X_train_{fe_name}.pkl')
Example #23
def load_oof_from_run_id(run_id: str):
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    if run_id in ('run013', 'run014', 'run015'):
        oof = oof.reshape(-1, )
    return oof
Example #24
from ayniy.utils import Data
from sklearn.preprocessing import StandardScaler

if __name__ == "__main__":

    fe_id = "fe000"
    fe_name = f"{fe_id}_nn_small"

    X_train = Data.load(f"../input/pickle/X_train_{fe_id}.pkl")
    y_train = Data.load(f"../input/pickle/y_train_{fe_id}.pkl")
    X_test = Data.load(f"../input/pickle/X_test_{fe_id}.pkl")

    del_col = []
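    # Fill missing values and standardize each column; columns that StandardScaler
    # rejects are collected in del_col and dropped afterwards.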
    for c in X_train.columns:
        X_train[c].fillna(-1, inplace=True)
        X_test[c].fillna(-1, inplace=True)
        try:
            prep = StandardScaler()
            X_train[c] = prep.fit_transform(X_train[[c]])
            X_test[c] = prep.transform(X_test[[c]])
        except ValueError:
            del_col.append(c)
    print(del_col)
    print(len(del_col))
    X_train.drop(del_col, axis=1, inplace=True)
    X_test.drop(del_col, axis=1, inplace=True)
    print(X_train.shape)

    X_train = X_train.loc[:100]
    y_train = y_train.loc[:100]
Example #25
python select_features.py --n 100
"""
import argparse

import pandas as pd

from ayniy.utils import Data

parser = argparse.ArgumentParser()
parser.add_argument('--n')
args = parser.parse_args()

fe_id = 'fe005'
run_id = 'run046'
N_FEATURES = int(args.n)
fe_name = f'fe005_top{N_FEATURES}'

X_train = Data.load(f'../input/X_train_{fe_id}.pkl')
y_train = Data.load(f'../input/y_train_{fe_id}.pkl')
X_test = Data.load(f'../input/X_test_{fe_id}.pkl')

fi = pd.read_csv(
    f'../output/importance/{run_id}-fi.csv')['Feature'][:N_FEATURES]
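# The top-N feature names from this importance ranking are used to subset the train and test sets below.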

X_train = X_train[fi]
X_test = X_test[fi]

Data.dump(X_train, f'../input/X_train_{fe_name}.pkl')
Data.dump(y_train, f'../input/y_train_{fe_name}.pkl')
Data.dump(X_test, f'../input/X_test_{fe_name}.pkl')
Example #26
import pandas as pd

from ayniy.utils import Data

if __name__ == '__main__':
    ef_tr = pd.read_csv('../input/efficient_tr.csv')
    ef_te = pd.read_csv('../input/efficient_te.csv')

    fe001_top500_tr = Data.load('../input/X_train_fe001_top500.pkl')
    fe001_top500_te = Data.load('../input/X_test_fe001_top500.pkl')
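    # efficient_tr/te.csv presumably hold features from an image model; they are
    # concatenated column-wise with the top-500 feature set below.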

    train_tag = pd.concat([fe001_top500_tr, ef_tr], axis=1)
    test_tag = pd.concat([fe001_top500_te, ef_te], axis=1)

    fe_name = 'fe001_top500_ef'
    Data.dump(train_tag, f'../input/X_train_{fe_name}.pkl')
    Data.dump(test_tag, f'../input/X_test_{fe_name}.pkl')