def on_fe_end(self, state: State):
        """Write every feature set and the target array into ``state.feature_dir``.

        For each named feature and each of the ``"train"``/``"test"`` phases:

        * ``dict`` or ``csr_matrix`` payloads are written with ``savemat`` as
          ``{prefix}{name}_{phase}.mat``;
        * ``pd.DataFrame`` payloads are written with ``to_feather`` as
          ``{prefix}{name}_{phase}.ftr`` (float16 columns are first widened
          to float32, in place);
        * anything else raises ``NotImplementedError``.

        Finally ``state.target`` is stored with ``np.save`` as
        ``{prefix}main_target.npy``.
        """
        out_dir = state.feature_dir
        all_features = state.features

        for name, feature in all_features.items():
            for phase in ("train", "test"):
                payload = feature[phase]

                if isinstance(payload, (dict, csr_matrix)):
                    mat_name = f"{self.prefix}{name}_{phase}.mat"
                    with utils.timer("Saving " + mat_name, state.logger):
                        savemat(out_dir / mat_name, {name: payload})
                    continue

                if not isinstance(payload, pd.DataFrame):
                    raise NotImplementedError

                ftr_name = f"{self.prefix}{name}_{phase}.ftr"
                with utils.timer("Saving " + ftr_name, state.logger):
                    # Widen half-precision columns before writing
                    # (presumably because Feather cannot store float16 —
                    # TODO confirm). This mutates the stored frame.
                    for col in payload.columns:
                        if payload[col].dtype == "float16":
                            payload[col] = payload[col].astype("float32")
                    payload.to_feather(out_dir / ftr_name)

        target = state.target
        target_name = f"{self.prefix}main_target.npy"
        with utils.timer("Saving " + target_name, state.logger):
            np.save(out_dir / target_name, target)
示例#2
0
def load_features(config: dict) -> Tuple[cudf.DataFrame, cudf.DataFrame]:
    """Load the configured feather feature files and join them column-wise.

    For every name in ``config["features"]`` the files
    ``{feature_dir}/{name}_train.ftr`` and ``{feature_dir}/{name}_test.ftr``
    are read with cuDF when they exist; missing files are skipped silently.

    The train side prints any column name that occurs in more than one file
    and asserts that all train column names are unique. The test side is not
    checked.

    Returns:
        ``(x_train, x_test)`` — the concatenated train and test frames.
    """
    feature_path = config["dataset"]["feature_dir"]

    with timer("load train"):
        train_feats = []
        for f in config["features"]:
            train_file = f"{feature_path}/{f}_train.ftr"
            if Path(train_file).exists():
                train_feats.append(cudf.read_feather(train_file))

        cols = [
            col for feats in train_feats for col in feats.columns.tolist()
        ]
        duplicated = [
            k for k, v in collections.Counter(cols).items() if v > 1
        ]
        print(f"duplicated cols: {duplicated}")
        assert len(cols) == len(np.unique(cols))

        x_train = cudf.concat(train_feats, axis=1, sort=False)

    with timer("load test"):
        test_feats = []
        for f in config["features"]:
            test_file = f"{feature_path}/{f}_test.ftr"
            if Path(test_file).exists():
                test_feats.append(cudf.read_feather(test_file))

        x_test = cudf.concat(test_feats, axis=1, sort=False)

    return x_train, x_test
示例#3
0
    def create_features(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ):
        """Build group-by based features on the stacked train+test frame.

        The two inputs are copied, concatenated with ``cudf.concat`` and run
        through ``GroupbyTransformer``, ``DiffGroupbyTransformer`` and
        ``RatioGroupbyTransformer`` (all configured by ``groupby_dict``).
        Only the columns that were not present in ``train_df`` are kept; the
        first ``len(train_df)`` rows go to ``self.train`` and the rest to
        ``self.test``, both converted with ``to_pandas()``.
        """
        with timer("load data"):
            train = train_df.copy()
            test = test_df.copy()
            len_train = len(train)
            org_cols = train.columns.tolist()

        with timer("concat train and test"):
            total = cudf.concat([train, test], ignore_index=True)
            # Drop the copies right away to release memory.
            del train, test
            gc.collect()

        with timer("make feats"):
            total = GroupbyTransformer(groupby_dict).transform(total)

            total = DiffGroupbyTransformer(groupby_dict).transform(total)
            total = reduce_mem_usage(total)

            total = RatioGroupbyTransformer(groupby_dict).transform(total)
            total = reduce_mem_usage(total)

        known_cols = set(org_cols)
        new_cols = [col for col in total.columns if col not in known_cols]
        generated = total[new_cols]

        train = generated.iloc[:len_train].reset_index(drop=True)
        test = generated.iloc[len_train:].reset_index(drop=True)

        with timer("end"):
            self.train = train.reset_index(drop=True).to_pandas()
            self.test = test.reset_index(drop=True).to_pandas()
示例#4
0
def load_features(config: dict) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the configured pickled feature frames and join them column-wise.

    For each phase (``train`` then ``test``) and each name in
    ``config["features"]``, ``{feature_dir}/{name}_{phase}.pkl`` is loaded
    with ``load_pickle`` if the file exists; missing files are skipped.

    Returns:
        ``(x_train, x_test)`` — the concatenated train and test frames.
    """
    feature_path = config["dataset"]["feature_dir"]

    joined = {}
    for phase in ("train", "test"):
        with timer(f"load {phase}"):
            frames = []
            for f in config["features"]:
                pkl_path = f"{feature_path}/{f}_{phase}.pkl"
                if Path(pkl_path).exists():
                    frames.append(load_pickle(pkl_path))
            joined[phase] = pd.concat(frames, axis=1, sort=False)

    return joined["train"], joined["test"]
    def create_features(
        self,
        train_df: cudf.DataFrame,
        test_df: cudf.DataFrame,
    ):
        """Build ``CategoryVectorizer`` features for several component counts.

        Train and test are stacked, then a ``CategoryVectorizer`` over
        ``cat_var_list`` backed by ``LatentDirichletAllocation`` is applied
        once for each of 5, 10, 20 and 30 components. The outputs are
        concatenated column-wise and split back by row position into
        ``self.train`` (first ``len(train_df)`` rows) and ``self.test``.
        """
        with timer("load data"):
            train = train_df.copy()
            test = test_df.copy()
            len_train = len(train)

        with timer("concat train and test"):
            total = cudf.concat([train, test], ignore_index=True)

        with timer("category vectorizer"):
            per_size_feats = []
            for n_components in (5, 10, 20, 30):
                vectorizer = CategoryVectorizer(
                    categorical_columns=cat_var_list,
                    n_components=n_components,
                    transformer=LatentDirichletAllocation(
                        n_components=n_components),
                )
                per_size_feats.append(vectorizer.transform(total))

            new_features = cudf.concat(per_size_feats, axis=1)

        with timer("end"):
            train_part = new_features.iloc[:len_train]
            test_part = new_features.iloc[len_train:]
            self.train = train_part.reset_index(drop=True)
            self.test = test_part.reset_index(drop=True)
    def create_features(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ):
        """Build label-encoded combinations of the categorical columns.

        Train and test are stacked (keeping the row position in an ``index``
        column). For r = 2, 3 and 4, ``xfeat.ConcatCombination`` is applied
        to the stringified ``cat_cols``. Every resulting column is
        label-encoded and cast to ``category``. The new columns are split
        back by row position into ``self.train`` and ``self.test``.
        """
        with timer("load data"):
            train = train_df.copy()
            test = test_df.copy()
            len_train = len(train)
            org_cols = train.columns.tolist()

        with timer("concat train and test"):
            total = cudf.concat([train, test], ignore_index=True).reset_index()
            del train, test
            gc.collect()

        with timer("combi cats"):
            combined = []
            for r in (2, 3, 4):
                combiner = xfeat.ConcatCombination(drop_origin=True, r=r)
                combined.append(
                    combiner.fit_transform(
                        total[cat_cols].astype(str).fillna("none")))
            new_cat_df = cudf.concat(combined, axis="columns")

            for col in new_cat_df.columns:
                encoded = LabelEncoder().fit_transform(new_cat_df[col])
                new_cat_df[col] = encoded.astype("category")

            total = cudf.concat([total, new_cat_df], axis="columns")

        with timer("end"):
            # Restore the original row order before splitting by position.
            total = total.sort_values("index")
            excluded = org_cols + ["index"]
            new_cols = [col for col in total.columns if col not in excluded]

            self.train = total[new_cols].iloc[:len_train].reset_index(
                drop=True)
            self.test = total[new_cols].iloc[len_train:].reset_index(
                drop=True)
示例#7
0
 def on_data_end(self, state: State):
     """Replace each frame in ``state.dataframes`` with its memory-reduced form.

     Every value is passed through ``utils.reduce_mem_usage`` (verbose,
     logging to ``state.logger``) and stored back under the same key.
     """
     with utils.timer("Data Compressing", state.logger):
         frames = state.dataframes
         for key in frames:
             compressed = utils.reduce_mem_usage(
                 frames[key], verbose=True, logger=state.logger)
             frames[key] = compressed
示例#8
0
    def fit(
        self,
        train_loader: DataLoader,
        valid_loader: DataLoader,
        with_validation: bool = False,
    ) -> None:
        """Run ``self.config.n_epochs`` training epochs.

        Each epoch trains on ``train_loader``. When ``with_validation`` is
        true, the loss used afterwards is the validation loss from
        ``valid_loader`` instead of the training loss. If that loss improves
        on ``self.best_summary_loss``, the best value is updated and, when
        ``self.start_time`` is set, the model is switched to eval mode and
        saved as ``best-checkpoint_cv{cv_num}.bin`` under ``self.log_path``.
        The scheduler is stepped on the same loss when
        ``self.config.validation_scheduler`` is truthy.
        """
        for _ in range(self.config.n_epochs):
            epoch_label = f"CV {self.cv_num} epoch {self.epoch}"

            with timer(epoch_label, mlflow_on=self.mlflow_on):
                epoch_loss = self._train_one_epoch(train_loader)
                if with_validation:
                    # The validation loss replaces the training loss below.
                    epoch_loss = self._validation(valid_loader)

                improved = epoch_loss.avg < self.best_summary_loss
                if improved:
                    self.best_summary_loss = epoch_loss.avg
                    if self.start_time is not None:
                        self.train_model.eval()
                        self.save(
                            f"{self.log_path}/best-checkpoint_cv{self.cv_num}.bin"
                        )

                if self.config.validation_scheduler:
                    self.scheduler.step(metrics=epoch_loss.avg)

                self.epoch += 1
示例#9
0
def fit_catboost(X, y, cv=None, params: dict = None, verbose=500):
    """Train one CatBoost model per CV fold and collect out-of-fold predictions.

    Args:
        X: Feature array, indexed positionally by the fold indices.
        y: Target array aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``. Defaults to a shuffled
            2-fold ``StratifiedKFold``.
        params: CatBoost parameters. Defaults to a deep copy of
            ``CAT_DEFAULT_PARAMS``.
        verbose: Accepted for signature parity with the other ``fit_*``
            helpers; not used by this function.

    Returns:
        ``(oof_pred, models, score)`` — the out-of-fold class-1
        probabilities, the fitted model of each fold, and the ROC AUC of
        ``oof_pred`` against ``y``.
    """
    if params is None:
        params = deepcopy(CAT_DEFAULT_PARAMS)

    if cv is None:
        cv = StratifiedKFold(n_splits=2, shuffle=True)
    models = []
    # Zero array shaped like the training target. It must be float: with an
    # integer dtype the predicted probabilities would be truncated on
    # assignment. The builtin ``float`` is used because the ``np.float``
    # alias was removed in NumPy 1.24.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv.split(X, y)):
        # Cross-validation: the cv instance splits the training data into
        # a train part and a validation part for this fold.
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = CatBoost(params=params)

        with timer(prefix='fit fold={} '.format(i + 1)):
            clf_train = Pool(x_train, y_train)
            clf_val = Pool(x_valid, y_valid)
            clf.fit(clf_train, eval_set=[clf_val])

        # Keep only the probability of the positive class.
        pred_i = clf.predict(x_valid, prediction_type='Probability')[:, 1]
        oof_pred[idx_valid] = pred_i
        models.append(clf)

        print(f'Fold {i} AUC: {roc_auc_score(y_valid, pred_i):.4f}')

    score = roc_auc_score(y, oof_pred)
    # The backslash is escaped explicitly; the printed text is unchanged.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models, score
示例#10
0
def shrink_dateframe(x_trn: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Apply the configured row filters to the training frame.

    ``shrink_by_release`` runs first when ``config["shrink_by_release"]`` is
    truthy; ``shrink_by_date`` always runs afterwards.
    """
    with timer("shrink datafrme"):
        shrunk = (shrink_by_release(x_trn)
                  if config["shrink_by_release"] else x_trn)
        shrunk = shrink_by_date(shrunk, config)

    return shrunk
示例#11
0
def shrink_by_release(x_trn: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows where ``wm_yr_wk >= release`` and renumber the index.

    The frame must contain the ``wm_yr_wk`` and ``release`` columns. The
    shape before and after filtering is logged at INFO level.
    """
    with timer("shrink by release"):
        logging.info(f"before train shape : {x_trn.shape}")

        released_rows = x_trn.query("wm_yr_wk >= release")
        x_trn = released_rows.reset_index(drop=True)

        logging.info(f"after train shape : {x_trn.shape}")

    return x_trn
示例#12
0
    def run_loader(self):
        """Run every loading/enrichment step in order and return ``self.data``.

        The unconditional steps run first, each inside its own timer. Team
        ratings and assists are added only when ``self.add_team_ratings`` /
        ``self.add_team_assists`` are truthy. Finally
        ``self.data['all_player_data']`` is passed through ``filter_data``
        with the mask ``minutes > 30``.
        """
        # TODO implement search of cache for data
        always_run = (
            ("loading fpl summary", "load_fpl_summary"),
            ("adding maps", "add_maps"),
            ("loading scores", "load_match_scores"),
            ("adding player IDs", "add_player_id_list"),
            ("loading player data", "load_player_data"),
            ("adding team IDs", "add_player_team_id_to_player_data"),
            ("adding player positions", "add_player_positions"),
        )
        for label, method_name in always_run:
            with timer(label, __file__):
                getattr(self, method_name)()

        # TODO: implement player % filtering

        if self.add_team_ratings:
            with timer("calculating attack and defence scores", __file__):
                self.add_att_def_scores_to_data()
            with timer("merging att def scores", __file__):
                self.merge_att_def_ratings_to_all_player_data()

        if self.add_team_assists:
            with timer("adding assists", __file__):
                self.add_assists_to_data()

        player_data = self.data['all_player_data']
        self.data['all_player_data'] = filter_data(
            player_data,
            "corresponding to rows where less than 30 minutes played",
            player_data.minutes > 30)

        return self.data
示例#13
0
 def _aggregate(self, dataframe):
     """Compute one aggregated frame per entry of ``self.param_dict``.

     For each parameter set the frame is grouped by ``key``, the ``var``
     columns are aggregated with ``agg``, and the result is given the column
     names ``key + self._get_feature_names(key, var, agg)``. The frames are
     collected in ``self.features`` (reset on every call).

     Returns:
         ``self``, to allow chaining.
     """
     with timer("aggregate"):
         self.features = []
         for param_dict in tqdm(self.param_dict):
             key, var, agg, _on = self._get_params(param_dict)
             needed_cols = list(set(key + var))
             out_names = self._get_feature_names(key, var, agg)

             grouped = dataframe[needed_cols].groupby(key)[var]
             aggregated = grouped.agg(agg).reset_index()
             aggregated.columns = key + out_names

             self.features.append(aggregated)
     return self
    def create_features(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ):
        """Build group-by features on log-transformed numeric columns.

        Train and test are stacked (keeping the row position in an ``index``
        column) and every column in ``num_var_list`` is replaced by its
        ``log1p``. The frame then goes through ``GroupbyTransformer``,
        ``DiffGroupbyTransformer`` and ``RatioGroupbyTransformer``. Only
        columns absent from ``train_df`` (and not ``index``) are kept; they
        are split back by row position into ``self.train`` and ``self.test``.
        """
        with timer("load data"):
            train = train_df.copy()
            test = test_df.copy()
            len_train = len(train)
            org_cols = train.columns.tolist()

        with timer("concat train and test"):
            total = cudf.concat([train, test], ignore_index=True).reset_index()
            del train, test
            gc.collect()

        with timer("log transform"):
            for sub_target in num_var_list:
                # log1p is computed on the pandas copy of the column.
                logged = np.log1p(total[sub_target].to_pandas())
                total[sub_target] = cudf.Series(logged)

        with timer("GroupbyTransformer"):
            total = GroupbyTransformer(groupby_dict).transform(total)

            total = DiffGroupbyTransformer(groupby_dict).transform(total)
            total = reduce_mem_usage(total)

            total = RatioGroupbyTransformer(groupby_dict).transform(total)
            total = reduce_mem_usage(total)

        with timer("end"):
            # Restore the original row order before splitting by position.
            total = total.sort_values("index")
            excluded = org_cols + ["index"]
            new_cols = [col for col in total.columns if col not in excluded]

            self.train = total[new_cols].iloc[:len_train].reset_index(
                drop=True)
            self.test = total[new_cols].iloc[len_train:].reset_index(
                drop=True)
    def on_fe_end(self, state: State):
        """Sort the columns of every DataFrame feature by label.

        Only features whose ``"train"`` entry is a ``pd.DataFrame`` are
        touched. Their ``"test"`` entry, when present, is sorted the same
        way. The feature mapping is written back to ``state.features``.
        """
        features = state.features

        for key in features:
            entry = features[key]
            if not isinstance(entry["train"], pd.DataFrame):
                continue

            with utils.timer(f"Sort columns of features `{key}`",
                             logger=state.logger):
                entry["train"] = entry["train"].sort_index(axis=1)
                if "test" in entry.keys():
                    entry["test"] = entry["test"].sort_index(axis=1)

        state.features = features
示例#16
0
def shrink_by_date_index(x_trn: pd.Series, config: dict) -> pd.Index:
    """Return the index labels of entries dated on or after the cutoff.

    ``x_trn`` must hold date values. The cutoff is built from
    ``config["params"]`` keys ``year``, ``month`` and ``day``. The shape
    before and after filtering is logged at INFO level.
    """
    with timer("shrink by date index"):
        logging.info(f"before train shape : {x_trn.shape}")

        params = config["params"]
        dates = pd.to_datetime(x_trn)
        cutoff = datetime.datetime(
            params["year"], params["month"], params["day"])
        kept = dates[dates >= cutoff]
        x_trn_idx = kept.index

        logging.info(f"after train shape : {kept.shape}")

    return x_trn_idx
示例#17
0
def shrink_by_date(x_trn: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Keep only rows dated on or after the configured cutoff.

    The frame must contain a ``date`` column. That column is converted to
    datetime on the frame that was passed in, so the caller's frame is
    modified as well. The cutoff is built from ``config["params"]`` keys
    ``year``, ``month`` and ``day``. The result has a fresh 0..n-1 index.
    """
    with timer("shrink by date"):
        logging.info(f"before train shape : {x_trn.shape}")

        params = config["params"]
        x_trn["date"] = pd.to_datetime(x_trn["date"])
        cutoff = datetime.datetime(
            params["year"], params["month"], params["day"])
        on_or_after = x_trn["date"] >= cutoff
        x_trn = x_trn[on_or_after].reset_index(drop=True)

        logging.info(f"after train shape : {x_trn.shape}")

    return x_trn
示例#18
0
 def __init__(self,
              meta_epoch,
              valid_check_epoch,
              patience,
              valid_tasks,
              batch_size,
              first_eval=1,
              logger=logger.get_logger('base')) -> None:
     """Store the training schedule settings and start the run timer.

     All arguments are stored on the instance under the same name. The
     default ``logger`` is created once, when the function is defined.
     ``training_mode`` and ``training_stage`` start at 0 and ``saving`` at
     ``False``.
     """
     super().__init__()

     # Settings supplied by the caller.
     self.logger = logger
     # Second argument is 60 * 100; presumably a time budget — TODO confirm
     # the unit against ``timer.initialize``.
     time_budget = 60 * 100
     self.timer = timer()
     self.timer.initialize(time.time(), time_budget)
     self.meta_epoch = meta_epoch
     self.valid_check_epoch = valid_check_epoch
     self.patience = patience
     self.valid_tasks = valid_tasks
     self.batch_size = batch_size
     self.first_eval = first_eval

     # Mutable run state.
     self.training_mode = 0
     self.training_stage = 0
     self.saving = False
示例#19
0
 def run(
     self,
     train_df: XDataFrame,
     test_df: Optional[XDataFrame] = None,
     log: bool = False,
 ):
     """Create the features and apply the prefix/suffix to their column names.

     After ``create_features`` has run, the column labels of ``self.train``,
     ``self.valid`` and ``self.test`` are converted to ``str`` and wrapped as
     ``prefix + name + suffix``. A non-empty ``self.prefix`` or
     ``self.suffix`` gets an underscore appended first.

     Returns:
         ``self``, to allow chaining.
     """
     with timer(self.name, log=log):
         self.create_features(train_df, test_df=test_df)
         prefix = self.prefix + "_" if self.prefix else ""
         # NOTE(review): the underscore goes after the suffix text, so names
         # end in "<suffix>_" — confirm this is the intended format.
         suffix = self.suffix + "_" if self.suffix else ""

         split_names = ("train", "valid", "test")

         # First pass: make every column label a string.
         for split_name in split_names:
             frame = getattr(self, split_name)
             frame.columns = pd.Index([str(c) for c in frame.columns])

         # Second pass: wrap every label with the prefix and suffix.
         for split_name in split_names:
             frame = getattr(self, split_name)
             frame.columns = prefix + frame.columns + suffix
     return self
示例#20
0
def fit_xgb(X, y, cv=None, params: dict = None, verbose=500):
    """Train one XGBoost booster per CV fold and collect out-of-fold predictions.

    Args:
        X: Feature array, indexed positionally by the fold indices.
        y: Target array aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``. Defaults to a shuffled
            2-fold ``StratifiedKFold``.
        params: XGBoost parameters. Defaults to a deep copy of
            ``XGB_DEFAULT_PARAMS``. Must contain ``early_stopping_rounds``
            and ``num_boost_round``, which are forwarded to ``xgb.train``.
        verbose: Passed to ``xgb.train`` as ``verbose_eval``.

    Returns:
        ``(oof_pred, models, score)`` — the out-of-fold predictions, the
        booster of each fold, and the ROC AUC of ``oof_pred`` against ``y``.
    """
    if params is None:
        params = deepcopy(XGB_DEFAULT_PARAMS)

    if cv is None:
        cv = StratifiedKFold(n_splits=2, shuffle=True)
    models = []
    # Zero array shaped like the training target. It must be float: with an
    # integer dtype the predictions would be truncated on assignment. The
    # builtin ``float`` is used because the ``np.float`` alias was removed
    # in NumPy 1.24.
    oof_pred = np.zeros_like(y, dtype=float)

    for i, (idx_train, idx_valid) in enumerate(cv.split(X, y)):
        # Cross-validation: the cv instance splits the training data into
        # a train part and a validation part for this fold.
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        with timer(prefix='fit fold={} '.format(i + 1)):

            print(x_train.shape, y_train.shape)
            print(x_valid.shape, y_valid.shape)

            dtrain = xgb.DMatrix(x_train, label=y_train)
            dval = xgb.DMatrix(x_valid, label=y_valid)
            evals = [(dtrain, 'train'), (dval, 'eval')]

            clf = xgb.train(
                params,
                dtrain,
                evals=evals,
                early_stopping_rounds=params['early_stopping_rounds'],
                num_boost_round=params['num_boost_round'],
                verbose_eval=verbose)

        pred_i = clf.predict(dval)
        oof_pred[idx_valid] = pred_i
        models.append(clf)

        print(f'Fold {i} AUC: {roc_auc_score(y_valid, pred_i):.4f}')

    score = roc_auc_score(y, oof_pred)
    # The backslash is escaped explicitly; the printed text is unchanged.
    print('FINISHED \\ whole score: {:.4f}'.format(score))
    return oof_pred, models, score
    def on_fe_end(self, state: State):
        """Join all features into a single ``"main"`` feature per phase.

        If any feature's ``"train"`` entry is a ``dict`` or ``csr_matrix``,
        everything is stacked as sparse matrices: DataFrames are converted
        with ``csr_matrix(values)``, dict values are appended one by one, and
        entries of any other type are skipped. Otherwise the entries are
        joined with ``pd.concat(axis=1)``.

        The result is stored as ``state.features["main"]``. When
        ``self.delete_original`` is truthy, every other feature is deleted
        and the garbage collector is run.
        """
        features = state.features

        as_sparse = any(
            isinstance(feature["train"], (dict, csr_matrix))
            for feature in features.values())

        main_feature = {}
        with utils.timer("Concatenating `main` features", state.logger):
            for phase in ("train", "test"):
                if as_sparse:
                    sparse_matrices = []
                    for f in features.values():
                        part = f[phase]
                        if isinstance(part, pd.DataFrame):
                            sparse_matrices.append(csr_matrix(part.values))
                        elif isinstance(part, dict):
                            sparse_matrices.extend(part.values())
                        elif isinstance(part, csr_matrix):
                            sparse_matrices.append(part)
                    main_feature[phase] = hstack(sparse_matrices).tocsr()
                else:
                    dense_parts = [f[phase] for f in features.values()]
                    main_feature[phase] = pd.concat(dense_parts, axis=1)
        state.features["main"] = main_feature

        if self.delete_original:
            # Everything except the freshly built "main" entry is dropped.
            stale_keys = list(features.keys())
            stale_keys.remove("main")

            for stale_key in stale_keys:
                del state.features[stale_key]

            gc.collect()
示例#22
0
    def run(self):
        """Read every configured data file into ``self.state.dataframes``.

        For each entry of ``self.state.config`` the pandas reader name and
        its kwargs come from ``data.file_open_method``. The kwargs are
        extended with the required columns and, when stats are recorded for
        the file, with dtypes. The frame and its role are stored under
        ``str(filepath)``. Callbacks run before and after the loop.

        Raises:
            NotImplementedError: for an unsupported reader, or for
                ``read_csv`` with ``mode == "large"``.
        """
        self._run_callbacks(phase="start")

        direct_readers = {"read_parquet", "read_pickle", "read_feather"}

        for config in self.state.config:
            method, kwargs = data.file_open_method(config)
            columns = data.required_columns(config)
            if columns is not None:
                kwargs["columns"] = columns

            filepath = Path(config["dir"]) / config["name"]
            path_key = str(filepath)

            stats_path = self.state.data_stats[path_key]
            if stats_path is not None:
                dtypes = data.open_stats(stats_path)["dtypes"]
                if columns is None:
                    # NOTE(review): set for every reader, unlike the
                    # column-restricted case below — confirm intended.
                    kwargs["dtype"] = dtypes
                else:
                    dtypes_cols = {col: dtypes[col] for col in columns}
                    if method == "read_csv" and config["mode"] == "normal":
                        kwargs["dtype"] = dtypes_cols

            with utils.timer("Reading " + config["name"], self.state.logger):
                if method in direct_readers:
                    should_read = True
                elif method == "read_csv":
                    mode = config["mode"]
                    if mode == "large":
                        raise NotImplementedError
                    # Any mode other than "normal" is skipped silently.
                    should_read = mode == "normal"
                else:
                    raise NotImplementedError

                if should_read:
                    df = getattr(pd, method)(filepath, **kwargs)
                    self.state.dataframes[path_key] = df
            self.state.dataframe_roles[path_key] = config["role"]

        self._run_callbacks(phase="end")
示例#23
0
 def _merge(self, dataframe, merge=True):
     """Attach every frame in ``self.features`` to ``dataframe``.

     With ``merge`` truthy each feature frame is left-joined on the ``on``
     keys from its parameter set (``cudf.merge`` when ``is_cudf`` says the
     frame is cuDF, ``DataFrame.merge`` otherwise). With ``merge`` falsy only
     the new feature columns are appended with ``pd.concat(axis=1)``.

     Returns:
         The extended frame.
     """
     with timer("merge"):
         paired = zip(self.param_dict, self.features)
         for param_dict, features in tqdm(paired, total=len(self.features)):
             key, var, agg, on = self._get_params(param_dict)

             if not merge:
                 new_features = self._get_feature_names(key, var, agg)
                 dataframe = pd.concat(
                     [dataframe, features[new_features]], axis=1)
             elif is_cudf(dataframe):
                 dataframe = cudf.merge(
                     dataframe, features, how="left", on=on)
             else:
                 dataframe = dataframe.merge(features, how="left", on=on)
     return dataframe
示例#24
0
 def __init__(self,
              meta_epoch,
              valid_check_epoch,
              patience,
              valid_tasks,
              batch_size,
              first_eval=1,
              logger=logger.get_logger('base')) -> None:
     """Store the training schedule settings and start the run timer.

     All arguments are stored on the instance under the same name. The
     default ``logger`` is created once, when the function is defined. A
     ``DataArgumentor`` is created only when ``self.use_data_augmentation``
     is truthy; it starts switched off (``turn_on_data_augmentor`` is
     ``False``).
     """
     super().__init__()

     self.logger = logger
     # Second argument is 60 * 1000; presumably a time budget — TODO confirm
     # the unit against ``timer.initialize``.
     time_budget = 60 * 1000
     self.timer = timer()
     self.timer.initialize(time.time(), time_budget)

     self.meta_epoch = meta_epoch
     self.valid_check_epoch = valid_check_epoch
     self.patience = patience
     self.valid_tasks = valid_tasks
     self.batch_size = batch_size
     self.first_eval = first_eval

     if self.use_data_augmentation:
         self.data_augmentor = DataArgumentor()
     else:
         self.data_augmentor = None
     self.turn_on_data_augmentor = False
示例#25
0
    def run_tuner(self):
        """Optimise the tuner parameters and return them in nested form.

        Logs the null-model likelihood and the initial parameters, then runs
        ``multioptimiser`` (when ``self.use_multicore_gradient`` is truthy)
        or ``minimize`` with the kwargs from ``self.minimize_args()``. On
        ``KeyboardInterrupt``/``SystemExit`` the parameters are torn down and
        the exception is re-raised. On success the optimum is written into
        ``self.tuner_params`` before tearing down.
        """
        with timer(f'Optimising with method {self.method}', __file__):
            null_likelihood = self._get_null_model_likelihood()
            logger.info(f"Null model likelihood: {null_likelihood:.4E}")
            self.tuner_params.log_initial()
            minimise_kwargs = self.minimize_args()

            try:
                if self.use_multicore_gradient:
                    optimise = multioptimiser
                else:
                    optimise = minimize
                optimal = optimise(**minimise_kwargs)
            except (KeyboardInterrupt, SystemExit) as exc:
                time.sleep(2)
                logger.info('Cancelling optimisation........')
                self.teardown_params()
                raise exc

            logger.info(
                f'Finished having run {optimal.nfev} evaluations '
                f'over {optimal.nit} iterations')
            self.tuner_params.update_using_opt_array(optimal.x)
            self.teardown_params()
            return self.tuner_params.nested_params
示例#26
0
def main():
    """Fine-tune a BERT or GPT-2 classifier and write prediction CSVs.

    Command line:
        --config_file  path to the config file (required); its stem names
                       the output directory under ``../output/``.
        --valid        hold out the last 200000 shuffled training rows,
                       score them and write ``valid_submission.csv``.

    The language model name is read from ``config.lm_model_name`` and the
    old-data switch from ``config.old_data``, because the parser defines no
    ``--lm_model`` or ``--old`` option. Training runs at most two epochs
    (asserted), each dropping a different slice of the negative samples.
    The model is saved to the output directory and ``submission.csv`` is
    written for the test set.
    """
    parser = ArgumentParser()
    parser.add_argument('--config_file', type=str, required=True)
    parser.add_argument('--valid', action='store_true')
    args = parser.parse_args()

    config_file = Path(args.config_file)
    config = load_config(config_file)

    config.setdefault('max_len', 220)
    config.setdefault('max_head_len', 128)
    config.setdefault('epochs', 2)
    config.setdefault('down_sample_frac', 0.5)
    config.setdefault('lr', 1.5e-5)
    config.setdefault('batch_size', 16)
    config.setdefault('accumulation_steps', 4)
    config.setdefault('lr_weight_decay_coef', 1.0)
    config.setdefault('warmup', 0.05)
    config.setdefault('old_data', False)
    config.setdefault('old_fine_tuned', False)
    config.setdefault('device', 'cuda')
    config.setdefault('seed', 1234)

    assert 'lm_model_name' in config
    assert not (config.old_fine_tuned and config.old_data)
    assert config.max_len >= config.max_head_len
    # Only two drop-index sets are built below, one per epoch.
    assert config.epochs <= 2

    # These two values used to be read from ``args.lm_model`` / ``args.old``,
    # attributes the parser never defines (AttributeError on every run).
    # NOTE(review): mapping them to the config keys asserted/defaulted above
    # is the evident intent — confirm against the config files in use.
    lm_model = config.lm_model_name
    use_old_data = config.old_data

    lm_model_name = config_file.stem
    if config.old_fine_tuned:
        PRETRAINED_PATH = Path(f'../output/{lm_model_name}_old_fine_tune/')
        assert PRETRAINED_PATH.exists()
    else:
        PRETRAINED_PATH = lm_model
    MODE = lm_model[:4]
    LOWER_CASE = 'uncased' in lm_model
    LARGE_MODEL = 'large' in lm_model
    DEVICE = torch.device(config.device)

    if config.old_data:
        lm_model_name += '_old_fine_tune'

    if args.valid:
        valid_size = 200000
        shuffle_seed = 1029
        lm_model_name += '_valid'
    else:
        valid_size = 0
        shuffle_seed = config.seed

    OUT_DIR = Path(f'../output/{lm_model_name}/')
    TEST_SUBMISSION = OUT_DIR / 'submission.csv'
    VALID_SUBMISSION = OUT_DIR / 'valid_submission.csv'
    OUT_DIR.mkdir(exist_ok=True)

    warnings.filterwarnings('ignore')
    seed_torch(config.seed)

    # The column sets are bound to local names in BOTH branches. Assigning
    # the module-level constant names in only one branch made them
    # function-local and raised UnboundLocalError on the default path.
    if not use_old_data:
        train_data = TRAIN_DATA
        test_data = TEST_DATA
        train_size = 1804874 - valid_size

        toxicity_column = TOXICITY_COLUMN
        identity_columns = IDENTITY_COLUMNS
        aux_toxicity_columns = AUX_TOXICITY_COLUMNS
    else:
        train_data = TRAIN_OLD
        test_data = TEST_OLD
        train_size = 159571 - valid_size

        toxicity_column = OLD_TOXICITY_COLUMN
        identity_columns = OLD_IDENTITY_COLUMNS
        aux_toxicity_columns = OLD_AUX_TOXICITY_COLUMNS

    if MODE == 'bert':
        from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam

        lm_tokenizer = BertTokenizer.from_pretrained(lm_model,
                                                     cache_dir=None,
                                                     do_lower_case=LOWER_CASE)
        model = BertForSequenceClassification.from_pretrained(
            PRETRAINED_PATH,
            cache_dir=None,
            num_labels=1 + len(aux_toxicity_columns))
        optimizer_class = BertAdam
    else:
        from pytorch_pretrained_bert import GPT2Tokenizer, OpenAIAdam, GPT2Model

        lm_tokenizer = GPT2Tokenizer.from_pretrained(lm_model,
                                                     cache_dir=None)
        model = GPT2ClassificationHeadModel.from_pretrained(
            PRETRAINED_PATH,
            clf_dropout=config.get('dropout_rate', 0.1),
            n_class=1 + len(aux_toxicity_columns))
        optimizer_class = OpenAIAdam
        assert config.lr_weight_decay_coef == 1.0

    with timer('preprocess'):
        tokenizer = MyTokenizer(lm_tokenizer, config.max_len,
                                config.max_head_len, MODE)
        # Read the dataset selected above; TRAIN_DATA/TEST_DATA were
        # previously hard-coded here, ignoring the old-data selection.
        df_train = pd.read_csv(train_data).sample(
            frac=1, random_state=shuffle_seed).reset_index(drop=True)
        df_train['comment_text'] = df_train['comment_text'].astype(str)
        df_train = df_train.fillna(0)
        X_train = tokenizer.tokenize(
            df_train['comment_text'].fillna('DUMMY_VALUE'),
            num_threads=16,
            chunksize=5000)

        df_test = pd.read_csv(test_data)
        df_test['comment_text'] = df_test['comment_text'].astype(str)
        df_test = df_test.fillna(0)
        X_test = tokenizer.tokenize(
            df_test['comment_text'].fillna('DUMMY_VALUE'),
            num_threads=16,
            chunksize=5000)

        df_train.drop(['comment_text'], axis=1, inplace=True)
        df_test.drop(['comment_text'], axis=1, inplace=True)

        X_valid = X_train[train_size:]
        X_train = X_train[:train_size]

        y_identity_train = df_train[identity_columns].values
        # NOTE(review): this column must exist in the old dataset too —
        # confirm before running with ``old_data`` enabled.
        y_annotator_counts_train = df_train['toxicity_annotator_count'].values

        weights = training_weights(df_train, toxicity_column, identity_columns)
        # Column layout: target, sample weight, then the auxiliary targets.
        y_train = np.hstack(
            (df_train[toxicity_column].values.reshape(-1, 1),
             weights.reshape(-1, 1), df_train[aux_toxicity_columns].values))

        y_valid = y_train[train_size:]
        y_train = y_train[:train_size]
        y_identity_valid = y_identity_train[train_size:]
        y_identity_train = y_identity_train[:train_size]
        y_annotator_counts_valid = y_annotator_counts_train[train_size:]
        y_annotator_counts_train = y_annotator_counts_train[:train_size]
        loss_weight = 1.0 / weights.mean() if not use_old_data else None

        # drop negative samples here
        frac = config.down_sample_frac
        # Exactly one positive entry per row of y_train.
        target_negative = (y_train > 0.0).sum(axis=1) == 1
        identity_negative = (y_identity_train > 0.0).sum(axis=1) == 0
        negative_mask = identity_negative & target_negative
        negative_indices = np.arange(len(y_train))[negative_mask]
        # Epoch 0 drops the head of the negatives, epoch 1 the tail.
        drop_indices_0 = set(
            negative_indices[:int(len(negative_indices) * frac)])
        drop_indices_1 = set(
            negative_indices[int(len(negative_indices) * (1 - frac)):])
        drop_indices_list = [drop_indices_0, drop_indices_1]

        len_train = len(y_train) - len(drop_indices_0)

    with timer('train'):
        model.zero_grad()
        model = model.to(DEVICE)
        num_layers = 24 if LARGE_MODEL else 12
        optimizer_grouped_parameters = get_optimizer_params(
            model, config.lr, config.lr_weight_decay_coef, num_layers)
        num_train_optimization_steps = int(config.epochs * len_train /
                                           config.batch_size /
                                           config.accumulation_steps)

        optimizer = optimizer_class(optimizer_grouped_parameters,
                                    lr=config.lr,
                                    warmup=config.warmup,
                                    t_total=num_train_optimization_steps)

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)
        model = model.train()

        batch_count = len_train // config.batch_size
        loss_fn = CustomLoss(loss_weight)
        for epoch, drop_indices in zip(range(config.epochs),
                                       drop_indices_list):
            sample_indices = np.array(
                [i for i in range(len(y_train)) if i not in drop_indices])
            X_sampled_train = [X_train[i] for i in sample_indices]
            y_sampled_train = y_train[sample_indices]
            y_sampled_identity_train = y_identity_train[sample_indices]
            y_sampled_annotator_counts_train = y_annotator_counts_train[
                sample_indices]
            train_dataset = TextDataset(X_sampled_train, y_sampled_train,
                                        y_sampled_identity_train,
                                        y_sampled_annotator_counts_train)
            train_loader = LengthBucketingDataLoader(
                train_dataset,
                shuffle=True,
                drop_last=True,
                batch_size=config.batch_size)
            tk0 = tqdm(enumerate(train_loader), total=batch_count)
            optimizer.zero_grad()
            for i, (x_batch, _, a_batch, y_batch, y_identity_batch) in tk0:
                y_pred = model(x_batch.to(DEVICE),
                               attention_mask=(x_batch > 0).to(DEVICE),
                               labels=None)
                loss = loss_fn(y_pred, y_batch.to(DEVICE))
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                # Gradient accumulation: step every accumulation_steps batches.
                if (i + 1) % config.accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()

        model.save_pretrained(OUT_DIR)

    with timer('evaluate'):
        if args.valid:
            valid_dataset = TextDataset(X_valid, y_valid, y_identity_valid,
                                        y_annotator_counts_valid)
            valid_preds = predict(model, valid_dataset, device=DEVICE)

            # Copy so the new column is written to an independent frame
            # rather than to a slice of df_train.
            df_valid = df_train.tail(valid_size).copy()
            df_valid['model1'] = valid_preds
            evaluator = JigsawEvaluator(df_valid[toxicity_column].values,
                                        df_valid[identity_columns].values)
            final_score, _ = evaluator.get_final_metric(
                df_valid['model1'].values)

            valid_prediction = predict(model,
                                       TextDataset(X_valid),
                                       device=DEVICE)
            valid_submission = pd.DataFrame({
                'id': df_valid['id'],
                'prediction': valid_prediction
            })
            valid_submission.to_csv(VALID_SUBMISSION, index=False)
            print(f'validation score: {final_score:.5f}')

        test_prediction = predict(model, TextDataset(X_test), device=DEVICE)
        submission = pd.DataFrame({
            'id': df_test['id'],
            'prediction': test_prediction
        })
        submission.to_csv(TEST_SUBMISSION, index=False)
示例#27
0
def main():
    """Train and evaluate the encoder-decoder model selected by a config file.

    The ``--config_file`` command-line argument points at the configuration.
    The function loads the training and question tables, keeps the rows whose
    ``content_type_id`` is 0, splits them with ``virtual_time_split``, trains
    either the ``'akt'`` or the ``'saint'`` model, prints the ROC AUC on the
    epoch-validation split after every epoch and on the validation split at
    the end, and finally stores the model and optimizer state dicts under
    ``./output/<config file stem>/``.

    Raises:
        ValueError: If ``config.model`` is neither ``'akt'`` nor ``'saint'``.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('--config_file', type=str, required=True)
    cli_args = arg_parser.parse_args()

    # settings
    config_path = Path(cli_args.config_file)
    config = Config.load(config_path)

    warnings.filterwarnings('ignore')
    set_seed(config.seed)
    start_time = time.time()

    with timer('load data'):
        data_dir = './input/riiid-test-answer-prediction/'
        # Column name -> dtype; the key order doubles as the list of columns
        # to read.
        column_dtypes = {
            'row_id': 'int64',
            'timestamp': 'int64',
            'user_id': 'int32',
            'content_id': 'int16',
            'content_type_id': 'int8',
            'answered_correctly': 'int8',
            'prior_question_elapsed_time': 'float32',
        }

        train_df = pd.read_csv(data_dir + 'train.csv',
                               usecols=list(column_dtypes),
                               dtype=column_dtypes)
        question_df = pd.read_csv(data_dir + 'questions.csv',
                                  usecols=['question_id', 'part'])

    # Only rows with content_type_id == 0 are used from here on.
    is_type_zero = train_df['content_type_id'] == 0
    train_df = train_df[is_type_zero].reset_index(drop=True)

    # Shift the ids: 0 is the padding id, 1 is the start id.
    question_df['part'] += 1
    question_df['question_id'] += 2
    train_df['content_id'] += 2
    train_df = train_df.merge(question_df,
                              how='left',
                              left_on='content_id',
                              right_on='question_id')

    with timer('validation split'):
        train_idx, valid_idx, epoch_valid_idx = virtual_time_split(
            train_df,
            valid_size=config.valid_size,
            epoch_valid_size=config.epoch_valid_size)
        answers = train_df['answered_correctly']
        valid_y = answers.iloc[valid_idx].values
        epoch_valid_y = answers.iloc[epoch_valid_idx].values

    print('-' * 20)
    print(f'train size: {len(train_idx)}')
    print(f'valid size: {len(valid_idx)}')

    with timer('prepare data loader'):
        train_rows = train_df.iloc[train_idx]
        train_user_seqs = get_user_sequences(train_rows)
        valid_rows = train_df.iloc[valid_idx]
        valid_user_seqs = get_user_sequences(valid_rows)

        train_dataset = TrainDataset(train_user_seqs,
                                     window_size=config.window_size,
                                     stride_size=config.stride_size)
        valid_dataset = ValidDataset(train_df,
                                     train_user_seqs,
                                     valid_user_seqs,
                                     valid_idx,
                                     window_size=config.window_size)

        train_loader = DataLoader(train_dataset, **config.train_loader_params)
        valid_loader = DataLoader(valid_dataset, **config.valid_loader_params)

        # valid loader for epoch validation
        epoch_valid_rows = train_df.iloc[epoch_valid_idx]
        epoch_valid_user_seqs = get_user_sequences(epoch_valid_rows)
        epoch_valid_dataset = ValidDataset(train_df,
                                           train_user_seqs,
                                           epoch_valid_user_seqs,
                                           epoch_valid_idx,
                                           window_size=config.window_size)
        epoch_valid_loader = DataLoader(epoch_valid_dataset,
                                        **config.valid_loader_params)

    def build_bert_config(params, max_positions):
        """Create a BertConfig from ``params`` with the given position limit."""
        bert_config = BertConfig(**params)
        bert_config.max_position_embeddings = max_positions
        return bert_config

    with timer('train'):
        window = config.window_size
        if config.model == 'akt':
            # The content encoder and the decoder get one position more than
            # the knowledge encoder.
            model = AktEncoderDecoderModel(
                build_bert_config(config.content_encoder_config, window + 1),
                build_bert_config(config.knowledge_encoder_config, window),
                build_bert_config(config.decoder_config, window + 1))
        elif config.model == 'saint':
            model = SaintEncoderDecoderModel(
                build_bert_config(config.encoder_config, window),
                build_bert_config(config.decoder_config, window))
        else:
            raise ValueError(f'Unknown model: {config.model}')

        model.to(config.device)
        model.zero_grad()

        optimizer = optim.Adam(model.parameters(), **config.optimizer_params)
        scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps)
        loss_ema = None

        for epoch in range(config.n_epochs):
            epoch_start_time = time.time()
            model.train()

            progress = tqdm(train_loader,
                            desc=f'epoch {epoch + 1}',
                            leave=False)
            for step, (x_batch, w_batch, y_batch) in enumerate(progress,
                                                               start=1):
                y_pred = model(**x_batch.to(config.device).to_dict())
                criterion = nn.BCEWithLogitsLoss(
                    weight=w_batch.to(config.device))
                loss = criterion(y_pred, y_batch.to(config.device))
                loss.backward()

                # Step on every batch when no accumulation is configured,
                # otherwise once per `accumulation` batches.
                accumulation = config.gradient_accumulation_steps
                if accumulation is None or step % accumulation == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()

                # Exponential moving average of the loss for the progress bar.
                batch_loss = loss.item()
                if loss_ema is None:
                    loss_ema = batch_loss
                else:
                    loss_ema = loss_ema * 0.9 + batch_loss * 0.1
                progress.set_postfix(loss=loss_ema)

            valid_preds = predict(model,
                                  epoch_valid_loader,
                                  device=config.device)
            valid_score = roc_auc_score(epoch_valid_y, valid_preds)

            elapsed_time = time.time() - epoch_start_time
            print(
                f'Epoch {epoch + 1}/{config.n_epochs} \t valid score: {valid_score:.5f} \t time: {elapsed_time / 60:.1f} min'
            )

    with timer('predict'):
        valid_preds = predict(model, valid_loader, device=config.device)
        valid_score = roc_auc_score(valid_y, valid_preds)

    print(f'valid score: {valid_score:.5f}')

    output_dir = Path(f'./output/{config_path.stem}/')
    output_dir.mkdir(parents=True, exist_ok=True)

    torch.save(model.state_dict(), output_dir / 'model.pt')
    torch.save(optimizer.state_dict(), output_dir / 'optimizer.pt')

    elapsed_time = time.time() - start_time
    print(f'all processes done in {elapsed_time / 60:.1f} min.')
示例#28
0
    def create_features(
        self,
        train_df: cudf.DataFrame,
        test_df: cudf.DataFrame,
    ):
        """Build the basic feature tables and store them on the instance.

        Train and test rows are stacked into one frame, ``Rating`` is mapped
        to integer codes, the other categorical columns are label-encoded,
        ``"tbd"`` user scores become NaN, release year 2020 is replaced by
        2017 and ``log_User_Count`` is added as ``log1p(User_Count)``.

        Args:
            train_df: Training rows.
            test_df: Test rows.

        The result is written to ``self.train`` (basic plus target columns)
        and ``self.test`` (basic columns only), each with a reset index.
        """
        with timer("load data"):
            train_part = train_df.copy()
            n_train = len(train_part)
            test_part = test_df.copy()

        with timer("concat train and test"):
            total = cudf.concat([train_part, test_part], ignore_index=True)

        with timer("label encoding"):
            with timer("rating"):
                # "K-A"/"E" share code 2 and "M"/"AO" share code 5.
                rating_codes = {
                    "RP": 0,
                    "EC": 1,
                    "K-A": 2,
                    "E": 2,
                    "E10+": 3,
                    "T": 4,
                    "M": 5,
                    "AO": 5,
                }
                mapped_rating = total["Rating"].replace(rating_codes)
                total["Rating"] = mapped_rating.astype(int)

            with timer("other cat cols"):
                for col in ("Name", "Platform", "Genre", "Publisher",
                            "Developer"):
                    # A fresh encoder per column, fitted on train + test.
                    encoder = LabelEncoder(handle_unknown="ignore")
                    encoder.fit(total[col])
                    codes = encoder.transform(total[col])
                    total[col] = codes.astype("category")

        with timer("User_Score"):
            cleaned_score = total["User_Score"].replace(to_replace="tbd",
                                                        value=np.nan)
            total["User_Score"] = cleaned_score.astype(float)

        with timer("Year_of_Release"):
            release_year = total["Year_of_Release"]
            total["Year_of_Release"] = release_year.replace(to_replace=2020.0,
                                                            value=2017.0)

        with timer("log_User_Count"):
            # log1p is applied to the pandas copy of the column.
            user_count = total["User_Count"].to_pandas()
            total["log_User_Count"] = np.log1p(user_count)

        with timer("end"):
            basic_cols = [
                "Name",
                "Platform",
                "Year_of_Release",
                "Genre",
                "Publisher",
                "Critic_Score",
                "Critic_Count",
                "User_Score",
                "User_Count",
                "log_User_Count",
                "Developer",
                "Rating",
            ]
            target_cols = [
                "NA_Sales",
                "EU_Sales",
                "JP_Sales",
                "Other_Sales",
                "Global_Sales",
            ]
            # The first n_train rows of the stacked frame are the train rows.
            train_rows = total[basic_cols + target_cols].iloc[:n_train]
            self.train = train_rows.reset_index(drop=True)
            test_rows = total[basic_cols].iloc[n_train:]
            self.test = test_rows.reset_index(drop=True)
示例#29
0
def main():
    """Train the LSTM/GRU model with stratified K-fold CV and write predictions.

    With ``--valid`` the shuffled training file is split so that its last
    200,000 rows act as the test set and the output file is
    ``valid_submission.csv``; otherwise the real test file is scored and the
    output file is ``submission.csv``.

    For every fold, half of the training rows whose toxicity, auxiliary and
    identity columns are all 0 are dropped at random, a fresh model is
    trained, its test predictions are added to a running sum, and the model
    weights, training records and the averaged submission so far are written
    to ``../output/lstm_f/``.
    """
    import json

    parser = ArgumentParser()
    parser.add_argument('--valid', action='store_true')
    args = parser.parse_args()

    config = load_config('./config/lstm_f.json')
    config.setdefault('max_len', 220)
    config.setdefault('max_features', 100000)
    config.setdefault('batch_size', 512)
    config.setdefault('train_epochs', 10)
    config.setdefault('tolerance', 10)
    config.setdefault('num_folds', 5)
    config.setdefault('lr', 1e-3)
    config.setdefault('loss_alpha', 0.1)
    config.setdefault('loss_beta', 1.0)
    config.setdefault('device', 'cuda')
    config.setdefault('seed', 1029)

    device = torch.device(config.device)

    out_dir = Path('../output/lstm_f/')
    submission_file_name = 'valid_submission.csv' if args.valid else 'submission.csv'
    submission_path = out_dir / submission_file_name
    # parents=True: a checkout without ../output must not crash here.
    out_dir.mkdir(parents=True, exist_ok=True)

    warnings.filterwarnings('ignore')
    seed_torch(config.seed)

    with timer('preprocess'):
        train = pd.read_csv(TRAIN_DATA)
        if args.valid:
            train = train.sample(frac=1,
                                 random_state=1029).reset_index(drop=True)
            # Copy both halves so the column assignments below operate on
            # independent frames rather than on slices of the shuffled one.
            test = train.tail(200000).copy()
            train = train.head(len(train) - 200000).copy()
        else:
            test = pd.read_csv(TEST_DATA)

        train['comment_text'] = train['comment_text'].apply(preprocess)
        test['comment_text'] = test['comment_text'].apply(preprocess)

        # Replace blank comments with NaN. Plain assignment is used because
        # an inplace replace on a selected column does not reach the frame
        # when pandas copy-on-write is active.
        train['comment_text'] = train['comment_text'].replace('', np.nan)
        test['comment_text'] = test['comment_text'].replace('', np.nan)

        # fill up the missing values
        X_train = train['comment_text'].fillna('_##_').values
        X_test = test['comment_text'].fillna('_##_').values

        # get the target values
        weights = training_weights(train, TOXICITY_COLUMN, IDENTITY_COLUMNS)
        loss_weight = 1.0 / weights.mean()
        y_train_identity = train[IDENTITY_COLUMNS].values
        y_train_annotator_counts = train['toxicity_annotator_count'].values
        # Column 0: toxicity target, column 1: sample weight, rest: auxiliary
        # targets.
        y_train = np.hstack(
            (train[TOXICITY_COLUMN].values.reshape(-1, 1),
             weights.reshape(-1, 1), train[AUX_TOXICITY_COLUMNS].values))

        vocab = build_vocab(chain(X_train), config.max_features)
        fasttext_embedding_matrix = load_embedding(EMBEDDING_FASTTEXT,
                                                   vocab['token2id'])
        glove_embedding_matrix = load_embedding(EMBEDDING_GLOVE,
                                                vocab['token2id'])

        joblib.dump(vocab, out_dir / 'vocab.pkl')
        np.save(out_dir / 'fasttext_embedding_matrix',
                fasttext_embedding_matrix)
        np.save(out_dir / 'glove_embedding_matrix', glove_embedding_matrix)

        X_train = np.array(tokenize(X_train, vocab, config.max_len))
        X_test = np.array(tokenize(X_test, vocab, config.max_len))

        # Positions of rows where every toxicity, auxiliary and identity
        # column equals 0.
        all_related_columns = ([TOXICITY_COLUMN] + AUX_TOXICITY_COLUMNS +
                               IDENTITY_COLUMNS)
        zero_counts = (train[all_related_columns] == 0.0).sum(axis=1)
        is_all_zero = zero_counts == len(all_related_columns)
        negative_indices = np.arange(0, len(train))[is_all_zero]

    with timer('train'):
        skf = StratifiedKFold(n_splits=config.num_folds,
                              shuffle=True,
                              random_state=1)
        # The first two columns of y_train are the target and the weight.
        num_aux_targets = y_train.shape[-1] - 2
        custom_loss = CustomLoss(
            loss_weight,
            alpha=config.loss_alpha,
            beta=config.loss_beta,
            use_annotator_counts=True,
            weight_from_annotator_counts=lambda x: torch.log(x + 2))
        test_dataset = TextDataset(token_lists=X_test)
        test_prediction = np.zeros(len(test_dataset))
        test_prediction_count = 0
        for fold, (train_idx, valid_idx) in enumerate(
                skf.split(X_train, y_train[:, 0] >= 0.5)):
            seed_torch(fold)
            # Drop a random half of the all-zero rows from this fold's
            # training part; the shuffle is in place, so the order carries
            # over from fold to fold.
            np.random.shuffle(negative_indices)
            drop_indices = set(negative_indices[:len(negative_indices) // 2])
            train_idx = [idx for idx in train_idx if idx not in drop_indices]
            train_token_lists = [X_train[idx] for idx in train_idx]
            valid_token_lists = [X_train[idx] for idx in valid_idx]
            train_dataset = TextDataset(
                token_lists=train_token_lists,
                targets=y_train[train_idx],
                identities=y_train_identity[train_idx],
                annotator_counts=y_train_annotator_counts[train_idx])
            valid_dataset = TextDataset(
                token_lists=valid_token_lists,
                targets=y_train[valid_idx],
                identities=y_train_identity[valid_idx],
                annotator_counts=y_train_annotator_counts[valid_idx])
            model = LstmGruNet(
                embedding_matrices=[
                    glove_embedding_matrix, fasttext_embedding_matrix
                ],
                num_aux_targets=num_aux_targets).to(device)
            model, records = train(model,
                                   custom_loss,
                                   train_dataset,
                                   valid_dataset,
                                   device=device,
                                   batch_size=config.batch_size,
                                   num_epochs=config.train_epochs,
                                   tolerance=config.tolerance,
                                   lr=config.lr)
            test_prediction += predict(model, test_dataset, device)
            test_prediction_count += 1
            # NOTE: this file holds a torch checkpoint despite its ".json"
            # suffix; the name is kept so existing loaders still find it.
            torch.save(model.state_dict(), out_dir / f'model.{fold}.json')

            with open(out_dir / f'records.{fold}.json', 'w',
                      encoding='utf-8') as f:
                json.dump(records, f, indent=4)

            # Rewritten after every fold with the average over the folds
            # finished so far.
            submission = pd.DataFrame({
                'id': test['id'],
                'prediction': test_prediction / test_prediction_count,
            })
            submission.to_csv(submission_path, index=False)
            display_tables(out_dir)
示例#30
0
# %%

# Attach the public log to the dataset before building the blocks that
# receive `dataset` (presumably they read it -- confirm in the block classes).
dataset.public_log = public_log

feat_train, feat_test = pd.DataFrame(), pd.DataFrame()

feature_blocks = [
    *[CountEncodingBlock(column=c) for c in ['hour']],
    DateBlock(),
    PublicLogBlock(dataset),
    MetaInformationBlock(),
    UserHistoryBlock(dataset),
]

# Fit every block on the training metadata and append its output column-wise.
for block in feature_blocks:
    with timer(prefix='fit {} '.format(block)):
        out_i = block.fit(train_meta)
    # Each block must return exactly one row per input row.
    assert len(train_meta) == len(out_i), block
    feat_train = pd.concat([feat_train, out_i], axis=1)

# Apply the same blocks to the test metadata.
for block in feature_blocks:
    # Timed under "transform" so test-time cost is distinguishable from the
    # fit timings in the log output.
    with timer(prefix='transform {} '.format(block)):
        out_i = block.transform(test_meta)

    assert len(test_meta) == len(out_i), block
    feat_test = pd.concat([feat_test, out_i], axis=1)
# %%

print(feat_train.columns)
feat_train.head(30)