def preprocess_dfs(use_features, is_local=False, logger=None, debug=True):
    """Read the raw dataset files and build the train/test feature frames.

    Args:
        use_features: feature specification forwarded to ``add_features``.
        is_local: forwarded to ``add_features`` (local vs. remote environment).
        logger: unused here — ``add_features`` is explicitly called with
            ``logger=None``.
        debug: when True, CSV reads are capped at 200k rows. NOTE(review):
            the cap does not apply to the pickled train data read below.

    Returns:
        Tuple ``(train_df, test_df)`` of feature DataFrames, both with a
        fresh RangeIndex.

    NOTE(review): relies on module-level ``base_path``, ``timer``,
    ``memory_reducer`` and ``add_features`` being in scope — confirm.
    """
    # read dataframes
    with timer("read datasets"):
        if debug:
            # row cap for the pd.read_csv calls below only
            nrows = 200000
        else:
            nrows = None

        sub = pd.read_csv(base_path + '/sample_submission.csv')

        # if is_local:
        #     org_train = pickle_load("../input/train.pkl")
        #     org_test = pickle_load("../input/test.pkl")
        # else:
        #     org_train = pd.read_csv(base_path + "/train.csv", nrows=nrows)
        #     org_test = pd.read_csv(base_path + "/test.csv", nrows=nrows)
        org_train = pd.read_pickle(f'{base_path}/train.pkl.gz')
        # org_test = pd.read_csv(f'{base_path}/test.csv')
        # NOTE(review): test is aliased to train here — looks like a
        # deliberate debug/offline shortcut (see the commented-out reads
        # above). The isin() filter below produces a copy, so the inplace
        # ops on org_test do not mutate org_train — but if memory_reducer
        # returns a new object, org_test still refers to the un-reduced
        # frame at that point. Confirm this is intentional.
        org_test = org_train

        org_train = memory_reducer(org_train, verbose=True)
        # keep only installations present in the sample submission
        org_test = org_test[org_test.installation_id.isin(sub.installation_id)]
        org_test.sort_values(['installation_id', 'timestamp'], inplace=True)
        # NOTE(review): drop= is not set, so the old index is kept as an
        # 'index' column — confirm downstream code expects that column.
        org_test.reset_index(inplace=True)
        org_test = memory_reducer(org_test, verbose=True)

        train_labels = pd.read_csv(base_path + "/train_labels.csv",
                                   nrows=nrows)
        specs = pd.read_csv(base_path + "/specs.csv", nrows=nrows)

    # basic preprocess: parse timestamps once for both frames
    org_train["timestamp"] = pd.to_datetime(org_train["timestamp"])
    org_test["timestamp"] = pd.to_datetime(org_test["timestamp"])

    with timer("merging features"):
        train_df = add_features(use_features,
                                org_train,
                                org_test,
                                train_labels,
                                specs,
                                datatype="train",
                                is_local=is_local,
                                logger=None)
        train_df = train_df.reset_index(drop=True)
        test_df = add_features(use_features,
                               org_train,
                               org_test,
                               train_labels,
                               specs,
                               datatype="test",
                               is_local=is_local,
                               logger=None)
        test_df = test_df.reset_index(drop=True)


#     df = pd.concat([df, feat_df], axis=1)
    print("preprocess done!!")

    return train_df, test_df
# Example #2 (0)
    def do_adversarial_valid_kfold(self, model_conf, n_splits=2):
        """Run stratified k-fold adversarial validation on target ``is_test``.

        Trains one model per fold, logs per-fold validation scores, and
        accumulates out-of-fold predictions and feature importances.

        Args:
            model_conf: model configuration dict; must contain "train_cols".
            n_splits: number of CV folds.

        Returns:
            Tuple ``(clf_list, oof, prediction, feature_importance)``.
            ``prediction`` is an all-zeros placeholder — it is never filled
            here, but is returned to keep the interface consistent with the
            other ``do_*_kfold`` methods.
        """
        sp = Splitter()
        target = "is_test"
        split_x = self.train["installation_id"]
        split_y = self.train[target]
        seed = 773
        sp.get_kfold_idx(split_x,
                         split_y,
                         seed,
                         n_cv=n_splits,
                         stratified=True,
                         pref="adv")

        oof: ndarray = np.zeros(self.train.shape[0])
        # placeholder kept for interface compatibility; never populated below
        prediction = np.zeros(self.test.shape[0])

        clf_list = []

        self.logger.log(logging.DEBUG, "[train cols] " + "-" * 50)
        self.logger.log(logging.DEBUG, model_conf["train_cols"])
        self.validation_scores = []

        for i, (trn_idx, val_idx) in enumerate(sp.idx_list):
            self.logger.log(logging.DEBUG, "-" * 60)
            self.logger.log(logging.DEBUG, f"start training: {i}")

            with timer(f"fold {i}", self.logger):
                train_df, valid_df = self.train.loc[trn_idx], self.train.loc[
                    val_idx]
                model = self.generate_model(model_conf)
                clf, fold_oof, feature_importance_df = model.train(
                    train_df, valid_df, self.logger)

                # calc validation score using clf.best_iteration_
                fold_val_score = get_val_score(valid_df[target], fold_oof)
                self.validation_scores.append(fold_val_score)
                self.logger.log(logging.DEBUG,
                                f"fold_val_score: {fold_val_score:,.5f}")

                clf_list.append(clf)
                oof[val_idx] = fold_oof

                feature_importance_df["fold"] = i
                self.feature_importance.append(feature_importance_df)

        self.logger.log(
            logging.DEBUG,
            f"Total Validation Score: {sum(self.validation_scores) / len(self.validation_scores):,.5f}"
        )

        # NOTE(review): expm1 on an adversarial (binary is_test) prediction
        # looks like a leftover from a log1p-target regression variant —
        # confirm this inverse transform is intended here.
        oof = np.expm1(oof)
        self.train["pred_y"] = oof
        self.feature_importance = pd.concat(self.feature_importance, axis=0)

        return clf_list, oof, prediction, self.feature_importance
# Example #3 (0)
 def feature_extract(self, org_train, org_test):
     """Compute this feature if it is flagged for execution.

     Returns the result of ``calc_feature`` when ``check_feature_exec``
     says the feature should run; otherwise returns None.
     """
     if not self.check_feature_exec():
         return None
     with timer(f"FE: {self.name}", self.logger):
         result = self.calc_feature(org_train, org_test)
     return result
# Example #4 (0)
    def do_valid_kfold(self, model_conf, n_splits=5):
        """Run grouped k-fold CV (grouped by installation_id), with per-fold
        QWK threshold optimization.

        For each fold: train a model, predict the test set, fit an
        ``OptimizedRounder`` on the fold's out-of-fold predictions, and record
        the resulting QWK. Test predictions are averaged across folds.

        Args:
            model_conf: model configuration dict; must contain "target" and
                "train_cols".
            n_splits: number of CV folds.

        Returns:
            Tuple ``(clf_list, oof, prediction, feature_importance,
            optimizers, valid_qwks)``.
        """
        sp = Splitter()
        target = model_conf["target"]
        split_x = self.train["installation_id"]
        split_y = self.train[target]
        seed = 773
        sp.get_kfold_idx(split_x,
                         split_y,
                         seed,
                         n_cv=n_splits,
                         stratified=False,
                         group=True,
                         pref=self.exp_conf["exp_name"])

        oof: ndarray = np.zeros((self.train.shape[0]))
        prediction = np.zeros((self.test.shape[0]))

        clf_list = []

        self.logger.log(logging.DEBUG, "[train cols] " + "-" * 50)
        self.logger.log(logging.DEBUG, model_conf["train_cols"])
        self.validation_scores = []

        optimizers = []
        valid_qwks = []

        for i, (trn_idx, val_idx) in enumerate(sp.idx_list):
            self.logger.log(logging.DEBUG, "-" * 60)
            self.logger.log(logging.DEBUG, f"start training: {i}")

            with timer(f"fold {i}", self.logger):
                train_df, valid_df = self.train.loc[trn_idx], self.train.loc[
                    val_idx]
                model = self.generate_model(model_conf)
                clf, fold_oof, feature_importance_df = model.train(
                    train_df, valid_df, self.logger)

                fold_prediction = model.predict(self.test, self.logger)

                # fit rounding thresholds on this fold's raw predictions,
                # then score the thresholded predictions with QWK
                optR = OptimizedRounder()
                optR.fit(fold_oof, valid_df[target])
                coefficients = optR.coefficients()
                opt_preds = optR.predict(fold_oof, coefficients)
                fold_qwk = qwk(valid_df[target], opt_preds)
                optimizers.append(optR)
                valid_qwks.append(fold_qwk)

                clf_list.append(clf)
                oof[val_idx] = fold_oof

                # average raw test predictions across folds
                prediction += fold_prediction / n_splits

                feature_importance_df["fold"] = i
                self.feature_importance.append(feature_importance_df)

        self.feature_importance = pd.concat(self.feature_importance, axis=0)

        return clf_list, oof, prediction, self.feature_importance, optimizers, valid_qwks
# Example #5 (0)
    def do_valid_kfold(self,
                       model_conf,
                       n_splits=5,
                       trn_mode='simple',
                       val_mode='simple'):
        """Run grouped k-fold CV with optional extra training data and
        optional "last truncated" index filtering.

        Args:
            model_conf: model configuration dict; must contain "target" and
                "train_cols".
            n_splits: number of CV folds.
            trn_mode: 'simple' keeps fold train indices as-is;
                'last_truncated' filters them via ``get_last_trancated_idx``.
            val_mode: same options, applied to the validation indices.

        Returns:
            Tuple ``(clf_list, oof, prediction, feature_importance, labels)``
            where ``labels`` holds each row's 'accuracy_group' aligned with
            ``oof``.
        """
        sp = Splitter()
        target = model_conf["target"]
        split_x = self.train["installation_id"]
        split_y = self.train[target]
        seed = 773
        sp.get_kfold_idx(split_x,
                         split_y,
                         seed,
                         n_cv=n_splits,
                         stratified=False,
                         group=True,
                         pref=self.exp_conf["exp_name"])

        oof: ndarray = np.zeros((self.train.shape[0]))
        labels = np.zeros((self.train.shape[0]))
        prediction = np.zeros((self.test.shape[0]))

        clf_list = []

        self.logger.log(logging.DEBUG, "[train cols] " + "-" * 50)
        self.logger.log(logging.DEBUG, model_conf["train_cols"])
        self.validation_scores = []

        # Build the (optionally extended) training frame once — it does not
        # change between folds, so hoist the copy/concat out of the loop.
        # BUG FIX: the original tested `if self.another_train:`, which raises
        # ValueError when the attribute holds a DataFrame (DataFrame truth
        # value is ambiguous); test against None explicitly instead.
        use_another_train = self.another_train is not None
        _train = self.train.copy()
        if use_another_train:
            _train = pd.concat([_train, self.another_train])

        for i, (trn_idx, val_idx) in enumerate(sp.idx_list):
            self.logger.log(logging.DEBUG, "-" * 60)
            self.logger.log(logging.DEBUG, f"start training: {i}")

            with timer(f"fold {i}", self.logger):
                if use_another_train:
                    # the extra rows are always part of the fold's train set
                    trn_idx = np.concatenate([trn_idx, self.another_train_idx])
                if trn_mode == 'last_truncated':
                    trn_idx = self.get_last_trancated_idx(_train, trn_idx)
                if val_mode == 'last_truncated':
                    val_idx = self.get_last_trancated_idx(_train, val_idx)

                train_df, valid_df = _train.loc[trn_idx], _train.loc[val_idx]

                model = self.generate_model(model_conf)
                clf, fold_oof, feature_importance_df = model.train(
                    train_df, valid_df, self.logger)

                fold_prediction = model.predict(self.test, self.logger)

                clf_list.append(clf)
                oof[val_idx] = fold_oof
                labels[val_idx] = valid_df['accuracy_group'].values

                # average raw test predictions across folds
                prediction += fold_prediction / n_splits

                feature_importance_df["fold"] = i
                self.feature_importance.append(feature_importance_df)

        self.feature_importance = pd.concat(self.feature_importance, axis=0)

        return clf_list, oof, prediction, self.feature_importance, labels