    X_test = test.groupby(
        'utrip_id')[categorical_cols].last().reset_index()
    X_test['city_id'] = test_trips['city_id']
    X_test = X_test.reset_index(drop=True)

    test_dataset = BookingDataset(X_test, is_train=False)
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1)

    del train_test, test, test_trips
    gc.collect()

    model_paths = [
        '../input/booking-bi-lstm-ep1/logdir_nn000',
    ]
    for mp in model_paths:
        for fold_id in (0, ):
            runner = CustomRunner(device=device)
            model = BookingNN(len(target_le.classes_))
            pred = []
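            # np.argsort sorts ascending, so the last four indices are the
            # top-4 scoring cities; inverse_transform maps them back to the
            # original city ids.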
            for prediction in tqdm(
                    runner.predict_loader(
                        loader=test_loader,
                        resume=f'{mp}/fold{fold_id}/checkpoints/best.pth',
                        model=model,
                    )):
                pred.append(
                    target_le.inverse_transform(
                        np.argsort(prediction.cpu().numpy()[-1, :])[-4:]))
            pred = np.array(pred)
            np.save(
                f"y_pred{mp.replace('/', '_').replace('.', '')}_fold{fold_id}",
                pred)
Example #2
            train_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=256,
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=MyCollator(is_train=True),
                shuffle=True,
            )
            valid_loader = torch.utils.data.DataLoader(
                valid_dataset,
                batch_size=1,
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=MyCollator(is_train=True),
                shuffle=False,
            )

            runner = CustomRunner(device=device)
            model = BookingNN(
                n_city_id=len(target_le.classes_),
                n_booker_country=len(cat_le["booker_country"].classes_),
                n_device_class=len(cat_le["device_class"].classes_),
                n_affiliate_id=len(cat_le["affiliate_id"].classes_),
                n_month_checkin=len(cat_le["month_checkin"].classes_),
                n_hotel_country=len(cat_le["past_hotel_country"].classes_))
            criterion = torch.nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=30, eta_min=1e-6)
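            # Cosine annealing decays the learning rate from 1e-4 toward
            # eta_min=1e-6 over a 30-epoch cycle.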
            logdir = f'logdir_{run_name}/fold{fold_id}'

            loaders = {'train': train_loader}
            runner.train(
                model=model,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                loaders=loaders,
                logdir=logdir,
                num_epochs=num_epochs,  # num_epochs is defined outside this excerpt
                verbose=True,
            )
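MyCollator appears throughout these snippets but is never defined. A minimal sketch of such a collator, assuming each dataset item is a (feature-sequence, target) pair; the real implementation presumably pads each feature column of a trip separately:

import torch
from torch.nn.utils.rnn import pad_sequence


class MyCollator:
    """Pad variable-length trip sequences to the longest one in the batch."""

    def __init__(self, is_train: bool = True):
        self.is_train = is_train

    def __call__(self, batch):
        if self.is_train:
            seqs, targets = zip(*batch)
            # batch_first=True -> tensors of shape (batch, max_len, ...).
            # Sorting trips by length beforehand keeps padding overhead small.
            return pad_sequence(list(seqs), batch_first=True), torch.tensor(targets)
        return pad_sequence(list(batch), batch_first=True)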
Example #3
            train_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=256,
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=MyCollator(is_train=True),
                shuffle=True,
            )
            valid_loader = torch.utils.data.DataLoader(
                valid_dataset,
                batch_size=1,
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=MyCollator(is_train=True),
                shuffle=False,
            )

            runner = CustomRunner(device=device)
            model = BookingNN(
                n_city_id=len(target_le.classes_),
                n_booker_country=len(cat_le["booker_country"].classes_),
                n_device_class=len(cat_le["device_class"].classes_),
                n_affiliate_id=len(cat_le["affiliate_id"].classes_),
            )
            criterion = torch.nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30, eta_min=1e-6)
            logdir = f'logdir_{run_name}/fold{fold_id}'

            loaders = {'train': train_loader}
            runner.train(
                model=model,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                loaders=loaders,
                logdir=logdir,
                num_epochs=num_epochs,  # num_epochs is defined outside this excerpt
                verbose=True,
            )

Example #4
def run(config: dict, holdout: bool, debug: bool) -> None:
    log("Run with configuration:")
    log(f"{config}")
    seed_everything(config["seed"])

    with span("Load train and test set:"):
        train_test_set = load_train_test_set(config)
        log(f"{train_test_set.shape}")
        emb_df = pd.read_csv("./data/interim/emb_df.csv")
        n_emb = emb_df.shape[1] - 1
        emb_cols = [str(i) for i in range(n_emb)]
        emb_df.rename(columns={"city_id": "past_city_id"}, inplace=True)

    with span("Preprocessing:"):
        with span("Shift target values for input sequence."):
            unk_city_id = 0
            train_test_set["past_city_id"] = (
                train_test_set.groupby("utrip_id")["city_id"].shift(1).fillna(
                    unk_city_id).astype(int))
            unk_hotel_country = "UNK"
            train_test_set["past_hotel_country"] = (
                train_test_set.groupby("utrip_id")["hotel_country"].shift(
                    1).fillna(unk_hotel_country).astype(str))
            train_test_set = pd.merge(train_test_set,
                                      emb_df,
                                      on="past_city_id",
                                      how="left")
            train_test_set[emb_cols] = train_test_set[emb_cols].fillna(0)
            train_test_set["city_embedding"] = train_test_set[emb_cols].apply(
                lambda x: list(x), axis=1)

        with span("Encode of target values."):
            target_le = preprocessing.LabelEncoder()
            train_test_set["city_id"] = target_le.fit_transform(
                train_test_set["city_id"])
            train_test_set["past_city_id"] = target_le.transform(
                train_test_set["past_city_id"])

        with span("Add features."):
            log("Convert data type of checkin and checkout.")
            train_test_set["checkin"] = pd.to_datetime(
                train_test_set["checkin"])
            train_test_set["checkout"] = pd.to_datetime(
                train_test_set["checkout"])

            log("Create month_checkin feature.")
            train_test_set["month_checkin"] = train_test_set[
                "checkin"].dt.month
            train_test_set["year_checkin"] = train_test_set["checkin"].dt.year

            log("Create days_stay feature.")
            train_test_set["days_stay"] = (
                train_test_set["checkout"] -
                train_test_set["checkin"]).dt.days.apply(lambda x: np.log10(x))

            log("Create num_checkin feature.")
            train_test_set["num_checkin"] = (train_test_set.groupby(
                "utrip_id")["checkin"].rank().apply(lambda x: np.log10(x)))

            log("Create days_move feature.")
            train_test_set["past_checkout"] = train_test_set.groupby(
                "utrip_id")["checkout"].shift(1)
            train_test_set["days_move"] = (
                (train_test_set["checkin"] - train_test_set["past_checkout"]
                 ).dt.days.fillna(0).apply(lambda x: np.log1p(x)))

            log("Create aggregation features.")
            num_visit_drop_duplicates = train_test_set.query("city_id != 0")[[
                "user_id", "city_id"
            ]].drop_duplicates().groupby("city_id").size().apply(
                lambda x: np.log1p(x)).reset_index()
            num_visit_drop_duplicates.columns = [
                "past_city_id", "num_visit_drop_duplicates"
            ]
            num_visit = train_test_set.query("city_id != 0")[[
                "user_id", "city_id"
            ]].groupby("city_id").size().apply(
                lambda x: np.log1p(x)).reset_index()
            num_visit.columns = ["past_city_id", "num_visit"]
            num_visit_same_city = train_test_set[
                train_test_set['city_id'] == train_test_set['city_id'].shift(
                    1)].groupby("city_id").size().apply(
                        lambda x: np.log1p(x)).reset_index()
            num_visit_same_city.columns = [
                "past_city_id", "num_visit_same_city"
            ]
            train_test_set = pd.merge(train_test_set,
                                      num_visit_drop_duplicates,
                                      on="past_city_id",
                                      how="left")
            train_test_set = pd.merge(train_test_set,
                                      num_visit,
                                      on="past_city_id",
                                      how="left")
            train_test_set = pd.merge(train_test_set,
                                      num_visit_same_city,
                                      on="past_city_id",
                                      how="left")
            train_test_set["num_visit_drop_duplicates"].fillna(0, inplace=True)
            train_test_set["num_visit"].fillna(0, inplace=True)
            train_test_set["num_visit_same_city"].fillna(0, inplace=True)
            train_test_set["num_stay_consecutively"] = train_test_set.groupby(
                ["utrip_id", "past_city_id"])["past_city_id"].rank(
                    method="first").fillna(1).apply(lambda x: np.log1p(x))

        with span("Encode of categorical values."):
            cat_le = {}
            for c in CATEGORICAL_COLS:
                le = preprocessing.LabelEncoder()
                train_test_set[c] = le.fit_transform(
                    train_test_set[c].fillna("UNK").astype(str).values)
                cat_le[c] = le

        train = train_test_set[train_test_set["row_num"].isnull()]
        test = train_test_set[~train_test_set["row_num"].isnull()]

        with span("aggregate features by utrip_id"):
            x_train, x_test_using_train, x_test = [], [], []
            for c in ["city_id", "past_city_id"
                      ] + CATEGORICAL_COLS + NUMERICAL_COLS:
                x_train.append(train.groupby("utrip_id")[c].apply(list))
                x_test.append(test.groupby("utrip_id")[c].apply(list))
                x_test_using_train.append(
                    test.groupby("utrip_id")[c].apply(lambda x: list(x)[:-1]))
            x_train = pd.concat(x_train, axis=1)
            x_test = pd.concat(x_test, axis=1)
            x_test_using_train = pd.concat(x_test_using_train, axis=1)
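            # x_test_using_train keeps test trips with the final (hidden) city
            # removed, so they can later be appended to the training data.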

        with span("sampling training data"):
            x_train["n_trips"] = x_train["city_id"].map(lambda x: len(x))
            x_test_using_train["n_trips"] = x_test_using_train["city_id"].map(
                lambda x: len(x))
            x_train = (x_train.query("n_trips > 2").sort_values(
                "n_trips").reset_index(drop=True))
            x_test_using_train = (
                x_test_using_train.sort_values("n_trips").reset_index(
                    drop=True))
            x_test = x_test.reset_index(drop=True)
            log(f"x_train: {x_train.shape}, x_test: {x_test.shape}")

        if debug:
            log("'--debug' specified. Shrink data size into 1000.")
            x_train = x_train.iloc[:1000]
            x_test = x_test.iloc[:1000]
            config["params"]["num_epochs"] = 2
            log(f"x_train: {x_train.shape}, x_test: {x_test.shape}")

    with span("Prepare data loader for test:"):
        test_dataset = Dataset(x_test, is_train=False)
        test_dataloader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=1,
            num_workers=os.cpu_count(),
            pin_memory=True,
            collate_fn=Collator(is_train=False),
            shuffle=False,
        )

    with span("Get folds:"):
        cv = StratifiedKFold(
            n_splits=config["fold"]["n_splits"],
            shuffle=config["fold"]["shuffle"],
        )
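        # Stratify on trip length binned into 5 buckets so every fold sees a
        # similar distribution of sequence lengths.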
        folds = cv.split(x_train, pd.cut(x_train["n_trips"], 5, labels=False))

    log("Training:")
    oof_preds = np.zeros((len(x_train), len(target_le.classes_)),
                         dtype=np.float32)
    test_preds = np.zeros((len(x_test), len(target_le.classes_)),
                          dtype=np.float32)

    for i_fold, (trn_idx, val_idx) in enumerate(folds):
        if holdout and i_fold > 0:
            break
        with span(f"Fold = {i_fold}"):
            x_trn = x_train.loc[trn_idx, :]
            x_val = x_train.loc[val_idx, :]
            x_trn = pd.concat([x_trn, x_test_using_train],
                              axis=0,
                              ignore_index=True)
            train_dataset = Dataset(x_trn, is_train=True)
            valid_dataset = Dataset(x_val, is_train=True)
            train_dataloader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=config["params"]["batch_size"],
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=Collator(is_train=True),
                shuffle=True,
            )
            valid_dataloader = torch.utils.data.DataLoader(
                valid_dataset,
                batch_size=1,
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=Collator(is_train=True),
                shuffle=False,
            )
            model_cls = MODELS[config["model_name"]]
            model = model_cls(
                n_city_id=len(target_le.classes_),
                n_booker_country=len(cat_le["booker_country"].classes_),
                n_device_class=len(cat_le["device_class"].classes_),
                n_affiliate_id=len(cat_le["affiliate_id"].classes_),
                n_month_checkin=len(cat_le["month_checkin"].classes_),
                n_hotel_country=len(cat_le["past_hotel_country"].classes_),
                emb_dim=config["params"]["emb_dim"],
                rnn_dim=config["params"]["rnn_dim"],
                dropout=config["params"]["dropout"],
                rnn_dropout=config["params"]["rnn_dropout"],
            )
            if i_fold == 0:
                log(f"{summary(model)}")

            criterion = FocalLossWithOutOneHot(gamma=0.5)
            # Prepare optimizer
            param_optimizer = list(model.named_parameters())
            no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [
                {
                    "params": [
                        p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    0.01,
                },
                {
                    "params": [
                        p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    0.0,
                },
            ]
            optimizer = AdamW(
                optimizer_grouped_parameters,
                lr=1e-4,
                weight_decay=0.01,
            )
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=30, eta_min=1e-6)
            logdir = (Path(config["output_dir_path"]) / config["exp_name"] /
                      f"fold{i_fold}")
            loaders = {"train": train_dataloader, "valid": valid_dataloader}
            runner = CustomRunner(device=DEVICE)
            runner.train(
                model=model,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                loaders=loaders,
                main_metric="accuracy04",
                minimize_metric=False,
                logdir=logdir,
                num_epochs=config["params"]["num_epochs"],
                verbose=True,
            )

            log("Predictions using validation data")
            oof_preds[val_idx, :] = np.array(
                list(
                    map(
                        lambda x: x.cpu().numpy()[-1, :],
                        runner.predict_loader(
                            loader=valid_dataloader,
                            resume=f"{logdir}/checkpoints/best.pth",
                            model=model,
                        ),
                    )))
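            # The ground-truth label for each validation trip is its final city.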
            y_val = x_val["city_id"].map(lambda x: x[-1]).values
            score = top_k_accuracy_score(y_val,
                                         oof_preds[val_idx, :],
                                         k=4,
                                         labels=np.arange(
                                             len(target_le.classes_)))
            log(f"val acc@4: {score}")
            np.save(
                Path(config["output_dir_path"]) / config["exp_name"] /
                f"y_val_pred_fold{i_fold}",
                oof_preds[val_idx, :],
            )

            test_preds_ = np.array(
                list(
                    map(
                        lambda x: x.cpu().numpy()[-1, :],
                        runner.predict_loader(
                            loader=test_dataloader,
                            resume=f"{logdir}/checkpoints/best.pth",
                            model=model,
                        ),
                    )))
            test_preds += test_preds_ / cv.n_splits
            np.save(
                Path(config["output_dir_path"]) / config["exp_name"] /
                f"y_test_pred_fold{i_fold}",
                test_preds_,
            )

    log("Evaluation OOF valies:")
    y_train = x_train["city_id"].map(lambda x: x[-1])
    score = top_k_accuracy_score(y_train,
                                 oof_preds,
                                 k=4,
                                 labels=np.arange(len(target_le.classes_)))
    log(f"oof acc@4: {score}")

    log("Save files:")
    np.save(
        Path(config["output_dir_path"]) / config["exp_name"] / f"y_oof_pred",
        oof_preds,
    )
    np.save(
        Path(config["output_dir_path"]) / config["exp_name"] / f"y_test_pred",
        test_preds,
    )
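FocalLossWithOutOneHot is used above but not defined in this example. A minimal sketch of the standard focal-loss formulation over integer class labels (hence "without one-hot"), which the actual implementation may refine:

import torch
import torch.nn.functional as F


class FocalLossWithOutOneHot(torch.nn.Module):
    """Focal loss taking integer class labels rather than one-hot targets."""

    def __init__(self, gamma: float = 0.5):
        super().__init__()
        self.gamma = gamma

    def forward(self, logits: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        # Per-sample cross entropy; pt is the predicted probability of the
        # true class, so (1 - pt) ** gamma down-weights easy examples.
        ce = F.cross_entropy(logits, target, reduction="none")
        pt = torch.exp(-ce)
        return ((1.0 - pt) ** self.gamma * ce).mean()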
Example #5
            train_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=256,
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=MyCollator(is_train=True),
                shuffle=True,
            )
            valid_loader = torch.utils.data.DataLoader(
                valid_dataset,
                batch_size=1,
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=MyCollator(is_train=True),
                shuffle=False,
            )

            runner = CustomRunner(device=device)
            model = BookingNN(
                n_city_id=len(target_le.classes_),
                n_booker_country=len(cat_le["booker_country"].classes_),
                n_device_class=len(cat_le["device_class"].classes_),
                n_affiliate_id=len(cat_le["affiliate_id"].classes_),
                n_month_checkin=len(cat_le["month_checkin"].classes_),
                n_hotel_country=len(cat_le["past_hotel_country"].classes_))
            criterion = torch.nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=30, eta_min=1e-6)

            oof_preds[va_idx, :] = np.array(
                list(
                    map(
                        lambda x: x.cpu().numpy()[-1, :],
                        runner.predict_loader(
                            loader=valid_loader,
                            # logdir is set outside this excerpt
                            resume=f'{logdir}/checkpoints/best.pth',
                            model=model,
                        ),
                    )))
Example #6
        X_tr, y_tr = [
            d for i, d in enumerate(train_paths) if i in list(tr_idx)
        ], train_labels.iloc[tr_idx].values
        X_val, y_val = [
            d for i, d in enumerate(train_paths) if i in list(va_idx)
        ], train_labels.iloc[va_idx].values

        train_dataset = SimpleDataset(X_tr, y_tr, transform=None)
        valid_dataset = SimpleDataset(X_val, y_val, transform=None)

        train_loader = DataLoader(train_dataset, shuffle=True, batch_size=128)
        valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=512)

        loaders = {'train': train_loader, 'valid': valid_loader}
        runner = CustomRunner(device=device)

        model = EfficientNet.from_pretrained('efficientnet-b3')
        model._fc = torch.nn.Sequential(
            torch.nn.Linear(1536, 1),
            Squeeze(),
        )
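        # The EfficientNet-B3 head is replaced for regression: a single linear
        # output from the 1536-dim feature vector, squeezed to shape (batch,).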

        criterion = RMSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               T_max=30,
                                                               eta_min=1e-6)

        logdir = f'../output/logdir_{run_name}/fold{fold_id}'
        runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=loaders,
            logdir=logdir,
            num_epochs=num_epochs,  # num_epochs is defined outside this excerpt
            verbose=True,
        )
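Squeeze and RMSELoss are referenced above but not defined in this snippet. Minimal sketches under the obvious assumptions (Squeeze drops the trailing singleton dimension; RMSELoss is MSE under a square root, with an epsilon for numerical stability):

import torch


class Squeeze(torch.nn.Module):
    """Drop the trailing singleton dimension: (batch, 1) -> (batch,)."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x.squeeze(-1)


class RMSELoss(torch.nn.Module):
    def __init__(self, eps: float = 1e-8):
        super().__init__()
        self.mse = torch.nn.MSELoss()
        self.eps = eps  # guards against a zero gradient at exactly zero error

    def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        return torch.sqrt(self.mse(pred, target) + self.eps)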