'utrip_id')[categorical_cols].last().reset_index() X_test['city_id'] = test_trips['city_id'] X_test = X_test.reset_index(drop=True) test_dataset = BookingDataset(X_test, is_train=False) test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1) del train_test, test, test_trips gc.collect() model_paths = [ '../input/booking-bi-lstm-ep1/logdir_nn000', ] for mp in model_paths: for fold_id in (0, ): runner = CustomRunner(device=device) model = BookingNN(len(target_le.classes_)) pred = [] for prediction in tqdm( runner.predict_loader( loader=test_loader, resume=f'{mp}/fold{fold_id}/checkpoints/best.pth', model=model, )): pred.append( target_le.inverse_transform( np.argsort(prediction.cpu().numpy()[-1, :])[-4:])) pred = np.array(pred) np.save( f"y_pred{mp.replace('/', '_').replace('.', '')}_fold{fold_id}", pred)
batch_size=256, num_workers=os.cpu_count(), pin_memory=True, collate_fn=MyCollator(is_train=True), shuffle=True, ) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=1, num_workers=os.cpu_count(), pin_memory=True, collate_fn=MyCollator(is_train=True), shuffle=False, ) runner = CustomRunner(device=device) model = BookingNN( n_city_id=len(target_le.classes_), n_booker_country=len(cat_le["booker_country"].classes_), n_device_class=len(cat_le["device_class"].classes_), n_affiliate_id=len(cat_le["affiliate_id"].classes_), n_month_checkin=len(cat_le["month_checkin"].classes_), n_hotel_country=len(cat_le["past_hotel_country"].classes_)) criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=30, eta_min=1e-6) logdir = f'logdir_{run_name}/fold{fold_id}' loaders = {'train': train_loader} runner.train(
batch_size=256, num_workers=os.cpu_count(), pin_memory=True, collate_fn=MyCollator(is_train=True), shuffle=True, ) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=1, num_workers=os.cpu_count(), pin_memory=True, collate_fn=MyCollator(is_train=True), shuffle=False, ) runner = CustomRunner(device=device) model = BookingNN( n_city_id=len(target_le.classes_), n_booker_country=len(cat_le["booker_country"].classes_), n_device_class=len(cat_le["device_class"].classes_), n_affiliate_id=len(cat_le["affiliate_id"].classes_), ) criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30, eta_min=1e-6) logdir = f'logdir_{run_name}/fold{fold_id}' loaders = {'train': train_loader} runner.train( model=model, criterion=criterion,
def run(config: dict, holdout: bool, debug: bool) -> None:
    """End-to-end pipeline: load data, engineer sequence features, train an
    RNN per CV fold, and save out-of-fold / test predictions to disk.

    Args:
        config: Experiment configuration dict; keys used here include
            "seed", "fold" (n_splits, shuffle), "params" (bacth_size [sic],
            num_epochs, emb_dim, rnn_dim, dropout, rnn_dropout),
            "model_name", "output_dir_path", "exp_name".
        holdout: If True, train only fold 0 and stop (quick validation run).
        debug: If True, shrink data to 1000 trips and force 2 epochs.
    """
    log("Run with configuration:")
    log(f"{config}")
    seed_everything(config["seed"])

    with span("Load train and test set:"):
        train_test_set = load_train_test_set(config)
        log(f"{train_test_set.shape}")

    # Pre-computed city embedding table; feature columns are named "0".."n-1"
    # with one key column.  Renamed so it joins onto the *previous* city.
    emb_df = pd.read_csv("./data/interim/emb_df.csv")
    n_emb = emb_df.shape[1] - 1
    emb_cols = [str(i) for i in range(n_emb)]
    emb_df.rename(columns={"city_id": "past_city_id"}, inplace=True)

    with span("Preprocessing:"):
        with span("Shift target values for input sequence."):
            # The model predicts the NEXT city, so each step's input is the
            # previous step's city / hotel country; the first step of every
            # trip gets an UNK filler value.
            unk_city_id = 0
            train_test_set["past_city_id"] = (
                train_test_set.groupby("utrip_id")["city_id"].shift(1).fillna(
                    unk_city_id).astype(int))
            unk_hotel_country = "UNK"
            train_test_set["past_hotel_country"] = (
                train_test_set.groupby("utrip_id")["hotel_country"].shift(
                    1).fillna(unk_hotel_country).astype(str))
            train_test_set = pd.merge(train_test_set,
                                      emb_df,
                                      on="past_city_id",
                                      how="left")
            # Cities missing from the embedding table get a zero vector.
            train_test_set[emb_cols] = train_test_set[emb_cols].fillna(0)
            train_test_set["city_embedding"] = train_test_set[emb_cols].apply(
                lambda x: list(x), axis=1)

        with span("Encode of target values."):
            # NOTE(review): the encoder is fit on city_id only, then applied
            # to past_city_id — this assumes every past id (including the 0
            # filler) also occurs as a target city; confirm against the data.
            target_le = preprocessing.LabelEncoder()
            train_test_set["city_id"] = target_le.fit_transform(
                train_test_set["city_id"])
            train_test_set["past_city_id"] = target_le.transform(
                train_test_set["past_city_id"])

        with span("Add features."):
            log("Convert data type of checkin and checkout.")
            train_test_set["checkin"] = pd.to_datetime(
                train_test_set["checkin"])
            train_test_set["checkout"] = pd.to_datetime(
                train_test_set["checkout"])

            log("Create month_checkin feature.")
            train_test_set["month_checkin"] = train_test_set[
                "checkin"].dt.month
            train_test_set["year_checkin"] = train_test_set["checkin"].dt.year

            log("Create days_stay feature.")
            # log10 compresses the long right tail of stay lengths.
            train_test_set["days_stay"] = (
                train_test_set["checkout"] -
                train_test_set["checkin"]).dt.days.apply(lambda x: np.log10(x))

            log("Create num_checkin feature.")
            # Position of the booking within its trip (rank by checkin date).
            train_test_set["num_checkin"] = (train_test_set.groupby(
                "utrip_id")["checkin"].rank().apply(lambda x: np.log10(x)))

            log("Create days_move feature.")
            # Gap in days between the previous checkout and this checkin.
            train_test_set["past_checkout"] = train_test_set.groupby(
                "utrip_id")["checkout"].shift(1)
            train_test_set["days_move"] = (
                (train_test_set["checkin"] - train_test_set["past_checkout"]
                 ).dt.days.fillna(0).apply(lambda x: np.log1p(x)))

            log("Create aggregation features.")
            # City-popularity counts, keyed by city then merged back onto the
            # *previous*-city column so they describe the city just visited.
            # NOTE(review): "city_id != 0" filters on the *encoded* id at
            # this point — confirm that encoded 0 is the intended UNK class.
            num_visit_drop_duplicates = train_test_set.query("city_id != 0")[[
                "user_id", "city_id"
            ]].drop_duplicates().groupby("city_id").size().apply(
                lambda x: np.log1p(x)).reset_index()
            num_visit_drop_duplicates.columns = [
                "past_city_id", "num_visit_drop_duplicates"
            ]
            num_visit = train_test_set.query("city_id != 0")[[
                "user_id", "city_id"
            ]].groupby("city_id").size().apply(
                lambda x: np.log1p(x)).reset_index()
            num_visit.columns = ["past_city_id", "num_visit"]
            # How often each city is visited twice in a row (row follows an
            # identical city_id).
            num_visit_same_city = train_test_set[
                train_test_set['city_id'] == train_test_set['city_id'].shift(
                    1)].groupby("city_id").size().apply(
                        lambda x: np.log1p(x)).reset_index()
            num_visit_same_city.columns = [
                "past_city_id", "num_visit_same_city"
            ]
            train_test_set = pd.merge(train_test_set,
                                      num_visit_drop_duplicates,
                                      on="past_city_id",
                                      how="left")
            train_test_set = pd.merge(train_test_set,
                                      num_visit,
                                      on="past_city_id",
                                      how="left")
            train_test_set = pd.merge(train_test_set,
                                      num_visit_same_city,
                                      on="past_city_id",
                                      how="left")
            train_test_set["num_visit_drop_duplicates"].fillna(0,
                                                               inplace=True)
            train_test_set["num_visit"].fillna(0, inplace=True)
            train_test_set["num_visit_same_city"].fillna(0, inplace=True)
            # Running count of consecutive stays in the same city per trip.
            train_test_set["num_stay_consecutively"] = train_test_set.groupby(
                ["utrip_id", "past_city_id"])["past_city_id"].rank(
                    method="first").fillna(1).apply(lambda x: np.log1p(x))

        with span("Encode of categorical values."):
            # Keep each fitted encoder: vocabulary sizes feed the embedding
            # layer dimensions of the model below.
            cat_le = {}
            for c in CATEGORICAL_COLS:
                le = preprocessing.LabelEncoder()
                train_test_set[c] = le.fit_transform(
                    train_test_set[c].fillna("UNK").astype(str).values)
                cat_le[c] = le

    # NOTE(review): rows with a null row_num are treated as train —
    # presumably row_num is only populated for the test file; confirm in
    # load_train_test_set.
    train = train_test_set[train_test_set["row_num"].isnull()]
    test = train_test_set[~train_test_set["row_num"].isnull()]

    with span("aggregate features by utrip_id"):
        # Collapse per-booking rows into one list-valued sequence per trip.
        x_train, x_test_using_train, x_test = [], [], []
        for c in ["city_id", "past_city_id"
                  ] + CATEGORICAL_COLS + NUMERICAL_COLS:
            x_train.append(train.groupby("utrip_id")[c].apply(list))
            x_test.append(test.groupby("utrip_id")[c].apply(list))
            # Test trips minus their final (to-be-predicted) step are reused
            # as extra training sequences.
            x_test_using_train.append(
                test.groupby("utrip_id")[c].apply(lambda x: list(x)[:-1]))
        x_train = pd.concat(x_train, axis=1)
        x_test = pd.concat(x_test, axis=1)
        x_test_using_train = pd.concat(x_test_using_train, axis=1)

    with span("sampling training data"):
        x_train["n_trips"] = x_train["city_id"].map(lambda x: len(x))
        x_test_using_train["n_trips"] = x_test_using_train["city_id"].map(
            lambda x: len(x))
        # Drop very short trips; sorting by length groups similar-length
        # sequences together.
        x_train = (x_train.query("n_trips > 2").sort_values(
            "n_trips").reset_index(drop=True))
        x_test_using_train = (
            x_test_using_train.sort_values("n_trips").reset_index(
                drop=True))
        x_test = x_test.reset_index(drop=True)
        log(f"x_train: {x_train.shape}, x_test: {x_test.shape}")

    if debug:
        log("'--debug' specified. Shrink data size into 1000.")
        x_train = x_train.iloc[:1000]
        x_test = x_test.iloc[:1000]
        config["params"]["num_epochs"] = 2
        log(f"x_train: {x_train.shape}, x_test: {x_test.shape}")

    with span("Prepare data loader for test:"):
        # batch_size=1: trips have variable length; the collator handles
        # per-batch shaping.
        test_dataset = Dataset(x_test, is_train=False)
        test_dataloader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=1,
            num_workers=os.cpu_count(),
            pin_memory=True,
            collate_fn=Collator(is_train=False),
            shuffle=False,
        )

    with span("Get folds:"):
        # Stratify on binned trip length so each fold sees a similar mix of
        # sequence lengths.
        cv = StratifiedKFold(
            n_splits=config["fold"]["n_splits"],
            shuffle=config["fold"]["shuffle"],
        )
        folds = cv.split(x_train, pd.cut(x_train["n_trips"], 5, labels=False))

    log("Training:")
    # Per-class probability matrices: out-of-fold for train, fold-averaged
    # for test.
    oof_preds = np.zeros((len(x_train), len(target_le.classes_)),
                         dtype=np.float32)
    test_preds = np.zeros((len(x_test), len(target_le.classes_)),
                          dtype=np.float32)
    for i_fold, (trn_idx, val_idx) in enumerate(folds):
        if holdout and i_fold > 0:
            break
        with span(f"Fold = {i_fold}"):
            x_trn = x_train.loc[trn_idx, :]
            x_val = x_train.loc[val_idx, :]
            # Augment the training fold with the truncated test sequences.
            x_trn = pd.concat([x_trn, x_test_using_train],
                              axis=0,
                              ignore_index=True)
            train_dataset = Dataset(x_trn, is_train=True)
            valid_dataset = Dataset(x_val, is_train=True)
            train_dataloader = torch.utils.data.DataLoader(
                train_dataset,
                # NOTE(review): "bacth_size" is misspelled; it works only
                # because the config file spells the key the same way.
                # Renaming must be done in both places at once.
                batch_size=config["params"]["bacth_size"],
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=Collator(is_train=True),
                shuffle=True,
            )
            valid_dataloader = torch.utils.data.DataLoader(
                valid_dataset,
                batch_size=1,
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=Collator(is_train=True),
                shuffle=False,
            )
            # Embedding vocabulary sizes come from the fitted encoders.
            model_cls = MODELS[config["model_name"]]
            model = model_cls(
                n_city_id=len(target_le.classes_),
                n_booker_country=len(cat_le["booker_country"].classes_),
                n_device_class=len(cat_le["device_class"].classes_),
                n_affiliate_id=len(cat_le["affiliate_id"].classes_),
                n_month_checkin=len(cat_le["month_checkin"].classes_),
                n_hotel_country=len(cat_le["past_hotel_country"].classes_),
                emb_dim=config["params"]["emb_dim"],
                rnn_dim=config["params"]["rnn_dim"],
                dropout=config["params"]["dropout"],
                rnn_dropout=config["params"]["rnn_dropout"],
            )
            if i_fold == 0:
                log(f"{summary(model)}")
            criterion = FocalLossWithOutOneHot(gamma=0.5)
            # Prepare optimizer: exclude biases and LayerNorm parameters
            # from weight decay.
            param_optimizer = list(model.named_parameters())
            no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [
                {
                    "params": [
                        p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)
                    ],
                    "weight_decay": 0.01,
                },
                {
                    "params": [
                        p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)
                    ],
                    "weight_decay": 0.0,
                },
            ]
            optimizer = AdamW(
                optimizer_grouped_parameters,
                lr=1e-4,
                weight_decay=0.01,
            )
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=30, eta_min=1e-6)
            logdir = (Path(config["output_dir_path"]) / config["exp_name"] /
                      f"fold{i_fold}")
            loaders = {"train": train_dataloader, "valid": valid_dataloader}
            runner = CustomRunner(device=DEVICE)
            runner.train(
                model=model,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                loaders=loaders,
                main_metric="accuracy04",
                minimize_metric=False,
                logdir=logdir,
                num_epochs=config["params"]["num_epochs"],
                verbose=True,
            )

            log("Predictions using validation data")
            # Take the last timestep's output as the next-city prediction
            # for each (batch_size=1) sequence, restoring the best
            # checkpoint.
            oof_preds[val_idx, :] = np.array(
                list(
                    map(
                        lambda x: x.cpu().numpy()[-1, :],
                        runner.predict_loader(
                            loader=valid_dataloader,
                            resume=f"{logdir}/checkpoints/best.pth",
                            model=model,
                        ),
                    )))
            y_val = x_val["city_id"].map(lambda x: x[-1]).values
            score = top_k_accuracy_score(y_val,
                                         oof_preds[val_idx, :],
                                         k=4,
                                         labels=np.arange(
                                             len(target_le.classes_)))
            log(f"val acc@4: {score}")
            np.save(
                Path(config["output_dir_path"]) / config["exp_name"] /
                f"y_val_pred_fold{i_fold}",
                oof_preds[val_idx, :],
            )
            test_preds_ = np.array(
                list(
                    map(
                        lambda x: x.cpu().numpy()[-1, :],
                        runner.predict_loader(
                            loader=test_dataloader,
                            resume=f"{logdir}/checkpoints/best.pth",
                            model=model,
                        ),
                    )))
            # Average test predictions over folds.
            # NOTE(review): under holdout=True only fold 0 contributes, so
            # test_preds remains scaled by 1/n_splits — confirm intended.
            test_preds += test_preds_ / cv.n_splits
            np.save(
                Path(config["output_dir_path"]) / config["exp_name"] /
                f"y_test_pred_fold{i_fold}",
                test_preds_,
            )

    # NOTE(review): "valies" typo lives in a runtime log string; left as-is.
    log("Evaluation OOF valies:")
    y_train = x_train["city_id"].map(lambda x: x[-1])
    score = top_k_accuracy_score(y_train,
                                 oof_preds,
                                 k=4,
                                 labels=np.arange(len(target_le.classes_)))
    log(f"oof acc@4: {score}")

    log("Save files:")
    np.save(
        Path(config["output_dir_path"]) / config["exp_name"] / f"y_oof_pred",
        oof_preds,
    )
    np.save(
        Path(config["output_dir_path"]) / config["exp_name"] / f"y_test_pred",
        test_preds,
    )
batch_size=256, num_workers=os.cpu_count(), pin_memory=True, collate_fn=MyCollator(is_train=True), shuffle=True, ) valid_loader = torch.utils.data.DataLoader( valid_dataset, batch_size=1, num_workers=os.cpu_count(), pin_memory=True, collate_fn=MyCollator(is_train=True), shuffle=False, ) runner = CustomRunner(device=device) model = BookingNN( n_city_id=len(target_le.classes_), n_booker_country=len(cat_le["booker_country"].classes_), n_device_class=len(cat_le["device_class"].classes_), n_affiliate_id=len(cat_le["affiliate_id"].classes_), n_month_checkin=len(cat_le["month_checkin"].classes_), n_hotel_country=len(cat_le["past_hotel_country"].classes_)) criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimizer, T_max=30, eta_min=1e-6) oof_preds[va_idx, :] = np.array( list( map(
X_tr, y_tr = [ d for i, d in enumerate(train_paths) if i in list(tr_idx) ], train_labels.iloc[tr_idx].values X_val, y_val = [ d for i, d in enumerate(train_paths) if i in list(va_idx) ], train_labels.iloc[va_idx].values train_dataset = SimpleDataset(X_tr, y_tr, transform=None) valid_dataset = SimpleDataset(X_val, y_tr, transform=None) train_loader = DataLoader(train_dataset, shuffle=True, batch_size=128) valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=512) loaders = {'train': train_loader, 'valid': valid_loader} runner = CustomRunner(device=device) model = EfficientNet.from_pretrained('efficientnet-b3') model._fc = torch.nn.Sequential( torch.nn.Linear(1536, 1), Squeeze(), ) criterion = RMSELoss() optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30, eta_min=1e-6) logdir = f'../output/logdir_{run_name}/fold{fold_id}' runner.train(