# Fit a fresh LabelEncoder on the string form of column `c` and write the
# resulting integer codes back into `train_test[c]`.
# NOTE(review): `astype(str)` runs before `fillna`, so wherever pandas renders
# missing values as the string 'nan' there is nothing left for
# `fillna('unk')` to replace — confirm which placeholder is intended.
le = preprocessing.LabelEncoder()
        train_test[c] = le.fit_transform(
            train_test[c].astype(str).fillna('unk').values)

    # Keep only the rows whose `row_num` is not null; these form the test set.
    test = train_test[~train_test['row_num'].isnull()]

    # Drop rows that repeat the previous row's city and rows with city_id 0.
    # The filtered frame and its grouping are built once and shared by both
    # aggregations below instead of being recomputed for each of them.
    # NOTE(review): shift(1) runs over the whole frame, not per trip, so the
    # first row of a trip is compared with the last row of the trip above it
    # — confirm this is intended.
    test_visits = test[test['city_id'] != test['city_id'].shift(1)].query(
        'city_id!=0')
    visits_by_trip = test_visits.groupby('utrip_id')

    # Per trip: the array of remaining city ids.
    test_trips = visits_by_trip['city_id'].apply(
        lambda x: x.values).reset_index()

    # Per trip: the last entry of every categorical column; `city_id` is then
    # set to the trip's city array. Both frames come from the same grouping,
    # so their rows line up after reset_index().
    X_test = visits_by_trip[categorical_cols].last().reset_index()
    X_test['city_id'] = test_trips['city_id']
    X_test = X_test.reset_index(drop=True)

    # Batches of a single item, served in dataset order.
    test_dataset = BookingDataset(X_test, is_train=False)
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1)

    # Release the intermediate frames that are no longer needed.
    del train_test, test, test_trips, test_visits, visits_by_trip
    gc.collect()

    # Paths iterated by the prediction loop below; only one is listed.
    # presumably each directory holds the artefacts of one trained model —
    # verify against the code that loads from `mp`.
    model_paths = [
        '../input/booking-bi-lstm-ep1/logdir_nn000',
    ]
    for mp in model_paths:
        # Only fold 0 is processed.
        for fold_id in (0, ):
            # A fresh runner and model are created for every (path, fold)
            # pair; the model receives the number of classes in `target_le`.
            runner = CustomRunner(device=device)
            model = BookingNN(len(target_le.classes_))
            # presumably collects the outputs of the prediction loop that
            # follows — verify.
            pred = []
            for prediction in tqdm(
                    runner.predict_loader(
Exemplo n.º 2
0
            train[train['city_id'] != train['city_id'].shift(1)].groupby(
                "utrip_id")[c].apply(list))
        X_test.append(
            test[test['city_id'] != test['city_id'].shift(1)].groupby(
                "utrip_id")[c].apply(list))
    # Join the per-column pieces side by side into one frame per split.
    X_train = pd.concat(X_train, axis=1)
    X_test = pd.concat(X_test, axis=1).reset_index(drop=True)

    # `n_trips` is the length of each row's `city_id` sequence. Training rows
    # with two or fewer entries are discarded and the remainder is ordered
    # by length, with a fresh 0..n-1 index.
    X_train['n_trips'] = X_train['city_id'].map(len)
    X_train = X_train.query('n_trips > 2')
    X_train = X_train.sort_values('n_trips')
    X_train = X_train.reset_index(drop=True)

    # Five unshuffled stratified folds; the stratification labels are
    # supplied where cv.split() is called.
    cv = StratifiedKFold(n_splits=5, shuffle=False)

    # Test loader: batches of a single item, served in dataset order.
    test_dataset = BookingDataset(X_test, is_train=False)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=1,
        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to 0 so DataLoader always receives an int (0 means the
        # data is loaded in the main process).
        num_workers=os.cpu_count() or 0,
        pin_memory=True,
        collate_fn=MyCollator(is_train=False),
        shuffle=False,
    )
    # Drop the local names for frames that are no longer used directly.
    del train_test, train, test, X_test
    gc.collect()

    # Stratify the folds on sequence length: `n_trips` is cut into 5
    # equal-width bins and the bin index is the class label for cv.split().
    for fold_id, (tr_idx, va_idx) in enumerate(
            cv.split(X_train, pd.cut(X_train['n_trips'], 5, labels=False))):
        # The tuple selects which folds are run; all five are enabled here.
        if fold_id in (0, 1, 2, 3, 4):
            # cv.split() yields positional indices; .loc accepts them because
            # X_train's index was reset to 0..n-1 before this loop.
            X_tr = X_train.loc[tr_idx, :]