Example #1
def run(try_num, config):
    args = get_args()

    print('args', args, flush=True)
    print('config:', config.to_dict(), flush=True)

    set_seed(config.rand_seed)

    pretrained_model = "tf_efficientnet_b3_ns"
    model_dir = f'deepinsight-{try_num}'

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv("../input/lish-moa/train_features.csv")
    train_targets = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
    test_features = pd.read_csv("../input/lish-moa/test_features.csv")

    if config.dae_path:
        dae_features = pd.read_csv(config.dae_path)

    if args.debug:
        train_features = train_features.iloc[:500]
        train_targets = train_targets.iloc[:500]
        if config.dae_path:
            dae_features = pd.concat([dae_features.iloc[:500], dae_features.iloc[-3982:]]).reset_index(drop=True)

        config.update(dict(
            kfolds=3,
            n_epoch=3
        ))

    train_features = train_features.sort_values("sig_id").reset_index(drop=True)
    train_targets = train_targets.sort_values("sig_id").reset_index(drop=True)

    cat_features_columns = ["cp_dose", "cp_time"]
    num_feature_columns = [c for c in train_features.columns
                           if c != "sig_id" and c not in cat_features_columns + ['cp_type']]
    all_features_columns = cat_features_columns + num_feature_columns
    target_columns = [c for c in train_targets.columns if c != "sig_id"]
    g_feature_columns = [c for c in num_feature_columns if c.startswith("g-")]
    c_feature_columns = [c for c in num_feature_columns if c.startswith("c-")]

    if config.dae_path:
        if config.dae_strategy == 'replace':
            train_features, test_features = assign_dae_features(
                train_features, test_features, dae_features, len(num_feature_columns))
        else:
            train_features, test_features, dae_feature_columns = merge_dae_features(
                train_features, test_features, dae_features, len(g_feature_columns), len(c_feature_columns))
            all_features_columns += dae_feature_columns

    # Drop control rows (cp_type == 'ctl_vehicle'): their scored targets are all
    # zero, so only treated compounds are used for training. Targets are filtered
    # first, while train_features still provides the cp_type mask.
    train_targets = train_targets.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)
    train_features = train_features.loc[train_features['cp_type'] == 'trt_cp'].reset_index(drop=True)

    if config.normalizer == 'rank':
        train_features, test_features = normalize(train_features, test_features, num_feature_columns)

    for df in [train_features, test_features]:
        df['cp_type'] = df['cp_type'].map({'ctl_vehicle': 0, 'trt_cp': 1})
        df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
        df['cp_time'] = df['cp_time'].map({24: 0, 48: 0.5, 72: 1})

    if config.variance_target_type == 1:
        pickle_path = f'{model_dir}/variance_reduction.pkl'

        # Copy the list: "+=" on the original would mutate num_feature_columns in place.
        variance_target_features = list(num_feature_columns)
        if config.dae_path and config.dae_strategy != 'replace':
            variance_target_features += dae_feature_columns

        if not os.path.exists(pickle_path):
            vt = variance_reduction_fit(train_features, variance_target_features, config.variance_threshold)
            save_pickle(vt, pickle_path)

        vt = load_pickle(pickle_path)
        train_features = variance_reduction_transform(vt, train_features, variance_target_features)
        test_features = variance_reduction_transform(vt, test_features, variance_target_features)
        print('(variance_reduction) Number of features after applying:', len(train_features.columns), flush=True)
        all_features_columns = list(train_features.columns[1:])

    # MultilabelStratifiedKFold (presumably from the iterative-stratification
    # package) stratifies the folds jointly across all scored targets.
    skf = MultilabelStratifiedKFold(n_splits=config.kfolds, shuffle=True, random_state=config.rand_seed)
    y_labels = target_columns  # stratification labels: all scored target columns
    logger = Logger()

    for fold_index, (train_index, val_index) in enumerate(skf.split(train_features, train_targets[y_labels])):
        if args.only_pred:
            print('Skip training', flush=True)
            break

        print(f'Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)

        X_train = train_features.loc[train_index, all_features_columns].copy().values
        y_train = train_targets.iloc[train_index, 1:].copy().values
        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values

        if config.normalizer == 'log':
            scaler = LogScaler()
            if config.norm_apply_all:
                scaler.fit(X_train)
                X_train = scaler.transform(X_train)
                X_valid = scaler.transform(X_valid)
            else:
                target_features = [i for i, c in enumerate(all_features_columns) if c in num_feature_columns]
                non_target_features = [i for i, c in enumerate(all_features_columns) if c not in num_feature_columns]

                scaler.fit(X_train[:, target_features])
                X_train_tr = scaler.transform(X_train[:, target_features])
                X_valid_tr = scaler.transform(X_valid[:, target_features])
                X_train = np.concatenate([X_train[:, non_target_features], X_train_tr], axis=1)
                X_valid = np.concatenate([X_valid[:, non_target_features], X_valid_tr], axis=1)
            save_pickle(scaler, f'{model_dir}/scaler-{fold_index}.pkl')

        transformer = DeepInsightTransformer(
            feature_extractor=config.extractor,
            pixels=config.resolution,
            perplexity=config.perplexity,
            random_state=config.rand_seed,
            n_jobs=-1
        ).fit(X_train)

        save_pickle(transformer, f'{model_dir}/transformer-{fold_index}.pkl')

        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)

        if config.smoothing is not None:
            if config.weighted_loss_weights is not None:
                indices = get_minority_target_index(train_targets, threshold=config.weighted_loss_threshold)
                indices = [int(i not in indices) for i in range(len(target_columns))]
                train_loss_function = SmoothBCEwLogits(
                    smoothing=config.smoothing,
                    weight=config.weighted_loss_weights,
                    weight_targets=indices,
                    n_labels=len(target_columns))
            else:
                train_loss_function = SmoothBCEwLogits(smoothing=config.smoothing)
        else:
            train_loss_function = bce_loss

        eval_loss_function = bce_loss

        optimizer = optim.Adam(model.parameters(), weight_decay=config.weight_decay, lr=config.learning_rate)

        if config.scheduler_type == 'ca':
            scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.t_max, eta_min=0, last_epoch=-1)
        elif config.scheduler_type == 'ms':
            scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=config.ms_scheduler_milestones, gamma=0.1)
        else:
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=0.1, patience=config.rp_patience, eps=1e-4, verbose=True)

        early_stopping = EarlyStopping(patience=7)
        best_score = np.inf
        start_time = time.time()

        for epoch in range(config.n_epoch):

            if config.swap_enable:
                dataset = MoAImageSwapDataset(
                    X_train,
                    y_train,
                    transformer,
                    image_size=config.image_size,
                    swap_prob=config.swap_prob,
                    swap_portion=config.swap_portion)
            else:
                dataset = MoAImageDataset(X_train, y_train, transformer, image_size=config.image_size)

            dataloader = DataLoader(
                dataset,
                batch_size=config.batch_size,
                shuffle=True,
                num_workers=8,
                pin_memory=True,
                drop_last=False)
            loss = loop_train(model, train_loss_function, dataloader, optimizer)

            if config.scheduler_type == 'rp':
                scheduler.step(loss)
            else:
                scheduler.step()
                for param_group in optimizer.param_groups:
                    print('current learning rate:', param_group['lr'])

            del dataset, dataloader

            dataset = MoAImageDataset(X_valid, y_valid, transformer, image_size=config.image_size)
            dataloader = DataLoader(
                dataset,
                batch_size=config.infer_batch_size,
                shuffle=False,
                num_workers=8,
                pin_memory=True,
                drop_last=False)
            valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)

            del dataset, dataloader

            logger.update({'fold': fold_index, 'epoch': epoch + 1, 'train_loss': loss, 'val_loss': valid_loss})
            print(f'epoch {epoch + 1}/{config.n_epoch}  -  train_loss: {loss:.5f}  -  ' +
                  f'valid_loss: {valid_loss:.5f}  -  elapsed: {time_format(time.time() - start_time)}', flush=True)

            if valid_loss < best_score:
                best_score = valid_loss
                torch.save(model.state_dict(), f'./{model_dir}/deepinsight-{fold_index}.pt')

            if early_stopping.should_stop(valid_loss):
                print('Early stopping', flush=True)
                break

        print(f'Done -> Fold {fold_index}/{config.kfolds}  -  best_valid_loss: {best_score:.5f}  -  ' +
              f'elapsed: {time_format(time.time() - start_time)}', flush=True)

        torch.cuda.empty_cache()
        gc.collect()

        if args.return_first_fold:
            logger.save(f'{model_dir}/log.csv')
            return

    test_preds = np.zeros((test_features.shape[0], len(target_columns)))
    start_time = time.time()
    print('Start inference', flush=True)

    oof_preds = np.zeros((len(train_features), len(target_columns)))
    eval_loss_function = bce_loss

    for fold_index, (train_index, val_index) in enumerate(skf.split(train_features, train_targets[y_labels])):
        print(f'Inference Fold: {fold_index}', train_index.shape, val_index.shape, flush=True)
        X_valid = train_features.loc[val_index, all_features_columns].copy().values
        y_valid = train_targets.iloc[val_index, 1:].copy().values
        X_test = test_features[all_features_columns].values

        if config.normalizer == 'log':
            scaler = load_pickle(f'{model_dir}/scaler-{fold_index}.pkl')
            X_valid = scaler.transform(X_valid)
            X_test = scaler.transform(X_test)

        transformer = load_pickle(f'{model_dir}/transformer-{fold_index}.pkl')
        model = MoAEfficientNet(
            pretrained_model_name=pretrained_model,
            fc_size=config.fc_size,
            drop_rate=config.drop_rate,
            drop_connect_rate=config.drop_connect_rate,
            weight_init='goog',
        ).to(DEVICE)
        model.load_state_dict(torch.load(f'./{model_dir}/deepinsight-{fold_index}.pt'))

        dataset = MoAImageDataset(X_valid, y_valid, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset,
            batch_size=config.infer_batch_size,
            shuffle=False,
            num_workers=8,
            pin_memory=True,
            drop_last=False)
        valid_loss, valid_preds = loop_valid(model, eval_loss_function, dataloader)
        print(f'Fold {fold_index}/{config.kfolds}  -  fold_valid_loss: {valid_loss:.5f}', flush=True)
        logger.update({'fold': fold_index, 'val_loss': valid_loss})

        oof_preds[val_index, :] = valid_preds

        dataset = TestDataset(X_test, None, transformer, image_size=config.image_size)
        dataloader = DataLoader(
            dataset,
            batch_size=config.infer_batch_size,
            shuffle=False,
            num_workers=8,
            pin_memory=True,
            drop_last=False)

        preds = loop_preds(model, dataloader)
        test_preds += preds / config.kfolds

    oof_preds_df = train_targets.copy()
    oof_preds_df.loc[:, target_columns] = oof_preds.clip(0, 1)
    oof_preds_df.to_csv(f'{model_dir}/oof_preds.csv', index=False)
    oof_loss = mean_log_loss(train_targets.loc[:, target_columns].values, oof_preds)

    print(f'OOF Validation Loss: {oof_loss:.6f}', flush=True)
    print(f'Done inference  -  elapsed: {time_format(time.time() - start_time)}', flush=True)
    logger.update({'fold': 'oof', 'val_loss': oof_loss})
    logger.save(f'{model_dir}/log.csv')

    submission = pd.DataFrame(data=test_features['sig_id'].values, columns=['sig_id'])
    submission = submission.reindex(columns=['sig_id'] + target_columns)
    submission.loc[:, target_columns] = test_preds.clip(0, 1)
    submission.loc[test_features['cp_type'] == 0, submission.columns[1:]] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
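
The training loop above leans on helpers that the listing does not define (EarlyStopping, SmoothBCEwLogits, loop_train, mean_log_loss). As one reference point, here is a minimal EarlyStopping sketch consistent with how it is called above, constructed with a patience and polled via should_stop(valid_loss); the author's actual implementation may differ.

class EarlyStopping:
    def __init__(self, patience=7, min_delta=0.0):
        self.patience = patience    # epochs to tolerate without improvement
        self.min_delta = min_delta  # minimum decrease that counts as improvement
        self.best = float('inf')
        self.counter = 0

    def should_stop(self, loss):
        if loss < self.best - self.min_delta:
            self.best = loss   # improved: reset the stall counter
            self.counter = 0
        else:
            self.counter += 1  # stalled for another epoch
        return self.counter >= self.patience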
Example #2
def run(try_num, config):
    output_dir = f'./dae-out-{try_num}'

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    args = get_args()

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')

    if args.debug:
        train_features = train_features.iloc[:500]  # iloc: label-based .loc[:500] is end-inclusive (501 rows)
        config.update(dict(n_epochs=3, n_folds=2))

    all_features = pd.concat([train_features,
                              test_features]).reset_index(drop=True)
    g_features_columns = [
        col for col in all_features.columns if col.startswith('g-')
    ]
    c_features_columns = [
        col for col in all_features.columns if col.startswith('c-')
    ]
    feature_columns = g_features_columns + c_features_columns
    n_features = len(feature_columns)

    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds,
                                      random_state=42,
                                      shuffle=True)
    logger = Logger()

    for fold_index, (train_idx, valid_idx) in enumerate(
            kfold.split(all_features.values, all_features.values)):
        print('Fold: ', fold_index + 1, flush=True)

        x_train = all_features.loc[train_idx]
        x_valid = all_features.loc[valid_idx]

        model = new_autoencoder(config.model_kind,
                                n_features=n_features).to(DEVICE)
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=config.learning_rate,
                                     weight_decay=1e-5)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               mode='min',
                                                               factor=0.1,
                                                               patience=3,
                                                               eps=1e-4,
                                                               verbose=True)
        early_stopping = EarlyStopping(patience=10)
        best_score = np.inf

        for epoch in range(config.n_epochs):
            dataset = DaeDataset(x_train,
                                 feature_columns,
                                 noise_ratio=config.noise_ratio)
            dataloader = DataLoader(dataset,
                                    batch_size=config.batch_size,
                                    shuffle=True)

            train_loss = loop_train(model, criterion, dataloader, optimizer)

            dataset = DaeDataset(x_valid,
                                 feature_columns,
                                 noise_ratio=config.noise_ratio)
            dataloader = DataLoader(dataset,
                                    batch_size=config.valid_batch_size,
                                    shuffle=False)
            valid_loss, _ = loop_valid(model, criterion, dataloader)

            scheduler.step(valid_loss)

            logger.update({
                'fold': fold_index,
                'epoch': epoch + 1,
                'train_loss': train_loss,
                'val_loss': valid_loss
            })
            print(
                f'epoch {epoch + 1}/{config.n_epochs}  -  train_loss: {train_loss:.5f}  -  '
                + f'valid_loss: {valid_loss:.5f}',
                flush=True)

            if valid_loss < best_score:
                best_score = valid_loss
                torch.save(model.state_dict(),
                           f'./{output_dir}/dae_fold_weight_{fold_index}.pt')

            if early_stopping.should_stop(valid_loss):
                print('Early stopping', flush=True)
                break

    logger.save(f'./{output_dir}/dae_log.csv')
    oof_preds = []

    for fold_index in range(config.n_folds):
        model = new_autoencoder(config.model_kind,
                                n_features=n_features).to(DEVICE)
        model.load_state_dict(
            torch.load(f'./{output_dir}/dae_fold_weight_{fold_index}.pt'))
        model.eval()

        dataset = DaeDataset(all_features,
                             feature_columns,
                             noise_ratio=config.noise_ratio)
        dataloader = DataLoader(dataset,
                                batch_size=config.valid_batch_size,
                                shuffle=False)

        loss, preds = loop_valid(model, nn.MSELoss(), dataloader)

        logger.update({'fold': fold_index, 'val_loss': loss})
        print('Evaluation   fold: {}  -  valid_loss: {:.5f}'.format(
            fold_index, loss),
              flush=True)

        oof_preds.append(preds)

    print('Overall evaluation score: {:.5f}'.format(
        mean_squared_error(all_features.loc[:, feature_columns].values,
                           np.mean(oof_preds, axis=0))),
          flush=True)

    # for i, preds in enumerate(oof_preds):
    #     create_pred_feature_df(preds, all_features).to_csv(f'./{output_dir}/dae_features_{i}.csv', index=False)
    create_pred_feature_df(np.mean(oof_preds, axis=0), all_features).to_csv(
        f'./{output_dir}/dae_features_mean.csv', index=False)
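
DaeDataset is not shown in the listing. Below is a minimal sketch, assuming the common "swap noise" corruption for tabular denoising autoencoders: each value is replaced, with probability noise_ratio, by the same feature taken from a random row, and the dataset yields (noisy, clean) pairs for the MSE reconstruction loss used above. The author's actual class may differ.

import numpy as np
import torch
from torch.utils.data import Dataset

class DaeDataset(Dataset):
    def __init__(self, df, feature_columns, noise_ratio=0.1):
        self.x = df[feature_columns].values.astype(np.float32)
        self.noise_ratio = noise_ratio

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        clean = self.x[index]
        noisy = clean.copy()
        # pick the features to corrupt, then fill each from a random donor row
        mask = np.random.rand(clean.shape[0]) < self.noise_ratio
        donors = np.random.randint(0, len(self.x), size=int(mask.sum()))
        noisy[mask] = self.x[donors, np.nonzero(mask)[0]]
        return torch.from_numpy(noisy), torch.from_numpy(clean)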
Example #3
def run(try_num, config):
    logger = Logger()
    args = get_args()

    print('config:', config.to_dict(), flush=True)
    print('args:', args, flush=True)
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    model_dir = f'blending-01-nn-{try_num}'

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    train_features = pd.read_csv('../input/lish-moa/train_features.csv')
    train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    dae_features = pd.read_csv(config.dae_path)
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')

    if args.debug:
        train_features = train_features[:500]
        train_targets = train_targets[:500]
        dae_features = pd.concat(
            [dae_features.iloc[:500],
             dae_features.iloc[-3982:]]).reset_index(drop=True)

        config.update(
            dict(
                n_folds=3,
                seeds=[222],
                n_epochs=3,
                batch_size=128,
            ))

    target_columns = [col for col in train_targets.columns if col != 'sig_id']
    n_targets = len(target_columns)

    train_features, train_targets, test_features = preprocess(
        config, model_dir, train_features, train_targets, test_features,
        dae_features)
    features_columns = [
        col for col in train_features.columns if col not in [
            'sig_id', 'cp_type', 'cp_time', 'cp_dose', 'cp_type_ctl_vehicle',
            'cp_type_trt_cp'
        ]
    ]

    metric_loss_function = nn.BCELoss()

    if config.weighted_loss_strategy == 1:
        indices = get_minority_target_index(
            train_targets, threshold=config.weighted_loss_threshold)
        indices = [int(i not in indices) for i in range(len(target_columns))]
        smooth_loss_function = SmoothBCELoss(
            smoothing=config.smoothing,
            weight=config.weighted_loss_weights,
            weight_targets=indices,
            n_labels=n_targets)
    else:
        smooth_loss_function = SmoothBCELoss(smoothing=config.smoothing)

    kfold = MultilabelStratifiedKFold(n_splits=config.n_folds,
                                      random_state=42,
                                      shuffle=True)

    for seed_index, seed in enumerate(config.seeds):
        if args.only_pred:
            print('Skip training', flush=True)
            break

        print(f'Train seed {seed}', flush=True)
        set_seed(seed)

        for fold_index, (train_indices, val_indices) in enumerate(
                kfold.split(train_targets[target_columns].values,
                            train_targets[target_columns].values)):
            print(f'Train fold {fold_index + 1}', flush=True)

            x_train = train_features.loc[train_indices, features_columns]
            y_train = train_targets.loc[train_indices, target_columns]
            x_val = train_features.loc[val_indices, features_columns]
            y_val = train_targets.loc[val_indices, target_columns]

            model = new_model(config.model_kind,
                              len(features_columns)).to(DEVICE)
            checkpoint_path = f'{model_dir}/repeat-{seed}_Fold-{fold_index + 1}.pt'
            optimizer = optim.Adam(model.parameters(),
                                   weight_decay=config.weight_decay,
                                   lr=config.learning_rate)
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                             mode='min',
                                                             factor=0.1,
                                                             patience=3,
                                                             eps=1e-4,
                                                             verbose=True)

            best_loss = np.inf

            for epoch in range(config.n_epochs):
                dataset = MoaDataset(x_train.values, y_train.values)
                dataloader = DataLoader(dataset,
                                        batch_size=config.batch_size,
                                        shuffle=True,
                                        drop_last=True)

                train_loss = loop_train(model,
                                        dataloader,
                                        optimizer,
                                        loss_functions=(
                                            smooth_loss_function,
                                            metric_loss_function,
                                        ))

                dataset = MoaDataset(x_val.values, y_val.values)
                dataloader = DataLoader(dataset,
                                        batch_size=config.val_batch_size,
                                        shuffle=False)
                valid_loss, _ = loop_valid(model, dataloader,
                                           metric_loss_function)

                print(f'Epoch {epoch + 1}/{config.n_epochs}   -   loss: {train_loss:5.5f}   -   '
                      f'val_loss: {valid_loss:5.5f}',
                      flush=True)

                logger.update({
                    'epoch': epoch + 1,
                    'loss': train_loss,
                    'val_loss': valid_loss
                })

                scheduler.step(valid_loss)

                if valid_loss < best_loss:
                    best_loss = valid_loss
                    torch.save(model.state_dict(), checkpoint_path)

    # Blending: fold predictions are averaged within each seed, then the
    # per-seed predictions are averaged for the final OOF score and submission.
    oof_preds = np.zeros((len(train_features), len(config.seeds), n_targets))
    test_preds = np.zeros((len(test_features), n_targets))

    for seed_index, seed in enumerate(config.seeds):

        print(f'Inference for seed {seed}', flush=True)

        _test_preds_in_seed = np.zeros((len(test_features), n_targets))

        for fold_index, (_, valid_indices) in enumerate(
                kfold.split(train_targets[target_columns].values,
                            train_targets[target_columns].values)):
            x_val = train_features.loc[valid_indices, features_columns]
            y_val = train_targets.loc[valid_indices, target_columns]

            checkpoint_path = f'{model_dir}/repeat-{seed}_Fold-{fold_index + 1}.pt'
            model = new_model(config.model_kind,
                              len(features_columns)).to(DEVICE)
            model.load_state_dict(torch.load(checkpoint_path))

            dataset = MoaDataset(x_val.values, y_val.values)
            dataloader = DataLoader(dataset,
                                    batch_size=config.val_batch_size,
                                    shuffle=False)
            preds = loop_pred(model, dataloader)

            oof_preds[valid_indices, seed_index, :] = preds

            dataset = MoaDataset(test_features[features_columns].values, None)
            dataloader = DataLoader(dataset,
                                    batch_size=config.val_batch_size,
                                    shuffle=False)
            preds = loop_pred(model, dataloader)

            _test_preds_in_seed += preds / config.n_folds

        score = mean_log_loss(train_targets.loc[:, target_columns].values,
                              oof_preds[:, seed_index, :],
                              n_targets=n_targets)
        test_preds += _test_preds_in_seed / len(config.seeds)

        print(f'Score for this seed {score:5.5f}', flush=True)
        logger.update({'val_loss': score})

    # Evaluate validation score
    oof_preds = np.mean(oof_preds, axis=1)
    score = mean_log_loss(train_targets.loc[:, target_columns].values,
                          oof_preds,
                          n_targets=n_targets)
    print(f'Overall score is {score:5.5f}', flush=True)

    # Save validation prediction
    oof_pred_df = train_targets.copy()
    oof_pred_df.iloc[:, 1:] = oof_preds
    oof_pred_df.to_csv(f'{model_dir}/oof_pred.csv', index=False)

    # Save log
    logger.update({'val_loss': score})
    logger.save(f'{model_dir}/log.csv')

    # Save Test Prediction
    test_features = pd.read_csv('../input/lish-moa/test_features.csv')  # re-read to recover the raw cp_type column used below
    submission = create_submission(test_features, ['sig_id'] + target_columns)
    submission[target_columns] = test_preds
    submission.loc[test_features['cp_type'] == 'ctl_vehicle',
                   target_columns] = 0
    submission.to_csv(f'{model_dir}/submission.csv', index=False)
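
All three examples score predictions with mean_log_loss, which the listing does not define. A minimal sketch matching both call sites above (with and without n_targets), assuming the LISH-MoA metric of log loss averaged over the target columns; the clipping bound is an assumption.

import numpy as np
from sklearn.metrics import log_loss

def mean_log_loss(y_true, y_pred, n_targets=None):
    if n_targets is None:
        n_targets = y_true.shape[1]
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)  # guard against log(0)
    # labels=[0, 1] keeps columns with a single observed class from raising
    losses = [log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
              for i in range(n_targets)]
    return float(np.mean(losses))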