Example No. 1
def search_blending_weight(
    predictions: List[np.ndarray],
    target: np.ndarray,
    n_iter: int,
    func: Callable[[np.ndarray, np.ndarray], float] = rmse,
    is_higher_better: bool = False,
) -> Tuple[float, np.ndarray]:
    best_weights = np.zeros(len(predictions))
    best_score = -np.inf if is_higher_better else np.inf

    for i in range(n_iter):
        seed_everything(i)
        dice = np.random.rand(len(predictions))
        weights = dice / dice.sum()
        blended = np.zeros(len(predictions[0]))
        for weight, pred in zip(weights, predictions):
            blended += weight * pred
        score = func(blended, target)
        if is_higher_better:
            if score > best_score:
                best_score = score
                best_weights = weights
        else:
            if score < best_score:
                best_score = score
                best_weights = weights
    return best_score, best_weights
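A brief usage sketch for the helper above (not part of the original source); it assumes rmse and seed_everything from the same module, and the toy arrays below are invented purely for illustration:

# Random search over 100 normalized weight vectors; lower RMSE wins by default.
preds = [np.array([0.1, 0.4, 0.35, 0.8]), np.array([0.2, 0.5, 0.3, 0.7])]
target = np.array([0.0, 1.0, 0.0, 1.0])
best_score, best_weights = search_blending_weight(preds, target, n_iter=100)
print(best_score, best_weights)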
Example No. 2
def run_baseline():
    time_experiment = datetime.now().strftime("%m%d%Y_%H%M")
    seed_everything(0)

    ds = Dataset()
    ds.load_dataset()

    from sklearn.preprocessing import LabelEncoder

    nan_constant = -999
    for col, col_type in ds.X_train.dtypes.items():
        if col_type == "object":
            ds.X_train[col] = ds.X_train[col].fillna(nan_constant)
            ds.X_test[col] = ds.X_test[col].fillna(nan_constant)

            lbl = LabelEncoder()
            lbl.fit(list(ds.X_train[col].values) + list(ds.X_test[col].values))
            ds.X_train[col] = lbl.transform(list(ds.X_train[col].values))
            ds.X_test[col] = lbl.transform(list(ds.X_test[col].values))

            if nan_constant in lbl.classes_:
                nan_transformed = lbl.transform([nan_constant])[0]
                ds.X_train.loc[ds.X_train[col] == nan_transformed,
                               col] = np.nan
                ds.X_test.loc[ds.X_test[col] == nan_transformed, col] = np.nan

        if col in ds.categorical_cols:
            ds.X_train[col] = ds.X_train[col].fillna(-1).astype("category")
            ds.X_test[col] = ds.X_test[col].fillna(-1).astype("category")

    lgb_params = {
        "n_estimators": 50000,
        "early_stopping_rounds": 200,
        "num_leaves": 256,
        "learning_rate": 0.03,
        "max_depth": 9,
        "objective": "binary",
        "metric": "auc",
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "scale_pos_weight":
        5.5,  #  mimic a fraud rate ~20% - 0.2*N_legit/N_fraud
        "boosting_type": "gbdt",
        "seed": 1337,
        "n_jobs": -1,
        "verbosity": -1,
    }

    folds = KFold(n_splits=5, shuffle=False)  # random_state is ignored when shuffle=False
    result = run_train_predict(ds, clf_lgb, lgb_params, folds, None)

    path_to_preds = f"baseline_lgb_{time_experiment}"
    ds.submission["isFraud"] = result["prediction"]
    ds.write_submission(path_to_preds)
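Every example on this page calls a project-local seed_everything helper rather than a library function. A minimal sketch of what such a helper typically looks like (an assumption, not any of the original implementations; some examples accept extra arguments such as gpu_mode) seeds Python, NumPy, and PyTorch:

import os
import random

import numpy as np
import torch


def seed_everything(seed: int = 0) -> None:
    # Fix every common source of randomness so runs are reproducible.
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False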
Example No. 3
def main(cfg: DictConfig):
    print('VinBigData Training Classification')
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    # Config  -------------------------------------------------------------------
    data_dir = cfg.data.data_dir
    seed_everything(cfg.data.seed)

    load_dotenv('.env')
    wandb.login()
    wandb_logger = WandbLogger(project='VinBigData-Classification', reinit=True)
    wandb_logger.log_hyperparams(dict(cfg.data))
    wandb_logger.log_hyperparams(dict(cfg.train))
    wandb_logger.log_hyperparams(dict(cfg.aug_kwargs_classification))

    # Data Module  -------------------------------------------------------------------
    transform = ImageTransform(cfg, type='classification')
    cv = StratifiedKFold(n_splits=cfg.data.n_splits)
    dm = ChestXrayDataModule(data_dir, cfg, transform, cv, data_type='classification', sample=False)

    # Model  -----------------------------------------------------------
    net = Timm_model(cfg.train.backbone, out_dim=1)

    # Loss fn  -----------------------------------------------------------
    criterion = nn.BCEWithLogitsLoss()

    # Optimizer, Scheduler  -----------------------------------------------------------
    optimizer = optim.Adam(net.parameters(), lr=cfg.train.lr)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg.train.epoch, eta_min=0)
    # Lightning Module
    model = XrayLightningClassification(net, cfg, criterion, optimizer, scheduler)

    # Trainer  --------------------------------------------------------------------------
    trainer = Trainer(
        logger=wandb_logger,
        log_every_n_steps=100,
        max_epochs=cfg.train.epoch,
        gpus=-1,
        num_sanity_val_steps=0,
        # deterministic=True,
        amp_level='O2',
        amp_backend='apex'
    )

    # Train
    trainer.fit(model, datamodule=dm)

    # Stop Logging
    wandb.finish()

    for p in model.weight_paths:
        os.remove(p)
Example No. 4
        "--accumulation",
        type=int,
        default=1,
        help="Number of accumulation steps.",
    )
    parser.add_argument("--warmup",
                        type=int,
                        default=3,
                        help="Number of warmup epochs.")
    parser.add_argument("--out_suff",
                        type=str,
                        help="Suffix added to the saved model name.")

    args = parser.parse_args()

    seed_everything(13)
    writer = SummaryWriter("runs/test_run_logP_warmup15")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")
    print("Using device:", device)
    print()

    # Additional info when using cuda
    if device.type == "cuda":
        print(torch.cuda.get_device_name(0))
        print("Memory Usage:")
        print(
            "Allocated:",
            round(torch.cuda.memory_allocated(0) / 1024**3, 1),
            "GB",
Example No. 5
if __name__ == "__main__":
    sys.path.append("./")

    pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)
    cp.cuda.set_allocator(pool.malloc)

    warnings.filterwarnings("ignore")

    parser = get_preprocess_parser()
    args = parser.parse_args()

    config = load_config(args.config)
    configure_logger(args.config, log_dir=args.log_dir, debug=args.debug)

    seed_everything(config["seed_everything"])

    logging.info(f"config: {args.config}")
    logging.info(f"debug: {args.debug}")

    config["args"] = dict()
    config["args"]["config"] = args.config

    # make output dir
    output_root_dir = Path(config["output_dir"])
    feature_dir = Path(config["dataset"]["feature_dir"])

    config_name = args.config.split("/")[-1].replace(".yml", "")
    output_dir = output_root_dir / config_name
    output_dir.mkdir(parents=True, exist_ok=True)
Example No. 6
def main(cfg: DictConfig):
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    # Random Seed
    seed_everything(cfg.train.seed)

    # Model  ####################################################################
    net = ENet(model_name=cfg.train.model_name)
    transform = ImageTransform(img_size=cfg.data.img_size)

    # Comet.ml
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
                            project_name=cfg.comet_ml.project_name)
    # Log Parameters
    experiment.log_parameters(dict(cfg.exp))
    experiment.log_parameters(dict(cfg.data))
    experiment.log_parameters(dict(cfg.train))
    # Log Model Graph
    experiment.set_model_graph(str(net))

    # Lightning Module  #########################################################
    model = LightningSystem(net, cfg, experiment)
    datamodule = DataModule(data_dir, cfg, transform, cv)

    checkpoint_callback = ModelCheckpoint(filepath='./checkpoint',
                                          save_top_k=1,
                                          verbose=True,
                                          monitor='avg_val_loss',
                                          mode='min',
                                          prefix=cfg.exp.exp_name + '_')

    trainer = Trainer(logger=False,
                      max_epochs=cfg.train.epoch,
                      checkpoint_callback=checkpoint_callback,
                      gpus=1)

    # Train & Test  ############################################################
    # Train
    trainer.fit(model, datamodule=datamodule)
    experiment.log_metric('best_auc', model.best_auc)
    checkpoint_path = glob.glob(f'./checkpoint/{cfg.exp.exp_name}_*.ckpt')[0]
    experiment.log_asset(file_data=checkpoint_path)

    # Test
    for i in range(test_num):
        trainer.test(model)

    # Submit
    sub_list = glob.glob(f'submission_{cfg.exp.exp_name}*.csv')
    _ = summarize_submit(sub_list,
                         experiment,
                         filename=f'sub_{cfg.exp.exp_name}.csv')

    # oof
    oof_dataset = datamodule.oof_dataset
    oof_dataloader = DataLoader(oof_dataset,
                                batch_size=cfg.train.batch_size,
                                pin_memory=False,
                                shuffle=False,
                                drop_last=False)
    for i in range(10):
        trainer.test(model, test_dataloaders=oof_dataloader)

    # Submit
    sub_list = glob.glob('submission*.csv')
    _ = summarize_submit(sub_list,
                         experiment,
                         filename=f'oof_{cfg.exp.exp_name}.csv')
Example No. 7
def main(cfg: DictConfig):
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    seed_everything(cfg.data.seed)

    # wandb
    wandb.init(project='VinBigData-Detection')
    wandb.config.update(dict(cfg.data))
    wandb.config.update(dict(cfg.train))
    wandb.config.update(dict(cfg.aug_kwargs_detection))
    wandb.config.update(dict(cfg.classification_kwargs))

    # omegaconf -> dict
    rep_aug_kwargs = OmegaConf.to_container(cfg.aug_kwargs_detection)

    class_name_dict = {
        0: 'Aortic enlargement',
        1: 'Atelectasis',
        2: 'Calcification',
        3: 'Cardiomegaly',
        4: 'Consolidation',
        5: 'ILD',
        6: 'Infiltration',
        7: 'Lung Opacity',
        8: 'Nodule/Mass',
        9: 'Other lesion',
        10: 'Pleural effusion',
        11: 'Pleural thickening',
        12: 'Pneumothorax',
        13: 'Pulmonary fibrosis',
    }

    # Setting  --------------------------------------------------
    data_dir = cfg.data.data_dir
    output_dir = cfg.data.output_dir
    img_size = cfg.data.img_size
    backbone = cfg.data.backbone
    use_class14 = cfg.data.use_class14

    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    if use_class14:
        class_name_dict.update({14: 'No finding'})

    # Register Dataset  --------------------------------------------------
    anno_df = pd.read_csv(os.path.join(data_dir, 'train_wbf_th0.7.csv'))

    if not cfg.data.use_class14:
        anno_df = anno_df[anno_df['class_id'] != 14].reset_index(drop=True)

    # Extract rad id
    if cfg.data.rad_id != 'all':
        anno_df = anno_df[anno_df['rad_id'].isin(cfg.data.rad_id)].reset_index()

    if debug:
        anno_df = anno_df.head(100)

    # Split train, valid data - random
    if 'valid' in cfg.data.split_method:
        split_rate = float(cfg.data.split_method.split('_')[1]) / 100
        unique_image_ids = anno_df['image_id'].values
        unique_image_ids = np.random.RandomState(cfg.data.seed).permutation(unique_image_ids)
        train_image_ids = unique_image_ids[:int(len(unique_image_ids) * (1 - split_rate))]
        valid_image_ids = unique_image_ids[int(len(unique_image_ids) * (1 - split_rate)):]
        DatasetCatalog.register("xray_valid", lambda d='valid': get_xray_dict(anno_df, data_dir, cfg, valid_image_ids))
        MetadataCatalog.get("xray_valid").set(thing_classes=list(class_name_dict.values()))

    else:
        train_image_ids = anno_df['image_id'].values
    DatasetCatalog.register("xray_train", lambda d='train': get_xray_dict(anno_df, data_dir, cfg, train_image_ids))
    MetadataCatalog.get("xray_train").set(thing_classes=list(class_name_dict.values()))

    DatasetCatalog.register("xray_test", lambda d='test': get_test_xray_dict(data_dir))
    MetadataCatalog.get("xray_test").set(thing_classes=list(class_name_dict.values()))

    # Config  --------------------------------------------------
    detectron2_cfg = get_cfg()
    detectron2_cfg.aug_kwargs = CN(rep_aug_kwargs)
    detectron2_cfg.merge_from_file(model_zoo.get_config_file(backbone))
    detectron2_cfg.DATASETS.TRAIN = ("xray_train",)
    if 'valid' in cfg.data.split_method:
        detectron2_cfg.DATASETS.TEST = ("xray_valid",)
        detectron2_cfg.TEST.EVAL_PERIOD = cfg.train.max_iter // 10
    else:
        detectron2_cfg.DATASETS.TEST = ()
    detectron2_cfg.INPUT.MIN_SIZE_TRAIN = (img_size,)
    detectron2_cfg.INPUT.MAX_SIZE_TRAIN = img_size
    detectron2_cfg.DATALOADER.NUM_WORKERS = cfg.train.num_workers
    detectron2_cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(backbone)
    detectron2_cfg.SOLVER.IMS_PER_BATCH = cfg.train.ims_per_batch
    detectron2_cfg.SOLVER.BASE_LR = cfg.train.lr
    detectron2_cfg.SOLVER.MAX_ITER = cfg.train.max_iter
    detectron2_cfg.SOLVER.LR_SCHEDULER_NAME = "WarmupCosineLR"
    detectron2_cfg.SOLVER.WARMUP_ITERS = 2000
    detectron2_cfg.SOLVER.CHECKPOINT_PERIOD = 200000
    detectron2_cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = cfg.train.batch_size_per_image
    detectron2_cfg.MODEL.ROI_HEADS.NUM_CLASSES = 15 if use_class14 else 14
    detectron2_cfg.OUTPUT_DIR = output_dir
    detectron2_cfg.SEED = cfg.data.seed
    detectron2_cfg.PIXEL_MEAN = [103.530, 116.280, 123.675]
    detectron2_cfg.PIXEL_STD = [1.0, 1.0, 1.0]

    # Train  --------------------------------------------------
    os.makedirs(detectron2_cfg.OUTPUT_DIR, exist_ok=True)
    # trainer = DefaultTrainer(detectron2_cfg)
    trainer = MyTrainer(detectron2_cfg)
    trainer.resume_or_load(resume=True)
    trainer.train()

    # Rename Last Weight
    renamed_model = f"{backbone.split('.')[0].replace('/', '-')}.pth"
    os.rename(os.path.join(cfg.data.output_dir, 'model_final.pth'),
              os.path.join(cfg.data.output_dir, renamed_model))

    # Logging
    for model_path in glob.glob(os.path.join(cfg.data.output_dir, '*.pth')):
        wandb.save(model_path)

    # Inference Setting  ------------------------------------------------------
    detectron2_cfg = get_cfg()
    detectron2_cfg.merge_from_file(model_zoo.get_config_file(backbone))
    detectron2_cfg.MODEL.ROI_HEADS.NUM_CLASSES = 15 if use_class14 else 14
    detectron2_cfg.MODEL.WEIGHTS = os.path.join(output_dir, renamed_model)  # path to the model we just trained
    detectron2_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = cfg.data.score_th   # set a custom testing threshold

    predictor = DefaultPredictor(detectron2_cfg)
    dataset_dicts = get_test_xray_dict(data_dir)

    # Visualize  ------------------------------------------------------
    target_image_ids = ['9a5094b2563a1ef3ff50dc5c7ff71345',
                        '22b8e616a61bbc4caaed0cf23b7159df',
                        '001d127bad87592efe45a5c7678f8b8d',
                        '008b3176a7248a0a189b5731ac8d2e95']

    for th in [0, 0.2, 0.5, 0.7]:
        visualize(target_image_ids, data_dir, output_dir, predictor, score_th=th)

    # Metrics
    if os.path.exists(os.path.join(output_dir, 'metrics.json')):
        metrics_df = pd.read_json(os.path.join(output_dir, 'metrics.json'), orient="records", lines=True)
        mdf = metrics_df.sort_values("iteration")

        mdf3 = mdf[~mdf["bbox/AP75"].isna()].reset_index(drop=True)
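        # Note: the keys logged below are named "AP40", but the values come from the bbox/AP75 column.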
        for i in range(len(mdf3)):
            row = mdf3.iloc[i]
            wandb.log({'AP40': row["bbox/AP75"] / 100.})

        best_score = mdf3["bbox/AP75"].max() / 100.
        wandb.log({'Best-AP40-Score': best_score})

    # Inference  ------------------------------------------------------
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    sub = get_submission(dataset_dicts, cfg, predictor, device)

    now = datetime.datetime.now() + datetime.timedelta(hours=9)
    now = now.strftime("%Y%m%d-%H%M%S")

    filename = f'submission_{now}.csv'
    sub.to_csv(os.path.join('./submission', filename), index=False)
    wandb.save(os.path.join('./submission', filename))
    time.sleep(30)

    wandb.finish()
    DatasetCatalog.clear()
Example No. 8
def main(cfg: DictConfig):
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)

    seed_everything(cfg.train.seed)
    # Comet.ml
    experiment = Experiment(api_key=API_KEY, project_name=PROJECT_NAME)

    # Load Data  ################################################################
    # Chris Dataset
    chris_image_size = cfg.data.load_size
    data_dir = f'./input/_Chris_Dataset_{chris_image_size}'
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

    img_paths = {
        'train': glob.glob(os.path.join(data_dir, 'train', '*.jpg')),
        'test': glob.glob(os.path.join(data_dir, 'test', '*.jpg'))
    }

    # Cross Validation  #########################################################
    # GroupKFold
    cv = GroupKFold(n_splits=5)
    train['fold'] = -1
    for i, (trn_idx, val_idx) in enumerate(
            cv.split(train,
                     train['target'],
                     groups=train['patient_id'].tolist())):
        train.loc[val_idx, 'fold'] = i

    # Preprocessing  ############################################################
    # Drop Image
    drop_image_name = [
        'ISIC_4579531', 'ISIC_7918608', 'ISIC_0948240', 'ISIC_4904364',
        'ISIC_8780369', 'ISIC_8770180', 'ISIC_7148656', 'ISIC_7408392',
        'ISIC_9959813', 'ISIC_1894141', 'ISIC_6633174', 'ISIC_3001941',
        'ISIC_4259290', 'ISIC_6833905', 'ISIC_7452152', 'ISIC_2744859',
        'ISIC_5464206', 'ISIC_6596403', 'ISIC_0711790', 'ISIC_5644568',
        'ISIC_5843094', 'ISIC_8904326', 'ISIC_4963405', 'ISIC_9839042',
        'ISIC_1355907', 'ISIC_0694037', 'ISIC_9513918', 'ISIC_0787851',
        'ISIC_2932886', 'ISIC_2336763', 'ISIC_4064330', 'ISIC_7358293',
        'ISIC_5789052', 'ISIC_7828320', 'ISIC_8277969', 'ISIC_1080647',
        'ISIC_3238159', 'ISIC_8480913', 'ISIC_3790692', 'ISIC_0612624',
        'ISIC_1242543', 'ISIC_4036915', 'ISIC_8174647', 'ISIC_2956783',
        'ISIC_3302289', 'ISIC_6761105', 'ISIC_2152755', 'ISIC_9169000',
        'ISIC_6852275', 'ISIC_4432898', 'ISIC_5459207', 'ISIC_7418664',
        'ISIC_5136612', 'ISIC_9174738', 'ISIC_3160301', 'ISIC_7140636',
        'ISIC_7718384', 'ISIC_9336675', 'ISIC_4282719', 'ISIC_4330005',
        'ISIC_9828463', 'ISIC_6511141', 'ISIC_5335139', 'ISIC_5104921',
        'ISIC_0695575', 'ISIC_0610141', 'ISIC_5946998', 'ISIC_0464315',
        'ISIC_6556513', 'ISIC_3688407', 'ISIC_7730443', 'ISIC_4358550',
        'ISIC_6461484', 'ISIC_9690422', 'ISIC_5374076', 'ISIC_1793200',
        'ISIC_1389620', 'ISIC_8098274', 'ISIC_6425888', 'ISIC_6321076',
        'ISIC_4298309', 'ISIC_2981912', 'ISIC_3650938', 'ISIC_4288522',
        'ISIC_9459785', 'ISIC_1938535', 'ISIC_5576241', 'ISIC_6567889',
        'ISIC_2768800', 'ISIC_6023795', 'ISIC_9281339', 'ISIC_6712494',
        'ISIC_1811256', 'ISIC_5157055', 'ISIC_3943097', 'ISIC_7194471',
        'ISIC_0361529', 'ISIC_9797578', 'ISIC_3575926', 'ISIC_6166824',
        'ISIC_8828670', 'ISIC_6953126', 'ISIC_4430815', 'ISIC_8146054',
        'ISIC_9305209', 'ISIC_4263017', 'ISIC_9314144', 'ISIC_1330763',
        'ISIC_4792936', 'ISIC_1823608', 'ISIC_4910683', 'ISIC_9360142',
        'ISIC_2863809', 'ISIC_4748668', 'ISIC_5681315', 'ISIC_3202829',
        'ISIC_3450978', 'ISIC_9704624', 'ISIC_4350914', 'ISIC_3587744',
        'ISIC_8190321', 'ISIC_1766413', 'ISIC_2872769', 'ISIC_3186625',
        'ISIC_0170059', 'ISIC_4858099', 'ISIC_0314462', 'ISIC_2811886',
        'ISIC_2140099', 'ISIC_9514450', 'ISIC_1195354', 'ISIC_8325872',
        'ISIC_0227038', 'ISIC_6342641', 'ISIC_4162828', 'ISIC_7597293',
        'ISIC_5278307', 'ISIC_3774190', 'ISIC_2957196', 'ISIC_4443545',
        'ISIC_3455136', 'ISIC_0610499', 'ISIC_8483008', 'ISIC_0243683',
        'ISIC_9028131', 'ISIC_8507102', 'ISIC_7128535', 'ISIC_4085552',
        'ISIC_2940763', 'ISIC_1219894', 'ISIC_1043313', 'ISIC_6587979',
        'ISIC_7050773', 'ISIC_3230164', 'ISIC_5159557', 'ISIC_7854457',
        'ISIC_2582493', 'ISIC_5161114', 'ISIC_5238910', 'ISIC_6515221',
        'ISIC_7771339', 'ISIC_9274260', 'ISIC_8054626', 'ISIC_1178847',
        'ISIC_0236778', 'ISIC_6704518', 'ISIC_4214813', 'ISIC_0322818',
        'ISIC_0230209', 'ISIC_7682938', 'ISIC_1852500', 'ISIC_3699454',
        'ISIC_4693693', 'ISIC_9574591', 'ISIC_3465766', 'ISIC_1826803',
        'ISIC_6234881', 'ISIC_2417958', 'ISIC_8142203', 'ISIC_5019268',
        'ISIC_3251719', 'ISIC_4654808', 'ISIC_1027856', 'ISIC_3262153',
        'ISIC_4681838', 'ISIC_6594555', 'ISIC_8623291', 'ISIC_3167092',
        'ISIC_8791163', 'ISIC_1538510', 'ISIC_3962218', 'ISIC_2160145',
        'ISIC_7690654', 'ISIC_9464203', 'ISIC_4673844', 'ISIC_9481260',
        'ISIC_5407240', 'ISIC_5179742', 'ISIC_8851901', 'ISIC_7433711',
        'ISIC_5777548', 'ISIC_2164933', 'ISIC_7194695', 'ISIC_7115605',
        'ISIC_7560157', 'ISIC_1323909', 'ISIC_0307958', 'ISIC_8015259',
        'ISIC_3089729', 'ISIC_3048886', 'ISIC_0861066', 'ISIC_6110309',
        'ISIC_9103289', 'ISIC_2853454', 'ISIC_1436572', 'ISIC_9650546',
        'ISIC_8208962', 'ISIC_5218561', 'ISIC_3285862', 'ISIC_5361506',
        'ISIC_8196660', 'ISIC_0356238', 'ISIC_1156392', 'ISIC_2761440',
        'ISIC_0645462', 'ISIC_4908514', 'ISIC_1374795', 'ISIC_3481768',
        'ISIC_2102371', 'ISIC_4548990', 'ISIC_7200676', 'ISIC_8827725',
        'ISIC_0667149', 'ISIC_7028320', 'ISIC_5485142', 'ISIC_9698871',
        'ISIC_7764481', 'ISIC_8831706', 'ISIC_4478276', 'ISIC_0401250',
        'ISIC_6987824', 'ISIC_7789537', 'ISIC_1114860', 'ISIC_7586566',
        'ISIC_0343061', 'ISIC_1442157', 'ISIC_9161937', 'ISIC_5904214',
        'ISIC_8335489', 'ISIC_9994768', 'ISIC_4384331', 'ISIC_0639415',
        'ISIC_0982984', 'ISIC_2195070', 'ISIC_9022865', 'ISIC_0159060',
        'ISIC_4933735', 'ISIC_3571989', 'ISIC_8593130', 'ISIC_1585919',
        'ISIC_3907656', 'ISIC_9728805', 'ISIC_6029052', 'ISIC_3582787',
        'ISIC_2205007', 'ISIC_1447559'
    ]
    train = train[~train['image_name'].isin(drop_image_name)].reset_index(
        drop=True)

    # Preprocessing metadata
    # OneHotEncoder
    train, test = preprocessing_meta(train, test)
    features_num = len([
        f for f in train.columns
        if f not in ['image_name', 'patient_id', 'target', 'fold']
    ])

    # Model  ####################################################################
    net = ENet(model_name=cfg.train.model_name, meta_features_num=features_num)
    transform = ImageTransform(img_size=cfg.data.img_size,
                               input_res=chris_image_size)

    # Lightning Module  #########################################################
    model = MelanomaSystem(net, cfg, img_paths, train, test, transform,
                           experiment)

    checkpoint_callback = ModelCheckpoint(filepath='./checkpoint',
                                          save_top_k=1,
                                          verbose=True,
                                          monitor='avg_val_loss',
                                          mode='min',
                                          prefix=cfg.exp.exp_name + '_')

    trainer = Trainer(max_epochs=cfg.train.epoch,
                      checkpoint_callback=checkpoint_callback,
                      gpus=[0])

    # Train & Test  ############################################################
    # Train
    trainer.fit(model)
    experiment.log_metric('best_auc', model.best_auc)
    checkpoint_path = glob.glob(f'./checkpoint/{cfg.exp.exp_name}_*.ckpt')[0]
    experiment.log_asset(file_data=checkpoint_path)

    # Test
    for i in range(test_num):
        trainer.test(model)

    # Submit
    sub_list = glob.glob(f'submission_{cfg.exp.exp_name}*.csv')
    _ = summarize_submit(sub_list,
                         experiment,
                         filename=f'submission_all_{cfg.exp.exp_name}.csv')

    # oof
    valid_dataset = MelanomaDataset(train,
                                    img_paths['train'],
                                    transform,
                                    phase='test')
    valid_dataloader = DataLoader(valid_dataset,
                                  batch_size=cfg.train.batch_size,
                                  pin_memory=False,
                                  shuffle=False,
                                  drop_last=False)
    for i in range(10):
        trainer.test(model, test_dataloaders=valid_dataloader)

    # Submit
    sub_list = glob.glob('submission*.csv')
    _ = summarize_submit(sub_list,
                         experiment,
                         filename=f'submission_oof_{cfg.exp.exp_name}.csv')

    # Reset
    del model, trainer, net, experiment
Example No. 9
def run(config: dict, holdout: bool, debug: bool) -> None:
    log("Run with configuration:")
    log(f"{config}")
    seed_everything(config["seed"])

    with span("Load train and test set:"):
        train_test_set = load_train_test_set(config)
        log(f"{train_test_set.shape}")
        emb_df = pd.read_csv("./data/interim/emb_df.csv")
        n_emb = emb_df.shape[1] - 1
        emb_cols = [str(i) for i in range(n_emb)]
        emb_df.rename(columns={"city_id": "past_city_id"}, inplace=True)

    with span("Preprocessing:"):
        with span("Shift target values for input sequence."):
            unk_city_id = 0
            train_test_set["past_city_id"] = (
                train_test_set.groupby("utrip_id")["city_id"].shift(1).fillna(
                    unk_city_id).astype(int))
            unk_hotel_country = "UNK"
            train_test_set["past_hotel_country"] = (
                train_test_set.groupby("utrip_id")["hotel_country"].shift(
                    1).fillna(unk_hotel_country).astype(str))
            train_test_set = pd.merge(train_test_set,
                                      emb_df,
                                      on="past_city_id",
                                      how="left")
            train_test_set[emb_cols] = train_test_set[emb_cols].fillna(0)
            train_test_set["city_embedding"] = train_test_set[emb_cols].apply(
                lambda x: list(x), axis=1)

        with span("Encode of target values."):
            target_le = preprocessing.LabelEncoder()
            train_test_set["city_id"] = target_le.fit_transform(
                train_test_set["city_id"])
            train_test_set["past_city_id"] = target_le.transform(
                train_test_set["past_city_id"])

        with span("Add features."):
            log("Convert data type of checkin and checkout.")
            train_test_set["checkin"] = pd.to_datetime(
                train_test_set["checkin"])
            train_test_set["checkout"] = pd.to_datetime(
                train_test_set["checkout"])

            log("Create month_checkin feature.")
            train_test_set["month_checkin"] = train_test_set[
                "checkin"].dt.month
            train_test_set["year_checkin"] = train_test_set["checkin"].dt.year

            log("Create days_stay feature.")
            train_test_set["days_stay"] = (
                train_test_set["checkout"] -
                train_test_set["checkin"]).dt.days.apply(lambda x: np.log10(x))

            log("Create num_checkin feature.")
            train_test_set["num_checkin"] = (train_test_set.groupby(
                "utrip_id")["checkin"].rank().apply(lambda x: np.log10(x)))

            log("Create days_move feature.")
            train_test_set["past_checkout"] = train_test_set.groupby(
                "utrip_id")["checkout"].shift(1)
            train_test_set["days_move"] = (
                (train_test_set["checkin"] - train_test_set["past_checkout"]
                 ).dt.days.fillna(0).apply(lambda x: np.log1p(x)))

            log("Create aggregation features.")
            num_visit_drop_duplicates = train_test_set.query("city_id != 0")[[
                "user_id", "city_id"
            ]].drop_duplicates().groupby("city_id").size().apply(
                lambda x: np.log1p(x)).reset_index()
            num_visit_drop_duplicates.columns = [
                "past_city_id", "num_visit_drop_duplicates"
            ]
            num_visit = train_test_set.query("city_id != 0")[[
                "user_id", "city_id"
            ]].groupby("city_id").size().apply(
                lambda x: np.log1p(x)).reset_index()
            num_visit.columns = ["past_city_id", "num_visit"]
            num_visit_same_city = train_test_set[
                train_test_set['city_id'] == train_test_set['city_id'].shift(
                    1)].groupby("city_id").size().apply(
                        lambda x: np.log1p(x)).reset_index()
            num_visit_same_city.columns = [
                "past_city_id", "num_visit_same_city"
            ]
            train_test_set = pd.merge(train_test_set,
                                      num_visit_drop_duplicates,
                                      on="past_city_id",
                                      how="left")
            train_test_set = pd.merge(train_test_set,
                                      num_visit,
                                      on="past_city_id",
                                      how="left")
            train_test_set = pd.merge(train_test_set,
                                      num_visit_same_city,
                                      on="past_city_id",
                                      how="left")
            train_test_set["num_visit_drop_duplicates"].fillna(0, inplace=True)
            train_test_set["num_visit"].fillna(0, inplace=True)
            train_test_set["num_visit_same_city"].fillna(0, inplace=True)
            train_test_set["num_stay_consecutively"] = train_test_set.groupby(
                ["utrip_id", "past_city_id"])["past_city_id"].rank(
                    method="first").fillna(1).apply(lambda x: np.log1p(x))

        with span("Encode of categorical values."):
            cat_le = {}
            for c in CATEGORICAL_COLS:
                le = preprocessing.LabelEncoder()
                train_test_set[c] = le.fit_transform(
                    train_test_set[c].fillna("UNK").astype(str).values)
                cat_le[c] = le

        train = train_test_set[train_test_set["row_num"].isnull()]
        test = train_test_set[~train_test_set["row_num"].isnull()]

        with span("aggregate features by utrip_id"):
            x_train, x_test_using_train, x_test = [], [], []
            for c in ["city_id", "past_city_id"
                      ] + CATEGORICAL_COLS + NUMERICAL_COLS:
                x_train.append(train.groupby("utrip_id")[c].apply(list))
                x_test.append(test.groupby("utrip_id")[c].apply(list))
                x_test_using_train.append(
                    test.groupby("utrip_id")[c].apply(lambda x: list(x)[:-1]))
            x_train = pd.concat(x_train, axis=1)
            x_test = pd.concat(x_test, axis=1)
            x_test_using_train = pd.concat(x_test_using_train, axis=1)

        with span("sampling training data"):
            x_train["n_trips"] = x_train["city_id"].map(lambda x: len(x))
            x_test_using_train["n_trips"] = x_test_using_train["city_id"].map(
                lambda x: len(x))
            x_train = (x_train.query("n_trips > 2").sort_values(
                "n_trips").reset_index(drop=True))
            x_test_using_train = (
                x_test_using_train.sort_values("n_trips").reset_index(
                    drop=True))
            x_test = x_test.reset_index(drop=True)
            log(f"x_train: {x_train.shape}, x_test: {x_test.shape}")

        if debug:
            log("'--debug' specified. Shrink data size into 1000.")
            x_train = x_train.iloc[:1000]
            x_test = x_test.iloc[:1000]
            config["params"]["num_epochs"] = 2
            log(f"x_train: {x_train.shape}, x_test: {x_test.shape}")

    with span("Prepare data loader for test:"):
        test_dataset = Dataset(x_test, is_train=False)
        test_dataloader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=1,
            num_workers=os.cpu_count(),
            pin_memory=True,
            collate_fn=Collator(is_train=False),
            shuffle=False,
        )

    with span("Get folds:"):
        cv = StratifiedKFold(
            n_splits=config["fold"]["n_splits"],
            shuffle=config["fold"]["shuffle"],
        )
        folds = cv.split(x_train, pd.cut(x_train["n_trips"], 5, labels=False))

    log("Training:")
    oof_preds = np.zeros((len(x_train), len(target_le.classes_)),
                         dtype=np.float32)
    test_preds = np.zeros((len(x_test), len(target_le.classes_)),
                          dtype=np.float32)

    for i_fold, (trn_idx, val_idx) in enumerate(folds):
        if holdout and i_fold > 0:
            break
        with span(f"Fold = {i_fold}"):
            x_trn = x_train.loc[trn_idx, :]
            x_val = x_train.loc[val_idx, :]
            x_trn = pd.concat([x_trn, x_test_using_train],
                              axis=0,
                              ignore_index=True)
            train_dataset = Dataset(x_trn, is_train=True)
            valid_dataset = Dataset(x_val, is_train=True)
            train_dataloader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=config["params"]["batch_size"],
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=Collator(is_train=True),
                shuffle=True,
            )
            valid_dataloader = torch.utils.data.DataLoader(
                valid_dataset,
                batch_size=1,
                num_workers=os.cpu_count(),
                pin_memory=True,
                collate_fn=Collator(is_train=True),
                shuffle=False,
            )
            model_cls = MODELS[config["model_name"]]
            model = model_cls(
                n_city_id=len(target_le.classes_),
                n_booker_country=len(cat_le["booker_country"].classes_),
                n_device_class=len(cat_le["device_class"].classes_),
                n_affiliate_id=len(cat_le["affiliate_id"].classes_),
                n_month_checkin=len(cat_le["month_checkin"].classes_),
                n_hotel_country=len(cat_le["past_hotel_country"].classes_),
                emb_dim=config["params"]["emb_dim"],
                rnn_dim=config["params"]["rnn_dim"],
                dropout=config["params"]["dropout"],
                rnn_dropout=config["params"]["rnn_dropout"],
            )
            if i_fold == 0:
                log(f"{summary(model)}")

            criterion = FocalLossWithOutOneHot(gamma=0.5)
            # Prepare optimizer
            param_optimizer = list(model.named_parameters())
            no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [
                {
                    "params": [
                        p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    0.01,
                },
                {
                    "params": [
                        p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    0.0,
                },
            ]
            optimizer = AdamW(
                optimizer_grouped_parameters,
                lr=1e-4,
                weight_decay=0.01,
            )
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=30, eta_min=1e-6)
            logdir = (Path(config["output_dir_path"]) / config["exp_name"] /
                      f"fold{i_fold}")
            loaders = {"train": train_dataloader, "valid": valid_dataloader}
            runner = CustomRunner(device=DEVICE)
            runner.train(
                model=model,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                loaders=loaders,
                main_metric="accuracy04",
                minimize_metric=False,
                logdir=logdir,
                num_epochs=config["params"]["num_epochs"],
                verbose=True,
            )

            log("Predictions using validation data")
            oof_preds[val_idx, :] = np.array(
                list(
                    map(
                        lambda x: x.cpu().numpy()[-1, :],
                        runner.predict_loader(
                            loader=valid_dataloader,
                            resume=f"{logdir}/checkpoints/best.pth",
                            model=model,
                        ),
                    )))
            y_val = x_val["city_id"].map(lambda x: x[-1]).values
            score = top_k_accuracy_score(y_val,
                                         oof_preds[val_idx, :],
                                         k=4,
                                         labels=np.arange(
                                             len(target_le.classes_)))
            log(f"val acc@4: {score}")
            np.save(
                Path(config["output_dir_path"]) / config["exp_name"] /
                f"y_val_pred_fold{i_fold}",
                oof_preds[val_idx, :],
            )

            test_preds_ = np.array(
                list(
                    map(
                        lambda x: x.cpu().numpy()[-1, :],
                        runner.predict_loader(
                            loader=test_dataloader,
                            resume=f"{logdir}/checkpoints/best.pth",
                            model=model,
                        ),
                    )))
            test_preds += test_preds_ / cv.n_splits
            np.save(
                Path(config["output_dir_path"]) / config["exp_name"] /
                f"y_test_pred_fold{i_fold}",
                test_preds_,
            )

    log("Evaluation OOF values:")
    y_train = x_train["city_id"].map(lambda x: x[-1])
    score = top_k_accuracy_score(y_train,
                                 oof_preds,
                                 k=4,
                                 labels=np.arange(len(target_le.classes_)))
    log(f"oof acc@4: {score}")

    log("Save files:")
    np.save(
        Path(config["output_dir_path"]) / config["exp_name"] / "y_oof_pred",
        oof_preds,
    )
    np.save(
        Path(config["output_dir_path"]) / config["exp_name"] / "y_test_pred",
        test_preds,
    )
Example No. 10
def run(X_seq_train, X_cont_train, y_train, X_seq_test, X_cont_test, timestamp,
        random_state):
    seed_everything(random_state)

    oof_preds = np.zeros(len(X_seq_train))
    test_preds = np.zeros(len(X_seq_test))
    cv_scores = []
    for i, (trn_idx, val_idx) in enumerate(
            get_folds(5, "stratified",
                      random_state).split(X_cont_train, y_train)):
        print(f"fold {i + 1}")
        train_dataset = TensorDataset(
            torch.from_numpy(X_seq_train[trn_idx]).float(),
            torch.from_numpy(X_cont_train[trn_idx]).float(),
            torch.from_numpy(y_train[trn_idx]).float(),
        )
        valid_dataset = TensorDataset(
            torch.from_numpy(X_seq_train[val_idx]).float(),
            torch.from_numpy(X_cont_train[val_idx]).float(),
            torch.from_numpy(y_train[val_idx]).float(),
        )
        test_dataset = TensorDataset(
            torch.from_numpy(X_seq_test).float(),
            torch.from_numpy(X_cont_test).float())

        train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
        valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=128)
        test_loader = DataLoader(test_dataset, shuffle=False, batch_size=128)
        loaders = {"train": train_loader, "valid": valid_loader}

        runner = CustomRunner(device="cuda")

        model = Model(
            in_channels=X_seq_train.shape[1],
            n_cont_features=X_cont_train.shape[1],
            hidden_channels=64,
            kernel_sizes=[3, 5, 7, 15, 21, 51, 101],
            out_dim=1,
        )
        criterion = torch.nn.BCELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               T_max=30,
                                                               eta_min=1e-6)

        logdir = f"./logdir/{timestamp}_fold{i}"
        runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=loaders,
            logdir=logdir,
            num_epochs=30,
            verbose=True,
        )

        pred = np.concatenate(
            list(
                map(
                    lambda x: x.cpu().numpy(),
                    runner.predict_loader(
                        loader=valid_loader,
                        resume=f"{logdir}/checkpoints/best.pth",
                        model=model,
                    ),
                )))
        oof_preds[val_idx] = pred
        score = average_precision_score(y_train[val_idx], pred)
        cv_scores.append(score)
        print("score", score)

        pred = np.concatenate(
            list(
                map(
                    lambda x: x.cpu().numpy(),
                    runner.predict_loader(
                        loader=test_loader,
                        resume=f"{logdir}/checkpoints/best.pth",
                        model=model,
                    ),
                )))
        test_preds += pred / 5
    return oof_preds, test_preds, cv_scores
Example No. 11
import argparse

import numpy as np
from src.get_folds import Fold
from src.runner import Runner
from src.utils import get_logger, json_dump, seed_everything
from src.submission import create_submission
from features.base import load_features
from models.model_1d_cnn import Model_1DCNN
from multiprocessing import cpu_count
from tensorflow import keras

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

seed_everything(71, gpu_mode=True)

model_map = {'1dcnn': Model_1DCNN}


def main():
    # =========================================
    # === Settings
    # =========================================
    # Get logger
    logger = get_logger(__name__)
    logger.info('Settings')

    # Get argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='./configs/model_1dcnn_0.json')
Example No. 12
def main(cfg: DictConfig):
    print('Cassava Leaf Disease Classification')
    cur_dir = hydra.utils.get_original_cwd()
    os.chdir(cur_dir)
    # Config  -------------------------------------------------------------------
    data_dir = './input'
    seed_everything(cfg.data.seed)

    # Comet_ml
    experiment = Experiment(api_key=cfg.comet_ml.api_key,
                            project_name=cfg.comet_ml.project_name,
                            auto_param_logging=False,
                            auto_metric_logging=False)

    # Log Parameters
    experiment.log_parameters(dict(cfg.data))
    experiment.log_parameters(dict(cfg.train))

    # Data Module  ---------------------------------------------------------------
    transform = get_transforms(transform_name=cfg.data.transform,
                               img_size=cfg.data.img_size)
    cv = StratifiedKFold(n_splits=cfg.data.n_splits,
                         shuffle=True,
                         random_state=cfg.data.seed)
    dm = CassavaDataModule(data_dir,
                           cfg,
                           transform,
                           cv,
                           use_merge=True,
                           sample=DEBUG)

    # Model  ----------------------------------------------------------------------
    net = Timm_model(cfg.train.model_type, pretrained=True)

    # Log Model Graph
    experiment.set_model_graph(str(net))

    # Loss fn  ---------------------------------------------------------------------
    df = pd.read_csv('./input/merged.csv')
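    # Per-class frequencies (label count / total rows), later passed as the class-weight tensor to the loss function.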
    weight = df['label'].value_counts().sort_index().tolist()
    weight = [w / len(df) for w in weight]
    weight = torch.tensor(weight).cuda()
    del df

    criterion = get_loss_fn(cfg.train.loss_fn, weight=weight, smoothing=0.05)

    # Optimizer, Scheduler  --------------------------------------------------------
    if cfg.train.use_sam:
        base_optimizer = RAdam
        optimizer = SAM(net.parameters(),
                        base_optimizer,
                        lr=cfg.train.lr,
                        weight_decay=cfg.train.weight_decay)
    else:
        optimizer = RAdam(net.parameters(),
                          lr=cfg.train.lr,
                          weight_decay=cfg.train.weight_decay)

    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=cfg.train.epoch,
                                               eta_min=0)

    # Lightning Module  -------------------------------------------------------------
    model = CassavaLightningSystem(net,
                                   cfg,
                                   criterion=criterion,
                                   optimizer=optimizer,
                                   scheduler=scheduler,
                                   experiment=experiment)

    # Trainer  -------------------------------------------------------------------------
    trainer = Trainer(
        logger=False,
        max_epochs=cfg.train.epoch,
        gpus=-1,
        amp_backend='apex',
        amp_level='O2',
        num_sanity_val_steps=0,  # Skip Sanity Check
        automatic_optimization=not cfg.train.use_sam,
        # resume_from_checkpoint='./checkpoints/epoch=3-step=14047.ckpt'
    )

    # Train
    trainer.fit(model, datamodule=dm)
Example No. 13
# data directory
TRAIN_DATA_A_DIR = config.domain_a_dir
TRAIN_DATA_B_DIR = config.domain_b_dir

# save directory
MODEL_G_DIR = f'{ROOT_DIR}/checkpoint/G/{str(No).zfill(3)}'
MODEL_F_DIR = f'{ROOT_DIR}/checkpoint/F/{str(No).zfill(3)}'
MODEL_DA_DIR = f'{ROOT_DIR}/checkpoint/D1/{str(No).zfill(3)}'
MODEL_DB_DIR = f'{ROOT_DIR}/checkpoint/D2/{str(No).zfill(3)}'
LOG_DIR = f'{ROOT_DIR}/output/log/{str(No).zfill(3)}'
FIGURE_DIR = f'{ROOT_DIR}/output/figure/{str(No).zfill(3)}'
PRED_VAL_A_TO_B_DIR = f'{ROOT_DIR}/output/pred_val_a_to_b/{str(No).zfill(3)}'
PRED_VAL_B_TO_A_DIR = f'{ROOT_DIR}/output/pred_val_b_to_a/{str(No).zfill(3)}'

seed_everything(SEED)
kf = KFold(n_splits=N_SPLIT, shuffle=True, random_state=SEED)
img_ids_A = np.array(sorted(os.listdir(TRAIN_DATA_A_DIR)))
tr_ix, va_ix = list(kf.split(img_ids_A, img_ids_A))[FOLD]
train_A, valid_A = img_ids_A[tr_ix], img_ids_A[va_ix]

img_ids_B = np.array(sorted(os.listdir(TRAIN_DATA_B_DIR)))
train_B, valid_B = img_ids_B, img_ids_B

seed_everything(DATA_LOADER_SEED)

if DEBUG:
    train_A = train_A[:2]
    valid_A = valid_A[:2]

train_dataset = PokemonTrainDataset(train_A,
Example No. 14
def run_experiment(version, key):
    time_experiment = datetime.now().strftime("%m%d%Y_%H%M")
    seed_everything(0)

    # Predefine functions
    modellers = {
        "logistic": clf_logistic,
        "lgb": clf_lgb,
        "xgb": clf_xgb,
        "catboost": clf_catboost,
    }

    ########################### READ PARAMETERS
    conf = read_configuration(key)
    classifier = conf["classifier"]
    params = conf["params"]
    split = conf["splits"]

    logger.info("Begin run experiment")

    ########################### LOADING DATASET
    logger.info("Loading dataset")
    ds = Dataset()
    ds.load_dataset(version)

    ########################### BUILD CROSS VALIDATION STRATEGY
    logger.info("Build folds")
    date_ranges = [
        # [["2018-01-01", "2018-05-31"], ["2017-12-01", "2017-12-31"]],
        [["2017-12-01", "2018-04-15"], ["2018-05-01", "2018-05-31"]]
    ]
    splits = {
        "holdout": CustomDateSplitter(ds.X_train["TransactionDT"],
                                      date_ranges),
        "kfold": KFold(n_splits=6, random_state=0, shuffle=False),
    }
    folds = splits[split]

    ########################### PREPROCESSING DATA
    logger.info("Preprocessing data")
    build_processed_dataset(ds)

    gc.collect()

    ########################### TRAIN MODEL
    logger.info(f"Building {classifier} model")
    result = run_train_predict(ds, modellers[classifier], params, folds)

    ########################### SAVING
    if conf["save_predictions"]:
        path_to_preds = f"{key}_{time_experiment}"
        logger.info(f"Saving {key} predictions to {path_to_preds}")
        ds.submission["isFraud"] = result["prediction"]
        ds.write_submission(path_to_preds)

    if conf["save_models"]:
        # ds.save_dataset(f"{key}_processed")
        path_to_models = get_root_dir() / f"models/{key}_{time_experiment}"
        logger.info(f"Saving raw models to {path_to_models}")

        os.mkdir(path_to_models)
        write_params(params, path_to_models / "params.json")
        for i, model in enumerate(result["models"]):
            save_model(model, path_to_models / f"fold_{i}")
        open(path_to_models / "_SUCCESS", "a").close()

    logger.info("End run experiment")
Example No. 15
def main(train_path, test_path, max_features, max_len, glove_path, para_path, model_save_path,
         epochs=4, batch_size=512, seed=1029):
    logger_path = os.path.join(model_save_path, "log.txt")
    setup_logger(out_file=logger_path)

    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    train = df_parallelize_run(train, text_clean_wrapper)
    test = df_parallelize_run(test, text_clean_wrapper)

    tk = Tokenizer(lower=True, filters='', num_words=max_features)
    full_text = list(train['question_text'].values) + list(test['question_text'].values)
    tk.fit_on_texts(full_text)
    train_tokenized = tk.texts_to_sequences(train['question_text'].fillna('missing'))
    test_tokenized = tk.texts_to_sequences(test['question_text'].fillna('missing'))
    word_index = tk.word_index

    X_train = pad_sequences(train_tokenized, maxlen=max_len)
    X_test = pad_sequences(test_tokenized, maxlen=max_len)

    y_train = train['target'].values

    get_embedding = GetEmbedding(max_features, word_index)
    glove = get_embedding.load(glove_path, emb_mean=-0.005838499, emb_std=0.48782197)
    para = get_embedding.load(para_path, emb_mean=-0.0053247833, emb_std=0.49346462)
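    # Blend the two pretrained embedding matrices with fixed 0.8 / 0.2 weights.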
    embedding_matrix = glove * 0.8 + para * 0.2
    del glove, para
    gc.collect()

    x_test_cuda = torch.tensor(X_test, dtype=torch.long).cuda()
    test = torch.utils.data.TensorDataset(x_test_cuda)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

    splits = list(StratifiedKFold(n_splits=4, shuffle=True, random_state=10).split(X_train, y_train))

    seed_everything()

    train_preds = np.zeros(len(train))
    test_preds = np.zeros((len(test), len(splits)))

    for i, (train_idx, valid_idx) in enumerate(splits):
        model_path = os.path.join(model_save_path, "model_fold{}".format(i + 1))
        x_train_fold = torch.tensor(X_train[train_idx], dtype=torch.long).cuda()
        y_train_fold = torch.tensor(y_train[train_idx, np.newaxis], dtype=torch.float32).cuda()
        x_val_fold = torch.tensor(X_train[valid_idx], dtype=torch.long).cuda()
        y_val_fold = torch.tensor(y_train[valid_idx, np.newaxis], dtype=torch.float32).cuda()

        train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
        valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)

        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

        print(f'Fold {i + 1}')

        seed_everything(seed + i)
        model = NeuralNet(max_features, max_len, embedding_matrix)
        model.cuda()

        optimizer = torch.optim.Adam(model.parameters())
        scheduler = CosineAnnealingLR(optimizer, T_max=3)
        loss = torch.nn.BCEWithLogitsLoss(reduction='mean').cuda()

        trainer = Trainer(model, train_loader, valid_loader, y_val_fold, test_loader, loss, optimizer, scheduler,
                          model_path, epochs, batch_size)
        valid_preds_fold, test_preds_fold = trainer.run(validate=True)

        train_preds[valid_idx] = valid_preds_fold
        test_preds[:, i] = test_preds_fold

    search_result = threshold_search(y_train, train_preds)
    print(search_result)
    test_preds = test_preds.mean(1) > search_result['threshold']
    return test_preds
Example No. 16
from src.runner import Runner
from src.utils import get_logger, json_dump, seed_everything
from src.submission import create_submission
from features.base import load_features
from models.model_lightgbm import Model_LightGBM
from multiprocessing import cpu_count
import lightgbm as lgb
from sklearn.model_selection import train_test_split

import matplotlib

matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

seed_everything(71)

model_map = {'lightgbm': Model_LightGBM}


def main():
    # =========================================
    # === Settings
    # =========================================
    # Get logger
    logger = get_logger(__name__)
    logger.info('Settings')

    # Get argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='./configs/model_0.json')
Example No. 17
    parser.add_argument('--output_channel', type=int, default=256)
    parser.add_argument('--hidden_size', type=int, default=256)
    parser.add_argument('--num_fiducial',
                        type=int,
                        default=20,
                        help='number of fiducial points of TPS-STN')
    args = parser.parse_args()

    assert args.dataset_name in CONFIGS
    if args.checkpoint_path:
        seed = round(datetime.utcnow().timestamp()) % 10000  # warning: the seed must be changed when resuming from a checkpoint
    else:
        seed = args.seed

    utils.seed_everything(seed)

    config = CONFIGS[args.dataset_name](
        data_dir=args.data_dir,
        experiment_name=args.experiment_name,
        experiment_description=args.experiment_description,
        image_w=args.image_w,
        image_h=args.image_h,
        num_epochs=args.num_epochs,
        bs=args.bs,
        num_workers=args.num_workers,
        seed=seed,
        batch_max_length=args.batch_max_length,
        FeatureExtraction=args.FeatureExtraction,
        SequenceModeling=args.SequenceModeling,
        Prediction=args.Prediction,
Example No. 18
def main(cfg):
    SEED = cfg.values.seed
    BATCH_SIZE = cfg.values.train_args.batch_size
    IMAGE_SIZE = cfg.values.image_size
    USE_KFOLD = cfg.values.use_kfold
    NUM_FOLD = cfg.values.train_args.num_fold if USE_KFOLD else 0

    seed_everything(SEED)

    print(f'Cuda is Available ? : {torch.cuda.is_available()}\n')

    data_df = pd.read_csv('E:/seti-breakthrough-listen/train_labels.csv')

    data_df['file_path'] = data_df['id'].apply(get_train_file_path)

    train_transform = albumentations.Compose([
        albumentations.Resize(IMAGE_SIZE, IMAGE_SIZE),
        albumentations.HorizontalFlip(),
        albumentations.VerticalFlip(),
        # albumentations.Normalize(mean=(0.5, 0.5, 0.5), std=(0.25, 0.25, 0.25)),
        albumentations.pytorch.transforms.ToTensorV2()
    ])

    val_transform = albumentations.Compose([
        albumentations.Resize(IMAGE_SIZE, IMAGE_SIZE),
        # albumentations.Normalize(mean=(0.5, 0.5, 0.5), std=(0.25, 0.25, 0.25)),
        albumentations.pytorch.transforms.ToTensorV2()
    ])

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=NUM_FOLD,
                                shuffle=True,
                                random_state=SEED)

        for k, (train_index,
                val_index) in enumerate(kfold.split(data_df,
                                                    data_df['target'])):
            print('\n')
            cpprint('=' * 15 + f'{k + 1}-Fold Cross Validation' + '=' * 15)
            train_df = data_df.iloc[train_index].reset_index(drop=True)
            val_df = data_df.iloc[val_index].reset_index(drop=True)

            train_loader = get_dataloader(df=train_df,
                                          transform=train_transform,
                                          batch_size=BATCH_SIZE,
                                          shuffle=True)
            val_loader = get_dataloader(df=val_df,
                                        transform=val_transform,
                                        batch_size=BATCH_SIZE,
                                        shuffle=False)

            val_labels = val_df['target'].values.tolist()
            train(cfg, train_loader, val_loader, val_labels, k + 1)

    else:
        print('\n')
        cpprint('=' * 15 + 'Start Training' + '=' * 15)
        train_df, val_df = train_test_split(data_df,
                                            test_size=0.2,
                                            shuffle=True,
                                            stratify=data_df['target'],
                                            random_state=SEED)

        train_loader = get_dataloader(df=train_df,
                                      transform=train_transform,
                                      batch_size=BATCH_SIZE,
                                      shuffle=True)
        val_loader = get_dataloader(df=val_df,
                                    transform=val_transform,
                                    batch_size=BATCH_SIZE,
                                    shuffle=False)

        val_labels = val_df['target'].values.tolist()
        train(cfg, train_loader, val_loader, val_labels, 0)
Example No. 19
import torch
import torch.utils
import mlflow
import datetime
import matplotlib.pyplot as plt
import cv2
from tqdm.autonotebook import tqdm
from src.types import Boxes

from typing import List, Dict, Any

config = Config(".")

config.n_folds = 0

seed_everything(config.seed)
transforms: Transforms = get_transforms()
start_time = datetime.datetime.now().isoformat()

with timer("load raw data"):
    data: WheatData = get_data(config)

cv_num = 0
with timer("prepare dataloader and fitter"):
    train_image_ids, train_df, val_image_ids, val_df = data.get_fold(cv_num)

    train_dataset: WheatDataset = get_wheat_dataset(
        config.INPUT_DIR,
        train_image_ids,
        train_df,
        "train",
Example No. 20
import mlflow
from torch_optimizer import RAdam

from warmup_scheduler import GradualWarmupScheduler

from src.utils import seed_everything, ImageTransform
from src.utils import Trainer, QWKLoss, Trainer_multifold, get_dataloaders, Santa
from src.model import ModelEFN, ModelEFN_2

if os.name == 'nt':
    sep = '\\'
else:
    sep = '/'

seed = 42
seed_everything(seed)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


@hydra.main('config.yml')
def main(cfg: DictConfig):
    # Config  ################################################################
    IMAGE_NUM = cfg.data.image_num
    IMAGE_SIZE = cfg.data.image_size
    exp_name = cfg.data.exp
    model_name = f'efficientnet-{cfg.data.model_name}'
    BATCH_SIZE = cfg.training.batch_size
    lr = cfg.training.lr
    NUM_EPOCHS = cfg.training.num_epoch
    FOLD = cfg.training.fold
    OPTIMIZER = cfg.training.optimizer
Example No. 21
from torchvision import transforms
from tqdm import tqdm

from src.data_loader import get_data_loader
from src.evaluation import performance_plot
from src.loss import get_loss_function
from src.model import DecoderRNN, DecoderRNNUpdated, EncoderCNN
from src.optimizer import get_optimizer
from src.utils import Config, seed_everything

COMMET_ML_API_KEY = os.environ.get("COMMET_ML_API_KEY")
experiment = Experiment(api_key=COMMET_ML_API_KEY,
                        project_name="image_caption_generation")

print("Seed everything. Ensure reproducibility...")
seed_everything(seed=42)

if __name__ == "__main__":

    config = Config("config.yaml")
    if config.DEV_MODE:
        warnings.warn(f"Running in dev_mode: {config.DEV_MODE}")

    # Move models to GPU if CUDA is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # (Optional) TODO #2: Amend the image transform below.
    transform_train = transforms.Compose([
        transforms.Resize(256),  # smaller edge of image resized to 256
        transforms.RandomCrop(224),  # get 224x224 crop from random location
        transforms.RandomHorizontalFlip(
Example No. 22
from src.train import train
from src import config
from src.utils import seed_everything
import pandas as pd
import os
from sklearn.model_selection import train_test_split, StratifiedKFold
from datetime import datetime

if __name__ == '__main__':

    print("SEED number : %d" % config.SEED)
    seed_everything(config.SEED)
    now = datetime.now()
    now = f'{now.year}{now.month}{now.day}{now.hour}{now.minute}'

    # Define param
    nfold = 5
    model_name = 'b4'
    n_epochs = 25
    pretraining = True
    #     weight_path = "input/efn_b4_nfNone_ep13_vl0.3339_vk0.7768_acc0.7882.pt"
    weight_path = None

    print("N Fold : {}, Model : EFN_{}, N_epochs : {}".format(
        nfold, model_name, n_epochs))

    # load Dataset
    # train_csv = pd.read_csv(os.path.join(config.DATA_PATH, 'prev_curr_train_v2.csv'))
    train_csv = pd.read_csv(
        os.path.join(config.DATA_PATH, 'prev_curr_train.csv'))
    # train_csv = pd.read_csv(os.path.join(config.DATA_PATH, 'prev_curr_train_v1_1.csv'))
Example No. 23
import numpy as np
import pandas as pd
from sklearn import preprocessing
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from src.datasets import load_train_test, BookingDataset
from src.models import BookingNN
from src.utils import seed_everything
from src.runner import CustomRunner

if __name__ == '__main__':

    seed_everything(0)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)
    categorical_cols = [
        'user_id',
        # 'device_class',
        # 'affiliate_id',
        'booker_country',
        # 'hotel_country'
    ]

    train_test = load_train_test()
    cat_dims = [int(train_test[col].nunique()) for col in categorical_cols]
    emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]

    target_le = preprocessing.LabelEncoder()
Example No. 24
    def __init__(self, settings, config):
        self.settings = settings
        self.config = config
        self.logger = get_logger()

        seed_everything(seed=settings.SEED)