Example No. 1
def main(cfg):
    if cfg.wandb.project:
        import wandb
        from wandb.keras import WandbCallback
        wandb.init(project=cfg.wandb.project)
        callbacks = [WandbCallback()]
    else:
        callbacks = []

    csv_path = Path(to_absolute_path(__file__)).parent.joinpath(
        "meta", f"{cfg.data.db}.csv")
    df = pd.read_csv(str(csv_path))
    train, val = train_test_split(df, random_state=42, test_size=0.1)
    train_gen = ImageSequence(cfg, train, "train")
    val_gen = ImageSequence(cfg, val, "val")

    strategy = tf.distribute.MirroredStrategy()

    with strategy.scope():
        model = get_model(cfg)
        opt = get_optimizer(cfg)
        scheduler = get_scheduler(cfg)
        # Two output heads (age and gender), each trained with its own
        # sparse categorical cross-entropy loss.
        model.compile(optimizer=opt,
                      loss=[
                          "sparse_categorical_crossentropy",
                          "sparse_categorical_crossentropy"
                      ],
                      metrics=['accuracy'])

    # Prefer a mounted Google Drive directory (Colab) for checkpoints,
    # falling back to a "checkpoint" directory next to this script.
    dir_parent_checkpoint = "/content/drive/My Drive/deep_learning/age-gender-estimation"
    if os.path.exists(dir_parent_checkpoint):
        checkpoint_dir = Path(dir_parent_checkpoint).joinpath("checkpoint")
    else:
        checkpoint_dir = Path(
            to_absolute_path(__file__)).parent.joinpath("checkpoint")

    checkpoint_dir.mkdir(exist_ok=True)
    print(f"checkpoint_dir: {checkpoint_dir}")

    filename = "_".join([
        cfg.model.model_name,
        str(cfg.model.img_size), "weights.{epoch:02d}-{val_loss:.2f}.hdf5"
    ])
    callbacks.extend([
        LearningRateScheduler(schedule=scheduler),
        ModelCheckpoint(str(checkpoint_dir) + "/" + filename,
                        monitor="val_loss",
                        verbose=1,
                        save_best_only=True,
                        mode="auto")
    ])

    model.fit(train_gen,
              epochs=cfg.train.epochs,
              callbacks=callbacks,
              validation_data=val_gen,
              workers=multiprocessing.cpu_count())
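
The Keras examples here (Nos. 1, 2 and 5) take a Hydra `cfg` (note `to_absolute_path`, which is `hydra.utils.to_absolute_path`). A minimal sketch of the entry point that would drive this `main`, assuming a `config.yaml` next to the script; the decorator arguments are assumptions, not shown in the original:

import hydra
from hydra.utils import to_absolute_path

@hydra.main(config_path=".", config_name="config")
def main(cfg):
    ...  # body as in Example No. 1 above

if __name__ == "__main__":
    main()
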
Example No. 2
def main(cfg):
    if cfg.wandb.project:
        import wandb
        from wandb.keras import WandbCallback
        wandb.init(project=cfg.wandb.project)
        callbacks = [WandbCallback()]
    else:
        callbacks = []
    weight_file = cfg.train.weight_file

    csv_path = Path(to_absolute_path(__file__)).parent.joinpath("meta", f"{cfg.data.db}.csv")
    df = pd.read_csv(str(csv_path))
    train, val = train_test_split(df, random_state=42, test_size=0.2)
    train_gen = ImageSequence(cfg, train, "train")
    val_gen = ImageSequence(cfg, val, "val")

    strategy = tf.distribute.MirroredStrategy()
    initial_epoch = 0
    if weight_file:
        # Checkpoint names look like "<model>_<size>_weights.<start>-<epoch>-<loss>.hdf5",
        # so the second dot-separated field carries the epochs needed to resume.
        _, file_meta, *_ = weight_file.split('.')
        prev_epoch, new_epoch, _ = file_meta.split('-')
        initial_epoch = int(prev_epoch) + int(new_epoch)
    with strategy.scope():
        model = get_model(cfg)
        opt = get_optimizer(cfg)
        scheduler = get_scheduler(cfg, initial_epoch)
        model.compile(optimizer=opt,
                      loss=["sparse_categorical_crossentropy", "sparse_categorical_crossentropy"],
                      metrics=['accuracy'])
    if cfg.train.is_collab:
        checkpoint_dir = Path(to_absolute_path(__file__)).parent.parent.joinpath('drive', 'MyDrive', 'AgeGenderCheckpoint')
    else:
        checkpoint_dir = Path(to_absolute_path(__file__)).parent.joinpath('checkpoints')
    checkpoint_dir.mkdir(exist_ok=True)

    filename = "_".join([cfg.model.model_name,
                         str(cfg.model.img_size),
                         f"weights.{initial_epoch:02d}-" + "{epoch:02d}-{val_loss:.2f}.hdf5"])
    callbacks.extend([
        LearningRateScheduler(schedule=scheduler),
        get_logger(checkpoint_dir, initial_epoch, cfg.train.lr),
        ModelCheckpoint(str(checkpoint_dir) + "/" + filename,
                        monitor="val_loss",
                        verbose=1,
                        save_best_only=True,
                        mode="auto")
    ])

    if weight_file:
        model.load_weights(str(checkpoint_dir) + "/" + weight_file)
    model.fit(train_gen, epochs=cfg.train.epochs, callbacks=callbacks, validation_data=val_gen,
              workers=multiprocessing.cpu_count())
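
The resume arithmetic above relies on the checkpoint naming scheme built later in this function, e.g. `EfficientNetB3_224_weights.03-07-1.23.hdf5`. A standalone round trip of that parsing, with a hypothetical filename:

weight_file = "EfficientNetB3_224_weights.03-07-1.23.hdf5"  # hypothetical

_, file_meta, *_ = weight_file.split('.')        # file_meta == "03-07-1"
prev_epoch, new_epoch, _ = file_meta.split('-')  # trailing "1" is the integer part of val_loss
initial_epoch = int(prev_epoch) + int(new_epoch)
assert initial_epoch == 10  # training resumes at epoch 10
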
Example No. 3
def do_train(cfg, model):
    log.info('\n')
    log.info('** start training **')

    # get criterion -----------------------------
    criterion = factory.get_loss(cfg)

    # get optimization --------------------------
    optim = factory.get_optim(cfg, model.parameters())

    # initial -----------------------------------
    best = {
        'loss': float('inf'),
        'score': 0.0,
        'epoch': -1,
    }

    # re-load model -----------------------------
    if cfg.resume_from:
        log.info('\n')
        log.info(f're-load model from {cfg.resume_from}')
        detail = load_model(cfg.resume_from, model, optim)
        best.update({
            'loss': detail['loss'],
            'score': detail['score'],
            'epoch': detail['epoch'],
        })

    # setting dataset ---------------------------
    log.info('\n')
    log.info('** dataset **')
    folds = [fold for fold in range(cfg.n_fold) if cfg.fold != fold]
    log.info(f'fold_train:    {folds}')
    log.info(f'fold_valid:    [{cfg.fold}]')

    loader_train = factory.get_dataloader(cfg.data.train, folds)
    loader_valid = factory.get_dataloader(cfg.data.valid, [cfg.fold])
    log.info(loader_train)
    log.info(loader_valid)

    # scheduler ---------------------------------
    scheduler = factory.get_scheduler(cfg, optim, best['epoch'])

    if cfg.apex:
        # amp.initialize returns the patched model and optimizer; the
        # originals are not modified in place, so capture the return value.
        model, optim = amp.initialize(model, optim, opt_level='O1')

    for epoch in range(best['epoch'] + 1, cfg.epoch):
        log.info(f'---epoch {epoch}---')
        set_seed(epoch)

        ## train model --------------------------
        run_nn(cfg.data.train,
               'train',
               model,
               loader_train,
               criterion=criterion,
               optim=optim,
               apex=cfg.apex)

        ## valid model --------------------------
        with torch.no_grad():
            val = run_nn(cfg.data.valid,
                         'valid',
                         model,
                         loader_valid,
                         criterion=criterion)

        detail = {
            'score': val['score'],
            'loss': val['loss'],
            'epoch': epoch,
        }
        if val['loss'] <= best['loss']:
            best.update(detail)

        save_model(model, optim, detail, cfg.fold,
                   os.path.join(cfg.workdir, 'checkpoint'))

        log.info('[best] ep:%d loss:%.4f score:%.4f' %
                 (best['epoch'], best['loss'], best['score']))

        scheduler.step(val['loss'])  # ReduceLROnPlateau steps on the monitored loss
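
`scheduler.step(val['loss'])` implies that `factory.get_scheduler` hands back a `ReduceLROnPlateau`. A minimal sketch of such a factory, assuming `cfg.scheduler.factor` and `cfg.scheduler.patience` fields that the original does not show:

import torch

def get_scheduler(cfg, optim, last_epoch):
    # ReduceLROnPlateau receives the monitored metric in step() and has no
    # last_epoch argument, so the resume epoch only drives the caller's loop.
    return torch.optim.lr_scheduler.ReduceLROnPlateau(
        optim,
        mode='min',                       # lower validation loss is better
        factor=cfg.scheduler.factor,      # assumed config field
        patience=cfg.scheduler.patience,  # assumed config field
    )
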
Example No. 4
def train(cfg, model):
    criterion = factory.get_criterion(cfg)
    # optim = torch.optim.Adam(model.parameters(), lr=1e-3)
    optim = factory.get_optimizer(cfg, model.parameters())

    best = {
        'loss': float('inf'),
        'score': 0.0,
        'epoch': -1,
    }
    if "resume_from" in cfg.keys() and cfg["resume_from"]:
        detail = utils.load_model(cfg["resume_from"], model, optim=optim)
        best.update({
            'loss': detail['loss'],
            'score': detail['score'],
            'epoch': detail['epoch'],
        })

        # to set lr manually after resumed
        for param_group in optim.param_groups:
            param_group['lr'] = cfg["optimizer"]["param"]["lr"]
        log(f"initial lr {utils.get_lr(optim)}")

    scheduler, is_reduce_lr = factory.get_scheduler(cfg, optim)
    log(f"is_reduce_lr: {is_reduce_lr}")

    loader_train = factory.get_loader_train(cfg)
    loader_valid = factory.get_loader_valid(cfg)

    log('train data: loaded %d records' % len(loader_train.dataset))
    log('valid data: loaded %d records' % len(loader_valid.dataset))

    log('apex %s' % cfg["apex"])
    if cfg["apex"]:
        amp.initialize(model, optim, opt_level='O1')

    for epoch in range(best['epoch'] + 1, cfg["epoch"]):

        log(f'\n----- epoch {epoch} -----')

        run_nn(cfg,
               'train',
               model,
               loader_train,
               criterion=criterion,
               optim=optim,
               apex=cfg["apex"])

        with torch.no_grad():
            val = run_nn(cfg,
                         'valid',
                         model,
                         loader_valid,
                         criterion=criterion)

        detail = {
            'score': val['score'],
            'loss': val['loss'],
            'epoch': epoch,
        }
        if val['loss'] <= best['loss']:
            best.update(detail)
            utils.save_model(model,
                             optim,
                             detail,
                             cfg["fold"],
                             output_dir,  # assumed to be defined at module scope
                             best=True)

        utils.save_model(model, optim, detail, cfg["fold"], output_dir)

        log('[best] ep:%d loss:%.4f score:%.4f' %
            (best['epoch'], best['loss'], best['score']))

        if is_reduce_lr:
            scheduler.step(val['loss'])  # ReduceLROnPlateau needs the monitored metric
        else:
            scheduler.step()
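
In this variant `factory.get_scheduler` returns a `(scheduler, is_reduce_lr)` pair so the loop knows whether `step()` needs the validation loss. A sketch of that contract, assuming the config selects the scheduler by a "name" key the original does not show:

import torch.optim.lr_scheduler as lr_sched

def get_scheduler(cfg, optim):
    name = cfg["scheduler"]["name"]  # assumed config key
    if name == "plateau":
        # ReduceLROnPlateau: the caller must pass the metric to step().
        return lr_sched.ReduceLROnPlateau(optim, mode='min'), True
    if name == "cosine":
        # CosineAnnealingLR: step() takes no argument.
        return lr_sched.CosineAnnealingLR(optim, T_max=cfg["epoch"]), False
    raise ValueError(f"unknown scheduler: {name}")
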
Example No. 5
def main(cfg):
    if cfg.wandb.project:
        import wandb
        from wandb.keras import WandbCallback
        wandb.init(project=cfg.wandb.project)
        callbacks = [WandbCallback()]
    else:
        callbacks = []
        
    data_path = Path("/pfs/faces/data/imdb_crop")
    #data_path = Path("/home/raoulfasel/Documents/pachyderm/age_gender_estimation/data/imdb_crop")
    
    csv_path = Path(to_absolute_path("./")).joinpath("meta", f"{cfg.data.db}.csv")
    #csv_path = Path(to_absolute_path("/pfs/faces")).joinpath("meta", f"{cfg.data.db}.csv")
    print(csv_path)
    df = pd.read_csv(str(csv_path))
    train, val = train_test_split(df, random_state=42, test_size=0.1)
    train_gen = ImageSequence(cfg, train, "train", data_path)
    val_gen = ImageSequence(cfg, val, "val", data_path)

    strategy = tf.distribute.MirroredStrategy()

    with strategy.scope():
        model = get_model(cfg)
        opt = get_optimizer(cfg)
        scheduler = get_scheduler(cfg)
        model.compile(optimizer=opt,
                      loss=["sparse_categorical_crossentropy", "sparse_categorical_crossentropy"],
                      metrics=['accuracy'])

    #checkpoint_dir = Path(to_absolute_path("age_gender_estimation")).joinpath("checkpoint")
    checkpoint_dir = Path(to_absolute_path("/pfs/build")).joinpath("checkpoint")

    print(checkpoint_dir)
    checkpoint_dir.mkdir(exist_ok=True)
    filename = "_".join([cfg.model.model_name,
                         str(cfg.model.img_size),
                         "weights.{epoch:02d}-{val_loss:.2f}.hdf5"])
    callbacks.extend([
        LearningRateScheduler(schedule=scheduler),
        ModelCheckpoint(str(checkpoint_dir) + "/" + filename,
                        monitor="val_loss",
                        verbose=1,
                        save_best_only=True,
                        mode="auto")
    ])

    model.fit(train_gen, epochs=cfg.train.epochs, callbacks=callbacks, validation_data=val_gen,
              workers=multiprocessing.cpu_count())
    
    model.save("tensorflow_deployment_package/tensorflow_model.h5")

    with open('/opt/ubiops/token', 'r') as reader:
        API_TOKEN = reader.read()
    client = ubiops.ApiClient(ubiops.Configuration(
        api_key={'Authorization': API_TOKEN},
        host='https://api.ubiops.com/v2.1'))
    api = ubiops.CoreApi(client)
    
    # Create the deployment (PROJECT_NAME, DEPLOYMENT_NAME and
    # DEPLOYMENT_VERSION are assumed to be module-level constants)
    deployment_template = ubiops.DeploymentCreate(
        name=DEPLOYMENT_NAME,
        description='Tensorflow deployment',
        input_type='structured',
        output_type='structured',
        input_fields=[
            ubiops.DeploymentInputFieldCreate(
                name='input_image',
                data_type='blob',
            ),
        ],
        output_fields=[
            ubiops.DeploymentOutputFieldCreate(
                name='output_image',
                data_type='blob'
            ),
        ],
        labels={"demo": "tensorflow"}
    )

    api.deployments_create(
        project_name=PROJECT_NAME,
        data=deployment_template
    )

    # Create the version
    version_template = ubiops.DeploymentVersionCreate(
        version=DEPLOYMENT_VERSION,
        language='python3.8',
        instance_type="2048mb",
        minimum_instances=0,
        maximum_instances=1,
        maximum_idle_time=1800  # 30 minutes
    )

    api.deployment_versions_create(
        project_name=PROJECT_NAME,
        deployment_name=DEPLOYMENT_NAME,
        data=version_template
    )

    # Zip the deployment package
    shutil.make_archive('tensorflow_deployment_package', 'zip', '.', 'tensorflow_deployment_package')

    # Upload the zipped deployment package
    file_upload_result = api.revisions_file_upload(
        project_name=PROJECT_NAME,
        deployment_name=DEPLOYMENT_NAME,
        version=DEPLOYMENT_VERSION,
        file='tensorflow_deployment_package.zip'
    )
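
For the upload to succeed, the `tensorflow_deployment_package` directory zipped above must follow the UbiOps package layout: a `deployment.py` exposing a `Deployment` class. A minimal sketch that loads the `tensorflow_model.h5` saved earlier; the actual image decoding, prediction and annotation are elided:

import os
from tensorflow import keras

class Deployment:
    def __init__(self, base_directory, context):
        # Load the model that the training script saved into the package.
        self.model = keras.models.load_model(
            os.path.join(base_directory, "tensorflow_model.h5"))

    def request(self, data):
        # 'input_image' and 'output_image' must match the blob fields
        # declared in the deployment template above.
        ...
        return {"output_image": data["input_image"]}
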