Example No. 1
def run_experiment(args):

    logger.info("\n***********************************************"
                f"\n************* Experiment: {args.task.name} ************"
                "\n************************************************")
    ml_logger = MlLogger(tracking_uri=args.logging.mlflow_url)
    ml_logger.init_experiment(
        experiment_name=args.logging.mlflow_experiment,
        run_name=args.logging.mlflow_run_name,
        nested=args.logging.mlflow_nested,
    )

    validate_args(args)
    distributed = bool(args.general.local_rank != -1)

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(
        use_cuda=args.general.cuda,
        local_rank=args.general.local_rank,
        fp16=args.general.fp16,
    )

    args.parameter.batch_size = int(args.parameter.batch_size //
                                    args.parameter.gradient_accumulation_steps)
    if n_gpu > 1:
        args.parameter.batch_size = args.parameter.batch_size * n_gpu
    set_all_seeds(args.general.seed)

    # Prepare Data
    tokenizer = Tokenizer.load(args.parameter.model,
                               do_lower_case=args.parameter.lower_case)

    processor = Processor.load(
        tokenizer=tokenizer,
        max_seq_len=args.parameter.max_seq_len,
        data_dir=args.general.data_dir,
        # args is of type DotMap and needs conversion to standard Python dicts
        **args.task.toDict(),
    )

    data_silo = DataSilo(
        processor=processor,
        batch_size=args.parameter.batch_size,
        distributed=distributed,
    )

    class_weights = None
    if args.parameter.balance_classes:
        task_names = list(processor.tasks.keys())
        if len(task_names) > 1:
            raise NotImplementedError(
                f"Balancing classes is currently not supported for multitask experiments. Got tasks:  {task_names} "
            )
        class_weights = data_silo.calculate_class_weights(
            task_name=task_names[0])

    model = get_adaptive_model(
        lm_output_type=args.parameter.lm_output_type,
        prediction_heads=args.parameter.prediction_head,
        layer_dims=args.parameter.layer_dims,
        model=args.parameter.model,
        device=device,
        class_weights=class_weights,
        embeds_dropout_prob=args.parameter.embeds_dropout_prob,
    )

    # Init optimizer

    # TODO: warmup linear is sometimes NONE depending on fp16 - is there a neater way to handle this?
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=args.parameter.learning_rate,
        warmup_proportion=args.parameter.warmup_proportion,
        loss_scale=args.general.loss_scale,
        fp16=args.general.fp16,
        n_batches=len(data_silo.loaders["train"]),
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        n_epochs=args.parameter.epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=args.parameter.epochs,
        n_gpu=n_gpu,
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        fp16=args.general.fp16,
        local_rank=args.general.local_rank,
        warmup_linear=warmup_linear,
        evaluate_every=args.logging.eval_every,
        device=device,
    )

    model = trainer.train(model)

    model_name = (
        f"{model.language_model.name}-{model.language_model.language}-{args.task.name}"
    )
    processor.save(f"{args.general.output_dir}/{model_name}")
    model.save(f"{args.general.output_dir}/{model_name}")
Example No. 2
def run_experiment(args):
    logger.info("\n***********************************************"
                f"\n************* Experiment: {args.task.name} ************"
                "\n************************************************")
    ml_logger = MlLogger(tracking_uri=args.logging.mlflow_url)
    ml_logger.init_experiment(
        experiment_name=args.logging.mlflow_experiment,
        run_name=args.logging.mlflow_run_name,
        nested=args.logging.mlflow_nested,
    )

    validate_args(args)
    distributed = bool(args.general.local_rank != -1)

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(
        use_cuda=args.general.cuda,
        local_rank=args.general.local_rank,
        use_amp=args.general.use_amp,
    )

    args.parameter.batch_size = int(args.parameter.batch_size //
                                    args.parameter.gradient_accumulation_steps)

    set_all_seeds(args.general.seed)

    # Prepare Data
    tokenizer = Tokenizer.load(args.parameter.model,
                               do_lower_case=args.parameter.lower_case)

    processor = Processor.load(
        tokenizer=tokenizer,
        max_seq_len=args.parameter.max_seq_len,
        data_dir=Path(args.general.data_dir),
        # args is of type DotMap and needs conversion to standard Python dicts
        **args.task.toDict(),
    )

    data_silo = DataSilo(
        processor=processor,
        batch_size=args.parameter.batch_size,
        distributed=distributed,
    )

    class_weights = None
    if args.parameter.balance_classes:
        task_names = list(processor.tasks.keys())
        if len(task_names) > 1:
            raise NotImplementedError(
                f"Balancing classes is currently not supported for multitask experiments. Got tasks:  {task_names} "
            )
        class_weights = data_silo.calculate_class_weights(
            task_name=task_names[0])

    model = get_adaptive_model(
        lm_output_type=args.parameter.lm_output_type,
        prediction_heads=args.parameter.prediction_head,
        layer_dims=args.parameter.layer_dims,
        model=args.parameter.model,
        device=device,
        class_weights=class_weights,
        embeds_dropout_prob=args.parameter.embeds_dropout_prob,
    )

    # Init optimizer
    optimizer_opts = (args.optimizer.optimizer_opts.toDict()
                      if args.optimizer.optimizer_opts else None)
    schedule_opts = (args.optimizer.schedule_opts.toDict()
                     if args.optimizer.schedule_opts else None)
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=args.optimizer.learning_rate,
        schedule_opts=schedule_opts,
        optimizer_opts=optimizer_opts,
        use_amp=args.general.use_amp,
        n_batches=len(data_silo.loaders["train"]),
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        n_epochs=args.parameter.epochs,
        device=device)

    model_name = (
        f"{model.language_model.name}-{model.language_model.language}-{args.task.name}"
    )

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    if "early_stopping" in args:
        early_stopping = EarlyStopping(
            metric=args.task.metric,
            mode=args.early_stopping.mode,
            # where to save the best model
            save_dir=Path(f"{args.general.output_dir}/{model_name}_early_stopping"),
            # number of evaluations to wait for improvement before terminating the training
            patience=args.early_stopping.patience,
        )
    else:
        early_stopping = None

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=args.parameter.epochs,
        n_gpu=n_gpu,
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        use_amp=args.general.use_amp,
        local_rank=args.general.local_rank,
        lr_schedule=lr_schedule,
        evaluate_every=args.logging.eval_every,
        device=device,
        early_stopping=early_stopping)

    model = trainer.train()

    processor.save(Path(f"{args.general.output_dir}/{model_name}"))
    model.save(Path(f"{args.general.output_dir}/{model_name}"))

    ml_logger.end_run()
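Compared to the first example, this variant reads an optimizer section (with optional optimizer_opts/schedule_opts converted to plain dicts), an early_stopping section, and general.use_amp in place of the fp16/loss_scale flags. The fragment below only sketches those extra sections as an assumption-based illustration; the exact keys accepted inside optimizer_opts and schedule_opts depend on initialize_optimizer and are not taken from the original, nor are any of the values.

from dotmap import DotMap

# Hypothetical additional sections for this variant; merge them into the config
# sketched after Example No. 1. The contents of optimizer_opts/schedule_opts are
# assumptions about what initialize_optimizer accepts.
extra = DotMap({
    "general": {"use_amp": None},           # replaces the fp16/loss_scale flags
    "optimizer": {"learning_rate": 2e-5,
                  "optimizer_opts": None,   # or a dict of optimizer settings
                  "schedule_opts": None},   # or a dict of LR-schedule settings
    "early_stopping": {"mode": "max",       # evaluated together with args.task.metric
                       "patience": 3},
})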
Example No. 3
def run_experiment(args):
    validate_args(args)
    directory_setup(output_dir=args.output_dir, do_train=args.do_train)
    distributed = bool(args.local_rank != -1)

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(use_cuda=args.cuda,
                                               local_rank=args.local_rank,
                                               fp16=args.fp16)

    args.batch_size = args.batch_size // args.gradient_accumulation_steps
    if n_gpu > 1:
        args.batch_size = args.batch_size * n_gpu
    set_all_seeds(args.seed)

    # Prepare Data
    tokenizer = BertTokenizer.from_pretrained(args.model,
                                              do_lower_case=args.lower_case)
    processor = Processor.load(
        processor_name=args.processor_name,
        tokenizer=tokenizer,
        max_seq_len=args.max_seq_len,
        data_dir=args.data_dir,
    )

    data_silo = DataSilo(processor=processor,
                         batch_size=args.batch_size,
                         distributed=distributed)

    class_weights = None
    if args.balance_classes:
        class_weights = data_silo.class_weights

    model = get_adaptive_model(
        lm_output_type=args.lm_output_type,
        prediction_heads=args.prediction_head,
        layer_dims=args.layer_dims,
        model=args.model,
        device=device,
        class_weights=class_weights,
        fp16=args.fp16,
        embeds_dropout_prob=args.embeds_dropout_prob,
        local_rank=args.local_rank,
        n_gpu=n_gpu,
    )

    # Init optimizer

    # TODO: warmup linear is sometimes NONE depending on fp16 - is there a neater way to handle this?
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=args.learning_rate,
        warmup_proportion=args.warmup_proportion,
        loss_scale=args.loss_scale,
        fp16=args.fp16,
        n_examples=data_silo.n_samples("train"),
        batch_size=args.batch_size,
        grad_acc_steps=args.gradient_accumulation_steps,
        n_epochs=args.epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=args.epochs,
        n_gpu=n_gpu,
        grad_acc_steps=args.gradient_accumulation_steps,
        fp16=args.fp16,
        warmup_linear=warmup_linear,
        evaluate_every=args.eval_every,
        device=device,
    )

    model = trainer.train(model)

    model_name = (
        f"{model.language_model.name}-{model.language_model.language}-{args.name}"
    )
    processor.save(f"saved_models/{model_name}")
    model.save(f"saved_models/{model_name}")
Example No. 4
def run_experiment(args):
    validate_args(args)
    distributed = bool(args.general.local_rank != -1)

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(
        use_cuda=args.general.cuda,
        local_rank=args.general.local_rank,
        fp16=args.general.fp16,
    )

    args.parameter.batch_size = int(args.parameter.batch_size //
                                    args.parameter.gradient_accumulation_steps)
    if n_gpu > 1:
        args.parameter.batch_size = args.parameter.batch_size * n_gpu
    set_all_seeds(args.general.seed)

    # Prepare Data
    tokenizer = BertTokenizer.from_pretrained(
        args.parameter.model, do_lower_case=args.parameter.lower_case)
    processor = Processor.load(
        tokenizer=tokenizer,
        max_seq_len=args.parameter.max_seq_len,
        data_dir=args.general.data_dir,
        # args is of type DotMap and needs conversion to standard Python dicts
        **args.task.toDict(),
    )

    data_silo = DataSilo(
        processor=processor,
        batch_size=args.parameter.batch_size,
        distributed=distributed,
    )

    class_weights = None
    if args.parameter.balance_classes:
        class_weights = data_silo.class_weights

    model = get_adaptive_model(
        lm_output_type=args.parameter.lm_output_type,
        prediction_heads=args.parameter.prediction_head,
        layer_dims=args.parameter.layer_dims,
        model=args.parameter.model,
        device=device,
        class_weights=class_weights,
        embeds_dropout_prob=args.parameter.embeds_dropout_prob,
    )

    # Init optimizer

    # TODO: warmup linear is sometimes NONE depending on fp16 - is there a neater way to handle this?
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=args.parameter.learning_rate,
        warmup_proportion=args.parameter.warmup_proportion,
        loss_scale=args.general.loss_scale,
        fp16=args.general.fp16,
        n_batches=len(data_silo.loaders["train"]),
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        n_epochs=args.parameter.epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=args.parameter.epochs,
        n_gpu=n_gpu,
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        fp16=args.general.fp16,
        local_rank=args.general.local_rank,
        warmup_linear=warmup_linear,
        evaluate_every=args.logging.eval_every,
        device=device,
    )

    model = trainer.train(model)

    model_name = (
        f"{model.language_model.name}-{model.language_model.language}-{args.task.name}"
    )
    processor.save(f"{args.general.output_dir}/{model_name}")
    model.save(f"{args.general.output_dir}/{model_name}")