Exemplo n.º 1
0
def main(arg_list=None):
    """Validate the checkpoints stored in the requested checkpoint directory.

    The command line is split by `parse_args` into options for this script
    and options that are forwarded to the Bert argument parser. The forwarded
    options always get the checkpoint directory appended, plus the config
    located by `find_checkpoint_config` when the caller did not pass
    `--config` explicitly.

    Args:
        arg_list: Optional argument list handed to `parse_args`.

    Returns:
        Whatever `validate_checkpoints` returns for the prepared arguments.
    """
    run_args, forwarded = parse_args(arg_list)
    forwarded += ["--checkpoint-dir", run_args.checkpoint_dir]

    # No explicit config: fall back on the one that should have been saved
    # next to the checkpoints.
    if "--config" not in forwarded:
        forwarded += ["--config",
                      find_checkpoint_config(run_args.checkpoint_dir)]

    bert_args = utils.parse_bert_args(forwarded)
    if not run_args.no_logger_setup:
        setup_logger(logging.getLevelName('INFO'))

    # Weights must stay variable in inference mode, otherwise the model
    # weights cannot be overridden when validating each new checkpoint.
    bert_args.variable_weights_inference = True
    # Needed so that squeezed models fit.
    bert_args.max_copy_merge_size = 32000

    logger.info("Program Start")

    # `parse_bert_args` appends the current date/time to the user-supplied
    # checkpoint path. Rather than touching core Bert code, drop that
    # timestamped directory again (the config created in it is not needed)
    # and point back at its parent.
    timestamped_dir = bert_args.checkpoint_dir
    shutil.rmtree(timestamped_dir)
    bert_args.checkpoint_dir = os.path.dirname(timestamped_dir)

    logger.info(
        f"Validating over checkpoints in directory {bert_args.checkpoint_dir}")
    validation_args = utils.get_validation_args(bert_args)
    return validate_checkpoints(run_args, validation_args)
Exemplo n.º 2
0
    # Define a specific Handler for this file that removes the root name.
    console = logging.StreamHandler()
    console.setLevel(log_level)
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s',
                                  '%Y-%m-%d %H:%M:%S')
    console.setFormatter(formatter)
    logger.addHandler(console)
    logger.propagate = False


if __name__ == "__main__":
    # Script entry point: parse the Bert arguments, configure logging, then
    # run the main session and (optionally) a follow-up validation session.
    args = utils.parse_bert_args()
    setup_logger(logging.getLevelName(args.log_level))

    logger.info("Program Start")
    logger.info(f"Hostname: {socket.gethostname()}")
    logger.info(f"Command Executed: {sys.argv}")

    # The main inference/training session runs unless this is a training
    # invocation with training switched off.
    if args.inference or not args.no_training:
        main(args)

    # Validation only follows a training session, and only when neither
    # validation nor model saving has been disabled.
    if not (args.inference or args.no_validation or args.no_model_save):
        logger.info("Doing Validation")
        main(utils.get_validation_args(args))

    logger.info("Program Finished")
Exemplo n.º 3
0
def main(args):
    """Build the Bert graph and run an inference or training session.

    The function seeds the libraries, builds the model and its inputs,
    creates the dataset and `Iteration` tracker, acquires a device and then:

    * in inference mode (`args.inference`), creates an inference session and
      runs the inference loop;
    * otherwise, unless `args.no_training` is set, creates a training session
      and runs the training loop;
    * otherwise (still not in inference mode), unless `args.no_validation` is
      set, calls itself again with `utils.get_validation_args(args)`.

    Args:
        args: Parsed Bert arguments (attribute-style access).

    Returns:
        A `(session, iteration)` tuple. `session` is `None` when neither an
        inference nor a training session was created, i.e. when
        `args.inference` is falsy and `args.no_training` is truthy.
    """
    set_library_seeds(args.seed)

    config = bert_config_from_args(args)

    initializers = bert_pretrained_initialisers(config, args)

    logger.info("Building Model")
    # Specifying ai.onnx opset9 for the slice syntax
    # TODO: Change slice to opset10
    model = Bert(config,
                 builder=popart.Builder(
                     opsets={"ai.onnx": 9, "ai.onnx.ml": 1, "ai.graphcore": 1}),
                 initializers=initializers)

    indices, positions, segments, masks, labels = bert_add_inputs(args, model)
    logits = bert_logits_graph(model, indices, positions, segments, masks)

    if args.inference:
        outputs = bert_add_infer_outputs(model, logits)
        losses = []
        writer = None
    else:
        predictions, probs = bert_infer_graph(model, logits)
        losses = bert_loss_graph(model, probs, labels)
        outputs = bert_add_validation_outputs(model, predictions, losses)
        writer = bert_writer(args)

    dataset = get_bert_dataset(model, args, [indices, positions, segments, masks, labels])
    logger.info(f"Dataset length: {len(dataset)}")

    data_flow = popart.DataFlow(dataset.batches_per_step, outputs)

    iteration = Iteration(
        args,
        batches_per_step=dataset.batches_per_step,
        steps_per_epoch=len(dataset),
        writer=writer,
        recording_steps=args.aggregate_metrics_over_steps)

    # Only the first value is needed here; the second is kept in the
    # unpacking so the call's return shape stays explicit.
    request_ipus, _required_ipus = calc_required_ipus(args, model)

    device = acquire_device(args, request_ipus)

    # Stays None when no session is created below (training disabled and not
    # in inference mode), so the final `return` never hits an unbound local.
    session = None

    if args.inference:
        session, anchors = bert_inference_session(model, args, data_flow, losses, device)
        logger.info("Inference Started")
        bert_infer_loop(args, session,
                        dataset, logits, anchors,
                        iteration)
        device.detach()
    else:
        if not args.no_training:
            optimizer_factory = ScheduledOptimizerFactory(args,
                                                          iteration,
                                                          model.pipeline_stage_tensors)

            session, anchors = bert_training_session(model,
                                                     args,
                                                     data_flow,
                                                     losses,
                                                     device,
                                                     optimizer_factory)
            logger.info("Training Started")
            bert_train_loop(args, session, writer,
                            dataset, labels, predictions, losses, anchors,
                            iteration, optimizer_factory)

            device.detach()
            logger.info("Training Finished")
        else:
            # No session will use this device; release it like the other
            # paths do instead of holding it for the rest of the call
            # (including the nested validation run below).
            device.detach()
        if not args.no_validation:
            logger.info("Doing Validation")
            # NOTE(review): relies on get_validation_args returning arguments
            # that take the inference path above; otherwise this would recurse
            # again - confirm against utils.
            main(utils.get_validation_args(args))

    return session, iteration