Example #1
def load_models(
    clf_timestamp: str = production_models._CLASSIFIER["timestamp"],
    det_timestamp: str = production_models._DETECTOR["timestamp"],
) -> Tuple[torch.nn.Module, torch.nn.Module]:
    """ Loads the given time stamps for the classification and detector models.

    Args:
        clf_timestamp: Which classification model to load.
        det_timestamp: Which detection model to load.

    Returns:
        The classification model and the detection model, in that order.
    """

    clf_model = classifier.Classifier(timestamp=clf_timestamp,
                                      half_precision=torch.cuda.is_available())
    clf_model.eval()

    # TODO(alex): Pass in the confidence for the detector.
    det_model = detector.Detector(
        timestamp=det_timestamp,
        confidence=0.2,
        half_precision=torch.cuda.is_available(),
    )
    det_model.eval()

    # Use FP16 for inference when a GPU is available.
    if torch.cuda.is_available():
        det_model.cuda()
        det_model.half()

        clf_model.cuda()
        clf_model.half()

    return clf_model, det_model
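
A minimal usage sketch for load_models, assuming both returned models accept a (B, 3, H, W) tensor and expose an image_size attribute as in the benchmark example below; the dummy batch itself is illustrative only.

import torch

clf_model, det_model = load_models()

# Build a dummy batch matching the detector's expected input size.
batch = torch.randn((1, 3, det_model.image_size, det_model.image_size))
if torch.cuda.is_available():
    # load_models already moved the models to the GPU and cast them to FP16,
    # so the input has to follow.
    batch = batch.cuda().half()

with torch.no_grad():
    detections = det_model(batch)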
Example #2
def benchmark(timestamp: str, model_type: str, batch_size: int,
              run_time: float) -> None:
    """Benchmarks a model.

    This function loads the specified model, creates a random tensor based on the
    model's internal height and width and the given batch size, then performs
    forward passes through the model for :attr:`run_time` seconds.

    Args:
        timestamp: The model's specific timestamp.
        model_type: Which type of model this is.
        batch_size: The batch size to benchmark the model on.
        run_time: How long to run the benchmark in seconds.
    """
    # Construct the model.
    if model_type == "classifier":
        model = classifier.Classifier(timestamp=timestamp, half_precision=True)
    elif model_type == "detector":
        model = detector.Detector(timestamp=timestamp, half_precision=True)
    else:
        raise ValueError(f"Unsupported model type: {model_type}.")

    batch = torch.randn((batch_size, 3, model.image_size, model.image_size))

    if torch.cuda.is_available():
        model.cuda()
        model.half()
        batch = batch.cuda().half()

    print("Starting inference.")
    start_loop = time.perf_counter()
    times = []
    while time.perf_counter() - start_loop < run_time:
        start = time.perf_counter()
        model(batch)
        times.append(time.perf_counter() - start)

    latency = sum(times) / len(times)

    print(
        f"Total time: {sum(times):.4f}.\n"
        f"Average batch inference time: {latency:.4f}s. FPS: {batch_size / latency:.2f}."
    )
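
Note that CUDA kernels launch asynchronously, so on a GPU the loop above mostly measures launch overhead. A common refinement, sketched here rather than taken from the original code, is to switch to eval mode, disable autograd, and synchronize before reading the clock:

model.eval()
with torch.no_grad():
    while time.perf_counter() - start_loop < run_time:
        start = time.perf_counter()
        model(batch)
        if torch.cuda.is_available():
            # Wait for the asynchronous CUDA kernels to finish so the measured
            # time covers the full forward pass.
            torch.cuda.synchronize()
        times.append(time.perf_counter() - start)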
Example #3
def train(
    local_rank: int,
    world_size: int,
    model_cfg: dict,
    train_cfg: dict,
    save_dir: pathlib.Path,
    initial_timestamp: pathlib.Path = None,
) -> None:
    """Entrypoint for training. This is where most of the logic is executed.

    Args:
        local_rank: Which GPU subprocess rank this is executed in. For CPU and 1 GPU,
            this is 0.
        world_size: How many processes are being run.
        model_cfg: The model definition dictionary.
        train_cfg: The training config dictionary.
        save_dir: Where to write checkpoints.
        initial_timestamp: Directory of a previous run whose weights are used to
            initialize the model.
    """

    # Do some general setup. When using distributed training, the device needs to be
    # set before loading the model.
    use_cuda = torch.cuda.is_available()
    device = torch.device(f"cuda:{local_rank}" if use_cuda else "cpu")
    if use_cuda:
        torch.cuda.set_device(local_rank)

    is_main = local_rank == 0
    if is_main:
        log = logger.Log(save_dir / "log.txt")

    # If we are using distributed training, initialize the backend through which the
    # processes can communicate with each other.
    if world_size > 1:
        torch.distributed.init_process_group(
            "nccl", world_size=world_size, rank=local_rank
        )

    # TODO(alex) these paths should be in the generate config
    batch_size = train_cfg.get("batch_size", 4)
    train_loader, train_sampler = create_data_loader(
        batch_size,
        generate_config.DATA_DIR / "clf_train",
        world_size=world_size,
        val=False,
        img_size=model_cfg.get("image_size", 224),
    )
    eval_loader, _ = create_data_loader(
        batch_size,
        generate_config.DATA_DIR / "clf_val",
        world_size=world_size,
        val=True,
        img_size=model_cfg.get("image_size", 224),
    )

    if is_main:
        log.info(f"Train dataset: {train_loader.dataset}")
        log.info(f"Val dataset: {eval_loader.dataset}")

    scores = {"best_model_score": 0, "best_ema_score": 0}
    best_scores_path = pathlib.Path(save_dir / "best_scores.json")
    best_scores_path.write_text(json.dumps({}))

    clf_model = classifier.Classifier(
        backbone=model_cfg.get("backbone", None),
        num_classes=model_cfg.get("num_classes", 2),
    )
    if initial_timestamp is not None:
        clf_model.load_state_dict(
            torch.load(initial_timestamp / "classifier.pt", map_location="cpu")
        )
    clf_model.to(device)

    if is_main:
        log.info(f"Model: \n {clf_model}")

    optimizer = utils.create_optimizer(train_cfg["optimizer"], clf_model)
    use_mixed_precision = train_cfg.get("mixed-precision", True)
    if use_mixed_precision:
        if is_main:
            log.info("Mixed-precision (AMP) enabled.")
        scaler = torch.cuda.amp.GradScaler()

    ema_model = ema.Ema(clf_model)

    if world_size > 1:
        clf_model = torch.nn.parallel.DistributedDataParallel(
            clf_model, device_ids=[local_rank]
        )

    epochs = train_cfg.get("epochs", 0)
    assert epochs > 0, "Please supply epochs > 0."

    # Create the learning rate scheduler.
    lr_scheduler = None
    if train_cfg["optimizer"]["type"].lower() == "sgd":
        lr_config = train_cfg.get("lr_schedule", {})
        warm_up_percent = lr_config.get("warmup_fraction", 0)
        start_lr = float(lr_config.get("start_lr"))
        max_lr = float(lr_config.get("max_lr"))
        end_lr = float(lr_config.get("end_lr"))
        lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=max_lr,
            total_steps=len(train_loader) * epochs,
            final_div_factor=start_lr / end_lr,
            div_factor=max_lr / start_lr,
            pct_start=warm_up_percent,
        )

    loss_fn = torch.nn.CrossEntropyLoss()
    global_step = 0

    for epoch in range(epochs):
        all_losses = []

        # Set the train loader's epoch so data will be re-shuffled.
        if world_size > 1:
            train_sampler.set_epoch(epoch)

        for idx, (data, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            global_step += 1

            # BHWC -> BCHW
            data = data.permute(0, 3, 1, 2)
            data = data.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            # Run the forward pass under autocast so mixed precision actually uses
            # FP16 kernels where it is safe to do so.
            with torch.cuda.amp.autocast(enabled=use_mixed_precision):
                out = clf_model(data)
                loss = loss_fn(out, labels)
            all_losses.append(loss.item())

            # Propagate the gradients back through the model.
            if use_mixed_precision:
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()

            if lr_scheduler is not None:
                lr_scheduler.step()

            ema_model.update(clf_model)

            if idx % _LOG_INTERVAL == 0 and is_main:
                lr = optimizer.param_groups[0]["lr"]
                log.info(
                    f"Epoch: {epoch} step {idx}, loss "
                    f"{sum(all_losses) / len(all_losses):.5}. lr: {lr:.4}"
                )

        # Call evaluation function
        if is_main and epoch >= train_cfg.get("eval_start_epoch", 10):
            improved_scores = set()
            log.info("Starting eval.")
            start_val = time.perf_counter()
            clf_model.eval()
            model_score = evaluate(clf_model, eval_loader, device)
            clf_model.train()

            if model_score > scores["best_model_score"]:
                scores["best_model_score"] = model_score
                improved_scores.add("best_model_score")
                # TODO(alex): Fix this .module
                utils.save_model(clf_model, save_dir / "classifier.pt")

            ema_score = evaluate(ema_model, eval_loader, device)
            if ema_score > scores["best_ema_score"]:
                scores["best_ema_score"] = ema_score
                improved_scores.add("ema-acc")
                utils.save_model(ema_model.ema_model, save_dir / "ema-classifier.pt")

            # Write the best metrics to a file so we know which model weights to load.
            if improved_scores:
                best_scores = json.loads(best_scores_path.read_text())
                best_scores.update(scores)
                best_scores_path.write_text(json.dumps(best_scores))

            log.info(f"Eval took {time.perf_counter() - start_val:.4f}s.")
            log.info(f"Improved metrics: {improved_scores}.")
            log.info(
                f"Epoch {epoch}, Training loss {sum(all_losses) / len(all_losses):.5f}\n"
                f"Best model accuracy: {scores['best_model_score']:.5f}\n"
                f"Best EMA accuracy: {scores['best_ema_score']:.5f} \n"
            )
            log.metric("Model score", model_score, epoch)
            log.metric("Best model score", scores["best_model_score"], epoch)
            log.metric("EMA score", ema_score, epoch)
            log.metric("Best EMA score", scores["best_ema_score"], epoch)
            log.metric("Training loss", sum(all_losses) / len(all_losses), epoch)
Example #4
 def test_rexnet_lite0(self) -> None:
     model = classifier.Classifier(num_classes=2, backbone="rexnet-lite0")
     self.assertTrue(self._test_model_output(model, 2))
Example #5
 def test_vovnet_39(self) -> None:
     model = classifier.Classifier(num_classes=2, backbone="vovnet-39")
     self.assertTrue(self._test_model_output(model, 2))
Example #6
 def test_vovnet_19_slim_dw(self) -> None:
     model = classifier.Classifier(num_classes=2,
                                   backbone="vovnet-19-slim-dw")
     self.assertTrue(self._test_model_output(model, 2))
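
The tests above call a _test_model_output helper that is not shown in these snippets. One plausible implementation, purely a sketch assuming the classifier exposes an image_size attribute as in the benchmark example, runs a dummy batch through the model and checks the shape of the logits:

 def _test_model_output(self, model: torch.nn.Module, num_classes: int) -> bool:
     """Runs a dummy batch through the model and checks the output shape."""
     model.eval()
     batch = torch.randn((2, 3, model.image_size, model.image_size))
     with torch.no_grad():
         out = model(batch)
     return out.shape == (2, num_classes)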