Example #1
    def setup(self, config):
        # Create model. MNIST images are grayscale, so swap the default
        # 3-channel stem for a 1-channel convolution.
        model = ResNet18(config)
        model.conv1 = nn.Conv2d(
            1, 64, kernel_size=7, stride=1, padding=3, bias=False)

        # Create optimizer.
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=config.get("lr", 0.1),
            momentum=config.get("momentum", 0.9))

        # Load in training and validation data (load_mnist_data is a helper
        # defined elsewhere in the example).
        train_dataset = load_mnist_data(True, True)
        validation_dataset = load_mnist_data(False, False)

        if config["test_mode"]:
            train_dataset = Subset(train_dataset, list(range(64)))
            validation_dataset = Subset(validation_dataset, list(range(64)))

        train_loader = DataLoader(
            train_dataset, batch_size=config["batch_size"], num_workers=2)
        validation_loader = DataLoader(
            validation_dataset, batch_size=config["batch_size"], num_workers=2)

        # Create loss.
        criterion = nn.CrossEntropyLoss()

        # Register all components.
        self.model, self.optimizer, self.criterion = self.register(
            models=model, optimizers=optimizer, criterion=criterion)
        self.register_data(
            train_loader=train_loader, validation_loader=validation_loader)
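For context, an operator like this is normally passed to Ray SGD's TorchTrainer. A minimal launch sketch, assuming the setup() above belongs to a class named MnistTrainingOperator (the class name is not shown in the excerpt) and that the config values are illustrative:

import ray
from ray.util.sgd import TorchTrainer

ray.init()
trainer = TorchTrainer(
    training_operator_cls=MnistTrainingOperator,  # class containing setup() above
    num_workers=2,
    config={"lr": 0.1, "momentum": 0.9, "batch_size": 128, "test_mode": False},
)
for _ in range(5):
    print(trainer.train())     # one pass over train_loader
    print(trainer.validate())  # one pass over validation_loader
trainer.shutdown()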
Example #2
def train_func(config):
    epochs = config.pop("epochs", 3)
    model = ResNet18(config)
    model = train.torch.prepare_model(model)

    # Create optimizer.
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.1),
        momentum=config.get("momentum", 0.9),
    )

    # Load in training and validation data.
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])  # per-channel CIFAR-10 mean/std normalization

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    with FileLock(".ray.lock"):
        train_dataset = CIFAR10(root="~/data",
                                train=True,
                                download=True,
                                transform=transform_train)
        validation_dataset = CIFAR10(root="~/data",
                                     train=False,
                                     download=False,
                                     transform=transform_test)

    if config.get("test_mode"):
        train_dataset = Subset(train_dataset, list(range(64)))
        validation_dataset = Subset(validation_dataset, list(range(64)))

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"])
    validation_loader = DataLoader(validation_dataset,
                                   batch_size=config["batch_size"])

    train_loader = train.torch.prepare_data_loader(train_loader)
    validation_loader = train.torch.prepare_data_loader(validation_loader)

    # Create loss.
    criterion = nn.CrossEntropyLoss()

    results = []

    for _ in range(epochs):
        train_epoch(train_loader, model, criterion, optimizer)
        result = validate_epoch(validation_loader, model, criterion)
        train.report(**result)
        results.append(result)

    return results
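A training function with this shape is typically launched through Ray Train's Trainer; a minimal sketch under that assumption (config values are illustrative, and train_epoch/validate_epoch are helpers defined elsewhere in the example):

import ray
from ray.train import Trainer

ray.init()
trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
results = trainer.run(
    train_func,
    config={"lr": 0.1, "batch_size": 128, "epochs": 3, "test_mode": True},
)
trainer.shutdown()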
Example #3
def get_model(model_checkpoint_path):
    model_state = torch.load(model_checkpoint_path)

    model = ResNet18(None)
    # Recreate the same architecture change used at training time:
    # a 1-channel stem for grayscale input.
    model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=1, padding=3, bias=False)
    model.load_state_dict(model_state["models"][0])

    return model
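A brief usage sketch (the checkpoint path is hypothetical; the ["models"][0] indexing above matches the layout Ray SGD's TorchTrainer uses when saving state):

import torch

model = get_model("model_checkpoint.pt")  # hypothetical path
model.eval()  # switch dropout/batch norm to inference behavior
with torch.no_grad():
    logits = model(torch.zeros(1, 1, 28, 28))  # one grayscale MNIST-sized input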
Example #4
    def setup(self, config):
        # Create model.
        model = ResNet18(config)

        # Create optimizer.
        optimizer = torch.optim.SGD(
            model.parameters(),
            lr=config.get("lr", 0.1),
            momentum=config.get("momentum", 0.9))

        # Load in training and validation data.
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])  # per-channel CIFAR-10 mean/std normalization

        transform_test = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])

        with FileLock(".ray.lock"):
            train_dataset = CIFAR10(
                root="~/data",
                train=True,
                download=True,
                transform=transform_train)
            validation_dataset = CIFAR10(
                root="~/data",
                train=False,
                download=False,
                transform=transform_test)

        if config["test_mode"]:
            train_dataset = Subset(train_dataset, list(range(64)))
            validation_dataset = Subset(validation_dataset, list(range(64)))

        train_loader = DataLoader(
            train_dataset, batch_size=config[BATCH_SIZE], num_workers=2)
        validation_loader = DataLoader(
            validation_dataset, batch_size=config[BATCH_SIZE], num_workers=2)

        # Create scheduler.
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[150, 250, 350], gamma=0.1)

        # Create loss.
        criterion = nn.CrossEntropyLoss()

        # Register all components.
        self.model, self.optimizer, self.criterion, self.scheduler = \
            self.register(models=model, optimizers=optimizer,
                          criterion=criterion, schedulers=scheduler)
        self.register_data(
            train_loader=train_loader, validation_loader=validation_loader)
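Since this operator also registers a scheduler, the trainer needs to be told when to step it. A sketch, assuming the operator class is named CifarTrainingOperator and that BATCH_SIZE is the usual ray.util.sgd.utils constant (the string "batch_size"); neither name appears in the excerpt:

from ray.util.sgd import TorchTrainer
from ray.util.sgd.utils import BATCH_SIZE

trainer = TorchTrainer(
    training_operator_cls=CifarTrainingOperator,  # hypothetical name
    num_workers=2,
    config={"lr": 0.1, "test_mode": False, BATCH_SIZE: 128},
    scheduler_step_freq="epoch",  # step the MultiStepLR once per epoch
)
trainer.train()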
Example #5
def train(config, checkpoint_dir=None):
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = ResNet18(None).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
    epoch = 0

    if checkpoint_dir:
        # The checkpoint was written with torch.save, so open it in
        # binary mode before handing it to torch.load.
        with open(os.path.join(checkpoint_dir, "checkpoint"), "rb") as f:
            model_state, optimizer_state, epoch = torch.load(f)

        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    criterion = nn.CrossEntropyLoss()
    optimizer = hvd.DistributedOptimizer(optimizer)
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)
    # Broadcast from rank 0 so all workers start with consistent
    # parameters and optimizer state.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # The training set arrives through the Ray object store as an
    # ObjectRef stored under config["data"].
    trainset = ray.get(config["data"])
    trainloader = DataLoader(trainset,
                             batch_size=int(config["batch_size"]),
                             shuffle=True,
                             num_workers=4)

    for epoch in range(epoch, 40):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            tune.report(loss=running_loss / epoch_steps)
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" %
                      (epoch + 1, i + 1, running_loss / epoch_steps))

        with distributed_checkpoint_dir(step=epoch) as checkpoint_dir:
            print("this checkpoint dir: ", checkpoint_dir)
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict(), epoch), path)
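Functions with this (config, checkpoint_dir) signature are wrapped for Tune via the Horovod integration, which also provides the distributed_checkpoint_dir used above. A minimal launch sketch (num_slots and the search space are illustrative):

import ray
from ray import tune
from ray.tune.integration.horovod import DistributedTrainableCreator
from torchvision import transforms
from torchvision.datasets import CIFAR10

ray.init()
trainset = CIFAR10(root="~/data", train=True, download=True,
                   transform=transforms.ToTensor())
data_ref = ray.put(trainset)  # ship the dataset once via the object store
trainable = DistributedTrainableCreator(train, num_slots=2, use_gpu=False)
tune.run(
    trainable,
    config={
        "lr": tune.grid_search([0.1, 0.01]),
        "batch_size": 64,
        "data": data_ref,  # consumed by ray.get(config["data"]) above
    },
)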
Example #6
def train_func(config):
    device = torch.device(
        f"cuda:{sgd.local_rank()}" if torch.cuda.is_available() else "cpu")

    epochs = config.pop("epochs", 3)
    model = ResNet18(config)
    model = model.to(device)
    model = DistributedDataParallel(
        model,
        device_ids=[device.index] if torch.cuda.is_available() else None)

    # Create optimizer.
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=config.get("lr", 0.1),
                                momentum=config.get("momentum", 0.9))

    # Load in training and validation data.
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])  # per-channel CIFAR-10 mean/std normalization

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    with FileLock(".ray.lock"):
        train_dataset = CIFAR10(root="~/data",
                                train=True,
                                download=True,
                                transform=transform_train)
        validation_dataset = CIFAR10(root="~/data",
                                     train=False,
                                     download=False,
                                     transform=transform_test)

    if config.get("test_mode"):
        train_dataset = Subset(train_dataset, list(range(64)))
        validation_dataset = Subset(validation_dataset, list(range(64)))

    train_loader = DataLoader(train_dataset,
                              batch_size=config["batch_size"],
                              sampler=DistributedSampler(train_dataset))
    validation_loader = DataLoader(
        validation_dataset,
        batch_size=config["batch_size"],
        sampler=DistributedSampler(validation_dataset))

    # Create loss.
    criterion = nn.CrossEntropyLoss()

    results = []

    for _ in range(epochs):
        train(train_loader, model, criterion, optimizer, device)
        result = validate(validation_loader, model, criterion, device)
        sgd.report(**result)
        results.append(result)

    return results
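The sgd.* calls suggest the ray.util.sgd.v2 API (the precursor to Ray Train), whose Trainer sets up the process group that DistributedDataParallel and DistributedSampler rely on. A launch sketch under that assumption:

import ray
from ray.util.sgd.v2 import Trainer

ray.init()
trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
results = trainer.run(
    train_func,
    config={"lr": 0.1, "batch_size": 128, "epochs": 3, "test_mode": True},
)
trainer.shutdown()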