def train_func():
    train.torch.accelerate(amp=True)
    model = torchvision.models.resnet101()
    model = train.torch.prepare_model(model)
    train.save_checkpoint(model=model)

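# A minimal driver sketch showing how a training function like the one above
# is launched. Assumption: these snippets target the legacy Ray Train
# ``Trainer`` API from Ray 1.x (``train.report``/``train.save_checkpoint``
# called inside the function, a ``Trainer`` on the driver); the entry point
# differs in later Ray versions.
from ray.train import Trainer

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()  # Start the worker group.
results = trainer.run(train_func)  # Execute train_func on every worker.
trainer.shutdown()
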
def train_func(): checkpoint = train.load_checkpoint() if checkpoint: epoch = checkpoint["epoch"] else: epoch = 0 for i in range(epoch, epoch + 2): train.save_checkpoint(epoch=i)
def train_func_checkpoint():
    checkpoint = train.load_checkpoint()
    assert checkpoint is not None
    assert checkpoint["epoch"] == 2
    for i in range(checkpoint["epoch"], 5):
        train.save_checkpoint(epoch=i)
    return 1

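# A sketch of how the two functions above chain together, assuming the legacy
# ``Trainer.run(..., checkpoint=...)`` resume path and the ``latest_checkpoint``
# attribute: each run is seeded with the previous run's newest checkpoint, so
# by the third run ``train.load_checkpoint()`` returns ``{"epoch": 2}``.
from ray.train import Trainer

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
trainer.run(train_func)  # Saves epochs 0 and 1.
trainer.run(train_func, checkpoint=trainer.latest_checkpoint)  # Saves 1 and 2.
trainer.run(train_func_checkpoint, checkpoint=trainer.latest_checkpoint)
trainer.shutdown()
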
def train_func(config):
    itr = 0
    ckpt = train.load_checkpoint()
    if ckpt is not None:
        itr = ckpt["iter"] + 1
    for i in range(itr, config["max_iter"]):
        train.save_checkpoint(iter=i)
        train.report(test=i, training_iteration=i)

def train_func(): checkpoint = train.load_checkpoint() if checkpoint: epoch = checkpoint["epoch"] else: epoch = 0 print("Epoch: ", epoch) for i in range(epoch, 2): train.report(loss=1, iter=i) train.save_checkpoint(epoch=i + 1)
def train_fn():
    model = torch.nn.Linear(1, 1)

    # Wrap in DDP.
    model = train.torch.prepare_model(model)

    # Save DDP wrapped model.
    train.save_checkpoint(model=model)

    # Report DDP wrapped model.
    train.report(model=model)

def on_save(self, args, state, control, **kwargs):
    # Save is called after evaluation.
    checkpoint_path = Path(
        transformers.trainer.get_last_checkpoint(args.output_dir)).absolute()
    if checkpoint_path:
        train.save_checkpoint(
            **{
                NODE_IP_KEY: get_node_ip_address(),
                CHECKPOINT_PATH_ON_NODE_KEY: str(checkpoint_path),
            })

def train_func():
    ckpt = train.load_checkpoint()
    restored = bool(ckpt)  # Does a previous checkpoint exist?
    itr = 0
    if ckpt:
        itr = ckpt["iter"] + 1
    for i in range(itr, 4):
        if i == 2 and not restored:
            raise Exception("try to fail me")
        train.save_checkpoint(iter=i)
        train.report(test=i, training_iteration=i)

def train_func(config):
    batch_size = config.get("batch_size", 32)
    hidden_size = config.get("hidden_size", 1)
    lr = config.get("lr", 1e-2)
    epochs = config.get("epochs", 3)

    train_dataset_shard = train.get_dataset_shard("train")
    validation_dataset = train.get_dataset_shard("validation")

    model = nn.Linear(1, hidden_size)
    model = train.torch.prepare_model(model)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    results = []
    for _ in range(epochs):
        train_torch_dataset = train_dataset_shard.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )
        validation_torch_dataset = validation_dataset.to_torch(
            label_column="y",
            feature_columns=["x"],
            label_column_dtype=torch.float,
            feature_column_dtypes=torch.float,
            batch_size=batch_size,
        )

        device = train.torch.get_device()

        train_epoch(train_torch_dataset, model, loss_fn, optimizer, device)
        if train.world_rank() == 0:
            result = validate_epoch(validation_torch_dataset, model, loss_fn,
                                    device)
        else:
            result = {}
        train.report(**result)
        results.append(result)
        train.save_checkpoint(model=model)

    return results

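# A hedged sketch of the driver side this function assumes: in the legacy API,
# Ray Datasets passed to ``Trainer.run(dataset=...)`` back the
# ``train.get_dataset_shard("train")`` / ``...("validation")`` calls above.
# The toy datasets here are assumptions; any datasets with "x"/"y" columns fit.
import ray
from ray.train import Trainer

train_ds = ray.data.from_items([{"x": i, "y": 2 * i} for i in range(200)])
val_ds = ray.data.from_items([{"x": i, "y": 2 * i} for i in range(20)])

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
results = trainer.run(
    train_func,
    config={"batch_size": 32, "epochs": 3},
    dataset={"train": train_ds, "validation": val_ds},
)
trainer.shutdown()
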
def training_loop(config):
    # Create model.
    model = ResNet18(config)
    model.conv1 = nn.Conv2d(
        1, 64, kernel_size=7, stride=1, padding=3, bias=False)
    model = train.torch.prepare_model(model)

    # Create optimizer.
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.1),
        momentum=config.get("momentum", 0.9),
    )

    # Load in training and validation data.
    train_dataset = load_mnist_data(True, True)
    validation_dataset = load_mnist_data(False, False)

    if config["test_mode"]:
        train_dataset = Subset(train_dataset, list(range(64)))
        validation_dataset = Subset(validation_dataset, list(range(64)))

    train_loader = DataLoader(
        train_dataset, batch_size=config["batch_size"], num_workers=2)
    validation_loader = DataLoader(
        validation_dataset, batch_size=config["batch_size"], num_workers=2)

    train_loader = train.torch.prepare_data_loader(train_loader)
    validation_loader = train.torch.prepare_data_loader(validation_loader)

    # Create loss.
    criterion = nn.CrossEntropyLoss()

    for epoch_idx in range(2):
        train_epoch(train_loader, model, criterion, optimizer)
        validation_loss = validate_epoch(validation_loader, model, criterion)
        train.save_checkpoint(model_state_dict=model.module.state_dict())
        train.report(**validation_loss)

def train_func(config):
    num_epochs = config.get("num_epochs", 10)
    log_interval = config.get("log_interval", 10)
    use_cuda = config.get("use_cuda", False)
    save_model_as_dict = config.get("save_model_as_dict", False)

    model, optimizer, train_loader, train_sampler = setup(config)

    results = []
    for epoch in range(num_epochs):
        loss = train_epoch(model, optimizer, train_sampler, train_loader,
                           epoch, log_interval, use_cuda)
        results.append(loss)

    if save_model_as_dict:
        train.save_checkpoint(model=model.state_dict())
    else:
        train.save_checkpoint(model=model)

    print("losses of each epoch:")
    print(results)
    return results

def train_func():
    twp = TorchWorkerProfiler()
    with profile(
            activities=[],
            schedule=schedule(wait=0, warmup=0, active=1),
            on_trace_ready=twp.trace_handler,
    ) as p:
        # Setup model.
        model = torch.nn.Linear(1, 1)
        model = train.torch.prepare_model(model)
        loss_fn = torch.nn.MSELoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

        # Setup data.
        input = torch.randn(1000, 1)
        labels = input * 2
        dataset = torch.utils.data.TensorDataset(input, labels)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)
        dataloader = train.torch.prepare_data_loader(dataloader)

        # Train.
        for epoch in range(5):
            with record_function("train_epoch"):
                for X, y in dataloader:
                    pred = model(X)
                    loss = loss_fn(pred, y)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            with record_function("train_checkpoint"):
                state_dict = model.state_dict()
                consume_prefix_in_state_dict_if_present(state_dict, "module.")
                train.save_checkpoint(epoch=epoch, model_weights=state_dict)

            p.step()

            with record_function("train_report"):
                profile_results = twp.get_and_clear_profile_traces()
                train.report(epoch=epoch, **profile_results)

def train_func():
    for i in range(10):
        train.report(test=i)
    train.save_checkpoint(hello="world")

def train_func():
    train.save_checkpoint(epoch=0)

def train_func():
    train.save_checkpoint(loss=3)  # best
    train.save_checkpoint(loss=7)  # worst, deleted
    train.save_checkpoint(loss=5)

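# The "best" / "worst, deleted" comments above imply a retention policy. A
# sketch assuming the legacy ``CheckpointStrategy``: keep the two best
# checkpoints ranked by ascending "loss", so the loss=7 checkpoint is deleted.
from ray.train import CheckpointStrategy, Trainer

trainer = Trainer(backend="torch", num_workers=2)
trainer.start()
trainer.run(
    train_func,
    checkpoint_strategy=CheckpointStrategy(
        num_to_keep=2,
        checkpoint_score_attribute="loss",
        checkpoint_score_order="min",
    ),
)
trainer.shutdown()
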
def train_func():
    for i in range(2):
        train.save_checkpoint(epoch=i)
        time.sleep(1)

def train_mismatch():
    train.save_checkpoint(epoch=0)
    train.report(index=0)
    # skip checkpoint
    train.report(index=1)

def on_epoch_end(self, epoch, logs=None):
    train.save_checkpoint(**{"model": self.model.get_weights()})
    train.report(**logs)

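# ``on_epoch_end`` above reads like a method of a Keras callback. A minimal
# sketch of the enclosing class (the class name is an assumption) and of how
# it would be attached to ``model.fit`` inside a training function.
from tensorflow import keras
from ray import train

class TrainReportCallback(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        train.save_checkpoint(**{"model": self.model.get_weights()})
        train.report(**logs)

# Inside a training function:
#     model.fit(X, y, epochs=3, callbacks=[TrainReportCallback()])
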
def train_func():
    checkpoint = train.load_checkpoint()
    train.report(**checkpoint)
    train.save_checkpoint(**checkpoint)
    # "key" is captured from the enclosing scope where this function is built.
    return checkpoint[key]

def train_loop_per_worker(config):
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = ResNet18(None).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
    epoch = 0

    checkpoint = train.load_checkpoint()
    if checkpoint:
        model_state = checkpoint["model_state"]
        optimizer_state = checkpoint["optimizer_state"]
        epoch = checkpoint["epoch"]

        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    criterion = nn.CrossEntropyLoss()
    optimizer = hvd.DistributedOptimizer(optimizer)
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)
    # To ensure consistent initialization across workers, broadcast the
    # model parameters and optimizer state from rank 0.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    trainset = ray.get(config["data"])
    trainloader = DataLoader(
        trainset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=4)

    for epoch in range(epoch, 40):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            train.report(loss=running_loss / epoch_steps)
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" %
                      (epoch + 1, i + 1, running_loss / epoch_steps))

        train.save_checkpoint(
            model_state=net.state_dict(),
            optimizer_state=optimizer.state_dict(),
            epoch=epoch,
        )

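# A sketch of launching the Horovod loop above, assuming the legacy Trainer's
# string backend selector and that ``config["data"]`` holds a ``ray.put``
# object ref; ``FakeData`` merely stands in for the real training set.
import ray
import torchvision
from torchvision import transforms
from ray.train import Trainer

trainset = torchvision.datasets.FakeData(
    size=128, num_classes=10, transform=transforms.ToTensor())
trainer = Trainer(backend="horovod", num_workers=2)
trainer.start()
trainer.run(
    train_loop_per_worker,
    config={"lr": 0.01, "batch_size": 64, "data": ray.put(trainset)},
)
trainer.shutdown()
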
def train_func():
    assert train.load_checkpoint() is None
    for i in range(3):
        train.save_checkpoint(epoch=i)
    return 1

def train_slow():
    for i in range(2):
        train.save_checkpoint(epoch=i)
        time.sleep(5)
        train.report(index=i)
        time.sleep(5)

def train_func():
    if train.world_rank() == 0:
        train.save_checkpoint(epoch=0)
    else:
        train.report(iter=0)

def train_func():
    model = torch.nn.Linear(1, 1).state_dict()
    train.save_checkpoint(model=model)

def train_func(config): use_gpu = config["use_gpu"] num_epochs = config["num_epochs"] batch_size = config["batch_size"] num_layers = config["num_layers"] num_hidden = config["num_hidden"] dropout_every = config["dropout_every"] dropout_prob = config["dropout_prob"] num_features = config["num_features"] print("Defining model, loss, and optimizer...") # Setup device. device = torch.device(f"cuda:{train.local_rank()}" if use_gpu and torch.cuda.is_available() else "cpu") print(f"Device: {device}") # Setup data. train_dataset_pipeline = train.get_dataset_shard("train_dataset") train_dataset_epoch_iterator = train_dataset_pipeline.iter_epochs() test_dataset = train.get_dataset_shard("test_dataset") test_torch_dataset = test_dataset.to_torch(label_column="label", batch_size=batch_size) net = Net( n_layers=num_layers, n_features=num_features, num_hidden=num_hidden, dropout_every=dropout_every, drop_prob=dropout_prob, ).to(device) print(net.parameters) net = train.torch.prepare_model(net) criterion = nn.BCEWithLogitsLoss() optimizer = optim.Adam(net.parameters(), weight_decay=0.0001) print("Starting training...") for epoch in range(num_epochs): train_dataset = next(train_dataset_epoch_iterator) train_torch_dataset = train_dataset.to_torch(label_column="label", batch_size=batch_size) train_running_loss, train_num_correct, train_num_total = train_epoch( train_torch_dataset, net, device, criterion, optimizer, num_features) train_acc = train_num_correct / train_num_total print(f"epoch [{epoch + 1}]: training accuracy: " f"{train_num_correct} / {train_num_total} = {train_acc:.4f}") test_running_loss, test_num_correct, test_num_total = test_epoch( test_torch_dataset, net, device, criterion) test_acc = test_num_correct / test_num_total print(f"epoch [{epoch + 1}]: testing accuracy: " f"{test_num_correct} / {test_num_total} = {test_acc:.4f}") # Record and log stats. train.report( train_acc=train_acc, train_loss=train_running_loss, test_acc=test_acc, test_loss=test_running_loss, ) # Checkpoint model. module = net.module if isinstance(net, DistributedDataParallel) else net train.save_checkpoint(model_state_dict=module.state_dict()) if train.world_rank() == 0: return module.cpu()
def train_func_checkpoint():
    train.save_checkpoint(loss=3)
    train.save_checkpoint(loss=7)

def train_func():
    for i in range(3):
        train.save_checkpoint(model=i)

def train_mismatch():
    train.save_checkpoint(epoch=0)

def train_func():
    for i in range(2):
        train.save_checkpoint(epoch=i)
        train.report(index=i)

def train_func():
    model = build_model().get_weights()
    train.save_checkpoint(**{MODEL_KEY: model})