def test_correctness(ray_start_2_cpus, num_workers, use_local):
    layer = nn.Linear(1, 1)
    ptl_op = TrainingOperator.from_ptl(PTL_Module)
    trainer1 = TorchTrainer(
        training_operator_cls=ptl_op,
        config={
            "layer": layer,
            "data_size": 3,
            "batch_size": 1
        },
        num_workers=num_workers,
        use_local=use_local)
    train1_stats = trainer1.train()
    val1_stats = trainer1.validate()
    trainer1.shutdown()

    trainer2 = TorchTrainer(
        training_operator_cls=CorrectnessOperator,
        scheduler_step_freq="manual",
        config={
            "layer": layer,
            "data_size": 3,
            "batch_size": 1
        },
        num_workers=num_workers,
        use_local=use_local)
    train2_stats = trainer2.train()
    val2_stats = trainer2.validate()
    trainer2.shutdown()

    assert train1_stats["train_loss"] == train2_stats["train_loss"]
    assert val1_stats["val_loss"] == val2_stats["val_loss"]
    # The PTL-based operator reports "val_acc" while the custom operator
    # reports "val_accuracy"; both should hold the same value.
    assert val1_stats["val_acc"] == val2_stats["val_accuracy"]
def train_example(num_workers=1, use_gpu=False):
    trainer1 = TorchTrainer(
        model_creator=model_creator,
        data_creator=data_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    m = trainer1.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        m.weight.item(), m.bias.item()))
    trainer1.shutdown()
    print("success!")
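# The creator functions passed to TorchTrainer above are not shown in this
# snippet. A minimal sketch of what they could look like for a toy linear
# regression; the random data and exact layers are illustrative assumptions,
# only the (config) / (model, config) / (optimizer, config) signatures
# follow the Ray SGD creator API.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset


def model_creator(config):
    # Single-feature linear model; "hidden_size" comes from the trainer config.
    return nn.Linear(1, config.get("hidden_size", 1))


def optimizer_creator(model, config):
    return torch.optim.SGD(model.parameters(), lr=config["lr"])


def data_creator(config):
    # Random regression data; a real example would load an actual dataset.
    train = TensorDataset(torch.randn(256, 1), torch.randn(256, 1))
    val = TensorDataset(torch.randn(64, 1), torch.randn(64, 1))
    return (DataLoader(train, batch_size=config["batch_size"]),
            DataLoader(val, batch_size=config["batch_size"]))


def scheduler_creator(optimizer, config):
    # Stepped LR decay; invoked once per epoch since
    # scheduler_step_freq="epoch" is passed to the trainer.
    return torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)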
def main():
    setup_default_logging()
    args, args_text = parse_args()
    if args.smoke_test:
        ray.init(num_cpus=int(args.ray_num_workers))
    else:
        ray.init(address=args.ray_address)

    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        loss_creator=loss_creator)
    trainer = TorchTrainer(
        training_operator_cls=CustomTrainingOperator,
        use_tqdm=True,
        use_fp16=args.amp,
        apex_args={"opt_level": "O1"},
        config={
            "args": args,
            BATCH_SIZE: args.batch_size
        },
        num_workers=args.ray_num_workers)

    if args.smoke_test:
        args.epochs = 1

    pbar = trange(args.epochs, unit="epoch")
    for i in pbar:
        trainer.train(num_steps=1 if args.smoke_test else None)
        val_stats = trainer.validate(num_steps=1 if args.smoke_test else None)
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))

    trainer.shutdown()
def main(args):
    os.makedirs(args.output_dir, exist_ok=True)
    print(args)
    start_time = time.time()
    config = {"args": args, "num_workers": args.num_workers}
    trainer = TorchTrainer(
        training_operator_cls=SegOperator,
        use_tqdm=True,
        use_fp16=True,
        num_workers=config["num_workers"],
        config=config,
        use_gpu=torch.cuda.is_available())

    for epoch in range(args.epochs):
        trainer.train()
        confmat = trainer.validate(reduce_results=False)[0]
        print(confmat)
        state_dict = trainer.state_dict()
        state_dict.update(epoch=epoch, args=args)
        torch.save(state_dict,
                   os.path.join(args.output_dir, f"model_{epoch}.pth"))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Training time {total_time_str}")
def train_example(num_workers=1, use_gpu=False):
    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        scheduler_creator=scheduler_creator,
        loss_creator=nn.MSELoss)
    trainer1 = TorchTrainer(
        training_operator_cls=CustomTrainingOperator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())

    # If using Ray Client, make sure to force the model onto the CPU.
    import ray
    m = trainer1.get_model(to_cpu=ray.util.client.ray.is_connected())
    print("trained weight: % .2f, bias: % .2f" % (m.weight.item(),
                                                  m.bias.item()))
    trainer1.shutdown()
    print("success!")
def run(num_workers, use_gpu, num_epochs, lr, batch_size, n_hidden, n_layers,
        n_heads, fan_out, feat_drop, attn_drop, negative_slope,
        sampling_num_workers):
    trainer = TorchTrainer(
        training_operator_cls=CustomTrainingOperator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        backend="nccl",
        config={
            "lr": lr,
            "batch_size": batch_size,
            "n_hidden": n_hidden,
            "n_layers": n_layers,
            "n_heads": n_heads,
            "fan_out": fan_out,
            "feat_drop": feat_drop,
            "attn_drop": attn_drop,
            "negative_slope": negative_slope,
            "sampling_num_workers": sampling_num_workers
        })
    for i in range(num_epochs):
        trainer.train()
    validation_results = trainer.validate()
    trainer.shutdown()
    print(validation_results)
    print("success!")
def test_train(ray_start_2_cpus, num_workers, use_local):  # noqa: F811
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        use_local=use_local,
        use_gpu=False)
    for i in range(3):
        train_loss1 = trainer.train()["train_loss"]
    validation_loss1 = trainer.validate()["val_loss"]

    for i in range(3):
        train_loss2 = trainer.train()["train_loss"]
    validation_loss2 = trainer.validate()["val_loss"]

    assert train_loss2 <= train_loss1, (train_loss2, train_loss1)
    assert validation_loss2 <= validation_loss1, (validation_loss2,
                                                  validation_loss1)
    trainer.shutdown()
def test_single_step(ray_start_2_cpus, use_local):  # noqa: F811
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=1,
        use_local=use_local,
        use_gpu=False)
    metrics = trainer.train(num_steps=1)
    assert metrics[BATCH_COUNT] == 1

    val_metrics = trainer.validate(num_steps=1)
    assert val_metrics[BATCH_COUNT] == 1
    trainer.shutdown()
def train_mnist(num_workers=1, use_gpu=False, num_epochs=5):
    Operator = TrainingOperator.from_ptl(LitMNIST)
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        config={"lr": 1e-3, "batch_size": 64},
        use_gpu=use_gpu,
        use_tqdm=True)
    for i in range(num_epochs):
        stats = trainer.train()
        print(stats)

    print(trainer.validate())
    print("Saving model checkpoint to ./model.pt")
    trainer.save("./model.pt")
    print("Model Checkpointed!")
    trainer.shutdown()
    print("success!")
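# LitMNIST is defined elsewhere in this example. A hedged sketch of a
# minimal LightningModule it could be, so the from_ptl call above has
# context; the architecture and data loading are assumptions, and how the
# trainer `config` is threaded into the module is glossed over here.
import torch
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torchvision import datasets, transforms


class LitMNIST(pl.LightningModule):
    def __init__(self, lr=1e-3, batch_size=64):
        super().__init__()
        self.lr = lr
        self.batch_size = batch_size
        self.layer = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        # Flatten 28x28 images and apply a single linear layer.
        return self.layer(x.view(x.size(0), -1))

    def training_step(self, batch, batch_idx):
        x, y = batch
        return F.cross_entropy(self(x), y)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def train_dataloader(self):
        dataset = datasets.MNIST(
            "~/data", train=True, download=True,
            transform=transforms.ToTensor())
        return DataLoader(dataset, batch_size=self.batch_size)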
        # Register all of these components with Ray SGD.
        # This allows Ray SGD to handle framework-level setup such as CUDA,
        # DDP, distributed sampling, and FP16.
        # We also assign the return values of self.register to instance
        # attributes so we can access them in our custom training/validation
        # methods.
        self.model, self.optimizer, self.criterion, self.scheduler = \
            self.register(
                models=model,
                optimizers=optimizer,
                criterion=criterion,
                schedulers=scheduler)

# Call ray.init() to start a local cluster, or
# ray.init(address="auto") to connect to a running cluster.
ray.init()

# TorchTrainer replicates the training operator across the cluster
# and coordinates distributed training.
trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # required if a scheduler is registered
    config={
        "lr": 0.001,
        "batch_size": 64
    },
    num_workers=100,
    use_gpu=True)

for i in range(10):
    metrics = trainer.train()
    val_metrics = trainer.validate()
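# For context, a minimal sketch of the operator assumed above. The linear
# model and random data are illustrative stand-ins; only the
# setup()/register()/register_data() structure mirrors the snippet.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from ray.util.sgd.torch import TrainingOperator


class MyTrainingOperator(TrainingOperator):
    def setup(self, config):
        model = nn.Linear(1, 1)
        optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
        criterion = nn.MSELoss()
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5)
        self.model, self.optimizer, self.criterion, self.scheduler = \
            self.register(
                models=model,
                optimizers=optimizer,
                criterion=criterion,
                schedulers=scheduler)
        data = TensorDataset(torch.randn(256, 1), torch.randn(256, 1))
        loader = DataLoader(data, batch_size=config["batch_size"])
        self.register_data(train_loader=loader, validation_loader=loader)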
def main(args):
    if args.smoke_test:
        ray.init(num_cpus=4)
    else:
        ray.init(
            address=args.address,
            num_cpus=args.num_workers,
            log_to_driver=True)

    # Trainer initialization
    trainer = TorchTrainer(
        training_operator_cls=CIFAR10Module,
        num_workers=args.num_workers,
        config={
            "lr": args.learning_rate,
            "lr_decay": args.lr_decay,
            "eps": args.eps,
            "momentum": args.momentum,
            "wd": args.wd,
            "data_dir": args.data_dir,
            "batch_size": args.batch_size,
            "num_workers": args.num_workers,
            "smoke_test": args.smoke_test
        },
        use_gpu=args.use_gpu,
        scheduler_step_freq="epoch",
        use_fp16=args.fp16,
        use_tqdm=False)

    train_loss = []
    val_loss = []
    val_acc = []
    path = os.path.join("/root/volume/Paper/MLVC_Internship",
                        args.checkpoint_dir,
                        args.model_name + "_" + str(args.trial))
    os.makedirs(path, exist_ok=True)

    pbar = trange(args.max_epochs, unit="epoch")
    for it in pbar:
        stats = trainer.train(
            max_retries=1,
            info=dict(epoch_idx=it, num_epochs=args.max_epochs))
        train_loss.append(stats["train_loss"])
        val_stats = trainer.validate()
        val_loss.append(val_stats["val_loss"])
        val_acc.append(val_stats["val_accuracy"])
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))

        # Save the trainer state and the loss/accuracy histories under
        # the checkpoint directory created above.
        trainer.save(os.path.join(path, f"epoch_{it}.ray"))
        torch.save([train_loss, val_loss],
                   os.path.join(path, f"epoch_{it}.loss"))
        torch.save([val_acc], os.path.join(path, f"epoch_{it}.acc"))
        print(val_stats)

    trainer.shutdown()
    print("success!")
        val_loader = DataLoader(test_shard, batch_size=64)
        self.register_data(
            train_loader=train_loader, validation_loader=val_loader)

# You can either train the model like this:
trainer = TorchTrainer(
    training_operator_cls=CustomOperator,
    num_workers=num_executors,
    add_dist_sampler=False,
    num_cpus_per_worker=1,
    config={"lr": 0.01})
for i in range(10):
    stats = trainer.train()
    print(stats)

val_stats = trainer.validate()
print(val_stats)
trainer.shutdown()

# Or you can perform a hyperparameter search using Ray Tune:
# TorchTrainable = TorchTrainer.as_trainable(
#     training_operator_cls=CustomOperator,
#     num_workers=num_executors,
#     add_dist_sampler=False,
#     use_gpu=False,
#     num_cpus_per_worker=1
# )
# analysis = tune.run(
#     TorchTrainable,
#     config={"lr": tune.grid_search([0.01, 0.1])},
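# A hedged completion of the commented-out Tune search above: the stopping
# rule and the "val_loss" metric name are illustrative assumptions, not part
# of the original snippet.
from ray import tune

TorchTrainable = TorchTrainer.as_trainable(
    training_operator_cls=CustomOperator,
    num_workers=num_executors,
    add_dist_sampler=False,
    use_gpu=False,
    num_cpus_per_worker=1)
analysis = tune.run(
    TorchTrainable,
    config={"lr": tune.grid_search([0.01, 0.1])},
    stop={"training_iteration": 2})  # illustrative stopping rule
print("Best config:",
      analysis.get_best_config(metric="val_loss", mode="min"))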
# Excerpt: the enclosing "if" that chooses between training and loading a
# saved checkpoint appears earlier in the script and is not shown here.
    trainer1 = TorchTrainer(
        training_operator_cls=SafetyTrainingOperator,
        num_workers=1,
        use_gpu=True,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 1024,  # used in data_creator
            "path": path1,  # path to load the agent nn
        },
        backend="auto",
        scheduler_step_freq="epoch")
    for i in range(100):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    torch.save(trainer1.state_dict(), "checkpoint.pt")
    torch.save(trainer1.get_model().state_dict(), "invariant_checkpoint.pt")
    m = trainer1.get_model()
    print(f"trained weight: torch.tensor("
          f"[[{m[0].weight.data.cpu().numpy()[0][0]},"
          f"{m[0].weight.data.cpu().numpy()[0][1]}]]), "
          f"bias: torch.tensor({m[0].bias.data.cpu().numpy()})")
    # trainer1.shutdown()
    print("success!")
else:
    m = torch.nn.Sequential(
        torch.nn.Linear(2, 50), torch.nn.ReLU(), torch.nn.Linear(50, 1),
        torch.nn.Tanh())
    checkpoint = torch.load("invariant_checkpoint.pt", torch.device("cpu"))
    m.load_state_dict(checkpoint)

# Sample results from earlier runs:
# trained weight: [[0.0018693 0.05228069]], bias: [-0.5533147], train_loss = 0.0
# trained weight: [[-0.01369903 0.03511396]], bias: [-0.6535952], train_loss = 0.0
# trained weight: [[0.00687088 0.26634103]], bias: [-0.6658108], train_loss = 0.0
# trained weight: torch.tensor([[0.038166143000125885,0.16197167336940765]]), bias: torch.tensor([-2.3122551])

# %%