def main():
    os.makedirs(args.output_dir, exist_ok=True)
    print(args)

    if args.dataset_root == COCO_ROOT:
        parser.error('Must specify dataset if specifying dataset_root')
    cfg = voc

    start_time = time.time()
    config = {"args": args, "num_workers": args.num_workers, "cfg": cfg}
    trainer = TorchTrainer(model_creator=model_creator,
                           data_creator=data_creator,
                           optimizer_creator=optimizer_creator,
                           training_operator_cls=SegOperator,
                           use_tqdm=True,
                           use_fp16=False,
                           num_workers=config["num_workers"],
                           config=config,
                           use_gpu=torch.cuda.is_available())
    for epoch in range(args.epochs):
        trainer.train()
        state_dict = trainer.state_dict()
        state_dict.update(epoch=epoch, args=args)
        torch.save(state_dict,
                   os.path.join(args.output_dir,
                                "model_{}.pth".format(epoch)))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("Training time {}".format(total_time_str))
def train_example(num_workers=1, use_gpu=False, test_mode=False):
    config = {
        "test_mode": test_mode,
        "batch_size": 16 if test_mode else 512 // num_workers,
        "classification_model_path": os.path.join(
            os.path.dirname(ray.__file__),
            "util/sgd/torch/examples/mnist_cnn.pt")
    }
    trainer = TorchTrainer(model_creator=model_creator,
                           data_creator=data_creator,
                           optimizer_creator=optimizer_creator,
                           loss_creator=nn.BCELoss,
                           training_operator_cls=GANOperator,
                           num_workers=num_workers,
                           config=config,
                           use_gpu=use_gpu,
                           use_tqdm=True)

    from tabulate import tabulate
    pbar = trange(5, unit="epoch")
    for itr in pbar:
        stats = trainer.train(info=dict(epoch_idx=itr, num_epochs=5))
        pbar.set_postfix(dict(loss_g=stats["loss_g"],
                              loss_d=stats["loss_d"]))
        formatted = tabulate([stats], headers="keys")
        if itr > 0:  # Get the last line of the stats.
            formatted = formatted.split("\n")[-1]
        pbar.write(formatted)

    return trainer
def train_example(num_workers=1, use_gpu=False, test_mode=False):
    if ray.util.client.ray.is_connected():
        # If using Ray Client, make sure model is downloaded on the Server.
        model_path = ray.get(ray.remote(download_model).remote())
    else:
        model_path = download_model()

    config = {
        "test_mode": test_mode,
        "batch_size": 16 if test_mode else 512 // num_workers,
        "classification_model_path": model_path
    }
    trainer = TorchTrainer(training_operator_cls=GANOperator,
                           num_workers=num_workers,
                           config=config,
                           use_gpu=use_gpu,
                           use_tqdm=True)

    from tabulate import tabulate
    pbar = trange(5, unit="epoch")
    for itr in pbar:
        stats = trainer.train(info=dict(epoch_idx=itr, num_epochs=5))
        pbar.set_postfix(dict(loss_g=stats["loss_g"],
                              loss_d=stats["loss_d"]))
        formatted = tabulate([stats], headers="keys")
        if itr > 0:  # Get the last line of the stats.
            formatted = formatted.split("\n")[-1]
        pbar.write(formatted)

    return trainer
def train_example(num_workers=1, use_gpu=False, test_mode=False):
    config = {
        "test_mode": test_mode,
        "batch_size": 16 if test_mode else 512 // num_workers,
        "classification_model_path": os.path.join(
            os.path.dirname(ray.__file__),
            "util/sgd/torch/examples/mnist_cnn.pt")
    }
    trainer = TorchTrainer(model_creator=model_creator,
                           data_creator=data_creator,
                           optimizer_creator=optimizer_creator,
                           loss_creator=nn.BCELoss,
                           training_operator_cls=GANOperator,
                           num_workers=num_workers,
                           config=config,
                           use_gpu=use_gpu,
                           backend="nccl" if use_gpu else "gloo")

    from tabulate import tabulate
    for itr in range(5):
        stats = trainer.train()
        formatted = tabulate([stats], headers="keys")
        if itr > 0:  # Get the last line of the stats.
            formatted = formatted.split("\n")[-1]
        print(formatted)

    return trainer
def main():
    setup_default_logging()
    args, args_text = parse_args()

    if args.smoke_test:
        ray.init(num_cpus=int(args.ray_num_workers))
    else:
        ray.init(address=args.ray_address)

    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        loss_creator=loss_creator)
    trainer = TorchTrainer(training_operator_cls=CustomTrainingOperator,
                           use_tqdm=True,
                           use_fp16=args.amp,
                           apex_args={"opt_level": "O1"},
                           config={
                               "args": args,
                               BATCH_SIZE: args.batch_size
                           },
                           num_workers=args.ray_num_workers)

    if args.smoke_test:
        args.epochs = 1

    pbar = trange(args.epochs, unit="epoch")
    for i in pbar:
        trainer.train(num_steps=1 if args.smoke_test else None)
        val_stats = trainer.validate(num_steps=1 if args.smoke_test else None)
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))

    trainer.shutdown()
def train_example(num_workers=1, use_gpu=False):
    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        scheduler_creator=scheduler_creator,
        loss_creator=nn.MSELoss)
    trainer1 = TorchTrainer(
        training_operator_cls=CustomTrainingOperator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())

    # If using Ray Client, make sure to force model onto CPU.
    import ray
    m = trainer1.get_model(to_cpu=ray.util.client.ray.is_connected())
    print("trained weight: % .2f, bias: % .2f" % (
        m.weight.item(), m.bias.item()))
    trainer1.shutdown()
    print("success!")
def run(num_workers, use_gpu, num_epochs, lr, batch_size, n_hidden, n_layers,
        n_heads, fan_out, feat_drop, attn_drop, negative_slope,
        sampling_num_workers):
    trainer = TorchTrainer(training_operator_cls=CustomTrainingOperator,
                           num_workers=num_workers,
                           use_gpu=use_gpu,
                           backend="nccl",
                           config={
                               "lr": lr,
                               "batch_size": batch_size,
                               "n_hidden": n_hidden,
                               "n_layers": n_layers,
                               "n_heads": n_heads,
                               "fan_out": fan_out,
                               "feat_drop": feat_drop,
                               "attn_drop": attn_drop,
                               "negative_slope": negative_slope,
                               "sampling_num_workers": sampling_num_workers
                           })
    for i in range(num_epochs):
        trainer.train()
    validation_results = trainer.validate()
    trainer.shutdown()
    print(validation_results)
    print("success!")
def train_example(num_workers=1, use_gpu=False):
    trainer1 = TorchTrainer(
        model_creator=model_creator,
        data_creator=data_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")
    for i in range(5):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    m = trainer1.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        m.weight.item(), m.bias.item()))
    trainer1.shutdown()
    print("success!")
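# Usage sketch (illustrative, not part of the original example): start a
# local Ray instance and run the example above on a single worker. Assumes
# the creator functions referenced above are defined in this file.
if __name__ == "__main__":
    import ray
    ray.init()
    train_example(num_workers=1, use_gpu=False)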
def main(args):
    os.makedirs(args.output_dir, exist_ok=True)
    print(args)

    start_time = time.time()
    config = {"args": args, "num_workers": args.num_workers}
    trainer = TorchTrainer(training_operator_cls=SegOperator,
                           use_tqdm=True,
                           use_fp16=True,
                           num_workers=config["num_workers"],
                           config=config,
                           use_gpu=torch.cuda.is_available())
    for epoch in range(args.epochs):
        trainer.train()
        confmat = trainer.validate(reduce_results=False)[0]
        print(confmat)
        state_dict = trainer.state_dict()
        state_dict.update(epoch=epoch, args=args)
        torch.save(state_dict,
                   os.path.join(args.output_dir, f"model_{epoch}.pth"))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Training time {total_time_str}")
def train_example(num_replicas=1, use_gpu=False, test_mode=False):
    config = {
        "test_mode": test_mode,
        "classification_model_path": os.path.join(
            os.path.dirname(ray.__file__),
            "util/sgd/torch/examples/mnist_cnn.pt")
    }
    trainer = TorchTrainer(model_creator,
                           data_creator,
                           optimizer_creator,
                           nn.BCELoss,
                           training_operator_cls=GANOperator,
                           num_replicas=num_replicas,
                           config=config,
                           use_gpu=use_gpu,
                           batch_size=16 if test_mode else 512,
                           backend="nccl" if use_gpu else "gloo")
    for i in range(5):
        stats = trainer.train()
        print(stats)

    return trainer
def train_example(num_workers=1, use_gpu=False, test_mode=False):
    config = {
        "test_mode": test_mode,
        "batch_size": 16 if test_mode else 512 // num_workers,
        "classification_model_path": MODEL_PATH
    }
    trainer = TorchTrainer(training_operator_cls=GANOperator,
                           num_workers=num_workers,
                           config=config,
                           use_gpu=use_gpu,
                           use_tqdm=True)

    from tabulate import tabulate
    pbar = trange(5, unit="epoch")
    for itr in pbar:
        stats = trainer.train(info=dict(epoch_idx=itr, num_epochs=5))
        pbar.set_postfix(dict(loss_g=stats["loss_g"],
                              loss_d=stats["loss_d"]))
        formatted = tabulate([stats], headers="keys")
        if itr > 0:  # Get the last line of the stats.
            formatted = formatted.split("\n")[-1]
        pbar.write(formatted)

    return trainer
def test_train(ray_start_2_cpus, num_workers, use_local):  # noqa: F811
    trainer = TorchTrainer(training_operator_cls=Operator,
                           num_workers=num_workers,
                           use_local=use_local,
                           use_gpu=False)
    for i in range(3):
        train_loss1 = trainer.train()["train_loss"]
    validation_loss1 = trainer.validate()["val_loss"]

    for i in range(3):
        train_loss2 = trainer.train()["train_loss"]
    validation_loss2 = trainer.validate()["val_loss"]

    assert train_loss2 <= train_loss1, (train_loss2, train_loss1)
    assert validation_loss2 <= validation_loss1, (validation_loss2,
                                                  validation_loss1)
    trainer.shutdown()
def test_single_step(ray_start_2_cpus, use_local):  # noqa: F811
    trainer = TorchTrainer(training_operator_cls=Operator,
                           num_workers=1,
                           use_local=use_local,
                           use_gpu=False)
    metrics = trainer.train(num_steps=1)
    assert metrics[BATCH_COUNT] == 1

    val_metrics = trainer.validate(num_steps=1)
    assert val_metrics[BATCH_COUNT] == 1
    trainer.shutdown()
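# For context, a minimal stand-in for the `ray_start_2_cpus` fixture these
# tests assume (the real fixture lives in Ray's test suite; this sketch
# simply starts and tears down a local two-CPU Ray instance per test).
import pytest
import ray


@pytest.fixture
def ray_start_2_cpus():
    address_info = ray.init(num_cpus=2)
    yield address_info
    ray.shutdown()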
def train_mnist(num_workers=1, use_gpu=False, num_epochs=5):
    Operator = TrainingOperator.from_ptl(LitMNIST)
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        config={"lr": 1e-3, "batch_size": 64},
        use_gpu=use_gpu,
        use_tqdm=True,
    )
    for i in range(num_epochs):
        stats = trainer.train()
        print(stats)

    print(trainer.validate())

    print("Saving model checkpoint to ./model.pt")
    trainer.save("./model.pt")
    print("Model Checkpointed!")
    trainer.shutdown()
    print("success!")
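# A minimal sketch of restoring the checkpoint saved above. `restore_mnist`
# is an illustrative name, not part of the original example; it mirrors the
# save()/load() pattern used by the test snippets further down.
def restore_mnist(num_workers=1, use_gpu=False):
    Operator = TrainingOperator.from_ptl(LitMNIST)
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        config={"lr": 1e-3, "batch_size": 64},
        use_gpu=use_gpu,
    )
    # Restore model, optimizer, and operator state from the checkpoint.
    trainer.load("./model.pt")
    return trainer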
path1 = "/home/edoardo/ray_results/tune_PPO_stopping_car/PPO_StoppingCar_acc24_00001_1_cost_fn=0,epsilon_input=0_2021-01-21_02-30-49/checkpoint_58/checkpoint-58"
# path1 = "/home/edoardo/ray_results/tune_PPO_stopping_car/PPO_StoppingCar_c1c7e_00005_5_cost_fn=0,epsilon_input=0.1_2021-01-17_12-41-27/checkpoint_10/checkpoint-10"
val_data = TrainedPolicyDataset(path1, size=(0, 0), seed=4567, traces=False)
config = get_PPO_config(1234, use_gpu=0)
trainer = ppo.PPOTrainer(config=config)
trainer.restore(path1)
policy = trainer.get_policy()
sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
if enable_training:
    trainer1 = TorchTrainer(
        training_operator_cls=SafetyTrainingOperator,
        num_workers=1,
        use_gpu=True,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 1024,  # used in data_creator
            "path": path1,  # path to load the agent nn
        },
        backend="auto",
        scheduler_step_freq="epoch")
    for i in range(100):
        stats = trainer1.train()
        print(stats)

    print(trainer1.validate())
    torch.save(trainer1.state_dict(), "checkpoint.pt")
    torch.save(trainer1.get_model().state_dict(), "invariant_checkpoint.pt")
    m = trainer1.get_model()
    print(f"trained weight: torch.tensor([["
          f"{m[0].weight.data.cpu().numpy()[0][0]},"
          f"{m[0].weight.data.cpu().numpy()[0][1]}]]), "
          f"bias: torch.tensor({m[0].bias.data.cpu().numpy()})")
    # trainer1.shutdown()
def scheduler_creator(optimizer, config):
    """Returns a learning rate scheduler.

    Args:
        optimizers: The return values from ``optimizer_creator``.
            This can be one or more torch optimizer objects.
        config: Configuration dictionary passed into ``TorchTrainer``

    Returns:
        One or more Torch scheduler objects.
    """
    return torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)
# __torch_scheduler_end__

# __torch_ray_start__
import ray

ray.init()
# or ray.init(address="auto") to connect to a running cluster.
# __torch_ray_end__

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
    model_creator=model_creator,
    data_creator=data_creator,
    optimizer_creator=optimizer_creator,
    loss_creator=nn.MSELoss,
    scheduler_creator=scheduler_creator,
    scheduler_step_freq="epoch",  # if scheduler_creator is set
    config={"lr": 0.001, "batch_size": 64})
# __torch_trainer_end__
if args.local:
    ray.init(num_cpus=2)
else:
    ray.init(address="auto")

num_workers = 2 if args.local else int(ray.cluster_resources().get(device))

from ray.util.sgd.torch.examples.train_example import LinearDataset

print(f"Model: {args.model}")
print("Batch size: %d" % args.batch_size)
print("Number of %ss: %d" % (device, num_workers))

trainer = TorchTrainer(
    training_operator_cls=Training,
    initialization_hook=init_hook,
    config={
        "lr_scaler": num_workers,
        "model": args.model
    },
    num_workers=num_workers,
    use_gpu=args.cuda,
    use_fp16=args.fp16,
)

img_secs = []
for x in range(args.num_iters):
    result = trainer.train()
    # print(result)
    img_sec = result["img_sec"]
    print("Iter #%d: %.1f img/sec per %s" % (x, img_sec, device))
    img_secs.append(img_sec)

# Results
        self.model = self.model[0]
        self.optimizer = self.optimizer[0]

        # Get the corresponding shard
        train_shard = train_dataset.get_shard(self.world_rank)
        train_loader = DataLoader(train_shard, batch_size=64)
        test_shard = test_dataset.get_shard(self.world_rank)
        val_loader = DataLoader(test_shard, batch_size=64)
        self.register_data(train_loader=train_loader,
                           validation_loader=val_loader)


# You can either train the model like this
trainer = TorchTrainer(training_operator_cls=CustomOperator,
                       num_workers=num_executors,
                       add_dist_sampler=False,
                       num_cpus_per_worker=1,
                       config={"lr": 0.01})
for i in range(10):
    stats = trainer.train()
    print(stats)
    val_stats = trainer.validate()
    print(val_stats)
trainer.shutdown()

# Or you can perform a hyperparameter search using Ray Tune
# TorchTrainable = TorchTrainer.as_trainable(
#     training_operator_cls=CustomOperator,
#     num_workers=num_executors,
#     add_dist_sampler=False,
        # Register all of these components with Ray SGD.
        # This allows Ray SGD to do framework-level setup like CUDA, DDP,
        # distributed sampling, and FP16.
        # We also assign the return values of self.register to instance
        # attributes so we can access them in our custom training/validation
        # methods.
        self.model, self.optimizer, self.criterion, self.scheduler = \
            self.register(models=model,
                          optimizers=optimizer,
                          criterion=criterion,
                          schedulers=scheduler)


# Init Ray, or use ray.init(address="auto") to connect to a running cluster.
ray.init()

# Use TorchTrainer to package your model as a Ray object for moving around.
trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if a scheduler is used
    config={
        "lr": 0.001,
        "batch_size": 64
    },
    num_workers=100,
    use_gpu=True)

for i in range(10):
    metrics = trainer.train()
    val_metrics = trainer.validate()
def main():
    parser = HfArgumentParser((ModelArguments, DataProcessingArguments,
                               TrainingArguments, RayArguments))
    all_args = parser.parse_args_into_dataclasses()
    model_args, dataprocessing_args, training_args, ray_args = all_args

    # For now, let's merge all the sets of args into one,
    # but soon, we'll keep distinct sets of args, with a
    # cleaner separation of concerns.
    args = argparse.Namespace(**vars(model_args),
                              **vars(dataprocessing_args),
                              **vars(training_args),
                              **vars(ray_args))

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))

    use_gpu = torch.cuda.is_available() and not args.no_cuda

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    args.output_mode = output_modes[args.task_name]

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    logger.info("Training/evaluation parameters %s", args)
    ray.init(address=args.address)

    # Training
    trainer = TorchTrainer(model_creator=model_creator,
                           data_creator=data_creator,
                           optimizer_creator=optimizer_creator,
                           training_operator_cls=TransformerOperator,
                           use_fp16=args.fp16,
                           apex_args={"opt_level": args.fp16_opt_level},
                           num_workers=args.num_workers,
                           use_gpu=use_gpu,
                           use_tqdm=True,
                           config={"args": args})

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = trainer.get_local_operator().tokenizer
    local_model = trainer.get_model()

    epochs_trained = 0
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
    )

    trainer.apply_all_workers(lambda: set_seed(args))
    if args.do_train:
        for _ in train_iterator:
            stats = trainer.train()
            print("Training stats:", stats)
            logs = evaluate(args, local_model, tokenizer)
            print(json.dumps(logs))

    # Post-training validation
    save_and_evaluate_checkpoints(args, local_model, tokenizer)
        # Create optimizer.
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

        # Create loss.
        loss = torch.nn.MSELoss()

        # Register model, optimizer, and loss.
        self.model, self.optimizer, self.criterion = self.register(
            models=model, optimizers=optimizer, criterion=loss)

        # Register data loaders.
        self.register_data(train_loader=train_loader,
                           validation_loader=val_loader)


ray.init(address='auto')

trainer1 = TorchTrainer(training_operator_cls=CustomTrainingOperator,
                        num_workers=2,
                        use_gpu=False,
                        config={"batch_size": 64})
stats = trainer1.train()
print(stats)

trainer1.shutdown()
print("success!")
def scheduler_creator(optimizer, config):
    """Returns a learning rate scheduler.

    Args:
        optimizers: The return values from ``optimizer_creator``.
            This can be one or more torch optimizer objects.
        config: Configuration dictionary passed into ``TorchTrainer``

    Returns:
        One or more Torch scheduler objects.
    """
    return torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)
# __torch_scheduler_end__

# __torch_ray_start__
import ray

ray.init()
# or ray.init(address="auto") to connect to a running cluster.
# __torch_ray_end__

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
    model_creator,
    data_creator,
    optimizer_creator,
    loss_creator=nn.MSELoss,
    scheduler_creator=scheduler_creator,
    config={"lr": 0.001})
# __torch_trainer_end__
if __name__ == "__main__":
    ray.init(address=None if args.local else "auto")
    num_workers = 2 if args.local else int(
        ray.cluster_resources().get(device))

    from ray.util.sgd.torch.examples.train_example import LinearDataset

    print("Model: %s" % args.model)
    print("Batch size: %d" % args.batch_size)
    print("Number of %ss: %d" % (device, num_workers))

    trainer = TorchTrainer(
        model_creator=lambda cfg: getattr(models, args.model)(),
        optimizer_creator=lambda model, cfg: optim.SGD(
            model.parameters(), lr=0.01 * cfg.get("lr_scaler")),
        data_creator=lambda cfg: LinearDataset(4, 2),
        initialization_hook=init_hook,
        config=dict(lr_scaler=num_workers),
        training_operator_cls=Training,
        num_workers=num_workers,
        use_gpu=args.cuda,
        use_fp16=args.fp16,
    )

    img_secs = []
    for x in range(args.num_iters):
        result = trainer.train()
        # print(result)
        img_sec = result["img_sec"]
        print("Iter #%d: %.1f img/sec per %s" % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
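    # A plausible completion of the "# Results" section above (sketch, not
    # part of the original snippet): report mean throughput and a 95%
    # confidence interval over the measured iterations.
    import numpy as np
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    print("Img/sec per %s: %.1f +-%.1f" % (device, img_sec_mean,
                                           img_sec_conf))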
def test_correctness(ray_start_2_cpus, num_workers, use_local):
    layer = nn.Linear(1, 1)
    ptl_op = TrainingOperator.from_ptl(PTL_Module)
    trainer1 = TorchTrainer(training_operator_cls=ptl_op,
                            config={
                                "layer": layer,
                                "data_size": 3,
                                "batch_size": 1
                            },
                            num_workers=num_workers,
                            use_local=use_local)
    train1_stats = trainer1.train()
    val1_stats = trainer1.validate()
    trainer1.shutdown()

    trainer2 = TorchTrainer(training_operator_cls=CorrectnessOperator,
                            scheduler_step_freq="manual",
                            config={
                                "layer": layer,
                                "data_size": 3,
                                "batch_size": 1
                            },
                            num_workers=num_workers,
                            use_local=use_local)
    train2_stats = trainer2.train()
    val2_stats = trainer2.validate()
    trainer2.shutdown()

    assert train1_stats["train_loss"] == train2_stats["train_loss"]
    assert val1_stats["val_loss"] == val2_stats["val_loss"]
    assert val1_stats["val_acc"] == val2_stats["val_accuracy"]
def test_save_and_restore(ray_start_2_cpus, num_workers, use_local,
                          tmp_path):  # noqa: F811
    trainer1 = TorchTrainer(training_operator_cls=Operator,
                            num_workers=num_workers,
                            use_local=use_local)
    trainer1.train()
    checkpoint_path = os.path.join(tmp_path, "checkpoint")
    trainer1.save(checkpoint_path)

    model1 = trainer1.get_model()
    ints1 = trainer1.apply_all_operators(
        lambda op: op.get_model().rand_int)[0]
    trainer1.shutdown()

    trainer2 = TorchTrainer(training_operator_cls=Operator,
                            num_workers=num_workers,
                            use_local=use_local)
    trainer2.load(checkpoint_path)

    model2 = trainer2.get_model()
    ints2 = trainer2.apply_all_operators(lambda op: op.get_model().rand_int)

    model1_state_dict = model1.state_dict()
    model2_state_dict = model2.state_dict()

    assert set(model1_state_dict.keys()) == set(model2_state_dict.keys())
    for k in model1_state_dict:
        assert torch.equal(model1_state_dict[k], model2_state_dict[k])

    for i in ints2:
        assert i == ints1

    trainer2.shutdown()
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=1, metavar='N',
                        help='number of epochs to train (default: 1)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=True,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    ray.init(address='auto')
    trainer1 = TorchTrainer(training_operator_cls=MyTrainingOperator,
                            num_workers=2,
                            use_gpu=False,
                            config=vars(args))
    stats = trainer1.train()

    # Alternatively, lift the trainer onto a remote actor:
    # RemoteTrainer = ray.remote(num_gpus=0.5)(TorchTrainer)
    # remote_trainer = RemoteTrainer.remote(
    #     training_operator_cls=MyTrainingOperator, num_workers=1,
    #     use_gpu=True, config=vars(args))
    # stats = ray.get([remote_trainer.train.remote()])

    # Or run a hyperparameter search with Ray Tune:
    # ray.tune.run(TorchTrainer.as_trainable(
    #     training_operator_cls=MyTrainingOperator, num_workers=1,
    #     use_gpu=True, config=vars(args)))

    print(stats)
    print("success!")
def main(args):
    if args.smoke_test:
        ray.init(num_cpus=4)
    else:
        ray.init(address=args.address,
                 num_cpus=args.num_workers,
                 log_to_driver=True)

    # Trainer initialization
    trainer = TorchTrainer(training_operator_cls=CIFAR10Module,
                           num_workers=args.num_workers,
                           config={
                               "lr": args.learning_rate,
                               "lr_decay": args.lr_decay,
                               "eps": args.eps,
                               "momentum": args.momentum,
                               "wd": args.wd,
                               "data_dir": args.data_dir,
                               "batch_size": args.batch_size,
                               "num_workers": args.num_workers,
                               "smoke_test": args.smoke_test
                           },
                           use_gpu=args.use_gpu,
                           scheduler_step_freq="epoch",
                           use_fp16=args.fp16,
                           use_tqdm=False)

    train_loss = []
    val_loss = []
    val_acc = []

    path = os.path.join("/root/volume/Paper/MLVC_Internship",
                        args.checkpoint_dir,
                        args.model_name + "_" + str(args.trial))
    if not os.path.exists(path):
        os.mkdir(path)

    from tabulate import tabulate
    pbar = trange(args.max_epochs, unit="epoch")
    for it in pbar:
        stats = trainer.train(max_retries=1,
                              info=dict(epoch_idx=it,
                                        num_epochs=args.max_epochs))
        train_loss.append(stats["train_loss"])
        val_stats = trainer.validate()
        val_loss.append(val_stats["val_loss"])
        pbar.set_postfix(dict(acc=val_stats["val_accuracy"]))
        trainer.save(
            "/root/volume/Paper/MLVC_Internship/checkpoint/{}_{}/epoch_{}.ray"
            .format(args.model_name, args.trial, it))
        torch.save(
            [train_loss, val_loss],
            "/root/volume/Paper/MLVC_Internship/checkpoint/{}_{}/epoch_{}.loss"
            .format(args.model_name, args.trial, it))
        torch.save(
            [val_acc],
            "/root/volume/Paper/MLVC_Internship/checkpoint/{}_{}/epoch_{}.acc"
            .format(args.model_name, args.trial, it))

    print(val_stats)
    trainer.shutdown()
    print("success!")
def scheduler_creator(optimizer, config):
    """Returns a learning rate scheduler.

    Args:
        optimizers: The return values from ``optimizer_creator``.
            This can be one or more torch optimizer objects.
        config: Configuration dictionary passed into ``TorchTrainer``

    Returns:
        One or more Torch scheduler objects.
    """
    return torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)
# __torch_scheduler_end__

# __torch_ray_start__
import ray

ray.init()
# or ray.init(address="auto") to connect to a running cluster.
# __torch_ray_end__

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
    model_creator=model_creator,
    data_creator=data_creator,
    optimizer_creator=optimizer_creator,
    loss_creator=nn.MSELoss,
    scheduler_creator=scheduler_creator,
    config={"lr": 0.001, "batch_size": 64})
# __torch_trainer_end__
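# A minimal sketch of driving the trainer constructed above: each call to
# train() runs one epoch across the workers and returns averaged metrics,
# and shutdown() terminates the worker processes.
for epoch in range(5):
    print(trainer.train())
trainer.shutdown()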
import ray

ray.init()
# or ray.init(address="auto") to connect to a running cluster.
# __torch_ray_end__

# __backwards_compat_start__
from ray.util.sgd import TorchTrainer

MyTrainingOperator = TrainingOperator.from_creators(
    model_creator=model_creator,
    optimizer_creator=optimizer_creator,
    loss_creator=loss_creator,
    scheduler_creator=scheduler_creator,
    data_creator=data_creator)

trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if scheduler_creator is passed in
    config={"lr": 0.001, "batch_size": 64})
# __backwards_compat_end__

trainer.shutdown()

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if scheduler is used
    config={"lr": 0.001, "batch_size": 64})
# __torch_trainer_end__
config['comet_ml_workspace'] = args.comet_ml_workspace
config['comet_ml_project_name'] = args.comet_ml_project_name
config['comet_ml_save_model'] = args.comet_ml_save_model

# Make sure that all None configurations are correctly formatted as None,
# not a string.
for key, val in config.items():
    if str(val).lower() == 'none':
        config[key] = None

# Start Ray
# ray.init(address='auto', resources=dict(CPU=120, GPU=120))
ray.init(address='auto')
print('DEBUG: Started Ray.')

# NOTE: These could actually just be the current VM's resources. If it's the
# head node, we might need some extra resources just to add new nodes.
print(f'DEBUG: The cluster\'s total resources:\n{ray.cluster_resources()}')
print(f'DEBUG: The cluster\'s currently available resources:\n'
      f'{ray.available_resources()}')

# Create the trainer
trainer = TorchTrainer(
    model_creator=utils.eICU_model_creator,
    data_creator=utils.eICU_data_creator,
    optimizer_creator=utils.eICU_optimizer_creator,
    training_operator_cls=utils.eICU_Operator,
    num_workers=config.get('num_workers', 1),
    config=config,
    use_gpu=True,
    use_fp16=config.get('use_fp16', False),
    use_tqdm=True)
print('DEBUG: Created the TorchTrainer object.')

# Train the model
for epoch in du.utils.iterations_loop(
        range(config.get('n_epochs', 1)),
        see_progress=config.get('see_progress', True),
        desc='Epochs'):
    stats = trainer.train(info=dict(epoch_idx=epoch))