def test_save_and_restore(ray_start_2_cpus, num_workers, use_local, tmp_path):  # noqa: F811
    """Round-trip a trainer through save()/load() and verify state survives.

    Trains one trainer, writes a checkpoint, loads it into a brand-new
    trainer, then checks that the model weights match tensor-for-tensor and
    that every worker's operator ``rand_int`` equals the saved value.
    """
    # Train and checkpoint the first trainer.
    first = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        use_local=use_local)
    first.train()
    ckpt = os.path.join(tmp_path, "checkpoint")
    first.save(ckpt)

    saved_model = first.get_model()
    # Grab worker 0's random int as the reference value.
    saved_int = first.apply_all_operators(
        lambda op: op.get_model().rand_int)[0]
    first.shutdown()

    # Restore the checkpoint into a fresh trainer.
    second = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=num_workers,
        use_local=use_local)
    second.load(ckpt)

    restored_model = second.get_model()
    restored_ints = second.apply_all_operators(
        lambda op: op.get_model().rand_int)

    # Weights must be identical across the save/restore boundary.
    saved_state = saved_model.state_dict()
    restored_state = restored_model.state_dict()
    assert set(saved_state.keys()) == set(restored_state.keys())
    for key in saved_state:
        assert torch.equal(saved_state[key], restored_state[key])

    # Every restored worker must report the saved random int.
    for value in restored_ints:
        assert value == saved_int
    second.shutdown()
def train_example(num_workers=1, use_gpu=False):
    """Run the linear-regression training example.

    Builds a ``TorchTrainer`` from the module-level creator functions, trains
    for five epochs, validates, prints the learned weight/bias, and shuts the
    trainer down.

    Args:
        num_workers: Number of distributed training workers.
        use_gpu: Whether workers should train on GPU.
    """
    trainer = TorchTrainer(
        model_creator=model_creator,
        data_creator=data_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler_creator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")

    for _ in range(5):
        print(trainer.train())
    print(trainer.validate())

    model = trainer.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        model.weight.item(), model.bias.item()))
    trainer.shutdown()
    print("success!")
def train_example(num_workers=1, use_gpu=False):
    """Run the linear-regression example via a ``TrainingOperator`` subclass.

    Assembles the operator class from the module-level creator functions,
    trains for five epochs, validates, prints the learned weight/bias, and
    shuts the trainer down.

    Args:
        num_workers: Number of distributed training workers.
        use_gpu: Whether workers should train on GPU.
    """
    CustomTrainingOperator = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        scheduler_creator=scheduler_creator,
        loss_creator=nn.MSELoss)

    trainer = TorchTrainer(
        training_operator_cls=CustomTrainingOperator,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")

    for _ in range(5):
        print(trainer.train())
    print(trainer.validate())

    # If using Ray Client, make sure to force model onto CPU.
    import ray
    model = trainer.get_model(to_cpu=ray.util.client.ray.is_connected())
    print("trained weight: % .2f, bias: % .2f" % (
        model.weight.item(), model.bias.item()))
    trainer.shutdown()
    print("success!")
def main():
    """Entry point: parse arguments, start Ray, and fine-tune on a GLUE task.

    Parses four dataclass argument groups, merges them into a single
    namespace, validates the output directory and task name, then runs
    distributed training with ``TorchTrainer`` followed by checkpoint
    evaluation.
    """
    parser = HfArgumentParser((ModelArguments, DataProcessingArguments,
                               TrainingArguments, RayArguments))
    model_args, dataprocessing_args, training_args, ray_args = (
        parser.parse_args_into_dataclasses())

    # For now, let's merge all the sets of args into one,
    # but soon, we'll keep distinct sets of args, with a
    # cleaner separation of concerns.
    args = argparse.Namespace(
        **vars(model_args),
        **vars(dataprocessing_args),
        **vars(training_args),
        **vars(ray_args))

    # Refuse to clobber a non-empty output directory unless explicitly asked.
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))

    use_gpu = torch.cuda.is_available() and not args.no_cuda

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    args.output_mode = output_modes[args.task_name]

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    logger.info("Training/evaluation parameters %s", args)
    ray.init(address=args.address)

    # Training
    trainer = TorchTrainer(
        model_creator=model_creator,
        data_creator=data_creator,
        optimizer_creator=optimizer_creator,
        training_operator_cls=TransformerOperator,
        use_fp16=args.fp16,
        apex_args={"opt_level": args.fp16_opt_level},
        num_workers=args.num_workers,
        use_gpu=use_gpu,
        use_tqdm=True,
        config={"args": args})

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = trainer.get_local_operator().tokenizer
    local_model = trainer.get_model()

    epochs_trained = 0
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
    )

    # Seed every worker identically for reproducible shuffling/init.
    trainer.apply_all_workers(lambda: set_seed(args))
    if args.do_train:
        for _ in train_iterator:
            stats = trainer.train()
            print("Training stats:", stats)
            logs = evaluate(args, local_model, tokenizer)
            print(json.dumps(logs))

    # Post-training validation
    save_and_evaluate_checkpoints(args, local_model, tokenizer)
use_gpu=True, config={ "lr": 1e-2, # used in optimizer_creator "hidden_size": 1, # used in model_creator "batch_size": 1024, # used in data_creator "path": path1, # path to load the agent nn }, backend="auto", scheduler_step_freq="epoch") for i in range(100): stats = trainer1.train() print(stats) print(trainer1.validate()) torch.save(trainer1.state_dict(), "checkpoint.pt") torch.save(trainer1.get_model().state_dict(), "invariant_checkpoint.pt") m = trainer1.get_model() print(f"trained weight: torch.tensor([[{m[0].weight.data.cpu().numpy()[0][0]},{m[0].weight.data.cpu().numpy()[0][1]}]]), bias: torch.tensor({m[0].bias.data.cpu().numpy()})") # trainer1.shutdown() print("success!") else: m = torch.nn.Sequential(torch.nn.Linear(2, 50), torch.nn.ReLU(), torch.nn.Linear(50, 1), torch.nn.Tanh()) checkpoint = torch.load("invariant_checkpoint.pt", torch.device("cpu")) m.load_state_dict(checkpoint) # trained weight: [[0.0018693 0.05228069]], bias: [-0.5533147] , train_loss = 0.0 # trained weight: [[-0.01369903 0.03511396]], bias: [-0.6535952] , train_loss = 0.0 # trained weight: [[0.00687088 0.26634103]], bias: [-0.6658108] , train_loss = 0.0 # trained weight: torch.tensor([[0.038166143000125885,0.16197167336940765]]), bias: torch.tensor([-2.3122551]) # %% m.cpu() random.seed(0)
"hidden_size": 1, # used in model_creator "batch_size": 1024, # used in data_creator "path": path1, # path to load the agent nn "path_invariant": path_invariant, # the path to the invariant network }, backend="auto", scheduler_step_freq="epoch") for i in range(50): stats = trainer1.train() print(stats) print(trainer1.validate()) torch.save(trainer1.state_dict(), os.path.join(utils.get_save_dir(), "checkpoint.pt")) torch.save(trainer1.get_model()[0].state_dict(), os.path.join(utils.get_save_dir(), "retrained_agent.pt")) agent_model, invariant_model = trainer1.get_model() else: sequential_nn = convert_ray_policy_to_sequential(policy).cpu() sequential_nn.load_state_dict( torch.load(os.path.join(utils.get_save_dir(), "retrained_agent.pt"))) agent_model = sequential_nn invariant_model = torch.nn.Sequential(torch.nn.Linear(2, 50), torch.nn.ReLU(), torch.nn.Linear(50, 1), torch.nn.Tanh()) invariant_model.load_state_dict( torch.load( path_invariant,