"policies": policies, "policy_mapping_fn": lambda agent_id: "ppo_policy", }, # "num_gpus": 0, # "num_gpus_per_worker": 0, "callbacks": PlayerScoreCallbacks }) if restore_checkpoint: trainer.restore(checkpoint_path) start = time.time() try: for i in range(num_iter): res = trainer.train() print("Iteration {}. policy result: {}".format(i, res)) if i % eval_every == 0: trainer_eval.set_weights(trainer.get_weights(["ppo_policy"])) res = trainer_eval.train() if i % checkpoint_every == 0: trainer.save() except: trainer.save() stop = time.time() train_duration = time.strftime('%H:%M:%S', time.gmtime(stop - start)) print( 'Training finished ({}), check the results in ~/ray_results/<dir>/'.format( train_duration))
import sys  # needed for the explicit sys.exit below

# dqn_policy: X
# ppo_policy: Y
# Alternately improve both trainers, then synchronize their policy
# weights so each trainer sees the other's latest opponent policy.
for i in range(args.stop_iters):
    print("== Iteration", i, "==")
    # improve the DQN policy
    print("-- DQN --")
    result_dqn = dqn_trainer.train()
    print(pretty_print(result_dqn))
    # improve the PPO policy
    print("-- PPO --")
    result_ppo = ppo_trainer.train()
    print(pretty_print(result_ppo))
    # Test passed gracefully: both agents cleared the reward threshold.
    if args.as_test and \
            result_dqn["episode_reward_mean"] > args.stop_reward and \
            result_ppo["episode_reward_mean"] > args.stop_reward:
        print("test passed (both agents above requested reward)")
        # `quit()` is an interactive helper injected by the `site` module
        # and is not guaranteed to exist; sys.exit is the proper way to
        # terminate a script with an exit code.
        sys.exit(0)
    # swap weights to synchronize
    dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))
    ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))

# Desired reward not reached within the iteration budget.
if args.as_test:
    raise ValueError("Desired reward ({}) not reached!".format(
        args.stop_reward))
    # NOTE(review): these are the trailing kwargs of a tune.run(...) call
    # whose opening line is above this fragment — presumably
    # `results = tune.run(PPOTrainer, ...)`; confirm against the full file.
    config=config,
    stop={"training_iteration": args.pre_training_iters},
    verbose=1,
    checkpoint_freq=1,
    checkpoint_at_end=True,
)
print("Pre-training done.")

# Pick the best (mode="max") checkpoint of the first (only) trial.
best_checkpoint = results.get_best_checkpoint(
    results.trials[0], mode="max")
print(f".. best checkpoint was: {best_checkpoint}")

# Create a new dummy Trainer to "fix" our checkpoint.
new_trainer = PPOTrainer(config=config)
# Get untrained weights for all policies.
untrained_weights = new_trainer.get_weights()
# Restore all policies from checkpoint.
new_trainer.restore(best_checkpoint)
# Set back all weights (except for 1st agent) to original
# untrained weights, so only "policy_0" keeps its pre-trained state.
new_trainer.set_weights(
    {pid: w for pid, w in untrained_weights.items() if pid != "policy_0"})
# Create the checkpoint from which tune can pick up the
# experiment.
new_checkpoint = new_trainer.save()
# Release the dummy trainer's resources before the next tune.run.
new_trainer.stop()
print(".. checkpoint to restore from (all policies reset, "
      f"except policy_0): {new_checkpoint}")
print("Starting new tune.run")