def init_evaluator(opt, eval_queue, confirm_queue):
    """ Here we initialize the evaluator: logger, env, estimator and policy. """
    log = init_eval_logger(opt.out_dir)
    env = get_wrapped_atari(
        opt.game, mode="testing", seed=opt.seed, no_gym=opt.no_gym
    )
    eval_estimator = get_estimator(
        "atari",
        hist_len=opt.hist_len,
        action_no=env.action_space.n,
        hidden_sz=opt.hidden_sz,
        shared_bias=opt.shared_bias,
    )
    eval_estimator.cuda()
    epsilon = get_epsilon(name="constant", start=opt.eval_epsilon)
    # pass the action count, consistent with the other call sites below
    policy_evaluation = EpsilonGreedyPolicy(
        eval_estimator, env.action_space.n, epsilon
    )

    opt.log = log
    opt.env = env
    opt.policy_evaluation = policy_evaluation
    opt.eval_queue = eval_queue
    opt.confirm_queue = confirm_queue

    evaluate(opt)

def init_player(opt, experience_queue, sync_queue):
    """ Target function for the player process. """
    log = Logger(label="player", path=opt.out_dir)
    log.add_group(
        tag="playing",
        metrics=(
            log.SumMetric("ep_cnt"),
            log.AvgMetric("rw_per_ep", emph=True),
            log.AvgMetric("rw_per_step"),
            log.MaxMetric("max_q"),
            log.FPSMetric("playing_fps"),
            log.MaxMetric("ram"),
            log.MaxMetric("gpu"),
        ),
        console_options=("white", "on_green", ["bold"]),
    )
    env = get_wrapped_atari(
        opt.game,
        mode="training",
        seed=opt.seed,
        no_gym=opt.no_gym,
        device=torch.device("cuda"),
    )
    epsilon = get_epsilon(
        steps=opt.epsilon_steps,
        end=opt.epsilon_end,
        warmup_steps=opt.learn_start,
    )
    policy_evaluation = EpsilonGreedyPolicy(
        opt.estimator, env.action_space.n, epsilon
    )

    opt.log = log
    opt.env = env
    opt.policy_evaluation = policy_evaluation
    opt.experience_queue = experience_queue
    opt.sync_queue = sync_queue

    play(opt)

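# A minimal sketch of how the two process targets above might be wired
# together with torch.multiprocessing. The queue pairing mirrors the two
# signatures above and `opt` is expected to already carry the shared
# estimator and the usual settings; `launch_workers` itself and the "spawn"
# start method are assumptions, not part of this module.
def launch_workers(opt):
    import torch.multiprocessing as mp

    ctx = mp.get_context("spawn")  # safest start method when CUDA is involved
    experience_queue, sync_queue = ctx.Queue(), ctx.Queue()
    eval_queue, confirm_queue = ctx.Queue(), ctx.Queue()

    player = ctx.Process(
        target=init_player, args=(opt, experience_queue, sync_queue)
    )
    evaluator = ctx.Process(
        target=init_evaluator, args=(opt, eval_queue, confirm_queue)
    )
    player.start()
    evaluator.start()
    player.join()
    evaluator.join()
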
def test(opt, crt_step, estimator, action_space, eval_env, log):
    """ Here we do the evaluation.

    DeepMind uses a constant epsilon schedule with a very small value
    instead of a completely deterministic policy.
    """
    epsilon = get_epsilon(name="constant", start=opt.eval_epsilon)
    estimator.to("cuda")
    policy_evaluation = EpsilonGreedyPolicy(estimator, action_space, epsilon)
    if eval_env is None:
        eval_env = get_wrapped_atari(
            opt.game, mode="testing", seed=opt.seed, no_gym=opt.no_gym
        )
    mean_ep_rw, mean_ep_crw = evaluate_once(
        crt_step, policy_evaluation, eval_env, opt.eval_steps, log
    )
    return mean_ep_rw, mean_ep_crw

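# A minimal usage sketch for `test`, assuming the training loop evaluates
# periodically; `opt.eval_freq` and this wrapper are hypothetical, only the
# `test` call itself mirrors the signature above.
def maybe_evaluate(opt, crt_step, estimator, env, eval_env, log):
    if crt_step % opt.eval_freq == 0:
        return test(opt, crt_step, estimator, env.action_space.n, eval_env, log)
    return None, None
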
def get_stuff(opt, model):
    """ Rebuild the evaluation env and the greedy policy from a saved model. """
    # wrap the gym env
    env = get_wrapped_atari(
        opt.game,
        mode="testing",
        seed=42,
        no_gym=opt.no_gym,
        device=opt.mem_device,
    )
    action_no = env.action_space.n

    estimator = get_estimator(
        "atari",
        hist_len=4,
        action_no=action_no,
        hidden_sz=512,
        shared_bias=opt.shared_bias,
    )
    estimator = estimator.cuda()
    estimator.load_state_dict(model["model"])

    epsilon = get_epsilon(name="constant", start=opt.eval_epsilon)
    policy_evaluation = EpsilonGreedyPolicy(estimator, action_no, epsilon)
    return env, policy_evaluation

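# A sketch of calling `get_stuff` on a saved checkpoint. The path argument
# and the {"model": state_dict} checkpoint layout are assumptions inferred
# from `estimator.load_state_dict(model["model"])` above.
def restore_from_checkpoint(opt, ckpt_path):
    model = torch.load(ckpt_path, map_location="cpu")  # moved to GPU above
    env, policy_evaluation = get_stuff(opt, model)
    return env, policy_evaluation
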
def main(args):
    """ Here we initialize the training objects and start the training loop. """
    # seed=42 acts as a sentinel for "pick a random seed"
    args.seed = random.randint(0, 10000) if args.seed == 42 else args.seed
    print(f"torch manual seed={args.seed}.")
    torch.manual_seed(args.seed)

    # wrap the gym env
    env = get_wrapped_atari(
        args.game,
        mode="training",
        hist_len=4,
        seed=args.seed,
        no_gym=args.no_gym,
    )
    print(env)
    print("ActionSpace: ", env.action_space)

    # construct an estimator to be used with the policy
    action_no = env.action_space.n
    estimator = get_estimator(
        "atari", hist_len=4, action_no=action_no, hidden_sz=256
    )
    estimator = estimator.cuda()

    # construct an epsilon greedy policy
    # also: epsilon = {'name': 'linear', 'start': 1, 'end': 0.1, 'steps': 1000}
    epsilon = get_epsilon(steps=args.epsilon_steps)
    policy_evaluation = EpsilonGreedyPolicy(estimator, action_no, epsilon)

    # construct a policy improvement type
    # optimizer = get_optimizer('Adam', estimator, lr=0.0001, eps=0.0003)
    optimizer = optim.Adam(estimator.parameters(), lr=args.lr, eps=args.adam_eps)
    policy_improvement = DQNPolicyImprovement(
        estimator, optimizer, gamma=0.99, is_double=args.double_dqn
    )

    # we also need an experience replay
    if args.prioritized:
        experience_replay = PER(
            args.mem_size,
            batch_size=32,
            alpha=0.6,
            optim_steps=((args.step_no - args.learn_start) / args.update_freq),
        )
        priority_update_cb = partial(priority_update, experience_replay)
    else:
        experience_replay = ER(args.mem_size, batch_size=32)
        # experience_replay = ER(100000, batch_size=32, hist_len=4)  # flat

    # construct a tester
    tester = None

    # construct a logger
    if not args.label:
        sampling = "prioritized" if args.prioritized else "uniform"
        label = f"{datetime.now():%Y%b%d-%H%M%S}_{args.game}_{sampling}"
    else:
        label = args.label
    log = Logger(label=label, path=f"./results/{label}")
    train_log = log.add_group(
        tag="training",
        metrics=(
            log.SumMetric("ep_cnt", resetable=False),
            log.AvgMetric("rw_per_ep", emph=True),
            log.AvgMetric("rw_per_step"),
            log.MaxMetric("max_q"),
            log.FPSMetric("training_fps"),
            log.FPSMetric("sampling_fps"),
        ),
        console_options=("white", "on_blue", ["bold"]),
    )
    log.log_info(train_log, "date: %s." % time.strftime("%d/%m/%Y | %H:%M:%S"))
    log.log_info(train_log, "pytorch v%s." % torch.__version__)

    # add the created objects to the args namespace
    args.env = env
    args.policy_evaluation = policy_evaluation
    args.policy_improvement = policy_improvement
    args.experience_replay = experience_replay
    args.tester = tester
    args.log = log
    if args.prioritized:
        args.priority_update = priority_update_cb

    # print the args
    print_namespace(args)

    # start the training
    train(args)

def run(opt):
    """ Here we initialize the training objects and start the training loop. """
    opt.seed = random.randint(0, 10000) if not opt.seed else opt.seed
    print(f"torch manual seed={opt.seed}.")
    torch.manual_seed(opt.seed)

    # wrap the gym env
    env = get_wrapped_atari(
        opt.game,
        mode="training",
        seed=opt.seed,
        no_gym=opt.no_gym,
        device=opt.mem_device,
    )
    if opt.async_eval:
        eval_env = None
    else:
        eval_env = get_wrapped_atari(
            opt.game, mode="testing", seed=opt.seed, no_gym=opt.no_gym
        )

    # construct an estimator to be used with the policy
    action_no = env.action_space.n
    estimator = get_estimator(
        "atari",
        hist_len=4,
        action_no=action_no,
        hidden_sz=512,
        shared_bias=opt.shared_bias,
    )
    estimator = estimator.cuda()

    # construct an epsilon greedy policy
    # also: epsilon = {'name': 'linear', 'start': 1, 'end': 0.1, 'steps': 1000}
    epsilon = get_epsilon(
        steps=opt.epsilon_steps,
        end=opt.epsilon_end,
        warmup_steps=opt.learn_start,
    )
    policy_evaluation = EpsilonGreedyPolicy(estimator, action_no, epsilon)

    # construct a policy improvement type
    optimizer = optim.RMSprop(
        estimator.parameters(),
        lr=opt.lr,
        momentum=opt.rmsprop_momentum,
        alpha=0.95,
        eps=opt.rmsprop_eps,
        centered=True,
    )
    policy_improvement = DQNPolicyImprovement(
        estimator, optimizer, gamma=0.99, is_double=opt.double
    )

    # we also need an experience replay
    experience_replay = create_memory(opt)

    # construct a logger
    log = init_eval_logger(opt.out_dir)
    train_log = log.add_group(
        tag="training",
        metrics=(
            log.SumMetric("ep_cnt"),
            log.AvgMetric("rw_per_ep", emph=True),
            log.AvgMetric("rw_per_step"),
            log.MaxMetric("max_q"),
            log.FPSMetric("training_fps"),
            log.FPSMetric("sampling_fps"),
            log.MaxMetric("ram"),
            log.MaxMetric("gpu"),
        ),
        console_options=("white", "on_blue", ["bold"]),
    )
    log.log_info(train_log, "date: %s." % time.strftime("%d/%m/%Y | %H:%M:%S"))
    log.log_info(train_log, "pytorch v%s." % torch.__version__)

    # add the created objects to the opt namespace
    opt.env = env
    opt.eval_env = eval_env
    opt.policy_evaluation = policy_evaluation
    opt.policy_improvement = policy_improvement
    opt.experience_replay = experience_replay
    opt.log = log

    # print the opt
    print("Starting experiment using the following settings:")
    print(liftoff.config.config_to_string(opt))
    print(estimator)

    opt.eval_opt = Namespace(
        eval_steps=opt.eval_steps,
        eval_epsilon=opt.eval_epsilon,
        game=opt.game,
        seed=opt.seed,
        no_gym=opt.no_gym,
    )
    opt.evals = []

    # start the training
    train(opt)

def main(seed=42, label="results", training_steps=10000000, lr=0.0001):
    """ Here we initialize the training objects and start the training loop. """
    print(f"torch manual seed={seed}.")
    torch.manual_seed(seed)

    # wrap the gym env
    env = get_wrapped_atari("PongNoFrameskip-v4", mode="training", hist_len=4)
    print(env)

    # construct an estimator to be used with the policy
    action_no = env.action_space.n
    estimator = get_estimator(
        "atari", hist_len=4, action_no=action_no, hidden_sz=512
    )
    estimator = estimator.cuda()

    # construct an epsilon greedy policy
    # also: epsilon = {'name': 'linear', 'start': 1, 'end': 0.1, 'steps_no': 1000}
    epsilon = get_epsilon(name="linear", start=1, end=0.01, steps_no=30000)
    policy_evaluation = EpsilonGreedyPolicy(estimator, action_no, epsilon)

    # construct a policy improvement type
    # optimizer = get_optimizer('Adam', estimator, lr=0.0001, eps=0.0003)
    optimizer = optim.Adam(estimator.parameters(), lr=lr)
    policy_improvement = DQNPolicyImprovement(estimator, optimizer, gamma=0.99)

    # we also need an experience replay
    experience_replay = ER(100000, batch_size=32, hist_len=4, cuda=True)

    # construct a tester
    tester = None

    # construct a logger
    log = Logger(label=label, path=f"./{label}")
    train_log = log.add_group(
        tag="training",
        metrics=(
            log.SumMetric("ep_cnt", resetable=False),
            log.AvgMetric("rw_per_ep", emph=True),
            log.AvgMetric("rw_per_step"),
            log.MaxMetric("max_q"),
            log.FPSMetric("training_fps"),
            log.FPSMetric("sampling_fps"),
        ),
        console_options=("white", "on_blue", ["bold"]),
    )
    log.log_info(train_log, "date: %s." % time.strftime("%d/%m/%Y | %H:%M:%S"))
    log.log_info(train_log, "pytorch v%s." % torch.__version__)

    # construct a structure for easily accessing objects and settings
    args = SimpleNamespace(
        env=env,
        policy_evaluation=policy_evaluation,
        policy_improvement=policy_improvement,
        experience_replay=experience_replay,
        tester=tester,
        log=log,
        training_steps=training_steps,
        start_learning_after=10000,
        update_freq=1,
    )
    for k, v in args.__dict__.items():
        if k != "env":
            k = clr(k, attrs=["bold"])
        print(f"{k}: {v}")

    # start the training
    train(args)

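# A hedged entry-point sketch; the flags mirror `main`'s keyword arguments
# above, but the CLI itself is an assumption (the module may be launched
# differently in practice, e.g. through an experiment runner).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--label", type=str, default="results")
    parser.add_argument("--training-steps", type=int, default=10000000)
    parser.add_argument("--lr", type=float, default=0.0001)
    cli = parser.parse_args()
    main(cli.seed, cli.label, cli.training_steps, cli.lr)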