def execute(args, params, device):
    utils.kill_game_processes()
    env = main.make_env(args, params)
    result_name1, writer1, net1, tgt_net1, selector1, epsilon_tracker1, agent1, \
        exp_source1, buffer1, optimizer1 = main.make_components(args, params, device, env, 0)
    result_name2, writer2, net2, tgt_net2, selector2, epsilon_tracker2, agent2, \
        exp_source2, buffer2, optimizer2 = main.make_components(args, params, device, env, 1)
    frame = 0
    frame_idx1 = 0
    frame_idx2 = 0
    eval_states1 = None
    eval_states2 = None
    date_time = datetime.now().strftime("%b%d_%H-%M-%S")
    with common.RewardTracker(writer1, params['stop_reward_player1'], net1,
                              date_time + result_name1 + ".dat", 0, env) as reward_tracker1, \
         common.RewardTracker(writer2, params['stop_reward_player2'], net2,
                              date_time + result_name2 + ".dat", 1, env) as reward_tracker2:
        # fill histories
        main.train(args, params, device, buffer1, epsilon_tracker1, frame_idx1, exp_source1,
                   reward_tracker1, selector1, optimizer1, net1, tgt_net1, writer1, eval_states1)
        main.train(args, params, device, buffer2, epsilon_tracker2, frame_idx2, exp_source2,
                   reward_tracker2, selector2, optimizer2, net2, tgt_net2, writer2, eval_states2)
        while True:
            if frame // args.units % 2 == 0:
                frame_idx1 += 1
                if main.train(args, params, device, buffer1, epsilon_tracker1, frame_idx1,
                              exp_source1, reward_tracker1, selector1, optimizer1,
                              net1, tgt_net1, writer1, eval_states1):
                    break
            else:
                frame_idx2 += 1
                if main.train(args, params, device, buffer2, epsilon_tracker2, frame_idx2,
                              exp_source2, reward_tracker2, selector2, optimizer2,
                              net2, tgt_net2, writer2, eval_states2):
                    break
            frame += 1
            if args.maxFrames > 0 and frame_idx1 > args.maxFrames:
                break
def play_func(params, net, cuda, exp_queue):
    env = make_env(params)
    writer = SummaryWriter(comment="-" + params['run_name'] + "-05_new_wrappers")
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=cuda)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                                                           steps_count=1)
    exp_source_iter = iter(exp_source)
    frame_idx = 0
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)
            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
                    break
    exp_queue.put(None)
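# Nearly every loop in this file drives exploration through common.EpsilonTracker,
# whose definition is not included here. Below is a minimal sketch consistent with
# how these loops call it (frame(frame_idx) mutates selector.epsilon), assuming the
# params keys 'epsilon_start', 'epsilon_final', and 'epsilon_frames'; 'epsilon_final'
# is an assumption, the other two keys appear elsewhere in these snippets.
class EpsilonTracker:
    """Linearly anneal selector.epsilon from epsilon_start down to
    epsilon_final over the first epsilon_frames frames."""

    def __init__(self, selector, params):
        self.selector = selector
        self.params = params
        self.frame(0)

    def frame(self, frame_idx):
        self.selector.epsilon = max(
            self.params['epsilon_final'],
            self.params['epsilon_start'] - frame_idx / self.params['epsilon_frames'])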
def execute(args, params, device):
    utils.kill_game_processes()
    env = main.make_env(args, params)
    result_name1, writer1, net1, tgt_net1, agent1, exp_source1, buffer1, optimizer1 = \
        main.make_components(args, params, device, env, 0)
    net2 = ptan.agent.TargetNet(net1)
    # The frozen opponent must act through the periodically synced copy
    # (net2.target_model), not through net1 directly; otherwise the
    # net2.sync() call below has no effect.
    agent2 = ptan.agent.DQNAgent(lambda x: net2.target_model.qvals(x),
                                 ptan.actions.ArgmaxActionSelector(), device=device)
    frame = 0
    frame_idx1 = 0
    date_time = datetime.now().strftime("%b%d_%H-%M-%S")
    with common.RewardTracker(writer1, params['stop_reward_player1'], net1,
                              date_time + result_name1 + ".dat", 0, env) as reward_tracker1:
        # fill history
        main.train(params, buffer1, device, frame_idx1, exp_source1, reward_tracker1,
                   optimizer1, net1, tgt_net1, writer1)
        while True:
            if frame // args.units % 2 == 0:
                state, _, _, _ = env.step((1, -1))
                action, _ = agent2([state])
                state, reward, done, _ = env.step((1, action[0]))
                if done:
                    state = env.reset()
            else:
                frame_idx1 += 1
                if main.train(params, buffer1, device, frame_idx1, exp_source1,
                              reward_tracker1, optimizer1, net1, tgt_net1, writer1):
                    break
                if args.maxFrames > 0 and frame_idx1 > args.maxFrames:
                    break
            frame += 1
            if frame % NET_SYNC == 0:
                net2.sync()
def play_func(params, net, cuda, exp_queue):
    env = gym.make(params.env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    device = torch.device("cuda" if cuda else "cpu")
    writer = SummaryWriter(comment="-" + params.run_name + "-03_parallel")
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params.epsilon_start)
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params.gamma,
                                                           steps_count=1)
    exp_source_iter = iter(exp_source)
    frame_idx = 0
    with common.RewardTracker(writer, params.stop_reward) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)
            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
                    break
    exp_queue.put(None)
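# common.RewardTracker appears throughout these loops with slightly different
# signatures; the two-argument form used here is the simplest. A minimal sketch
# of that form, assuming the writer/stop-reward behavior implied by the call
# sites (the richer variants that also take a net, a save path, or telemetry
# flags are project-specific extensions not sketched here):
import time
import numpy as np


class RewardTracker:
    """Context manager sketch: collect per-episode rewards, log speed and a
    100-episode moving average to TensorBoard, and have reward() return True
    once the moving average passes stop_reward."""

    def __init__(self, writer, stop_reward):
        self.writer = writer
        self.stop_reward = stop_reward

    def __enter__(self):
        self.ts = time.time()
        self.ts_frame = 0
        self.total_rewards = []
        return self

    def __exit__(self, *args):
        self.writer.close()

    def reward(self, reward, frame, epsilon=None):
        self.total_rewards.append(reward)
        speed = (frame - self.ts_frame) / (time.time() - self.ts)
        self.ts_frame = frame
        self.ts = time.time()
        mean_reward = np.mean(self.total_rewards[-100:])
        self.writer.add_scalar("speed", speed, frame)
        self.writer.add_scalar("reward_100", mean_reward, frame)
        if epsilon is not None:
            self.writer.add_scalar("epsilon", epsilon, frame)
        return mean_reward > self.stop_reward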
def grads_func(proc_name, net, cuda, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]
    agent = ptan.agent.PolicyAgent(lambda x: net(x)[0], cuda=cuda, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(envs, agent, gamma=GAMMA,
                                                           steps_count=REWARD_STEPS)
    batch = []
    frame_idx = 0
    writer = SummaryWriter(comment=proc_name)
    with common.RewardTracker(writer, stop_reward=REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
            for exp in exp_source:
                frame_idx += 1
                new_rewards = exp_source.pop_total_rewards()
                if new_rewards and tracker.reward(new_rewards[0], frame_idx):
                    break
                batch.append(exp)
                if len(batch) < GRAD_BATCH:
                    continue
                states_v, actions_t, vals_ref_v = \
                    common.unpack_batch(batch, net, last_val_gamma=GAMMA ** REWARD_STEPS,
                                        cuda=cuda)
                batch.clear()
                net.zero_grad()
                logits_v, value_v = net(states_v)
                loss_value_v = F.mse_loss(value_v, vals_ref_v)
                log_prob_v = F.log_softmax(logits_v, dim=1)
                adv_v = vals_ref_v - value_v.detach()
                log_prob_actions_v = adv_v * log_prob_v[range(GRAD_BATCH), actions_t]
                loss_policy_v = -log_prob_actions_v.mean()
                prob_v = F.softmax(logits_v, dim=1)
                entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()
                loss_v = entropy_loss_v + loss_value_v + loss_policy_v
                loss_v.backward()
                tb_tracker.track("advantage", adv_v, frame_idx)
                tb_tracker.track("values", value_v, frame_idx)
                tb_tracker.track("batch_rewards", vals_ref_v, frame_idx)
                tb_tracker.track("loss_entropy", entropy_loss_v, frame_idx)
                tb_tracker.track("loss_policy", loss_policy_v, frame_idx)
                tb_tracker.track("loss_value", loss_value_v, frame_idx)
                tb_tracker.track("loss_total", loss_v, frame_idx)
                # clip_grad_norm_ is the in-place, non-deprecated form
                nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
                # gather gradients
                grads = [param.grad.data.cpu().numpy() if param.grad is not None else None
                         for param in net.parameters()]
                train_queue.put(grads)
    train_queue.put(None)
def execute(args, params, device):
    utils.kill_game_processes()
    env = main.make_env(args, params)
    net1 = dqn_model.RainbowDQN(env.observation_space.shape, env.action_space.n)
    net1.load_state_dict(torch.load(args.model1, map_location=lambda storage, loc: storage))
    agent1 = ptan.agent.DQNAgent(lambda x: net1.qvals(x), ptan.actions.ArgmaxActionSelector(),
                                 device=torch.device("cpu"))
    result_name = "-rainbow-scenario=" + args.scenario + "-units=" + str(args.units)
    writer1 = SummaryWriter(comment=result_name + "-player0")
    env.reset()
    total_reward1 = 0.0
    counter1 = collections.Counter()
    epsilon = 0.02
    frame_idx1 = 0
    with common.RewardTracker(writer1, 100, net1, "x.dat", 0, env) as reward_tracker1:
        while True:
            frame_idx1 += 1
            if np.random.random() < epsilon:
                action = [env.action_space.sample()]
            else:
                state, _, _, _ = env.step((0, -1))
                action, _ = agent1([state], [None])
            counter1[action[0]] += 1
            _, reward, done, _ = env.step((0, action[0]))
            total_reward1 += reward
            if done:
                reward_tracker1.reward(total_reward1, frame_idx1)
                total_reward1 = 0.0
                env.reset()
                # reload the latest checkpoint for the next episode
                net1.load_state_dict(torch.load(args.model1,
                                                map_location=lambda storage, loc: storage))
            if args.maxFrames > 0 and frame_idx1 > args.maxFrames:
                break
def play_func(params, net, cuda, fsa, exp_queue, fsa_nvec=None):
    device = torch.device("cuda" if cuda else "cpu")
    env = make_env(params)
    writer = SummaryWriter(comment="-" + params['run_name'] + "-05_new_wrappers")
    if not fsa:
        selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
        epsilon_tracker = common.EpsilonTracker(selector, params)
        agent = ptan.agent.DQNAgent(net, selector, device=device, fsa=fsa)
    else:
        if 'Index' in net.__class__.__name__:
            selector = ptan.actions.EpsilonGreedyActionSelectorFsa(
                fsa_nvec, epsilon=params['epsilon_start'])
            epsilon_tracker = common.IndexedEpsilonTracker(selector, params, fsa_nvec)
            agent = ptan.agent.DQNAgent(net, selector, device=device, fsa=fsa,
                                        epsilon_tracker=epsilon_tracker)
        else:
            selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
            epsilon_tracker = common.EpsilonTracker(selector, params)
            agent = ptan.agent.DQNAgent(net, selector, device=device, fsa=fsa)
            # epsilon_tracker = common.IndexedEpsilonTrackerNoStates(selector, params, fsa_nvec)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                                                           steps_count=1)
    exp_source_iter = iter(exp_source)
    frame_idx = 0
    with common.RewardTracker(writer, params['stop_reward'], params['telemetry'],
                              params['plot']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)
            if not fsa or 'Index' not in net.__class__.__name__:
                epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            new_scores = exp_source.pop_total_scores()
            if new_rewards:
                new_score = [] if not new_scores else new_scores[0]
                if not fsa or 'Index' not in net.__class__.__name__:
                    if reward_tracker.reward(new_rewards[0], new_score, frame_idx,
                                             selector.epsilon, params['plot']):
                        break
                else:
                    if reward_tracker.reward(new_rewards[0], new_score, frame_idx,
                                             selector.epsilon_dict, params['plot']):
                        break
    exp_queue.put(None)
def execute(args, params, device):
    utils.kill_game_processes()
    env = main.make_env(args, params)
    result_name, writer, net, tgt_net, agent, exp_source, buffer, optimizer = \
        main.make_components(args, params, device, env, 0)
    frame_idx = 0
    date_time = datetime.now().strftime("%b%d_%H-%M-%S")
    with common.RewardTracker(writer, params['stop_reward_player1'], net,
                              date_time + result_name + ".dat", 0, env) as reward_tracker:
        while True:
            frame_idx += 1
            if main.train(params, buffer, device, frame_idx, exp_source, reward_tracker,
                          optimizer, net, tgt_net, writer):
                break
            if args.maxFrames > 0 and frame_idx > args.maxFrames:
                break
def play_func(params, net, cuda, exp_queue, device_id):
    env_name = params['env_name']
    run_name = params['run_name']
    if 'max_games' not in params:
        max_games = 16000
    else:
        max_games = params['max_games']
    env = gym.make(env_name)
    env = ptan.common.wrappers.wrap_dqn(env)
    device = torch.device("cuda:{}".format(device_id) if cuda else "cpu")
    if 'save_iter' not in params:
        save_iter = 500
    else:
        save_iter = params['save_iter']
    writer = SummaryWriter(comment="-" + params['run_name'] + "-03_parallel")
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                                                           steps_count=1)
    exp_source_iter = iter(exp_source)
    fh = open('models/{}_metadata.csv'.format(run_name), 'w')
    out_csv = csv.writer(fh)
    frame_idx = 0
    game_idx = 1
    model_count = 0
    model_stats = []
    mean_rewards = []
    best_reward = 0
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)
            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                status, num_games, mean_reward, epsilon_str = reward_tracker.reward(
                    new_rewards[0], frame_idx, selector.epsilon)
                mean_rewards.append(mean_reward)
                if status:
                    break
                if game_idx and (game_idx % save_iter == 0):
                    # write to disk
                    np.savetxt('models/{}_reward.txt'.format(run_name),
                               np.array(mean_rewards))
                    if mean_reward > best_reward:
                        print("Saving model...")
                        model_name = 'models/{}_{}.pth'.format(run_name, game_idx)
                        torch.save(net, model_name)
                        new_row = [model_name, num_games, mean_reward, epsilon_str]
                        out_csv.writerow(new_row)
                        best_reward = mean_reward
                if game_idx == max_games:
                    break
                game_idx += 1
    print("Saving final model...")
    model_name = 'models/{}_{}.pth'.format(run_name, game_idx)
    net.to(torch.device('cpu'))
    torch.save(net, model_name)
    net.to(device)
    new_row = [model_name, num_games, mean_reward, epsilon_str]
    out_csv.writerow(new_row)
    np.savetxt('models/{}_reward.txt'.format(run_name), np.array(mean_rewards))
    # plt.figure(figsize=(16, 9))
    # plt.tight_layout()
    # plt.title('Reward vs time, {}'.format(run_name))
    # plt.xlabel('Iteration')
    # plt.ylabel('Reward')
    # ys = np.array(mean_rewards)
    # plt.plot(ys, c='r')
    # plt.savefig('models/{}_reward.png'.format(run_name))
    # plt.close()
    fh.close()
    exp_queue.put(None)
# Initialise weights and copy from net to target net
tf.global_variables_initializer().run()
sync_nets.run()
# Action selector
selector = rl.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
# Epsilon decay schedule
epsilon_tracker = common.EpsilonTracker(selector, params)
# DQN agent
agent = rl.agent.DQNAgent(state, net_q, selector)
# Experience source
exp_source = rl.experience_ptan.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'])
# Memory buffer
buffer = rl.experience_ptan.ExperienceReplayBuffer(exp_source,
                                                   buffer_size=params['replay_size'])

frame_idx = 0
with common.RewardTracker(writer) as reward_tracker:
    # Initial save
    saver.save(sess, save_dir, global_step=global_step)
    while frame_idx < total_frames:
        frame_idx += 1
        buffer.populate(1)
        epsilon_tracker.frame(frame_idx)
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon)
        # Don't train while filling memory
        if len(buffer) < rep_init:
            continue
def main(_config, _run):
    logger = _run
    SAVE_NAME = _config['SAVE_NAME']
    LOAD_SAVED_MODEL = _config['LOAD_SAVED_MODEL']
    MODEL_PATH_FINAL = _config['MODEL_PATH_FINAL']
    total_steps = 1000000
    params = common.HYPERPARAMS['gamePlay2']
    params['epsilon_frames'] *= 2
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    args = parser.parse_args()
    env = gym.make(params['env_name'], glob_conf=_config, logger=logger)
    # env = ptan.common.wrappers.wrap_dqn(env)
    writer = SummaryWriter(comment="-" + params['run_name'] + "-rainbow-beta200")
    net = RainbowDQN(env.observation_space.shape, env.action_space.n).to(device)
    # The optimizer must exist before a checkpoint can be restored into it.
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
    name_load = current_path + "/models" + MODEL_PATH_FINAL
    if _config['LOAD_SAVED_MODEL']:
        mdl, opt, lss = load_ckp(MODEL_PATH_FINAL, net, optimizer)
        net = mdl
        optimizer = opt
    tgt_net = ptan.agent.TargetNet(net)
    agent = ptan.agent.DQNAgent(lambda x: net.qvals(x),
                                ptan.actions.ArgmaxActionSelector(), device=device)
    # change steps_count to change the multi-step prediction horizon
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                                                           steps_count=REWARD_STEPS)
    buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'],
                                                     PRIO_REPLAY_ALPHA)
    today = datetime.datetime.now()
    todays_date_full = str(today.year) + "_" + str(today.month) + "_" + str(today.day) + "_"
    todays_date_full += str(today.hour) + "_" + str(today.minute) + "_" + str(today.second)
    folder_name = todays_date_full + "_" + experiment_name
    results_dir = current_path + "/results/" + folder_name
    results_dir_weights = results_dir + "/weights"
    os.makedirs(results_dir)
    os.makedirs(results_dir_weights)
    frame_idx = 0
    beta = BETA_START
    best_mean_reward = 0.0
    eval_states = None
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while frame_idx < total_steps:
            frame_idx += 1
            buffer.populate(1)
            beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                # start saving the model after actual training begins
                if frame_idx > 100 and best_mean_reward < reward_tracker.mean_reward:
                    torch.save(net.state_dict(), SAVE_NAME + "-best.dat")
                    print("Best mean reward updated %.3f -> %.3f, model saved"
                          % (best_mean_reward, reward_tracker.mean_reward))
                    if not reward_tracker.mean_reward == 0:
                        best_mean_reward = reward_tracker.mean_reward
                if reward_tracker.reward(new_rewards[0], frame_idx):
                    break
            if len(buffer) < params['replay_initial']:
                continue
            if eval_states is None:
                eval_states, _, _ = buffer.sample(STATES_TO_EVALUATE, beta)
                eval_states = [np.array(transition.state, copy=False)
                               for transition in eval_states]
                eval_states = np.array(eval_states, copy=False)
            optimizer.zero_grad()
            batch, batch_indices, batch_weights = buffer.sample(params['batch_size'], beta)
            loss_v, sample_prios_v = calc_loss(batch, batch_weights, net,
                                               tgt_net.target_model,
                                               params['gamma'] ** REWARD_STEPS, device=device)
            # if frame_idx % 10000 == 0:
            if frame_idx % 5000 == 0:
                checkpoint = {
                    'model': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'loss': loss_v,
                    'num_step': frame_idx,
                }
                torch.save(checkpoint,
                           results_dir_weights + "/rainbow" + str(frame_idx) + "step.dat")
                # Save network parameters as histograms
                for name, param in net.named_parameters():
                    writer.add_histogram(name, param.clone().cpu().data.numpy(), frame_idx)
            loss_v.backward()
            optimizer.step()
            buffer.update_priorities(batch_indices, sample_prios_v.data.cpu().numpy())
            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
            if logger:
                logger.log_scalar("loss", loss_v.item())
                logger.log_scalar("mean_reward", reward_tracker.mean_reward)
"""자식 process가 메인 process에 데이터 전달""" train_queue = mp.Queue( maxsize=PROCESSES_COUNT) #꽉찬 큐에는 새로 입력 불가능 (for on-policy) data_proc_list = [] for _ in range(PROCESSES_COUNT): #자식 별 data_proc = mp.Process(target=data_func, args=(net, device, train_queue)) data_proc.start() #data_fun()이 자식 process에서 실행 data_proc_list.append(data_proc) """학습""" batch = [] step_idx = 0 try: with common.RewardTracker(writer, stop_reward=REWARD_BOUND) as tracker: with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker: while True: train_entry = train_queue.get() #queue에 있는게 reward if isinstance(train_entry, TotalReward): if tracker.reward(train_entry.reward, step_idx): break continue #queue에 있는게 reward가 아닌 expsource객체 (에피소드가 끝남) step_idx += 1 batch.append(train_entry)
def main():
    global params_save_file
    game = 'spaceinvaders'
    params_save_file += '-' + game
    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    args = parser.parse_args()
    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])
    print("Parameters:")
    print(params)
    sys.stdout.flush()
    writer = SummaryWriter(comment="-" + params['run_name'] + "-prio-replay")
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                                                           steps_count=1)
    buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'],
                                                     PRIO_REPLAY_ALPHA)
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
    frame_idx = 0
    beta = BETA_START
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            buffer.populate(params['steps'])
            epsilon_tracker.frame(frame_idx)
            beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                writer.add_scalar("beta", beta, frame_idx)
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon,
                                         last_dq_losses):
                    break
            if len(buffer) < params['replay_initial']:
                continue
            optimizer.zero_grad()
            batch, batch_indices, batch_weights = buffer.sample(
                params['batch_size'] * params['steps'], beta)
            loss_v, sample_prios = calc_loss(batch, batch_weights, net, tgt_net.target_model,
                                             params["gamma"], cuda=args.cuda)
            loss_v.backward()
            optimizer.step()
            buffer.update_priorities(batch_indices, sample_prios)
            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))
    torch.save(net.state_dict(), params_save_file + str(frame_idx))
def main():
    global params_save_file
    game = 'revenge'
    params_save_file += '-' + game
    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    args = parser.parse_args()
    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])
    print("Parameters:")
    print(params)
    sys.stdout.flush()
    writer = SummaryWriter(comment="-" + params['run_name'] + "-dqfd(PDD DQN)")
    net = dqn_model.DuelingDQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)
    demo_data = demo_data_reader.get_demo_data(env, game, num_states=params['demo_size'],
                                               skip=params['skip-frames'])
    exp_source = ptan.experience.ExperienceSourceNFirstLast(env, agent, gamma=params['gamma'],
                                                            steps_count=params['n-steps'],
                                                            demo_data=demo_data)
    buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'],
                                                     PRIO_REPLAY_ALPHA)
    buffer.populate_demo_data()
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'],
                           weight_decay=L2_REG_LAMBDA)
    print("Demo data size: {}".format(buffer.demo_samples))
    sys.stdout.flush()
    frame_idx = 0
    beta = BETA_START
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            if frame_idx > params['pretrain_steps']:
                buffer.populate(params['steps'])
            else:
                # pre-training phase: learn from demo data only
                if frame_idx % 500 == 0:
                    writer.add_scalar("beta", beta, frame_idx)
                    reward_tracker.record_training(frame_idx, selector.epsilon,
                                                   last_dq_losses, last_n_losses,
                                                   last_e_losses, last_demo_sizes)
            epsilon_tracker.frame(frame_idx)
            beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                writer.add_scalar("beta", beta, frame_idx)
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon,
                                         last_dq_losses, last_n_losses, last_e_losses,
                                         last_demo_sizes):
                    break
            optimizer.zero_grad()
            batch, batch_indices, batch_weights = buffer.sample(
                params['batch_size'] * params['steps'], beta)
            batch_demo_mask = (np.array(batch_indices) < buffer.demo_samples).astype(np.uint8)
            loss_v, sample_prios = calc_loss(batch, batch_demo_mask, batch_weights, net,
                                             tgt_net.target_model, params["gamma"],
                                             params["gamma"] ** params['n-steps'],
                                             cuda=args.cuda)
            loss_v.backward()
            optimizer.step()
            buffer.update_priorities(batch_indices, sample_prios)
            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))
    torch.save(net.state_dict(), params_save_file + str(frame_idx))
def train_model(cuda, phase, premodel, pdays):
    """
    cuda : True / False
    phase : 1~3
    premodel: data/phase1_model.data
    pdays: integer
    """
    device = torch.device("cuda" if cuda else "cpu")
    phase = int(phase)
    if phase == 1:
        config = sconfig
    elif phase == 2:
        config = mconfig
    elif phase == 3:
        config = pconfig
    run_name = "v" + config.version + "-phase" + str(phase)
    saves_path = os.path.join("saves", run_name)
    os.makedirs(saves_path, exist_ok=True)
    save_name = ""
    predict_days = None  # only set in phase 3; recorded in the metadata below
    writer = SummaryWriter(comment=run_name)
    prices_list, val_prices_list = data.load_prices(config.choices)
    if phase == 1:
        s_env = environ.StocksEnvS(prices_list)
        stock_env = s_env
        val_stock_env = environ.StocksEnvS(val_prices_list)
        save_name = "{}.data".format(run_name)
    elif phase == 2:
        # Load the phase 1 network graph.
        s_env = environ.StocksEnvS(prices_list)
        prenet = models.SimpleFFDQN(s_env.observation_space.shape[0],
                                    s_env.action_space.n)  # .to(device)
        models.load_model(premodel, prenet)
        # Create the phase 2 environments.
        stock_env = environ.StocksEnvM(prices_list, prenet)
        val_stock_env = environ.StocksEnvM(val_prices_list, prenet)
        save_name = "{}.data".format(run_name)
    elif phase == 3:
        predict_days = int(pdays)
        # use the supplied horizon, and held-out prices for validation
        stock_env = pdenviron.PredEnv(prices_list=prices_list, predict_days=predict_days)
        val_stock_env = pdenviron.PredEnv(prices_list=val_prices_list,
                                          predict_days=predict_days)
        save_name = "{}-{}.data".format(run_name, predict_days)
    net = models.SimpleFFDQN(stock_env.observation_space.shape[0],
                             stock_env.action_space.n).to(device)
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(config.epsilon_start)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        stock_env, agent, config.gamma, steps_count=config.reward_steps)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, config.replay_size)
    optimizer = optim.Adam(net.parameters(), lr=config.learning_rate)

    # main training loop
    step_idx = 0
    eval_states = None
    best_mean_val = None
    with common.RewardTracker(writer, np.inf, group_rewards=100) as reward_tracker:
        while step_idx < config.end_step:
            step_idx += 1
            buffer.populate(1)
            selector.epsilon = max(config.epsilon_stop,
                                   config.epsilon_start - step_idx / config.epsilon_steps)
            new_rewards = exp_source.pop_rewards_steps()
            if new_rewards:
                reward_tracker.reward(new_rewards[0], step_idx, selector.epsilon)
            if len(buffer) < config.replay_initial:
                continue
            if eval_states is None:
                print("Initial buffer populated, start training")
                eval_states = buffer.sample(config.states_to_evaluate)
                eval_states = [np.array(transition.state, copy=False)
                               for transition in eval_states]
                eval_states = np.array(eval_states, copy=False)
            if step_idx % config.eval_every_step == 0:
                mean_val = common.calc_values_of_states(eval_states, net, device=device)
                writer.add_scalar("values_mean", mean_val, step_idx)
                if best_mean_val is None or best_mean_val < mean_val:
                    if best_mean_val is not None:
                        print("%d: Best mean value updated %.3f -> %.3f"
                              % (step_idx, best_mean_val, mean_val))
                    best_mean_val = mean_val
                    # torch.save(net.state_dict(),
                    #            os.path.join(saves_path, "mean_val-%.3f.data" % mean_val))
            optimizer.zero_grad()
            batch = buffer.sample(config.batch_size)
            loss_v = common.calc_loss(batch, net, tgt_net.target_model,
                                      config.gamma ** config.reward_steps, device=device)
            loss_v.backward()
            optimizer.step()
            if step_idx % config.target_net_sync == 0:
                tgt_net.sync()
            if step_idx % config.checkpoint_every_step == 0:
                idx = step_idx // config.checkpoint_every_step
                torch.save(net.state_dict(),
                           os.path.join(saves_path, "checkpoint-%d.data" % idx))
            if step_idx % config.validation_every_step == 0:
                res = validation.validation_run(stock_env, net, device=device)
                for key, val in res.items():
                    writer.add_scalar(key + "_test", val, step_idx)
                res = validation.validation_run(val_stock_env, net, device=device)
                for key, val in res.items():
                    writer.add_scalar(key + "_val", val, step_idx)
    models.save_model(os.path.join(saves_path, save_name), net,
                      {"predict_days": predict_days})
train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
data_proc_list = []
for _ in range(PROCESSES_COUNT):
    data_proc = mp.Process(target=data_func, args=(net, device, train_queue))
    data_proc.start()
    data_proc_list.append(data_proc)

batch_states = []
batch_actions = []
batch_vals_ref = []
step_idx = 0
batch_size = 0
try:
    with common.RewardTracker(writer, REWARD_BOUND) as tracker:
        with ptan.common.utils.TBMeanTracker(writer, 100) as tb_tracker:
            while True:
                train_entry = train_queue.get()
                if isinstance(train_entry, TotalReward):
                    if tracker.reward(train_entry.reward, step_idx):
                        break
                    continue
                states_t, actions_t, vals_ref_t = train_entry
                batch_states.append(states_t)
                batch_actions.append(actions_t)
                batch_vals_ref.append(vals_ref_t)
                step_idx += states_t.size()[0]
                batch_size += states_t.size()[0]
                if batch_size < BATCH_SIZE:
                    continue  # keep accumulating until a full batch is collected
def play_func(params, net, cuda, exp_queue, device_id):
    """
    With multiple envs, the exp_source class returns experiences (defined as a
    tuple of (state_framestack, action, reward, last_state_framestack)),
    alternating between the two environments. Otherwise it returns experiences
    from just a single env. Even if the games have different frame shapes,
    they will be reduced to 84x84.

    *** There is a reason that it reinitializes the envs in this function,
    which has to do with parallelization ***
    """
    run_name = 'demon_invaders'
    if 'max_games' not in params:
        max_games = 16000
    else:
        max_games = params['max_games']
    envSI = gym.make('SpaceInvadersNoFrameskip-v4')
    envSI = ptan.common.wrappers.wrap_dqn(envSI)
    envDA = gym.make('DemonAttackNoFrameskip-v4')
    envDA = ptan.common.wrappers.wrap_dqn(envDA)
    device = torch.device("cuda:{}".format(device_id) if cuda else "cpu")
    if 'save_iter' not in params:
        save_iter = 500
    else:
        save_iter = params['save_iter']
    writer = SummaryWriter(comment="-" + run_name + "-03_parallel")
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        [envSI, envDA], agent, gamma=params['gamma'], steps_count=1)
    exp_source_iter = iter(exp_source)
    fh = open('models_multi/{}_metadata.csv'.format(run_name), 'w')
    out_csv = csv.writer(fh)
    frame_idx = 0
    game_idx = 1
    model_count = 0
    model_stats = []
    mean_rewards = []
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)
            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                status, num_games, mean_reward, epsilon_str = reward_tracker.reward(
                    new_rewards[0], frame_idx, selector.epsilon)
                mean_rewards.append(mean_reward)
                if status:
                    break
                if game_idx and (game_idx % save_iter == 0):
                    # write to disk
                    print("Saving model...")
                    model_name = 'models_multi/{}_{}_{}.pth'.format(
                        run_name, params['secondary'], game_idx)
                    net.to(torch.device('cpu'))
                    torch.save(net, model_name)
                    net.to(device)
                    new_row = [model_name, num_games, mean_reward, epsilon_str]
                    out_csv.writerow(new_row)
                    np.savetxt('models_multi/{}_{}_reward.txt'.format(
                        run_name, params['secondary']), np.array(mean_rewards))
                if game_idx == max_games:
                    break
                game_idx += 1
    print("Saving final model...")
    model_name = 'models_multi/{}_{}_{}.pth'.format(run_name, params['secondary'], game_idx)
    net.to(torch.device('cpu'))
    torch.save(net, model_name)
    net.to(device)
    new_row = [model_name, num_games, mean_reward, epsilon_str]
    out_csv.writerow(new_row)
    np.savetxt('models_multi/{}_{}_reward.txt'.format(run_name, params['secondary']),
               np.array(mean_rewards))
    # plt.figure(figsize=(16, 9))
    # plt.tight_layout()
    # plt.title('Reward vs time, {}'.format(run_name))
    # plt.xlabel('Iteration')
    # plt.ylabel('Reward')
    # ys = np.array(mean_rewards)
    # plt.plot(ys, c='r')
    # plt.savefig('models_multi/{}_reward.png'.format(run_name))
    # plt.close()
    fh.close()
    exp_queue.put(None)
def train_agent(
    run_name,
    data_paths=conf.default_data_paths,
    validation_paths=conf.default_validation_paths,
    model=models.DQNConv1D,
    large=False,
    load_checkpoint=None,
    saves_path=None,
    eps_steps=None,
):
    """
    Main function for training the agents

    :run_name: a string of choice that dictates where to save
    :data_paths: dict specifying what data to train with
    :validation_paths: dict specifying what data to validate with
    :model: what model to use
    :large: whether or not to use the large feature set
    :load_checkpoint: an optional path to a checkpoint to load from
    """
    print("=" * 80)
    print("Training starting".rjust(40 + 17 // 2))
    print("=" * 80)

    # Get training data
    stock_data = data.get_data_as_dict(data_paths, large=large)
    val_data = data.get_data_as_dict(validation_paths, large=large)

    # Setup before training can begin
    step_idx = 0
    eval_states = None
    best_mean_val = None
    EPSILON_STEPS = eps_steps if eps_steps is not None else conf.EPSILON_STEPS

    # Use GPU if available, else fall back on CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[Info] Using device: {device}")

    # Set up the path to save the checkpoints to
    if saves_path is None:
        saves_path = os.path.join("saves", run_name)
    else:
        saves_path = os.path.join(saves_path, run_name)
    print(f"[Info] Saving to path: {saves_path}")
    os.makedirs(saves_path, exist_ok=True)

    # Create the gym environment that the agent interacts with during training
    env = environ.StocksEnv(
        stock_data,
        bars_count=conf.BARS_COUNT,
        reset_on_close=conf.RESET_ON_CLOSE,
        random_ofs_on_reset=conf.RANDOM_OFS_ON_RESET,
        reward_on_close=conf.REWARD_ON_CLOSE,
        large=large,
    )
    env = wrappers.TimeLimit(env, max_episode_steps=1000)

    # Create the gym environment that the agent interacts with when validating
    env_val = environ.StocksEnv(
        val_data,
        bars_count=conf.BARS_COUNT,
        reset_on_close=conf.RESET_ON_CLOSE,
        random_ofs_on_reset=conf.RANDOM_OFS_ON_RESET,
        reward_on_close=conf.REWARD_ON_CLOSE,
        large=large,
    )

    # Create the model
    net = model(env.observation_space.shape, env.action_space.n).to(device)
    print("Using network:".rjust(40 + 14 // 2))
    print("=" * 80)
    print(net)

    # Initialize the agent and the epsilon-greedy action selector from the
    # ptan package, which provides helper and wrapper functions for
    # reinforcement learning
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(conf.EPSILON_START)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, conf.GAMMA, steps_count=conf.REWARD_STEPS)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, conf.REPLAY_SIZE)
    optimizer = optim.Adam(net.parameters(), lr=conf.LEARNING_RATE)

    # If a checkpoint is supplied to the function -> resume training from there
    if load_checkpoint is not None:
        state = torch.load(load_checkpoint)
        net.load_state_dict(state["model_state_dict"])
        optimizer.load_state_dict(state["optimizer_state_dict"])
        step_idx = state["step_idx"]
        best_mean_val = state["best_mean_val"]
        print(f"State loaded -> step index: {step_idx}, best mean val: {best_mean_val}")
        net.train()

    # Create a reward tracker, i.e. an object that keeps track of the rewards
    # the agent gets during training
    reward_tracker = common.RewardTracker(np.inf, group_rewards=100)

    # The main training loop
    print("Training loop starting".rjust(40 + 22 // 2))
    print("=" * 80)
    while True:
        step_idx += 1
        buffer.populate(1)

        # Get current epsilon for epsilon-greedy action selection
        selector.epsilon = max(conf.EPSILON_STOP,
                               conf.EPSILON_START - step_idx / EPSILON_STEPS)

        # Take a step and get rewards
        new_rewards = exp_source.pop_rewards_steps()
        if new_rewards:
            reward_tracker.reward(new_rewards[0], step_idx, selector.epsilon)

        # As long as not enough data is in the buffer, go back to the top
        if len(buffer) < conf.REPLAY_INITIAL:
            continue

        if eval_states is None:
            print("Initial buffer populated, start training")
            eval_states = buffer.sample(conf.STATES_TO_EVALUATE)
            eval_states = [np.array(transition.state, copy=False)
                           for transition in eval_states]
            eval_states = np.array(eval_states, copy=False)

        # Evaluate the model every EVAL_EVERY_STEP steps and update the
        # currently best performance if a better value is obtained
        if step_idx % conf.EVAL_EVERY_STEP == 0:
            mean_val = common.calc_values_of_states(eval_states, net, device=device)
            # If new best value -> save the model, both with metadata for
            # resuming training and as the full object for use in testing
            if best_mean_val is None or best_mean_val < mean_val:
                if best_mean_val is not None:
                    print(f"{step_idx}: Best mean value updated "
                          f"{best_mean_val:.3f} -> {mean_val:.3f}")
                best_mean_val = mean_val
                # Save checkpoint with metadata
                torch.save(
                    {
                        "model_state_dict": net.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "step_idx": step_idx,
                        "best_mean_val": best_mean_val,
                    },
                    os.path.join(saves_path, f"mean_val-{mean_val:.3f}.data"),
                )
                # Save full object for testing
                torch.save(net, os.path.join(saves_path,
                                             f"mean_val-{mean_val:.3f}-fullmodel.data"))

        # Reset the optimizer's gradients before the optimization step
        optimizer.zero_grad()
        batch = buffer.sample(conf.BATCH_SIZE)

        # Calculate the loss
        loss_v = common.calc_loss(
            batch, net, tgt_net.target_model, conf.GAMMA ** conf.REWARD_STEPS, device=device)
        # Calculate the gradients
        loss_v.backward()
        # Do one step of gradient descent
        optimizer.step()

        # Sync up the two networks we're using; a separate target network in
        # this manner should improve the agent's ability to converge
        if step_idx % conf.TARGET_NET_SYNC == 0:
            tgt_net.sync()

        # Every CHECKPOINT_EVERY_STEP steps, save the model in case something
        # happens, so we can resume training in that case
        if step_idx % conf.CHECKPOINT_EVERY_STEP == 0:
            idx = step_idx // conf.CHECKPOINT_EVERY_STEP
            torch.save(
                {
                    "model_state_dict": net.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "step_idx": step_idx,
                    "best_mean_val": best_mean_val,
                },
                os.path.join(saves_path, f"checkpoint-{idx}.data"),
            )
            torch.save(net, os.path.join(saves_path, f"fullmodel-{idx}.data"))

    print("Training done")
agent = ptan.agent.DQNAgent(net, selector, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                                                       steps_count=args.nsteps)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source,
                                                buffer_size=params['replay_size'])
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
frame_idx = 0
eval_states = None  # will be populated with held-out states

# create a reward tracker object
with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
    while True:
        frame_idx += 1
        # ExperienceReplayBuffer asks the ExperienceSourceFirstLast to iterate
        # by one step to get the next transition:
        #   - ExperienceSourceFirstLast feeds the observation to the agent
        #   - the agent computes Q-values through the NN
        #   - the action selector selects an action
        #   - the action is fed into the ExperienceSource to obtain the reward
        #     and the next observation
        #   - the buffer stores the transition in FIFO order
        buffer.populate(1)  # iterates the ExperienceReplayBuffer by one step,
        # which in turn iterates exp_source [ExperienceSourceFirstLast] by one step.
        # One single experience step:
        #   Experience = namedtuple('Experience', ['state', 'action', 'reward', 'done'])
        # The ExperienceSource class provides full subtrajectories of a given
        # length as a list of (s, a, r, s') objects. ExperienceSourceFirstLast
        # instead returns a single object on every iteration, which is again a
        # namedtuple with the following fields: state, action, reward, last_state
        # (last_state is None when the episode ended inside the n-step window).
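# For reference, the two tuple types the comments above describe look like the
# following in the ptan versions these snippets appear to target (worth
# double-checking against the installed version):
import collections

# one single experience step, as produced by ExperienceSource
Experience = collections.namedtuple(
    'Experience', field_names=['state', 'action', 'reward', 'done'])

# the compressed (first state, last state) form produced by
# ExperienceSourceFirstLast; last_state is None if the episode ended within
# the n-step window
ExperienceFirstLast = collections.namedtuple(
    'ExperienceFirstLast', field_names=['state', 'action', 'reward', 'last_state'])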
tgt_net = ptan.agent.TargetNet(net)
agent = ptan.agent.DQNAgent(lambda x: net.qvals(x), ptan.actions.ArgmaxActionSelector(),
                            device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                                                       steps_count=REWARD_STEPS)
buffer = ptan.experience.PrioritizedReplayBuffer(exp_source, params['replay_size'],
                                                 PRIO_REPLAY_ALPHA)
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

frame_idx = 0
beta = BETA_START
with common.RewardTracker(MODEL_NAME, net, writer, params['stop_reward']) as reward_tracker:
    while True:
        frame_idx += 1
        buffer.populate(1)
        beta = min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            if reward_tracker.reward(new_rewards[0], frame_idx):
                break
        if len(buffer) < params['replay_initial']:
            continue
        optimizer.zero_grad()
net = AtariA2C(envs[0].observation_space.shape, envs[0].action_space.n)
if args.cuda:
    net.cuda()
print(net)

agent = ptan.agent.ActorCriticAgent(net, apply_softmax=True, cuda=args.cuda)
exp_source = ptan.experience.ExperienceSourceRollouts(envs, agent, gamma=GAMMA,
                                                      steps_count=REWARD_STEPS)
optimizer = optim.RMSprop(net.parameters(), lr=LEARNING_RATE, eps=1e-5)

step_idx = 0
with common.RewardTracker(writer, stop_reward=18) as tracker:
    with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
        for mb_states, mb_rewards, mb_actions, mb_values in exp_source:
            # handle new rewards
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if tracker.reward(np.mean(new_rewards), step_idx):
                    break
            optimizer.zero_grad()
            states_v = Variable(torch.from_numpy(mb_states))
            mb_adv = mb_rewards - mb_values
            adv_v = Variable(torch.from_numpy(mb_adv))
            actions_t = torch.from_numpy(mb_actions)
            vals_ref_v = Variable(torch.from_numpy(mb_rewards))
data_proc_list = []
# Spawn processes to run data_func
for _ in range(PROCESSES_COUNT):
    data_proc = mp.Process(target=data_func, args=(net, device, train_queue))
    data_proc.start()
    data_proc_list.append(data_proc)

batch = []
step_idx = 0
try:
    # Run until the reward goal is reached
    with common.RewardTracker(writer, stop_reward=REWARD_BOUND) as tracker:
        # TBMeanTracker averages tracked values over groups of 100 before
        # writing them to TensorBoard
        with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
            while True:
                # Get one transition from the training queue
                train_entry = train_queue.get()
                # If an episode is over, we receive the total reward from
                # that episode
                if isinstance(train_entry, TotalReward):
                    finished, save_checkpoint = tracker.reward(train_entry.reward, step_idx)
                    if save_checkpoint:
                        torch.save(net.state_dict(),
                                   './checkpoints/' + args.name + "-best.dat")
                    if finished:
                        break  # reward goal reached
net.cuda()
tgt_net = ptan.agent.TargetNet(net)
agent = ptan.agent.DQNAgent(net, ptan.actions.ArgmaxActionSelector(), cuda=args.cuda)
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                                                       steps_count=1)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source,
                                                buffer_size=params['replay_size'])
optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

frame_idx = 0
with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
    while True:
        frame_idx += 1
        buffer.populate(1)
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            if reward_tracker.reward(new_rewards[0], frame_idx):
                break
        if len(buffer) < params['replay_initial']:
            continue
        optimizer.zero_grad()
        batch = buffer.sample(params['batch_size'])
        # trailing arguments assumed, following the calc_loss_dqn signature
        # used by comparable DQN loops
        loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model,
                                      gamma=params['gamma'], cuda=args.cuda)
writer = SummaryWriter(comment="-simple-" + args.run) net = models.SimpleFFDQN(env.observation_space.shape[0], env.action_space.n).to(device) tgt_net = ptan.agent.TargetNet(net) selector = ptan.actions.EpsilonGreedyActionSelector(EPSILON_START) agent = ptan.agent.DQNAgent(net, selector, device=device) exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, GAMMA, steps_count=REWARD_STEPS) buffer = ptan.experience.ExperienceReplayBuffer(exp_source, REPLAY_SIZE) optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) # main training loop step_idx = 0 eval_states = None best_mean_val = None with common.RewardTracker(writer, np.inf, group_rewards=100) as reward_tracker: while True: step_idx += 1 buffer.populate(1) selector.epsilon = max(EPSILON_STOP, EPSILON_START - step_idx / EPSILON_STEPS) new_rewards = exp_source.pop_rewards_steps() if new_rewards: reward_tracker.reward(new_rewards[0], step_idx, selector.epsilon) if len(buffer) < REPLAY_INITIAL: continue if eval_states is None: print("Initial buffer populated, start training") eval_states = buffer.sample(STATES_TO_EVALUATE)
def main():
    env = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    save_path = os.path.join("saves", "ddpg-")
    os.makedirs(save_path, exist_ok=True)
    device = torch.device("cuda")
    act_net = model.DDPGActor(env.observation_space.shape[0],
                              env.action_space.shape[0]).to(device)
    crt_net = model.D4PGCritic(env.observation_space.shape[0], env.action_space.shape[0],
                               N_ATOMS, Vmin, Vmax).to(device)
    print(act_net)
    print(crt_net)
    tgt_act_net = common.TargetNet(act_net)
    tgt_crt_net = common.TargetNet(crt_net)
    writer = SummaryWriter(comment="-d4pg_")
    agent = model.AgentDDPG(act_net, device=device)
    exp_source = experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA,
                                                      steps_count=REWARD_STEPS)
    buffer = experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)
    frame_idx = 0
    best_reward = None
    with common.RewardTracker(writer) as tracker:
        with common.TBMeanTracker(writer, batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                buffer.populate(1)
                rewards_steps = exp_source.pop_rewards_steps()
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track("episode_steps", steps[0], frame_idx)
                    tracker.reward(rewards[0], frame_idx)
                if len(buffer) < 100:
                    continue
                batch = buffer.sample(BATCH_SIZE)
                states_v, actions_v, rewards_v, dones_mask, last_states_v = \
                    common.unpack_batch_ddqn(batch, device)

                # train critic
                crt_opt.zero_grad()
                crt_distr_v = crt_net(states_v, actions_v)
                last_act_v = tgt_act_net.target_model(last_states_v)
                last_distr_v = F.softmax(tgt_crt_net.target_model(last_states_v, last_act_v),
                                         dim=1)
                proj_distr_v = distr_projection(last_distr_v, rewards_v, dones_mask,
                                                gamma=GAMMA ** REWARD_STEPS, device=device)
                prob_dist_v = -F.log_softmax(crt_distr_v, dim=1) * proj_distr_v
                critic_loss_v = prob_dist_v.sum(dim=1).mean()
                critic_loss_v.backward()
                crt_opt.step()
                tb_tracker.track("loss_critic", critic_loss_v, frame_idx)

                # train actor
                act_opt.zero_grad()
                cur_actions_v = act_net(states_v)
                crt_distr_v = crt_net(states_v, cur_actions_v)
                actor_loss_v = -crt_net.distr_to_q(crt_distr_v)
                actor_loss_v = actor_loss_v.mean()
                actor_loss_v.backward()
                act_opt.step()
                tb_tracker.track("loss_actor", actor_loss_v, frame_idx)

                # soft-update the target networks
                tgt_act_net.alpha_sync(alpha=1 - 1e-3)
                tgt_crt_net.alpha_sync(alpha=1 - 1e-3)

                if frame_idx % TEST_ITERS == 0:
                    print("testing")
                    env.reset()
                    ts = time.time()
                    rewards, steps = test_net(act_net, env, device=device)
                    print("Test done in %.2f sec, reward %.3f, steps %d"
                          % (time.time() - ts, rewards, steps))
                    writer.add_scalar("test_reward", rewards, frame_idx)
                    writer.add_scalar("test_steps", steps, frame_idx)
                    if best_reward is None or best_reward < rewards:
                        if best_reward is not None:
                            print("Best reward updated: %.3f -> %.3f"
                                  % (best_reward, rewards))
                        name = "best_%+.3f_%d.dat" % (rewards, frame_idx)
                        fname = os.path.join(save_path, name)
                        torch.save(act_net.state_dict(), fname)
                        best_reward = rewards
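# TargetNet.alpha_sync performs the soft target update used above. A minimal
# sketch of the wrapper and the blend it is assumed to implement (matching
# ptan's TargetNet, where alpha close to 1 keeps the target network
# slow-moving):
import copy


class TargetNet:
    """Sketch: keep a frozen copy of a model with hard and soft sync."""

    def __init__(self, model):
        self.model = model
        self.target_model = copy.deepcopy(model)

    def sync(self):
        # hard update: copy all weights at once
        self.target_model.load_state_dict(self.model.state_dict())

    def alpha_sync(self, alpha):
        # soft update: tgt = alpha * tgt + (1 - alpha) * online
        assert 0.0 < alpha < 1.0
        state = self.model.state_dict()
        tgt_state = self.target_model.state_dict()
        for k, v in state.items():
            tgt_state[k] = tgt_state[k] * alpha + (1 - alpha) * v
        self.target_model.load_state_dict(tgt_state)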
def main():
    global params_save_file
    game = 'spaceinvaders'
    params_save_file += '-' + game
    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    # Note: with default=True and action="store_true", this flag can never be
    # switched off from the command line; double DQN is always enabled.
    parser.add_argument("--double", default=True, action="store_true",
                        help="Enable double DQN")
    args = parser.parse_args()
    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])
    print("Parameters:")
    print(params)
    sys.stdout.flush()
    writer = SummaryWriter(comment="-" + params['run_name'] + "-double=" + str(args.double))
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'],
                                                           steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source,
                                                    buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
    frame_idx = 0
    eval_states = None
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            buffer.populate(params['steps'])
            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon):
                    break
            if len(buffer) < params['replay_initial']:
                continue
            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [np.array(transition.state, copy=False)
                               for transition in eval_states]
                eval_states = np.array(eval_states, copy=False)
            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'] * params['steps'])
            loss_v = calc_loss(batch, net, tgt_net.target_model, gamma=params['gamma'],
                               cuda=args.cuda, double=args.double)
            loss_v.backward()
            optimizer.step()
            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
            if frame_idx % EVAL_EVERY_FRAME == 0:
                mean_val = calc_values_of_states(eval_states, net, cuda=args.cuda)
                writer.add_scalar("values_mean", mean_val, frame_idx)
            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))
    torch.save(net.state_dict(), params_save_file + str(frame_idx))
def play_func(params, net, cuda, exp_queue, device_id):
    """
    The paper suggests sampling the actions from the learner net, so this
    requires little change from the multi-env implementation.

    *** There is a reason that it reinitializes the envs in this function,
    which has to do with parallelization ***
    """
    run_name = params['run_name']
    if 'max_games' not in params:
        max_games = 16000
    else:
        max_games = params['max_games']
    envSI = gym.make('SpaceInvadersNoFrameskip-v4')
    envSI = ptan.common.wrappers.wrap_dqn(envSI)
    envDA = gym.make('DemonAttackNoFrameskip-v4')
    envDA = ptan.common.wrappers.wrap_dqn(envDA)
    device = torch.device("cuda:{}".format(device_id) if cuda else "cpu")
    writer = SummaryWriter(comment="-" + run_name + "-03_parallel")
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ExperienceSourceFirstLast_AM([envSI, envDA], agent, gamma=params['gamma'],
                                              steps_count=1)
    exp_source_iter = iter(exp_source)
    fh = open('mimic_models/{}_metadata.csv'.format(run_name), 'w')
    out_csv = csv.writer(fh)
    frame_idx = 0
    game_idx = 1
    model_count = 0
    model_stats = []
    mean_rewards = []
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            exp = next(exp_source_iter)
            exp_queue.put(exp)
            epsilon_tracker.frame(frame_idx)
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                status, num_games, mean_reward, epsilon_str = reward_tracker.reward(
                    new_rewards[0], frame_idx, selector.epsilon)
                mean_rewards.append(mean_reward)
                if status:
                    break
                if game_idx and (game_idx % 500 == 0):
                    # write to disk
                    print("Saving model...")
                    model_name = 'mimic_models/{}_{}.pth'.format(run_name, game_idx)
                    net.to(torch.device('cpu'))
                    torch.save(net, model_name)
                    net.to(device)
                    new_row = [model_name, num_games, mean_reward, epsilon_str]
                    out_csv.writerow(new_row)
                    np.savetxt('mimic_models/{}_reward.txt'.format(run_name),
                               np.array(mean_rewards))
                if game_idx == max_games:
                    break
                game_idx += 1
    print("Saving final model...")
    model_name = 'mimic_models/{}_{}.pth'.format(run_name, game_idx)
    net.to(torch.device('cpu'))
    torch.save(net, model_name)
    net.to(device)
    new_row = [model_name, num_games, mean_reward, epsilon_str]
    out_csv.writerow(new_row)
    np.savetxt('mimic_models/{}_reward.txt'.format(run_name), np.array(mean_rewards))
    # plt.figure(figsize=(16, 9))
    # plt.tight_layout()
    # plt.title('Reward vs time, {}'.format(run_name))
    # plt.xlabel('Iteration')
    # plt.ylabel('Reward')
    # ys = np.array(mean_rewards)
    # plt.plot(ys, c='r')
    # plt.savefig('mimic_models/{}_reward.png'.format(run_name))
    # plt.close()
    fh.close()
    exp_queue.put(None)
tgt_act_net = ptan.agent.TargetNet(act_net)
tgt_crt_net = ptan.agent.TargetNet(crt_net)
writer = SummaryWriter(comment=f"-ddpg_{args.name}")
agent = dd_utils.AgentDDPG(act_net, device=device, clip_actions=params["clip_actions"])
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params["gamma"],
                                                       steps_count=1)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source,
                                                buffer_size=params["replay_size"])
# the original paper was using a different optimizer for each network
act_opt = torch.optim.Adam(act_net.parameters(), lr=params["lr"])
crt_opt = torch.optim.Adam(crt_net.parameters(), lr=params["lr"])

frame_idx = 0
best_reward = None
with common.RewardTracker(writer, stop_reward=params["stopping_reward"]) as tracker:
    with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            rewards_steps = exp_source.pop_rewards_steps()
            if rewards_steps:
                rewards, steps = zip(*rewards_steps)
                tb_tracker.track("episode_steps", steps[0], frame_idx)
                tracker.reward(rewards[0], frame_idx)
            if len(buffer) < params["replay_init"]:
                continue
            batch = buffer.sample(params["batch_size"])
            states_v, actions_v, rewards_v, dones_mask, last_states_v = \
                dd_utils.unpack_batch(batch, device)