def sim_test():
    global env_name, save_name, agent_args, env_real, env_sim, nets
    env_real = env_real.Env_real(False)
    env_sim = env_sim.Env_sim(True)
    GAT_model = nets.GAT_net(env_real, env_sim, GAT_args)
    agent = Agent(env_sim, agent_args)
    episodes = int(10)
    max_steps = 2000
    for episode in range(episodes):
        state = env_sim.reset()
        done = False
        score = 0
        step = 0
        while not done and step <= max_steps:
            action, clipped_action, value, cost_value = agent.get_action(
                state, False)
            # action transformer by GAT
            transformed_next_state = GAT_model.forward_transform(
                state, clipped_action)
            transformed_action = GAT_model.backward_transform(
                state, transformed_next_state)
            state, reward, done, info = env_sim.step(transformed_action)
            print(reward, '\t', info.get('cost', 0))
            score += reward
            step += 1
        print("score :", score)
def thread_func(t_idx):
    global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
        loss_logger, score_logger, graph
    env = gym.make(env_name)
    agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
    step = 0
    episode = 0
    while total_step < total_max_step:
        episode += 1
        # gradient reset & parameter synchronize
        agent.update_parameter(global_agent)
        start_step = step
        states = []
        actions = []
        rewards = []
        score = 0
        cnt = 0
        state = env.reset()
        while True:
            cnt += 1
            step += 1
            total_step += 1
            action = agent.get_action(state, True)
            next_state, reward, done, info = env.step(action)
            ####### modify reward function #######
            #reward = 200-cnt if done else 0
            reward += 10
            ####### modify reward function #######
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            score += reward
            if done or step - start_step == step_period:
                ret = 0 if done else agent.get_value(next_state)
                targets = []
                for i in range(len(states)):
                    ret = rewards[-i - 1] + gamma * ret
                    targets.append(ret)
                targets = targets[::-1]
                p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                    states, actions, targets)
                global_agent.update_with_gradients(p_grad, v_grad)
                #loss_logger.write([step-start_step, p_loss, v_loss])
                if done:
                    break
                agent.update_parameter(global_agent)
                start_step = step
                states = []
                actions = []
                rewards = []
            state = next_state
        #score_logger.write([cnt, score])
        if t_idx == 0:
            print(score)
            graph.update(score, p_loss, v_loss, entropy)
            if episode % 100 == 0:
                global_agent.save()
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    if env_name == 'DobroHalfCheetah-v0':
        env.unwrapped.initialize(is_render=False)
    agent = Agent(env, agent_args)
    v_loss_logger = Logger(save_name, 'v_loss')
    p_loss_logger = Logger(save_name, 'p_loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), agent.name)
    episodes = int(5e5)
    save_freq = 1
    save_period = 1000
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period)
    for episode in range(episodes):
        state = env.reset()
        agent.actor_noise.reset()
        done = False
        score = 0
        step = 0
        while not done:
            step += 1
            action = agent.get_action(state, True)
            next_state, reward, done, info = env.step(action)
            agent.replay_memory.append([
                np.array(state, np.float32), action, reward, done,
                np.array(next_state, np.float32)
            ])
            ########################
            if len(agent.replay_memory) > agent.train_start:
                v_loss, p_loss = agent.train()
                v_loss_logger.write([1, v_loss])
                p_loss_logger.write([1, p_loss])
                p_losses.append(p_loss)
                v_losses.append(v_loss)
                value = agent.get_value(state, action)
                entropies.append(value)
                scores.append(reward)
                graph.update(np.mean(scores), np.mean(p_losses),
                             np.mean(v_losses), np.mean(entropies))
            state = next_state
            score += reward
        print(episode, score, agent.epsilon)
        score_logger.write([step, score])
        if (episode + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            p_loss_logger.save()
            score_logger.save()
    graph.update(0, 0, 0, 0, finished=True)
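# The trainer above resets agent.actor_noise at the start of every episode, but the
# noise process itself is not shown in this section. Below is a minimal sketch of an
# Ornstein-Uhlenbeck exploration-noise class of the kind such off-policy agents
# commonly use; the class name and the theta/sigma/dt defaults are assumptions, not
# the repo's actual implementation.
import numpy as np

class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.reset()

    def reset(self):
        # restart the process from its mean at the start of each episode
        self.x = np.copy(self.mu)

    def sample(self):
        # x_{t+1} = x_t + theta*(mu - x_t)*dt + sigma*sqrt(dt)*N(0, I)
        dx = self.theta * (self.mu - self.x) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.randn(len(self.x))
        self.x = self.x + dx
        return self.x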
def test():
    agent = Agent(env)
    agent.epsilon = 0.01
    action_low = env.action_space.low[0]
    action_high = env.action_space.high[0]
    episodes = int(1e6)
    avg_Q = deque(maxlen=200)
    for episode in range(episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.get_action(state)
            a_t = action / (agent.n_action - 1)
            a_t = a_t * (action_high - action_low) + action_low
            state, reward, done, info = env.step([a_t])
            env.render()
def test():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)
    episodes = int(1e6)
    for episode in range(episodes):
        state = env.reset()
        done = False
        score = 0
        while not done:
            action, clipped_action, value = agent.get_action(state, False)
            #action, clipped_action, value = agent.get_action(state, True)
            state, reward, done, info = env.step(clipped_action)
            score += reward
            env.render()
            time.sleep(0.01)
        print("score :", score)
def test():
    global env_name
    save_name = env_name.split('-')[0]
    gamma = 0.99
    env = gym.make(env_name)
    env.unwrapped.initialize(is_render=True)
    agent = Agent("global", env, save_name, gamma)
    episodes = int(1e6)
    for episode in range(episodes):
        state = env.reset()
        done = False
        while not done:
            #action = agent.get_action(state, False)
            action = agent.get_action(state, True)
            #if action[0] > 0:
            #    a_t = 1
            #else:
            #    a_t = 0
            state, reward, done, info = env.step(action)
            #state, reward, done, info = env.step(a_t)
            env.render()
def test():
    global env_name, agent_args
    save_name = env_name.split('-')[0]
    gamma = 0.99
    env = gym.make(env_name)
    agent = Agent("global", env, save_name, gamma, agent_args)
    episodes = int(1e6)
    for episode in range(episodes):
        state = env.reset()
        done = False
        while not done:
            #action = agent.get_action(state, False)
            action = agent.get_action(state, True)
            print(action)
            #time.sleep(0.01)
            #if action[0] > 0:
            #    a_t = 1
            #else:
            #    a_t = 0
            state, reward, done, info = env.step(action)
            #state, reward, done, info = env.step(a_t)
            env.render()
def test():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    if env_name == 'DobroHalfCheetah-v0':
        env.unwrapped.initialize(is_render=True)
    elif env_name == 'HalfCheetahBulletEnv-v0':
        env.render()
    agent = Agent(env, agent_args)
    episodes = int(1e6)
    avg_Q = deque(maxlen=200)
    for episode in range(episodes):
        state = env.reset()
        agent.actor_noise.reset()
        done = False
        while not done:
            #action = agent.get_action(state, False)
            action = agent.get_action(state, True)
            state, reward, done, info = env.step(action)
            print(np.mean(action))
            env.render()
            time.sleep(0.01)
def real_test():
    global env_name, save_name, agent_args, env_real
    env_real = env_real.Env_real(True)
    agent = Agent(env_real, agent_args)
    episodes = int(10)
    max_steps = 2000
    for episode in range(episodes):
        input_value = input('Ready? (y/n)')
        if input_value == 'n':
            break
        state = env_real.reset()
        done = False
        score = 0
        step = 0
        while not done and step <= max_steps:
            action, clipped_action, value, cost_value = agent.get_action(
                state, False)
            state, reward, done, info = env_real.step(clipped_action)
            print(reward, '\t', info.get('cost', 0))
            score += reward
            step += 1
        print("score :", score)
def train():
    global env, env_name
    env_name = env_name.split('-')[0]
    agent = Agent(env, env_name)
    loss_logger = Logger(env_name, 'loss')
    score_logger = Logger(env_name, 'score')
    action_low = env.action_space.low[0]
    action_high = env.action_space.high[0]
    episodes = int(5e2)
    avg_Q = deque(maxlen=200)
    for episode in range(episodes):
        state = env.reset()
        done = False
        score = 0
        step = 0
        while not done:
            step += 1
            action = agent.get_action(state)
            a_t = action / (agent.n_action - 1)
            a_t = a_t * (action_high - action_low) + action_low
            next_state, reward, done, info = env.step([a_t])
            agent.replay_memory.append([
                np.array(state, np.float32), action, reward, done,
                np.array(next_state, np.float32)
            ])
            ########################
            # start training once the replay memory has accumulated enough samples
            if len(agent.replay_memory) > agent.train_start:
                Q, loss = agent.train()
                loss_logger.write([1, loss])
                avg_Q.append(Q)
            state = next_state
            score += reward
        #print(episode, accumulate+100, self.epsilon)
        print(episode, score, agent.epsilon, np.mean(avg_Q))
        agent.update_target_model()
        score_logger.write([step, score])
        if (episode + 1) % agent.save_freq == 0:
            agent.save()
            loss_logger.save()
            score_logger.save()
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)
    v_loss_logger = Logger(save_name, 'v_loss')
    cost_v_loss_logger = Logger(save_name, 'cost_v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    cost_logger = Logger(save_name, 'cost')
    graph = Graph(
        1000, save_name,
        ['score', 'cost', 'value loss', 'cost value loss', 'kl divergence'])
    max_steps = 4000
    max_ep_len = 1000
    episodes = int(max_steps / max_ep_len)
    epochs = 500
    save_freq = 10
    log_length = 10
    p_objectives = deque(maxlen=log_length)
    c_objectives = deque(maxlen=log_length)
    v_losses = deque(maxlen=log_length)
    cost_v_losses = deque(maxlen=log_length)
    kl_divergence = deque(maxlen=log_length)
    scores = deque(maxlen=log_length * episodes)
    costs = deque(maxlen=log_length * episodes)
    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        cost_targets = []
        gaes = []
        cost_gaes = []
        avg_costs = []
        ep_step = 0
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            cost = 0
            step = 0
            temp_rewards = []
            temp_costs = []
            values = []
            cost_values = []
            while True:
                step += 1
                ep_step += 1
                assert env.observation_space.contains(state)
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
                assert env.action_space.contains(clipped_action)
                next_state, reward, done, info = env.step(clipped_action)
                # for the predicted cost
                h_dist = hazard_dist(env.hazards_pos, env.world.robot_pos())
                predict_cost = get_cost(h_dist)
                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                temp_costs.append(predict_cost)
                values.append(value)
                cost_values.append(cost_value)
                state = next_state
                score += reward
                # log the actual cost so it can be compared against the discrete cost
                cost += info.get('cost', 0)
                if done or step >= max_ep_len:
                    break
            if step >= max_ep_len:
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
            else:
                value = 0
                cost_value = 0
                print("done before max_ep_len...")
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            next_cost_values = cost_values[1:] + [cost_value]
            temp_cost_gaes, temp_cost_targets = agent.get_gaes_targets(
                temp_costs, cost_values, next_cost_values)
            avg_costs.append(np.mean(temp_costs))
            targets += list(temp_targets)
            gaes += list(temp_gaes)
            cost_targets += list(temp_cost_targets)
            cost_gaes += list(temp_cost_gaes)
            score_logger.write([step, score])
            cost_logger.write([step, cost])
            scores.append(score)
            costs.append(cost)
        trajs = [
            states, actions, targets, cost_targets, gaes, cost_gaes, avg_costs
        ]
        v_loss, cost_v_loss, p_objective, cost_objective, kl = agent.train(
            trajs)
        v_loss_logger.write([ep_step, v_loss])
        cost_v_loss_logger.write([ep_step, cost_v_loss])
        kl_logger.write([ep_step, kl])
        p_objectives.append(p_objective)
        c_objectives.append(cost_objective)
        v_losses.append(v_loss)
        cost_v_losses.append(cost_v_loss)
        kl_divergence.append(kl)
        print(np.mean(scores), np.mean(costs), np.mean(v_losses),
              np.mean(cost_v_losses), np.mean(kl_divergence),
              np.mean(c_objectives))
        graph.update([
            np.mean(scores), np.mean(costs), np.mean(v_losses),
            np.mean(cost_v_losses), np.mean(kl_divergence)
        ])
        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            cost_v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
            cost_logger.save()
    graph.update(None, finished=True)
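# hazard_dist and get_cost are called in the trainer above but are not defined in this
# section. The sketch below shows one plausible implementation, assuming circular
# hazards with a fixed radius and a smooth sigmoid-shaped surrogate cost; the
# HAZARD_RADIUS value and the smoothing constant are assumptions, not taken from the
# original code.
import numpy as np

HAZARD_RADIUS = 0.2  # assumed hazard radius

def hazard_dist(hazards_pos, robot_pos):
    # distance from the robot to the nearest hazard center (planar x, y only)
    robot_xy = np.array(robot_pos)[:2]
    return min(np.linalg.norm(np.array(h)[:2] - robot_xy) for h in hazards_pos)

def get_cost(h_dist):
    # smooth surrogate cost: near 0 far from hazards, approaching 1 inside them
    return 1.0 / (1.0 + np.exp((h_dist - HAZARD_RADIUS) / 0.01))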
def train():
    global env_name, save_name, agent_args, env_real, env_sim, nets
    env_real = env_real.Env_real(False)
    env_sim = env_sim.Env_sim(True)
    GAT_model = nets.GAT_net(env_real, env_sim, GAT_args)
    agent = Agent(env_sim, agent_args)
    # wandb.init(project=save_name)
    accum_step = 0
    avg_temp_cost = 0
    v_loss_logger = Logger(save_name, 'v_loss')
    cost_v_loss_logger = Logger(save_name, 'cost_v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    cost_logger = Logger(save_name, 'cost')
    max_steps = 2000
    max_ep_len = 1000
    episodes = int(max_steps / max_ep_len)
    epochs = 2  #50
    save_freq = 1
    log_length = 10
    p_objectives = deque(maxlen=log_length)
    c_objectives = deque(maxlen=log_length)
    v_losses = deque(maxlen=log_length)
    cost_v_losses = deque(maxlen=log_length)
    kl_divergence = deque(maxlen=log_length)
    scores = deque(maxlen=log_length * episodes)
    costs = deque(maxlen=log_length * episodes)
    is_backup = False
    backup_name = '{}/backup.pkl'.format(save_name)
    if os.path.isfile(backup_name):
        #input_value = raw_input('backup file exists. wanna continue the last work?( y/n )')
        #if input_value != 'n':
        #    is_backup = True
        is_backup = True
    if is_backup:
        with open(backup_name, 'rb') as f:
            backup_list = pickle.load(f)
        start_iter = backup_list[0]
    else:
        start_iter = 0
        backup_list = [start_iter]
    for epoch in range(start_iter, epochs):
        #continue?
        print("=" * 20)
        print("Epoch : {}".format(epoch + 1))
        #input_value = raw_input("wanna continue episodes?( y/n )")
        #if input_value == 'n':
        #    break
        states = []
        actions = []
        targets = []
        cost_targets = []
        gaes = []
        cost_gaes = []
        avg_costs = []
        ep_step = 0
        while ep_step < max_steps:
            #input_value = raw_input("ready?")
            state = env_sim.reset()
            done = False
            score = 0
            cost = 0
            step = 0
            temp_rewards = []
            temp_costs = []
            values = []
            cost_values = []
            while True:
                if rospy.is_shutdown():
                    sys.exit()
                step += 1
                ep_step += 1
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
                # action transformer by GAT
                transformed_next_state = GAT_model.forward_transform(
                    state, clipped_action)
                transformed_action = GAT_model.backward_transform(
                    state, transformed_next_state)
                next_state, reward, done, info = env_sim.step(
                    transformed_action)
                predict_cost = info['continuous_cost']
                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                temp_costs.append(predict_cost)
                values.append(value)
                cost_values.append(cost_value)
                state = next_state
                score += reward
                cost += info.get('cost', 0)
                if done or step >= max_ep_len:
                    break
            print("step : {}, score : {}".format(step, score))
            if step >= max_ep_len:
                action, clipped_action, value, cost_value = agent.get_action(
                    state, True)
            else:
                value = 0
                cost_value = 0
                print("done before max_ep_len...")
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            next_cost_values = cost_values[1:] + [cost_value]
            temp_cost_gaes, temp_cost_targets = agent.get_gaes_targets(
                temp_costs, cost_values, next_cost_values)
            avg_costs.append(np.mean(temp_costs))
            targets += list(temp_targets)
            gaes += list(temp_gaes)
            cost_targets += list(temp_cost_targets)
            cost_gaes += list(temp_cost_gaes)
            score_logger.write([step, score])
            cost_logger.write([step, cost])
            scores.append(score)
            costs.append(cost)
            accum_step += step
            avg_temp_cost = np.mean(temp_costs)
            # wandb.log({'step': accum_step, 'score': score, 'cost': cost, 'avg_temp_cost': avg_temp_cost})
        trajs = [
            states, actions, targets, cost_targets, gaes, cost_gaes, avg_costs
        ]
        v_loss, cost_v_loss, p_objective, cost_objective, kl = agent.train(
            trajs)
        v_loss_logger.write([ep_step, v_loss])
        cost_v_loss_logger.write([ep_step, cost_v_loss])
        kl_logger.write([ep_step, kl])
        p_objectives.append(p_objective)
        c_objectives.append(cost_objective)
        v_losses.append(v_loss)
        cost_v_losses.append(cost_v_loss)
        kl_divergence.append(kl)
        print(np.mean(scores), np.mean(costs), np.mean(v_losses),
              np.mean(cost_v_losses), np.mean(kl_divergence),
              np.mean(c_objectives))
        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            cost_v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
            cost_logger.save()
        # backup
        backup_list[0] = epoch + 1
        with open(backup_name, 'wb') as f:
            pickle.dump(backup_list, f)
def train():
    global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
        loss_logger, score_logger, graph, p_losses, v_losses, entropies, scores
    gamma = 0.99
    num_thread = 10
    total_step = 0
    total_max_step = 1e7
    step_period = 1e4  #1e4
    step_period = int(step_period / num_thread)
    save_name = env_name.split('-')[0]
    env = gym.make(env_name)
    env.unwrapped.initialize(is_render=False)
    global_agent = Agent("global", env, save_name, gamma)
    loss_logger = Logger(save_name, 'loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), 'A3C')
    env.close()
    p_losses = deque(maxlen=step_period)
    v_losses = deque(maxlen=step_period)
    entropies = deque(maxlen=step_period)
    scores = deque(maxlen=step_period)

    def thread_func(t_idx):
        global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
            loss_logger, score_logger, graph, p_losses, v_losses, entropies, scores
        env = gym.make(env_name)
        env.unwrapped.initialize(is_render=False)
        agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
        episode = 0
        step = 0
        p_loss = None
        v_loss = None
        entropy = None
        #gradient reset & parameter synchronize
        agent.update_parameter(global_agent)
        start_step = step
        states = []
        actions = []
        rewards = []
        dones = []
        score = 0
        state = env.reset()
        while total_step < total_max_step:
            step += 1
            total_step += 1
            action = agent.get_action(state, True)
            #if action[0] > 0:
            #    a_t = 1
            #else:
            #    a_t = 0
            next_state, reward, done, info = env.step(action)
            #next_state, reward, done, info = env.step(a_t)
            ####### modify reward function #######
            #reward = 200-cnt if done else 0
            #reward /= 10
            ####### modify reward function #######
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)
            score += reward
            if step - start_step == step_period:
                ret = 0 if done else agent.get_value(next_state)
                targets = []
                for i in range(len(states)):
                    if dones[-i - 1]:
                        ret = 0
                    #elif i > 0:
                    #    ret = agent.get_value(states[-i])
                    ret = rewards[-i - 1] + gamma * ret
                    targets.append(ret)
                targets = targets[::-1]
                p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                    states, actions, targets)
                p_losses.append(p_loss)
                v_losses.append(v_loss)
                entropies.append(entropy)
                global_agent.update_with_gradients(p_grad, v_grad)
                #loss_logger.write([step-start_step, p_loss, v_loss])
                agent.update_parameter(global_agent)
                if t_idx == 0:
                    graph.update(np.mean(scores), np.mean(p_losses),
                                 np.mean(v_losses), np.mean(entropies))
                start_step = step
                states = []
                actions = []
                rewards = []
                dones = []
            state = next_state
            #score_logger.write([cnt, score])
            if done:
                episode += 1
                if t_idx == 0 and episode % 10 == 0:
                    global_agent.save()
                scores.append(score)
                print(t_idx, score)
                score = 0
                state = env.reset()

    threads = []
    for i in range(num_thread):
        threads.append(threading.Thread(target=thread_func, args=(i, )))
        threads[-1].start()
    for thread in threads:
        thread.join()
    graph.update(0, 0, 0, 0, True)
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)
    p_loss_logger = Logger(save_name, 'p_loss')
    v_loss_logger = Logger(save_name, 'v_loss')
    kl_logger = Logger(save_name, 'kl')
    score_logger = Logger(save_name, 'score')
    graph = Graph(
        1000, save_name,
        ['score', 'policy loss', 'value loss', 'kl divergence', 'entropy'])
    episodes = 10
    max_steps = 4000
    max_ep_len = min(1000, env.spec.max_episode_steps)
    epochs = int(1e5)
    save_freq = 10
    save_period = 10
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    kl_divergence = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period * episodes)
    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        next_states = []
        rewards = []
        gaes = []
        ep_step = 0
        #for episode in range(episodes):
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            step = 0
            temp_rewards = []
            values = []
            while True:
                step += 1
                ep_step += 1
                action, clipped_action, value = agent.get_action(state, True)
                next_state, reward, done, info = env.step(clipped_action)
                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                next_states.append(next_state)
                rewards.append(reward)
                values.append(value)
                state = next_state
                score += reward
                if done or step >= max_ep_len:
                    break
            if step >= max_ep_len:
                action, clipped_action, value = agent.get_action(state, True)
            else:
                # the episode terminated before max_ep_len, so bootstrap with value = 0
                value = 0
                print("done before max_ep_len...")
            next_values = values[1:] + [value]
            temp_gaes, temp_targets = agent.get_gaes_targets(
                temp_rewards, values, next_values)
            targets += list(temp_targets)
            gaes += list(temp_gaes)
            score_logger.write([step, score])
            scores.append(score)
        trajs = [states, actions, targets, next_states, rewards, gaes]
        p_loss, v_loss, kl, entropy = agent.train(trajs)
        p_loss_logger.write([ep_step, p_loss])
        v_loss_logger.write([ep_step, v_loss])
        kl_logger.write([ep_step, kl])
        p_losses.append(p_loss)
        v_losses.append(v_loss)
        kl_divergence.append(kl)
        entropies.append(entropy)
        print(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
              np.mean(kl_divergence), np.mean(entropies))
        graph.update([
            np.mean(scores), np.mean(p_losses), np.mean(v_losses),
            np.mean(kl_divergence), np.mean(entropies)
        ])
        if (epoch + 1) % save_freq == 0:
            agent.save()
            p_loss_logger.save()
            v_loss_logger.save()
            kl_logger.save()
            score_logger.save()
    graph.update(None, finished=True)
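# agent.get_gaes_targets is used by this trainer (and by the cost-constrained trainers
# above) but is not shown in this section. The sketch below is a standard GAE(lambda)
# computation matching the call signature used here; the discount_factor and
# gae_lambda defaults are assumptions, not the repo's configured values.
import numpy as np

def get_gaes_targets(rewards, values, next_values,
                     discount_factor=0.99, gae_lambda=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.asarray(values, dtype=np.float32)
    next_values = np.asarray(next_values, dtype=np.float32)
    deltas = rewards + discount_factor * next_values - values
    # A_t = sum_k (gamma * lambda)^k * delta_{t+k}, accumulated backwards in time
    gaes = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discount_factor * gae_lambda * running
        gaes[t] = running
    # value targets are the advantages added back onto the value baseline
    targets = gaes + values
    return gaes, targets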
def thread_func(t_idx):
    global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
        loss_logger, score_logger, graph, p_losses, v_losses, entropies, scores
    env = gym.make(env_name)
    env.unwrapped.initialize(is_render=False)
    agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
    episode = 0
    step = 0
    p_loss = None
    v_loss = None
    entropy = None
    #gradient reset & parameter synchronize
    agent.update_parameter(global_agent)
    start_step = step
    states = []
    actions = []
    rewards = []
    dones = []
    score = 0
    state = env.reset()
    while total_step < total_max_step:
        step += 1
        total_step += 1
        action = agent.get_action(state, True)
        #if action[0] > 0:
        #    a_t = 1
        #else:
        #    a_t = 0
        next_state, reward, done, info = env.step(action)
        #next_state, reward, done, info = env.step(a_t)
        ####### modify reward function #######
        #reward = 200-cnt if done else 0
        #reward /= 10
        ####### modify reward function #######
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)
        score += reward
        if step - start_step == step_period:
            ret = 0 if done else agent.get_value(next_state)
            targets = []
            for i in range(len(states)):
                if dones[-i - 1]:
                    ret = 0
                #elif i > 0:
                #    ret = agent.get_value(states[-i])
                ret = rewards[-i - 1] + gamma * ret
                targets.append(ret)
            targets = targets[::-1]
            p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                states, actions, targets)
            p_losses.append(p_loss)
            v_losses.append(v_loss)
            entropies.append(entropy)
            global_agent.update_with_gradients(p_grad, v_grad)
            #loss_logger.write([step-start_step, p_loss, v_loss])
            agent.update_parameter(global_agent)
            if t_idx == 0:
                graph.update(np.mean(scores), np.mean(p_losses),
                             np.mean(v_losses), np.mean(entropies))
            start_step = step
            states = []
            actions = []
            rewards = []
            dones = []
        state = next_state
        #score_logger.write([cnt, score])
        if done:
            episode += 1
            if t_idx == 0 and episode % 10 == 0:
                global_agent.save()
            scores.append(score)
            print(t_idx, score)
            score = 0
            state = env.reset()
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    agent = Agent(env, agent_args)
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name,
                  ['score', 'policy loss', 'Q value loss', 'entropy'])
    max_steps = 4000
    max_ep_len = min(1000, env.spec.max_episode_steps)
    start_training_after_steps = 1000
    step_per_training = 50
    epochs = 1000
    save_freq = 1
    record_length = 10
    p_losses = deque(maxlen=record_length * int(max_ep_len / step_per_training))
    q_losses = deque(maxlen=record_length * int(max_ep_len / step_per_training))
    entropies = deque(maxlen=record_length * int(max_ep_len / step_per_training))
    scores = deque(maxlen=record_length)
    total_step = 0
    for epoch in range(epochs):
        ep_step = 0
        while ep_step < max_steps:
            state = env.reset()
            score = 0
            step = 0
            while True:
                step += 1
                ep_step += 1
                total_step += 1
                action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(action)
                # do not treat a time-limit termination as a true terminal state
                done = False if step >= max_ep_len else done
                agent.replay_memory.append(
                    [state, action, reward, float(done), next_state])
                if len(agent.replay_memory) > start_training_after_steps and (
                        total_step + 1) % step_per_training == 0:
                    for _ in range(step_per_training):
                        p_loss, q_loss, entropy = agent.train()
                        p_losses.append(p_loss)
                        q_losses.append(q_loss)
                        entropies.append(entropy)
                    print(np.mean(scores), np.mean(p_losses),
                          np.mean(q_losses), np.mean(entropies))
                state = next_state
                score += reward
                if done or step >= max_ep_len:
                    break
            score_logger.write([step, score])
            scores.append(score)
        graph.update([
            np.mean(scores), np.mean(p_losses), np.mean(q_losses),
            np.mean(entropies)
        ])
        if (epoch + 1) % save_freq == 0:
            agent.save()
            score_logger.save()
    graph.update(None, finished=True)
def train():
    global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
        loss_logger, score_logger, graph
    gamma = 0.99
    num_thread = 10
    total_step = 0
    total_max_step = 1e6
    step_period = 1e3
    step_period = int(step_period / num_thread)
    save_name = env_name.split('-')[0]
    env = gym.make(env_name)
    global_agent = Agent("global", env, save_name, gamma)
    loss_logger = Logger(save_name, 'loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), 'A3C')
    env.close()

    def thread_func(t_idx):
        global total_step, total_max_step, env_name, global_agent, step_period, gamma, \
            loss_logger, score_logger, graph
        env = gym.make(env_name)
        agent = Agent("local_{}".format(t_idx), env, save_name, gamma)
        step = 0
        episode = 0
        while total_step < total_max_step:
            episode += 1
            #gradient reset & parameter synchronize
            agent.update_parameter(global_agent)
            start_step = step
            states = []
            actions = []
            rewards = []
            score = 0
            cnt = 0
            state = env.reset()
            while True:
                cnt += 1
                step += 1
                total_step += 1
                action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(action)
                ####### modify reward function #######
                #reward = 200-cnt if done else 0
                reward += 10
                ####### modify reward function #######
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                score += reward
                if done or step - start_step == step_period:
                    ret = 0 if done else agent.get_value(next_state)
                    targets = []
                    for i in range(len(states)):
                        ret = rewards[-i - 1] + gamma * ret
                        targets.append(ret)
                    targets = targets[::-1]
                    p_grad, p_loss, v_grad, v_loss, entropy = agent.calc_gradient(
                        states, actions, targets)
                    global_agent.update_with_gradients(p_grad, v_grad)
                    #loss_logger.write([step-start_step, p_loss, v_loss])
                    if done:
                        break
                    agent.update_parameter(global_agent)
                    start_step = step
                    states = []
                    actions = []
                    rewards = []
                state = next_state
            #score_logger.write([cnt, score])
            if t_idx == 0:
                print(score)
                graph.update(score, p_loss, v_loss, entropy)
                if episode % 100 == 0:
                    global_agent.save()

    threads = []
    for i in range(num_thread):
        threads.append(threading.Thread(target=thread_func, args=(i, )))
        threads[-1].start()
    for thread in threads:
        thread.join()
    graph.update(0, 0, 0, 0, True)
def train():
    global env_name, save_name, agent_args
    env = gym.make(env_name)
    env.unwrapped.initialize(is_render=False)
    agent = Agent(env, agent_args)
    v_loss_logger = Logger(save_name, 'v_loss')
    p_loss_logger = Logger(save_name, 'p_loss')
    score_logger = Logger(save_name, 'score')
    graph = Graph(1000, save_name.upper(), agent.name)
    episodes = 10
    epochs = int(1e5)
    save_freq = 10
    save_period = 100
    p_losses = deque(maxlen=save_period)
    v_losses = deque(maxlen=save_period)
    entropies = deque(maxlen=save_period)
    scores = deque(maxlen=save_period * episodes)
    for epoch in range(epochs):
        states = []
        actions = []
        targets = []
        ep_step = 0
        for episode in range(episodes):
            state = env.reset()
            done = False
            score = 0
            step = 0
            temp_rewards = []
            while not done:
                step += 1
                ep_step += 1
                action, clipped_action = agent.get_action(state, True)
                next_state, reward, done, info = env.step(clipped_action)
                states.append(state)
                actions.append(action)
                temp_rewards.append(reward)
                state = next_state
                score += reward
            score_logger.write([step, score])
            scores.append(score)
            # discounted return (reward-to-go) as the value target
            temp_targets = np.zeros_like(temp_rewards)
            ret = 0
            for t in reversed(range(len(temp_rewards))):
                ret = temp_rewards[t] + agent.discount_factor * ret
                temp_targets[t] = ret
            targets += list(temp_targets)
        trajs = [states, actions, targets]
        v_loss, p_objective, kl = agent.train(trajs)
        v_loss_logger.write([ep_step, v_loss])
        p_loss_logger.write([ep_step, p_objective])
        p_losses.append(p_objective)
        v_losses.append(v_loss)
        entropies.append(kl)
        #print(v_loss, p_objective, kl)
        print(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
              np.mean(entropies))
        graph.update(np.mean(scores), np.mean(p_losses), np.mean(v_losses),
                     np.mean(entropies))
        if (epoch + 1) % save_freq == 0:
            agent.save()
            v_loss_logger.save()
            p_loss_logger.save()
            score_logger.save()
    graph.update(0, 0, 0, 0, finished=True)