def main():
    rospy.init_node('sac_stage_1')
    pub_result = rospy.Publisher('result', Float32, queue_size=5)
    result = Float32()
    env = Env()
    torch.manual_seed(500)

    actor_optimizer = optim.Adam(actor.parameters(), lr=actor_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=critic_lr)
    hard_target_update(critic, target_critic)

    # initialize automatic entropy tuning: target entropy = -|A|
    # (action_size is wrapped in a list so prod sees the action dimensionality
    #  instead of building an uninitialized tensor of that size)
    target_entropy = -torch.prod(torch.Tensor([action_size])).item()

    writer = SummaryWriter('./house_sac_4')
    replay_buffer = deque(maxlen=100000)
    recent_rewards = []

    for episode in range(10001):
        done = False
        score = 0.
        state = env.reset()
        print('Episode: ' + str(episode))
        past_action = np.array([0., 0.])

        for step in range(1000):
            state = np.float32(state)
            # sample an action from the stochastic policy
            mu, std = actor(torch.Tensor(state))
            action = get_action(mu, std)
            #action = np.array([np.clip(action[0], 0., 0.22),
            #                   np.clip(action[1], -2., 2.)])

            next_state, reward, done = env.step(action, past_action)
            print(action, reward)
            past_action = action
            next_state = np.float32(next_state)

            mask = 0 if done else 1
            if step > 1:
                score += reward
                replay_buffer.append((state, action, reward, next_state, mask))
            state = next_state

            if done:
                recent_rewards.append(score)
                break

            if len(replay_buffer) >= 2 * batch_size and is_training:
                mini_batch = random.sample(replay_buffer, batch_size)
                actor.train()
                critic.train()
                target_critic.train()
                alpha = train_model(actor, critic, target_critic, mini_batch,
                                    actor_optimizer, critic_optimizer, target_entropy)
                soft_target_update(critic, target_critic, tau)

        result = score
        pub_result.publish(result)
        gc.collect()
        print('reward per ep: ' + str(score))

        if episode % 10 == 0:
            print('{} episode | score_avg: {:.2f}'.format(episode, np.mean(recent_rewards)))
            # the +260 offset presumably continues the episode count of an earlier run
            writer.add_scalar('log/score', float(np.mean(recent_rewards)), episode + 260)
            #writer.add_scalar('log/alpha', float(alpha.detach().numpy()), episode + 260)
            recent_rewards = []
            print("save")
            save_models(episode + 260)
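# The SAC loop above calls hard_target_update / soft_target_update helpers that are defined
# elsewhere in the script. The sketch below is a minimal, assumed implementation of the
# standard Polyak-averaging rule (target <- tau*online + (1-tau)*target); the two-argument
# signatures match the calls above, but the bodies are not taken from the source.

def hard_target_update(net, target_net):
    # copy the online weights into the target network once, at initialization
    target_net.load_state_dict(net.state_dict())

def soft_target_update(net, target_net, tau):
    # slowly track the online network after every gradient update
    for param, target_param in zip(net.parameters(), target_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)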
past_action = np.array([0., 0.])
for ep in range(max_episodes):
    done = False
    state = env.reset()
    print('Episode: ' + str(ep))
    rewards_current_episode = 0

    for step in range(max_steps):
        state = np.float32(state)
        # the policy outputs actions in [-1, 1]; rescale them to the robot's
        # real velocity limits before sending them to the environment
        action = policy_net.get_action(state)
        unnorm_action = np.array([action_unnormalized(action[0], ACTION_V_MAX, ACTION_V_MIN),
                                  action_unnormalized(action[1], ACTION_W_MAX, ACTION_W_MIN)])

        next_state, reward, done = env.step(unnorm_action, past_action)
        # print('action', unnorm_action, 'r', reward)
        past_action = action  # the normalized action is kept as the previous action
        rewards_current_episode += reward
        next_state = np.float32(next_state)

        # the normalized action is also what gets stored for the critic update
        replay_buffer.push(state, action, reward, next_state, done)
        if len(replay_buffer) > 8 * batch_size and is_training:
            soft_q_update(batch_size)

        state = next_state
        if done:
            break

    print('reward per ep: ' + str(rewards_current_episode))
    rewards.append(rewards_current_episode)
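# The loop above relies on an action_unnormalized(value, max, min) helper that is not shown.
# A minimal sketch under the assumption that it linearly maps the tanh-squashed policy output
# from [-1, 1] back to the command range (ACTION_V_MAX/ACTION_V_MIN etc. are the limits used
# above); this is an illustrative guess, not the source's exact implementation.

def action_unnormalized(action, high, low):
    # rescale a value from [-1, 1] to [low, high] and clip for safety
    action = low + (action + 1.0) * 0.5 * (high - low)
    return np.clip(action, low, high)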
def main():
    rospy.init_node('ddpg_stage_1')
    pub_result = rospy.Publisher('result', Float32, queue_size=5)
    result = Float32()
    env = Env()
    torch.manual_seed(1000)

    actor_optimizer = optim.Adam(actor.parameters(), lr=actor_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=critic_lr)
    hard_target_update(actor, critic, target_actor, target_critic)
    ou_noise = OUNoise(action_size, theta, mu, sigma)

    writer = SummaryWriter('./house_td3_4')
    replay_buffer = deque(maxlen=100000)
    recent_rewards = []

    for episode in range(100001):
        done = False
        score = 0.
        state = env.reset()
        print('Episode: ' + str(episode))
        past_action = np.array([0., 0.])

        for step in range(1000):
            state = np.float32(state)
            # deterministic policy output plus Ornstein-Uhlenbeck exploration noise
            policy = actor(torch.Tensor(state))
            action = get_action(policy, ou_noise, episode)

            next_state, reward, done = env.step(action, past_action)
            print(action, reward)
            past_action = action
            next_state = np.float32(next_state)

            mask = 0 if done else 1
            if step > 1:
                score += reward
                replay_buffer.append((state, action, reward, next_state, mask))
            state = next_state

            if done:
                recent_rewards.append(score)
                break

            if len(replay_buffer) >= 2 * batch_size and is_training:
                mini_batch = random.sample(replay_buffer, batch_size)
                actor.train()
                critic.train()
                target_actor.train()
                target_critic.train()
                train_model(actor, critic, target_actor, target_critic,
                            actor_optimizer, critic_optimizer, mini_batch, step)
                soft_target_update(actor, critic, target_actor, target_critic, tau)

        result = score
        pub_result.publish(result)
        gc.collect()
        print('reward per ep: ' + str(score))
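# The DDPG/TD3 loop above constructs OUNoise(action_size, theta, mu, sigma) for exploration,
# but the class itself is not shown. Below is a minimal, assumed sketch of a standard
# Ornstein-Uhlenbeck process; the constructor matches the call above, while the reset() and
# sample() method names are illustrative guesses, not the source's API.

class OUNoise:
    def __init__(self, action_size, theta, mu, sigma):
        self.action_size = action_size
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.state = np.ones(action_size) * mu

    def reset(self):
        # restart the process at the mean, e.g. at the start of each episode
        self.state = np.ones(self.action_size) * self.mu

    def sample(self):
        # temporally correlated noise: dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_size)
        self.state = self.state + dx
        return self.state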