Example #1
# Standard-library and third-party imports this snippet needs; Env, the networks and
# the helper functions (get_action, train_model, save_models, hard/soft_target_update)
# come from the project's own modules and are not shown here.
import gc
import random
from collections import deque

import numpy as np
import rospy
import torch
import torch.optim as optim
from std_msgs.msg import Float32
from torch.utils.tensorboard import SummaryWriter  # some setups use tensorboardX instead


def main():
    rospy.init_node('sac_stage_1')
    pub_result = rospy.Publisher('result', Float32, queue_size=5)
    result = Float32()
    env = Env()
    torch.manual_seed(500)

    # NOTE: the actor, critic and target_critic networks, and hyperparameters such as
    # actor_lr, critic_lr, batch_size, tau, action_size and is_training, are assumed
    # to be defined at module level; they are not shown in this snippet.
    actor_optimizer = optim.Adam(actor.parameters(), lr=actor_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=critic_lr)

    hard_target_update(critic, target_critic)
    
    # initialize automatic entropy tuning
    target_entropy = -torch.prod(torch.Tensor([action_size])).item()  # heuristic target: -|A|
    
    writer = SummaryWriter('./house_sac_4')

    replay_buffer = deque(maxlen=100000)
    recent_rewards = []
    
    for episode in range(10001):
        done = False
        score = 0.
        state = env.reset()
        print('Episode: ' + str(episode))
        past_action = np.array([0.,0.])

        for step in range(1000):
            state = np.float32(state)
            #print(state)
            mu, std = actor(torch.Tensor(state))
            action = get_action(mu, std)
            # action = np.array([np.clip(action[0], 0., 0.22),
            #                    np.clip(action[1], -2., 2.)])

            next_state, reward, done = env.step(action, past_action)
            print(action, reward)
            past_action = action
            
            next_state = np.float32(next_state)
            mask = 0 if done else 1
            if step > 1:
                score += reward
                replay_buffer.append((state, action, reward, next_state, mask))

            state = next_state

            if done:
                recent_rewards.append(score)
                break
            
            if len(replay_buffer) >= 2*batch_size and is_training:

                mini_batch = random.sample(replay_buffer, batch_size)
                
                actor.train(), critic.train(), target_critic.train()
                alpha = train_model(actor, critic, target_critic, mini_batch, 
                                    actor_optimizer, critic_optimizer,
                                    target_entropy)
               
                soft_target_update(critic, target_critic, tau)
            
        result = score
        pub_result.publish(result)
        gc.collect()
        print('reward per ep: ' + str(score))
        
        if episode % 10 == 0:
            print('{} episode | score_avg: {:.2f}'.format(episode, np.mean(recent_rewards)))
            writer.add_scalar('log/score', float(np.mean(recent_rewards)), episode+260)
            #writer.add_scalar('log/alpha', float(alpha.detach().numpy()), episode+260)
            recent_rewards = []
            print("save")
            save_models(episode+260)
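
Example #1 calls hard_target_update and soft_target_update but does not include their definitions. The sketch below shows one common way such helpers are written (a one-off weight copy and Polyak averaging with rate tau); the signatures are inferred from the call sites above and are an assumption, not the project's actual code.

import torch
import torch.nn as nn

def hard_target_update(net: nn.Module, target_net: nn.Module):
    # One-off copy of the online network's weights into the target network.
    target_net.load_state_dict(net.state_dict())

def soft_target_update(net: nn.Module, target_net: nn.Module, tau: float):
    # Polyak averaging: target <- tau * online + (1 - tau) * target, parameter by parameter.
    with torch.no_grad():
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.mul_(1.0 - tau).add_(tau * param.data)

With a small tau (commonly around 0.005 to 0.01) the target critic tracks the online critic slowly, which keeps the soft Bellman targets in the SAC update stable.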
Example #2
    past_action = np.array([0.,0.])

    for ep in range(max_episodes):
        done = False
        state = env.reset()
        print('Episode: ' + str(ep))

        rewards_current_episode = 0

        for step in range(max_steps):
            state = np.float32(state)
            
            action = policy_net.get_action(state)
            unnorm_action = np.array([
                action_unnormalized(action[0], ACTION_V_MAX, ACTION_V_MIN),
                action_unnormalized(action[1], ACTION_W_MAX, ACTION_W_MIN),
            ])

            next_state, reward, done = env.step(unnorm_action, past_action)
            # print('action', unnorm_action,'r',reward)
            past_action = action

            rewards_current_episode += reward
            next_state = np.float32(next_state)
            replay_buffer.push(state, action, reward, next_state, done)
            if len(replay_buffer) > 8*batch_size and is_training:
                soft_q_update(batch_size)
            state = next_state

            if done:
                break
        
        print('reward per ep: ' + str(rewards_current_episode))
        rewards.append(rewards_current_episode)
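
Example #2 samples a normalized action in [-1, 1] from the policy and rescales it to the robot's physical velocity range with action_unnormalized, which is not shown in the snippet. A plausible version of that helper, assuming the argument order (value, max, min) used at the call site:

import numpy as np

def action_unnormalized(action, high, low):
    # Map a value from [-1, 1] to [low, high] and clamp it to that range
    # (used here for linear velocity in [ACTION_V_MIN, ACTION_V_MAX] and
    # angular velocity in [ACTION_W_MIN, ACTION_W_MAX]).
    action = low + (action + 1.0) * 0.5 * (high - low)
    return np.clip(action, low, high)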
Example #3
def main():
    rospy.init_node('ddpg_stage_1')
    pub_result = rospy.Publisher('result', Float32, queue_size=5)
    result = Float32()
    env = Env()
    torch.manual_seed(1000)

    # NOTE: as in Example #1, the networks (actor, critic, target_actor, target_critic),
    # the OU-noise parameters (theta, mu, sigma) and the training hyperparameters are
    # assumed to be defined elsewhere in the project.
    actor_optimizer = optim.Adam(actor.parameters(), lr=actor_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=critic_lr)

    hard_target_update(actor, critic, target_actor, target_critic)
    ou_noise = OUNoise(action_size, theta, mu, sigma)

    writer = SummaryWriter('./house_td3_4')

    replay_buffer = deque(maxlen=100000)
    recent_rewards = []

    for episode in range(100001):
        done = False
        score = 0.
        state = env.reset()
        print('Episode: ' + str(episode))
        past_action = np.array([0., 0.])

        for step in range(1000):

            state = np.float32(state)
            #print(state)
            policy = actor(torch.Tensor(state))
            action = get_action(policy, ou_noise, episode)

            next_state, reward, done = env.step(action, past_action)
            print(action, reward)
            past_action = action

            next_state = np.float32(next_state)
            mask = 0 if done else 1
            if step > 1:
                score += reward
                replay_buffer.append((state, action, reward, next_state, mask))

            state = next_state

            if done:
                recent_rewards.append(score)
                break

            if len(replay_buffer) >= 2 * batch_size and is_training:

                mini_batch = random.sample(replay_buffer, batch_size)

                actor.train(), critic.train()
                target_actor.train(), target_critic.train()
                train_model(actor, critic, target_actor, target_critic,
                            actor_optimizer, critic_optimizer, mini_batch,
                            step)

                soft_target_update(actor, critic, target_actor, target_critic,
                                   tau)

        result = score
        pub_result.publish(result)
        gc.collect()
        print('reward per ep: ' + str(score))
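
Example #3 explores with an OUNoise object passed into get_action, which is again defined outside the snippet. Below is a minimal Ornstein-Uhlenbeck noise process matching the constructor call OUNoise(action_size, theta, mu, sigma); the method names are assumptions for illustration, not the project's code.

import numpy as np

class OUNoise:
    # Ornstein-Uhlenbeck process: temporally correlated noise, commonly used with
    # DDPG so that consecutive velocity commands do not jitter randomly.
    def __init__(self, action_size, theta, mu, sigma):
        self.action_size = action_size
        self.theta = theta   # pull strength back towards the mean mu
        self.mu = mu         # long-run mean of the process
        self.sigma = sigma   # scale of the Gaussian perturbation
        self.state = np.ones(action_size) * mu

    def reset(self):
        self.state = np.ones(self.action_size) * self.mu

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_size)
        self.state = self.state + dx
        return self.state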