Example #1
    def wait_robot(self):
        """Return early while the current sub-task is still in progress;
        once its target condition is met, advance to the next state."""
        if self.state == 'idle':
            if self._step < self.skip_step:
                return
        if self.state == 'go_obj':
            if goal_distance(self.robot_state[:2],
                             self.obj_pos[:2]) > self._DIS_ERROR * 2:
                return
        elif self.state == 'down':
            if (goal_distance(self.robot_state[:2], self.obj_pos[:2]) >
                    self._DIS_ERROR or self.robot_state[2] >
                    self.obj_pos[2] + self._DIS_ERROR / 2.0):
                return
        elif self.state == 'up':
            if (goal_distance(self.robot_state[:2], self.tar_pos[:2]) >
                    self._DIS_ERROR * 2 or self.robot_state[2] <
                    self.tar_pos[2] - self._DIS_ERROR / 2.0):
                return
        # Done!!!
        elif self.state == 'go_goal':
            if goal_distance(self.robot_state[:3],
                             self.goal_pos) > self._DIS_ERROR * 3:
                return
            self._done = True
        elif self.state == 'grip':
            # print(self._step, self.past_gs, self.robot_state[-1])
            if (self._step < self.skip_step or self.robot_state[-1] >= 0.05
                    or self.past_gs - self.robot_state[-1] >
                    self._DIS_ERROR / 2.0):
                return

        self.state = self.next_state
        self._every_task.append(self._step)
        self._step = 0
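
All of the snippets on this page call a goal_distance helper that is not shown. For reference, here is a minimal sketch, assuming it follows the convention used by Gym's Fetch environments (Euclidean distance along the last axis):

import numpy as np


def goal_distance(goal_a, goal_b):
    # Euclidean distance between two goals (or batches of goals).
    assert goal_a.shape == goal_b.shape
    return np.linalg.norm(goal_a - goal_b, axis=-1)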
Example #2
 def wait_robot(self):
     if self.state == 'go_obj':
         if goal_distance(self.robot_state[:2],
                          self.obj_pos[:2]) > self._DIS_ERROR:
             # print(goal_distance(self.robot_state[:2], self.obj_pos[:2]))
             return
     elif self.state == 'down':
         if goal_distance(self.robot_state[:3],
                          self.obj_pos) > self._DIS_ERROR:
             return
     # Done!!!
     elif self.state == 'up':
         if goal_distance(self.robot_state[:3],
                          self.tar_pos) > self._DIS_ERROR:
             return
         # TODO: Revise this approach to change goal pos
         self._done = True
     elif self.state == 'go_goal':
         if goal_distance(self.robot_state[:3],
                          self.goal_pos) > self._DIS_ERROR:
             return
     elif self.state == 'grip':
         if self.robot_state[-1] >= -.5:
             return
     self.state = self.next_state
Example #3
 def compute_reward(self, achieved_goal, goal, info):
     # Compute distance between goal and the achieved goal.
     d = goal_distance(achieved_goal, goal)
     if self.reward_type == 'sparse':
         return -(d > self.distance_threshold).astype(np.float32)
     else:
         return np.exp(-d)
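
A quick usage sketch for a reward function like the one above, assuming a Gym goal-based environment (the env id FetchReach-v1 is only an example): the sparse branch returns 0 at the goal and -1 elsewhere, while the dense branch returns exp(-d), which approaches 1 as the gripper nears the goal.

import gym

env = gym.make('FetchReach-v1')  # assumption: any goal-based Fetch task works here
obs = env.reset()
obs, _, _, info = env.step(env.action_space.sample())
# Recompute the reward for an (achieved_goal, desired_goal) pair, as
# HER-style replay buffers do when relabelling goals.
r = env.compute_reward(obs['achieved_goal'], obs['desired_goal'], info)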
Example #4
    def eval(self, model_name='', random=False):
        if not random:
            self.load_weights('pretrained/' + model_name)
        score = 0
        solve_count = 0
        tr = tqdm(range(100))
        for ep in tr:
            state = self.env.reset()
            tr.set_description("Solve percentage: {:.3f}".format(solve_count /
                                                                 (ep + 1)))
            for t in range(200):
                if random:
                    a = self.env.action_space.sample()
                else:
                    a, v = self.call(state['observation'])

                state, r, done, info = self.env.step(a)
                d = fetch_env.goal_distance(state['achieved_goal'],
                                            state['desired_goal'])
                done = d <= self.dist_thresh
                if done:
                    solve_count += 1
                    break
                score += r

        return score / 100.0
Example #5
def compute_reward(self, achieved_goal, goal, info):
    reward = 0
    completion_reward = 5  # TODO: Make tweakable hyperparam

    # Compute distance between goal and the achieved goal.
    d = goal_distance(achieved_goal, goal)
    reached_goal = d <= self.distance_threshold

    # add distance reward
    if self.reward_type == 'sparse':
        # cast explicitly: `not reached_goal` is a plain Python bool with no .astype()
        reward = -np.float32(not reached_goal)
    else:  # dense distance reward
        reward = -d

    # add completion reward
    if reached_goal:
        reward += completion_reward

    return reward


#   def reset(self):
#     ...
#   def render(self, mode='human'):
#     ...
#   def close(self):
#     ...
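
HER-style replay buffers usually call compute_reward on whole batches of relabelled goals, in which case `not reached_goal` fails on arrays. Below is a minimal vectorised sketch of the same shaping; the function name and the default threshold values are illustrative, not part of the original code.

import numpy as np


def compute_reward_batched(achieved_goal, goal, distance_threshold=0.05,
                           completion_reward=5.0):
    # Accepts a single goal pair or a batch of relabelled goal pairs.
    d = np.linalg.norm(achieved_goal - goal, axis=-1)
    reached = (d <= distance_threshold).astype(np.float32)
    sparse = reached - 1.0  # -1 until the goal is reached, then 0
    return sparse + completion_reward * reached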
Example #6
    def eval(self, env, model_name='', random=False, render=False):
        if not random:
            self.actor.model.load_weights('pretrained/' + model_name +
                                          'Actor.h5')
            self.critic.model.load_weights('pretrained/' + model_name +
                                           'Critic.h5')
        score = 0
        solve_count = 0
        tr = tqdm(range(100))
        avg_time = 0
        for ep in tr:
            state = env.reset()

            for t in range(50):
                if render:
                    env.render()
                if random:
                    a = env.action_space.sample()
                else:
                    a = self.policy_action(self.format_state(state))[0]

                state, r, done, info = env.step(a)
                d = goal_distance(state['achieved_goal'],
                                  state['desired_goal'])
                done = d <= 0.05
                if done:
                    solve_count += 1
                    break
                score += r
            tr.set_description("Solve percentage: {:.3f}".format(solve_count /
                                                                 (ep + 1)))
            avg_time += t
        print("average time to solve:", avg_time / 100.0)
        return score / 100.0
Example #7
    def step(self, action):
        action = np.clip(action, self.action_space.low, self.action_space.high)
        self._set_action(action)
        self.sim.step()
        self._step_callback()
        obs = self._get_obs()

        done = self._is_success(obs['achieved_goal'], self.goal)
        info = {
            # does not include done from TimeLimit (episode completion)
            'is_success': done,
            'dist': goal_distance(obs['achieved_goal'], self.goal),
        }
        reward = self.compute_reward(obs['achieved_goal'], self.goal, info)

        # Time penalty to encourage faster reaching
        reward_time = -0.1  # TODO: Make tweakable hyperparam
        reward = reward + reward_time
        return obs, reward, done, info
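
A short rollout sketch against a step() like this one (the environment construction and the 50-step horizon are assumptions), showing how the -0.1 time penalty accumulates until the goal is reached:

obs = env.reset()
episode_return = 0.0
for _ in range(50):
    obs, reward, done, info = env.step(env.action_space.sample())
    episode_return += reward  # includes the -0.1 per-step time penalty
    if done:                  # here done means the goal was reached
        break
print(episode_return, info['dist'])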
Example #8
 def compute_reward(self, achieved_goal, goal, info):
     """Compute goal reward"""
     d = goal_distance(achieved_goal, goal)
     return (d <= self.distance_threshold).astype(np.float32)
Example #9
            # Fragment of an evaluation loop (the enclosing function header is
            # not part of the captured snippet).
            total_reward += r

            if step % 20 == 0:
                rgb_obs = env.sim.render(width=200, height=200, camera_name="external_camera_0", depth=False,
                    mode='offscreen', device_id=-1)
                # rgb_obs1 = env.sim.render(width=200, height=200, camera_name="external_camera_1", depth=False,
                #     mode='offscreen', device_id=-1)
                plt.figure(1)
                plt.imshow(rgb_obs)
                # plt.figure(2)
                # plt.imshow(rgb_obs1)
                plt.show(block=False)
                plt.pause(0.001)

            if (not upper and 
                goal_distance(obs['eeinfo'][0][:2], obs['achieved_goal'][:2]) < 0.05 and
                obs['eeinfo'][0][-1] > obs['achieved_goal'][-1] + .01):
                upper = 1
                break
                
            if info['is_success'] or done:
                break

        # plt.figure(1)
        # plt.imshow(gif_pic/255.)
        # plt.figure(2)
        # plt.imshow(rgb_obs)
        # plt.show(block=False)
        # plt.pause(0.001)
        upper_sucess += upper
        print(i, "total reward %0.2f. success %d rate %.2f" %
              (total_reward, upper_sucess, upper_sucess / (i + 1)))
Example #10
 def _is_success(self, achieved_goal, desired_goal):
     d = goal_distance(achieved_goal, desired_goal)
     return (d < self.distance_threshold).astype(np.float32)
Example #11
 def compute_reward(self, achieved_goal, goal, info):
     # Compute distance between goal and the achieved goal.
     return -goal_distance(achieved_goal, goal)
Example #12
    def train(self,
              num_eps=100,
              render=False,
              model_start='',
              model_save='fetchReach.h5',
              custom_r=False,
              v_lr=1.0,
              p_lr=1.0,
              verbose=1):
        best_avg_model_name = 'best_avg-' + model_save
        self.p_opt = tf.train.RMSPropOptimizer(learning_rate=p_lr, epsilon=0.1)
        # self.p_opt = tf.train.AdamOptimizer(learning_rate=p_lr)

        # self.p_opt = tf.train.GradientDescentOptimizer(learning_rate=p_lr)
        # self.v_opt = tf.train.AdamOptimizer(learning_rate=v_lr)
        # self.load_weights('mtnCar.h5')
        if model_start:
            self.load_weights(model_start)

        self.num_eps = num_eps
        best_r = -float('inf')
        best_avg = -float('inf')

        num_steps = 200
        avg_r_ep = 0
        prev_actions = np.zeros(4, dtype=np.float64)

        if verbose == 0:
            ep_iter = tqdm(range(self.num_eps))
        else:
            ep_iter = range(self.num_eps)

        solve_count = 0
        for ep in ep_iter:
            tr = range(num_steps)
            if verbose == 1:
                tr = tqdm(tr)

            # tr = tqdm(itertools.count())
            state = self.env.reset()
            # self.env.distance_threshold = 5
            total_r = 0
            avg_value = 0
            actions = np.zeros(4, dtype=np.float64)
            avg = 0
            for t in tr:
                with tf.GradientTape(persistent=True) as tape:
                    a, v = self.call(state['observation'])

                    # a = list(a.numpy()) + [0]
                    actions += a
                    if render:
                        self.env.render()
                    next_state, r, done, info = self.env.step(a)
                    d = fetch_env.goal_distance(next_state['achieved_goal'],
                                                next_state['desired_goal'])
                    done = d <= self.dist_thresh and t > 1
                    if custom_r:
                        r = (self.dist_thresh / (d + 1e-6))
                        r = min(r, 1.0)

                    if done:
                        solve_count += 1
                        print('solved')
                        if custom_r:
                            r += 5
                    total_r += r
                    avg_value += v.numpy()
                    td_target = r + 0.99 * self.call(
                        next_state['observation'])[1]
                    td_error = td_target - v
                    vloss = self.get_value_loss(td_target)
                    ploss = tf.reduce_mean(self.get_policy_loss(td_error))
                    self.loss = vloss + ploss
                    # if custom_r:
                    #     self.loss *= -1
                self.update(tape)

                # grads = self.get_grads(tape, td_error, td_target)
                # self.optimizer.apply_gradients(zip(grads, self.weights))
                # "Frame Reward {:.3f} | "
                if verbose == 1:
                    tr.set_description("Ep {}/{} | "
                                       "Loss {:.3f} | "
                                       "Total Reward {:.3f} | "
                                       "Avg Value {:.3f} | "
                                       "Solve Ratio {:.3f} | "
                                       "Avg Reward/Epoch {:.3f}".format(
                                           ep + 1, self.num_eps, self.loss,
                                           total_r, avg_value / (t + 1),
                                           solve_count / (ep + 1), avg_r_ep))
                    # "Avg Reward {:.3f} | "
                if done:
                    break

                state = next_state

            if verbose == 0:
                ep_iter.set_description("Solve percentage: {:.3f}".format(
                    solve_count / (ep + 1)))

            if done and total_r < avg:
                total_r = avg + 5

            if avg_r_ep == 0:
                avg_r_ep = total_r
            else:
                avg_r_ep = avg_r_ep * 0.99 + total_r * 0.01
            if fetch_env.goal_distance(actions, prev_actions) < 0.1:
                print('Possible error: actions same as previous state {}\n'.
                      format(actions / num_steps))
            prev_actions = actions
            avg = avg_r_ep  #avg_r_ep / (ep + 1)
            if avg >= best_avg and ep > 10:
                print(
                    f'\nSaving best average model with reward of {avg} to {best_avg_model_name}'
                )
                best_avg = avg
                self.save_weights('pretrained/' + best_avg_model_name)

            if total_r >= best_r:
                print(
                    f'\nSaving best model with reward of {total_r} to {model_save}'
                )
                best_r = total_r
                self.best_weights = self.weights
                self.save_weights('pretrained/' + model_save)
        self.save_weights('pretrained/last_' + model_save)
Example #13
    def train(self, env, args):
        results = []
        num_steps = 200
        # First, gather experience
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit="episode")

        avg_r_ep = 0

        best_avg = -float('inf')
        best_score = -float('inf')

        past_samples = 15
        hist_ratio = deque(maxlen=past_samples)
        hist_scores = deque(maxlen=past_samples)
        for e in tqdm_e:
            noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.act_dim))
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            s = env.reset()
            # noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

            for _ in range(num_steps):
                if args.render: env.render()
                # Actor picks an action (following the deterministic policy)
                old_state = self.format_state(s)
                # print(old_state.shape)
                a = self.policy_action(old_state)
                # Clip continuous values to be valid w.r.t. environment
                a = np.clip(a + noise(), -self.act_range, self.act_range)
                # Retrieve new state, reward, and whether the state is terminal
                a = np.squeeze(a)
                new_state, r, done, info = env.step(a)
                dist = goal_distance(new_state['achieved_goal'],
                                     new_state['desired_goal'])
                # new_state = new_state['observation']

                # Add outputs to memory buffer
                self.store_states(s, a, r, done, info, new_state)

                s = new_state
                cumul_reward += r

                # Sample experience from buffer
                states, actions, rewards, dones, new_states = self.sample_batch(
                    args.batch_size)
                # Predict target q-values using target networks
                q_values = self.critic.target_predict(
                    [new_states,
                     self.actor.target_predict(new_states)])
                # Compute critic target
                critic_target = self.bellman(rewards, q_values, dones)
                # Train both networks on sampled batch, update target networks
                self.update_models(states, actions, critic_target)
                # Update current state

                if done:
                    break

            if avg_r_ep == 0:
                avg_r_ep = cumul_reward
            else:
                avg_r_ep = avg_r_ep * 0.99 + cumul_reward * 0.01

            if avg_r_ep >= best_avg:
                best_avg = avg_r_ep
                self.actor.model.save_weights(
                    'pretrained/best_avg_ddpgActor.h5')
                self.critic.model.save_weights(
                    'pretrained/best_avg_ddpgCritic.h5')
            # Display score
            if cumul_reward >= best_score:
                best_score = cumul_reward
                self.actor.model.save_weights('pretrained/ddpgActor.h5')
                self.critic.model.save_weights('pretrained/ddpgCritic.h5')

            hist_ratio.append(int(dist <= 0.05))
            hist_scores.append(cumul_reward)

            tqdm_e.set_description(
                "Score: {} | "
                "Best Reward: {} (avg: {:.2f})| "
                "Avg Reward, solve ratio over last {} samples: {:.3f}, {:.3f}".
                format(cumul_reward, np.amax(hist_scores),
                       avg_r_ep, past_samples, np.mean(hist_scores),
                       np.mean(hist_ratio)))
            tqdm_e.refresh()

        return results
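
Example #13 creates an OrnsteinUhlenbeckActionNoise per episode, but the class itself is not shown. Here is a minimal sketch of such a process, assuming the interface used above (constructed with mu, called with no arguments each step); the sigma, theta and dt defaults are illustrative.

import numpy as np


class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.x_prev = np.zeros_like(mu)

    def __call__(self):
        # Temporally correlated exploration noise: mean-reverting drift
        # plus Gaussian diffusion.
        self.x_prev = (self.x_prev
                       + self.theta * (self.mu - self.x_prev) * self.dt
                       + self.sigma * np.sqrt(self.dt)
                       * np.random.normal(size=self.mu.shape))
        return self.x_prev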