Example #1
    def init_demo_buffer(self, demoDataFile, update_stats=True):  # function that initializes the demo buffer

        demoData = np.load(demoDataFile)  # load the demonstration data from data file
        info_keys = [key.replace('info_', '') for key in self.input_dims.keys() if key.startswith('info_')]
        info_values = [np.empty((self.T - 1, 1, self.input_dims['info_' + key]), np.float32) for key in info_keys]

        demo_data_obs = demoData['obs']
        demo_data_acs = demoData['acs']
        demo_data_info = demoData['info']

        for epsd in range(self.num_demo):  # we initialize the whole demo buffer at the start of the training
            obs, acts, goals, achieved_goals = [], [], [], []
            i = 0
            for transition in range(self.T - 1):
                obs.append([demo_data_obs[epsd][transition].get('observation')])
                acts.append([demo_data_acs[epsd][transition]])
                goals.append([demo_data_obs[epsd][transition].get('desired_goal')])
                achieved_goals.append([demo_data_obs[epsd][transition].get('achieved_goal')])
                for idx, key in enumerate(info_keys):
                    info_values[idx][transition, i] = demo_data_info[epsd][transition][key]

            obs.append([demo_data_obs[epsd][self.T - 1].get('observation')])
            achieved_goals.append([demo_data_obs[epsd][self.T - 1].get('achieved_goal')])

            episode = dict(o=obs,
                           u=acts,
                           g=goals,
                           ag=achieved_goals)
            for key, value in zip(info_keys, info_values):
                episode['info_{}'.format(key)] = value

            episode = convert_episode_to_batch_major(episode)
            global DEMO_BUFFER
            DEMO_BUFFER.store_episode(
                episode)  # create the observation dict and append them into the demonstration buffer
            logger.debug("Demo buffer size currently ",
                         DEMO_BUFFER.get_current_size())  # print out the demonstration buffer size

            if update_stats:
                # add transitions to normalizer to normalize the demo data as well
                episode['o_2'] = episode['o'][:, 1:, :]
                episode['ag_2'] = episode['ag'][:, 1:, :]
                num_normalizing_transitions = transitions_in_episode_batch(episode)
                transitions = self.sample_transitions(episode, num_normalizing_transitions)

                o, g, ag = transitions['o'], transitions['g'], transitions['ag']
                transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
                # No need to preprocess the o_2 and g_2 since this is only used for stats

                self.o_stats.update(transitions['o'])
                self.g_stats.update(transitions['g'])

                self.o_stats.recompute_stats()
                self.g_stats.recompute_stats()
            episode.clear()

        logger.info("Demo buffer size: ", DEMO_BUFFER.get_current_size())  # print out the demonstration buffer size
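
A minimal sketch of the on-disk layout this loader expects in demoDataFile, using illustrative sizes (the shapes, the 'is_success' info key, and the output filename are assumptions): each episode provides T dict observations, T-1 actions, and T-1 per-transition info dicts, matching the indexing in the loop above.

import numpy as np

num_demo, T, obs_dim, goal_dim, act_dim = 2, 5, 10, 3, 4  # illustrative sizes only
obs = [[{'observation': np.zeros(obs_dim),
         'desired_goal': np.zeros(goal_dim),
         'achieved_goal': np.zeros(goal_dim)} for _ in range(T)]
       for _ in range(num_demo)]
acs = [[np.zeros(act_dim) for _ in range(T - 1)] for _ in range(num_demo)]
info = [[{'is_success': 0.0} for _ in range(T - 1)] for _ in range(num_demo)]

# the nested dicts become pickled object arrays, so recent NumPy versions need
# np.load(demoDataFile, allow_pickle=True) when reading the file back
np.savez_compressed('data_demo.npz', obs=obs, acs=acs, info=info)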
Example #2
    def init_demo_buffer(self, demoDataFile, update_stats=True): #function that initializes the demo buffer

        demoData = np.load(demoDataFile) #load the demonstration data from data file
        info_keys = [key.replace('info_', '') for key in self.input_dims.keys() if key.startswith('info_')]
        info_values = [np.empty((self.T - 1, 1, self.input_dims['info_' + key]), np.float32) for key in info_keys]

        demo_data_obs = demoData['obs']
        demo_data_acs = demoData['acs']
        demo_data_info = demoData['info']

        for epsd in range(self.num_demo): # we initialize the whole demo buffer at the start of the training
            obs, acts, goals, achieved_goals = [], [], [], []
            i = 0
            for transition in range(self.T - 1):
                obs.append([demo_data_obs[epsd][transition].get('observation')])
                acts.append([demo_data_acs[epsd][transition]])
                goals.append([demo_data_obs[epsd][transition].get('desired_goal')])
                achieved_goals.append([demo_data_obs[epsd][transition].get('achieved_goal')])
                for idx, key in enumerate(info_keys):
                    info_values[idx][transition, i] = demo_data_info[epsd][transition][key]


            obs.append([demo_data_obs[epsd][self.T - 1].get('observation')])
            achieved_goals.append([demo_data_obs[epsd][self.T - 1].get('achieved_goal')])

            episode = dict(o=obs,
                           u=acts,
                           g=goals,
                           ag=achieved_goals)
            for key, value in zip(info_keys, info_values):
                episode['info_{}'.format(key)] = value

            episode = convert_episode_to_batch_major(episode)
            global DEMO_BUFFER
            DEMO_BUFFER.store_episode(episode) # create the observation dict and append them into the demonstration buffer
            logger.debug("Demo buffer size currently ", DEMO_BUFFER.get_current_size()) #print out the demonstration buffer size

            if update_stats:
                # add transitions to normalizer to normalize the demo data as well
                episode['o_2'] = episode['o'][:, 1:, :]
                episode['ag_2'] = episode['ag'][:, 1:, :]
                num_normalizing_transitions = transitions_in_episode_batch(episode)
                transitions = self.sample_transitions(episode, num_normalizing_transitions)

                o, g, ag = transitions['o'], transitions['g'], transitions['ag']
                transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
                # No need to preprocess the o_2 and g_2 since this is only used for stats

                self.o_stats.update(transitions['o'])
                self.g_stats.update(transitions['g'])

                self.o_stats.recompute_stats()
                self.g_stats.recompute_stats()
            episode.clear()

        logger.info("Demo buffer size: ", DEMO_BUFFER.get_current_size()) #print out the demonstration buffer size
Example #3
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """

        directory_plot = '../shadow-hand-obervation-plot/shadow-hand-obs-png/' + datetime.datetime.now(
        ).strftime("%m%d_%H%M%S") + os.sep
        directory_env = '../shadow-hand-observation-env/shadow-hand-env-png/' + datetime.datetime.now(
        ).strftime("%m%d_%H%M%S") + os.sep
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']),
                     np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']),
                      np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []

        info_values = [
            np.empty(
                (self.T, self.rollout_batch_size, self.dims['info_' + key]),
                np.float32) for key in self.info_keys
        ]
        Qs = []
        x_bar = [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
            20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
            37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
            54, 55, 56, 57, 58, 59, 60, 61
        ]
        x_lab = [
            "WR-J1-qpos", "WR-J0-qpos", "FF-J3-qpos", "FF-J2-qpos",
            "FF-J1-qpos", "FF-J0-qpos", "MF-J3-qpos", "MF-J2-qpos",
            "MF-J1-qpos", "MF-J0-qpos", "RF-J3-qpos", "RF-J2-qpos",
            "RF-J1-qpos", "RF-J0-qpos", "LF-J4-qpos", "LF-J3-qpos",
            "LF-J2-qpos", "LF-J1-qpos", "LF-J0-qpos", "TH-J4-qpos",
            "TH-J3-qpos", "TH-J2-qpos", "TH-J1-qpos", "TH-J0-qpos",
            "WR-J1-qvel", "WR-J0-qvel", "FF-J3-qvel", "FF-J2-qvel",
            "FF-J1-qvel", "FF-J0-qvel", "MF-J3-qvel", "MF-J2-qvel",
            "MF-J1-qvel", "MF-J0-qvel", "RF-J3-qvel", "RF-J2-qvel",
            "RF-J1-qvel", "RF-J0-qvel", "LF-J4-qvel", "LF-J3-qvel",
            "LF-J2-qvel", "LF-J1-qvel", "LF-J0-qvel", "TH-J4-qvel",
            "TH-J3-qvel", "TH-J2-qvel", "TH-J1-qvel", "TH-J0-qvel",
            "object_qvel-0", "object_qvel-1", "object_qvel-2", "object_qvel-3",
            "object_qvel-4", "object_qvel-5", "achieved_goal-0",
            "achieved_goal-1", "achieved_goal-2", "achieved_goal-3",
            "achieved_goal-4", "achieved_goal-5", "achieved_goal-6"
        ]

        # Lists used for appending values from the episode
        observation_catcher = []
        observation_catcher_1 = []
        observation_catcher_2 = []
        observation_catcher_3 = []
        # this is the max append used for the observation parameters qpos and qvel
        observation_catcher_4 = []
        # this is the max append used for the observation parameter object qvel
        observation_catcher_5 = []
        # this is the max append used for the observation parameter achieved goal
        observation_catcher_6 = []

        # Lists used for appending values from the csv file
        observation_catcher_f0 = []
        observation_catcher_f1 = []
        observation_catcher_f2 = []
        observation_catcher_f3 = []
        # this is the max append used for the observation parameters qpos and qvel from the csv file
        observation_catcher_f4 = []
        # this is the max append used for the observation parameter object qvel from the csv file
        observation_catcher_f5 = []
        # this is the max append used for the observation parameter achieved goal from the csv file
        observation_catcher_f6 = []
        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o,
                ag,
                self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            #u_check = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] # used to check the observation value changes --> replace in step with u_check instead of u[i]

            # compute new states and observations
            for i in range(self.rollout_batch_size):
                try:
                    # We fully ignore the reward here because it will have to be re-computed
                    # for HER.
                    curr_o_new, _, _, info = self.envs[i].step(
                        u[i])  #u[i] & u_check
                    if 'is_success' in info:
                        success[i] = info['is_success']
                    o_new[i] = curr_o_new['observation']
                    ag_new[i] = curr_o_new['achieved_goal']

                    for idx, key in enumerate(self.info_keys):
                        info_values[idx][t, i] = info[key]
                    if self.render:
                        self.envs[i].render()
                    elif self.rendder_and_save_png:  #ndrw
                        rgb_array = self.envs[i].render(mode='rgb_array')
                        im = Image.fromarray(rgb_array)
                        # the crop setting needs to be changed as per the viewing direction;
                        # the required parameters are present in the resource file
                        lov = im.crop((230, 180, 780, 730))
                        # used to append the required values from the observation vector (0-60)
                        observation_catcher.append(o_new[i][0])
                        observation_catcher_1.append(o_new[i][1])
                        observation_catcher_2.append(o_new[i][2])
                        observation_catcher_3.append(o_new[i][3])
                        observation_catcher_4.append(o_new[i][4])
                        observation_catcher_5.append(o_new[i][5])
                        observation_catcher_6.append(o_new[i][6])

                        # read the csv file which has all the observation space values of the shadow hand
                        with open('two_finger_ac_values/foo_0.csv',
                                  'r') as readcsv:
                            plots = csv.reader(readcsv, delimiter=',')
                            for row in plots:
                                # used to append the required values from the observation vector (0-60) csv file
                                observation_catcher_f0.append(float(row[0]))
                                observation_catcher_f1.append(float(row[1]))
                                observation_catcher_f2.append(float(row[2]))
                                observation_catcher_f3.append(float(row[3]))
                                observation_catcher_f4.append(float(row[4]))
                                observation_catcher_f5.append(float(row[5]))
                                observation_catcher_f6.append(float(row[6]))

                        ax1.clear()
                        ax1.axvline(len(observation_catcher) - 1,
                                    ymin=-1,
                                    ymax=1,
                                    color='k',
                                    linestyle=':',
                                    linewidth=3)  # marker line for each step
                        ax1.plot(observation_catcher_f0,
                                 color='xkcd:coral',
                                 linewidth=4,
                                 label="achieved_goal-0"
                                 )  # full curve for all steps
                        ax1.plot(observation_catcher_f1,
                                 color='xkcd:green',
                                 linewidth=4,
                                 label="achieved_goal-1")
                        ax1.plot(observation_catcher_f2,
                                 color='xkcd:goldenrod',
                                 linewidth=4,
                                 label="achieved_goal-2")
                        ax1.plot(observation_catcher_f3,
                                 color='xkcd:orchid',
                                 linewidth=4,
                                 label="achieved_goal-3")
                        ax1.plot(observation_catcher_f4,
                                 color='xkcd:azure',
                                 linewidth=4,
                                 label="achieved_goal-4")
                        ax1.plot(observation_catcher_f5,
                                 color='xkcd:orangered',
                                 linewidth=4,
                                 label="achieved_goal-5")
                        ax1.plot(observation_catcher_f6,
                                 color='xkcd:tan',
                                 linewidth=4,
                                 label="achieved_goal-6")
                        ax1.plot(
                            observation_catcher,
                            'o',
                            color='xkcd:coral',
                            markevery=[-1],
                            markersize=10,
                            markeredgecolor='k')  # ball marker for each step
                        ax1.plot(observation_catcher_1,
                                 'o',
                                 color='xkcd:green',
                                 markevery=[-1],
                                 markersize=10,
                                 markeredgecolor='k')
                        ax1.plot(observation_catcher_2,
                                 'o',
                                 color='xkcd:goldenrod',
                                 markevery=[-1],
                                 markersize=10,
                                 markeredgecolor='k')
                        ax1.plot(observation_catcher_3,
                                 'o',
                                 color='xkcd:orchid',
                                 markevery=[-1],
                                 markersize=10,
                                 markeredgecolor='k')
                        ax1.plot(observation_catcher_4,
                                 'o',
                                 color='xkcd:azure',
                                 markevery=[-1],
                                 markersize=10,
                                 markeredgecolor='k')
                        ax1.plot(observation_catcher_5,
                                 'o',
                                 color='xkcd:orangered',
                                 markevery=[-1],
                                 markersize=10,
                                 markeredgecolor='k')
                        ax1.plot(observation_catcher_6,
                                 'o',
                                 color='xkcd:tan',
                                 markevery=[-1],
                                 markersize=10,
                                 markeredgecolor='k')

                        ax1.set_xlabel('Time-Step', fontsize=15)
                        ax1.set_ylabel('Observation-Values', fontsize=15)
                        ax1.set_title(
                            'Observation Vector Of The Shadow Hand (NN-Input)',
                            fontsize=18,
                            loc="left")
                        ax1.legend(loc='upper right',
                                   facecolor='#74dd93',
                                   frameon=False,
                                   fontsize='large',
                                   ncol=3,
                                   bbox_to_anchor=(1.03, 1.27))
                        ax1.set_facecolor('#74dd93')
                        ax1.set_xlim(xmin=-1)
                        ax1.set_xlim(xmax=99)
                        #ax1.set_ylim(ymin=-1.05) # default value --> should be checked according the y min in observed value - hard coded
                        #ax1.set_ylim(ymax=1.1)  # default value --> should be checked according the y max in observed value - hard coded

                        ax2.clear()
                        barlist = ax2.bar(x_bar,
                                          color='xkcd:silver',
                                          width=0.6,
                                          height=0.025)
                        barlist[0].set_color('xkcd:coral')
                        barlist[1].set_color('xkcd:green')
                        barlist[2].set_color('xkcd:goldenrod')
                        barlist[3].set_color('xkcd:orchid')
                        barlist[4].set_color('xkcd:azure')
                        barlist[5].set_color('xkcd:orangered')
                        barlist[6].set_color('xkcd:tan')
                        ax2.set_yticklabels([])
                        ax2.set_xticks(x_bar)
                        ax2.set_xticklabels(x_lab, rotation=90, fontsize=11)
                        ax2.set_facecolor('#74dd93')
                        ax2.set_frame_on(False)
                        ax2.axes.get_yaxis().set_visible(False)
                        if not os.path.exists(directory_plot):
                            os.makedirs(directory_plot)
                        if not os.path.exists(directory_env):
                            os.makedirs(directory_env)
                        plt.savefig(directory_plot +
                                    "pic_{0:05d}.png".format(t),
                                    facecolor=fig.get_facecolor(),
                                    edgecolor='none')
                        lov.save(directory_env + "pic_{0:05d}.png".format(t))
                except MujocoException as e:
                    return self.generate_rollouts()

            if np.isnan(o_new).any():
                self.logger.warning(
                    'NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new
        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        self.initial_o[:] = o

        # the commented-out snippet below writes the observation values to a csv file using Python's zip and csv modules
        #with open('two_finger_ac_values/foo_0.csv','w') as csvfile:
        #values = csv.writer(csvfile)
        #values.writerows(zip(observation_catcher,observation_catcher_1,observation_catcher_2,observation_catcher_3,observation_catcher_4,observation_catcher_5,observation_catcher_6,observation_catcher_7,observation_catcher_8,observation_catcher_9,observation_catcher_10,observation_catcher_11,observation_catcher_12,observation_catcher_13,observation_catcher_14,observation_catcher_15,observation_catcher_16,observation_catcher_17,observation_catcher_18,observation_catcher_19,observation_catcher_20,observation_catcher_21,observation_catcher_22,observation_catcher_23,observation_catcher_24,observation_catcher_25,observation_catcher_26,observation_catcher_27,observation_catcher_28,observation_catcher_29,observation_catcher_30,observation_catcher_31,observation_catcher_32,observation_catcher_33,observation_catcher_34,observation_catcher_35,observation_catcher_36,observation_catcher_37,observation_catcher_38,observation_catcher_39,observation_catcher_40,observation_catcher_41,observation_catcher_42,observation_catcher_43,observation_catcher_44,observation_catcher_45,observation_catcher_46,observation_catcher_47,observation_catcher_48,observation_catcher_49,observation_catcher_50,observation_catcher_51,observation_catcher_52,observation_catcher_53,observation_catcher_54,observation_catcher_55,observation_catcher_56,observation_catcher_57,observation_catcher_58,observation_catcher_59,observation_catcher_60))
        #csvfile.close()
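        # A hypothetical, more compact variant of the snippet above: gather the catcher
        # lists into one sequence and let zip() transpose them into csv rows. Only the
        # seven catchers defined in this file are listed here; the original snippet
        # assumes 61 of them (one per observation dimension).
        # catchers = [observation_catcher, observation_catcher_1, observation_catcher_2,
        #             observation_catcher_3, observation_catcher_4, observation_catcher_5,
        #             observation_catcher_6]
        # with open('two_finger_ac_values/foo_0.csv', 'w', newline='') as csvfile:
        #     csv.writer(csvfile).writerows(zip(*catchers))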

        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value
        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size, )
        success_rate = np.mean(successful)
        self.success_history.append(success_rate)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size
        return convert_episode_to_batch_major(episode)
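
For context, a hedged sketch of how the batch-major episode returned above is typically consumed in a baselines-style HER training loop (rollout_worker, policy, and n_batches are assumed names from the surrounding training code, not part of this snippet):

def run_her_cycle(rollout_worker, policy, n_batches):
    """Sketch of one training cycle built around generate_rollouts()."""
    episode = rollout_worker.generate_rollouts()
    policy.store_episode(episode)  # push the rollout into the HER replay buffer
    for _ in range(n_batches):
        policy.train()  # sample (HER-relabelled) transitions and update the networks
    policy.update_target_net()
    return episode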
Example #4
    def initDemoBuffer(self, demoDataFile, update_stats=True):

        demoData = np.load(demoDataFile)
        info_keys = [
            key.replace('info_', '') for key in self.input_dims.keys()
            if key.startswith('info_')
        ]
        info_values = [
            np.empty((self.T, self.rollout_batch_size,
                      self.input_dims['info_' + key]), np.float32)
            for key in info_keys
        ]

        for epsd in range(self.num_demo):
            obs, acts, goals, achieved_goals = [], [], [], []
            i = 0
            for transition in range(self.T):
                obs.append(
                    [demoData['obs'][epsd][transition].get('observation')])
                acts.append([demoData['acs'][epsd][transition]])
                goals.append(
                    [demoData['obs'][epsd][transition].get('desired_goal')])
                achieved_goals.append(
                    [demoData['obs'][epsd][transition].get('achieved_goal')])
                for idx, key in enumerate(info_keys):
                    info_values[idx][
                        transition,
                        i] = demoData['info'][epsd][transition][key]

            obs.append([demoData['obs'][epsd][self.T].get('observation')])
            achieved_goals.append(
                [demoData['obs'][epsd][self.T].get('achieved_goal')])

            episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
            for key, value in zip(info_keys, info_values):
                episode['info_{}'.format(key)] = value

            episode = convert_episode_to_batch_major(episode)
            global demoBuffer
            demoBuffer.store_episode(episode)

            print("Demo buffer size currently ", demoBuffer.get_current_size())

            if update_stats:
                # add transitions to normalizer to normalize the demo data as well
                episode['o_2'] = episode['o'][:, 1:, :]
                episode['ag_2'] = episode['ag'][:, 1:, :]
                num_normalizing_transitions = transitions_in_episode_batch(
                    episode)
                transitions = self.sample_transitions(
                    episode, num_normalizing_transitions)

                o, o_2, g, ag = transitions['o'], transitions[
                    'o_2'], transitions['g'], transitions['ag']
                transitions['o'], transitions['g'] = self._preprocess_og(
                    o, ag, g)
                # No need to preprocess the o_2 and g_2 since this is only used for stats

                self.o_stats.update(transitions['o'])
                self.g_stats.update(transitions['g'])

                self.o_stats.recompute_stats()
                self.g_stats.recompute_stats()
            episode.clear()
Example #5
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']),
                     np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']),
                      np.float32)  # achieved goals
        o[:] = self.initial_o.astype(np.float32)
        ag[:] = self.initial_ag.astype(np.float32)

        #Add initial states as "achieved goals"
        hashcode = self.countTracker.compute_hash_code(
            self.initial_ag[0].astype(np.float32))
        self.countTracker.update_count(hashcode)
        if len(self.initial_ag) > 1:
            hashcode_2 = self.countTracker.compute_hash_code(
                self.initial_ag[1].astype(np.float32))
            self.countTracker.update_count(hashcode_2)

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        info_values = [
            np.empty(
                (self.T, self.rollout_batch_size, self.dims['info_' + key]),
                np.float32) for key in self.info_keys
        ]
        Qs = []
        # time horizon = number of states achieved (50), subtracting off the initial state;
        # in the grand scheme of things, the initial state should also count as an achieved goal, so this should be 51
        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o,
                ag,
                self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            # rollout_batch_size is 2 by default; the goal has 3 dimensions
            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            # compute new states and observations
            for i in range(self.rollout_batch_size):
                try:
                    # We fully ignore the reward here because it will have to be re-computed
                    # for HER.
                    #self.envs[i].step(u[i]) contains all of the environment feedback
                    #The above statement returns an observation key with all observation
                    #information (multiple values), an achieved goal key with a 3-D point, a
                    #desired goal key with a 3-D point, and an is_success key with a boolean value
                    curr_o_new, _, _, info = self.envs[i].step(u[i])
                    if 'is_success' in info:
                        success[i] = info['is_success']
                    o_new[i] = curr_o_new['observation'].astype(np.float32)
                    ag_new[i] = curr_o_new['achieved_goal'].astype(np.float32)

                    hashcode = self.countTracker.compute_hash_code(
                        curr_o_new['achieved_goal'].astype(np.float32))
                    self.countTracker.update_count(hashcode)

                    for idx, key in enumerate(self.info_keys):
                        info_values[idx][t, i] = info[key]
                    if self.render:
                        self.envs[i].render()
                except MujocoException as e:
                    return self.generate_rollouts()

            if np.isnan(o_new).any():
                self.logger.warning(
                    'NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new

        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        self.initial_o[:] = o

        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)

        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size, )
        success_rate = np.mean(successful)
        self.success_history.append(success_rate)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
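
The countTracker used in this variant is not defined in the snippet; below is a minimal sketch of a count-based tracker that is consistent with the compute_hash_code / update_count calls above (the class name, discretization scheme, and granularity are assumptions):

from collections import defaultdict

import numpy as np


class CountTrackerSketch:
    """Counts visits to discretized achieved-goal states (illustrative only)."""

    def __init__(self, granularity=0.05):
        self.granularity = granularity
        self.counts = defaultdict(int)

    def compute_hash_code(self, achieved_goal):
        # round each coordinate onto a grid and hash the resulting cell
        cell = np.round(np.asarray(achieved_goal, dtype=np.float32) / self.granularity)
        return hash(tuple(cell.astype(int).tolist()))

    def update_count(self, hashcode):
        self.counts[hashcode] += 1

    def get_count(self, hashcode):
        return self.counts[hashcode]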
Example #6
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']),
                     np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']),
                      np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        dones = []
        info_values = [
            np.empty((self.T - 1, self.rollout_batch_size,
                      self.dims['info_' + key]), np.float32)
            for key in self.info_keys
        ]
        Qs = []
        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o,
                ag,
                self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            # print("Rollout. o_new={}, ag_new={},success={}".format(o_new,ag_new,success))
            # compute new states and observations
            obs_dict_new, _, done, info = self.venv.step(u)
            # print("HERE")
            # print("#########Debug##########")
            o_new = obs_dict_new['observation']
            # print("observation high : {}".format(o_new))
            ag_new = obs_dict_new['achieved_goal']
            success = np.array([i.get('is_success', 0.0) for i in info])

            if any(done):
                # here we assume all environments are done in ~the same number of steps, so we terminate rollouts whenever any of the envs returns done
                # the trick with using vecenvs is not to add the obs from the environments that are "done", because those are already observations
                # after a reset
                break

            for i, info_dict in enumerate(info):
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[i][key]

            if np.isnan(o_new).any():
                self.logger.warn(
                    'NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            dones.append(done)
            obs.append(o.copy())
            # print("############## obs = {}".format(obs))
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new
        obs.append(o.copy())
        achieved_goals.append(ag.copy())

        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size, )
        success_rate = np.mean(successful)
        self.success_history.append(success_rate)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
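
This variant steps a single vectorized environment (self.venv) instead of a list of envs; below is a minimal sketch of the interface that usage assumes, i.e. a dict-observation vec-env whose step() returns batched observations, rewards, dones, and a list of info dicts (the class is illustrative, not the actual VecEnv used):

import numpy as np


class DictObsVecEnvSketch:
    """Illustrative stand-in for the vectorized, dict-observation env stepped above."""

    def __init__(self, envs):
        self.envs = envs  # gym-style envs returning dict observations

    def step(self, actions):
        results = [env.step(a) for env, a in zip(self.envs, actions)]
        obs, rewards, dones, infos = zip(*results)
        # stack each observation key into a (num_envs, dim) array, as the rollout code expects
        obs_dict = {key: np.stack([o[key] for o in obs]) for key in obs[0]}
        return obs_dict, np.array(rewards), np.array(dones), list(infos)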
Example #7
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']),
                     np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']),
                      np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes, successes2, successes3 = [], [], [], [], [], [], []
        dones = []
        info_values = [
            np.empty((self.T - 1, self.rollout_batch_size,
                      self.dims['info_' + key]), np.float32)
            for key in self.info_keys
        ]
        Qs = []
        Fs = []
        Ks = []
        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o,
                ag,
                self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
                Fs.append(
                    np.abs(
                        np.float32((o[:, 11:13] * (o[:, 11:13] < 0.0)).sum(
                            axis=-1))).mean())  # block
                # Fs.append(np.abs(np.float32([e.env.prev_oforce for e in self.venv.envs])).mean()) # chip
                # Ks.append(np.abs(np.float32(o[:,13].sum(axis=-1))).mean()) # block 6D
                Ks.append(0.25)  # block 4D, chip 3D
                # Ks.append(np.abs(np.float32(o[:,14].sum(axis=-1))).mean()) # chip 5D
            else:
                u = policy_output
            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            success2 = np.zeros(self.rollout_batch_size)

            # compute new states and observations
            obs_dict_new, _, done, info = self.venv.step(u)
            # self.venv.render()
            o_new = obs_dict_new['observation']
            ag_new = obs_dict_new['achieved_goal']
            success = np.array([i.get('is_success', 0.0) for i in info])
            success2 = (np.float32(o[:, 11:13].sum(axis=-1)) * 1000.0 > -300.0
                        )  # block -147.15/3*6
            # success2 = (np.float32(self.venv.envs[0].env.prev_oforce < self.venv.envs[0].env.object_fragility)) # chip

            if any(done):
                # here we assume all environments are done in ~the same number of steps, so we terminate rollouts whenever any of the envs returns done
                # the trick with using vecenvs is not to add the obs from the environments that are "done", because those are already observations
                # after a reset
                break

            for i, info_dict in enumerate(info):
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[i][key]

            if np.isnan(o_new).any():
                self.logger.warn(
                    'NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            dones.append(done)
            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            successes2.append(success2.copy())

            # successes3.append(success3.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new
#        print("--------------------New Rollout--------------------")
        obs.append(o.copy())
        achieved_goals.append(ag.copy())

        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        successful2 = np.array(successes2)
        assert successful.shape == (self.rollout_batch_size, )
        success_rate = np.mean(successful)
        success_rate2 = np.mean(successful2.mean(axis=0))
        success_rate3 = np.mean(successful2.min(axis=0) * successful)
        self.success_history.append(success_rate)
        self.success_history2.append(success_rate2)
        self.success_history3.append(success_rate3)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
            self.F_history.append(np.mean(Fs))
            self.K_history.append(np.mean(Ks))
        self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
Example #8
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']),
                     np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']),
                      np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        info_values = [
            np.empty(
                (self.T, self.rollout_batch_size, self.dims['info_' + key]),
                np.float32) for key in self.info_keys
        ]
        Qs = []
        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o,
                ag,
                self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            # compute new states and observations
            for i in range(self.rollout_batch_size):
                rewards = []
                infos = []
                try:
                    # We fully ignore the reward here because it will have to be re-computed
                    # for HER.
                    curr_o_new, reward, _, info = self.envs[i].step(u[i])
                    rewards.append(reward)
                    infos.append(info)
                    if 'is_success' in info:
                        success[i] = info['is_success']
                    o_new[i] = curr_o_new['observation']
                    ag_new[i] = curr_o_new['achieved_goal']
                    for idx, key in enumerate(self.info_keys):
                        info_values[idx][t, i] = info[key]

                    env = self.envs[i]  # type: PushEnv
                    env.render_camera = "camera_side"
                    img_left = env.render(mode='rgb_array')
                    env.render_camera = "camera_topdown"

                    img_center = env.render(mode='rgb_array')
                    # render rewards
                    img_center[-lower_part, :10] = orange
                    img_center[-lower_part, -10:] = orange
                    if TRAJ_SIZE < 512:
                        p_rew_x = 0
                        for j, r in enumerate(rewards):
                            rew_x = int(j * width_factor)
                            if r < 0:
                                color = blue if infos[j]["grasped"] else red
                                img_center[-1:, p_rew_x:rew_x] = color
                                img_center[-1:, p_rew_x:rew_x] = color
                            else:
                                rew_y = int(r / max_reward * lower_part)
                                color = blue if infos[j]["grasped"] else orange
                                img_center[-rew_y - 1:, p_rew_x:rew_x] = color
                                img_center[-rew_y - 1:, p_rew_x:rew_x] = color
                            p_rew_x = rew_x
                    else:
                        for j, r in enumerate(rewards):
                            rew_x = int(j * width_factor)
                            if r < 0:
                                color = blue if infos[j]["grasped"] else red
                                img_center[-1:, rew_x] = color
                                img_center[-1:, rew_x] = color
                            else:
                                rew_y = int(r / max_reward * lower_part)
                                color = blue if infos[j]["grasped"] else orange
                                img_center[-rew_y - 1:, rew_x] = color
                                img_center[-rew_y - 1:, rew_x] = color

                    env.render_camera = "camera_front"
                    img_right = env.render(mode='rgb_array')
                    img_left = Image.fromarray(np.uint8(img_left))
                    draw_left = ImageDraw.Draw(img_left)
                    draw_left.text((20, 20),
                                   "Batch %i" % (i + 1),
                                   fill="black",
                                   font=font)
                    img_right = Image.fromarray(np.uint8(img_right))
                    draw_right = ImageDraw.Draw(img_right)
                    draw_right.text((20, 20),
                                    "Step %i" % info["l"],
                                    fill="black",
                                    font=font)
                    self.video.append(
                        np.hstack((np.array(img_left), np.array(img_center),
                                   np.array(img_right))))

                except MujocoException as e:
                    return self.generate_rollouts()

            if np.isnan(o_new).any():
                self.logger.warning(
                    'NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new
        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        self.initial_o[:] = o

        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size, )
        success_rate = np.mean(successful)
        self.success_history.append(success_rate)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size

        imageio.mimsave('play_her.mp4', self.video, fps=20)

        return convert_episode_to_batch_major(episode)
Example #9
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys]
        Qs = []
        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o, ag, self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            # compute new states and observations
            for i in range(self.rollout_batch_size):
                try:
                    # We fully ignore the reward here because it will have to be re-computed
                    # for HER.
                    curr_o_new, _, _, info = self.envs[i].step(u[i])
                    if 'is_success' in info:
                        success[i] = info['is_success']
                    o_new[i] = curr_o_new['observation']
                    ag_new[i] = curr_o_new['achieved_goal']
                    for idx, key in enumerate(self.info_keys):
                        info_values[idx][t, i] = info[key]
                    if self.render:
                        self.envs[i].render()
                except MujocoException as e:
                    return self.generate_rollouts()

            if np.isnan(o_new).any():
                self.logger.warning('NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new
        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        self.initial_o[:] = o

        episode = dict(o=obs,
                       u=acts,
                       g=goals,
                       ag=achieved_goals)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size,)
        success_rate = np.mean(successful)
        self.success_history.append(success_rate)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes, successes2 = [], [], [], [], [], []
        dones = []
        info_values = [np.empty((self.T-1, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys]
        Qs = []
        Fs = []
        Ks = []
        
        with Session(bitfile="SCPC-lv-noFIFO_FPGATarget_FPGAmainepos_1XvgQEcJVeE.lvbitx", resource="rio://10.157.23.150/RIO0") as session:
            act_Rj1=session.registers['Mod3/AO0']
            enc_Rj1=session.registers['Rj1']
            act_Rj2=session.registers['Mod3/AO1']
            enc_Rj2=session.registers['Rj2']
            
            act_Lj1=session.registers['Mod3/AO3']
            enc_Lj1=session.registers['Lj1']
            act_Lj2=session.registers['Mod3/AO4']
            enc_Lj2=session.registers['Lj2']
            
            sen_f=session.registers['fsensor']
            sen_e=session.registers['fencoder']
            
            emergency = False
            
            Re1 = enc_Rj1.read()
            Re2 = enc_Rj2.read()
            Le1 = enc_Lj1.read()
            Le2 = enc_Lj2.read()
            
            f_sensor = 5.1203 * sen_f.read() - 5.2506
            e_sensor = (((sen_e.read()) - (f_sensor / 100.0 * 0.15)) -2.9440)/0.0148
            
            Rj = self.R_j_inv * self.R_e * np.array([[Re1-self.offset[0]],[-Re2+self.offset[1]]]) * np.pi/180.0
            Lj = self.R_j_inv_L * self.R_e * np.array([[Le1-self.offset[2]],[Le2-self.offset[3]]]) * np.pi/180.0
            
            Prev_Rj = Rj
            Prev_Lj = Lj
            
            xR = self.L1 * np.cos(Rj[0,0] + np.pi/2.0) + self.L2 * np.cos(Rj[0,0]-Rj[1,0] + np.pi/2.0)
            yR = self.L1 * np.sin(Rj[0,0] + np.pi/2.0) + self.L2 * np.sin(Rj[0,0]-Rj[1,0] + np.pi/2.0)
            xL = self.L1 * np.cos(Lj[0,0] + np.pi/2.0) + self.L2 * np.cos(Lj[0,0]+Lj[1,0] + np.pi/2.0)
            yL = self.L1 * np.sin(Lj[0,0] + np.pi/2.0) + self.L2 * np.sin(Lj[0,0]+Lj[1,0] + np.pi/2.0)
            
            P_R = np.array([xR, yR])
            P_L = np.array([xL, yL])
            Prel_R = self.Pc_R - P_R
            Prel_L = self.Pc_L - P_L
            l_R = np.sqrt(Prel_R[0]*Prel_R[0] + Prel_R[1]*Prel_R[1])
            l_L = np.sqrt(Prel_L[0]*Prel_L[0] + Prel_L[1]*Prel_L[1])
            p_R = np.array([[l_R],[np.arctan2(-Prel_R[1],-Prel_R[0])]])
            p_L = np.array([[l_L],[np.arctan2(Prel_L[1],Prel_L[0])]])
            
            
            for t in range(self.T):
                policy_output = self.policy.get_actions(
                    o, ag, self.g,
                    compute_Q=self.compute_Q,
                    noise_eps=self.noise_eps if not self.exploit else 0.,
                    random_eps=self.random_eps if not self.exploit else 0.,
                    use_target_net=self.use_target_net)
    
                if self.compute_Q:
                    u, Q = policy_output
                    Qs.append(Q)
                    Fs.append(f_sensor)
                    Ks.append(self.stiffness)
                else:
                    u = policy_output
                if u.ndim == 1:
                    # The non-batched case should still have a reasonable shape.
                    u = u.reshape(1, -1)
    
                o_new = np.empty((self.rollout_batch_size, self.dims['o']))
                ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
                success = np.zeros(self.rollout_batch_size)
                success2 = np.zeros(self.rollout_batch_size)
                
                # compute new states and observations
                self.stiffness_lim = np.clip(self.stiffness_lim + 0.2 * u[0][3], 0.1, 1.0)
                self.stiffness = np.clip(self.stiffness + 0.2 * u[0][2], 0, self.stiffness_lim)
                u[0][0] = np.clip(u[0][0], -self.l_step_limit*14.0, self.l_step_limit*14.0)
                u[0][1] = np.clip(u[0][1], -8.0*self.th_step_limit*2.0, self.th_step_limit*2.0)
                
                if emergency == False:
                    vel_R = Rj - Prev_Rj
                    vel_L = Lj - Prev_Lj
                    if vel_R[0,0] > self.vel_limit or vel_R[0,0] < -self.vel_limit or vel_L[0,0] > self.vel_limit or vel_L[0,0] < -self.vel_limit:
                        emergency = True
                        r = np.array([[self.stiffness], [1.0]]) * 0.5
                        print("***************Robot going insane! Safety on!***************")
                    else:
                        r = np.array([[self.stiffness], [1.0]])
                else:
                    r = np.array([[self.stiffness], [1.0]]) * 0.5
                
                des_p_R = np.array([[np.min([np.max([p_R[0,0] + u[0][0]/14.0, -self.l_limit/2.0]), self.l_limit/4.0])], [np.min([np.max([p_R[1,0] + u[0][1]/2.0, -self.th_limit]), self.th_limit])]])
                des_p_L = np.array([[np.min([np.max([p_L[0,0] + u[0][0]/14.0, -self.l_limit/2.0]), self.l_limit/4.0])], [np.min([np.max([p_L[1,0] + u[0][1]/2.0, -self.th_limit]), self.th_limit])]])
                
                Jp_R = np.matrix([[-Prel_R[0]/l_R, -Prel_R[1]/l_R],[Prel_R[1]/l_R/l_R, -Prel_R[0]/l_R/l_R]])
                Jp_L = np.matrix([[-Prel_L[0]/l_L, -Prel_L[1]/l_L],[Prel_L[1]/l_L/l_L, -Prel_L[0]/l_L/l_L]])
                det_Jp_R = Jp_R[0,0]*Jp_R[1,1] - Jp_R[0,1]*Jp_R[1,0]
                det_Jp_L = Jp_L[0,0]*Jp_L[1,1] - Jp_L[0,1]*Jp_L[1,0]
                Jp_inv_R = np.matrix([[Jp_R[1,1], -Jp_R[0,1]], [-Jp_R[1,0], Jp_R[0,0]]]) / det_Jp_R
                Jp_inv_L = np.matrix([[Jp_L[1,1], -Jp_L[0,1]], [-Jp_L[1,0], Jp_L[0,0]]]) / det_Jp_L
                J_R = np.matrix([[-yR, self.L2 * np.cos(Rj[0,0]-Rj[1,0])], 
                                 [xR, self.L2 * np.sin(Rj[0,0]-Rj[1,0])]])
                J_L = np.matrix([[-yL, -self.L2 * np.cos(Lj[0,0]+Lj[1,0])], 
                                 [xL, -self.L2 * np.sin(Lj[0,0]+Lj[1,0])]])
                det_J_R = J_R[0,0]*J_R[1,1] - J_R[0,1]*J_R[1,0]
                det_J_L = J_L[0,0]*J_L[1,1] - J_L[0,1]*J_L[1,0]
                J_inv_R = np.matrix([[J_R[1,1], -J_R[0,1]], [-J_R[1,0], J_R[0,0]]]) / det_J_R
                J_inv_L = np.matrix([[J_L[1,1], -J_L[0,1]], [-J_L[1,0], J_L[0,0]]]) / det_J_L
                
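                # The block below appears to map the series-elastic stiffness (2*Ksc) from
                # motor space into joint space (R^T K R), then into Cartesian space via the
                # arm Jacobians (J^-T K J^-1), and finally into the polar task coordinates
                # (l, theta); off-diagonal terms are zeroed to decouple the two directions.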
                max_kj_R = np.transpose(self.R_j) * np.matrix([[2*self.Ksc, 0],[0, 2*self.Ksc]]) * self.R_j
                max_kj_L = np.transpose(self.R_j_L) * np.matrix([[2*self.Ksc, 0],[0, 2*self.Ksc]]) * self.R_j_L
                max_k_R = np.transpose(J_inv_R) * max_kj_R * J_inv_R
                max_k_L = np.transpose(J_inv_L) * max_kj_L * J_inv_L
                max_kp_R = np.transpose(Jp_inv_R) * max_k_R * Jp_inv_R
                max_kp_L = np.transpose(Jp_inv_L) * max_k_L * Jp_inv_L
                max_kp_R[0,1] = 0.0
                max_kp_R[1,0] = 0.0
                max_kp_L[0,1] = 0.0
                max_kp_L[1,0] = 0.0
                des_Fp_R = max_kp_R * (r * (des_p_R - p_R)) * 0.9
                des_Fp_L = max_kp_L * (r * (des_p_L - p_L)) * 0.9
                des_F_R = np.transpose(Jp_R) * des_Fp_R
                des_F_L = np.transpose(Jp_L) * des_Fp_L
                des_tau_R = np.transpose(J_R) * des_F_R
                des_tau_L = np.transpose(J_L) * des_F_L
                if Rj[1,0] > -0.2: des_tau_R += np.array([[0.0],[-0.05]])
                if Lj[1,0] > -0.2: des_tau_L += np.array([[0.0],[-0.05]])
                if Rj[1,0] < -1.8: des_tau_R += np.array([[0.0],[0.05]])
                if Lj[1,0] < -1.8: des_tau_L += np.array([[0.0],[0.05]])
                
                if Rj[0,0] > 0: des_tau_R += np.array([[-0.05],[0.0]])
                if Lj[0,0] < 0: des_tau_L += np.array([[0.05],[0.0]])
                if Rj[0,0] < -0.8: des_tau_R += np.array([[0.05],[0.0]])
                if Lj[0,0] > 0.8: des_tau_L += np.array([[-0.05],[0.0]])
                
                
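                # Desired motor angles (reading of the intent, not verified against the hardware):
                # push the torque request back through the coupling stiffness 2*Ksc, add the
                # current joint configuration mapped through R_j, and scale by the motor ratio Rm.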
                des_mR = (np.transpose(self.R_j_inv)*des_tau_R / (2*self.Ksc) + self.R_j * Rj) / self.Rm 
                des_mL = (np.transpose(self.R_j_inv_L)*des_tau_L / (2*self.Ksc) + self.R_j_L * Lj) / self.Rm
                
                act_Rj1.write(np.min([np.max([des_mR[0,0] * 180.0 / np.pi * 0.117258, -10.0]),10.0]))
                act_Rj2.write(np.min([np.max([des_mR[1,0] * 180.0 / np.pi * 0.117541, -10.0]),10.0]))
                act_Lj1.write(np.min([np.max([des_mL[0,0] * 180.0 / np.pi * 0.117729, -10.0]),10.0]))
                act_Lj2.write(np.min([np.max([des_mL[1,0] * 180.0 / np.pi * 0.117679, -10.0]),10.0]))
                
                time.sleep(0.004)
                
                Re1 = enc_Rj1.read()
                Re2 = enc_Rj2.read()
                Le1 = enc_Lj1.read()
                Le2 = enc_Lj2.read()
                f_sensor = 5.1203 * sen_f.read() - 5.2506
                e_sensor = (((sen_e.read()) - (f_sensor / 100.0 * 0.15)) -2.9440)/0.0148
                
                Prev_Rj = Rj
                Prev_Lj = Lj
                
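                # Read the encoders (offset-compensated, degrees -> radians) and run the
                # two-link forward kinematics for each arm to refresh the task-space state.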
                Rj = self.R_j_inv * self.R_e * np.array([[Re1-self.offset[0]],[-Re2+self.offset[1]]]) * np.pi/180.0
                Lj = self.R_j_inv_L * self.R_e * np.array([[Le1-self.offset[2]],[Le2-self.offset[3]]]) * np.pi/180.0
                
                xR = self.L1 * np.cos(Rj[0,0] + np.pi/2.0) + self.L2 * np.cos(Rj[0,0]-Rj[1,0] + np.pi/2.0)
                yR = self.L1 * np.sin(Rj[0,0] + np.pi/2.0) + self.L2 * np.sin(Rj[0,0]-Rj[1,0] + np.pi/2.0)
                xL = self.L1 * np.cos(Lj[0,0] + np.pi/2.0) + self.L2 * np.cos(Lj[0,0]+Lj[1,0] + np.pi/2.0)
                yL = self.L1 * np.sin(Lj[0,0] + np.pi/2.0) + self.L2 * np.sin(Lj[0,0]+Lj[1,0] + np.pi/2.0)
                
                P_R = np.array([xR, yR])
                P_L = np.array([xL, yL])
                Prel_R = self.Pc_R - P_R
                Prel_L = self.Pc_L - P_L
                l_R = np.sqrt(Prel_R[0]*Prel_R[0] + Prel_R[1]*Prel_R[1])
                l_L = np.sqrt(Prel_L[0]*Prel_L[0] + Prel_L[1]*Prel_L[1])
                p_R = np.array([[l_R],[np.arctan2(-Prel_R[1],-Prel_R[0])]])
                p_L = np.array([[l_L],[np.arctan2(Prel_L[1],Prel_L[0])]])
                
                observation = np.array([[p_R[0,0] * 10 - 1.0, p_L[0,0] * 10 - 1.0, p_R[1,0], p_L[1,0], 
                                              ((e_sensor -2.9440)/0.0148* np.pi / 180.0 - p_R[1,0]) , ((e_sensor -2.9440)/0.0148* np.pi / 180.0 - p_L[1,0]), 
                                              (self.g[0][0] - (e_sensor -2.9440)/0.0148 * np.pi / 180.0),
                                              des_Fp_R[0,0] * 0.1, des_Fp_L[0,0] * 0.1, 
                                              vel_R[0,0], vel_R[1,0], vel_L[0,0], vel_L[1,0],
                                              self.stiffness, self.stiffness_lim]])
                
                obs_dict_new = dict(observation=observation, 
                                    achieved_goal=np.array([[((e_sensor -2.9440)/0.0148) * np.pi / 180.0, vel_R[0,0], vel_R[1,0], vel_L[0,0], vel_L[1,0], des_Fp_R[0,0] * 0.1, des_Fp_L[0,0] * 0.1]]), 
                                    desired_goal = self.g)
                done = [False] if t < self.T-1 else [True]
                info = [{
                    'is_success': self._is_success(obs_dict_new['achieved_goal'][0], obs_dict_new['desired_goal'][0]),
                }]
                o_new = obs_dict_new['observation']
                ag_new = obs_dict_new['achieved_goal']
                success = np.array([i.get('is_success', 0.0) for i in info])
                success2 = (np.float32(f_sensor < self.object_fragility))
    
                if any(done):
                    # here we assume all environments are done in ~the same number of steps, so we terminate rollouts whenever any of the envs returns done
                    # the trick when using vecenvs is not to add the obs from environments that are "done", because those are already observations
                    # after a reset
                    break
    
                for i, info_dict in enumerate(info):
                    for idx, key in enumerate(self.info_keys):
                        # print(info_values[idx][t, i])
                        # print(info[i][key])
                        # print(info_values)
                        # print(info)
                        info_values[idx][t, i] = info[i][key]
    
                # if np.isnan(o_new).any():
                #     self.logger.warn('NaN caught during rollout generation. Trying again...')
                #     self.reset_all_rollouts()
                #     return self.generate_rollouts()
    
                dones.append(done)
                obs.append(o.copy())
                achieved_goals.append(ag.copy())
                successes.append(success.copy())
                successes2.append(success2.copy())
                # print(o.copy())
                # print(o_new)
                
                # successes3.append(success3.copy())
                acts.append(u.copy())
                goals.append(self.g.copy())
                o[...] = o_new
                ag[...] = ag_new
    #        print("--------------------New Rollout--------------------")
            obs.append(o.copy())
            achieved_goals.append(ag.copy())
    
            episode = dict(o=obs,
                           u=acts,
                           g=goals,
                           ag=achieved_goals)
            for key, value in zip(self.info_keys, info_values):
                episode['info_{}'.format(key)] = value
    
            # stats
            successful = np.array(successes)[-1, :]
            successful2 = np.array(successes2)
            assert successful.shape == (self.rollout_batch_size,)
            success_rate = np.mean(successful)
            success_rate2 = np.mean(successful2.mean(axis=0))
            success_rate3 = np.mean(successful2.min(axis=0) * successful)
            self.success_history.append(success_rate)
            self.success_history2.append(success_rate2)
            self.success_history3.append(success_rate3)
            if self.compute_Q:
                self.Q_history.append(np.mean(Qs))
                self.F_history.append(np.mean(Fs))
                self.K_history.append(np.mean(Ks))
            self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
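All of these examples hand their episode to `convert_episode_to_batch_major` before returning it. A minimal sketch of what that helper is expected to do, assuming the `baselines.her.util` implementation: swap the time and rollout axes so each (T, batch, dim) array becomes (batch, T, dim).

    import numpy as np

    def convert_episode_to_batch_major_sketch(episode):
        # episode maps keys like 'o', 'u', 'g', 'ag' to time-major lists/arrays
        episode_batch = {}
        for key, value in episode.items():
            val = np.array(value).copy()
            episode_batch[key] = val.swapaxes(0, 1)  # (T, batch, dim) -> (batch, T, dim)
        return episode_batch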
Example #11
    def generate_rollouts_ker(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        episodes = []
        episodes_batch = []

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        dones = []
        info_values = [np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys]
        Qs = []

        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o, ag, self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            # compute new states and observations, do not return the reward, and get it from her_sampler.py
            obs_dict_new, _, done, info = self.venv.step(u)
            o_new = obs_dict_new['observation']
            ag_new = obs_dict_new['achieved_goal']
            success = np.array([i.get('is_success', 0.0) for i in info])

            # no need
            if any(done):
                # here we assume all environments are done in ~the same number of steps, so we terminate rollouts whenever any of the envs returns done
                # the trick when using vecenvs is not to add the obs from environments that are "done", because those are already observations
                # after a reset
                break
            # no need
            for i, info_dict in enumerate(info):
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[i][key]
            # no need
            if np.isnan(o_new).any():
                self.logger.warn('NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            dones.append(done)
            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())

            o[...] = o_new
            ag[...] = ag_new

        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        # # ----------------Kaleidoscope ER---------------------------
        # original_ka_episodes = self.ker.ker_process(obs,acts,goals,achieved_goals)
        # # ----------------end---------------------------
        # # ----------------pack up as transition---------------------------
        # for (obs,acts,goals,achieved_goals) in original_ka_episodes:
        #     episode = dict(o=obs,
        #                 u=acts,
        #                 g=goals,
        #                 ag=achieved_goals)
        #     for key, value in zip(self.info_keys, info_values):
        #         episode['info_{}'.format(key)] = value
        #     episodes.append(episode)
        # # ----------------end---------------------------


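        # dynamic_KER packs two symmetry counts into one integer: the ones digit is the
        # default number of KER reflections, and the higher digits are used once the
        # object has actually been moved (see the delta_movement check below).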
        n_KER = None
        if self.dynamic_KER:
            # set_trace()
            assert 10 < self.dynamic_KER < 10000
            n_KER_1 = self.dynamic_KER % 10
            n_KER_2 = self.dynamic_KER // 10
            assert n_KER_1 != 0 and n_KER_2 != 0

            n_KER = n_KER_1
            ag = np.array(achieved_goals)
            delta_movement = np.linalg.norm(ag[1:] - ag[0], axis=2) # compare with the object starting pos
            if (delta_movement > 0.05).any():  # if the object is moved
                # set_trace()
                self.count_ray +=1
                # print('move the ag')
                print('move the ag:', self.count_ray)
                n_KER = n_KER_2
                # print('xag:',x)
                # print('yag:',y)
                # print('g:',self.g)
                # print('successes:',successes)
        else:
            # ******************print
            # set_trace()
            ag = np.array(achieved_goals)
            delta_movement = np.linalg.norm(ag[1:] - ag[0], axis=2)  # compare with the object starting pos
            if (delta_movement > 0.05).any():  # if the object is moved
                self.count_ray += 1
                print('move the ag:', self.count_ray)
            # ******************print


        original_ka_episodes = self.ker.ker_process(obs,acts,goals,achieved_goals,n_KER)
        # ----------------end---------------------------
        # ----------------pack up as transition---------------------------
        for (obs,acts,goals,achieved_goals) in original_ka_episodes:
            episode = dict(o=obs,
                        u=acts,
                        g=goals,
                        ag=achieved_goals)
            for key, value in zip(self.info_keys, info_values):
                episode['info_{}'.format(key)] = value
            episodes.append(episode)

        # if self.dynamic_mirror_origin == 'True':    # untested
        #     temp_trajs = []
        #     step = 5
        #     for i in range(step):
        #         # ----------------Kaleidoscope ER---------------------------
        #         original_ka_episodes = self.ker.ker_process(obs, acts, goals, achieved_goals, n_KER, step=i)
        #         temp_trajs.append(original_ka_episodes)
        #         # ----------------end---------------------------
        #
        #     for temp_traj in temp_trajs:
        #         # ----------------pack up as transition---------------------------
        #         for (obs, acts, goals, achieved_goals) in temp_traj:
        #             episode = dict(o=obs,
        #                            u=acts,
        #                            g=goals,
        #                            ag=achieved_goals)
        #             for key, value in zip(self.info_keys, info_values):
        #                 episode['info_{}'.format(key)] = value
        #             episodes.append(episode)
        #         # ----------------end---------------------------
        #
        # else:
        #     # ----------------Kaleidoscope ER---------------------------
        #     original_ka_episodes = self.ker.ker_process(obs,acts,goals,achieved_goals,n_KER)
        #     # ----------------end---------------------------
        #     # ----------------pack up as transition---------------------------
        #     for (obs,acts,goals,achieved_goals) in original_ka_episodes:
        #         episode = dict(o=obs,
        #                     u=acts,
        #                     g=goals,
        #                     ag=achieved_goals)
        #         for key, value in zip(self.info_keys, info_values):
        #             episode['info_{}'.format(key)] = value
        #         episodes.append(episode)
        #     # ----------------end---------------------------


        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size,)
        success_rate = np.mean(successful)
        self.success_history.append(success_rate)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))

        # if success_rate != 0:
        #     set_trace()

        mul_factor = 1
        self.n_episodes += (mul_factor* self.rollout_batch_size)

        # ----------------format processing---------------------------
        # return dict: ['o', 'u', 'g', 'ag', 'info_is_success']
        for episode in episodes:
            episode_batch = convert_episode_to_batch_major(episode)
            episodes_batch.append(episode_batch)
        # ----------------end---------------------------

        return episodes_batch
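The KER worker returns a list of batch-major episodes (the original rollout plus its reflections). A sketch of how a caller might consume it, assuming the usual HER-style `store_episode` interface on the policy; `rollout_worker` and `policy` are illustrative names.

    episodes_batch = rollout_worker.generate_rollouts_ker()
    for episode_batch in episodes_batch:
        policy.store_episode(episode_batch)  # each mirrored copy is stored like a real episode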
Example #12
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        # addition for multi-tasks structures
        # decides whether the next runs are made to compute progress (exploit = True, means no noise on actions).
        if (self.structure == 'curious' or self.structure == 'task_experts') and not self.eval:
            self.exploit = True if np.random.random() < 0.1 else False
            if self.exploit and self.structure == 'curious':
                    self.p = 1 / self.nb_tasks * np.ones([self.nb_tasks])
        elif self.eval:
            self.exploit = True
            self.p = 1 / self.nb_tasks * np.ones([self.nb_tasks])

        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['ag']), np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys]
        Qs = []

        # addition for multi-tasks structures
        if self.structure == 'curious' or self.structure == 'task_experts':
            task_descrs = []
            changes = []  # True when the achieved goal (outcome) has changed compared to the initial achieved goal

        for t in range(self.T):

            # when evaluating task_experts, the policy corresponding to the demanded task must be selected
            if self.structure=='task_experts' and self.eval:
                act_output = np.zeros([self.rollout_batch_size, self.dims['u']])
                q_output = np.zeros([self.rollout_batch_size, 1])
                for i in range(self.rollout_batch_size):
                    tsk = np.argwhere(self.task_descr[i] == 1).squeeze()
                    act_output[i, :], q_output[i, 0] = self.policy[tsk].get_actions(
                        o[i].reshape([1, o[i].size]), ag[i].reshape([1, ag[i].size]), self.g[i].reshape([1, self.g[i].size]),
                        task_descr=self.task_descr[i].reshape([1, self.task_descr[i].size]),
                        compute_Q=self.compute_Q,
                        noise_eps=self.noise_eps if not self.exploit else 0.,
                        random_eps=self.random_eps if not self.exploit else 0.,
                        use_target_net=self.use_target_net)
                policy_output = [act_output, q_output]
            else:
                policy_output = self.policy.get_actions(
                    o, ag, self.g,
                    task_descr = self.task_descr if self.structure == 'curious' else None,
                    compute_Q=self.compute_Q,
                    noise_eps=self.noise_eps if not self.exploit else 0.,
                    random_eps=self.random_eps if not self.exploit else 0.,
                    use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['ag']))
            success = np.zeros(self.rollout_batch_size)
            r_competence = np.zeros(self.rollout_batch_size)

            # compute new states and observations
            for i in range(self.rollout_batch_size):
                try:
                    # We fully ignore the reward here because it will have to be re-computed
                    # for HER.
                    if self.render:
                        self.envs[i].render()
                    curr_o_new, r_competence[i], _, info = self.envs[i].step(u[i])
                    if 'is_success' in info:
                        success[i] = info['is_success']
                    o_new[i] = curr_o_new['observation']
                    ag_new[i] = curr_o_new['achieved_goal']
                    self.g[i] = curr_o_new['desired_goal'] # in case desired goal changes depending on observation
                    for idx, key in enumerate(self.info_keys):
                        info_values[idx][t, i] = info[key]

                except MujocoException as e:
                    return self.generate_rollouts()

            if np.isnan(o_new).any():
                self.logger.warning('NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new

            # addition for goal task selection
            if self.structure == 'curious' or self.structure == 'task_experts':
                task_descrs.append(self.task_descr.copy())
                changes.append(np.abs(achieved_goals[0] - ag) > 1e-3)


        obs.append(o.copy())
        achieved_goals.append(ag.copy())

        episode = dict(o=obs,
                       u=acts,
                       g=goals,
                       ag=achieved_goals)

        # addition for multi-tasks structures
        if self.structure == 'curious' or self.structure == 'task_experts':
            episode['task_descr'] = task_descrs
            episode['change'] = changes


        self.initial_o[:] = o
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size,)
        success_rate = np.mean(successful)
        self.success_history.append(success_rate)
        self.reward_history.append(r_competence)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size * self.nb_cpu

        # addition for multi-tasks structures
        if self.structure == 'curious' or self.structure == 'task_experts':
            # only update competence if no noise has been used
            if self.exploit:
                tasks_for_competence = [self.envs[i].unwrapped.task for i in range(self.rollout_batch_size)]
                goals_for_competence = [self.envs[i].unwrapped.goal[self.tasks_g_id[tasks_for_competence[i]]] for i in range(self.rollout_batch_size)]
                full_goals_for_competence = [self.envs[i].unwrapped.goal for i in range(self.rollout_batch_size)]
                ag_for_competence = [achieved_goals[-1][i] for i in range(self.rollout_batch_size)]

                succ_list = successful.tolist()
            else:
                tasks_for_competence = []
                goals_for_competence = []
                full_goals_for_competence = []
                ag_for_competence = []
                succ_list = []

            succ_list = MPI.COMM_WORLD.gather(succ_list, root=0)
            tasks_for_competence = MPI.COMM_WORLD.gather(tasks_for_competence, root=0)
            goals_for_competence = MPI.COMM_WORLD.gather(goals_for_competence, root=0)
            full_goals_for_competence = MPI.COMM_WORLD.gather(full_goals_for_competence, root=0)
            ag_for_competence = MPI.COMM_WORLD.gather(ag_for_competence, root=0)

            # update competence queues for each task in cpu rank 0
            # compute next selection probabilities
            if self.rank == 0:
                tasks_for_competence = sum(tasks_for_competence, [])
                goals_for_competence = sum(goals_for_competence, [])
                succ_list = sum(succ_list, [])

                task_succ_list = [[] for _ in range(self.nb_tasks)]
                task_cp_list = [[] for _ in range(self.nb_tasks)]
                task_goal_list = [[] for _ in range(self.nb_tasks)]
                # update competence queues
                for succ, task in zip(succ_list, tasks_for_competence):
                    task_succ_list[task].append(succ)

                for goal, task in zip(goals_for_competence, tasks_for_competence):
                    task_goal_list[task].append(goal)

                for task in range(self.nb_tasks):
                    self.competence_computers[task].update(task_succ_list[task]) # update competence and competence progress (learning progress)
                    if self.goal_selection == 'active' and not self.eval:

                        new_split, _ = self.goal_selectors[task].update(task_goal_list[task], task_succ_list[task])
                        if new_split:
                            regions = self.goal_selectors[task].get_regions
                            probas = self.goal_selectors[task].probas
                            self.split_histories[task].append([regions, probas])
                        else:
                            self.split_histories[task].append(None)

                self.C = np.array([self.get_C()]).squeeze() # get new updated competence measures

                # record all tasks
                self.task_history.extend(self.tasks.copy())
                self.goal_history.extend(self.goals.copy())

                # update task selection probabilities if active task selection
                if not self.eval:
                    if self.task_selection == 'active_competence_progress' and self.structure != 'task_experts':
                        # compute competence progress for each task
                        self.CP = np.array([self.get_CP()]).squeeze()
                        # softmax
                        # exp_cp = np.exp(self.temperature*self.CP)
                        # self.p = exp_cp / exp_cp.sum()

                        # epsilon proportional
                        epsilon = 0.4
                        if self.CP.sum() == 0:
                            self.p = (1 / self.nb_tasks) * np.ones([self.nb_tasks])
                        else:
                            self.p = epsilon * (1 / self.nb_tasks) * np.ones([self.nb_tasks]) + \
                                     (1 - epsilon) * self.CP / self.CP.sum()

                        if self.p.sum() > 1:
                            self.p[np.argmax(self.p)] -= self.p.sum() - 1
                        elif self.p.sum() < 1:
                            self.p[-1] = 1 - self.p[:-1].sum()


                    elif self.structure == 'task_experts':
                        self.p = np.zeros([self.nb_tasks])
                        self.p[self.unique_task] = 1


            # broadcast the selection probability to all cpus and the competence
            if not self.eval:
                self.p = MPI.COMM_WORLD.bcast(self.p, root=0)
                self.CP = MPI.COMM_WORLD.bcast(self.CP, root=0)

        return convert_episode_to_batch_major(episode), self.CP, self.n_episodes
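The task-selection update above mixes a uniform distribution with probabilities proportional to competence progress (CP). A minimal standalone sketch of that epsilon-proportional rule, renormalizing at the end instead of the in-place correction used above:

    import numpy as np

    def task_probabilities(CP, epsilon=0.4):
        nb_tasks = len(CP)
        uniform = np.ones(nb_tasks) / nb_tasks
        if CP.sum() == 0:
            return uniform
        p = epsilon * uniform + (1 - epsilon) * CP / CP.sum()
        return p / p.sum()

    # e.g. task_probabilities(np.array([0.0, 0.3, 0.1])) puts most mass on the second task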
Example #13
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']),
                     np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']),
                      np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        info_values = [
            np.empty(
                (self.T, self.rollout_batch_size, self.dims['info_' + key]),
                np.float32) for key in self.info_keys
        ]
        Qs = []

        ####################### hrl #############################

        Rt_high_sum = np.zeros((self.rollout_batch_size, 1), np.float32)
        total_timestep = 1
        high_goal_gt = np.empty((self.rollout_batch_size, self.dims['o']),
                                np.float32)
        #high_goal_gt_tilda = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)
        high_old_obj_st = np.empty((self.rollout_batch_size, self.dims['o']),
                                   np.float32)

        u_temp = np.empty((self.rollout_batch_size, self.dims['u']),
                          np.float32)

        low_nn_at = np.zeros(
            (self.high_level_train_step * self.rollout_batch_size,
             self.dims['u']), np.float32).reshape(self.rollout_batch_size,
                                                  self.high_level_train_step,
                                                  self.dims['u'])
        low_nn_st = np.zeros(
            (self.high_level_train_step * self.rollout_batch_size,
             self.dims['o']), np.float32).reshape(self.rollout_batch_size,
                                                  self.high_level_train_step,
                                                  self.dims['o'])
        intrinsic_reward = np.zeros((self.rollout_batch_size, 1), np.float32)

        high_goal_gt[:] = self.initial_high_goal_gt
        #high_goal_gt_tilda[:] = self.initial_high_goal_gt_tilda

        ##########################################################

        for t in range(self.T):
            #print_point
            #print("cont t : ", t)
            #print("cont total_timestep : ", total_timestep)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            reward_new = np.zeros(self.rollout_batch_size)
            done_new = np.zeros(self.rollout_batch_size)

            # compute new states and observations
            for i in range(self.rollout_batch_size):
                #print_point
                #print(" i : ", i)

                policy_output = self.policy.get_low_actions(
                    # o, ag, self.g,
                    o[i],
                    ag[i],
                    high_goal_gt[i],
                    compute_Q=self.compute_Q,
                    noise_eps=self.noise_eps if not self.exploit else 0.,
                    random_eps=self.random_eps if not self.exploit else 0.,
                    use_target_net=self.use_target_net)
                if self.compute_Q:
                    # u, Q = policy_output
                    u = policy_output
                    ## print_point
                    #print(" self.compute_Q u : ", u)
                    Q = self.policy.Get_Q_value(o[i], high_goal_gt[i], u)
                    Qs.append(Q)
                else:
                    u = policy_output
                    ## print_point
                    #print(" self.compute_Q else u : ", u)

                if u.ndim == 1:
                    # The non-batched case should still have a reasonable shape.
                    u = u.reshape(1, -1)

                try:
                    # We fully ignore the reward here because it will have to be re-computed
                    # for HER.
                    # curr_o_new, _, _, info = self.envs[i].step(u[i])
                    ##################################### hrl ###############################
                    #curr_o_new, reward, done, info = self.envs[i].step(u[i])  # jangikim
                    #print("u.reshape(4,)", u.reshape(4,))
                    curr_o_new, reward, done, info = self.envs[i].step(
                        u.reshape(4, ))  # jangikim
                    #########################################################################
                    if 'is_success' in info:
                        success[i] = info['is_success']
                    o_new[i] = curr_o_new['observation']
                    ag_new[i] = curr_o_new['achieved_goal']
                    #jangikim
                    reward_new[i] = reward

                    ## print_point
                    #print(" curr_o_new [0] : ".format(i), curr_o_new)

                    #done_new[i] = done
                    #if success[i] == 1 or done==1:

                    if success[i] == 1:
                        #    done_new[i] = 1
                        print("done_new[{0}] : ".format(i), 1)
                    #else:
                    #    done_new[i] = 0

                    #done_new[i] = 0 if t + 1 == self.T else float(done)
                    done_new[i] = 0 if total_timestep == self.T else float(
                        done)

                    for idx, key in enumerate(self.info_keys):
                        info_values[idx][t, i] = info[key]
                    if self.render:
                        self.envs[i].render()

                except MujocoException as e:
                    return self.generate_rollouts()

                low_nn_at[i][t % self.high_level_train_step] = u
                low_nn_st[i][t % self.high_level_train_step] = o_new[i]
                Rt_high_sum[i] += reward_new[i]

                if total_timestep % self.high_level_train_step == 0:

                    high_goal_gt[i] = self.policy.get_high_goal_gt(
                        o[i],
                        ag[i],
                        self.g[i],
                        compute_Q=self.compute_Q,
                        noise_eps=self.noise_eps if not self.exploit else 0.,
                        random_eps=self.random_eps if not self.exploit else 0.,
                        use_target_net=self.use_target_net)
                    '''
                    high_goal_gt_tilda[i] = self.policy.get_high_goal_gt_tilda(high_old_obj_st[i], ag[i], self.g[i],
                                                                           o_new[i],
                                                                           low_nn_st[i],
                                                                           low_nn_at[i])
                    '''
                    self.policy.update_meta_controller(
                        self.g[i], Rt_high_sum[i] * 0.1, done_new[i],
                        low_nn_st[i], low_nn_at[i],
                        int((self.total_timestep + 1) /
                            self.high_level_train_step), ag[i])

                    high_old_obj_st[i] = o_new[i]
                    low_nn_at[i] = np.zeros(
                        (self.high_level_train_step, self.dims['u']),
                        np.float32)
                    low_nn_st[i] = np.zeros(
                        (self.high_level_train_step, self.dims['o']),
                        np.float32)
                    Rt_high_sum[i] = 0
                else:
                    high_goal_gt[i] = o[i] + high_goal_gt[i] - o_new[i]

                u_temp[i] = u
                #temp_test = (t % self.high_level_train_step)
                intrinsic_reward[i] = -LA.norm(o[i] + high_goal_gt[i] -
                                               o_new[i])

                self.policy.update_controller(o[i], o_new[i], high_goal_gt[i],
                                              u, intrinsic_reward[i],
                                              done_new[i], total_timestep)

            total_timestep += 1
            self.total_timestep += 1
            if np.isnan(o_new).any():
                self.logger.warn(
                    'NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            #acts.append(u.copy())
            acts.append(u_temp.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new

        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        self.initial_o[:] = o
        ########################## hrl #########################
        self.initial_high_goal_gt[:] = high_goal_gt
        #self.initial_high_goal_gt_tilda[:] = high_goal_gt_tilda
        ########################################################
        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size, )
        success_rate = np.mean(successful)
        self.success_history.append(success_rate)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
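Two relations carry the hierarchical example above: between meta-controller decisions the subgoal is transported with the state (`o + g - o_new`), and the low-level controller is rewarded with the negative distance to that transported subgoal. A small sketch of both, with illustrative names:

    import numpy as np

    def transition_subgoal(o, g, o_new):
        # keep the absolute target fixed while the state moves
        return o + g - o_new

    def intrinsic_reward(o, g, o_new):
        # distance the low level still has to cover towards the subgoal
        return -np.linalg.norm(o + g - o_new)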
Example #14
    def generate_rollouts(self, render=False, test=False, exploit=False):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts(test)

        # Annealing
        if self.expert is not None:
            beta = self.beta()
        else:
            beta = 0

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes, returns, sigmas = [], [], [], [], [], [], []
        info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys]
        for t in range(self.T):
            if np.random.rand() < beta:
                # The expert is in charge
                o_, g_, ag_ = self.trim(o, self.g, ag, self.expert.dimo, self.expert.dimg)
                policy_output = self.expert.get_actions(o_, ag_, g_, compute_raw=True)
                u, raw = policy_output
            else:
                policy_output = self.policy.get_actions(
                    o, ag, self.g, exploit=exploit)
                u, raw, sigma = policy_output
            # We can't report sigma accurately when we are using the expert
            if self.expert is not None:
                sigma = np.zeros((self.rollout_batch_size, self.dims['u']))

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)
                raw = raw.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            # --------------
            r_new = np.zeros(self.rollout_batch_size)
            # --------------
            # compute new states and observations
            for i in range(self.rollout_batch_size):
                # print(u[i])
                try:
                    # We don't ignore reward here 
                    # because we need to compute the return
                    curr_o_new, r, _, info = self.envs[i].step(u[i])
                    if 'is_success' in info:
                        success[i] = info['is_success']
                    o_new[i] = curr_o_new['observation']
                    ag_new[i] = curr_o_new['achieved_goal']
                    # --------------
                    r_new[i] = r
                    # --------------
                    for idx, key in enumerate(self.info_keys):
                        info_values[idx][t, i] = info[key]
                    if render:
                        self.envs[i].render()
                except MujocoException as e:
                    self.logger.info(str(e))
                    self.logger.info('Exception thrown by Mujoco. Giving up on life...')
                    assert False
                    return self.generate_rollouts(render, test)

            if np.isnan(o_new).any():
                self.logger.info('NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts(test)
                return self.generate_rollouts(render, test)

            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(raw.copy())
            goals.append(self.g.copy())
            sigmas.append(sigma.copy())
            # ---------
            returns.append(r_new.copy())
            for t_ in range(t):
                r_new = r_new.copy()
                returns[t_] += self.gamma ** (t - t_) * r_new
            # ---------
            o[...] = o_new
            ag[...] = ag_new
        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        self.initial_o[:] = o

        episode = dict(o=obs,
                       u=acts,
                       g=goals,
                       ag=achieved_goals,
                       # --------
                       G=returns,
                       sigma=sigmas)
                       # --------
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size,)
        success_rate = np.mean(successful)

        self.success_history.append(success_rate)
        self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
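The return bookkeeping above is an incremental Monte-Carlo computation: when reward r_t arrives, every earlier index t' receives gamma^(t - t') * r_t, so `returns[t']` ends up as the discounted return from step t' onward. A toy sketch of the same accumulation:

    def discounted_returns(rewards, gamma):
        returns = [0.0] * len(rewards)
        for t, r in enumerate(rewards):
            for t_prev in range(t + 1):
                returns[t_prev] += gamma ** (t - t_prev) * r
        return returns

    # discounted_returns([1.0, 0.0, 1.0], gamma=0.5) -> [1.25, 0.5, 1.0]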
Example #15
    def generate_rollouts(self,
                          ex_init=None,
                          record=False,
                          random=False,
                          log_hit_time=False):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        if not self.active:
            return
        self.reset_all_rollouts(ex_init, record=record)

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']),
                     np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']),
                      np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        qpos = np.empty((self.rollout_batch_size, self.dims['qpos']),
                        np.float32)
        qvel = np.empty((self.rollout_batch_size, self.dims['qvel']),
                        np.float32)

        qpos[:] = self.initial_qpos
        qvel[:] = self.initial_qvel

        num_envs = self.venv.num_envs

        random_action = self.policy._random_action(num_envs)

        reached_goal = [False] * num_envs
        hit_time = [None] * num_envs

        if random:
            self.exploration = 'random'
        else:
            self.exploration = 'eps_greedy'  # 'go'

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        dones = []
        info_values = [
            np.empty((self.T - 1, self.rollout_batch_size,
                      self.dims['info_' + key]), np.float32)
            for key in self.info_keys
        ]
        Qs, qposes, qvels, hit_times = [], [], [], []

        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o,
                ag,
                self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net,
                exploration=self.exploration,
                go=np.logical_not(reached_goal),
                random_action=random_action,
            )

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            # compute new states and observations
            obs_dict_new, _, done, info = self.venv.step(u)
            o_new = obs_dict_new['observation']
            ag_new = obs_dict_new['achieved_goal']

            qpos_new = obs_dict_new['qpos']
            qvel_new = obs_dict_new['qvel']

            success = np.array([i.get('is_success', 0.0) for i in info])

            for e_idx, (suc, ht) in enumerate(zip(success, hit_time)):
                if suc and hit_time[e_idx] is None:
                    hit_time[e_idx] = t

            reached_goal = [hit is not None for hit in hit_time]

            if any(done):
                # here we assume all environments are done in ~the same number of steps, so we terminate rollouts whenever any of the envs returns done
                # the trick when using vecenvs is not to add the obs from environments that are "done", because those are already observations
                # after a reset
                break

            for i, info_dict in enumerate(info):
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[i][key]

            if np.isnan(o_new).any():
                self.logger.warn(
                    'NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            dones.append(done)
            obs.append(o.copy())
            qposes.append(qpos.copy())
            qvels.append(qvel.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new
            qpos[...] = qpos_new
            qvel[...] = qvel_new
        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        qposes.append(qpos.copy())
        qvels.append(qvel.copy())

        episode = dict(
            o=obs,
            u=acts,
            g=goals,
            ag=achieved_goals,
            qpos=qposes,
            qvel=qvels,
            # t=Ts
        )

        if self.compute_Q:
            episode["Qs"] = Qs

        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        if self.exploration != 'random':
            if self.exploration in ['go_explore', 'go']:
                successful = np.asarray(
                    [1 if hit is not None else 0 for hit in hit_time])
            elif self.exploration in ['eps_greedy']:
                successful = np.array(successes)[-1, :]
            assert successful.shape == (self.rollout_batch_size, )
            success_rate = np.mean(successful)
            self.success_history.append(success_rate)

            hit_times = np.asarray(
                [hit if hit is not None else 0 for hit in hit_time])
            if log_hit_time:
                hit_time_mean = np.mean(hit_times)
                hit_time_std = np.std(hit_times)
                self.hit_time_mean_history.append(hit_time_mean)
                self.hit_time_std_history.append(hit_time_std)

        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
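Under the 'go'-style exploration modes, success is defined by whether a hit time was ever recorded during the episode rather than by the final-step `is_success` flag. A small sketch of that bookkeeping with illustrative names:

    import numpy as np

    def record_hits(success_per_step):
        # success_per_step: list over time steps of per-env success flags
        hit_time = [None] * len(success_per_step[0])
        for t, success in enumerate(success_per_step):
            for i, suc in enumerate(success):
                if suc and hit_time[i] is None:
                    hit_time[i] = t
        successful = np.asarray([1 if hit is not None else 0 for hit in hit_time])
        return hit_time, successful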
Example #16
    def generate_rollouts(self, FLAGS):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        print("마침내 generate_rollout!")

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']),
                     np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']),
                      np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        dones = []
        info_values = [
            np.empty((self.T - 1, self.rollout_batch_size,
                      self.dims['info_' + key]), np.float32)
            for key in self.info_keys
        ]
        Qs = []

        ####################### hrl #############################

        # Rt_high_sum = np.zeros((self.rollout_batch_size, 1), np.float32)
        # total_timestep = 1
        # high_goal_gt = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)
        # #high_goal_gt_tilda = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)
        # high_old_obj_st = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)

        # u_temp = np.empty((self.rollout_batch_size, self.dims['u']), np.float32)

        # low_nn_at = np.zeros((self.high_level_train_step*self.rollout_batch_size, self.dims['u']),
        #                           np.float32).reshape(self.rollout_batch_size, self.high_level_train_step, self.dims['u'])
        # low_nn_st = np.zeros((self.high_level_train_step*self.rollout_batch_size, self.dims['o']),
        #                           np.float32).reshape(self.rollout_batch_size, self.high_level_train_step, self.dims['o'])
        # intrinsic_reward = np.zeros((self.rollout_batch_size, 1), np.float32)

        # high_goal_gt[:] = self.initial_high_goal_gt
        # #high_goal_gt_tilda[:] = self.initial_high_goal_gt_tilda

        ##########################################################

        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o,
                ag,
                self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)
            # FLAGS=FLAGS)

            # policy_output = self.policy.get_actions(
            #     o, ag, self.g,
            #     compute_Q=self.compute_Q,
            #     noise_eps=self.noise_eps if not self.exploit else 0.,
            #     random_eps=self.random_eps if not self.exploit else 0.,
            #     use_target_net=self.use_target_net)

            ## from run_HAC.py
            # Determine training mode.  If not testing and not solely training, interleave training and testing to track progress
            # mix_train_test = False
            # if not FLAGS.test and not FLAGS.train_only:
            #     mix_train_test = True

            ## from run_HAC.py; everything after this point was indented
            # Evaluate policy every TEST_FREQ batches if interleaving training and testing
            # if mix_train_test and t % TEST_FREQ == 0:
            #     print("\n--- HAC TESTING ---")
            #     # agent.FLAGS.test = True ## should the agent be passed in as an instance?
            #     num_episodes = num_test_episodes

            #     # Reset successful episode counter
            #     successful_episodes = 0

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            # print("Rollout. o_new={}, ag_new={},success={}".format(o_new,ag_new,success))
            # compute new states and observations
            obs_dict_new, _, done, info = self.venv.step(u)
            # print("HERE")
            # print("#########Debug##########")
            o_new = obs_dict_new['observation']
            # print("observation high : {}".format(o_new))
            ag_new = obs_dict_new['achieved_goal']
            success = np.array([i.get('is_success', 0.0) for i in info])

            if any(done):
                # here we assume all environments are done in ~the same number of steps, so we terminate rollouts whenever any of the envs returns done
                # the trick when using vecenvs is not to add the obs from environments that are "done", because those are already observations
                # after a reset

                break

            for i, info_dict in enumerate(info):
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[i][key]

            if np.isnan(o_new).any():
                # self.logger.warn('NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            dones.append(done)
            obs.append(o.copy())
            # print("############## obs = {}".format(obs))
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new
        obs.append(o.copy())
        achieved_goals.append(ag.copy())

        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size, )
        success_rate = np.mean(successful)
        self.success_history.append(success_rate)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        # compute observations. Allocate observation arrays (filled below from the initial state)
        observations = np.empty((self.rollout_batch_size, self.dims['o']),
                                np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']),
                      np.float32)  # achieved goals
        # Whole array assigned
        observations[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes, ep_reward_list = [], [], [], [], [], []

        ep_reward = 0
        env_step_counter = 0
        dones = []

        # print(self.info_keys)
        info_values = [
            np.empty((self.T - 1, self.rollout_batch_size,
                      self.dims['info_' + key]), np.float32)
            for key in self.info_keys
        ]
        Qs = []

        # Loop over the rollout time horizon (T = 50 here, because the episode length is 50).
        # A longer trajectory is not really useful; if anything, shorten it and test the results.
        # TODO: Shorten trajectory and check results
        for t in range(self.T):
            policy_output = self.policy.get_actions(
                observations,
                ag,
                self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:  # Evaluator only
                action, Q = policy_output
                Qs.append(Q)
            else:
                action = policy_output

            if action.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                action = action.reshape(1, -1)

            new_observation = np.empty(
                (self.rollout_batch_size, self.dims['o']))
            new_achieved_goal = np.empty(
                (self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)

            # compute new states and observations
            obs_dict_new, reward, done, info = self.venv.step(action)

            # obs_dict_new {'achieved_goal': array([[1.3519502 , 0.73200333, 0.5274352 ]], dtype=float32),
            # 'desired_goal': array([[1.2729537 , 0.62809974, 0.51270455]], dtype=float32),
            # 'observation': array([[ 1.3519502e+00,  7.3200333e-01,  5.2743518e-01,  0.0000000e+00,
            # 0.0000000e+00,  1.7498910e-03, -3.6469495e-03, -1.8837147e-03,
            # -5.2045716e-06,  1.0831429e-04]], dtype=float32)}
            # reward [-1.]
            # info [{'is_success': 0.0}]

            # print(reward)
            # ep_reward_list.append(ep_reward)
            ep_reward += reward
            env_step_counter += 1
            # print("env_step_counter, ep_reward ", env_step_counter, ep_reward)

            new_observation = obs_dict_new['observation']
            new_achieved_goal = obs_dict_new['achieved_goal']
            success = np.array([i.get('is_success', 0.0) for i in info])

            if any(done):
                # here we assume all environments finish in ~the same number of steps, so we terminate the rollout
                # whenever any of the envs returns done
                # the trick when using vecenvs is not to add the obs from the environments that are "done", because
                # those are already observations after a reset
                break

            for i, info_dict in enumerate(info):
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[i][key]

            if np.isnan(new_observation).any():
                self.logger.warn(
                    'NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            dones.append(done)
            obs.append(observations.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(action.copy())
            goals.append(self.g.copy())
            observations[...] = new_observation
            ag[...] = new_achieved_goal

        self.episode_counter += 1
        self.episode_reward = ep_reward[-1]  # store the cumulative episode reward of the last env in the batch
        # print("episode_counter, episode_reward", self.episode_counter, self.episode_reward)

        obs.append(observations.copy())
        achieved_goals.append(ag.copy())

        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)

        for key, value in zip(self.info_keys, info_values):
            # print(key, value)
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size, )
        success_rate = np.mean(successful)

        # print("success_rate: ", success_rate)
        self.success_history.append(success_rate)  # Used for tensorboard
        # print(self.success_history)

        self.reward_history.append(self.episode_reward)  # Used for tensorboard
        # print(self.reward_history)

        if self.compute_Q:  # Evaluator only
            self.Q_history.append(np.mean(Qs))

        self.n_episodes += self.rollout_batch_size

        # print("Rollout Done")

        return convert_episode_to_batch_major(episode)
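Every variant above finishes with `convert_episode_to_batch_major(episode)`. For readers unfamiliar with that helper, the following is a minimal sketch of what a baselines-style implementation typically does (it may differ from the exact utility these snippets import): it stacks the per-timestep lists and swaps the time and batch axes so each key becomes an array of shape (rollout_batch_size, T, dim).

import numpy as np

def convert_episode_to_batch_major(episode):
    """Minimal sketch: turn time-major lists (T entries of (batch, dim) arrays)
    into batch-major arrays of shape (batch, T, dim)."""
    episode_batch = {}
    for key, value in episode.items():
        val = np.array(value).copy()              # stacked shape: (T, batch, dim)
        episode_batch[key] = val.swapaxes(0, 1)   # -> (batch, T, dim)
    return episode_batch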
Example #18
0
File: rollout.py  Project: buoyancy99/sap
    def gen_rollouts_render(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag
        grip_poses = []
        local_voxels = []
        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        dones = []
        info_values = [np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in
                       self.info_keys]
        Qs = []
        # let the environment settle by stepping with a zero action for 20 steps
        for i in range(20):
            _, _, _, _ = self.venv.step(np.array([0, 0, 0, 0]))
        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o, ag, self.g,
                compute_Q=self.compute_Q,
                noise_eps=0,
                random_eps=0,
                use_target_net=self.use_target_net)
            # noise_eps / random_eps are forced to 0 here (deterministic rendering rollouts);
            # the originals were self.noise_eps / self.random_eps if not self.exploit else 0.
            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            # compute new states and observations
            # TODO: adding noise to the action?
            sigma = 0.1  # only used by the commented-out Gaussian alternative below
            mu = 0
            disc_actions = np.array([-0.5, 0.5])
            # with probability 0.9, replace the policy action with a uniformly random one
            if np.random.uniform(0, 1) < 0.9:
                u = np.random.uniform(-0.5, 0.5, 4)  # alternative: sigma * np.random.randn(4) + mu
            else:
                u = np.squeeze(u)

            # snap each action dimension to the nearest discrete action, then force dimension 1 to 0.5
            distance = np.abs(u.reshape([-1, 1]) - disc_actions)
            u_idx = np.squeeze(np.argmin(distance, -1))
            u = np.array([disc_actions[u_idx[i]] for i in range(4)])
            u[1] = 0.5
            obs_dict_new, _, done, info = self.venv.step(u)
            grip_pos = self.venv.loc2grid(obs_dict_new['observation'][:3])

            # print(np.sum([local_voxels[x] for x in local_voxels.keys()]))
            # import pdb; pdb.set_trace()
            # self.venv.render()
            o_new = obs_dict_new['observation']
            ag_new = obs_dict_new['achieved_goal']
            # import pdb; pdb.set_trace()
            success = info['is_success']  #np.array([i.get('is_success', 0.0) for i in info])
            # print(f'hahahahah success {success}')
            # print(f'I am done {done}')
            if done or success == 1:
                # here we assume all environments finish in ~the same number of steps, so we terminate the rollout whenever any of the envs returns done
                # the trick when using vecenvs is not to add the obs from the environments that are "done", because those are already observations
                # after a reset
                break

            # for i, info_dict in enumerate(info):
            #     for idx, key in enumerate(self.info_keys):
            #         info_values[idx][t, i] = info[i][key]

            if np.isnan(o_new).any():
                self.logger.warn('NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()
            grip_poses.append(grip_pos)  # harry added it
            local_voxels.append(info['local_voxel'])
            dones.append(done)
            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new
        obs.append(o.copy())
        achieved_goals.append(ag.copy())

        episode = dict(o=np.squeeze(obs),
                       u=np.squeeze(acts),
                       g=goals,
                       ag=achieved_goals,
                       grip_pos=np.array(grip_poses),
                       local_voxels=local_voxels)
        # for key, value in zip(self.info_keys, info_values):
        #     episode['info_{}'.format(key)] = value

        # stats

        # successful = np.array(successes)
        # assert successful.shape == (self.rollout_batch_size,)
        # success_rate = np.mean(successful)
        # self.success_history.append(success_rate)
        # if self.compute_Q:
        #     self.Q_history.append(np.mean(Qs))
        # self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
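The rendering rollout above (gen_rollouts_render) discretizes the action before stepping the env: with probability 0.9 it samples a random action, then snaps every component to the nearest value in disc_actions and pins component 1 to 0.5. A small self-contained sketch of the snapping step, with a hypothetical helper name and assuming 4-dimensional actions as in the snippet:

import numpy as np

def snap_to_discrete(u, disc_actions=np.array([-0.5, 0.5])):
    """Map each continuous action component to the nearest discrete value."""
    u = np.squeeze(u)                                      # e.g. (1, 4) -> (4,)
    distance = np.abs(u.reshape([-1, 1]) - disc_actions)   # pairwise distances, shape (4, 2)
    idx = np.argmin(distance, axis=-1)                     # index of the closest discrete value
    return disc_actions[idx]

print(snap_to_discrete(np.array([[0.1, -0.3, 0.7, -0.9]])))  # [ 0.5 -0.5  0.5 -0.5]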
Example #19
0
    def generate_rollouts_ker(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        episodes = []
        episodes_batch = []

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']),
                     np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']),
                      np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        dones = []
        info_values = [
            np.empty((self.T - 1, self.rollout_batch_size,
                      self.dims['info_' + key]), np.float32)
            for key in self.info_keys
        ]
        Qs = []

        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o,
                ag,
                self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)  # i.e. from (4,) to (1,4)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)

            # compute new states and observations; the reward is ignored here and recomputed in her_sampler.py
            obs_dict_new, _, done, info = self.venv.step(u)
            o_new = obs_dict_new['observation']
            ag_new = obs_dict_new['achieved_goal']
            success = np.array([i.get('is_success', 0.0) for i in info])

            # no need
            if any(done):
                # here we assume all environments finish in ~the same number of steps, so we terminate the rollout whenever any of the envs returns done
                # the trick when using vecenvs is not to add the obs from the environments that are "done", because those are already observations
                # after a reset
                break
            # no need
            for i, info_dict in enumerate(info):
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[i][key]
            # no need
            if np.isnan(o_new).any():
                self.logger.warn(
                    'NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            dones.append(done)
            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            # Set s = s' for new step
            o[...] = o_new
            ag[...] = ag_new
        # Append for last time step
        obs.append(o.copy())
        achieved_goals.append(ag.copy())

        # ----------------Kaleidoscope ER---------------------------
        original_ka_episodes = self.ker.ker_process(
            obs, acts, goals, achieved_goals
        )  # KER augments original episodes by an amount of 2*n_ker
        # ----------------end---------------------------

        # ----------------pack up as transition---------------------------
        for (obs, acts, goals, achieved_goals) in original_ka_episodes:
            episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
            for key, value in zip(self.info_keys, info_values):
                episode['info_{}'.format(key)] = value
            episodes.append(episode)
        # ----------------end---------------------------

        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size, )

        success_rate = np.mean(successful)
        self.success_history.append(success_rate)

        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))

        mul_factor = 1
        self.n_episodes += (mul_factor * self.rollout_batch_size)

        # ----------------format processing---------------------------
        # return dict: ['o', 'u', 'g', 'ag', 'info_is_success']
        for episode in episodes:
            episode_batch = convert_episode_to_batch_major(
                episode)  # i.e. from 50,1,25 to 1,50,25
            episodes_batch.append(episode_batch)
        # ----------------end---------------------------

        return episodes_batch
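generate_rollouts_ker differs from the other variants in that it returns a list of batch-major episodes rather than a single one, because self.ker.ker_process expands the collected episode into several symmetry-augmented copies. The stub below only illustrates the expected interface (a list of (obs, acts, goals, achieved_goals) tuples); it is a hypothetical placeholder, not the real Kaleidoscope ER transform, which reflects states, actions and goals across symmetry planes.

def ker_process_stub(obs, acts, goals, achieved_goals, n_ker=1):
    """Hypothetical stand-in for self.ker.ker_process: return the original episode
    plus 2*n_ker copies, matching the (obs, acts, goals, achieved_goals) tuple layout
    that the packing loop above expects. The real KER applies symmetry reflections."""
    episodes = [(obs, acts, goals, achieved_goals)]
    for _ in range(2 * n_ker):
        episodes.append((list(obs), list(acts), list(goals), list(achieved_goals)))
    return episodes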
Example #20
0
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']),
                     np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']),
                      np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes, successes_pos = [], [], [], [], [], []
        info_values = [
            np.empty(
                (self.T, self.rollout_batch_size, self.dims['info_' + key]),
                np.float32) for key in self.info_keys
        ]
        Qs = []
        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o,
                ag,
                self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            success_pos = np.zeros(self.rollout_batch_size)
            # compute new states and observations
            for i in range(self.rollout_batch_size):
                try:
                    # We fully ignore the reward here because it will have to be re-computed
                    # for HER.
                    curr_o_new, _, _, info = self.envs[i].step(u[i])
                    if 'is_success' in info:
                        success[i] = info['is_success'][1]
                        success_pos[i] = info['is_success'][0]
                    if 'done' in info:
                        self.first_policy_done = info['done']
                    o_new[i] = curr_o_new['observation']
                    ag_new[i] = curr_o_new['achieved_goal']
                    for idx, key in enumerate(self.info_keys):
                        info_values[idx][t, i] = info[key]
                    if self.render:
                        self.envs[i].render()
                except MujocoException as e:
                    return self.generate_rollouts()

            if self.first_policy_done:
                break

            if np.isnan(o_new).any():
                self.logger.warning(
                    'NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            successes_pos.append(success_pos.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            #Qs.append(np.linalg.norm(self.g.copy()-ag.copy(),axis=-1))
            o[...] = o_new
            ag[...] = ag_new

        episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        successful_pos = np.array(successes_pos)[-1, :]
        assert successful.shape == (self.rollout_batch_size, )
        success_rate = np.mean(successful)
        success_rate_pos = np.mean(successful_pos)
        self.success_history.append(success_rate)
        self.success_pos_history.append(success_rate_pos)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size

        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        self.initial_o[:] = o

        for t in range(self.T):

            # self.g = np.array([[1, 1, 1, 1,
            #                   0.82950088,  0.19504257,  0.74951634,  0.82558665,  0.19408095,  0.72752193,
            #                   0.8294237,  0.19509856,  0.70551644,  0.83616574,  0.19685965,  0.6825068]], 'Float32')

            self.g = np.array([[
                1, 1, 1, 1, 0.81399449, 0.08906187, 0.36651383, 0.80723628,
                0.08749478, 0.34525658, 0.80821288, 0.08766061, 0.32291785,
                0.81195864, 0.08844444, 0.29918275
            ]], np.float32)  # 'Float32' is not a valid NumPy dtype string; use np.float32

            policy_output = self.policy.get_actions(
                o,
                ag,
                self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            success_pos = np.zeros(self.rollout_batch_size)
            # compute new states and observations
            for i in range(self.rollout_batch_size):
                try:
                    # We fully ignore the reward here because it will have to be re-computed
                    # for HER.
                    curr_o_new, _, _, info = self.envs[i].step(u[i])
                    if 'is_success' in info:
                        success[i] = info['is_success'][1]
                        success_pos[i] = info['is_success'][0]
                    if 'done' in info:
                        self.first_policy_done = info['done']
                    o_new[i] = curr_o_new['observation']
                    ag_new[i] = curr_o_new['achieved_goal']
                    for idx, key in enumerate(self.info_keys):
                        info_values[idx][t, i] = info[key]
                    if self.render:
                        self.envs[i].render()
                except MujocoException as e:
                    return self.generate_rollouts()

            if self.first_policy_done:
                break

            if np.isnan(o_new).any():
                self.logger.warning(
                    'NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            successes_pos.append(success_pos.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            #Qs.append(np.linalg.norm(self.g.copy()-ag.copy(),axis=-1))
            o[...] = o_new
            ag[...] = ag_new

        # for t in range(self.T):
        #
        #     # self.g = np.array([[1, 1, 1, 1, 0.77939238,
        #     #        0.01007279,
        #     #        0.77396591,
        #     #        0.78406789,
        #     #        0.01003857,
        #     #        0.75209954,
        #     #        0.7807472,
        #     #        0.01000307,
        #     #        0.72998683,
        #     #        0.77445873,
        #     #        0.00996551,
        #     #        0.70678223,
        #     #        0.86935003,
        #     #        0.00708711,
        #     #        0.7767419,
        #     #        0.87402555,
        #     #        0.00705288,
        #     #        0.75487552,
        #     #        0.87070486,
        #     #        0.00701738,
        #     #        0.73276282,
        #     #        0.86441638,
        #     #        0.00697982,
        #     #        0.70955821]], 'Float32')
        #
        #
        #
        #     policy_output = self.policy.get_actions(
        #         o, ag, self.g,
        #         compute_Q=self.compute_Q,
        #         noise_eps=self.noise_eps if not self.exploit else 0.,
        #         random_eps=self.random_eps if not self.exploit else 0.,
        #         use_target_net=self.use_target_net)
        #
        #
        #     if self.compute_Q:
        #         u, Q = policy_output
        #         Qs.append(Q)
        #     else:
        #         u = policy_output
        #
        #     if u.ndim == 1:
        #         # The non-batched case should still have a reasonable shape.
        #         u = u.reshape(1, -1)
        #
        #     o_new = np.empty((self.rollout_batch_size, self.dims['o']))
        #     ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
        #     success = np.zeros(self.rollout_batch_size)
        #     success_pos = np.zeros(self.rollout_batch_size)
        #     # compute new states and observations
        #     for i in range(self.rollout_batch_size):
        #         try:
        #             # We fully ignore the reward here because it will have to be re-computed
        #             # for HER.
        #             curr_o_new, _, _, info = self.envs[i].step(u[i])
        #             if 'is_success' in info:
        #                 success[i] = info['is_success'][1]
        #                 success_pos[i] = info['is_success'][0]
        #             o_new[i] = curr_o_new['observation']
        #             ag_new[i] = curr_o_new['achieved_goal']
        #             for idx, key in enumerate(self.info_keys):
        #                 info_values[idx][t, i] = info[key]
        #             if self.render:
        #                 self.envs[i].render()
        #         except MujocoException as e:
        #             return self.generate_rollouts()
        #
        #     if np.isnan(o_new).any():
        #         self.logger.warning('NaN caught during rollout generation. Trying again...')
        #         self.reset_all_rollouts()
        #         return self.generate_rollouts()
        #
        #     obs.append(o.copy())
        #     achieved_goals.append(ag.copy())
        #     successes.append(success.copy())
        #     successes_pos.append(success_pos.copy())
        #     acts.append(u.copy())
        #     goals.append(self.g.copy())
        #     #Qs.append(np.linalg.norm(self.g.copy()-ag.copy(),axis=-1))
        #     o[...] = o_new
        #     ag[...] = ag_new

        return convert_episode_to_batch_major(episode)
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        directory_plot = '../../../test_plot_png/' + datetime.datetime.now().strftime("%m%d_%H%M%S") + os.sep
        directory_env = '../../../test_env_png/' + datetime.datetime.now().strftime("%m%d_%H%M%S") + os.sep
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []

        info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys]
        Qs = []
        x_bar = list(range(1, 26))  # indices of the 25 observation components
        x_lab = ["grip_pos-1","grip_pos-2","grip_pos-3","object_pos-1","object_pos-2","object_pos-3","object_rel_pos-1","object_rel_pos-2","object_rel_pos-3", "gripper_state-1", "gripper_state-2", "object_rot-1", "object_rot-2", "object_rot-3", "object_velp-1", "object_velp-2", "object_velp-3", "object_velr-1", "object_velr-2", "object_velr-3", "grip_velp-1", "grip_velp-2", "grip_velp-3", "gripper_vel-1", "gripper_vel-2"]
        observation_catcher = []
        observation_catcher_1 = []
        observation_catcher_2 = []
        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o, ag, self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            # compute new states and observations
            for i in range(self.rollout_batch_size):
                try:
                    # We fully ignore the reward here because it will have to be re-computed
                    # for HER.
                    curr_o_new, _, _, info = self.envs[i].step(u[i])
                    if 'is_success' in info:
                        success[i] = info['is_success']
                    o_new[i] = curr_o_new['observation']
                    ag_new[i] = curr_o_new['achieved_goal']

                    for idx, key in enumerate(self.info_keys):
                        info_values[idx][t, i] = info[key]
                    if self.render:
                        self.envs[i].render()
                    elif self.rendder_and_save_png: #ndrw
                        rgb_array = self.envs[i].render(mode='rgb_array')
                        im = Image.fromarray(rgb_array)
                        lov = im.crop((300,200,1000,650))
                        observation_catcher.append(o_new[i][0]) # used to append the required values from the observation vector (indices 0-24)
                        observation_catcher_1.append(o_new[i][1])
                        observation_catcher_2.append(o_new[i][2])

                        ax1.clear()
                        ax1.plot(observation_catcher,color='xkcd:coral',linewidth=3,marker='o',markevery=[-1])
                        ax1.plot(observation_catcher_1,color='xkcd:green',linewidth=3,marker='o',markevery=[-1])
                        ax1.plot(observation_catcher_2,color='xkcd:goldenrod',linewidth=3,marker='o',markevery=[-1])
                        ax1.set_xlabel('Time-Step',fontsize=11)
                        ax1.set_ylabel('Observation-Values',fontsize=11)
                        ax1.legend(['gripper_vel-1','gripper_vel-2'], loc = 'upper right',facecolor='#74dd93',frameon=False,fontsize='x-small', ncol=3, bbox_to_anchor=(1,1.03))
                        ax1.set_facecolor('#74dd93')
                        ax1.set_xlim(xmin=0)
                        ax1.set_xlim(xmax=50)
                        ax1.set_ylim(ymin=0.4)  # hard-coded default; adjust to the minimum of the observed values
                        ax1.set_ylim(ymax=1.4)  # hard-coded default; adjust to the maximum of the observed values

                        ax2.clear()
                        barlist = ax2.bar(x_bar,color='xkcd:silver',width=0.4,height=0.025)
                        barlist[0].set_color('xkcd:coral')
                        barlist[1].set_color('xkcd:green')
                        barlist[2].set_color('xkcd:goldenrod')
                        ax2.set_yticklabels([])
                        ax2.set_xticks(x_bar)
                        ax2.set_xticklabels(x_lab,rotation=90,fontsize=9)
                        ax2.set_title('Observation Vector Of The Two-Finger Gripper(NN-Input)',fontsize=12)
                        ax2.set_facecolor('#74dd93')
                        ax2.set_frame_on(False)
                        ax2.axes.get_yaxis().set_visible(False)
                        if not os.path.exists(directory_plot):
                            os.makedirs(directory_plot)
                        if not os.path.exists(directory_env):
                            os.makedirs(directory_env)
                        plt.savefig(directory_plot + "pic_{0:05d}.png".format(t),facecolor=fig.get_facecolor(), edgecolor='none')
                        lov.save(directory_env + "pic_{0:05d}.png".format(t))
                except MujocoException as e:
                    return self.generate_rollouts()

            if np.isnan(o_new).any():
                self.logger.warning('NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new
        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        self.initial_o[:] = o

        episode = dict(o=obs,
                       u=acts,
                       g=goals,
                       ag=achieved_goals)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value
        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size,)
        success_rate = np.mean(successful)
        self.success_history.append(success_rate)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size
        #print(observation_catcher_2)
        return convert_episode_to_batch_major(episode)
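The plotting variant above uses fig, ax1, ax2, plt and Image without defining them locally, so they must exist at module level. A minimal sketch of the setup it appears to assume (figure size and backend are guesses; the facecolor matches the one set on the axes):

import os
import datetime
import matplotlib
matplotlib.use('Agg')                 # off-screen rendering so plt.savefig works headlessly
import matplotlib.pyplot as plt
from PIL import Image

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 6))
fig.patch.set_facecolor('#74dd93')    # same green background used by ax1/ax2 above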
Example #22
0
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys]
        Qs = []
        for t in range(self.T):
            policy_output = self.policy.get_actions(
                o, ag, self.g,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            # compute new states and observations
            for i in range(self.rollout_batch_size):
                try:
                    # We fully ignore the reward here because it will have to be re-computed
                    # for HER.
                    curr_o_new, _, _, info = self.envs[i].step(u[i])
                    if 'is_success' in info:
                        success[i] = info['is_success']
                    o_new[i] = curr_o_new['observation']
                    ag_new[i] = curr_o_new['achieved_goal']
                    for idx, key in enumerate(self.info_keys):
                        info_values[idx][t, i] = info[key]
                    if self.render:
                        self.envs[i].render()
                except MujocoException as e:
                    return self.generate_rollouts()

            if np.isnan(o_new).any():
                self.logger.warn('NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            o[...] = o_new
            ag[...] = ag_new
        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        self.initial_o[:] = o

        episode = dict(o=obs,
                       u=acts,
                       g=goals,
                       ag=achieved_goals)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size,)
        success_rate = np.mean(successful)
        self.success_history.append(success_rate)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
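Example #22 is the canonical per-env baselines-style generate_rollouts. For context, a sketch of how such a worker is usually driven from a training loop, assuming a DDPG+HER policy object that exposes store_episode(), train() and update_target_net() (names follow the baselines convention and are not taken from the snippets above):

def train_epoch(rollout_worker, policy, n_cycles=50, n_batches=40):
    """One HER training epoch: collect rollouts, store them, then update the policy."""
    for _ in range(n_cycles):
        episode = rollout_worker.generate_rollouts()  # batch-major dict: o, u, g, ag, info_*
        policy.store_episode(episode)                 # push into the HER replay buffer
        for _ in range(n_batches):
            policy.train()                            # sample HER transitions and update networks
        policy.update_target_net()                    # update the target networks
    return rollout_worker.success_history[-1]         # latest success rate for logging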
Example #23
0
    def generate_rollouts(self):
        """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current
        policy acting on it accordingly.
        """
        self.reset_all_rollouts()

        # compute observations
        o = np.empty((self.rollout_batch_size, self.dims['o']),
                     np.float32)  # observations
        ag = np.empty((self.rollout_batch_size, self.dims['g']),
                      np.float32)  # achieved goals
        o[:] = self.initial_o
        ag[:] = self.initial_ag

        # generate episodes
        obs, achieved_goals, acts, goals, successes = [], [], [], [], []
        consistent_sgss = []
        dones = []
        info_values = [
            np.empty((self.T - 1, self.rollout_batch_size,
                      self.dims['info_' + key]), np.float32)
            for key in self.info_keys
        ]
        Qs = []
        # print("new ep")
        # g_index = 0
        g_indices = [0] * self.rollout_batch_size
        # self.policies.g_index = 0
        for t in range(self.T):
            # policy_output = self.policy.get_actions(
            #     # o, ag, self.gs[self.g_index],
            #     # o, ag, self.g,
            #     o, ag, self.gs[2],
            #     compute_Q=self.compute_Q,
            #     noise_eps=self.noise_eps if not self.exploit else 0.,
            #     random_eps=self.random_eps if not self.exploit else 0.,
            #     use_target_net=self.use_target_net)

            # print(o)
            # print(o.shape)
            # print(ag.shape)
            # print(self.gs)
            # print(self.gs.shape)
            # print(self.gs[0][g_index])
            # print(self.g)
            # print(self.gs[0][g_index].shape)

            # Shapes with num_env = 2 (same pattern for num_cpu = n)  |  num_env = 1:
            #   o.shape:                (2, 25)                       |  (1, 25)
            #   ag.shape:               (2, 3)                        |  (1, 3)
            #   self.gs.shape:          (2, 3, 3)                     |  (1, 3, 3)
            #   self.gs[0][g_index]:    [1.47, .62, .45]              |  [1.46, .62, .45], shape (3,)
            # [g[i][g_inds[i]] for i in range(len(g_inds))]
            # sgs = np.array([a[b] for a, b in zip(self.gs, g_indices)])

            self.g = np.array([a[b] for a, b in zip(self.gs, g_indices)])
            # print(self.gs)
            # print(g_indices)
            # print(self.g)
            policy_output = self.policies.get_actions(
                # o, ag, self.gs, g_index,
                # o, ag, self.gs[0][g_index], g_index,
                # o, ag, sgs, g_indices,
                o,
                ag,
                self.g,
                g_indices,
                compute_Q=self.compute_Q,
                noise_eps=self.noise_eps if not self.exploit else 0.,
                random_eps=self.random_eps if not self.exploit else 0.,
                use_target_net=self.use_target_net)

            if self.compute_Q:
                u, Q = policy_output
                Qs.append(Q)
            else:
                u = policy_output

            if u.ndim == 1:
                # The non-batched case should still have a reasonable shape.
                u = u.reshape(1, -1)

            o_new = np.empty((self.rollout_batch_size, self.dims['o']))
            ag_new = np.empty((self.rollout_batch_size, self.dims['g']))
            success = np.zeros(self.rollout_batch_size)
            # compute new states and observations
            obs_dict_new, rewards, done, info = self.venv.step(u)

            # TODO: this currently only works for a single env; extend to any number of envs
            # g_index_new = obs_dict_new['goal_index'] #make sure this doesn't change outside of this
            # consistent_sgs = info[0]['consistent_subgoals']
            consistent_sgs = np.array(
                [i.get('consistent_subgoals', 0.0) for i in info])

            o_new = obs_dict_new['observation']
            ag_new = obs_dict_new['achieved_goal']
            success = np.array([i.get('is_success', 0.0) for i in info])

            # self.g_index = g_index_new

            #update goal/goal_index if we achieve a subgoal
            for i in np.where(rewards != -1)[0]:
                # print(i)
                g_indices[i] = min(g_indices[i] + 1,
                                   self.policies.num_goals - 1)
                # print("?")
                # self.g = [self.gs[:,g_indices]]

            # if reward != -1 and g_index < len(self.gs[0])-1:#[0])-1:
            #     g_index += 1
            #     #would have to be of len(numenvs)
            #     self.g = [self.gs[0][g_index]]

            # #identify transition as candidate for subgoal experience replay
            # for i in range(len(consistent_sgs)):
            #     if consistent_sgs[i] == 1:
            #         self.subgoal_timesteps[i].append(t)

            # if g_index_new != self.g_index:
            #     self.subgoal_timesteps.append(t)
            #     self.g_index = g_index_new

            if any(done):
                # here we assume all environments finish in ~the same number of steps, so we terminate the rollout whenever any of the envs returns done
                # the trick when using vecenvs is not to add the obs from the environments that are "done", because those are already observations
                # after a reset
                break

            for i, info_dict in enumerate(info):
                for idx, key in enumerate(self.info_keys):
                    info_values[idx][t, i] = info[i][key]

            if np.isnan(o_new).any():
                self.logger.warn(
                    'NaN caught during rollout generation. Trying again...')
                self.reset_all_rollouts()
                return self.generate_rollouts()

            consistent_sgss.append(consistent_sgs.copy())
            dones.append(done)
            obs.append(o.copy())
            achieved_goals.append(ag.copy())
            successes.append(success.copy())
            acts.append(u.copy())
            goals.append(self.g.copy())
            # goals.append(self.gs[self.g_index].copy())
            o[...] = o_new
            ag[...] = ag_new

            #in case subgoal was achieved
            # self.g = obs_dict_new['desired_goal'].copy()

            # if reward != -1 and self.g_index < len(self.goals):
            # self.g_index += 1

        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        episode = dict(o=obs,
                       u=acts,
                       g=goals,
                       ag=achieved_goals,
                       sgt=consistent_sgss)
        for key, value in zip(self.info_keys, info_values):
            episode['info_{}'.format(key)] = value

        # stats
        successful = np.array(successes)[-1, :]
        assert successful.shape == (self.rollout_batch_size, )
        success_rate = np.mean(successful)
        self.success_history.append(success_rate)
        if self.compute_Q:
            self.Q_history.append(np.mean(Qs))
        self.n_episodes += self.rollout_batch_size

        return convert_episode_to_batch_major(episode)
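Example #23 maintains one subgoal index per parallel env and advances it whenever that env's reward is not -1 (the sparse-reward signal that a subgoal was reached), clamping at the last goal. The same logic in isolation, as a small sketch with a hypothetical helper name:

import numpy as np

def advance_subgoal_indices(g_indices, rewards, num_goals):
    """Advance each env's subgoal index when its reward != -1, clamping at the last goal."""
    for i in np.where(rewards != -1)[0]:
        g_indices[i] = min(g_indices[i] + 1, num_goals - 1)
    return g_indices

print(advance_subgoal_indices([0, 2], np.array([0.0, -1.0]), num_goals=3))  # [1, 2]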