Example #1
    def __init__(self,
                 args,
                 observation_size,
                 action_size,
                 network_type,
                 task_queue,
                 result_queue,
                 worker_id,
                 name_scope='planning_worker'):

        # the multiprocessing initialization
        multiprocessing.Process.__init__(self)
        self.args = args
        self._name_scope = name_scope
        self._worker_id = worker_id
        self._network_type = network_type
        self._npr = np.random.RandomState(args.seed + self._worker_id)

        self._observation_size = observation_size
        self._action_size = action_size
        self._task_queue = task_queue
        self._result_queue = result_queue

        logger.info('Worker {} online'.format(self._worker_id))
        self._base_dir = init_path.get_base_dir()
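
# For context, a minimal sketch of how a worker like this is typically wired up
# with multiprocessing queues. The constructor arguments mirror the signature
# above; `worker_cls`, `launch_workers` and the queue layout are illustrative
# assumptions, not code from this repo.
import multiprocessing


def launch_workers(worker_cls, args, observation_size, action_size,
                   network_type, num_workers=4):
    task_queue = multiprocessing.JoinableQueue()
    result_queue = multiprocessing.Queue()

    workers = [
        worker_cls(args, observation_size, action_size, network_type,
                   task_queue, result_queue, worker_id=i)
        for i in range(num_workers)
    ]
    for worker in workers:
        worker.start()  # each process enters its own run() loop
    return task_queue, result_queue, workers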
Example #2
def load_tf_model(sess, model_path, tf_var_list=[], ignore_prefix='INVALID'):
    '''
        @brief: load the tensorflow variables from a numpy npy file
    '''
    is_file_valid(model_path)
    logger.info('\tLOADING tensorflow variables')

    # load the parameters
    output_save_list = np.load(model_path, encoding='latin1').item()
    tf_name_list = [var.name for var in tf_var_list]

    # get the weights one by one
    for name, val in output_save_list.items():
        if name in tf_name_list:
            logger.info('\t\tloading TF pretrained parameters {}'.format(name))
            tf_name_list.remove(name)  # just for sanity check

            # pick up the variable that has the name
            var = [var for var in tf_var_list if var.name == name][0]

            assign_op = var.assign(val)
            sess.run(assign_op)  # or `assign_op.op.run()`
        else:
            logger.warning(
                '\t\t**** Parameters Not Exist **** {}'.format(name))

    if len(tf_name_list) > 0:
        logger.warning(
            'Some parameters are not loaded from the checkpoint: {}'.format(
                tf_name_list))
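
# Note: these checkpoints are plain .npy files holding a pickled dict, so on
# NumPy 1.16.3+ (which defaults allow_pickle to False) the np.load calls above
# additionally need allow_pickle=True. A small round-trip sketch; the file name
# and keys are illustrative.
import numpy as np

weights = {'policy/w0:0': np.zeros((4, 8)), 'policy/b0:0': np.zeros(8)}
np.save('/tmp/example_checkpoint.npy', weights)

# np.save wraps the dict in a 0-d object array, hence the .item() call when
# reading it back.
restored = np.load('/tmp/example_checkpoint.npy',
                   encoding='latin1', allow_pickle=True).item()
assert set(restored.keys()) == set(weights.keys())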
Example #3
    def _play(self, planning_data):
        if self.args.num_expert_episode_to_save > 0 and \
                self._previous_reward > self._env_solved_reward and \
                self._worker_id == 0:
            start_save_episode = True
            logger.info('Last episodic reward: %.4f' % self._previous_reward)
            logger.info('Minimum reward of %.4f is needed to start saving' %
                        self._env_solved_reward)
            logger.info('[SAVING] Worker %d will record its episode data' %
                        self._worker_id)
        else:
            start_save_episode = False
            if self.args.num_expert_episode_to_save > 0 \
                    and self._worker_id == 0:
                logger.info('Last episodic reward: %.4f' %
                            self._previous_reward)
                logger.info(
                    'Minimum reward of %.4f is needed to start saving' %
                    self._env_solved_reward)

        traj_episode = play_episode_with_env(
            self._env, self._act, {
                'use_random_action': planning_data['use_random_action'],
                'record_flag': start_save_episode,
                'num_episode': self.args.num_expert_episode_to_save,
                'data_name': self.args.task + '_' + self.args.exp_id
            })
        self._previous_reward = np.sum(traj_episode['rewards'])
        return traj_episode
Example #4
    def rollouts_using_worker_planning(self,
                                       num_timesteps=None,
                                       use_random_action=False):
        ''' @brief:
                Run the experiments until a total of @timesteps_per_batch
                timesteps are collected.
        '''
        self._current_iteration += 1
        num_timesteps_received = 0
        timesteps_needed = self.args.timesteps_per_batch \
            if num_timesteps is None else num_timesteps
        rollout_data = []

        while True:
            # init the data
            self._ilqr_data_wrapper.init_episode_data()
            traj_episode = self._play(use_random_action)
            logger.info('done with episode')
            rollout_data.append(traj_episode)
            num_timesteps_received += len(traj_episode['rewards'])

            # update the number of timesteps still needed
            timesteps_needed = self.args.timesteps_per_batch - \
                num_timesteps_received

            if timesteps_needed <= 0 or self.args.test:
                break

        logger.info('{} timesteps from {} episodes collected'.format(
            num_timesteps_received, len(rollout_data)))

        return {'data': rollout_data}
Example #5
    def _act(self, state, control_info={'use_random_action': False}):
        if 'use_random_action' in control_info and \
                control_info['use_random_action']:
            # use random policy
            action = self._npr.uniform(-1, 1, [self._action_size])
            return action, [-1], [-1]

        else:
            # update the data
            self._update_plan_data(state)
            pred_reward = [
                -self._plan_data[i_traj]['l'].sum()
                for i_traj in range(self.args.num_ilqr_traj)
            ]

            for _ in range(self.args.ilqr_iteration):
                self._backward_pass()
                self._forward_pass()

            # logging information
            for i_traj in range(self.args.num_ilqr_traj):
                diff = -self._plan_data[i_traj]['l'].sum() - pred_reward[i_traj]
                logger.info('Traj {}: Pred ({}) + ({})'.format(
                    i_traj, pred_reward[i_traj], diff))

            # get control signals from the best traj
            traj_id = np.argsort(
                [np.sum(traj_data['l']) for traj_data in self._plan_data])[0]
            return self._plan_data[traj_id]['u'][0], [-1], [-1]
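
# A small aside on the trajectory selection above: np.argsort(costs)[0] returns
# the index of the smallest total cost l, i.e. it is equivalent to
# np.argmin(costs). Toy illustration (the numbers are made up):
import numpy as np

costs = np.array([12.3, 4.1, 9.8])  # total cost l for each candidate trajectory
assert np.argsort(costs)[0] == np.argmin(costs) == 1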
Example #6
def is_file_valid(model_path, save_file=False):
    assert model_path.endswith('.npy'), logger.error(
        'Invalid file provided {}'.format(model_path))
    if not save_file:
        assert os.path.exists(model_path), logger.error(
            'file not found: {}'.format(model_path))
    logger.info('[LOAD/SAVE] checkpoint path is {}'.format(model_path))
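
# A note on the `assert cond, logger.error(msg)` idiom above: the message
# expression is evaluated only when the assertion fails, so the error does get
# logged, but the raised AssertionError carries None (the return value of
# logger.error), and the whole check is stripped under `python -O`. A possible
# alternative, sketched here as an assumption rather than how the repo does it:
import logging
import os

logger = logging.getLogger(__name__)


def is_file_valid_strict(model_path, save_file=False):
    # log and raise explicitly so the message survives in the exception
    if not model_path.endswith('.npy'):
        msg = 'Invalid file provided {}'.format(model_path)
        logger.error(msg)
        raise ValueError(msg)
    if not save_file and not os.path.exists(model_path):
        msg = 'file not found: {}'.format(model_path)
        logger.error(msg)
        raise IOError(msg)
    logger.info('[LOAD/SAVE] checkpoint path is {}'.format(model_path))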
Example #7
def load_numpy_model(model_path, numpy_var_list={}):
    '''
        @brief: load numpy variables from npy files. The variables could be
            from baseline or from ob_normalizer
        @output:
            Note that this function only returns the values; it does not
            assign them in place (whereas the tf variables are assigned into
            the session at load time).
    '''
    is_file_valid(model_path)
    logger.info('LOADING numpy variables')

    output_save_list = np.load(model_path, encoding='latin1').item()
    numpy_name_list = [key for key, val in numpy_var_list.items()]

    # get the weights one by one
    for name, val in output_save_list.items():
        if name in numpy_name_list:
            logger.info(
                '\t\tloading numpy pretrained parameters {}'.format(name))
            numpy_name_list.remove(name)  # just for sanity check
            numpy_var_list[name] = val
        else:
            logger.warning(
                '\t\t**** Parameters Not Exist **** {}'.format(name))

    if len(numpy_name_list) > 0:
        logger.warning(
            'Some parameters are not loaded from the checkpoint: {}'.format(
                numpy_name_list))
    return numpy_var_list
Example #8
def train(trainer, sampler, worker, dynamics, policy, reward, args=None):
    logger.info('Training starts at {}'.format(init_path.get_abs_base_dir()))
    network_type = {'policy': policy, 'dynamics': dynamics, 'reward': reward}

    # make the trainer and sampler
    sampler_agent = make_sampler(sampler, worker, network_type, args)
    trainer_tasks, trainer_results, trainer_agent, init_weights = \
        make_trainer(trainer, network_type, args)
    sampler_agent.set_weights(init_weights)

    timer_dict = OrderedDict()
    timer_dict['Program Start'] = time.time()
    totalsteps = 0
    current_iteration = 0

    while True:
        timer_dict['** Program Total Time **'] = time.time()

        # step 1: collect rollout data
        if current_iteration == 0 and args.random_timesteps > 0 and \
                (not (args.gt_dynamics and args.gt_reward)):
            # we could first generate random rollout data for exploration
            logger.info(
                'Generating {} random timesteps'.format(args.random_timesteps)
            )
            rollout_data = sampler_agent.rollouts_using_worker_planning(
                args.random_timesteps, use_random_action=True
            )
        else:
            rollout_data = sampler_agent.rollouts_using_worker_planning()

        timer_dict['Generate Rollout'] = time.time()

        # step 2: train the weights for dynamics and policy network
        training_info = {'network_to_train': ['dynamics', 'reward', 'policy']}
        trainer_tasks.put(
            (parallel_util.TRAIN_SIGNAL,
             {'data': rollout_data['data'], 'training_info': training_info})
        )
        trainer_tasks.join()
        training_return = trainer_results.get()
        timer_dict['Train Weights'] = time.time()

        # step 3: update the weights
        sampler_agent.set_weights(training_return['network_weights'])
        timer_dict['Assign Weights'] = time.time()

        # log and print the results
        log_results(training_return, timer_dict)

        totalsteps = training_return['totalsteps']
        if totalsteps > args.max_timesteps:
            break
        else:
            current_iteration += 1

    # end of training
    sampler_agent.end()
    trainer_tasks.put((parallel_util.END_SIGNAL, None))
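
# The trainer is driven through a pair of queues: a JoinableQueue for tasks and
# a plain Queue for results. A stripped-down sketch of that request/response
# round trip; the signal constants and the trainer body are illustrative
# stand-ins for parallel_util and the real trainer process.
import multiprocessing

TRAIN_SIGNAL, END_SIGNAL = 0, 1


def _toy_trainer_loop(tasks, results):
    while True:
        signal, payload = tasks.get()
        if signal == TRAIN_SIGNAL:
            # ... train on payload['data'] here ...
            results.put({'network_weights': None, 'totalsteps': 0})
        tasks.task_done()
        if signal == END_SIGNAL:
            break


if __name__ == '__main__':
    trainer_tasks = multiprocessing.JoinableQueue()
    trainer_results = multiprocessing.Queue()
    proc = multiprocessing.Process(target=_toy_trainer_loop,
                                   args=(trainer_tasks, trainer_results))
    proc.start()

    trainer_tasks.put((TRAIN_SIGNAL, {'data': [], 'training_info': {}}))
    trainer_tasks.join()                     # returns once task_done() is called
    training_return = trainer_results.get()  # then fetch the trainer's reply

    trainer_tasks.put((END_SIGNAL, None))
    trainer_tasks.join()
    proc.join()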
Example #9
    def _get_groundtruth_reward(self, rollout_data, training_stats):

        for i_episode in rollout_data:
            i_episode['raw_episodic_reward'] = sum(i_episode['raw_rewards'])
        avg_reward = np.mean(
            [i_episode['raw_episodic_reward'] for i_episode in rollout_data])
        logger.info('Raw reward: {}'.format(avg_reward))
        training_stats['RAW_reward'] = avg_reward
Example #10
    def pred(self, data_dict):
        logger.info('This function should not be used!')
        reward = []
        for i_data in range(len(data_dict['action'])):
            i_reward = self._env.reward(
                {key: data_dict[key][i_data]
                 for key in ['start_state', 'action']}
            )
            reward.append(i_reward)
        return np.stack(reward), -1, -1
Example #11
def train_mf(mb_steps, policy_weight, trainer, sampler, worker, dynamics,
             policy, reward, args=None):
    logger.info('Training starts at {}'.format(init_path.get_abs_base_dir()))
    network_type = {'policy': policy, 'dynamics': dynamics, 'reward': reward}

    # make the trainer and sampler
    sampler_agent = make_sampler(sampler, worker, network_type, args)
    trainer_tasks, trainer_results, trainer_agent, init_weights = \
        make_trainer(trainer, network_type, args)

    # Initialize the policy with dagger policy weight.
    trainer_tasks.put((parallel_util.SET_POLICY_WEIGHT, policy_weight))
    trainer_tasks.join()
    init_weights['policy'][0] = policy_weight
    sampler_agent.set_weights(init_weights)

    timer_dict = OrderedDict()
    timer_dict['Program Start'] = time.time()
    current_iteration = 0

    while True:
        timer_dict['** Program Total Time **'] = time.time()

        # step 1: collect rollout data
        rollout_data = \
            sampler_agent.rollouts_using_worker_playing(use_true_env=True)

        timer_dict['Generate Rollout'] = time.time()

        # step 2: train the weights for dynamics and policy network
        training_info = {'network_to_train': ['dynamics', 'reward', 'policy']}
        trainer_tasks.put(
            (parallel_util.TRAIN_SIGNAL,
             {'data': rollout_data['data'], 'training_info': training_info})
        )
        trainer_tasks.join()
        training_return = trainer_results.get()
        timer_dict['Train Weights'] = time.time()

        # step 3: update the weights
        sampler_agent.set_weights(training_return['network_weights'])
        timer_dict['Assign Weights'] = time.time()

        # log and print the results
        log_results(training_return, timer_dict, mb_steps)

        if training_return['totalsteps'] > args.max_timesteps:
            break
        else:
            current_iteration += 1

    # end of training
    sampler_agent.end()
    trainer_tasks.put((parallel_util.END_SIGNAL, None))
Example #12
    def train_initial_policy(self, data_dict, replay_buffer, training_info={}):
        # get the validation set
        # Hack the policy val percentage to 0.1 for policy initialization.
        self.args.policy_val_percentage = 0.1

        new_data_id = list(range(len(data_dict['start_state'])))
        self._npr.shuffle(new_data_id)
        num_val = int(len(new_data_id) * self.args.policy_val_percentage)
        val_data = {
            key: data_dict[key][new_data_id][:num_val]
            for key in ['start_state', 'end_state', 'action']
        }

        # get the training set
        train_data = {
            key: data_dict[key][new_data_id][num_val:]
            for key in ['start_state', 'end_state', 'action']
        }

        for i_epoch in range(self.args.dagger_epoch):
            # get the number of batches
            num_batches = len(train_data['action']) // \
                self.args.initial_policy_bs
            # from util.common.fpdb import fpdb; fpdb().set_trace()
            assert num_batches > 0, logger.error('batch_size > data_set')
            avg_training_loss = []

            for i_batch in range(num_batches):
                # train for each sub batch
                feed_dict = {
                    self._input_ph[key]: train_data[key][
                        i_batch * self.args.initial_policy_bs:
                        (i_batch + 1) * self.args.initial_policy_bs
                    ] for key in ['start_state', 'action']
                }
                fetch_dict = {
                    'update_op': self._update_operator['initial_update_op'],
                    'train_loss': self._update_operator['initial_policy_loss']
                }

                training_stat = self._session.run(fetch_dict, feed_dict)
                avg_training_loss.append(training_stat['train_loss'])

            val_loss = self.eval(val_data)

            logger.info(
                '[initial policy at epoch {}]: Val Loss: {}, Train Loss: {}'.format(
                    i_epoch, val_loss, np.mean(avg_training_loss)
                )
            )

        training_stat['val_loss'] = val_loss
        training_stat['avg_train_loss'] = np.mean(avg_training_loss)
        return training_stat
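
# The shuffle-then-slice split above is a common pattern; for reference, a
# self-contained version on a toy dict of arrays (names and shapes are
# illustrative). Note that data[key][ids][:num_val] first materializes the
# fully permuted array; indexing with ids[:num_val] gives the same result
# more cheaply.
import numpy as np

npr = np.random.RandomState(0)
data_dict = {'start_state': np.arange(20).reshape(10, 2),
             'action': np.arange(10).reshape(10, 1)}

ids = np.arange(len(data_dict['start_state']))
npr.shuffle(ids)
num_val = int(len(ids) * 0.1)

val_data = {key: data_dict[key][ids[:num_val]] for key in data_dict}
train_data = {key: data_dict[key][ids[num_val:]] for key in data_dict}
assert len(val_data['action']) + len(train_data['action']) == 10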
Example #13
    def rollouts_using_worker_playing(self,
                                      num_timesteps=None,
                                      use_random_action=False,
                                      use_true_env=False):
        """ @brief:
            In this case, the sampler will call workers to generate data
        """
        self._current_iteration += 1
        num_timesteps_received = 0
        numsteps_indicator = False if num_timesteps is None else True
        timesteps_needed = self.args.timesteps_per_batch \
            if num_timesteps is None else num_timesteps
        rollout_data = []

        while True:
            # how many episodes are expected to complete the current dataset?
            num_estimiated_episode = max(
                int(np.ceil(timesteps_needed / self._avg_episode_len)), 1)

            # send out the task for each worker to play
            for _ in range(num_estimiated_episode):
                self._task_queue.put((parallel_util.WORKER_PLAYING, {
                    'use_true_env': use_true_env,
                    'use_random_action': use_random_action
                }))
            self._task_queue.join()

            # collect the data
            for _ in range(num_estimiated_episode):
                traj_episode = self._result_queue.get()
                rollout_data.append(traj_episode)
                num_timesteps_received += len(traj_episode['rewards'])

            # update average timesteps per episode and timestep remains
            self._avg_episode_len = \
                float(num_timesteps_received) / len(rollout_data)
            if numsteps_indicator:
                timesteps_needed = num_timesteps - \
                    num_timesteps_received
            else:
                timesteps_needed = self.args.timesteps_per_batch - \
                    num_timesteps_received

            logger.info('Finished {}th episode'.format(len(rollout_data)))
            if timesteps_needed <= 0 or self.args.test:
                break

        logger.info('{} timesteps from {} episodes collected'.format(
            num_timesteps_received, len(rollout_data)))

        return {'data': rollout_data}
Example #14
    def train(self, data_dict, replay_buffer, training_info={}):
        # update the whitening stats of the network
        self._set_whitening_var(data_dict['whitening_stats'])

        # get the validation data
        new_data_id = list(range(len(data_dict['start_states'])))
        self._npr.shuffle(new_data_id)
        num_val = min(int(len(new_data_id) * self.args.dynamics_val_percentage),
                      self.args.dynamics_val_max_size)
        val_data = {
            'start_states': data_dict['start_states'][new_data_id][:num_val],
            'end_states': data_dict['end_states'][new_data_id][:num_val],
            'actions': data_dict['actions'][new_data_id][:num_val],
        }

        # TODO(GD): update coeff

        total_iters = 0
        for i_epochs in range(self.args.dynamics_epochs):
            train_data = self._replay_buffer.get_all_data(self)
            num_batches = len(train_data) // self.args.dynamics_batch_size
            avg_training_loss = []
            for i_batch in range(num_batches):
                # feed in the sub-batch
                feed_dict = {
                    self._input_ph[key]: train_data[key][
                        i_batch * self.args.dynamics_batch_size:
                        (i_batch + 1) * self.args.dynamics_batch_size
                    ] for key in ['start_states', 'end_states', 'actions']
                }
                fetch_dict = {
                    'update_op': self._update_operator['update_op'],
                    'loss': self._update_operator['loss']
                }

                training_stat = self._session.run(fetch_dict, feed_dict)
                avg_training_loss.append(training_stat['loss'])

                if total_iters % 2 == 0:
                    self._session.run(self._update_operator['cov_update_op'], feed_dict)

                if total_iters % 20 == 0:
                    self._session.run(self._update_operator['var_update_op'])

            val_loss = self.eval(val_data)

            logger.info('[dynamics]: Val Loss: {}, Train Loss: {}'.format(
                val_loss, np.mean(avg_training_loss))
            )
Example #15
def save_tf_model(sess, model_path, tf_var_list=[]):
    '''
        @brief: save the tensorflow variables into a numpy npy file
    '''
    is_file_valid(model_path, save_file=True)
    logger.info('\tSAVING tensorflow variables')

    # get the tf weights one by one
    output_save_list = dict()
    for var in tf_var_list:
        weights = sess.run(var)
        output_save_list[var.name] = weights
        logger.info('\t\t[Checkpoint] saving tf parameter {}'.format(var.name))

    # save the model
    np.save(model_path, output_save_list)
Example #16
def save_numpy_model(model_path, numpy_var_list={}):
    '''
        @brief: save the numpy variables into a numpy npy file
    '''
    is_file_valid(model_path, save_file=True)

    logger.info('\tSAVING numpy variables')

    # get the numpy weights one by one
    output_save_list = dict()
    for key, var in numpy_var_list.items():
        output_save_list[key] = var
        logger.info('\t\t[Checkpoint] saving numpy parameter {}'.format(key))

    # save the model
    np.save(model_path, output_save_list)
Example #17
    def _preprocess_data(self, rollout_data):
        """ @brief:
                Process the data, collect the element of
                ['start_state', 'end_state', 'action', 'reward', 'return',
                 'ob', 'action_dist_mu', 'action_dist_logstd']
        """
        # get the observations
        training_data = {}

        # get the returns (might be needed to train policy)
        for i_episode in rollout_data:
            i_episode["returns"] = \
                misc_utils.get_return(i_episode["rewards"], self.args.gamma)

        training_data['start_state'] = np.concatenate(
            [i_episode['obs'][:-1] for i_episode in rollout_data])
        training_data['end_state'] = np.concatenate(
            [i_episode['obs'][1:] for i_episode in rollout_data])
        for key in [
                'action', 'reward', 'return', 'old_action_dist_mu',
                'old_action_dist_logstd'
        ]:
            training_data[key] = np.concatenate(
                [i_episode[key + 's'][:] for i_episode in rollout_data])

        # record the length
        training_data['episode_length'] = \
            [len(i_episode['rewards']) for i_episode in rollout_data]

        # get the episodic reward
        for i_episode in rollout_data:
            i_episode['episodic_reward'] = sum(i_episode['rewards'])
        avg_reward = np.mean(
            [i_episode['episodic_reward'] for i_episode in rollout_data])
        logger.info('Mean reward: {}'.format(avg_reward))

        training_data['whitening_stats'] = self._whitening_stats
        training_data['avg_reward'] = avg_reward
        training_data['avg_reward_std'] = \
            np.std([i_episode['episodic_reward'] for i_episode in rollout_data])

        training_data['rollout_data'] = rollout_data

        # update timesteps so far
        self._timesteps_so_far += len(training_data['action'])
        return training_data
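
# The per-episode 'returns' above come from misc_utils.get_return, which is not
# shown here. A standard discounted-return computation it presumably resembles
# (a hypothetical stand-in, not the repo's implementation):
import numpy as np


def get_discounted_return(rewards, gamma):
    # returns[t] = sum_{k >= t} gamma**(k - t) * rewards[k]
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


assert np.allclose(get_discounted_return([1.0, 1.0, 1.0], 0.5), [1.75, 1.5, 1.0])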
Example #18
def load_expert_trajectory(traj_data_name, traj_episode_num):
    '''
        @brief:
            load the expert trajectory. It could either be a full trajectory
            or keyframe states.
        @output:
            The returned array concatenates the observations from
            expert_trajectory, which is a list of dicts. Each dict corresponds
            to one episode and has the keys 'observation' and 'timestep'. The
            shape of expert_trajectory[0]['observation'] is @num_timestep by
            @(num_ob_size).
            example: expert_trajectory[0]['timestep'] = [2, 3, 5, ...]
    '''
    expert_trajectory = load_expert_data(traj_data_name, traj_episode_num)
    expert_trajectory_obs = np.concatenate(
        [i_traj['observation'] for i_traj in expert_trajectory])
    logger.info('Loaded expert trajectory')
    logger.info('Num_traj: {}, size: {}'.format(len(expert_trajectory),
                                                expert_trajectory_obs.shape))
    return expert_trajectory_obs
Example #19
def model_save_from_list(sess, model_path, tf_var_list=[], numpy_var_list={}):
    '''
        @brief:
            if the var list is given, we just save them
    '''
    if not model_path.endswith('.npy'):
        model_path = model_path + '.npy'

    logger.info('saving checkpoint to {}'.format(model_path))
    output_save_list = dict()

    # get the tf weights one by one
    for var in tf_var_list:
        weights = sess.run(var)
        output_save_list[var.name] = weights
        logger.info('[checkpoint] saving tf parameter {}'.format(var.name))

    # get the numpy weights one by one
    for key, var in numpy_var_list.items():
        output_save_list[key] = var
        logger.info('[checkpoint] saving numpy parameter {}'.format(key))

    # save the model
    np.save(model_path, output_save_list)

    return
Example #20
    def _loss_function(self, sol, fetch_data_dict={}):
        """ @brief: the loss function to be used by the LBFGS optimizer

            @fetch_data_dict:
                We can fetch some intermediate variables (interpolated qpos /
                qvel / qacc)
        """

        if self._camera_info['mode'] in ['static', 'trackcom']:
            # only the qposes
            sol_qpos = sol[self._var_to_sol_id['qpos']]
            sol_qpos = sol_qpos.reshape([-1, self._len_qpos])
            camera_state = sol[self._var_to_sol_id['camera_state']]
            total_loss, _fetch_data_dict = \
                self._loss_from_sol_qpos_camera_state(sol_qpos, camera_state)

        else:
            raise NotImplementedError  # TODO for free

        # gather the data that can be reused
        for key in fetch_data_dict:
            fetch_data_dict[key] = _fetch_data_dict[key]
        logger.info("Current loss: {}".format(total_loss))
        logger.info("\tphysics loss: {}".format(
            np.mean(_fetch_data_dict['physics_loss']))
        )
        logger.info("\tproject loss: {}".format(
            np.mean(_fetch_data_dict['projection_loss']))
        )
        return total_loss
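
# The fetch_data_dict argument works as an out-parameter: the caller
# pre-populates the keys it wants and the function fills them in place (see
# _get_fd_gradient below, which asks for 'physics_loss' and 'projection_loss').
# A minimal illustration of that calling convention with a toy loss; all names
# here are illustrative.
def toy_loss_with_side_outputs(x, fetch_data_dict={}):
    intermediates = {'physics_loss': x ** 2, 'projection_loss': 2.0 * x}
    for key in fetch_data_dict:
        fetch_data_dict[key] = intermediates[key]
    return intermediates['physics_loss'] + intermediates['projection_loss']


wanted = {'physics_loss': None}  # ask only for the physics term
total = toy_loss_with_side_outputs(3.0, fetch_data_dict=wanted)
assert wanted['physics_loss'] == 9.0 and total == 15.0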
Example #21
    def __init__(self, sess, summary_name, enable=True, summary_dir=None):
        # the interface we need
        self.summary = None
        self.sess = sess
        self.enable = enable
        if not self.enable:  # the summary handler is disabled
            return
        if summary_dir is None:
            self.path = os.path.join(
                init_path.get_base_dir(), 'summary'
            )
        else:
            self.path = os.path.join(summary_dir, 'summary')
        self.path = os.path.abspath(self.path)

        if not os.path.exists(self.path):
            os.makedirs(self.path)
        self.path = os.path.join(self.path, summary_name)

        self.train_writer = tf.summary.FileWriter(self.path, self.sess.graph)

        logger.info(
            'summary writer initialized, writing to {}'.format(self.path))
Example #22
    def _build_models(self):
        self._build_session()
        self._network = {'policy': [], 'dynamics': [], 'reward': []}
        self._num_model_ensemble = {
            'policy': max(1, self.args.num_policy_ensemble),
            'dynamics': max(1, self.args.num_dynamics_ensemble),
            'reward': max(1, self.args.num_reward_ensemble),
        }

        for key in ['policy', 'dynamics', 'reward']:
            for i_model in range(self._num_model_ensemble[key]):
                name_scope = self._name_scope + '_' + key + '_' + str(i_model)
                self._network[key].append(self._network_type[key](
                    self.args, self._session, name_scope,
                    self._observation_size, self._action_size))
                with tf.variable_scope(name_scope):
                    self._network[key][-1].build_network()
                    self._network[key][-1].build_loss()

                logger.info('Trainer maintains [{}] {} network'.format(
                    self._num_model_ensemble[key], key))
        # init the weights
        self._session.run(tf.global_variables_initializer())
Example #23
def model_load_from_list(sess,
                         model_path,
                         tf_var_list=[],
                         numpy_var_list={},
                         target_scope_switch='trpo_agent_policy'):
    '''
        @brief:
            if the var list is given, we just load them
        @input:
            @target_scope_switch:
    '''
    if not model_path.endswith('.npy'):
        model_path = model_path + '.npy'
        logger.warning('[checkpoint] adding the ".npy" to the path name')
    logger.info('[checkpoint] loading checkpoint from {}'.format(model_path))

    output_save_list = np.load(model_path, encoding='latin1').item()
    tf_name_list = [var.name for var in tf_var_list]
    numpy_name_list = [key for key, val in numpy_var_list.items()]

    # get the weights one by one
    for name, val in output_save_list.items():
        name = name.replace('trpo_agent_policy', target_scope_switch)
        if name not in tf_name_list and name not in numpy_var_list:
            logger.info('**** Parameters Not Exist **** {}'.format(name))
            continue
        elif name in tf_name_list:
            logger.info('loading TF pretrained parameters {}'.format(name))
            tf_name_list.remove(name)  # just for sanity check

            # pick up the variable that has the name
            var = [var for var in tf_var_list if var.name == name][0]
            assign_op = var.assign(val)
            sess.run(assign_op)  # or `assign_op.op.run()`
        else:
            logger.info('loading numpy pretrained parameters {}'.format(name))
            numpy_name_list.remove(name)  # just for sanity check

            # pick up the variable that has the name
            numpy_var_list[name] = val

    if len(tf_name_list) > 0 or len(numpy_name_list) > 0:
        logger.warning(
            'Some parameters are not loaded from the checkpoint: {}\n {}'.format(
                tf_name_list, numpy_name_list))
    return numpy_var_list
Example #24
def log_results(results, timer_dict, start_timesteps=0):
    logger.info("-" * 15 + " Iteration %d " % results['iteration'] + "-" * 15)

    for i_id in range(len(timer_dict) - 1):
        start_key, end_key = list(timer_dict.keys())[i_id: i_id + 2]
        time_elapsed = (timer_dict[end_key] - timer_dict[start_key]) / 60.0

        logger.info("Time elapsed for [{}] is ".format(end_key) +
                    "%.4f mins" % time_elapsed)

    logger.info("{} total steps have happened".format(results['totalsteps']))

    # the stats
    from tensorboard_logger import log_value
    for key in results['stats']:
        logger.info("[{}]: {}".format(key, results['stats'][key]))
        if results['stats'][key] is not None:
            log_value(key, results['stats'][key], start_timesteps +
                      results['totalsteps'])
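
# log_results walks consecutive pairs of entries in timer_dict, so the caller
# is expected to pass an insertion-ordered OrderedDict of checkpoint timestamps
# (as the training loops above do). A toy illustration of the same pairwise
# walk; the stage names are illustrative.
import time
from collections import OrderedDict

timer_dict = OrderedDict()
timer_dict['Program Start'] = time.time()
time.sleep(0.01)                      # ... collect rollouts ...
timer_dict['Generate Rollout'] = time.time()
time.sleep(0.01)                      # ... train the networks ...
timer_dict['Train Weights'] = time.time()

keys = list(timer_dict.keys())
for start_key, end_key in zip(keys[:-1], keys[1:]):
    elapsed_min = (timer_dict[end_key] - timer_dict[start_key]) / 60.0
    print('Time elapsed for [{}] is {:.4f} mins'.format(end_key, elapsed_min))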
Example #25
    def _get_fd_gradient(self, sol):
        """ @brief: use finite_difference to calculate the gradient. Due to the
            locality of the solution space, we can use some small trick to speed
            up the gradient process
        """

        gradient = np.zeros([1, len(sol)])
        epsilon = 1e-3  # used for finite difference

        # get the base values, and the base interpolation values:
        center_data_dict = {'physics_loss': None, 'projection_loss': None}
        center_loss = self._loss_function(sol, fetch_data_dict=center_data_dict)
        sol_qpos = np.reshape(sol[self._var_to_sol_id['qpos']],
                              [-1, self._len_qpos])
        camera_state = sol[self._var_to_sol_id['camera_state']]

        if 'qpos' in self._opt_var_list:

            logger.info('Calculating the gradient of qpos')
            # utilize the local connectivity of the qpos
            # locate the id of the qpos
            for i_derivative in range(self._num_sol_qpos * self._len_qpos):
                sol_id = i_derivative + self._var_to_sol_id['qpos'][0]
                center_sol_qpos_id = i_derivative // self._len_qpos
                start_sol_qpos_id = max(center_sol_qpos_id - 3, 0)
                end_sol_qpos_id = min(center_sol_qpos_id + 3,
                                      self._num_sol_qpos - 1)

                # get everything within the range of [start_sol_qpos_id,
                # end_sol_qpos_id], take the forward finite difference step
                forward_sol_qpos = np.array(
                    sol_qpos[start_sol_qpos_id: end_sol_qpos_id + 1], copy=True
                )
                forward_sol_qpos[center_sol_qpos_id - start_sol_qpos_id,
                                 i_derivative % self._len_qpos] += epsilon

                forward_loss, forward_data_dict = \
                    self._loss_from_sol_qpos_camera_state(
                        forward_sol_qpos, camera_state,
                        center_sol_qpos_id=center_sol_qpos_id
                    )

                center_physics_loss = center_data_dict['physics_loss'][
                    start_sol_qpos_id * self._sol_qpos_freq:
                    end_sol_qpos_id * self._sol_qpos_freq
                ]
                center_projection_loss = center_data_dict['projection_loss'][
                    start_sol_qpos_id * self._sol_qpos_freq:
                    end_sol_qpos_id * self._sol_qpos_freq + 1
                ]

                # make sure the ids are matched
                assert len(forward_data_dict['physics_loss']) == \
                    len(center_physics_loss) and \
                    len(forward_data_dict['projection_loss']) == \
                    len(center_projection_loss)

                difference_of_loss = forward_loss - \
                    np.mean(center_physics_loss) - \
                    np.mean(center_projection_loss)

                gradient[0, sol_id] = difference_of_loss

        for opt_var in ['xyz_pos', 'cam_view', 'fov', 'image_size']:
            if opt_var not in self._opt_var_list:
                continue
            logger.info('Calculating the gradient of {}'.format(opt_var))

            # TODO: for xyz_pos / fov / image_size, there is speed-up available
            for i_derivative in range(len(self._var_to_sol_id[opt_var])):

                sol_id = i_derivative + self._var_to_sol_id[opt_var][0]
                camera_state_id = sol_id - len(self._var_to_sol_id['qpos'])
                forward_camera_state = np.array(camera_state, copy=True)
                if opt_var == 'cam_view':
                    # for quaternion, take care of the length invariance
                    quat_id = self._var_to_sol_id['quaternion']
                    raise NotImplementedError
                    forward_camera_state[camera_state_id] += \
                        epsilon * np.linalg.norm(sol[quat_id])
                else:
                    forward_camera_state[camera_state_id] += epsilon

                forward_loss, _ = self._loss_from_sol_qpos_camera_state(
                    sol_qpos, forward_camera_state
                )
                gradient[0, sol_id] = forward_loss - center_loss

        if len(self._opt_var_list) == 0:
            raise ValueError('At least one of the variables needs to be optimized')
        logger.info('Gradient calculated')

        return gradient
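
# For reference, the generic forward finite-difference gradient that the method
# above specializes (the repo version exploits locality so it does not have to
# re-evaluate the full loss for every coordinate, and it stores raw loss
# differences rather than differences divided by epsilon). This sketch is not
# the repo's code.
import numpy as np


def fd_gradient(loss_fn, sol, epsilon=1e-3):
    """Forward finite differences: g_i ~ (f(x + eps * e_i) - f(x)) / eps."""
    center_loss = loss_fn(sol)
    gradient = np.zeros_like(sol)
    for i in range(len(sol)):
        forward_sol = np.array(sol, copy=True)
        forward_sol[i] += epsilon
        gradient[i] = (loss_fn(forward_sol) - center_loss) / epsilon
    return gradient


# quick check on f(x) = ||x||^2, whose gradient is 2x
x = np.array([1.0, -2.0, 0.5])
assert np.allclose(fd_gradient(lambda v: np.sum(v ** 2), x), 2 * x, atol=1e-2)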
Example #26
    def run(self):
        self._build_model()

        while True:
            next_task = self._task_queue.get(block=True)

            if next_task[0] == parallel_util.WORKER_PLANNING:
                # collect rollouts
                plan = self._plan(next_task[1])
                self._task_queue.task_done()
                self._result_queue.put(plan)

            elif next_task[0] == parallel_util.WORKER_PLAYING:
                # collect rollouts
                traj_episode = self._play(next_task[1])
                self._task_queue.task_done()
                self._result_queue.put(traj_episode)

            elif next_task[0] == parallel_util.WORKER_RATE_ACTIONS:
                # predict reward of a sequence of action
                reward = self._rate_action(next_task[1])
                self._task_queue.task_done()
                self._result_queue.put(reward)

            elif next_task[0] == parallel_util.WORKER_GET_MODEL:
                # collect the gradients
                data_id = next_task[1]['data_id']

                if next_task[1]['type'] == 'dynamics_derivative':
                    model_data = self._dynamics_derivative(
                        next_task[1]['data_dict'], next_task[1]['target'])
                elif next_task[1]['type'] == 'reward_derivative':
                    model_data = self._reward_derivative(
                        next_task[1]['data_dict'], next_task[1]['target'])
                elif next_task[1]['type'] == 'forward_model':
                    # get the next state
                    model_data = self._dynamics(next_task[1]['data_dict'])
                    model_data.update(self._reward(next_task[1]['data_dict']))
                    if next_task[1]['end_of_traj']:
                        # get the reward for the final state
                        model_data['end_reward'] = self._reward({
                            'start_state':
                            model_data['end_state'],
                            'action':
                            next_task[1]['data_dict']['action'] * 0.0
                        })['reward']
                else:
                    assert False

                self._task_queue.task_done()
                self._result_queue.put({
                    'data': model_data,
                    'data_id': data_id
                })

            elif next_task[0] == parallel_util.AGENT_SET_WEIGHTS:
                # set parameters of the actor policy
                self._set_weights(next_task[1])
                time.sleep(0.001)  # yield the process
                self._task_queue.task_done()

            elif next_task[0] == parallel_util.END_ROLLOUT_SIGNAL or \
                    next_task[0] == parallel_util.END_SIGNAL:
                # kill all the thread
                logger.info("kill message for worker {}".format(
                    self._worker_id))
                # logger.info("kill message for worker")
                self._task_queue.task_done()
                break
            else:
                logger.error('Invalid task type {}'.format(next_task[0]))
        return
Example #27
def visualize_sol_pose(physics_engine, output_dir, data_dict, env_name,
                       iteration, sub_iter):
    """ @brief: visualize the following four images

        0. the rendered image from dm_control
        1. the image using the qpos + camera_state (trained)
        2. the image using the qpos + gt_camera_state
        3. the image using the (gt_qpos + gt_camera_state)
    """
    logger.info("generating the visualization")
    image_size = int(data_dict['gt']['camera_state'][-1])
    assert image_size == int(data_dict['gt']['camera_state'][-1])  # TODO

    # from camera and qposes to the 2d poses
    for qpos_key, camera_state_key in \
            [['gt', 'gt'], ['sol', 'sol'], ['sol', 'gt']]:
        pose_2d_key = qpos_key + "-" + camera_state_key
        data_dict[pose_2d_key] = {}  # save pose_2d

        is_trackcom = data_dict[camera_state_key]['mode'] == 'trackcom'
        pose_3d, center_of_mass = physics_engine.get_pose3d(
            data_dict[qpos_key]['qpos'], get_center_of_mass=is_trackcom)
        matrix = physics_engine.camera_matrix_from_state(
            data_dict[camera_state_key]['camera_state'], center_of_mass)
        data_dict[pose_2d_key]['pose_2d'] = \
            physics_engine.get_projected_2dpose(pose_3d, matrix)
        data_dict[pose_2d_key]['image_size'] = \
            data_dict[camera_state_key]['camera_state'][-1]

    pos_connection = POS_CONNECTION[env_name]

    # the output directory
    directory = os.path.join(output_dir, "video")
    if not os.path.exists(directory):
        os.mkdir(directory)
    output_dir = os.path.join(
        directory,
        "pos_Iter_" + str(iteration) + '_sub_' + str(sub_iter) + '.mp4')
    video = cv2.VideoWriter(
        os.path.join(init_path.get_abs_base_dir(), output_dir),
        cv2.VideoWriter_fourcc(*'mp4v'), 40, (image_size * 4, image_size))

    for i_pos_id in range(len(data_dict['gt']['qpos'])):
        # render the image using the default renderer
        render_image = physics_engine._env.render(
            camera_id=0, qpos=data_dict['gt']['qpos'][i_pos_id])

        # the sol_qpos + sol_camera_state
        sol_sol_image = draw_pose3d(render_image * 0.0,
                                    data_dict['sol-sol']['pose_2d'][i_pos_id],
                                    pos_connection)
        # the sol_qpos + gt_camera_state
        sol_gt_image = draw_pose3d(render_image * 0.0,
                                   data_dict['sol-gt']['pose_2d'][i_pos_id],
                                   pos_connection)
        # the gt_qpos + gt_camera_state
        gt_gt_image = draw_pose3d(render_image * 0.0,
                                  data_dict['gt-gt']['pose_2d'][i_pos_id],
                                  pos_connection)

        image = \
            np.hstack([render_image, sol_sol_image, sol_gt_image, gt_gt_image])
        # import pdb; pdb.set_trace()

        print('Processing %d out of %d' %
              (i_pos_id, len(data_dict['gt']['qpos'])))
        video.write(np.array(image[:, :, [2, 1, 0]], dtype=np.uint8))

    video.release()
Example #28
    def train(self, data_dict, replay_buffer, training_info={}):
        # make sure the needed data is ready
        assert 'plan_data' in training_info
        self._plan_data = training_info['plan_data']
        self._set_whitening_var(data_dict['whitening_stats'])

        # step 1: get the target action mean and target precision matrix
        '''
        assert len(self._plan_data) == self._num_traj and \
            len(self._plan_data[0]['new_u']) == self._traj_depth
        num_data = len(self._plan_data) * len(self._plan_data[0]['u'])
        '''
        '''
        target_mu = np.zeros([num_data, self._action_size])
        target_precision = np.ones([num_data, self._action_size,
                                    self._action_size])
        '''
        training_data, num_data = self._get_training_dataset(data_dict)

        # step 2: train the mean of the action
        if num_data < self.args.policy_sub_batch_size:
            logger.warning("Not enough data!")
            return {}

        batch_per_epoch = num_data // self.args.policy_sub_batch_size
        feed_dict = {
            self._input_ph['target_action_mu']: training_data['target_mu'],
            self._input_ph['target_precision']:
            training_data['target_precision'],
            self._input_ph['start_state']: training_data['start_state']
        }
        for i_iteration in range(self.args.policy_epochs):
            data_id = list(range(num_data))
            self._npr.shuffle(data_id)
            avg_loss = []

            for i_batch in range(batch_per_epoch):
                batch_idx = data_id[i_batch *
                                    self.args.policy_sub_batch_size:(i_batch +
                                                                     1) *
                                    self.args.policy_sub_batch_size]
                sub_feed_dict = {
                    key: feed_dict[key][batch_idx]
                    for key in feed_dict
                }

                fetch_dict = {
                    'update_op': self._update_operator['update_op'],
                    'loss': self._update_operator['loss']
                }
                training_stat = self._session.run(fetch_dict, sub_feed_dict)
                avg_loss.append(training_stat['loss'])
                '''
                for i_ in range(10000):
                    fetch_dict['pred_act'] = self._tensor['action_dist_mu']
                    training_stat = self._session.run(fetch_dict, sub_feed_dict)
                    if i_ % 10 == 0:
                        import matplotlib.pyplot as plt
                        print training_stat
                        ga = sub_feed_dict[self._input_ph['target_action_mu']].flatten()
                        plt.plot(ga, label='target')
                        pa = training_stat['pred_act'].flatten()
                        plt.plot(pa, label='pred')
                        plt.legend()
                        plt.show()
                        from util.common.fpdb import fpdb; fpdb().set_trace()
                '''
            logger.info('GPS policy loss {}'.format(np.mean(avg_loss)))

        # the covariance of the controller
        self._policy_cov_data['inv_cov'] = \
            np.mean(training_data['target_precision'], 0) + \
            self.args.gps_policy_cov_damping * \
            np.ones([self._action_size, self._action_size])
        # self._policy_cov_data['precision'] = \
        # np.diag(self._policy_cov_data['inv_cov'])
        # self._policy_cov_data['cov'] = \
        # np.diag(1.0 / self._policy_cov_data['precision'])
        self._policy_cov_data['var'] = \
            1 / np.diag(self._policy_cov_data['inv_cov'])  # vec
        self._policy_cov_data['sig'] = \
            np.diag(self._policy_cov_data['var'])  # matrix
        self._policy_cov_data['chol_pol_covar'] = \
            np.diag(np.sqrt(self._policy_cov_data['var']))  # matrix
        self._policy_cov_data['flat_cov_L'][:] = \
            np.diag(self._policy_cov_data['chol_pol_covar'])  # vec

        return training_stat
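
# The covariance bookkeeping at the end treats the averaged precision matrix as
# (approximately) diagonal: the variance vector is the elementwise inverse of
# the precision diagonal, and the Cholesky factor of a diagonal covariance is
# just the diagonal matrix of standard deviations. A small numpy sanity check
# of those identities with toy numbers:
import numpy as np

inv_cov = np.diag([4.0, 1.0, 0.25])   # a diagonal precision matrix
var = 1.0 / np.diag(inv_cov)          # per-dimension variance
sig = np.diag(var)                    # covariance matrix
chol = np.diag(np.sqrt(var))          # its Cholesky factor

assert np.allclose(np.dot(chol, chol.T), sig)
assert np.allclose(np.dot(sig, inv_cov), np.eye(3))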
Example #29
    if args.gt_reward:
        from mbbl.network.reward.groundtruth_reward import reward_network
    else:
        from mbbl.network.reward.deterministic_reward import reward_network

    mb_timesteps, policy_weight = train_mb(mbmf_trainer, mbmf_sampler,
                                           mbmf_worker, dynamics_network,
                                           mbmf_policy_network, reward_network,
                                           args)
    tf.reset_default_graph()
    print('==================TRPO starts at==================')

    # Manually set the batch size to 50K.
    # args.timesteps_per_batch = 50000
    # args.policy_batch_size = 50000
    logger.info("batch size for trpo is {}".format(args.timesteps_per_batch))

    from mbbl.sampler import singletask_sampler
    from mbbl.worker import mf_worker
    # from mbbl.network.policy.trpo_policy import policy_network
    import mbbl.network.policy.trpo_policy
    import mbbl.network.policy.ppo_policy

    policy_network = {
        'ppo': mbbl.network.policy.ppo_policy.policy_network,
        'trpo': mbbl.network.policy.trpo_policy.policy_network
    }[args.trust_region_method]

    # here the dynamics and reward are simply placeholders, which cannot be
    # called to pred next state or reward
    from mbbl.network.dynamics.base_dynamics import base_dynamics_network
Example #30
def train_mb(trainer, sampler, worker, dynamics, policy, reward, args=None):
    logger.info('Training starts at {}'.format(init_path.get_abs_base_dir()))
    network_type = {'policy': policy, 'dynamics': dynamics, 'reward': reward}

    # make the trainer and sampler
    sampler_agent = make_sampler(sampler, worker, network_type, args)
    trainer_tasks, trainer_results, trainer_agent, init_weights = \
        make_trainer(trainer, network_type, args)
    sampler_agent.set_weights(init_weights)

    timer_dict = OrderedDict()
    timer_dict['Program Start'] = time.time()
    totalsteps = 0
    current_iteration = 0
    init_data = {}

    # Start mb training.
    while True:
        timer_dict['** Program Total Time **'] = time.time()

        # step 1: collect rollout data
        if current_iteration == 0 and args.random_timesteps > 0 and \
                (not (args.gt_dynamics and args.gt_reward)):
            # we could first generate random rollout data for exploration
            logger.info(
                'Generating {} random timesteps'.format(args.random_timesteps)
            )
            rollout_data = sampler_agent.rollouts_using_worker_planning(
                args.random_timesteps, use_random_action=True
            )
        else:
            rollout_data = sampler_agent.rollouts_using_worker_planning()

        timer_dict['Generate Rollout'] = time.time()

        # step 2: train the weights for dynamics and policy network
        training_info = {'network_to_train': ['dynamics', 'reward']}
        trainer_tasks.put(
            (parallel_util.TRAIN_SIGNAL,
             {'data': rollout_data['data'], 'training_info': training_info})
        )
        trainer_tasks.join()
        training_return = trainer_results.get()
        timer_dict['Train Weights'] = time.time()

        # step 3: update the weights
        sampler_agent.set_weights(training_return['network_weights'])
        timer_dict['Assign Weights'] = time.time()

        # log and print the results
        log_results(training_return, timer_dict)

        for key in rollout_data.keys():
            if key not in init_data.keys():
                init_data[key] = []
            init_data[key].extend(rollout_data[key])

        # Add noise to the initial data to encourage trpo to explore.
        import numpy as np
        for i_rollout in init_data['data']:
            action = i_rollout['actions']
            i_rollout['actions'] += np.random.normal(scale=0.005,
                                                     size=action.shape)
        if totalsteps > args.max_timesteps or \
                training_return['replay_buffer'].get_current_size() > \
                args.mb_timesteps:
            break
        else:
            current_iteration += 1
    totalsteps = training_return['totalsteps']

    # Initialize the policy network
    training_info = {'network_to_train': ['reward', 'policy']}
    trainer_tasks.put(
        (parallel_util.MBMF_INITIAL,
         {'data': init_data['data'], 'training_info': training_info})
    )
    trainer_tasks.join()
    training_return = trainer_results.get()
    timer_dict['Train Weights'] = time.time()

    # Start dagger iteration.
    for dagger_i in range(args.dagger_iter):
        print('=================Doing dagger iteration {}=================='.
              format(dagger_i))
        # Collect on policy rollout.
        rollout_data = sampler_agent.rollouts_using_worker_playing(
            num_timesteps=args.dagger_timesteps_per_iter,
            use_true_env=True)
        sampler_agent.dagger_rollouts(rollout_data['data'])
        init_data['data'] += rollout_data['data']
        trainer_tasks.put(
            (parallel_util.MBMF_INITIAL,
             {'data': init_data['data'], 'training_info': training_info})
        )
        trainer_tasks.join()
        training_return = trainer_results.get()

    trainer_tasks.put((parallel_util.GET_POLICY_WEIGHT, None))
    trainer_tasks.join()
    policy_weight = trainer_results.get()

    # end of training
    sampler_agent.end()
    trainer_tasks.put((parallel_util.END_SIGNAL, None))
    return totalsteps, policy_weight