Example #1
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str, help="name of gym env")
    parser.add_argument('dataset_path',
                        type=str,
                        help="path of training and validation dataset")
    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--tfmodel_path', type=str, default='/tmp/tfmodels')
    parser.add_argument('--restore', action='store_true')
    # Training parameters
    parser.add_argument('--val_ratio',
                        type=float,
                        default=0.05,
                        help="ratio of validation sets")
    parser.add_argument('--num_itr', type=int, default=10000000)
    parser.add_argument('--val_freq', type=int, default=200)
    parser.add_argument('--log_freq', type=int, default=50)
    parser.add_argument('--save_freq', type=int, default=5000)

    # ICM parameters
    parser.add_argument('--init_lr', type=float, default=1e-4)
    parser.add_argument('--forward_weight',
                        type=float,
                        default=0.8,
                        help="the ratio of forward loss vs inverse loss")
    parser.add_argument('--cos_forward',
                        action='store_true',
                        help="whether to use cosine forward loss")

    args = parser.parse_args()

    # Get dataset
    dataset_names = list(
        map(lambda file_name: osp.join(args.dataset_path, file_name),
            listdir(args.dataset_path)))
    val_set_names = dataset_names[:int(len(dataset_names) * args.val_ratio)]
    train_set_names = dataset_names[int(len(dataset_names) * args.val_ratio):]

    train_queue = tf.train.string_input_producer(train_set_names,
                                                 num_epochs=None)
    val_queue = tf.train.string_input_producer(val_set_names, num_epochs=None)

    obs_shape = OBS_SHAPE_MAP[args.env_name]
    action_dim = ACTION_DIM_MAP[args.env_name]
    train_obs, train_next_obs, train_action, train_state, train_next_state = inputs(
        train_set_names, obs_shape, train=True)
    val_obs, val_next_obs, val_action, val_state, val_next_state = inputs(
        val_set_names, obs_shape, train=False)

    # Restoring from a saved snapshot is not yet implemented.
    if args.restore:
        raise NotImplementedError
        models_dict = joblib.load(args.tfmodel_path)
        _encoder = models_dict['encoder']
        _inverse_model = models_dict['inverse_model']
        _forward_model = models_dict['forward_model']
    else:
        _encoder = ConvEncoder(
            feature_dim=256,
            input_shape=obs_shape,
            conv_filters=(64, 64, 64, 64, 32),
            conv_filter_sizes=((5, 5), (5, 5), (5, 5), (5, 5), (3, 3)),
            conv_strides=(2, 2, 2, 2, 2),
            conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'),
            hidden_sizes=(256, ),
            hidden_activation=tf.nn.elu,
        )
        _state_encoder = FullyConnectedEncoder(
            200,
            observation_dim=8,
            hidden_sizes=(256, ),
            hidden_activation=tf.nn.elu,
        )
        #TODO: add one more encoder before inverse model?
        _inverse_model = InverseModel(
            feature_dim=256 + 200,
            action_dim=action_dim,
            hidden_sizes=(256, ),
            hidden_activation=tf.nn.elu,
            output_activation=tf.nn.tanh,
        )
        _forward_model = ForwardModel(
            feature_dim=256 + 200,
            action_dim=action_dim,
            hidden_sizes=(256, ),
            hidden_activation=tf.nn.elu,
        )

    sess = tf.Session()
    _encoder.sess = sess
    _inverse_model.sess = sess
    _forward_model.sess = sess
    _state_encoder.sess = sess

    with sess.as_default():
        # Initialize variables for get_copy to work
        sess.run(tf.initialize_all_variables())

        train_encoder1 = _encoder.get_weight_tied_copy(
            observation_input=train_obs)
        train_encoder2 = _encoder.get_weight_tied_copy(
            observation_input=train_next_obs)
        train_state_encoder1 = _state_encoder.get_weight_tied_copy(
            observation_input=train_state)
        train_state_encoder2 = _state_encoder.get_weight_tied_copy(
            observation_input=train_next_state)
        train_feature1 = tf.concat(
            1, [train_encoder1.output, train_state_encoder1.output])
        train_feature2 = tf.concat(
            1, [train_encoder2.output, train_state_encoder2.output])
        train_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=train_feature1, feature_input2=train_feature2)
        train_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=train_feature1, action_input=train_action)

        val_encoder1 = _encoder.get_weight_tied_copy(observation_input=val_obs)
        val_encoder2 = _encoder.get_weight_tied_copy(
            observation_input=val_next_obs)
        val_state_encoder1 = _state_encoder.get_weight_tied_copy(
            observation_input=val_state)
        val_state_encoder2 = _state_encoder.get_weight_tied_copy(
            observation_input=val_next_state)
        val_feature1 = tf.concat(
            1, [val_encoder1.output, val_state_encoder1.output])
        val_feature2 = tf.concat(
            1, [val_encoder2.output, val_state_encoder2.output])
        val_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=val_feature1, feature_input2=val_feature2)
        val_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=val_feature1, action_input=val_action)

        if args.cos_forward:
            train_forward_loss = cos_loss(train_feature2,
                                          train_forward_model.output)
            val_forward_loss = cos_loss(val_feature2, val_forward_model.output)
        else:
            train_forward_loss = tf.reduce_mean(
                tf.square(train_feature2 - train_forward_model.output))
            val_forward_loss = tf.reduce_mean(
                tf.square(val_feature2 - val_forward_model.output))

        train_inverse_losses = tf.reduce_mean(
            tf.square(train_action - train_inverse_model.output), axis=0)
        val_inverse_losses = tf.reduce_mean(
            tf.square(val_action - val_inverse_model.output), axis=0)
        train_inverse_separate_summ = []
        val_inverse_separate_summ = []
        for joint_idx in range(action_dim):
            train_inverse_separate_summ.append(
                tf.summary.scalar(
                    "train/icm_inverse_loss/joint_{}".format(joint_idx),
                    train_inverse_losses[joint_idx]))
            val_inverse_separate_summ.append(
                tf.summary.scalar(
                    "val/icm_inverse_loss/joint_{}".format(joint_idx),
                    val_inverse_losses[joint_idx]))

        train_inverse_loss = tf.reduce_mean(train_inverse_losses)
        val_inverse_loss = tf.reduce_mean(val_inverse_losses)

        train_total_loss = args.forward_weight * train_forward_loss + (
            1. - args.forward_weight) * train_inverse_loss
        val_total_loss = args.forward_weight * val_forward_loss + (
            1. - args.forward_weight) * val_inverse_loss
        icm_opt = tf.train.AdamOptimizer(
            args.init_lr).minimize(train_total_loss)

        # Setup summaries
        summary_writer = tf.summary.FileWriter(args.tfboard_path,
                                               graph=tf.get_default_graph())

        train_inverse_loss_summ = tf.summary.scalar(
            "train/icm_inverse_loss/total_mean", train_inverse_loss)
        train_forward_loss_summ = tf.summary.scalar("train/icm_forward_loss",
                                                    train_forward_loss)
        train_total_loss_summ = tf.summary.scalar("train/icm_total_loss",
                                                  train_total_loss)
        val_inverse_loss_summ = tf.summary.scalar(
            "val/icm_inverse_loss/total_mean", val_inverse_loss)
        val_forward_loss_summ = tf.summary.scalar("val/icm_forward_loss",
                                                  val_forward_loss)
        val_total_loss_summ = tf.summary.scalar("val/icm_total_loss",
                                                val_total_loss)

        train_summary_op = tf.summary.merge([
            train_inverse_loss_summ, train_forward_loss_summ,
            train_total_loss_summ
        ] + train_inverse_separate_summ)
        val_summary_op = tf.summary.merge([
            val_inverse_loss_summ, val_forward_loss_summ, val_total_loss_summ
        ] + val_inverse_separate_summ)

        logger.log("Finished creating ICM model")

        sess.run(tf.initialize_all_variables())

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            for timestep in range(args.num_itr):
                if timestep % args.log_freq == 0:
                    logger.log("Start itr {}".format(timestep))
                    _, train_summary = sess.run([icm_opt, train_summary_op])
                    summary_writer.add_summary(train_summary, timestep)
                else:
                    sess.run(icm_opt)

                if timestep % args.save_freq == 0:
                    save_snapshot(_encoder, _inverse_model, _forward_model,
                                  _state_encoder, args.tfmodel_path)

                if timestep % args.val_freq == 0:
                    val_summary = sess.run(val_summary_op)
                    summary_writer.add_summary(val_summary, timestep)

        except KeyboardInterrupt:
            print("End training...")
            pass

        coord.join(threads)
        sess.close()
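
For reference, the objective minimized above blends an L2 forward-model loss with an L2 inverse-model loss via forward_weight. A minimal NumPy sketch of the same arithmetic (illustrative names, not the project's API):

import numpy as np

def icm_losses(feat2, pred_feat2, action, pred_action, forward_weight=0.8):
    # Forward loss: how well the forward model predicts the next feature.
    forward_loss = np.mean(np.square(feat2 - pred_feat2))
    # Inverse loss: how well the inverse model recovers the taken action.
    inverse_loss = np.mean(np.square(action - pred_action))
    # Convex blend, mirroring train_total_loss above.
    total_loss = forward_weight * forward_loss + (1.0 - forward_weight) * inverse_loss
    return forward_loss, inverse_loss, total_loss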
Example #2
    def __init__(self,
                 env,
                 trpo: TRPO,
                 tensorboard_path,
                 no_encoder=False,
                 feature_dim=10,
                 forward_weight=0.8,
                 external_reward_weight=0.01,
                 forward_cos=False,
                 init_learning_rate=1e-4,
                 icm_batch_size=128,
                 replay_pool_size=1000000,
                 min_pool_size=200,
                 n_updates_per_iter=10,
                 obs_dtype='float32',
                 normalize_input=False,
                 gpu_fraction=0.95,
                 pretrained_icm=False,
                 pretrained_icm_path=None,
                 freeze_icm=False,
                 **kwargs):
        """
		:param env: Environment
		:param algo: Algorithm that will be used with ICM
		:param encoder: State encoder that maps s to f
		:param inverse_model: Inverse dynamics model that maps (f1, f2) to actions
		:param forward_model: Forward dynamics model that maps (f1, a) to f2
		:param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss
		:param external_reward_weight: Weight that balances external reward and internal reward
		:param init_learning_rate: Initial learning rate of optimizer
		"""
        self.trpo = trpo
        self.freeze_icm = freeze_icm
        # Replace sampler to inject intrinsic reward
        self.trpo.sampler = self.get_sampler(self.trpo)
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        self.sess = tf.get_default_session() or tf.Session(
            config=tf.ConfigProto(gpu_options=gpu_options))
        self.external_reward_weight = external_reward_weight
        self.summary_writer = tf.summary.FileWriter(
            tensorboard_path, graph=tf.get_default_graph())
        self.n_updates_per_iter = n_updates_per_iter
        self.icm_batch_size = icm_batch_size
        self.act_space = env.action_space
        self.obs_space = env.observation_space

        self.pool = TRPOReplayPool(replay_pool_size,
                                   self.obs_space.flat_dim,
                                   self.act_space.flat_dim,
                                   obs_dtype=obs_dtype)

        self.min_pool_size = min_pool_size
        # Setup ICM models
        self.s1 = tf.placeholder(tf.float32,
                                 [None] + list(self.obs_space.shape))
        self.s2 = tf.placeholder(tf.float32,
                                 [None] + list(self.obs_space.shape))
        if normalize_input:
            s1 = self.s1 / 255.0 - 0.5
            s2 = self.s2 / 255.0 - 0.5
        else:
            s1 = self.s1
            s2 = self.s2

        self.asample = tf.placeholder(tf.float32,
                                      [None, self.act_space.flat_dim])
        self.external_rewards = tf.placeholder(tf.float32, (None, ))

        # Hack
        temp_vars = set(tf.all_variables())

        if pretrained_icm:
            with self.sess.as_default():
                icm_data = joblib.load(pretrained_icm_path)
                _encoder = icm_data['encoder']
                _forward_model = icm_data['forward_model']
                _inverse_model = icm_data['inverse_model']

            icm_vars = set(tf.all_variables()) - temp_vars
        else:
            icm_vars = set([])

        if pretrained_icm:
            self._encoder = _encoder
            # raise NotImplementedError("Currently only supports flat observation input!")
        else:
            if len(self.obs_space.shape) == 1:
                if no_encoder:
                    self._encoder = NoEncoder(self.obs_space.flat_dim,
                                              env_spec=env.spec)
                else:
                    self._encoder = FullyConnectedEncoder(feature_dim,
                                                          env_spec=env.spec)
            else:
                self._encoder = ConvEncoder(feature_dim,
                                            env.spec.observation_space.shape)

        self._encoder.sess = self.sess

        if not pretrained_icm:
            # Initialize variables for get_copy to work
            self.sess.run(tf.initialize_all_variables())

        with self.sess.as_default():
            self.encoder1 = self._encoder.get_weight_tied_copy(
                observation_input=s1)
            self.encoder2 = self._encoder.get_weight_tied_copy(
                observation_input=s2)

        if not pretrained_icm:
            self._inverse_model = InverseModel(feature_dim, env_spec=env.spec)
            self._forward_model = ForwardModel(feature_dim, env_spec=env.spec)
        else:
            self._inverse_model = _inverse_model
            self._forward_model = _forward_model

        self._inverse_model.sess = self.sess
        self._forward_model.sess = self.sess

        if not pretrained_icm:
            # Initialize variables for get_copy to work
            self.sess.run(tf.initialize_all_variables())

        # Clip actions so they are consistent with what the env actually receives
        clipped_asample = tf.clip_by_value(self.asample, -1.0, 1.0)

        with self.sess.as_default():
            self.inverse_model = self._inverse_model.get_weight_tied_copy(
                feature_input1=self.encoder1.output,
                feature_input2=self.encoder2.output)
            self.forward_model = self._forward_model.get_weight_tied_copy(
                feature_input=self.encoder1.output,
                action_input=clipped_asample)

        # Define losses, by default it uses L2 loss
        if forward_cos:
            self.forward_loss = cos_loss(self.encoder2.output,
                                         self.forward_model.output)
        else:
            self.forward_loss = tf.reduce_mean(
                tf.square(self.encoder2.output - self.forward_model.output))
        if isinstance(self.act_space, Box):
            self.inverse_loss = tf.reduce_mean(
                tf.square(clipped_asample - self.inverse_model.output))
        elif isinstance(self.act_space, Discrete):
            # TODO: Implement softmax loss
            raise NotImplementedError
        else:
            raise NotImplementedError

        if forward_cos:
            self.internal_rewards = cos_loss(self.encoder2.output,
                                             self.forward_model.output,
                                             mean=False)
        else:
            self.internal_rewards = tf.reduce_sum(
                tf.square(self.encoder2.output - self.forward_model.output),
                axis=1)
        self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards)
        self.mean_external_rewards = tf.reduce_mean(self.external_rewards)

        self.total_loss = forward_weight * self.forward_loss + \
            (1. - forward_weight) * self.inverse_loss
        self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\
            minimize(self.total_loss)

        # Setup summaries
        inverse_loss_summ = tf.summary.scalar("icm_inverse_loss",
                                              self.inverse_loss)
        forward_loss_summ = tf.summary.scalar("icm_forward_loss",
                                              self.forward_loss)
        total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss)
        internal_rewards = tf.summary.scalar("mean_internal_rewards",
                                             self.mean_internal_rewards)
        external_rewards = tf.summary.scalar("mean_external_rewards",
                                             self.mean_external_rewards)
        # Setup env_info logs
        var_summ = []

        self.summary = tf.summary.merge([
            inverse_loss_summ, forward_loss_summ, total_loss_summ,
            internal_rewards, external_rewards
        ])
        # self.summary = tf.summary.merge([inverse_loss_summ, forward_loss_summ, total_loss_summ] + var_summ)

        ## Initialize uninitialized variables
        self.sess.run(
            tf.initialize_variables(set(tf.all_variables()) - icm_vars))
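
The per-sample internal_rewards defined above (summed squared forward-model error) are later blended with the environment reward using external_reward_weight. A self-contained NumPy sketch of that blending, with illustrative names rather than the project's API:

import numpy as np

def curiosity_rewards(feat2, pred_feat2, external_rewards, external_reward_weight=0.01):
    # Curiosity bonus per transition: squared forward prediction error, summed over feature dims.
    internal_rewards = np.sum(np.square(feat2 - pred_feat2), axis=1)
    # Small weight on the external reward, large weight on the intrinsic one.
    return (external_reward_weight * external_rewards
            + (1.0 - external_reward_weight) * internal_rewards)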
Example #3
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str, help="name of gym env")
    parser.add_argument('dataset_path',
                        type=str,
                        help="path of training and validation dataset")
    parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('--tfmodel_path', type=str, default='/tmp/tfmodels')
    # Training parameters
    parser.add_argument('--val_ratio',
                        type=float,
                        default=0.1,
                        help="ratio of validation sets")
    parser.add_argument('--num_itr', type=int, default=10000000)
    parser.add_argument('--val_freq', type=int, default=1000)
    parser.add_argument('--log_freq', type=int, default=200)
    parser.add_argument('--save_freq', type=int, default=5000)

    # ICM parameters
    parser.add_argument('--init_lr', type=float, default=1e-4)
    parser.add_argument('--forward_weight',
                        type=float,
                        default=0.8,
                        help="the ratio of forward loss vs inverse loss")
    parser.add_argument('--cos_forward',
                        action='store_true',
                        help="whether to use cosine forward loss")
    # parser.add_argument('--norm_input', action='store_true',
    # 					help="whether to normalize observation input")

    args = parser.parse_args()

    env = TfEnv(
        normalize(env=GymEnv(args.env_name,
                             record_video=False,
                             log_dir='/tmp/gym_test',
                             record_log=False)))

    # Get dataset
    dataset_names = list(
        map(lambda file_name: osp.join(args.dataset_path, file_name),
            listdir(args.dataset_path)))
    val_set_names = dataset_names[:int(len(dataset_names) * args.val_ratio)]
    train_set_names = dataset_names[int(len(dataset_names) * args.val_ratio):]

    train_queue = tf.train.string_input_producer(train_set_names,
                                                 num_epochs=None)
    val_queue = tf.train.string_input_producer(val_set_names, num_epochs=None)

    train_obs, train_next_obs, train_action = read_and_decode(
        train_queue, env.observation_space.shape, env.action_space.shape)
    val_obs, val_next_obs, val_action = read_and_decode(
        val_queue, env.observation_space.shape, env.action_space.shape)

    # Build ICM model
    # if args.norm_input:
    # 	train_obs = train_obs * (1./255) - 0.5
    # 	train_next_obs = train_next_obs *(1./255) - 0.5
    # 	val_obs = val_obs * (1./255) - 0.5
    # 	val_next_obs = val_next_obs * (1./255) - 0.5
    # 	train_obs = tf.cast(train_obs, tf.float32) / 255.0 - 0.5
    # 	train_next_obs = tf.cast(train_next_obs, tf.float32) / 255.0 - 0.5
    # 	val_obs = tf.cast(val_obs, tf.float32) / 255.0 - 0.5
    # 	val_next_obs = tf.cast(val_next_obs, tf.float32) / 255.0 - 0.5
    # else:
    # 	train_obs = tf.cast(train_obs, tf.float32)
    # 	train_next_obs = tf.cast(train_next_obs, tf.float32)
    # 	val_obs = tf.cast(val_obs, tf.float32)
    # 	val_next_obs = tf.cast(val_next_obs, tf.float32)

    _encoder = ConvEncoder(
        feature_dim=256,
        input_shape=env.observation_space.shape,
        conv_filters=(64, 64, 64, 32),
        conv_filter_sizes=((5, 5), (5, 5), (5, 5), (3, 3)),
        conv_strides=(3, 2, 2, 2),
        conv_pads=('SAME', 'SAME', 'SAME', 'SAME'),
        hidden_sizes=(256, ),
        hidden_activation=tf.nn.elu,
    )
    _inverse_model = InverseModel(
        feature_dim=256,
        env_spec=env.spec,
        hidden_sizes=(256, ),
        hidden_activation=tf.nn.tanh,
        output_activation=tf.nn.tanh,
    )
    _forward_model = ForwardModel(
        feature_dim=256,
        env_spec=env.spec,
        hidden_sizes=(256, ),
        hidden_activation=tf.nn.elu,
    )

    sess = tf.Session()
    _encoder.sess = sess
    _inverse_model.sess = sess
    _forward_model.sess = sess

    with sess.as_default():
        # Initialize variables for get_copy to work
        sess.run(tf.initialize_all_variables())

        train_encoder1 = _encoder.get_weight_tied_copy(
            observation_input=train_obs)
        train_encoder2 = _encoder.get_weight_tied_copy(
            observation_input=train_next_obs)
        train_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=train_encoder1.output,
            feature_input2=train_encoder2.output)
        train_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=train_encoder1.output, action_input=train_action)

        val_encoder1 = _encoder.get_weight_tied_copy(observation_input=val_obs)
        val_encoder2 = _encoder.get_weight_tied_copy(
            observation_input=val_next_obs)
        val_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=val_encoder1.output,
            feature_input2=val_encoder2.output)
        val_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=val_encoder1.output, action_input=val_action)
        if args.cos_forward:
            train_forward_loss = cos_loss(train_encoder2.output,
                                          train_forward_model.output)
            val_forward_loss = cos_loss(val_encoder2.output,
                                        val_forward_model.output)
        else:
            train_forward_loss = tf.reduce_mean(
                tf.square(train_encoder2.output - train_forward_model.output))
            val_forward_loss = tf.reduce_mean(
                tf.square(val_encoder2.output - val_forward_model.output))

        train_inverse_loss = tf.reduce_mean(
            tf.square(train_action - train_inverse_model.output))
        val_inverse_loss = tf.reduce_mean(
            tf.square(val_action - val_inverse_model.output))
        train_total_loss = args.forward_weight * train_forward_loss + (
            1. - args.forward_weight) * train_inverse_loss
        val_total_loss = args.forward_weight * val_forward_loss + (
            1. - args.forward_weight) * val_inverse_loss
        icm_opt = tf.train.AdamOptimizer(
            args.init_lr).minimize(train_total_loss)

        # Setup summaries
        summary_writer = tf.summary.FileWriter(args.tfboard_path,
                                               graph=tf.get_default_graph())

        train_inverse_loss_summ = tf.summary.scalar("train/icm_inverse_loss",
                                                    train_inverse_loss)
        train_forward_loss_summ = tf.summary.scalar("train/icm_forward_loss",
                                                    train_forward_loss)
        train_total_loss_summ = tf.summary.scalar("train/icm_total_loss",
                                                  train_total_loss)
        val_inverse_loss_summ = tf.summary.scalar("val/icm_inverse_loss",
                                                  val_inverse_loss)
        val_forward_loss_summ = tf.summary.scalar("val/icm_forward_loss",
                                                  val_forward_loss)
        val_total_loss_summ = tf.summary.scalar("val/icm_total_loss",
                                                val_total_loss)

        train_summary_op = tf.summary.merge([
            train_inverse_loss_summ, train_forward_loss_summ,
            train_total_loss_summ
        ])
        val_summary_op = tf.summary.merge([
            val_inverse_loss_summ, val_forward_loss_summ, val_total_loss_summ
        ])

        logger.log("Finished creating ICM model")

        sess.run(tf.initialize_all_variables())

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            for timestep in range(args.num_itr):
                if timestep % args.log_freq == 0:
                    logger.log("Start itr {}".format(timestep))
                    _, train_summary = sess.run([icm_opt, train_summary_op])
                else:
                    sess.run(icm_opt)

                if timestep % args.log_freq == 0:
                    summary_writer.add_summary(train_summary, timestep)
                if timestep % args.save_freq == 0:
                    save_snapshot(_encoder, _inverse_model, _forward_model,
                                  args.tfmodel_path)

                if timestep % args.val_freq == 0:
                    val_summary = sess.run(val_summary_op)
                    summary_writer.add_summary(val_summary, timestep)

        except KeyboardInterrupt:
            print("End training...")
            pass

        coord.join(threads)
        sess.close()
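
Both training scripts can switch the forward loss to cos_loss, whose definition lives elsewhere in the project. A plausible TensorFlow 1.x sketch consistent with how it is called here (a scalar by default, per-sample values when mean=False) is shown below; this is an assumption, not the project's actual implementation:

import tensorflow as tf

def cos_loss(target, pred, mean=True):
    # Cosine distance (1 - cosine similarity) between target and predicted features.
    target_n = tf.nn.l2_normalize(target, 1)
    pred_n = tf.nn.l2_normalize(pred, 1)
    per_sample = 1.0 - tf.reduce_sum(target_n * pred_n, axis=1)
    return tf.reduce_mean(per_sample) if mean else per_sample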
	def __init__(
			self,
			env,
			algo: OnlineAlgorithm,
			no_encoder=False,
			feature_dim=10,
			forward_weight=0.8,
			external_reward_weight=0.01,
			inverse_tanh=False,
			init_learning_rate=1e-4,
			algo_update_freq=1,
			**kwargs
	):
		"""
		:param env: Environment
		:param algo: Algorithm that will be used with ICM
		:param encoder: State encoder that maps s to f
		:param inverse_model: Inverse dynamics model that maps (f1, f2) to actions
		:param forward_model: Forward dynamics model that maps (f1, a) to f2
		:param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss
		:param external_reward_weight: Weight that balances external reward and internal reward
		:param init_learning_rate: Initial learning rate of optimizer
		"""
		self.algo = algo
		gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
		self.sess = self.algo.sess or tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
		self.external_reward_weight = external_reward_weight
		self.summary_writer = self.algo.summary_writer
		self.algo_update_freq = algo_update_freq
		act_space = env.action_space
		obs_space = env.observation_space
		
		# Setup ICM models
		self.s1 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
		self.s2 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
		self.asample = tf.placeholder(tf.float32, [None, act_space.flat_dim])
		self.external_rewards = tf.placeholder(tf.float32, (None,))

		if len(obs_space.shape) == 1:
			if no_encoder:
				self._encoder = NoEncoder(obs_space.flat_dim, env_spec=env.spec)
			else:
				self._encoder = FullyConnectedEncoder(feature_dim, env_spec=env.spec)
		else:
			# TODO: implement conv encoder
			raise NotImplementedError("Currently only supports flat observation input!")

		self._encoder.sess = self.sess
		# Initialize variables for get_copy to work
		self.sess.run(tf.initialize_all_variables())
		with self.sess.as_default():
			self.encoder1 = self._encoder.get_weight_tied_copy(observation_input=self.s1)
			self.encoder2 = self._encoder.get_weight_tied_copy(observation_input=self.s2)

		self._inverse_model = InverseModel(feature_dim, env_spec=env.spec)
		self._forward_model = ForwardModel(feature_dim, env_spec=env.spec)
		self._inverse_model.sess = self.sess
		self._forward_model.sess = self.sess
		# Initialize variables for get_copy to work
		self.sess.run(tf.initialize_all_variables())
		with self.sess.as_default():
			self.inverse_model = self._inverse_model.get_weight_tied_copy(feature_input1=self.encoder1.output, 
																		  feature_input2=self.encoder2.output)
			self.forward_model = self._forward_model.get_weight_tied_copy(feature_input=self.encoder1.output,
																	  	  action_input=self.asample)

		# Define losses
		self.forward_loss = tf.reduce_mean(tf.square(self.encoder2.output - self.forward_model.output))
		# self.forward_loss = tf.nn.l2_loss(self.encoder2.output - self.forward_model.output)
		if isinstance(act_space, Box):
			self.inverse_loss = tf.reduce_mean(tf.square(self.asample - self.inverse_model.output))
		elif isinstance(act_space, Discrete):
			# TODO: Implement softmax loss
			raise NotImplementedError
		else:
			raise NotImplementedError
		self.internal_rewards = tf.reduce_sum(tf.square(self.encoder2.output - self.forward_model.output), axis=1)
		self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards)
		self.mean_external_rewards = tf.reduce_mean(self.external_rewards)
		
		self.total_loss = forward_weight * self.forward_loss + \
						(1. - forward_weight) * self.inverse_loss
		self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\
						minimize(self.total_loss)


		# Setup summaries
		inverse_loss_summ = tf.summary.scalar("icm_inverse_loss", self.inverse_loss)
		forward_loss_summ = tf.summary.scalar("icm_forward_loss", self.forward_loss)
		total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss)
		internal_rewards = tf.summary.scalar("mean_internal_rewards", self.mean_internal_rewards)
		external_rewards = tf.summary.scalar("mean_external_rewards_training", self.mean_external_rewards)
		var_summ = []
		for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
			var_summ.append(tf.summary.histogram(var.op.name, var))
		self.summary = tf.summary.merge([inverse_loss_summ, forward_loss_summ, total_loss_summ,\
							internal_rewards, external_rewards])
		# Initialize variables
		
		self.sess.run(tf.initialize_all_variables())
Example #5
class ICM(RLAlgorithm):
    """
	RL with intrinsic curiosity module
	"""
    def __init__(self,
                 env,
                 trpo: TRPO,
                 tensorboard_path,
                 no_encoder=False,
                 feature_dim=10,
                 forward_weight=0.8,
                 external_reward_weight=0.01,
                 forward_cos=False,
                 init_learning_rate=1e-4,
                 icm_batch_size=128,
                 replay_pool_size=1000000,
                 min_pool_size=200,
                 n_updates_per_iter=10,
                 obs_dtype='float32',
                 normalize_input=False,
                 gpu_fraction=0.95,
                 pretrained_icm=False,
                 pretrained_icm_path=None,
                 freeze_icm=False,
                 **kwargs):
        """
		:param env: Environment
		:param algo: Algorithm that will be used with ICM
		:param encoder: State encoder that maps s to f
		:param inverse_model: Inverse dynamics model that maps (f1, f2) to actions
		:param forward_model: Forward dynamics model that maps (f1, a) to f2
		:param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss
		:param external_reward_weight: Weight that balances external reward and internal reward
		:param init_learning_rate: Initial learning rate of optimizer
		"""
        self.trpo = trpo
        self.freeze_icm = freeze_icm
        # Replace sampler to inject intrinsic reward
        self.trpo.sampler = self.get_sampler(self.trpo)
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        self.sess = tf.get_default_session() or tf.Session(
            config=tf.ConfigProto(gpu_options=gpu_options))
        self.external_reward_weight = external_reward_weight
        self.summary_writer = tf.summary.FileWriter(
            tensorboard_path, graph=tf.get_default_graph())
        self.n_updates_per_iter = n_updates_per_iter
        self.icm_batch_size = icm_batch_size
        self.act_space = env.action_space
        self.obs_space = env.observation_space

        self.pool = TRPOReplayPool(replay_pool_size,
                                   self.obs_space.flat_dim,
                                   self.act_space.flat_dim,
                                   obs_dtype=obs_dtype)

        self.min_pool_size = min_pool_size
        # Setup ICM models
        self.s1 = tf.placeholder(tf.float32,
                                 [None] + list(self.obs_space.shape))
        self.s2 = tf.placeholder(tf.float32,
                                 [None] + list(self.obs_space.shape))
        if normalize_input:
            s1 = self.s1 / 255.0 - 0.5
            s2 = self.s2 / 255.0 - 0.5
        else:
            s1 = self.s1
            s2 = self.s2

        self.asample = tf.placeholder(tf.float32,
                                      [None, self.act_space.flat_dim])
        self.external_rewards = tf.placeholder(tf.float32, (None, ))

        # Hack
        temp_vars = set(tf.all_variables())

        if pretrained_icm:
            with self.sess.as_default():
                icm_data = joblib.load(pretrained_icm_path)
                _encoder = icm_data['encoder']
                _forward_model = icm_data['forward_model']
                _inverse_model = icm_data['inverse_model']

            icm_vars = set(tf.all_variables()) - temp_vars
        else:
            icm_vars = set([])

        if pretrained_icm:
            self._encoder = _encoder
            # raise NotImplementedError("Currently only supports flat observation input!")
        else:
            if len(self.obs_space.shape) == 1:
                if no_encoder:
                    self._encoder = NoEncoder(self.obs_space.flat_dim,
                                              env_spec=env.spec)
                else:
                    self._encoder = FullyConnectedEncoder(feature_dim,
                                                          env_spec=env.spec)
            else:
                self._encoder = ConvEncoder(feature_dim,
                                            env.spec.observation_space.shape)

        self._encoder.sess = self.sess

        if not pretrained_icm:
            # Initialize variables for get_copy to work
            self.sess.run(tf.initialize_all_variables())

        with self.sess.as_default():
            self.encoder1 = self._encoder.get_weight_tied_copy(
                observation_input=s1)
            self.encoder2 = self._encoder.get_weight_tied_copy(
                observation_input=s2)

        if not pretrained_icm:
            self._inverse_model = InverseModel(feature_dim, env_spec=env.spec)
            self._forward_model = ForwardModel(feature_dim, env_spec=env.spec)
        else:
            self._inverse_model = _inverse_model
            self._forward_model = _forward_model

        self._inverse_model.sess = self.sess
        self._forward_model.sess = self.sess

        if not pretrained_icm:
            # Initialize variables for get_copy to work
            self.sess.run(tf.initialize_all_variables())

        # Clip actions so they are consistent with what the env actually receives
        clipped_asample = tf.clip_by_value(self.asample, -1.0, 1.0)

        with self.sess.as_default():
            self.inverse_model = self._inverse_model.get_weight_tied_copy(
                feature_input1=self.encoder1.output,
                feature_input2=self.encoder2.output)
            self.forward_model = self._forward_model.get_weight_tied_copy(
                feature_input=self.encoder1.output,
                action_input=clipped_asample)

        # Define losses, by default it uses L2 loss
        if forward_cos:
            self.forward_loss = cos_loss(self.encoder2.output,
                                         self.forward_model.output)
        else:
            self.forward_loss = tf.reduce_mean(
                tf.square(self.encoder2.output - self.forward_model.output))
        if isinstance(self.act_space, Box):
            self.inverse_loss = tf.reduce_mean(
                tf.square(clipped_asample - self.inverse_model.output))
        elif isinstance(self.act_space, Discrete):
            # TODO: Implement softmax loss
            raise NotImplementedError
        else:
            raise NotImplementedError

        if forward_cos:
            self.internal_rewards = cos_loss(self.encoder2.output,
                                             self.forward_model.output,
                                             mean=False)
        else:
            self.internal_rewards = tf.reduce_sum(
                tf.square(self.encoder2.output - self.forward_model.output),
                axis=1)
        self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards)
        self.mean_external_rewards = tf.reduce_mean(self.external_rewards)

        self.total_loss = forward_weight * self.forward_loss + \
            (1. - forward_weight) * self.inverse_loss
        self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\
            minimize(self.total_loss)

        # Setup summaries
        inverse_loss_summ = tf.summary.scalar("icm_inverse_loss",
                                              self.inverse_loss)
        forward_loss_summ = tf.summary.scalar("icm_forward_loss",
                                              self.forward_loss)
        total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss)
        internal_rewards = tf.summary.scalar("mean_internal_rewards",
                                             self.mean_internal_rewards)
        external_rewards = tf.summary.scalar("mean_external_rewards",
                                             self.mean_external_rewards)
        # Setup env_info logs
        var_summ = []

        self.summary = tf.summary.merge([
            inverse_loss_summ, forward_loss_summ, total_loss_summ,
            internal_rewards, external_rewards
        ])
        # self.summary = tf.summary.merge([inverse_loss_summ, forward_loss_summ, total_loss_summ] + var_summ)

        ## Initialize uninitialized variables
        self.sess.run(
            tf.initialize_variables(set(tf.all_variables()) - icm_vars))

    @overrides
    def train(self):
        with self.sess.as_default():
            self.trpo.start_worker()
            for itr in range(self.trpo.start_itr, self.trpo.n_itr):
                paths = self.trpo.obtain_samples(itr)
                modified_paths = self.process_paths(itr, paths)
                samples_data = self.trpo.process_samples(itr, modified_paths)

                if self.pool.size >= self.min_pool_size:
                    if self.freeze_icm:
                        logger.log("Freezing ICM")
                    else:
                        logger.log("ICM Training started")
                        start_time = time.time()
                        for _ in range(self.n_updates_per_iter):
                            self.train_icm(_ + itr * self.n_updates_per_iter)

                        logger.log("ICM Training finished. Time: {0}".format(
                            time.time() - start_time))

                for path in samples_data['paths']:
                    path_len = len(path['rewards'])
                    for i in range(path_len):
                        obs = path['observations'][i]
                        act = path['actions'][i]
                        term = (i == path_len - 1)
                        rew = 0.0
                        self.pool.add_sample(obs, act, rew, term)

                # pdb.set_trace()

                self.trpo.log_diagnostics(paths)
                self.trpo.optimize_policy(itr, samples_data)
                params = self.trpo.get_itr_snapshot(itr, samples_data)
                params['encoder'] = self._encoder
                params['inverse_model'] = self._inverse_model
                params['forward_model'] = self._forward_model
                logger.save_itr_params(itr, params)
                logger.dump_tabular(with_prefix=False)

            self.trpo.shutdown_worker()

    def train_icm(self, timestep):
        batch = self.pool.random_batch(self.icm_batch_size)
        obs = self.reshape_obs(batch['observations'])
        next_obs = self.reshape_obs(batch['next_observations'])
        acts = batch['actions']
        rewards = batch['rewards']
        feed_dict = self._update_feed_dict(rewards, obs, next_obs, acts)
        ops = [self.summary, self.icm_opt]
        # ops = [self.icm_opt]
        results = self.sess.run(ops, feed_dict=feed_dict)
        if timestep % TENSORBOARD_PERIOD == 0:
            self.summary_writer.add_summary(results[0], timestep)

    def process_paths(self, itr, paths):
        modified_paths = copy(paths)

        for path in modified_paths:
            obs = self.reshape_obs(path['observations'][:-1])
            acts = path['actions'][:-1]
            next_obs = self.reshape_obs(path['observations'][1:])
            internal_rewards = self.sess.run(self.internal_rewards,
                                             feed_dict={
                                                 self.s1: obs,
                                                 self.s2: next_obs,
                                                 self.asample: acts
                                             })
            internal_rewards = np.append(internal_rewards, 0.0)
            path['t_rewards'] = self.external_reward_weight * path['rewards'] \
                  + (1. - self.external_reward_weight) * internal_rewards
        return modified_paths

    def reshape_obs(self, obs):
        if len(self.obs_space.shape) >= 1:
            length = obs.shape[0]
            obs = obs.reshape([length] + list(self.obs_space.shape))
        return obs

    def _update_feed_dict(self, sampled_rewards, sampled_obs, sampled_next_obs,
                          sampled_actions):
        return {
            self.s1: sampled_obs,
            self.s2: sampled_next_obs,
            self.asample: sampled_actions,
            self.external_rewards: sampled_rewards,
        }

    def get_sampler(self, trpo):
        from sandbox.rocky.tf.samplers.batch_sampler import BatchSampler as OldBatchSampler
        from sandbox.rocky.tf.samplers.vectorized_sampler import VectorizedSampler as OldVectorizedSampler
        if isinstance(trpo.sampler, OldBatchSampler):
            return BatchSampler(trpo)
        elif isinstance(trpo.sampler, OldVectorizedSampler):
            return VectorizedSampler(trpo)
        else:
            raise NotImplementedError(
                "Only supports batch sampler and vectorized sampler right now!"
            )
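
process_paths above builds (s_t, a_t, s_{t+1}) triples by slicing each trajectory, then appends a trailing zero so the intrinsic reward array matches the original reward length. A toy NumPy illustration of that alignment (made-up shapes, not the project's API):

import numpy as np

path = {
    'observations': np.arange(15, dtype=np.float32).reshape(5, 3),  # 5 steps, obs dim 3
    'actions': np.ones((5, 2), dtype=np.float32),
    'rewards': np.zeros(5, dtype=np.float32),
}

obs = path['observations'][:-1]       # s_t for the 4 transitions
next_obs = path['observations'][1:]   # s_{t+1}
acts = path['actions'][:-1]           # a_t (would feed the inverse/forward models)

# Stand-in for the forward-model prediction error used as the curiosity bonus.
internal_rewards = np.sum(np.square(next_obs - obs), axis=1)
# The final timestep has no successor, so it receives zero intrinsic reward.
internal_rewards = np.append(internal_rewards, 0.0)
assert internal_rewards.shape == path['rewards'].shape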
class ICM(RLAlgorithm):
	"""
	RL with intrinsic curiosity module
	"""
	def __init__(
			self,
			env,
			algo: OnlineAlgorithm,
			no_encoder=False,
			feature_dim=10,
			forward_weight=0.8,
			external_reward_weight=0.01,
			inverse_tanh=False,
			init_learning_rate=1e-4,
			algo_update_freq=1,
			**kwargs
	):
		"""
		:param env: Environment
		:param algo: Algorithm that will be used with ICM
		:param encoder: State encoder that maps s to f
		:param inverse_model: Inverse dynamics model that maps (f1, f2) to actions
		:param forward_model: Forward dynamics model that maps (f1, a) to f2
		:param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss
		:param external_reward_weight: Weight that balances external reward and internal reward
		:param init_learning_rate: Initial learning rate of optimizer
		"""
		self.algo = algo
		gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
		self.sess = self.algo.sess or tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
		self.external_reward_weight = external_reward_weight
		self.summary_writer = self.algo.summary_writer
		self.algo_update_freq = algo_update_freq
		act_space = env.action_space
		obs_space = env.observation_space
		
		# Setup ICM models
		self.s1 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
		self.s2 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
		self.asample = tf.placeholder(tf.float32, [None, act_space.flat_dim])
		self.external_rewards = tf.placeholder(tf.float32, (None,))

		if len(obs_space.shape) == 1:
			if no_encoder:
				self._encoder = NoEncoder(obs_space.flat_dim, env_spec=env.spec)
			else:
				self._encoder = FullyConnectedEncoder(feature_dim, env_spec=env.spec)
		else:
			# TODO: implement conv encoder
			raise NotImplementedError("Currently only supports flat observation input!")

		self._encoder.sess = self.sess
		# Initialize variables for get_copy to work
		self.sess.run(tf.initialize_all_variables())
		with self.sess.as_default():
			self.encoder1 = self._encoder.get_weight_tied_copy(observation_input=self.s1)
			self.encoder2 = self._encoder.get_weight_tied_copy(observation_input=self.s2)

		self._inverse_model = InverseModel(feature_dim, env_spec=env.spec)
		self._forward_model = ForwardModel(feature_dim, env_spec=env.spec)
		self._inverse_model.sess = self.sess
		self._forward_model.sess = self.sess
		# Initialize variables for get_copy to work
		self.sess.run(tf.initialize_all_variables())
		with self.sess.as_default():
			self.inverse_model = self._inverse_model.get_weight_tied_copy(feature_input1=self.encoder1.output, 
																		  feature_input2=self.encoder2.output)
			self.forward_model = self._forward_model.get_weight_tied_copy(feature_input=self.encoder1.output,
																	  	  action_input=self.asample)

		# Define losses
		self.forward_loss = tf.reduce_mean(tf.square(self.encoder2.output - self.forward_model.output))
		# self.forward_loss = tf.nn.l2_loss(self.encoder2.output - self.forward_model.output)
		if isinstance(act_space, Box):
			self.inverse_loss = tf.reduce_mean(tf.square(self.asample - self.inverse_model.output))
		elif isinstance(act_space, Discrete):
			# TODO: Implement softmax loss
			raise NotImplementedError
		else:
			raise NotImplementedError
		self.internal_rewards = tf.reduce_sum(tf.square(self.encoder2.output - self.forward_model.output), axis=1)
		self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards)
		self.mean_external_rewards = tf.reduce_mean(self.external_rewards)
		
		self.total_loss = forward_weight * self.forward_loss + \
						(1. - forward_weight) * self.inverse_loss
		self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\
						minimize(self.total_loss)


		# Setup summaries
		inverse_loss_summ = tf.summary.scalar("icm_inverse_loss", self.inverse_loss)
		forward_loss_summ = tf.summary.scalar("icm_forward_loss", self.forward_loss)
		total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss)
		internal_rewards = tf.summary.scalar("mean_internal_rewards", self.mean_internal_rewards)
		external_rewards = tf.summary.scalar("mean_external_rewards_training", self.mean_external_rewards)
		var_summ = []
		for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
			var_summ.append(tf.summary.histogram(var.op.name, var))
		self.summary = tf.summary.merge([inverse_loss_summ, forward_loss_summ, total_loss_summ,\
							internal_rewards, external_rewards])
		# Initialize variables
		
		self.sess.run(tf.initialize_all_variables())


	@overrides
	def train(self):
		with self.sess.as_default():
			self.algo._init_training()
			self.algo._start_worker()
			self.algo._switch_to_training_mode()

			observation = self.algo.training_env.reset()
			self.algo.exploration_strategy.reset()
			itr = 0
			path_length = 0
			path_return = 0
			for epoch in range(self.algo.n_epochs):
				logger.push_prefix('Epoch #%d | ' % epoch)
				logger.log("Training started")
				start_time = time.time()
				for t in range(self.algo.epoch_length):
					with self.algo._eval_then_training_mode():
						# Bug here!!!!!!
						# action, _ = self.algo.policy.get_action(observation)
						action = self.algo.exploration_strategy.get_action(itr,
																		   observation,
																		   self.algo.policy)
					if self.algo.render:
						self.algo.training_env.render()
					next_ob, raw_reward, terminal, _ = self.algo.training_env.step(
						self.algo.process_action(action)
					)
					# Some envs return a Nx1 vector for the observation
					next_ob = next_ob.flatten()
					reward = raw_reward * self.algo.scale_reward
					
					# # JUST FOR DEBUG: save data
					# save_data("/data0/dianchen/forward_data", \
					# 	observation,
					# 	action,
					# 	next_ob)
					
					path_length += 1
					path_return += reward
					self.algo.pool.add_sample(observation,
					                     action,
					                     reward,
					                     terminal,
					                     False)
					if terminal or path_length >= self.algo.max_path_length:
					    self.algo.pool.add_sample(next_ob,
					                         np.zeros_like(action),
					                         np.zeros_like(reward),
					                         np.zeros_like(terminal),
					                         True)
					    observation = self.algo.training_env.reset()
					    self.algo.exploration_strategy.reset()
					    self.algo.es_path_returns.append(path_return)
					    path_length = 0
					    path_return = 0
					else:
					    observation = next_ob

					if self.algo.pool.size >= self.algo.min_pool_size:
					    for _ in range(self.algo.n_updates_per_time_step):
					        self._do_training(epoch * self.algo.epoch_length + t)
					itr += 1

				logger.log("Training finished. Time: {0}".format(time.time() -
                                                                 start_time))

				with self.algo._eval_then_training_mode():
					if self.algo.pool.size >= self.algo.min_pool_size:
						start_time = time.time()
						if self.algo.n_eval_samples > 0:
							# self.algo.evaluate(epoch, self.algo.es_path_returns)
							# self.algo.es_path_returns = []
							pass
						params = self.get_epoch_snapshot(epoch)
						logger.log(
							"Eval time: {0}".format(time.time() - start_time))
						logger.save_itr_params(epoch, params)
					logger.dump_tabular(with_prefix=False)
					logger.pop_prefix()
			self.algo._switch_to_eval_mode()
			self.algo.training_env.terminate()
			self.algo._shutdown_worker()
			return self.algo.last_statistics

	
	def _do_training(self, timestep):
		minibatch = self.algo.pool.random_batch(self.algo.batch_size)
		sampled_obs = minibatch["observations"]
		sampled_terminals = minibatch['terminals']
		sampled_next_obs = minibatch["next_observations"]
		sampled_actions = minibatch["actions"]
		sampled_rewards = minibatch['rewards']
		icm_feed_dict = self._update_feed_dict(sampled_rewards, sampled_obs, sampled_next_obs, sampled_actions)
		algo_ops = self.algo._get_training_ops()
		if timestep % self.algo_update_freq == 0:
			icm_ops = [self.icm_opt]
		else:
			icm_ops = []
		icm_results = self.sess.run([self.summary, self.internal_rewards] + icm_ops, feed_dict=icm_feed_dict)
		icm_summary = icm_results[0]
		internal_rewards = icm_results[1]
		# Add up internal and external rewards
		algo_feed_dict = self.algo._update_feed_dict(self.external_reward_weight * sampled_rewards + \
													(1. - self.external_reward_weight) * internal_rewards,
	       											 sampled_terminals,
	       											 sampled_obs,
	       											 sampled_actions,
	       											 sampled_next_obs)
		# If algo has summary, run it. 
		# TODO: Clean this code. It is a mess right now
		if self.algo.summary is not None:
			algo_ops = [self.algo.summary] + algo_ops
		algo_results = self.sess.run(algo_ops, feed_dict=algo_feed_dict)
		if self.algo.summary is not None:
			algo_summary = algo_results[0]
		if timestep % TENSORBOARD_PERIOD == 0:
			if self.algo.summary is not None:
				self.summary_writer.add_summary(algo_summary, timestep)
			self.summary_writer.add_summary(icm_summary, timestep)

	def _update_feed_dict(self, sampled_rewards, sampled_obs, sampled_next_obs, sampled_actions):
		return {
			self.s1: sampled_obs,
			self.s2: sampled_next_obs,
			self.asample: sampled_actions,
			self.external_rewards: sampled_rewards,
		}

	def get_epoch_snapshot(self, epoch):
		snapshot = self.algo.get_epoch_snapshot(epoch)
		snapshot['encoder'] = self._encoder
		snapshot['inverse_model'] = self._inverse_model
		snapshot['forward_model'] = self._forward_model
		return snapshot
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str, help="name of gym env")
    parser.add_argument('dataset_path',
                        type=str,
                        help="path of training and validation dataset")
    parser.add_argument('val_random_data',
                        type=str,
                        help="path of training and validation dataset")
    parser.add_argument('val_contact_data',
                        type=str,
                        help="path of training and validation dataset")
    parser.add_argument('tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('tfmodel_path', type=str, default='/tmp/tfmodels')
    parser.add_argument('--restore', action='store_true')
    # Training parameters
    parser.add_argument('--num_itr', type=int, default=10000000)
    parser.add_argument('--val_freq', type=int, default=200)
    parser.add_argument('--log_freq', type=int, default=50)
    parser.add_argument('--save_freq', type=int, default=5000)

    # ICM parameters
    parser.add_argument('--init_lr', type=float, default=2e-3)
    parser.add_argument('--forward_weight',
                        type=float,
                        default=0.5,
                        help="the ratio of forward loss vs inverse loss")
    parser.add_argument('--cos_forward',
                        action='store_true',
                        help="whether to use cosine forward loss")

    args = parser.parse_args()

    # Get dataset
    train_set_names = list(
        map(lambda file_name: osp.join(args.dataset_path, file_name),
            listdir(args.dataset_path)))
    val_random_set_names = list(
        map(lambda file_name: osp.join(args.val_random_data, file_name),
            listdir(args.val_random_data)))
    val_contact_set_names = list(
        map(lambda file_name: osp.join(args.val_contact_data, file_name),
            listdir(args.val_contact_data)))
    # import pdb; pdb.set_trace()

    obs_shape = OBS_SHAPE_MAP[args.env_name]
    action_dim = ACTION_DIM_MAP[args.env_name]
    train_obs, train_next_obs, train_action = inputs(train_set_names,
                                                     obs_shape,
                                                     train=True)
    val_random_obs, val_random_next_obs, val_random_action = inputs(
        val_random_set_names, obs_shape, train=False)
    val_contact_obs, val_contact_next_obs, val_contact_action = inputs(
        val_contact_set_names, obs_shape, train=False)

    if args.restore:
        models_dict = joblib.load(args.tfmodel_path)
        _encoder = models_dict['encoder']
        _inverse_model = models_dict['inverse_model']
        _forward_model = models_dict['forward_model']
    else:
        _encoder = NoEncoder(obs_shape, observation_dim=[obs_shape])
        _inverse_model = InverseModel(
            feature_dim=obs_shape,
            action_dim=action_dim,
            hidden_sizes=(256, 256),
            hidden_activation=tf.nn.elu,
            output_activation=tf.nn.tanh,
        )
        _forward_model = ForwardModel(
            feature_dim=obs_shape,
            action_dim=action_dim,
            hidden_sizes=(256, 256),
            hidden_activation=tf.nn.elu,
        )
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    _encoder.sess = sess
    _inverse_model.sess = sess
    _forward_model.sess = sess

    with sess.as_default():
        # Initialize variables for get_copy to work
        sess.run(tf.initialize_all_variables())

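        # Build weight-tied copies of the shared encoder / inverse / forward
        # models for each data stream (train, random val, contact val).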
        train_encoder1 = _encoder.get_weight_tied_copy(
            observation_input=train_obs)
        train_encoder2 = _encoder.get_weight_tied_copy(
            observation_input=train_next_obs)
        train_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=train_encoder1.output,
            feature_input2=train_encoder2.output)
        train_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=train_encoder1.output, action_input=train_action)

        val_random_encoder1 = _encoder.get_weight_tied_copy(
            observation_input=val_random_obs)
        val_random_encoder2 = _encoder.get_weight_tied_copy(
            observation_input=val_random_next_obs)
        val_random_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=val_random_encoder1.output,
            feature_input2=val_random_encoder2.output)
        val_random_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=val_random_encoder1.output,
            action_input=val_random_action)

        val_contact_encoder1 = _encoder.get_weight_tied_copy(
            observation_input=val_contact_obs)
        val_contact_encoder2 = _encoder.get_weight_tied_copy(
            observation_input=val_contact_next_obs)
        val_contact_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=val_contact_encoder1.output,
            feature_input2=val_contact_encoder2.output)
        val_contact_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=val_contact_encoder1.output,
            action_input=val_contact_action)

        if args.cos_forward:
            train_forward_loss = cos_loss(train_encoder2.output,
                                          train_forward_model.output)
            val_random_forward_loss = cos_loss(val_random_encoder2.output,
                                               val_random_forward_model.output)
            val_contact_forward_loss = cos_loss(val_contact_encoder2.output,
                                                val_contact_forward_model.output)
        else:
            train_forward_loss = tf.reduce_mean(
                tf.square(train_encoder2.output - train_forward_model.output))
            val_random_forward_loss = tf.reduce_mean(
                tf.square(val_random_encoder2.output -
                          val_random_forward_model.output))
            val_contact_forward_loss = tf.reduce_mean(
                tf.square(val_contact_encoder2.output -
                          val_contact_forward_model.output))

        # NOTE: the arm/box breakdowns below index raw state dimensions, so
        # they are only meaningful for state-space observations.
        train_forward_loss_arm = tf.reduce_mean(
            tf.square(train_encoder2.output[:, :4] -
                      train_forward_model.output[:, :4]))
        train_forward_loss_box = tf.reduce_mean(
            tf.square(train_encoder2.output[:, 4:7] -
                      train_forward_model.output[:, 4:7]))
        val_random_forward_loss_arm = tf.reduce_mean(
            tf.square(val_random_encoder2.output[:, :4] -
                      val_random_forward_model.output[:, :4]))
        val_random_forward_loss_box = tf.reduce_mean(
            tf.square(val_random_encoder2.output[:, 4:7] -
                      val_random_forward_model.output[:, 4:7]))
        val_contact_forward_loss_arm = tf.reduce_mean(
            tf.square(val_contact_encoder2.output[:, :4] -
                      val_contact_forward_model.output[:, :4]))
        val_contact_forward_loss_box = tf.reduce_mean(
            tf.square(val_contact_encoder2.output[:, 4:7] -
                      val_contact_forward_model.output[:, 4:7]))

        train_inverse_losses = tf.reduce_mean(
            tf.square(train_action - train_inverse_model.output), axis=0)
        val_random_inverse_losses = tf.reduce_mean(
            tf.square(val_random_action - val_random_inverse_model.output),
            axis=0)
        val_contact_inverse_losses = tf.reduce_mean(
            tf.square(val_contact_action - val_contact_inverse_model.output),
            axis=0)

        train_inverse_separate_summ = []
        val_random_inverse_separate_summ = []
        val_contact_inverse_separate_summ = []
        for joint_idx in range(action_dim):
            train_inverse_separate_summ.append(
                tf.summary.scalar(
                    "train/icm_inverse_loss/joint_{}".format(joint_idx),
                    train_inverse_losses[joint_idx]))
            val_random_inverse_separate_summ.append(
                tf.summary.scalar(
                    "random_val/icm_inverse_loss/joint_{}".format(
                        joint_idx), val_random_inverse_losses[joint_idx]))
            val_contact_inverse_separate_summ.append(
                tf.summary.scalar(
                    "contact_val/icm_inverse_loss/joint_{}".format(
                        joint_idx), val_contact_inverse_losses[joint_idx]))

        train_inverse_loss = tf.reduce_mean(train_inverse_losses)
        val_random_inverse_loss = tf.reduce_mean(val_random_inverse_losses)
        val_contact_inverse_loss = tf.reduce_mean(val_contact_inverse_losses)

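        # Total ICM loss: forward_weight * forward loss + (1 - forward_weight) * inverse loss.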
        train_total_loss = args.forward_weight * train_forward_loss + (
            1. - args.forward_weight) * train_inverse_loss
        val_random_total_loss = args.forward_weight * val_random_forward_loss + (
            1. - args.forward_weight) * val_random_inverse_loss
        val_contact_total_loss = args.forward_weight * val_contact_forward_loss + (
            1. - args.forward_weight) * val_contact_inverse_loss

        icm_opt = tf.train.AdamOptimizer(
            args.init_lr).minimize(train_total_loss)
        _, train_data_forward_var = tf.nn.moments(train_obs, axes=[1])
        _, train_data_box_var = tf.nn.moments(train_obs[:, 4:7], axes=[1])

        # Setup summaries
        summary_writer = tf.summary.FileWriter(args.tfboard_path,
                                               graph=tf.get_default_graph())
        train_forward_loss_arm_summ = tf.summary.scalar(
            "train/forward_loss_arm", train_forward_loss_arm)
        train_forward_loss_box_summ = tf.summary.scalar(
            "train/forward_loss_box", train_forward_loss_box)
        train_inverse_loss_summ = tf.summary.scalar(
            "train/icm_inverse_loss/total_mean", train_inverse_loss)
        train_forward_loss_summ = tf.summary.scalar("train/icm_forward_loss",
                                                    train_forward_loss)
        train_total_loss_summ = tf.summary.scalar("train/icm_total_loss",
                                                  train_total_loss)

        random_val_forward_loss_arm_summ = tf.summary.scalar(
            "random_val/forward_loss_arm", val_random_forward_loss_arm)
        random_val_forward_loss_box_summ = tf.summary.scalar(
            "random_val/forward_loss_box", val_random_forward_loss_box)
        random_val_inverse_loss_summ = tf.summary.scalar(
            "random_val/icm_inverse_loss/total_mean", val_random_inverse_loss)
        random_val_forward_loss_summ = tf.summary.scalar(
            "random_val/icm_forward_loss", val_random_forward_loss)
        random_val_total_loss_summ = tf.summary.scalar(
            "random_val/icm_total_loss", val_random_total_loss)

        contact_val_forward_loss_arm_summ = tf.summary.scalar(
            "contact_val/forward_loss_arm", val_contact_forward_loss_arm)
        contact_val_forward_loss_box_summ = tf.summary.scalar(
            "contact_val/forward_loss_box", val_contact_forward_loss_box)
        contact_val_inverse_loss_summ = tf.summary.scalar(
            "contact_val/icm_inverse_loss/total_mean",
            val_contact_inverse_loss)
        contact_val_forward_loss_summ = tf.summary.scalar(
            "contact_val/icm_forward_loss", val_contact_forward_loss)
        contact_val_total_loss_summ = tf.summary.scalar(
            "contact_val/icm_total_loss", val_contact_total_loss)

        forward_data_variance_summ = tf.summary.scalar(
            "training_data_forward_variance",
            tf.reduce_mean(train_data_forward_var))
        forward_data_box_variance_summ = tf.summary.scalar(
            "training_data_forward_box_variance",
            tf.reduce_mean(train_data_box_var))

        train_summary_op = tf.summary.merge([
            train_inverse_loss_summ,
            train_forward_loss_summ,
            train_forward_loss_arm_summ,
            train_forward_loss_box_summ,
            train_total_loss_summ,
            forward_data_variance_summ,
            forward_data_box_variance_summ,
        ] + train_inverse_separate_summ)

        val_summary_op = tf.summary.merge([
            random_val_forward_loss_arm_summ,
            random_val_forward_loss_box_summ,
            random_val_inverse_loss_summ,
            random_val_forward_loss_summ,
            random_val_total_loss_summ,
            contact_val_forward_loss_arm_summ,
            contact_val_forward_loss_box_summ,
            contact_val_inverse_loss_summ,
            contact_val_forward_loss_summ,
            contact_val_total_loss_summ,
        ] + val_random_inverse_separate_summ +
                                          val_contact_inverse_separate_summ)

        logger.log("Finished creating ICM model")

        sess.run(tf.initialize_all_variables())

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

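        # Training loop: one Adam step per iteration, with periodic summaries,
        # validation runs, and model snapshots.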
        try:
            for timestep in range(args.num_itr):
                if timestep % args.log_freq == 0:
                    logger.log("Start itr {}".format(timestep))
                    _, train_summary = sess.run([icm_opt, train_summary_op])
                    summary_writer.add_summary(train_summary, timestep)
                else:
                    sess.run(icm_opt)

                if timestep % args.save_freq == 0:
                    save_snapshot(_encoder, _inverse_model, _forward_model,
                                  args.tfmodel_path)

                if timestep % args.val_freq == 0:
                    val_summary = sess.run(val_summary_op)
                    summary_writer.add_summary(val_summary, timestep)

        except KeyboardInterrupt:
            print("End training...")

        coord.request_stop()
        coord.join(threads)
        sess.close()
class ICM(RLAlgorithm):
    """
	RL with inverse curiosity module
	"""
    def __init__(self,
                 env,
                 trpo: TRPO,
                 tensorboard_path,
                 no_encoder=False,
                 feature_dim=10,
                 forward_weight=0.8,
                 external_reward_weight=0.01,
                 inverse_tanh=True,
                 init_learning_rate=1e-4,
                 icm_batch_size=128,
                 replay_pool_size=1000000,
                 min_pool_size=1000,
                 n_updates_per_sample=500,
                 **kwargs):
        """
		:param env: Environment
		:param trpo: TRPO algorithm that will be combined with ICM
		:param encoder: State encoder that maps s to f
		:param inverse_model: Inverse dynamics model that maps (f1, f2) to actions
		:param forward_model: Forward dynamics model that maps (f1, a) to f2
		:param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss
		:param external_reward_weight: Weight that balances external reward and internal reward
		:param init_learning_rate: Initial learning rate of optimizer
		"""
        self.trpo = trpo
        self.trpo.sampler = BatchSampler(self.trpo)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
        self.sess = tf.get_default_session() or tf.Session(
            config=tf.ConfigProto(gpu_options=gpu_options))
        self.external_reward_weight = external_reward_weight
        self.summary_writer = tf.summary.FileWriter(
            tensorboard_path, graph=tf.get_default_graph())
        self.n_updates_per_sample = n_updates_per_sample
        self.icm_batch_size = icm_batch_size
        act_space = env.action_space
        obs_space = env.observation_space

        self.pool = TRPOReplayPool(replay_pool_size, obs_space.flat_dim,
                                   act_space.flat_dim)
        self.min_pool_size = min_pool_size
        # Setup ICM models
        self.s1 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.s2 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.asample = tf.placeholder(tf.float32, [None, act_space.flat_dim])
        self.external_rewards = tf.placeholder(tf.float32, (None, ))

        if len(obs_space.shape) == 1:
            if no_encoder:
                self._encoder = NoEncoder(obs_space.flat_dim,
                                          env_spec=env.spec)
            else:
                self._encoder = FullyConnectedEncoder(feature_dim,
                                                      env_spec=env.spec)
        else:
            # TODO: implement conv encoder
            raise NotImplementedError(
                "Currently only supports flat observation input!")

        self._encoder.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            self.encoder1 = self._encoder.get_weight_tied_copy(
                observation_input=self.s1)
            self.encoder2 = self._encoder.get_weight_tied_copy(
                observation_input=self.s2)

        self._inverse_model = InverseModel(feature_dim, env_spec=env.spec)
        self._forward_model = ForwardModel(feature_dim, env_spec=env.spec)
        self._inverse_model.sess = self.sess
        self._forward_model.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            self.inverse_model = self._inverse_model.get_weight_tied_copy(
                feature_input1=self.encoder1.output,
                feature_input2=self.encoder2.output)
            self.forward_model = self._forward_model.get_weight_tied_copy(
                feature_input=self.encoder1.output, action_input=self.asample)

        # Define losses
        self.forward_loss = tf.reduce_mean(
            tf.square(self.encoder2.output - self.forward_model.output))
        # self.forward_loss = tf.nn.l2_loss(self.encoder2.output - self.forward_model.output)
        if isinstance(act_space, Box):
            self.inverse_loss = tf.reduce_mean(
                tf.square(self.asample - self.inverse_model.output))
        elif isinstance(act_space, Discrete):
            # TODO: Implement softmax loss
            raise NotImplementedError
        else:
            raise NotImplementedError
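        # Intrinsic (curiosity) reward: per-sample squared forward-prediction
        # error in feature space.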
        self.internal_rewards = tf.reduce_sum(
            tf.square(self.encoder2.output - self.forward_model.output),
            axis=1)
        self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards)
        self.mean_external_rewards = tf.reduce_mean(self.external_rewards)

        self.total_loss = forward_weight * self.forward_loss + \
            (1. - forward_weight) * self.inverse_loss
        self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\
            minimize(self.total_loss)

        # Setup summaries
        inverse_loss_summ = tf.summary.scalar("icm_inverse_loss",
                                              self.inverse_loss)
        forward_loss_summ = tf.summary.scalar("icm_forward_loss",
                                              self.forward_loss)
        total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss)
        internal_rewards = tf.summary.scalar("mean_internal_rewards",
                                             self.mean_internal_rewards)
        external_rewards = tf.summary.scalar("mean_external_rewards",
                                             self.mean_external_rewards)
        var_summ = []
        # for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        # 	var_summ.append(tf.summary.histogram(var.op.name, var))
        self.summary = tf.summary.merge(
            [inverse_loss_summ, forward_loss_summ, total_loss_summ] + var_summ)
        # Initialize variables

        self.sess.run(tf.initialize_all_variables())

    @overrides
    def train(self):
        with self.sess.as_default():
            self.trpo.start_worker()
            self.trpo.init_opt()
            for itr in range(self.trpo.current_itr, self.trpo.n_itr):
                paths = self.trpo.sampler.obtain_samples(itr)
                modified_paths = self.process_paths(itr, paths)
                samples_data = self.trpo.sampler.process_samples(
                    itr, modified_paths)

                if self.pool.size >= self.min_pool_size:
                    logger.log("ICM Training started")
                    start_time = time.time()
                    for _ in range(self.n_updates_per_sample):
                        self.train_icm(_ + itr * self.n_updates_per_sample)

                    logger.log(
                        "ICM Training finished. Time: {0}".format(time.time() -
                                                                  start_time))

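                # Store transitions in the replay pool; rewards are zeroed
                # since ICM training only needs (s, a, s') tuples.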
                for path in samples_data['paths']:
                    path_len = len(path['rewards'])
                    for i in range(path_len):
                        obs = path['observations'][i]
                        # print (obs)
                        act = path['actions'][i]
                        term = (i == path_len - 1)
                        rew = 0.0
                        self.pool.add_sample(obs, act, rew, term)

                self.trpo.log_diagnostics(paths)
                self.trpo.optimize_policy(itr, samples_data)
                params = self.trpo.get_itr_snapshot(itr, samples_data)
                params['encoder'] = self._encoder
                params['inverse_model'] = self._inverse_model
                params['forward_model'] = self._forward_model
                logger.save_itr_params(itr, params)
                logger.dump_tabular(with_prefix=False)

            self.trpo.shutdown_worker()

    def train_icm(self, timestep):
        batch = self.pool.random_batch(self.icm_batch_size)
        obs = batch['observations']
        next_obs = batch['next_observations']
        acts = batch['actions']
        rewards = batch['rewards']
        feed_dict = self._update_feed_dict(rewards, obs, next_obs, acts)
        ops = [self.summary, self.icm_opt]
        results = self.sess.run(ops, feed_dict=feed_dict)
        if timestep % TENSORBOARD_PERIOD == 0:
            self.summary_writer.add_summary(results[0], timestep)

    def process_paths(self, itr, paths):
        modified_paths = copy(paths)
        if itr == 0:
            for path in modified_paths:
                path['t_rewards'] = path["rewards"]
        else:
            for path in modified_paths:
                obs = path['observations'][:-1]
                acts = path['actions'][:-1]
                next_obs = path['observations'][1:]
                internal_rewards = self.sess.run(self.internal_rewards,
                                                 feed_dict={
                                                     self.s1: obs,
                                                     self.s2: next_obs,
                                                     self.asample: acts
                                                 })
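                # Prepend a zero so the T-1 intrinsic rewards line up with the
                # T-step external reward sequence.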
                internal_rewards = np.append([0.0], internal_rewards)
                path['t_rewards'] = self.external_reward_weight * path['rewards'] \
                     + (1. - self.external_reward_weight) * internal_rewards
        return modified_paths

    def _update_feed_dict(self, sampled_rewards, sampled_obs, sampled_next_obs,
                          sampled_actions):
        return {
            self.s1: sampled_obs,
            self.s2: sampled_next_obs,
            self.asample: sampled_actions,
            self.external_rewards: sampled_rewards,
        }
Example #9
0
    def __init__(self,
                 env,
                 trpo: TRPO,
                 tensorboard_path,
                 no_encoder=False,
                 feature_dim=10,
                 forward_weight=0.8,
                 external_reward_weight=0.01,
                 inverse_tanh=True,
                 forward_cos=False,
                 init_learning_rate=1e-4,
                 icm_batch_size=128,
                 replay_pool_size=1000000,
                 min_pool_size=1000,
                 n_updates_per_iter=500,
                 rel_curiosity=False,
                 clip_curiosity=0.0,
                 debug_save_data=False,
                 debug_log_weights=False,
                 **kwargs):
        """
		:param env: Environment
		:param trpo: TRPO algorithm that will be combined with ICM
		:param encoder: State encoder that maps s to f
		:param inverse_model: Inverse dynamics model that maps (f1, f2) to actions
		:param forward_model: Forward dynamics model that maps (f1, a) to f2
		:param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss
		:param external_reward_weight: Weight that balances external reward and internal reward
		:param init_learning_rate: Initial learning rate of optimizer
		"""
        self.trpo = trpo
        # Replace sampler to inject intrinsic reward
        self.trpo.sampler = self.get_sampler(self.trpo)
        self.sess = tf.get_default_session() or tf.Session()
        self.external_reward_weight = external_reward_weight
        self.summary_writer = tf.summary.FileWriter(
            tensorboard_path, graph=tf.get_default_graph())
        self.n_updates_per_iter = n_updates_per_iter
        self.forward_cos = forward_cos
        self.inverse_tanh = inverse_tanh
        self.icm_batch_size = icm_batch_size
        self.rel_curiosity = rel_curiosity
        self.clip_curiosity = clip_curiosity
        self.debug_save_data = debug_save_data
        self.debug_log_weights = debug_log_weights

        # Debug purpose: Save (ob1, a, ob2, if_contact)
        if self.debug_save_data:
            self.DEBUG_DATA_PATH = "/home/dianchen/icm_data.csv"
            with open(self.DEBUG_DATA_PATH, 'w+') as csvfile:
                fieldnames = ['obs', 'a', 'next_obs', 'contact']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()

        act_space = env.action_space
        obs_space = env.observation_space

        self.pool = TRPOReplayPool(replay_pool_size, obs_space.flat_dim,
                                   act_space.flat_dim)
        self.min_pool_size = min_pool_size
        # Setup ICM models
        self.s1 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.s2 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.asample = tf.placeholder(tf.float32, [None, act_space.flat_dim])

        self.external_rewards = tf.placeholder(tf.float32, (None, ))
        self.contact_rewards = tf.placeholder(tf.float32, (None, ))

        if len(obs_space.shape) == 1:
            if no_encoder:
                self._encoder = NoEncoder(obs_space.flat_dim,
                                          env_spec=env.spec)
            else:
                self._encoder = FullyConnectedEncoder(feature_dim,
                                                      env_spec=env.spec)
        else:
            # TODO: implement conv encoder
            raise NotImplementedError(
                "Currently only supports flat observation input!")

        self._encoder.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            self.encoder1 = self._encoder.get_weight_tied_copy(
                observation_input=self.s1)
            self.encoder2 = self._encoder.get_weight_tied_copy(
                observation_input=self.s2)

        if self.inverse_tanh:
            self._inverse_model = InverseModel(feature_dim, env_spec=env.spec)
        else:
            self._inverse_model = InverseModel(feature_dim,
                                               env_spec=env.spec,
                                               output_activation=None)
        self._forward_model = ForwardModel(feature_dim, env_spec=env.spec)
        self._inverse_model.sess = self.sess
        self._forward_model.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            self.inverse_model = self._inverse_model.get_weight_tied_copy(
                feature_input1=self.encoder1.output,
                feature_input2=self.encoder2.output)
            self.forward_model = self._forward_model.get_weight_tied_copy(
                feature_input=self.encoder1.output, action_input=self.asample)

        # Define losses
        if self.forward_cos:
            self.forward_loss = cosine_loss(self.encoder2.output,
                                            self.forward_model.output)
        else:
            self.forward_loss = tf.reduce_mean(
                tf.square(self.encoder2.output - self.forward_model.output))
        # self.forward_loss = tf.nn.l2_loss(self.encoder2.output - self.forward_model.output)
        if isinstance(act_space, Box):
            self.inverse_loss = tf.reduce_mean(
                tf.square(self.asample - self.inverse_model.output))
        elif isinstance(act_space, Discrete):
            # TODO: Implement softmax loss
            raise NotImplementedError
        else:
            raise NotImplementedError
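        # Intrinsic (curiosity) reward: cosine distance or squared prediction
        # error between predicted and actual next-state features.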
        if self.forward_cos:
            self.internal_rewards = 1.0 - tf.reduce_sum(
                tf.multiply(tf.nn.l2_normalize(self.forward_model.output, 1),
                            tf.nn.l2_normalize(self.encoder2.output, 1)), 1)
        else:
            self.internal_rewards = tf.reduce_sum(
                tf.square(self.encoder2.output - self.forward_model.output),
                axis=1)
        self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards)
        self.mean_external_rewards = tf.reduce_mean(self.external_rewards)
        self.mean_contact_rewards = tf.reduce_mean(self.contact_rewards)

        self.total_loss = forward_weight * self.forward_loss + \
            (1. - forward_weight) * self.inverse_loss
        self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\
            minimize(self.total_loss)

        # Setup summaries
        inverse_loss_summ = tf.summary.scalar("icm_inverse_loss",
                                              self.inverse_loss)
        forward_loss_summ = tf.summary.scalar("icm_forward_loss",
                                              self.forward_loss)
        total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss)
        internal_rewards = tf.summary.scalar("mean_internal_rewards",
                                             self.mean_internal_rewards)
        external_rewards = tf.summary.scalar("mean_external_rewards",
                                             self.mean_external_rewards)
        # Setup env_info logs
        contact_summ = tf.summary.scalar("mean_contact_rewards",
                                         self.mean_contact_rewards)

        var_summ = []
        # for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        # 	var_summ.append(tf.summary.histogram(var.op.name, var))
        if self.debug_log_weights:
            for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                var_summ.append(tf.summary.histogram(var.op.name, var))

        self.summary_icm = tf.summary.merge(
            [inverse_loss_summ, forward_loss_summ, total_loss_summ] + var_summ)
        self.summary_env = tf.summary.merge(
            [internal_rewards, external_rewards, contact_summ])
        # self.summary = tf.summary.merge([inverse_loss_summ, forward_loss_summ, total_loss_summ] + var_summ)
        # Initialize variables

        self.sess.run(tf.initialize_all_variables())
Example #10
0
class ICM(RLAlgorithm):
    """
	RL with inverse curiosity module
	"""
    def __init__(self,
                 env,
                 trpo: TRPO,
                 tensorboard_path,
                 no_encoder=False,
                 feature_dim=10,
                 forward_weight=0.8,
                 external_reward_weight=0.01,
                 inverse_tanh=True,
                 forward_cos=False,
                 init_learning_rate=1e-4,
                 icm_batch_size=128,
                 replay_pool_size=1000000,
                 min_pool_size=1000,
                 n_updates_per_iter=500,
                 rel_curiosity=False,
                 clip_curiosity=0.0,
                 debug_save_data=False,
                 debug_log_weights=False,
                 **kwargs):
        """
		:param env: Environment
		:param trpo: TRPO algorithm that will be combined with ICM
		:param encoder: State encoder that maps s to f
		:param inverse_model: Inverse dynamics model that maps (f1, f2) to actions
		:param forward_model: Forward dynamics model that maps (f1, a) to f2
		:param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss
		:param external_reward_weight: Weight that balances external reward and internal reward
		:param init_learning_rate: Initial learning rate of optimizer
		"""
        self.trpo = trpo
        # Replace sampler to inject intrinsic reward
        self.trpo.sampler = self.get_sampler(self.trpo)
        self.sess = tf.get_default_session() or tf.Session()
        self.external_reward_weight = external_reward_weight
        self.summary_writer = tf.summary.FileWriter(
            tensorboard_path, graph=tf.get_default_graph())
        self.n_updates_per_iter = n_updates_per_iter
        self.forward_cos = forward_cos
        self.inverse_tanh = inverse_tanh
        self.icm_batch_size = icm_batch_size
        self.rel_curiosity = rel_curiosity
        self.clip_curiosity = clip_curiosity
        self.debug_save_data = debug_save_data
        self.debug_log_weights = debug_log_weights

        # Debug purpose: Save (ob1, a, ob2, if_contact)
        if self.debug_save_data:
            self.DEBUG_DATA_PATH = "/home/dianchen/icm_data.csv"
            with open(self.DEBUG_DATA_PATH, 'w+') as csvfile:
                fieldnames = ['obs', 'a', 'next_obs', 'contact']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()

        act_space = env.action_space
        obs_space = env.observation_space

        self.pool = TRPOReplayPool(replay_pool_size, obs_space.flat_dim,
                                   act_space.flat_dim)
        self.min_pool_size = min_pool_size
        # Setup ICM models
        self.s1 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.s2 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.asample = tf.placeholder(tf.float32, [None, act_space.flat_dim])

        self.external_rewards = tf.placeholder(tf.float32, (None, ))
        self.contact_rewards = tf.placeholder(tf.float32, (None, ))

        if len(obs_space.shape) == 1:
            if no_encoder:
                self._encoder = NoEncoder(obs_space.flat_dim,
                                          env_spec=env.spec)
            else:
                self._encoder = FullyConnectedEncoder(feature_dim,
                                                      env_spec=env.spec)
        else:
            # TODO: implement conv encoder
            raise NotImplementedError(
                "Currently only supports flat observation input!")

        self._encoder.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            self.encoder1 = self._encoder.get_weight_tied_copy(
                observation_input=self.s1)
            self.encoder2 = self._encoder.get_weight_tied_copy(
                observation_input=self.s2)

        if self.inverse_tanh:
            self._inverse_model = InverseModel(feature_dim, env_spec=env.spec)
        else:
            self._inverse_model = InverseModel(feature_dim,
                                               env_spec=env.spec,
                                               output_activation=None)
        self._forward_model = ForwardModel(feature_dim, env_spec=env.spec)
        self._inverse_model.sess = self.sess
        self._forward_model.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            self.inverse_model = self._inverse_model.get_weight_tied_copy(
                feature_input1=self.encoder1.output,
                feature_input2=self.encoder2.output)
            self.forward_model = self._forward_model.get_weight_tied_copy(
                feature_input=self.encoder1.output, action_input=self.asample)

        # Define losses
        if self.forward_cos:
            self.forward_loss = cosine_loss(self.encoder2.output,
                                            self.forward_model.output)
        else:
            self.forward_loss = tf.reduce_mean(
                tf.square(self.encoder2.output - self.forward_model.output))
        # self.forward_loss = tf.nn.l2_loss(self.encoder2.output - self.forward_model.output)
        if isinstance(act_space, Box):
            self.inverse_loss = tf.reduce_mean(
                tf.square(self.asample - self.inverse_model.output))
        elif isinstance(act_space, Discrete):
            # TODO: Implement softmax loss
            raise NotImplementedError
        else:
            raise NotImplementedError
        if self.forward_cos:
            self.internal_rewards = 1.0 - tf.reduce_sum(
                tf.multiply(tf.nn.l2_normalize(self.forward_model.output, 1),
                            tf.nn.l2_normalize(self.encoder2.output, 1)), 1)
        else:
            self.internal_rewards = tf.reduce_sum(
                tf.square(self.encoder2.output - self.forward_model.output),
                axis=1)
        self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards)
        self.mean_external_rewards = tf.reduce_mean(self.external_rewards)
        self.mean_contact_rewards = tf.reduce_mean(self.contact_rewards)

        self.total_loss = forward_weight * self.forward_loss + \
            (1. - forward_weight) * self.inverse_loss
        self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\
            minimize(self.total_loss)

        # Setup summaries
        inverse_loss_summ = tf.summary.scalar("icm_inverse_loss",
                                              self.inverse_loss)
        forward_loss_summ = tf.summary.scalar("icm_forward_loss",
                                              self.forward_loss)
        total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss)
        internal_rewards = tf.summary.scalar("mean_internal_rewards",
                                             self.mean_internal_rewards)
        external_rewards = tf.summary.scalar("mean_external_rewards",
                                             self.mean_external_rewards)
        # Setup env_info logs
        contact_summ = tf.summary.scalar("mean_contact_rewards",
                                         self.mean_contact_rewards)

        var_summ = []
        # for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        # 	var_summ.append(tf.summary.histogram(var.op.name, var))
        if self.debug_log_weights:
            for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                var_summ.append(tf.summary.histogram(var.op.name, var))

        self.summary_icm = tf.summary.merge(
            [inverse_loss_summ, forward_loss_summ, total_loss_summ] + var_summ)
        self.summary_env = tf.summary.merge(
            [internal_rewards, external_rewards, contact_summ])
        # self.summary = tf.summary.merge([inverse_loss_summ, forward_loss_summ, total_loss_summ] + var_summ)
        # Initialize variables

        self.sess.run(tf.initialize_all_variables())

    @overrides
    def train(self):
        with self.sess.as_default():
            self.trpo.start_worker()
            for itr in range(self.trpo.start_itr, self.trpo.n_itr):
                paths = self.trpo.obtain_samples(itr)
                modified_paths = self.process_paths(itr, paths)
                samples_data = self.trpo.process_samples(itr, modified_paths)

                if self.pool.size >= self.min_pool_size:
                    logger.log("ICM Training started")
                    start_time = time.time()
                    for _ in range(self.n_updates_per_iter):
                        self.train_icm(_ + itr * self.n_updates_per_iter)

                    logger.log(
                        "ICM Training finished. Time: {0}".format(time.time() -
                                                                  start_time))

                # Log env summary
                path = samples_data['paths'][0]
                if self.debug_save_data:
                    with open(self.DEBUG_DATA_PATH, 'a') as csvfile:
                        fieldnames = ['obs', 'a', 'next_obs', 'contact']
                        writer = csv.DictWriter(csvfile,
                                                fieldnames=fieldnames)
                        for obs, a, next_obs, contact in zip(
                                path['observations'][:-1],
                                path['actions'][:-1],
                                path['observations'][1:],
                                path['env_infos']['contact_reward']):
                            writer.writerow(
                                dict(obs=obs,
                                     a=a,
                                     next_obs=next_obs,
                                     contact=contact))

                results = self.sess.run(self.summary_env,
                                        feed_dict={
                                            self.s1:
                                            path['observations'][:-1],
                                            self.s2:
                                            path['observations'][1:],
                                            self.asample:
                                            path['actions'][:-1],
                                            self.external_rewards:
                                            path['rewards'],
                                            self.contact_rewards:
                                            path['env_infos']['contact_reward']
                                        })
                self.summary_writer.add_summary(results,
                                                itr * self.n_updates_per_iter)

                for path in samples_data['paths']:
                    path_len = len(path['rewards'])
                    for i in range(path_len):
                        obs = path['observations'][i]
                        # print (obs)
                        act = path['actions'][i]
                        term = (i == path_len - 1)
                        rew = 0.0
                        self.pool.add_sample(obs, act, rew, term)

                self.trpo.log_diagnostics(paths)
                self.trpo.optimize_policy(itr, samples_data)
                params = self.trpo.get_itr_snapshot(itr, samples_data)
                params['encoder'] = self._encoder
                params['inverse_model'] = self._inverse_model
                params['forward_model'] = self._forward_model
                logger.save_itr_params(itr, params)
                logger.dump_tabular(with_prefix=False)

            self.trpo.shutdown_worker()

    def train_icm(self, timestep):
        batch = self.pool.random_batch(self.icm_batch_size)
        obs = batch['observations']
        next_obs = batch['next_observations']
        acts = batch['actions']
        rewards = batch['rewards']
        feed_dict = self._update_feed_dict(rewards, obs, next_obs, acts)
        ops = [self.summary_icm, self.icm_opt]
        # ops = [self.icm_opt]
        results = self.sess.run(ops, feed_dict=feed_dict)
        if timestep % TENSORBOARD_PERIOD == 0:
            self.summary_writer.add_summary(results[0], timestep)

    def process_paths(self, itr, paths):
        modified_paths = copy(paths)
        if itr == 0:
            for path in modified_paths:
                path['t_rewards'] = path["rewards"]
        else:
            for path in modified_paths:
                obs = path['observations'][:-1]
                acts = path['actions'][:-1]
                next_obs = path['observations'][1:]
                internal_rewards = self.sess.run(self.internal_rewards,
                                                 feed_dict={
                                                     self.s1: obs,
                                                     self.s2: next_obs,
                                                     self.asample: acts
                                                 })

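                # Relative curiosity: normalize the prediction error by how
                # much the encoded state actually changed.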
                if self.rel_curiosity:
                    internal_rewards /= self.sess.run(tf.reduce_sum(
                        tf.square(self.encoder1.output - self.encoder2.output),
                        axis=1),
                                                      feed_dict={
                                                          self.s1: obs,
                                                          self.s2: next_obs,
                                                          self.asample: acts
                                                      })
                internal_rewards = np.append(internal_rewards, [0.0])
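                # Keep only the top clip_curiosity fraction of intrinsic
                # rewards; zero out the smaller ones.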
                if self.clip_curiosity:
                    idx = internal_rewards.argsort()[:int(
                        len(internal_rewards) * (1.0 - self.clip_curiosity))]
                    internal_rewards[idx] = 0.0

                path['t_rewards'] = self.external_reward_weight * path['rewards'] \
                     + (1. - self.external_reward_weight) * internal_rewards
        return modified_paths

    def _update_feed_dict(self, sampled_rewards, sampled_obs, sampled_next_obs,
                          sampled_actions):
        return {
            self.s1: sampled_obs,
            self.s2: sampled_next_obs,
            self.asample: sampled_actions,
            self.external_rewards: sampled_rewards,
        }

    def get_sampler(self, trpo):
        from sandbox.rocky.tf.samplers.batch_sampler import BatchSampler as OldBatchSampler
        from sandbox.rocky.tf.samplers.vectorized_sampler import VectorizedSampler as OldVectorizedSampler
        if isinstance(trpo.sampler, OldBatchSampler):
            return BatchSampler(trpo)
        elif isinstance(trpo.sampler, OldVectorizedSampler):
            return VectorizedSampler(trpo)
        else:
            raise NotImplementedError(
                "Only supports batch sampler and vectorized sampler right now!"
            )
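
The class above wraps an existing TRPO learner and injects the curiosity reward through a replaced sampler. Below is a minimal, hypothetical usage sketch assuming an rllab-style setup (sandbox.rocky.tf); the ICM import path, the environment name, the tensorboard directory, and the TRPO hyperparameters are placeholders, while the ICM arguments mirror the constructor shown above.

# Hypothetical usage sketch: the ICM import path and the environment are
# placeholders; only the ICM constructor arguments come from the class above.
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from sandbox.rocky.tf.algos.trpo import TRPO
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy

from icm import ICM  # placeholder import for the class defined above

# Any flat-observation, Box-action env works for the ICM losses themselves;
# note that Example #10's train() additionally expects the env to report
# env_infos['contact_reward'] for its environment summaries.
env = TfEnv(normalize(GymEnv("Pendulum-v0")))
policy = GaussianMLPPolicy(name="policy",
                           env_spec=env.spec,
                           hidden_sizes=(64, 64))
baseline = LinearFeatureBaseline(env_spec=env.spec)
trpo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=500,
            discount=0.99,
            step_size=0.01)

# ICM wraps TRPO, swaps in its own sampler, and mixes intrinsic reward with
# the external reward; "/tmp/icm_tfboard" is a placeholder tensorboard_path.
algo = ICM(env,
           trpo,
           "/tmp/icm_tfboard",
           feature_dim=10,
           forward_weight=0.8,
           external_reward_weight=0.01,
           init_learning_rate=1e-4)
algo.train()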