def main(): parser = argparse.ArgumentParser() parser.add_argument('env_name', type=str, help="name of gym env") parser.add_argument('dataset_path', type=str, help="path of training and validation dataset") parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard') parser.add_argument('--tfmodel_path', type=str, default='/tmp/tfmodels') parser.add_argument('--restore', action='store_true') # Training parameters parser.add_argument('--val_ratio', type=float, default=0.05, help="ratio of validation sets") parser.add_argument('--num_itr', type=int, default=10000000) parser.add_argument('--val_freq', type=int, default=200) parser.add_argument('--log_freq', type=int, default=50) parser.add_argument('--save_freq', type=int, default=5000) # ICM parameters parser.add_argument('--init_lr', type=float, default=1e-4) parser.add_argument('--forward_weight', type=float, default=0.8, help="the ratio of forward loss vs inverse loss") parser.add_argument('--cos_forward', action='store_true', help="whether to use cosine forward loss") args = parser.parse_args() # Get dataset dataset_names = list( map(lambda file_name: osp.join(args.dataset_path, file_name), listdir(args.dataset_path))) val_set_names = dataset_names[:int(len(dataset_names) * args.val_ratio)] train_set_names = dataset_names[int(len(dataset_names) * args.val_ratio):] train_queue = tf.train.string_input_producer(train_set_names, num_epochs=None) val_queue = tf.train.string_input_producer(val_set_names, num_epochs=None) obs_shape = OBS_SHAPE_MAP[args.env_name] action_dim = ACTION_DIM_MAP[args.env_name] train_obs, train_next_obs, train_action, train_state, train_next_state = inputs( train_set_names, obs_shape, train=True) val_obs, val_next_obs, val_action, val_state, val_next_state = inputs( val_set_names, obs_shape, train=False) #not yet implemented if args.restore: raise NotImplementedError models_dict = joblib.load(args.tfmodel_path) _encoder = models_dict['encoder'] _inverse_model = 
model.dict['inverse_model'] _forward_model = model.dict['forward_model'] else: _encoder = ConvEncoder( feature_dim=256, input_shape=obs_shape, conv_filters=(64, 64, 64, 64, 32), conv_filter_sizes=((5, 5), (5, 5), (5, 5), (5, 5), (3, 3)), conv_strides=(2, 2, 2, 2, 2), conv_pads=('SAME', 'SAME', 'SAME', 'SAME', 'SAME'), hidden_sizes=(256, ), hidden_activation=tf.nn.elu, ) _state_encoder = FullyConnectedEncoder( 200, observation_dim=8, hidden_sizes=(256, ), hidden_activation=tf.nn.elu, ) #TODO: add one more encoder before inverse model? _inverse_model = InverseModel( feature_dim=256 + 200, action_dim=action_dim, hidden_sizes=(256, ), hidden_activation=tf.nn.elu, output_activation=tf.nn.tanh, ) _forward_model = ForwardModel( feature_dim=256 + 200, action_dim=action_dim, hidden_sizes=(256, ), hidden_activation=tf.nn.elu, ) sess = tf.Session() _encoder.sess = sess _inverse_model.sess = sess _forward_model.sess = sess _state_encoder.sess = sess with sess.as_default(): # Initialize variables for get_copy to work sess.run(tf.initialize_all_variables()) train_encoder1 = _encoder.get_weight_tied_copy( observation_input=train_obs) train_encoder2 = _encoder.get_weight_tied_copy( observation_input=train_next_obs) train_state_encoder1 = _state_encoder.get_weight_tied_copy( observation_input=train_state) train_state_encoder2 = _state_encoder.get_weight_tied_copy( observation_input=train_next_state) train_feature1 = tf.concat( 1, [train_encoder1.output, train_state_encoder1.output]) train_feature2 = tf.concat( 1, [train_encoder2.output, train_state_encoder2.output]) train_inverse_model = _inverse_model.get_weight_tied_copy( feature_input1=train_feature1, feature_input2=train_feature2) train_forward_model = _forward_model.get_weight_tied_copy( feature_input=train_feature1, action_input=train_action) val_encoder1 = _encoder.get_weight_tied_copy(observation_input=val_obs) val_encoder2 = _encoder.get_weight_tied_copy( observation_input=val_next_obs) val_state_encoder1 = 
_state_encoder.get_weight_tied_copy( observation_input=val_state) val_state_encoder2 = _state_encoder.get_weight_tied_copy( observation_input=val_next_state) val_feature1 = tf.concat( 1, [val_encoder1.output, val_state_encoder1.output]) val_feature2 = tf.concat( 1, [val_encoder2.output, val_state_encoder2.output]) val_inverse_model = _inverse_model.get_weight_tied_copy( feature_input1=val_feature1, feature_input2=val_feature2) val_forward_model = _forward_model.get_weight_tied_copy( feature_input=val_feature1, action_input=val_action) if args.cos_forward: train_forward_loss = cos_loss(train_feature2, train_forward_model.output) val_forward_loss = cos_loss(val_feature2, val_forward_model.output) else: train_forward_loss = tf.reduce_mean( tf.square(train_feature2 - train_forward_model.output)) val_forward_loss = tf.reduce_mean( tf.square(val_feature2 - val_forward_model.output)) train_inverse_losses = tf.reduce_mean( tf.square(train_action - train_inverse_model.output), axis=0) val_inverse_losses = tf.reduce_mean( tf.square(val_action - val_inverse_model.output), axis=0) train_inverse_separate_summ = [] val_inverse_separate_summ = [] for joint_idx in range(action_dim): train_inverse_separate_summ.append( tf.summary.scalar( "train/icm_inverse_loss/joint_{}".format(joint_idx), train_inverse_losses[joint_idx])) val_inverse_separate_summ.append( tf.summary.scalar( "val/icm_inverse_loss/joint_{}".format(joint_idx), val_inverse_losses[joint_idx])) train_inverse_loss = tf.reduce_mean(train_inverse_losses) val_inverse_loss = tf.reduce_mean(val_inverse_losses) train_total_loss = args.forward_weight * train_forward_loss + ( 1. - args.forward_weight) * train_inverse_loss val_total_loss = args.forward_weight * val_forward_loss + ( 1. 
- args.forward_weight) * val_inverse_loss icm_opt = tf.train.AdamOptimizer( args.init_lr).minimize(train_total_loss) # Setup summaries summary_writer = tf.summary.FileWriter(args.tfboard_path, graph=tf.get_default_graph()) train_inverse_loss_summ = tf.summary.scalar( "train/icm_inverse_loss/total_mean", train_inverse_loss) train_forward_loss_summ = tf.summary.scalar("train/icm_forward_loss", train_forward_loss) train_total_loss_summ = tf.summary.scalar("train/icm_total_loss", train_total_loss) val_inverse_loss_summ = tf.summary.scalar( "val/icm_inverse_loss/total_mean", val_inverse_loss) val_forward_loss_summ = tf.summary.scalar("val/icm_forward_loss", val_forward_loss) val_total_loss_summ = tf.summary.scalar("val/icm_total_loss", val_total_loss) train_summary_op = tf.summary.merge([ train_inverse_loss_summ, train_forward_loss_summ, train_total_loss_summ ] + train_inverse_separate_summ) val_summary_op = tf.summary.merge([ val_inverse_loss_summ, val_forward_loss_summ, val_total_loss_summ ] + val_inverse_separate_summ) logger.log("Finished creating ICM model") sess.run(tf.initialize_all_variables()) coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) try: for timestep in range(args.num_itr): if timestep % args.log_freq == 0: logger.log("Start itr {}".format(timestep)) _, train_summary = sess.run([icm_opt, train_summary_op]) summary_writer.add_summary(train_summary, timestep) else: sess.run(icm_opt) if timestep % args.save_freq == 0: save_snapshot(_encoder, _inverse_model, _forward_model, _state_encoder, args.tfmodel_path) if timestep % args.val_freq == 0: val_summary = sess.run(val_summary_op) summary_writer.add_summary(val_summary, timestep) except KeyboardInterrupt: print("End training...") pass coord.join(threads) sess.close()
def __init__(self, env, trpo: TRPO, tensorboard_path, no_encoder=False, feature_dim=10, forward_weight=0.8, external_reward_weight=0.01, forward_cos=False, init_learning_rate=1e-4, icm_batch_size=128, replay_pool_size=1000000, min_pool_size=200, n_updates_per_iter=10, obs_dtype='float32', normalize_input=False, gpu_fraction=0.95, pretrained_icm=False, pretrained_icm_path=None, freeze_icm=False, **kwargs): """ :param env: Environment :param algo: Algorithm that will be used with ICM :param encoder: State encoder that maps s to f :param inverse_model: Inverse dynamics model that maps (f1, f2) to actions :param forward_model: Forward dynamics model that maps (f1, a) to f2 :param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss :param external_reward_weight: Weight that balances external reward and internal reward :param init_learning_rate: Initial learning rate of optimizer """ self.trpo = trpo self.freeze_icm = freeze_icm # Replace sampler to inject intrinsic reward self.trpo.sampler = self.get_sampler(self.trpo) gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=gpu_fraction) self.sess = tf.get_default_session() or tf.Session( config=tf.ConfigProto(gpu_options=gpu_options)) self.external_reward_weight = external_reward_weight self.summary_writer = tf.summary.FileWriter( tensorboard_path, graph=tf.get_default_graph()) self.n_updates_per_iter = n_updates_per_iter self.icm_batch_size = icm_batch_size self.act_space = env.action_space self.obs_space = env.observation_space self.pool = TRPOReplayPool(replay_pool_size, self.obs_space.flat_dim, self.act_space.flat_dim, obs_dtype=obs_dtype) self.min_pool_size = min_pool_size # Setup ICM models self.s1 = tf.placeholder(tf.float32, [None] + list(self.obs_space.shape)) self.s2 = tf.placeholder(tf.float32, [None] + list(self.obs_space.shape)) if normalize_input: s1 = self.s1 / 255.0 - 0.5 s2 = self.s2 / 255.0 - 0.5 else: s1 = self.s1 s2 = self.s2 self.asample = tf.placeholder(tf.float32, 
[None, self.act_space.flat_dim]) self.external_rewards = tf.placeholder(tf.float32, (None, )) # Hack temp_vars = set(tf.all_variables()) if pretrained_icm: with self.sess.as_default(): icm_data = joblib.load(pretrained_icm_path) _encoder = icm_data['encoder'] _forward_model = icm_data['forward_model'] _inverse_model = icm_data['inverse_model'] icm_vars = set(tf.all_variables()) - temp_vars else: icm_vars = set([]) if pretrained_icm: self._encoder = _encoder # raise NotImplementedError("Currently only supports flat observation input!") else: if len(self.obs_space.shape) == 1: if no_encoder: self._encoder = NoEncoder(self.obs_space.flat_dim, env_spec=env.spec) else: self._encoder = FullyConnectedEncoder(feature_dim, env_spec=env.spec) else: self._encoder = ConvEncoder(feature_dim, env.spec.observation_space.shape) self._encoder.sess = self.sess if not pretrained_icm: # Initialize variables for get_copy to work self.sess.run(tf.initialize_all_variables()) with self.sess.as_default(): self.encoder1 = self._encoder.get_weight_tied_copy( observation_input=s1) self.encoder2 = self._encoder.get_weight_tied_copy( observation_input=s2) if not pretrained_icm: self._inverse_model = InverseModel(feature_dim, env_spec=env.spec) self._forward_model = ForwardModel(feature_dim, env_spec=env.spec) else: self._inverse_model = _inverse_model self._forward_model = _forward_model self._inverse_model.sess = self.sess self._forward_model.sess = self.sess if not pretrained_icm: # Initialize variables for get_copy to work self.sess.run(tf.initialize_all_variables()) # Clip actions to make sure it is consistent with what get input in env clipped_asample = tf.clip_by_value(self.asample, -1.0, 1.0) with self.sess.as_default(): self.inverse_model = self._inverse_model.get_weight_tied_copy( feature_input1=self.encoder1.output, feature_input2=self.encoder2.output) self.forward_model = self._forward_model.get_weight_tied_copy( feature_input=self.encoder1.output, action_input=clipped_asample) # 
Define losses, by default it uses L2 loss if forward_cos: self.forward_loss = cos_loss(self.encoder2.output, self.forward_model.output) else: self.forward_loss = tf.reduce_mean( tf.square(self.encoder2.output - self.forward_model.output)) if isinstance(self.act_space, Box): self.inverse_loss = tf.reduce_mean( tf.square(clipped_asample - self.inverse_model.output)) elif isinstance(self.act_space, Discrete): # TODO: Implement softmax loss raise NotImplementedError else: raise NotImplementedError if forward_cos: self.internal_rewards = cos_loss(self.encoder2.output, self.forward_model.output, mean=False) else: self.internal_rewards = tf.reduce_sum( tf.square(self.encoder2.output - self.forward_model.output), axis=1) self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards) self.mean_external_rewards = tf.reduce_mean(self.external_rewards) self.total_loss = forward_weight * self.forward_loss + \ (1. - forward_weight) * self.inverse_loss self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\ minimize(self.total_loss) # Setup summaries inverse_loss_summ = tf.summary.scalar("icm_inverse_loss", self.inverse_loss) forward_loss_summ = tf.summary.scalar("icm_forward_loss", self.forward_loss) total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss) internal_rewards = tf.summary.scalar("mean_internal_rewards", self.mean_internal_rewards) external_rewards = tf.summary.scalar("mean_external_rewards", self.mean_external_rewards) # Setup env_info logs var_summ = [] self.summary = tf.summary.merge([ inverse_loss_summ, forward_loss_summ, total_loss_summ, internal_rewards, external_rewards ]) # self.summary = tf.summary.merge([inverse_loss_summ, forward_loss_summ, total_loss_summ] + var_summ) ## Initialize uninitialized variables self.sess.run( tf.initialize_variables(set(tf.all_variables()) - icm_vars))
def main(): parser = argparse.ArgumentParser() parser.add_argument('env_name', type=str, help="name of gym env") parser.add_argument('dataset_path', type=str, help="path of training and validation dataset") parser.add_argument('--tfboard_path', type=str, default='/tmp/tfboard') parser.add_argument('--tfmodel_path', type=str, default='/tmp/tfmodels') # Training parameters parser.add_argument('--val_ratio', type=float, default=0.1, help="ratio of validation sets") parser.add_argument('--num_itr', type=int, default=10000000) parser.add_argument('--val_freq', type=int, default=1000) parser.add_argument('--log_freq', type=int, default=200) parser.add_argument('--save_freq', type=int, default=5000) # ICM parameters parser.add_argument('--init_lr', type=float, default=1e-4) parser.add_argument('--forward_weight', type=float, default=0.8, help="the ratio of forward loss vs inverse loss") parser.add_argument('--cos_forward', action='store_true', help="whether to use cosine forward loss") # parser.add_argument('--norm_input', action='store_true', # help="whether to normalize observation input") args = parser.parse_args() env = TfEnv(normalize(env=GymEnv(args.env_name,record_video=False, \ log_dir='/tmp/gym_test',record_log=False))) # Get dataset dataset_names = list( map(lambda file_name: osp.join(args.dataset_path, file_name), listdir(args.dataset_path))) val_set_names = dataset_names[:int(len(dataset_names) * args.val_ratio)] train_set_names = dataset_names[int(len(dataset_names) * args.val_ratio):] train_queue = tf.train.string_input_producer(train_set_names, num_epochs=None) val_queue = tf.train.string_input_producer(val_set_names, num_epochs=None) train_obs, train_next_obs, train_action = read_and_decode( train_queue, env.observation_space.shape, env.action_space.shape) val_obs, val_next_obs, val_action = read_and_decode( val_queue, env.observation_space.shape, env.action_space.shape) # Build ICM model # if args.norm_input: # train_obs = train_obs * (1./255) - 0.5 # 
train_next_obs = train_next_obs *(1./255) - 0.5 # val_obs = val_obs * (1./255) - 0.5 # val_next_obs = val_next_obs * (1./255) - 0.5 # train_obs = tf.cast(train_obs, tf.float32) / 255.0 - 0.5 # train_next_obs = tf.cast(train_next_obs, tf.float32) / 255.0 - 0.5 # val_obs = tf.cast(val_obs, tf.float32) / 255.0 - 0.5 # val_next_obs = tf.cast(val_next_obs, tf.float32) / 255.0 - 0.5 # else: # train_obs = tf.cast(train_obs, tf.float32) # train_next_obs = tf.cast(train_next_obs, tf.float32) # val_obs = tf.cast(val_obs, tf.float32) # val_next_obs = tf.cast(val_next_obs, tf.float32) _encoder = ConvEncoder( feature_dim=256, input_shape=env.observation_space.shape, conv_filters=(64, 64, 64, 32), conv_filter_sizes=((5, 5), (5, 5), (5, 5), (3, 3)), conv_strides=(3, 2, 2, 2), conv_pads=('SAME', 'SAME', 'SAME', 'SAME'), hidden_sizes=(256, ), hidden_activation=tf.nn.elu, ) _inverse_model = InverseModel( feature_dim=256, env_spec=env.spec, hidden_sizes=(256, ), hidden_activation=tf.nn.tanh, output_activation=tf.nn.tanh, ) _forward_model = ForwardModel( feature_dim=256, env_spec=env.spec, hidden_sizes=(256, ), hidden_activation=tf.nn.elu, ) sess = tf.Session() _encoder.sess = sess _inverse_model.sess = sess _forward_model.sess = sess with sess.as_default(): # Initialize variables for get_copy to work sess.run(tf.initialize_all_variables()) train_encoder1 = _encoder.get_weight_tied_copy( observation_input=train_obs) train_encoder2 = _encoder.get_weight_tied_copy( observation_input=train_next_obs) train_inverse_model = _inverse_model.get_weight_tied_copy( feature_input1=train_encoder1.output, feature_input2=train_encoder2.output) train_forward_model = _forward_model.get_weight_tied_copy( feature_input=train_encoder1.output, action_input=train_action) val_encoder1 = _encoder.get_weight_tied_copy(observation_input=val_obs) val_encoder2 = _encoder.get_weight_tied_copy( observation_input=val_next_obs) val_inverse_model = _inverse_model.get_weight_tied_copy( 
feature_input1=val_encoder1.output, feature_input2=val_encoder2.output) val_forward_model = _forward_model.get_weight_tied_copy( feature_input=val_encoder1.output, action_input=val_action) if args.cos_forward: train_forward_loss = cos_loss(train_encoder2.output, train_forward_model.output) val_forward_loss = cos_loss(val_encoder2.output, val_forward_model.output) else: train_forward_loss = tf.reduce_mean( tf.square(train_encoder2.output - train_forward_model.output)) val_forward_loss = tf.reduce_mean( tf.square(val_encoder2.output - val_forward_model.output)) train_inverse_loss = tf.reduce_mean( tf.square(train_action - train_inverse_model.output)) val_inverse_loss = tf.reduce_mean( tf.square(val_action - val_inverse_model.output)) train_total_loss = args.forward_weight * train_forward_loss + ( 1. - args.forward_weight) * train_inverse_loss val_total_loss = args.forward_weight * val_forward_loss + ( 1. - args.forward_weight) * val_inverse_loss icm_opt = tf.train.AdamOptimizer( args.init_lr).minimize(train_total_loss) # Setup summaries summary_writer = tf.summary.FileWriter(args.tfboard_path, graph=tf.get_default_graph()) train_inverse_loss_summ = tf.summary.scalar("train/icm_inverse_loss", train_inverse_loss) train_forward_loss_summ = tf.summary.scalar("train/icm_forward_loss", train_forward_loss) train_total_loss_summ = tf.summary.scalar("train/icm_total_loss", train_total_loss) val_inverse_loss_summ = tf.summary.scalar("val/icm_inverse_loss", val_inverse_loss) val_forward_loss_summ = tf.summary.scalar("val/icm_forward_loss", val_forward_loss) val_total_loss_summ = tf.summary.scalar("val/icm_total_loss", val_total_loss) train_summary_op = tf.summary.merge([ train_inverse_loss_summ, train_forward_loss_summ, train_total_loss_summ ]) val_summary_op = tf.summary.merge([ val_inverse_loss_summ, val_forward_loss_summ, val_total_loss_summ ]) logger.log("Finished creating ICM model") sess.run(tf.initialize_all_variables()) coord = tf.train.Coordinator() threads = 
tf.train.start_queue_runners(sess=sess, coord=coord) try: for timestep in range(args.num_itr): if timestep % args.log_freq == 0: logger.log("Start itr {}".format(timestep)) _, train_summary = sess.run([icm_opt, train_summary_op]) else: sess.run(icm_opt) if timestep % args.log_freq == 0: summary_writer.add_summary(train_summary, timestep) if timestep % args.save_freq == 0: save_snapshot(_encoder, _inverse_model, _forward_model, args.tfmodel_path) if timestep % args.val_freq == 0: val_summary = sess.run(val_summary_op) summary_writer.add_summary(val_summary, timestep) except KeyboardInterrupt: print("End training...") pass coord.join(threads) sess.close()
def __init__( self, env, algo: OnlineAlgorithm, no_encoder=False, feature_dim=10, forward_weight=0.8, external_reward_weight=0.01, inverse_tanh=False, init_learning_rate=1e-4, algo_update_freq=1, **kwargs ): """ :param env: Environment :param algo: Algorithm that will be used with ICM :param encoder: State encoder that maps s to f :param inverse_model: Inverse dynamics model that maps (f1, f2) to actions :param forward_model: Forward dynamics model that maps (f1, a) to f2 :param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss :param external_reward_weight: Weight that balances external reward and internal reward :param init_learning_rate: Initial learning rate of optimizer """ self.algo = algo gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2) self.sess = self.algo.sess or tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) self.external_reward_weight = external_reward_weight self.summary_writer = self.algo.summary_writer self.algo_update_freq = algo_update_freq act_space = env.action_space obs_space = env.observation_space # Setup ICM models self.s1 = tf.placeholder(tf.float32, [None] + list(obs_space.shape)) self.s2 = tf.placeholder(tf.float32, [None] + list(obs_space.shape)) self.asample = tf.placeholder(tf.float32, [None, act_space.flat_dim]) self.external_rewards = tf.placeholder(tf.float32, (None,)) if len(obs_space.shape) == 1: if no_encoder: self._encoder = NoEncoder(obs_space.flat_dim, env_spec=env.spec) else: self._encoder = FullyConnectedEncoder(feature_dim, env_spec=env.spec) else: # TODO: implement conv encoder raise NotImplementedError("Currently only supports flat observation input!") self._encoder.sess = self.sess # Initialize variables for get_copy to work self.sess.run(tf.initialize_all_variables()) with self.sess.as_default(): self.encoder1 = self._encoder.get_weight_tied_copy(observation_input=self.s1) self.encoder2 = self._encoder.get_weight_tied_copy(observation_input=self.s2) 
self._inverse_model = InverseModel(feature_dim, env_spec=env.spec) self._forward_model = ForwardModel(feature_dim, env_spec=env.spec) self._inverse_model.sess = self.sess self._forward_model.sess = self.sess # Initialize variables for get_copy to work self.sess.run(tf.initialize_all_variables()) with self.sess.as_default(): self.inverse_model = self._inverse_model.get_weight_tied_copy(feature_input1=self.encoder1.output, feature_input2=self.encoder2.output) self.forward_model = self._forward_model.get_weight_tied_copy(feature_input=self.encoder1.output, action_input=self.asample) # Define losses self.forward_loss = tf.reduce_mean(tf.square(self.encoder2.output - self.forward_model.output)) # self.forward_loss = tf.nn.l2_loss(self.encoder2.output - self.forward_model.output) if isinstance(act_space, Box): self.inverse_loss = tf.reduce_mean(tf.square(self.asample - self.inverse_model.output)) elif isinstance(act_space, Discrete): # TODO: Implement softmax loss raise NotImplementedError else: raise NotImplementedError self.internal_rewards = tf.reduce_sum(tf.square(self.encoder2.output - self.forward_model.output), axis=1) self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards) self.mean_external_rewards = tf.reduce_mean(self.external_rewards) self.total_loss = forward_weight * self.forward_loss + \ (1. 
- forward_weight) * self.inverse_loss self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\ minimize(self.total_loss) # Setup summaries inverse_loss_summ = tf.summary.scalar("icm_inverse_loss", self.inverse_loss) forward_loss_summ = tf.summary.scalar("icm_forward_loss", self.forward_loss) total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss) internal_rewards = tf.summary.scalar("mean_internal_rewards", self.mean_internal_rewards) external_rewards = tf.summary.scalar("mean_external_rewards_training", self.mean_external_rewards) var_summ = [] for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES): var_summ.append(tf.summary.histogram(var.op.name, var)) self.summary = tf.summary.merge([inverse_loss_summ, forward_loss_summ, total_loss_summ,\ internal_rewards, external_rewards]) # Initialize variables self.sess.run(tf.initialize_all_variables())
class ICM(RLAlgorithm): """ RL with intrinsic curiosity module """ def __init__(self, env, trpo: TRPO, tensorboard_path, no_encoder=False, feature_dim=10, forward_weight=0.8, external_reward_weight=0.01, forward_cos=False, init_learning_rate=1e-4, icm_batch_size=128, replay_pool_size=1000000, min_pool_size=200, n_updates_per_iter=10, obs_dtype='float32', normalize_input=False, gpu_fraction=0.95, pretrained_icm=False, pretrained_icm_path=None, freeze_icm=False, **kwargs): """ :param env: Environment :param algo: Algorithm that will be used with ICM :param encoder: State encoder that maps s to f :param inverse_model: Inverse dynamics model that maps (f1, f2) to actions :param forward_model: Forward dynamics model that maps (f1, a) to f2 :param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss :param external_reward_weight: Weight that balances external reward and internal reward :param init_learning_rate: Initial learning rate of optimizer """ self.trpo = trpo self.freeze_icm = freeze_icm # Replace sampler to inject intrinsic reward self.trpo.sampler = self.get_sampler(self.trpo) gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=gpu_fraction) self.sess = tf.get_default_session() or tf.Session( config=tf.ConfigProto(gpu_options=gpu_options)) self.external_reward_weight = external_reward_weight self.summary_writer = tf.summary.FileWriter( tensorboard_path, graph=tf.get_default_graph()) self.n_updates_per_iter = n_updates_per_iter self.icm_batch_size = icm_batch_size self.act_space = env.action_space self.obs_space = env.observation_space self.pool = TRPOReplayPool(replay_pool_size, self.obs_space.flat_dim, self.act_space.flat_dim, obs_dtype=obs_dtype) self.min_pool_size = min_pool_size # Setup ICM models self.s1 = tf.placeholder(tf.float32, [None] + list(self.obs_space.shape)) self.s2 = tf.placeholder(tf.float32, [None] + list(self.obs_space.shape)) if normalize_input: s1 = self.s1 / 255.0 - 0.5 s2 = self.s2 / 255.0 - 0.5 else: 
s1 = self.s1 s2 = self.s2 self.asample = tf.placeholder(tf.float32, [None, self.act_space.flat_dim]) self.external_rewards = tf.placeholder(tf.float32, (None, )) # Hack temp_vars = set(tf.all_variables()) if pretrained_icm: with self.sess.as_default(): icm_data = joblib.load(pretrained_icm_path) _encoder = icm_data['encoder'] _forward_model = icm_data['forward_model'] _inverse_model = icm_data['inverse_model'] icm_vars = set(tf.all_variables()) - temp_vars else: icm_vars = set([]) if pretrained_icm: self._encoder = _encoder # raise NotImplementedError("Currently only supports flat observation input!") else: if len(self.obs_space.shape) == 1: if no_encoder: self._encoder = NoEncoder(self.obs_space.flat_dim, env_spec=env.spec) else: self._encoder = FullyConnectedEncoder(feature_dim, env_spec=env.spec) else: self._encoder = ConvEncoder(feature_dim, env.spec.observation_space.shape) self._encoder.sess = self.sess if not pretrained_icm: # Initialize variables for get_copy to work self.sess.run(tf.initialize_all_variables()) with self.sess.as_default(): self.encoder1 = self._encoder.get_weight_tied_copy( observation_input=s1) self.encoder2 = self._encoder.get_weight_tied_copy( observation_input=s2) if not pretrained_icm: self._inverse_model = InverseModel(feature_dim, env_spec=env.spec) self._forward_model = ForwardModel(feature_dim, env_spec=env.spec) else: self._inverse_model = _inverse_model self._forward_model = _forward_model self._inverse_model.sess = self.sess self._forward_model.sess = self.sess if not pretrained_icm: # Initialize variables for get_copy to work self.sess.run(tf.initialize_all_variables()) # Clip actions to make sure it is consistent with what get input in env clipped_asample = tf.clip_by_value(self.asample, -1.0, 1.0) with self.sess.as_default(): self.inverse_model = self._inverse_model.get_weight_tied_copy( feature_input1=self.encoder1.output, feature_input2=self.encoder2.output) self.forward_model = self._forward_model.get_weight_tied_copy( 
feature_input=self.encoder1.output, action_input=clipped_asample) # Define losses, by default it uses L2 loss if forward_cos: self.forward_loss = cos_loss(self.encoder2.output, self.forward_model.output) else: self.forward_loss = tf.reduce_mean( tf.square(self.encoder2.output - self.forward_model.output)) if isinstance(self.act_space, Box): self.inverse_loss = tf.reduce_mean( tf.square(clipped_asample - self.inverse_model.output)) elif isinstance(self.act_space, Discrete): # TODO: Implement softmax loss raise NotImplementedError else: raise NotImplementedError if forward_cos: self.internal_rewards = cos_loss(self.encoder2.output, self.forward_model.output, mean=False) else: self.internal_rewards = tf.reduce_sum( tf.square(self.encoder2.output - self.forward_model.output), axis=1) self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards) self.mean_external_rewards = tf.reduce_mean(self.external_rewards) self.total_loss = forward_weight * self.forward_loss + \ (1. - forward_weight) * self.inverse_loss self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\ minimize(self.total_loss) # Setup summaries inverse_loss_summ = tf.summary.scalar("icm_inverse_loss", self.inverse_loss) forward_loss_summ = tf.summary.scalar("icm_forward_loss", self.forward_loss) total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss) internal_rewards = tf.summary.scalar("mean_internal_rewards", self.mean_internal_rewards) external_rewards = tf.summary.scalar("mean_external_rewards", self.mean_external_rewards) # Setup env_info logs var_summ = [] self.summary = tf.summary.merge([ inverse_loss_summ, forward_loss_summ, total_loss_summ, internal_rewards, external_rewards ]) # self.summary = tf.summary.merge([inverse_loss_summ, forward_loss_summ, total_loss_summ] + var_summ) ## Initialize uninitialized variables self.sess.run( tf.initialize_variables(set(tf.all_variables()) - icm_vars)) @overrides def train(self): with self.sess.as_default(): self.trpo.start_worker() for 
itr in range(self.trpo.start_itr, self.trpo.n_itr): paths = self.trpo.obtain_samples(itr) modified_paths = self.process_paths(itr, paths) samples_data = self.trpo.process_samples(itr, modified_paths) if self.pool.size >= self.min_pool_size: if self.freeze_icm: logger.log("Freezing ICM") else: logger.log("ICM Training started") start_time = time.time() for _ in range(self.n_updates_per_iter): self.train_icm(_ + itr * self.n_updates_per_iter) logger.log("ICM Training finished. Time: {0}".format( time.time() - start_time)) for path in samples_data['paths']: path_len = len(path['rewards']) for i in range(path_len): obs = path['observations'][i] act = path['actions'][i] term = (i == path_len - 1) rew = 0.0 self.pool.add_sample(obs, act, rew, term) # pdb.set_trace() self.trpo.log_diagnostics(paths) self.trpo.optimize_policy(itr, samples_data) params = self.trpo.get_itr_snapshot(itr, samples_data) params['encoder'] = self._encoder params['inverse_model'] = self._inverse_model params['forward_model'] = self._forward_model logger.save_itr_params(itr, params) logger.dump_tabular(with_prefix=False) self.trpo.shutdown_worker() def train_icm(self, timestep): batch = self.pool.random_batch(self.icm_batch_size) obs = self.reshape_obs(batch['observations']) next_obs = self.reshape_obs(batch['next_observations']) acts = batch['actions'] rewards = batch['rewards'] feed_dict = self._update_feed_dict(rewards, obs, next_obs, acts) ops = [self.summary, self.icm_opt] # ops = [self.icm_opt] results = self.sess.run(ops, feed_dict=feed_dict) if timestep % TENSORBOARD_PERIOD == 0: self.summary_writer.add_summary(results[0], timestep) def process_paths(self, itr, paths): modified_paths = copy(paths) for path in modified_paths: obs = self.reshape_obs(path['observations'][:-1]) acts = path['actions'][:-1] next_obs = self.reshape_obs(path['observations'][1:]) internal_rewards = self.sess.run(self.internal_rewards, feed_dict={ self.s1: obs, self.s2: next_obs, self.asample: acts }) 
internal_rewards = np.append(internal_rewards, 0.0) path['t_rewards'] = self.external_reward_weight * path['rewards'] \ + (1. - self.external_reward_weight) * internal_rewards return modified_paths def reshape_obs(self, obs): if len(self.obs_space.shape) >= 1: length = obs.shape[0] obs = obs.reshape([length] + list(self.obs_space.shape)) return obs def _update_feed_dict(self, sampled_rewards, sampled_obs, sampled_next_obs, sampled_actions): return { self.s1: sampled_obs, self.s2: sampled_next_obs, self.asample: sampled_actions, self.external_rewards: sampled_rewards, } def get_sampler(self, trpo): from sandbox.rocky.tf.samplers.batch_sampler import BatchSampler as OldBatchSampler from sandbox.rocky.tf.samplers.vectorized_sampler import VectorizedSampler as OldVectorizedSampler if isinstance(trpo.sampler, OldBatchSampler): return BatchSampler(trpo) elif isinstance(trpo.sampler, OldVectorizedSampler): return VectorizedSampler(trpo) else: raise NotImplementedError( "Only supports batch sampler and vectorized sampler right now!" )
class ICM(RLAlgorithm):
    """ RL with inverse curiosity module.

    Wraps an online RL algorithm (e.g. DDPG-style) and augments its reward
    with the forward-model prediction error of an inverse/forward dynamics
    pair trained on (s, a, s') transitions.
    """

    def __init__(
            self,
            env,
            algo: OnlineAlgorithm,
            no_encoder=False,
            feature_dim=10,
            forward_weight=0.8,
            external_reward_weight=0.01,
            inverse_tanh=False,
            init_learning_rate=1e-4,
            algo_update_freq=1,
            **kwargs
    ):
        """
        :param env: Environment
        :param algo: Algorithm that will be used with ICM
        :param encoder: State encoder that maps s to f
        :param inverse_model: Inverse dynamics model that maps (f1, f2) to actions
        :param forward_model: Forward dynamics model that maps (f1, a) to f2
        :param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss
        :param external_reward_weight: Weight that balances external reward and internal reward
        :param init_learning_rate: Initial learning rate of optimizer

        NOTE(review): `inverse_tanh` is accepted but never read in this
        variant -- confirm whether it should select the inverse model's
        output activation as in the other ICM class in this file.
        """
        self.algo = algo
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
        # Reuse the wrapped algorithm's session when it has one.
        self.sess = self.algo.sess or tf.Session(
            config=tf.ConfigProto(gpu_options=gpu_options))
        self.external_reward_weight = external_reward_weight
        self.summary_writer = self.algo.summary_writer
        self.algo_update_freq = algo_update_freq
        act_space = env.action_space
        obs_space = env.observation_space

        # Setup ICM models
        self.s1 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.s2 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.asample = tf.placeholder(tf.float32, [None, act_space.flat_dim])
        self.external_rewards = tf.placeholder(tf.float32, (None,))

        if len(obs_space.shape) == 1:
            if no_encoder:
                # Identity "encoder": features are the raw flat observation.
                self._encoder = NoEncoder(obs_space.flat_dim, env_spec=env.spec)
            else:
                self._encoder = FullyConnectedEncoder(feature_dim,
                                                      env_spec=env.spec)
        else:
            # TODO: implement conv encoder
            raise NotImplementedError(
                "Currently only supports flat observation input!")
        self._encoder.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            # Two weight-tied encoder copies: one for s, one for s'.
            self.encoder1 = self._encoder.get_weight_tied_copy(
                observation_input=self.s1)
            self.encoder2 = self._encoder.get_weight_tied_copy(
                observation_input=self.s2)

        self._inverse_model = InverseModel(feature_dim, env_spec=env.spec)
        self._forward_model = ForwardModel(feature_dim, env_spec=env.spec)
        self._inverse_model.sess = self.sess
        self._forward_model.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            self.inverse_model = self._inverse_model.get_weight_tied_copy(
                feature_input1=self.encoder1.output,
                feature_input2=self.encoder2.output)
            self.forward_model = self._forward_model.get_weight_tied_copy(
                feature_input=self.encoder1.output,
                action_input=self.asample)

        # Define losses
        # Forward loss: MSE between the predicted and the actual next feature.
        self.forward_loss = tf.reduce_mean(
            tf.square(self.encoder2.output - self.forward_model.output))
        # self.forward_loss = tf.nn.l2_loss(self.encoder2.output - self.forward_model.output)
        if isinstance(act_space, Box):
            # Continuous actions: inverse loss is MSE against the true action.
            self.inverse_loss = tf.reduce_mean(
                tf.square(self.asample - self.inverse_model.output))
        elif isinstance(act_space, Discrete):
            # TODO: Implement softmax loss
            raise NotImplementedError
        else:
            raise NotImplementedError

        # Per-sample curiosity: forward prediction error summed over features.
        self.internal_rewards = tf.reduce_sum(
            tf.square(self.encoder2.output - self.forward_model.output),
            axis=1)
        self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards)
        self.mean_external_rewards = tf.reduce_mean(self.external_rewards)
        self.total_loss = forward_weight * self.forward_loss + \
            (1. - forward_weight) * self.inverse_loss
        self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\
            minimize(self.total_loss)

        # Setup summaries
        inverse_loss_summ = tf.summary.scalar("icm_inverse_loss",
                                              self.inverse_loss)
        forward_loss_summ = tf.summary.scalar("icm_forward_loss",
                                              self.forward_loss)
        total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss)
        internal_rewards = tf.summary.scalar("mean_internal_rewards",
                                             self.mean_internal_rewards)
        external_rewards = tf.summary.scalar("mean_external_rewards_training",
                                             self.mean_external_rewards)
        # NOTE(review): the weight histograms below are built but not merged
        # into `self.summary` -- confirm whether that is intentional.
        var_summ = []
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            var_summ.append(tf.summary.histogram(var.op.name, var))
        self.summary = tf.summary.merge([inverse_loss_summ, forward_loss_summ, total_loss_summ,\
            internal_rewards, external_rewards])

        # Initialize variables
        self.sess.run(tf.initialize_all_variables())

    @overrides
    def train(self):
        """Run the wrapped online algorithm's training loop with ICM rewards.

        Mirrors the wrapped algorithm's own loop (sampling, replay pool,
        per-step updates) but routes reward mixing through `_do_training`.
        Returns the wrapped algorithm's final statistics.
        """
        with self.sess.as_default():
            self.algo._init_training()
            self.algo._start_worker()
            self.algo._switch_to_training_mode()
            observation = self.algo.training_env.reset()
            self.algo.exploration_strategy.reset()
            itr = 0
            path_length = 0
            path_return = 0
            for epoch in range(self.algo.n_epochs):
                logger.push_prefix('Epoch #%d | ' % epoch)
                logger.log("Training started")
                start_time = time.time()
                for t in range(self.algo.epoch_length):
                    with self.algo._eval_then_training_mode():
                        # Bug here!!!!!!
                        # action, _ = self.algo.policy.get_action(observation)
                        action = self.algo.exploration_strategy.get_action(
                            itr, observation, self.algo.policy)
                    if self.algo.render:
                        self.algo.training_env.render()
                    next_ob, raw_reward, terminal, _ = self.algo.training_env.step(
                        self.algo.process_action(action)
                    )
                    # Some envs return a Nx1 vector for the observation
                    next_ob = next_ob.flatten()
                    reward = raw_reward * self.algo.scale_reward
                    # # JUST FOR DEBUG: save data
                    # save_data("/data0/dianchen/forward_data", \
                    #     observation,
                    #     action,
                    #     next_ob)
                    path_length += 1
                    path_return += reward
                    self.algo.pool.add_sample(observation, action, reward,
                                              terminal, False)
                    if terminal or path_length >= self.algo.max_path_length:
                        # Close the episode with a final (dummy-action) sample.
                        self.algo.pool.add_sample(next_ob,
                                                  np.zeros_like(action),
                                                  np.zeros_like(reward),
                                                  np.zeros_like(terminal),
                                                  True)
                        observation = self.algo.training_env.reset()
                        self.algo.exploration_strategy.reset()
                        self.algo.es_path_returns.append(path_return)
                        path_length = 0
                        path_return = 0
                    else:
                        observation = next_ob
                    if self.algo.pool.size >= self.algo.min_pool_size:
                        for _ in range(self.algo.n_updates_per_time_step):
                            self._do_training(
                                epoch * self.algo.epoch_length + t)
                    itr += 1
                logger.log("Training finished. Time: {0}".format(
                    time.time() - start_time))
                with self.algo._eval_then_training_mode():
                    if self.algo.pool.size >= self.algo.min_pool_size:
                        start_time = time.time()
                        if self.algo.n_eval_samples > 0:
                            # self.algo.evaluate(epoch, self.algo.es_path_returns)
                            # self.algo.es_path_returns = []
                            pass
                        params = self.get_epoch_snapshot(epoch)
                        logger.log(
                            "Eval time: {0}".format(time.time() - start_time))
                        logger.save_itr_params(epoch, params)
                    logger.dump_tabular(with_prefix=False)
                    logger.pop_prefix()
            self.algo._switch_to_eval_mode()
            self.algo.training_env.terminate()
            self.algo._shutdown_worker()
            return self.algo.last_statistics

    def _do_training(self, timestep):
        """One joint update: train ICM (every `algo_update_freq` steps) and
        train the wrapped algorithm on curiosity-mixed rewards.
        """
        minibatch = self.algo.pool.random_batch(self.algo.batch_size)
        sampled_obs = minibatch["observations"]
        sampled_terminals = minibatch['terminals']
        sampled_next_obs = minibatch["next_observations"]
        sampled_actions = minibatch["actions"]
        sampled_rewards = minibatch['rewards']
        icm_feed_dict = self._update_feed_dict(sampled_rewards, sampled_obs,
                                               sampled_next_obs,
                                               sampled_actions)
        algo_ops = self.algo._get_training_ops()
        if timestep % self.algo_update_freq == 0:
            icm_ops = [self.icm_opt]
        else:
            icm_ops = []
        # Always fetch summary + per-sample curiosity; optimizer op is optional.
        icm_results = self.sess.run([self.summary, self.internal_rewards] +
                                    icm_ops,
                                    feed_dict=icm_feed_dict)
        icm_summary = icm_results[0]
        internal_rewards = icm_results[1]
        # Add up internal and external rewards
        algo_feed_dict = self.algo._update_feed_dict(self.external_reward_weight * sampled_rewards + \
            (1. - self.external_reward_weight) * internal_rewards,
            sampled_terminals, sampled_obs, sampled_actions, sampled_next_obs)
        # If algo has summary, run it.
        # TODO: Clean this code. It is a mess right now
        if self.algo.summary is not None:
            algo_ops = [self.algo.summary] + algo_ops
        algo_results = self.sess.run(algo_ops, feed_dict=algo_feed_dict)
        if self.algo.summary is not None:
            algo_summary = algo_results[0]
        if timestep % TENSORBOARD_PERIOD == 0:
            if self.algo.summary is not None:
                self.summary_writer.add_summary(algo_summary, timestep)
            self.summary_writer.add_summary(icm_summary, timestep)

    def _update_feed_dict(self, sampled_rewards, sampled_obs,
                          sampled_next_obs, sampled_actions):
        """Build the feed dict for the ICM placeholders."""
        return {
            self.s1: sampled_obs,
            self.s2: sampled_next_obs,
            self.asample: sampled_actions,
            self.external_rewards: sampled_rewards,
        }

    def get_epoch_snapshot(self, epoch):
        """Return the wrapped algorithm's snapshot plus the ICM sub-models."""
        snapshot = self.algo.get_epoch_snapshot(epoch)
        snapshot['encoder'] = self._encoder
        snapshot['inverse_model'] = self._inverse_model
        snapshot['forward_model'] = self._forward_model
        return snapshot
def main():
    """Train an ICM dynamics model offline from recorded transition datasets.

    Reads three dataset directories (training, random validation, contact
    validation), builds weight-tied encoder / inverse-model / forward-model
    copies over each split's input pipeline, optimizes the combined ICM loss
    on the training split, and periodically logs tensorboard summaries and
    saves model snapshots.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str, help="name of gym env")
    parser.add_argument('dataset_path', type=str,
                        help="path of training and validation dataset")
    parser.add_argument('val_random_data', type=str,
                        help="path of training and validation dataset")
    parser.add_argument('val_contact_data', type=str,
                        help="path of training and validation dataset")
    # NOTE(review): `default=` is ignored for positional arguments; both paths
    # are required on the command line.
    parser.add_argument('tfboard_path', type=str, default='/tmp/tfboard')
    parser.add_argument('tfmodel_path', type=str, default='/tmp/tfmodels')
    parser.add_argument('--restore', action='store_true')
    # Training parameters
    parser.add_argument('--num_itr', type=int, default=10000000)
    parser.add_argument('--val_freq', type=int, default=200)
    parser.add_argument('--log_freq', type=int, default=50)
    parser.add_argument('--save_freq', type=int, default=5000)
    # ICM parameters
    parser.add_argument('--init_lr', type=float, default=2e-3)
    parser.add_argument('--forward_weight', type=float, default=0.5,
                        help="the ratio of forward loss vs inverse loss")
    parser.add_argument('--cos_forward', action='store_true',
                        help="whether to use cosine forward loss")
    args = parser.parse_args()

    # Get dataset: one file list per split.
    train_set_names = [
        osp.join(args.dataset_path, f) for f in listdir(args.dataset_path)
    ]
    val_random_set_names = [
        osp.join(args.val_random_data, f) for f in listdir(args.val_random_data)
    ]
    val_contact_set_names = [
        osp.join(args.val_contact_data, f)
        for f in listdir(args.val_contact_data)
    ]

    obs_shape = OBS_SHAPE_MAP[args.env_name]
    action_dim = ACTION_DIM_MAP[args.env_name]

    train_obs, train_next_obs, train_action = inputs(train_set_names,
                                                     obs_shape,
                                                     train=True)
    val_random_obs, val_random_next_obs, val_random_action = inputs(
        val_random_set_names, obs_shape, train=False)
    val_contact_obs, val_contact_next_obs, val_contact_action = inputs(
        val_contact_set_names, obs_shape, train=False)

    if args.restore:
        models_dict = joblib.load(args.tfmodel_path)
        _encoder = models_dict['encoder']
        # BUG FIX: was `model.dict['inverse_model']` / `model.dict['forward_model']`,
        # which raised NameError (`model` was never defined).
        _inverse_model = models_dict['inverse_model']
        _forward_model = models_dict['forward_model']
    else:
        # Identity encoder: features are the raw (state-space) observation.
        _encoder = NoEncoder(obs_shape, observation_dim=[obs_shape])
        _inverse_model = InverseModel(
            feature_dim=obs_shape,
            action_dim=action_dim,
            hidden_sizes=(256, 256),
            hidden_activation=tf.nn.elu,
            output_activation=tf.nn.tanh,
        )
        _forward_model = ForwardModel(
            feature_dim=obs_shape,
            action_dim=action_dim,
            # NOTE(review): (256, 257) looks like a typo for (256, 256); kept
            # as-is so restored snapshots stay shape-compatible -- confirm.
            hidden_sizes=(256, 257),
            hidden_activation=tf.nn.elu,
        )

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    _encoder.sess = sess
    _inverse_model.sess = sess
    _forward_model.sess = sess

    with sess.as_default():
        # Initialize variables for get_copy to work
        sess.run(tf.initialize_all_variables())

        # Weight-tied copies over the training input pipeline.
        train_encoder1 = _encoder.get_weight_tied_copy(
            observation_input=train_obs)
        train_encoder2 = _encoder.get_weight_tied_copy(
            observation_input=train_next_obs)
        train_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=train_encoder1.output,
            feature_input2=train_encoder2.output)
        train_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=train_encoder1.output, action_input=train_action)

        # Weight-tied copies over the "random" validation pipeline.
        val_random_encoder1 = _encoder.get_weight_tied_copy(
            observation_input=val_random_obs)
        val_random_encoder2 = _encoder.get_weight_tied_copy(
            observation_input=val_random_next_obs)
        val_random_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=val_random_encoder1.output,
            feature_input2=val_random_encoder2.output)
        val_random_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=val_random_encoder1.output,
            action_input=val_random_action)

        # Weight-tied copies over the "contact" validation pipeline.
        val_contact_encoder1 = _encoder.get_weight_tied_copy(
            observation_input=val_contact_obs)
        val_contact_encoder2 = _encoder.get_weight_tied_copy(
            observation_input=val_contact_next_obs)
        val_contact_inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=val_contact_encoder1.output,
            feature_input2=val_contact_encoder2.output)
        val_contact_forward_model = _forward_model.get_weight_tied_copy(
            feature_input=val_contact_encoder1.output,
            action_input=val_contact_action)

        # Total forward losses: cosine distance or mean squared error.
        if args.cos_forward:
            # BUG FIX: this branch previously referenced undefined
            # `val_encoder2` / `val_forward_model` and never defined the
            # val_random_* / val_contact_* losses consumed below.
            train_forward_loss = cos_loss(train_encoder2.output,
                                          train_forward_model.output)
            val_random_forward_loss = cos_loss(val_random_encoder2.output,
                                               val_random_forward_model.output)
            val_contact_forward_loss = cos_loss(
                val_contact_encoder2.output, val_contact_forward_model.output)
        else:
            train_forward_loss = tf.reduce_mean(
                tf.square(train_encoder2.output - train_forward_model.output))
            val_random_forward_loss = tf.reduce_mean(
                tf.square(val_random_encoder2.output -
                          val_random_forward_model.output))
            val_contact_forward_loss = tf.reduce_mean(
                tf.square(val_contact_encoder2.output -
                          val_contact_forward_model.output))

        # Per-part squared-error diagnostics -- use only if features are raw
        # state! (dims [:4] presumably arm joints, [4:7] box pose -- TODO
        # confirm). Moved out of the MSE-only branch so the summaries below
        # also work with --cos_forward.
        train_forward_loss_arm = tf.reduce_mean(
            tf.square(train_encoder2.output[:, :4] -
                      train_forward_model.output[:, :4]))
        train_forward_loss_box = tf.reduce_mean(
            tf.square(train_encoder2.output[:, 4:7] -
                      train_forward_model.output[:, 4:7]))
        val_random_forward_loss_arm = tf.reduce_mean(
            tf.square(val_random_encoder2.output[:, :4] -
                      val_random_forward_model.output[:, :4]))
        val_random_forward_loss_box = tf.reduce_mean(
            tf.square(val_random_encoder2.output[:, 4:7] -
                      val_random_forward_model.output[:, 4:7]))
        val_contact_forward_loss_arm = tf.reduce_mean(
            tf.square(val_contact_encoder2.output[:, :4] -
                      val_contact_forward_model.output[:, :4]))
        val_contact_forward_loss_box = tf.reduce_mean(
            tf.square(val_contact_encoder2.output[:, 4:7] -
                      val_contact_forward_model.output[:, 4:7]))

        # Per-action-dimension inverse losses (mean over the batch axis).
        train_inverse_losses = tf.reduce_mean(
            tf.square(train_action - train_inverse_model.output), axis=0)
        val_random_inverse_losses = tf.reduce_mean(
            tf.square(val_random_action - val_random_inverse_model.output),
            axis=0)
        val_contact_inverse_losses = tf.reduce_mean(
            tf.square(val_contact_action - val_contact_inverse_model.output),
            axis=0)

        train_inverse_separate_summ = []
        val_random_inverse_separate_summ = []
        val_contact_inverse_separate_summ = []
        for joint_idx in range(action_dim):
            train_inverse_separate_summ.append(
                tf.summary.scalar(
                    "train/icm_inverse_loss/joint_{}".format(joint_idx),
                    train_inverse_losses[joint_idx]))
            val_random_inverse_separate_summ.append(
                tf.summary.scalar(
                    "random_val/icm_inverse_random_loss/joint_{}".format(
                        joint_idx), val_random_inverse_losses[joint_idx]))
            val_contact_inverse_separate_summ.append(
                tf.summary.scalar(
                    "contact_val/icm_inverse_random_loss/joint_{}".format(
                        joint_idx), val_contact_inverse_losses[joint_idx]))

        train_inverse_loss = tf.reduce_mean(train_inverse_losses)
        val_random_inverse_loss = tf.reduce_mean(val_random_inverse_losses)
        val_contact_inverse_loss = tf.reduce_mean(val_contact_inverse_losses)

        # Convex combination of forward and inverse losses.
        train_total_loss = args.forward_weight * train_forward_loss + (
            1. - args.forward_weight) * train_inverse_loss
        val_random_total_loss = args.forward_weight * val_random_forward_loss + (
            1. - args.forward_weight) * val_random_inverse_loss
        val_contact_total_loss = args.forward_weight * val_contact_forward_loss + (
            1. - args.forward_weight) * val_contact_inverse_loss

        icm_opt = tf.train.AdamOptimizer(
            args.init_lr).minimize(train_total_loss)

        # Data diagnostics: variance of observations across feature dims.
        _, train_data_forward_var = tf.nn.moments(train_obs, axes=[1])
        _, train_data_box_var = tf.nn.moments(train_obs[:, 4:7], axes=[1])

        # Setup summaries
        summary_writer = tf.summary.FileWriter(args.tfboard_path,
                                               graph=tf.get_default_graph())
        train_forward_loss_arm_summ = tf.summary.scalar(
            "train/forward_loss_arm", train_forward_loss_arm)
        train_forward_loss_box_summ = tf.summary.scalar(
            "train/forward_loss_box", train_forward_loss_box)
        train_inverse_loss_summ = tf.summary.scalar(
            "train/icm_inverse_loss/total_mean", train_inverse_loss)
        train_forward_loss_summ = tf.summary.scalar("train/icm_forward_loss",
                                                    train_forward_loss)
        train_total_loss_summ = tf.summary.scalar("train/icm_total_loss",
                                                  train_total_loss)
        random_val_forward_loss_arm_summ = tf.summary.scalar(
            "random_val/forward_loss_arm", val_random_forward_loss_arm)
        random_val_forward_loss_box_summ = tf.summary.scalar(
            "random_val/forward_loss_box", val_random_forward_loss_box)
        random_val_inverse_loss_summ = tf.summary.scalar(
            "random_val/icm_inverse_loss/total_mean", val_random_inverse_loss)
        random_val_forward_loss_summ = tf.summary.scalar(
            "random_val/icm_forward_loss", val_random_forward_loss)
        random_val_total_loss_summ = tf.summary.scalar(
            "random_val/icm_total_loss", val_random_total_loss)
        contact_val_forward_loss_arm_summ = tf.summary.scalar(
            "contact_val/forward_loss_arm", val_contact_forward_loss_arm)
        contact_val_forward_loss_box_summ = tf.summary.scalar(
            "contact_val/forward_loss_box", val_contact_forward_loss_box)
        contact_val_inverse_loss_summ = tf.summary.scalar(
            "contact_val/icm_inverse_loss/total_mean",
            val_contact_inverse_loss)
        contact_val_forward_loss_summ = tf.summary.scalar(
            "contact_val/icm_forward_loss", val_contact_forward_loss)
        contact_val_total_loss_summ = tf.summary.scalar(
            "contact_val/icm_total_loss", val_contact_total_loss)
        forward_data_variance_summ = tf.summary.scalar(
            "training_data_forward_variance",
            tf.reduce_mean(train_data_forward_var))
        forward_data_box_variance_summ = tf.summary.scalar(
            "training_data_forward_box_variance",
            tf.reduce_mean(train_data_box_var))

        train_summary_op = tf.summary.merge([
            train_inverse_loss_summ,
            train_forward_loss_summ,
            train_forward_loss_arm_summ,
            train_forward_loss_box_summ,
            train_total_loss_summ,
            forward_data_variance_summ,
            forward_data_box_variance_summ,
        ] + train_inverse_separate_summ)
        val_summary_op = tf.summary.merge([
            random_val_forward_loss_arm_summ,
            random_val_forward_loss_box_summ,
            random_val_inverse_loss_summ,
            random_val_forward_loss_summ,
            random_val_total_loss_summ,
            contact_val_forward_loss_arm_summ,
            contact_val_forward_loss_box_summ,
            contact_val_inverse_loss_summ,
            contact_val_forward_loss_summ,
            contact_val_total_loss_summ,
        ] + val_random_inverse_separate_summ + val_contact_inverse_separate_summ)

    logger.log("Finished creating ICM model")
    sess.run(tf.initialize_all_variables())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        for timestep in range(args.num_itr):
            if timestep % args.log_freq == 0:
                logger.log("Start itr {}".format(timestep))
                _, train_summary = sess.run([icm_opt, train_summary_op])
                summary_writer.add_summary(train_summary, timestep)
            else:
                sess.run(icm_opt)
            if timestep % args.save_freq == 0:
                save_snapshot(_encoder, _inverse_model, _forward_model,
                              args.tfmodel_path)
            if timestep % args.val_freq == 0:
                val_summary = sess.run(val_summary_op)
                summary_writer.add_summary(val_summary, timestep)
    except KeyboardInterrupt:
        print("End training...")
    finally:
        # BUG FIX: signal queue-runner threads to stop before joining;
        # without request_stop() coord.join() can hang on a clean exit.
        coord.request_stop()
        coord.join(threads)
        sess.close()
class ICM(RLAlgorithm):
    """ RL with inverse curiosity module.

    TRPO variant: replaces the TRPO sampler's rewards with a mix of external
    reward and forward-model prediction error, and trains the ICM models from
    a replay pool of on-policy transitions.
    """

    def __init__(self,
                 env,
                 trpo: TRPO,
                 tensorboard_path,
                 no_encoder=False,
                 feature_dim=10,
                 forward_weight=0.8,
                 external_reward_weight=0.01,
                 inverse_tanh=True,
                 init_learning_rate=1e-4,
                 icm_batch_size=128,
                 replay_pool_size=1000000,
                 min_pool_size=1000,
                 n_updates_per_sample=500,
                 **kwargs):
        """
        :param env: Environment
        :param algo: Algorithm that will be used with ICM
        :param encoder: State encoder that maps s to f
        :param inverse_model: Inverse dynamics model that maps (f1, f2) to actions
        :param forward_model: Forward dynamics model that maps (f1, a) to f2
        :param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss
        :param external_reward_weight: Weight that balances external reward and internal reward
        :param init_learning_rate: Initial learning rate of optimizer

        NOTE(review): `inverse_tanh` is accepted but not read in this variant;
        confirm whether it should configure the inverse model's activation.
        """
        self.trpo = trpo
        self.trpo.sampler = BatchSampler(self.trpo)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
        self.sess = tf.get_default_session() or tf.Session(
            config=tf.ConfigProto(gpu_options=gpu_options))
        self.external_reward_weight = external_reward_weight
        self.summary_writer = tf.summary.FileWriter(
            tensorboard_path, graph=tf.get_default_graph())
        self.n_updates_per_sample = n_updates_per_sample
        self.icm_batch_size = icm_batch_size
        act_space = env.action_space
        obs_space = env.observation_space
        # Replay pool feeds ICM training only (see train()).
        self.pool = TRPOReplayPool(replay_pool_size, obs_space.flat_dim,
                                   act_space.flat_dim)
        self.min_pool_size = min_pool_size

        # Setup ICM models
        self.s1 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.s2 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.asample = tf.placeholder(tf.float32, [None, act_space.flat_dim])
        self.external_rewards = tf.placeholder(tf.float32, (None, ))
        if len(obs_space.shape) == 1:
            if no_encoder:
                # Identity "encoder": features are the raw flat observation.
                self._encoder = NoEncoder(obs_space.flat_dim,
                                          env_spec=env.spec)
            else:
                self._encoder = FullyConnectedEncoder(feature_dim,
                                                      env_spec=env.spec)
        else:
            # TODO: implement conv encoder
            raise NotImplementedError(
                "Currently only supports flat observation input!")
        self._encoder.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            self.encoder1 = self._encoder.get_weight_tied_copy(
                observation_input=self.s1)
            self.encoder2 = self._encoder.get_weight_tied_copy(
                observation_input=self.s2)
        self._inverse_model = InverseModel(feature_dim, env_spec=env.spec)
        self._forward_model = ForwardModel(feature_dim, env_spec=env.spec)
        self._inverse_model.sess = self.sess
        self._forward_model.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            self.inverse_model = self._inverse_model.get_weight_tied_copy(
                feature_input1=self.encoder1.output,
                feature_input2=self.encoder2.output)
            self.forward_model = self._forward_model.get_weight_tied_copy(
                feature_input=self.encoder1.output,
                action_input=self.asample)

        # Define losses
        self.forward_loss = tf.reduce_mean(
            tf.square(self.encoder2.output - self.forward_model.output))
        # self.forward_loss = tf.nn.l2_loss(self.encoder2.output - self.forward_model.output)
        if isinstance(act_space, Box):
            self.inverse_loss = tf.reduce_mean(
                tf.square(self.asample - self.inverse_model.output))
        elif isinstance(act_space, Discrete):
            # TODO: Implement softmax loss
            raise NotImplementedError
        else:
            raise NotImplementedError
        # Per-sample curiosity: forward prediction error summed over features.
        self.internal_rewards = tf.reduce_sum(
            tf.square(self.encoder2.output - self.forward_model.output),
            axis=1)
        self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards)
        self.mean_external_rewards = tf.reduce_mean(self.external_rewards)
        self.total_loss = forward_weight * self.forward_loss + \
            (1. - forward_weight) * self.inverse_loss
        self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\
            minimize(self.total_loss)

        # Setup summaries
        inverse_loss_summ = tf.summary.scalar("icm_inverse_loss",
                                              self.inverse_loss)
        forward_loss_summ = tf.summary.scalar("icm_forward_loss",
                                              self.forward_loss)
        total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss)
        # NOTE(review): the two reward summaries below are built but not
        # merged into `self.summary` -- confirm whether that is intentional.
        internal_rewards = tf.summary.scalar("mean_internal_rewards",
                                             self.mean_internal_rewards)
        external_rewards = tf.summary.scalar("mean_external_rewards",
                                             self.mean_external_rewards)
        var_summ = []
        # for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        #     var_summ.append(tf.summary.histogram(var.op.name, var))
        self.summary = tf.summary.merge(
            [inverse_loss_summ, forward_loss_summ, total_loss_summ] +
            var_summ)

        # Initialize variables
        self.sess.run(tf.initialize_all_variables())

    @overrides
    def train(self):
        """TRPO loop with curiosity-mixed rewards and interleaved ICM updates."""
        with self.sess.as_default():
            self.trpo.start_worker()
            self.trpo.init_opt()
            for itr in range(self.trpo.current_itr, self.trpo.n_itr):
                paths = self.trpo.sampler.obtain_samples(itr)
                # Mix curiosity into rewards before TRPO processes the paths.
                modified_paths = self.process_paths(itr, paths)
                samples_data = self.trpo.sampler.process_samples(
                    itr, modified_paths)
                if self.pool.size >= self.min_pool_size:
                    logger.log("ICM Training started")
                    start_time = time.time()
                    for _ in range(self.n_updates_per_sample):
                        # Global timestep keeps tensorboard steps monotone.
                        self.train_icm(_ + itr * self.n_updates_per_sample)
                    logger.log("ICM Training finished. Time: {0}".format(
                        time.time() - start_time))
                # Store fresh transitions for future ICM updates; reward is
                # stored as 0.0 since the pool only feeds dynamics training.
                for path in samples_data['paths']:
                    path_len = len(path['rewards'])
                    for i in range(path_len):
                        obs = path['observations'][i]
                        # print (obs)
                        act = path['actions'][i]
                        term = (i == path_len - 1)
                        rew = 0.0
                        self.pool.add_sample(obs, act, rew, term)
                self.trpo.log_diagnostics(paths)
                self.trpo.optimize_policy(itr, samples_data)
                # Snapshot also carries the ICM models for restore.
                params = self.trpo.get_itr_snapshot(itr, samples_data)
                params['encoder'] = self._encoder
                params['inverse_model'] = self._inverse_model
                params['forward_model'] = self._forward_model
                logger.save_itr_params(itr, params)
                logger.dump_tabular(with_prefix=False)
            self.trpo.shutdown_worker()

    def train_icm(self, timestep):
        """One ICM optimization step on a random replay-pool minibatch."""
        batch = self.pool.random_batch(self.icm_batch_size)
        obs = batch['observations']
        next_obs = batch['next_observations']
        acts = batch['actions']
        rewards = batch['rewards']
        feed_dict = self._update_feed_dict(rewards, obs, next_obs, acts)
        ops = [self.summary, self.icm_opt]
        results = self.sess.run(ops, feed_dict=feed_dict)
        if timestep % TENSORBOARD_PERIOD == 0:
            self.summary_writer.add_summary(results[0], timestep)

    def process_paths(self, itr, paths):
        """Write curiosity-mixed rewards into `path['t_rewards']`.

        On the very first iteration the ICM is untrained, so raw external
        rewards are used. Afterwards each step's curiosity is the forward
        prediction error between consecutive observations; the sequence is
        left-padded with 0 so its length matches `path['rewards']`.
        Returns a shallow copy of `paths` (inner path dicts are shared).
        """
        modified_paths = copy(paths)
        if itr == 0:
            for path in modified_paths:
                path['t_rewards'] = path["rewards"]
        else:
            for path in modified_paths:
                obs = path['observations'][:-1]
                acts = path['actions'][:-1]
                next_obs = path['observations'][1:]
                internal_rewards = self.sess.run(self.internal_rewards,
                                                 feed_dict={
                                                     self.s1: obs,
                                                     self.s2: next_obs,
                                                     self.asample: acts
                                                 })
                # NOTE(review): this variant pads at the FRONT (step 0 gets no
                # curiosity) while the sibling class pads at the END -- confirm
                # which alignment is intended.
                internal_rewards = np.append([0.0], internal_rewards)
                path['t_rewards'] = self.external_reward_weight * path['rewards'] \
                    + (1. - self.external_reward_weight) * internal_rewards
        return modified_paths

    def _update_feed_dict(self, sampled_rewards, sampled_obs,
                          sampled_next_obs, sampled_actions):
        """Build the feed dict for the ICM placeholders."""
        return {
            self.s1: sampled_obs,
            self.s2: sampled_next_obs,
            self.asample: sampled_actions,
            self.external_rewards: sampled_rewards,
        }
    def __init__(self,
                 env,
                 trpo: TRPO,
                 tensorboard_path,
                 no_encoder=False,
                 feature_dim=10,
                 forward_weight=0.8,
                 external_reward_weight=0.01,
                 inverse_tanh=True,
                 forward_cos=False,
                 init_learning_rate=1e-4,
                 icm_batch_size=128,
                 replay_pool_size=1000000,
                 min_pool_size=1000,
                 n_updates_per_iter=500,
                 rel_curiosity=False,
                 clip_curiosity=0.0,
                 debug_save_data=False,
                 debug_log_weights=False,
                 **kwargs):
        """
        :param env: Environment
        :param algo: Algorithm that will be used with ICM
        :param encoder: State encoder that maps s to f
        :param inverse_model: Inverse dynamics model that maps (f1, f2) to actions
        :param forward_model: Forward dynamics model that maps (f1, a) to f2
        :param forward_weight: Weight from 0 to 1 that balances forward loss and inverse loss
        :param external_reward_weight: Weight that balances external reward and internal reward
        :param init_learning_rate: Initial learning rate of optimizer

        Additional flags: `forward_cos` switches the forward loss/curiosity to
        cosine distance; `inverse_tanh` selects tanh output on the inverse
        model; `debug_save_data` / `debug_log_weights` enable extra logging.
        NOTE(review): `rel_curiosity` and `clip_curiosity` are stored but not
        used in this constructor -- presumably consumed elsewhere; confirm.
        """
        self.trpo = trpo
        # Replace sampler to inject intrinsic reward
        self.trpo.sampler = self.get_sampler(self.trpo)
        self.sess = tf.get_default_session() or tf.Session()
        self.external_reward_weight = external_reward_weight
        self.summary_writer = tf.summary.FileWriter(
            tensorboard_path, graph=tf.get_default_graph())
        self.n_updates_per_iter = n_updates_per_iter
        self.forward_cos = forward_cos
        self.inverse_tanh = inverse_tanh
        self.icm_batch_size = icm_batch_size
        self.rel_curiosity = rel_curiosity
        self.clip_curiosity = clip_curiosity
        self.debug_save_data = debug_save_data
        self.debug_log_weights = debug_log_weights
        # Debug purpose: Save (ob1, a, ob2, if_contact)
        if self.debug_save_data:
            self.DEBUG_DATA_PATH = "/home/dianchen/icm_data.csv"
            with open(self.DEBUG_DATA_PATH, 'w+') as csvfile:
                fieldnames = ['obs', 'a', 'next_obs', 'contact']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
        act_space = env.action_space
        obs_space = env.observation_space
        # Replay pool feeds ICM training only.
        self.pool = TRPOReplayPool(replay_pool_size, obs_space.flat_dim,
                                   act_space.flat_dim)
        self.min_pool_size = min_pool_size

        # Setup ICM models
        self.s1 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.s2 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.asample = tf.placeholder(tf.float32, [None, act_space.flat_dim])
        self.external_rewards = tf.placeholder(tf.float32, (None, ))
        self.contact_rewards = tf.placeholder(tf.float32, (None, ))
        if len(obs_space.shape) == 1:
            if no_encoder:
                # Identity "encoder": features are the raw flat observation.
                self._encoder = NoEncoder(obs_space.flat_dim,
                                          env_spec=env.spec)
            else:
                self._encoder = FullyConnectedEncoder(feature_dim,
                                                      env_spec=env.spec)
        else:
            # TODO: implement conv encoder
            raise NotImplementedError(
                "Currently only supports flat observation input!")
        self._encoder.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            self.encoder1 = self._encoder.get_weight_tied_copy(
                observation_input=self.s1)
            self.encoder2 = self._encoder.get_weight_tied_copy(
                observation_input=self.s2)
        if self.inverse_tanh:
            # Default InverseModel output activation (tanh per ctor default).
            self._inverse_model = InverseModel(feature_dim, env_spec=env.spec)
        else:
            self._inverse_model = InverseModel(feature_dim,
                                               env_spec=env.spec,
                                               output_activation=None)
        self._forward_model = ForwardModel(feature_dim, env_spec=env.spec)
        self._inverse_model.sess = self.sess
        self._forward_model.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            self.inverse_model = self._inverse_model.get_weight_tied_copy(
                feature_input1=self.encoder1.output,
                feature_input2=self.encoder2.output)
            self.forward_model = self._forward_model.get_weight_tied_copy(
                feature_input=self.encoder1.output,
                action_input=self.asample)

        # Define losses
        if self.forward_cos:
            self.forward_loss = cosine_loss(self.encoder2.output,
                                            self.forward_model.output)
        else:
            self.forward_loss = tf.reduce_mean(
                tf.square(self.encoder2.output - self.forward_model.output))
        # self.forward_loss = tf.nn.l2_loss(self.encoder2.output - self.forward_model.output)
        if isinstance(act_space, Box):
            self.inverse_loss = tf.reduce_mean(
                tf.square(self.asample - self.inverse_model.output))
        elif isinstance(act_space, Discrete):
            # TODO: Implement softmax loss
            raise NotImplementedError
        else:
            raise NotImplementedError
        if self.forward_cos:
            # Curiosity = 1 - cosine similarity of predicted vs actual feature.
            self.internal_rewards = 1.0 - tf.reduce_sum(
                tf.multiply(tf.nn.l2_normalize(self.forward_model.output, 1),
                            tf.nn.l2_normalize(self.encoder2.output, 1)), 1)
        else:
            # Curiosity = squared forward prediction error per sample.
            self.internal_rewards = tf.reduce_sum(
                tf.square(self.encoder2.output - self.forward_model.output),
                axis=1)
        self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards)
        self.mean_external_rewards = tf.reduce_mean(self.external_rewards)
        self.mean_contact_rewards = tf.reduce_mean(self.contact_rewards)
        self.total_loss = forward_weight * self.forward_loss + \
            (1. - forward_weight) * self.inverse_loss
        self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\
            minimize(self.total_loss)

        # Setup summaries
        inverse_loss_summ = tf.summary.scalar("icm_inverse_loss",
                                              self.inverse_loss)
        forward_loss_summ = tf.summary.scalar("icm_forward_loss",
                                              self.forward_loss)
        total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss)
        internal_rewards = tf.summary.scalar("mean_internal_rewards",
                                             self.mean_internal_rewards)
        external_rewards = tf.summary.scalar("mean_external_rewards",
                                             self.mean_external_rewards)
        # Setup env_info logs
        contact_summ = tf.summary.scalar("mean_contact_rewards",
                                         self.mean_contact_rewards)
        var_summ = []
        # for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        #     var_summ.append(tf.summary.histogram(var.op.name, var))
        if self.debug_log_weights:
            for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                var_summ.append(tf.summary.histogram(var.op.name, var))
        # Losses/weights and environment-reward stats are merged separately so
        # they can be logged at different cadences.
        self.summary_icm = tf.summary.merge(
            [inverse_loss_summ, forward_loss_summ, total_loss_summ] +
            var_summ)
        self.summary_env = tf.summary.merge(
            [internal_rewards, external_rewards, contact_summ])
        # self.summary = tf.summary.merge([inverse_loss_summ, forward_loss_summ, total_loss_summ] + var_summ)
        # Initialize variables
        self.sess.run(tf.initialize_all_variables())
class ICM(RLAlgorithm):
    """RL with an inverse curiosity module (ICM).

    Wraps a TRPO instance and replaces its sampler (see get_sampler) so that
    each sampled path's rewards can be mixed with an intrinsic "curiosity"
    reward, computed as the forward model's prediction error in the learned
    feature space (see process_paths).
    """

    def __init__(self,
                 env,
                 trpo: TRPO,
                 tensorboard_path,
                 no_encoder=False,
                 feature_dim=10,
                 forward_weight=0.8,
                 external_reward_weight=0.01,
                 inverse_tanh=True,
                 forward_cos=False,
                 init_learning_rate=1e-4,
                 icm_batch_size=128,
                 replay_pool_size=1000000,
                 min_pool_size=1000,
                 n_updates_per_iter=500,
                 rel_curiosity=False,
                 clip_curiosity=0.0,
                 debug_save_data=False,
                 debug_log_weights=False,
                 **kwargs):
        """Build the ICM graph (encoder, inverse model, forward model) on top
        of a TRPO instance.

        :param env: Environment; only flat (1-D) observation spaces are
            supported, and only Box action spaces have a loss implemented
        :param trpo: TRPO algorithm whose sampler is replaced so intrinsic
            rewards can be injected into sampled paths
        :param tensorboard_path: Directory for TensorBoard event files
        :param no_encoder: If True, use NoEncoder (raw observations as
            features) instead of a FullyConnectedEncoder
        :param feature_dim: Dimension of the learned feature space
        :param forward_weight: Weight in [0, 1] balancing forward loss vs
            inverse loss in the total ICM loss
        :param external_reward_weight: Weight balancing external (env) reward
            vs internal (curiosity) reward when rewriting path rewards
        :param inverse_tanh: If True the inverse model keeps its default
            output activation (presumably tanh, per the flag name — confirm
            in InverseModel); if False, no output activation
        :param forward_cos: If True, use a cosine-based forward loss and
            cosine-distance internal rewards instead of squared error
        :param init_learning_rate: Learning rate for the Adam optimizer
        :param icm_batch_size: Replay-pool batch size per ICM update
        :param replay_pool_size: Capacity of the TRPO replay pool
        :param min_pool_size: Minimum pool size before ICM updates start
        :param n_updates_per_iter: ICM gradient steps per TRPO iteration
        :param rel_curiosity: If True, divide curiosity by the squared
            feature-space distance between consecutive observations
        :param clip_curiosity: If nonzero, zero out all but the largest
            fraction of curiosity rewards within each path
        :param debug_save_data: If True, append (obs, a, next_obs, contact)
            rows to a debug CSV every iteration
        :param debug_log_weights: If True, add histogram summaries for all
            trainable variables (can be expensive)
        """
        self.trpo = trpo
        # Replace sampler to inject intrinsic reward
        self.trpo.sampler = self.get_sampler(self.trpo)
        self.sess = tf.get_default_session() or tf.Session()
        self.external_reward_weight = external_reward_weight
        self.summary_writer = tf.summary.FileWriter(
            tensorboard_path, graph=tf.get_default_graph())
        self.n_updates_per_iter = n_updates_per_iter
        self.forward_cos = forward_cos
        self.inverse_tanh = inverse_tanh
        self.icm_batch_size = icm_batch_size
        self.rel_curiosity = rel_curiosity
        self.clip_curiosity = clip_curiosity
        self.debug_save_data = debug_save_data
        self.debug_log_weights = debug_log_weights
        # Debug purpose: Save (ob1, a, ob2, if_contact)
        # NOTE(review): hard-coded user-specific path; consider making this
        # configurable.
        if self.debug_save_data:
            self.DEBUG_DATA_PATH = "/home/dianchen/icm_data.csv"
            with open(self.DEBUG_DATA_PATH, 'w+') as csvfile:
                fieldnames = ['obs', 'a', 'next_obs', 'contact']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
        act_space = env.action_space
        obs_space = env.observation_space
        self.pool = TRPOReplayPool(replay_pool_size,
                                   obs_space.flat_dim, act_space.flat_dim)
        self.min_pool_size = min_pool_size
        # Setup ICM models.
        # s1/s2 are consecutive observations; asample is the action taken
        # between them. external/contact rewards feed summary scalars only.
        self.s1 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.s2 = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        self.asample = tf.placeholder(tf.float32, [None, act_space.flat_dim])
        self.external_rewards = tf.placeholder(tf.float32, (None, ))
        self.contact_rewards = tf.placeholder(tf.float32, (None, ))
        if len(obs_space.shape) == 1:
            if no_encoder:
                self._encoder = NoEncoder(obs_space.flat_dim,
                                          env_spec=env.spec)
            else:
                self._encoder = FullyConnectedEncoder(feature_dim,
                                                      env_spec=env.spec)
        else:
            # TODO: implement conv encoder
            raise NotImplementedError(
                "Currently only supports flat observation input!")
        self._encoder.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            # Weight-tied copies share parameters but read from the s1/s2
            # placeholders, giving features for obs and next_obs.
            self.encoder1 = self._encoder.get_weight_tied_copy(
                observation_input=self.s1)
            self.encoder2 = self._encoder.get_weight_tied_copy(
                observation_input=self.s2)
        if self.inverse_tanh:
            self._inverse_model = InverseModel(feature_dim, env_spec=env.spec)
        else:
            self._inverse_model = InverseModel(feature_dim,
                                               env_spec=env.spec,
                                               output_activation=None)
        self._forward_model = ForwardModel(feature_dim, env_spec=env.spec)
        self._inverse_model.sess = self.sess
        self._forward_model.sess = self.sess
        # Initialize variables for get_copy to work
        self.sess.run(tf.initialize_all_variables())
        with self.sess.as_default():
            # Inverse model: (f(s1), f(s2)) -> action prediction.
            self.inverse_model = self._inverse_model.get_weight_tied_copy(
                feature_input1=self.encoder1.output,
                feature_input2=self.encoder2.output)
            # Forward model: (f(s1), a) -> predicted f(s2).
            self.forward_model = self._forward_model.get_weight_tied_copy(
                feature_input=self.encoder1.output,
                action_input=self.asample)
        # Define losses
        if self.forward_cos:
            self.forward_loss = cosine_loss(self.encoder2.output,
                                            self.forward_model.output)
        else:
            self.forward_loss = tf.reduce_mean(
                tf.square(self.encoder2.output - self.forward_model.output))
        # self.forward_loss = tf.nn.l2_loss(self.encoder2.output - self.forward_model.output)
        if isinstance(act_space, Box):
            # Continuous actions: mean squared error against the true action.
            self.inverse_loss = tf.reduce_mean(
                tf.square(self.asample - self.inverse_model.output))
        elif isinstance(act_space, Discrete):
            # TODO: Implement softmax loss
            raise NotImplementedError
        else:
            raise NotImplementedError
        # Per-sample curiosity: cosine distance or squared error between the
        # predicted and actual next-state features.
        if self.forward_cos:
            self.internal_rewards = 1.0 - tf.reduce_sum(
                tf.multiply(tf.nn.l2_normalize(self.forward_model.output, 1),
                            tf.nn.l2_normalize(self.encoder2.output, 1)), 1)
        else:
            self.internal_rewards = tf.reduce_sum(
                tf.square(self.encoder2.output - self.forward_model.output),
                axis=1)
        self.mean_internal_rewards = tf.reduce_mean(self.internal_rewards)
        self.mean_external_rewards = tf.reduce_mean(self.external_rewards)
        self.mean_contact_rewards = tf.reduce_mean(self.contact_rewards)
        # Convex combination of forward and inverse losses.
        self.total_loss = forward_weight * self.forward_loss + \
            (1. - forward_weight) * self.inverse_loss
        self.icm_opt = tf.train.AdamOptimizer(init_learning_rate).\
            minimize(self.total_loss)
        # Setup summaries
        inverse_loss_summ = tf.summary.scalar("icm_inverse_loss",
                                              self.inverse_loss)
        forward_loss_summ = tf.summary.scalar("icm_forward_loss",
                                              self.forward_loss)
        total_loss_summ = tf.summary.scalar("icm_total_loss", self.total_loss)
        internal_rewards = tf.summary.scalar("mean_internal_rewards",
                                             self.mean_internal_rewards)
        external_rewards = tf.summary.scalar("mean_external_rewards",
                                             self.mean_external_rewards)
        # Setup env_info logs
        contact_summ = tf.summary.scalar("mean_contact_rewards",
                                         self.mean_contact_rewards)
        var_summ = []
        # for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        #     var_summ.append(tf.summary.histogram(var.op.name, var))
        if self.debug_log_weights:
            for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                var_summ.append(tf.summary.histogram(var.op.name, var))
        # summary_icm is logged during ICM updates; summary_env once per
        # TRPO iteration from the first sampled path.
        self.summary_icm = tf.summary.merge(
            [inverse_loss_summ, forward_loss_summ, total_loss_summ] +
            var_summ)
        self.summary_env = tf.summary.merge(
            [internal_rewards, external_rewards, contact_summ])
        # self.summary = tf.summary.merge([inverse_loss_summ, forward_loss_summ, total_loss_summ] + var_summ)
        # Initialize variables
        self.sess.run(tf.initialize_all_variables())

    @overrides
    def train(self):
        """Main loop: alternate TRPO sampling/optimization with ICM updates.

        Per iteration: sample paths, rewrite rewards via process_paths, run
        n_updates_per_iter ICM gradient steps (once the replay pool is warm),
        log summaries, refill the replay pool, then optimize the policy.
        """
        with self.sess.as_default():
            self.trpo.start_worker()
            for itr in range(self.trpo.start_itr, self.trpo.n_itr):
                paths = self.trpo.obtain_samples(itr)
                modified_paths = self.process_paths(itr, paths)
                samples_data = self.trpo.process_samples(itr, modified_paths)
                # import pdb; pdb.set_trace()
                if self.pool.size >= self.min_pool_size:
                    logger.log("ICM Training started")
                    start_time = time.time()
                    for _ in range(self.n_updates_per_iter):
                        # Global timestep for TensorBoard x-axis.
                        self.train_icm(_ + itr * self.n_updates_per_iter)
                    logger.log("ICM Training finished. Time: {0}".format(
                        time.time() - start_time))
                # Log env summary
                path = samples_data['paths'][0]
                if self.debug_save_data:
                    with open(self.DEBUG_DATA_PATH, 'a') as csvfile:
                        # NOTE(review): the loop-variable order does not match
                        # the zipped sequences: `a` is bound to
                        # observations[1:] and `next_obs` to actions[:-1], so
                        # the 'a' and 'next_obs' CSV columns appear swapped —
                        # confirm intended order.
                        for obs, a, next_obs, contact in zip(
                                path['observations'][:-1], \
                                path['observations'][1:], \
                                path['actions'][:-1], \
                                path['env_infos']['contact_reward']):
                            fieldnames = ['obs', 'a', 'next_obs', 'contact']
                            writer = csv.DictWriter(csvfile,
                                                    fieldnames=fieldnames)
                            writer.writerow(
                                dict(obs=obs,
                                     a=a,
                                     next_obs=next_obs,
                                     contact=contact))
                # s1/s2/asample use len-1 transitions; the reward feeds use
                # the full path length but drive independent placeholders
                # that are only reduced to scalar means.
                results = self.sess.run(self.summary_env,
                                        feed_dict={
                                            self.s1:
                                            path['observations'][:-1],
                                            self.s2:
                                            path['observations'][1:],
                                            self.asample:
                                            path['actions'][:-1],
                                            self.external_rewards:
                                            path['rewards'],
                                            self.contact_rewards:
                                            path['env_infos']['contact_reward']
                                        })
                self.summary_writer.add_summary(results,
                                                itr * self.n_updates_per_iter)
                # Refill the replay pool with this iteration's transitions.
                # Reward is stored as 0.0 — the ICM update only needs
                # (obs, action, next_obs).
                for path in samples_data['paths']:
                    path_len = len(path['rewards'])
                    for i in range(path_len):
                        obs = path['observations'][i]
                        # print (obs)
                        act = path['actions'][i]
                        term = (i == path_len - 1)
                        rew = 0.0
                        self.pool.add_sample(obs, act, rew, term)
                # pdb.set_trace()
                self.trpo.log_diagnostics(paths)
                self.trpo.optimize_policy(itr, samples_data)
                # Snapshot includes the ICM models so they can be restored.
                params = self.trpo.get_itr_snapshot(itr, samples_data)
                params['encoder'] = self._encoder
                params['inverse_model'] = self._inverse_model
                params['forward_model'] = self._forward_model
                logger.save_itr_params(itr, params)
                logger.dump_tabular(with_prefix=False)
            self.trpo.shutdown_worker()

    def train_icm(self, timestep):
        """Run one ICM gradient step on a random replay-pool batch.

        :param timestep: Global step; summaries are written every
            TENSORBOARD_PERIOD steps (module-level constant).
        """
        batch = self.pool.random_batch(self.icm_batch_size)
        obs = batch['observations']
        next_obs = batch['next_observations']
        acts = batch['actions']
        rewards = batch['rewards']
        feed_dict = self._update_feed_dict(rewards, obs, next_obs, acts)
        # results[0] is the merged ICM summary; results[1] the train op.
        ops = [self.summary_icm, self.icm_opt]
        # ops = [self.icm_opt]
        results = self.sess.run(ops, feed_dict=feed_dict)
        if timestep % TENSORBOARD_PERIOD == 0:
            self.summary_writer.add_summary(results[0], timestep)

    def process_paths(self, itr, paths):
        """Add a 't_rewards' key to each path mixing external and intrinsic
        rewards.

        On iteration 0 (untrained ICM) 't_rewards' is just the env reward;
        afterwards it is
        external_reward_weight * rewards
        + (1 - external_reward_weight) * internal_rewards.

        NOTE(review): copy() is a shallow copy of the list only — the path
        dicts themselves are mutated in place, so the caller's paths gain
        't_rewards' too; confirm that is intended.
        """
        modified_paths = copy(paths)
        if itr == 0:
            for path in modified_paths:
                path['t_rewards'] = path["rewards"]
        else:
            for path in modified_paths:
                obs = path['observations'][:-1]
                acts = path['actions'][:-1]
                next_obs = path['observations'][1:]
                internal_rewards = self.sess.run(self.internal_rewards,
                                                 feed_dict={
                                                     self.s1: obs,
                                                     self.s2: next_obs,
                                                     self.asample: acts
                                                 })
                if self.rel_curiosity:
                    # Normalize by squared feature-space distance between
                    # consecutive observations.
                    # NOTE(review): this builds new graph ops on every call,
                    # growing the TF graph over time — consider constructing
                    # this op once in __init__.
                    internal_rewards /= self.sess.run(tf.reduce_sum(
                        tf.square(self.encoder1.output -
                                  self.encoder2.output),
                        axis=1),
                                                      feed_dict={
                                                          self.s1: obs,
                                                          self.s2: next_obs,
                                                          self.asample: acts
                                                      })
                # Pad with a zero so the curiosity vector matches the path
                # length (no transition after the last observation).
                internal_rewards = np.append(internal_rewards, [0.0])
                if self.clip_curiosity:
                    # Zero the smallest (1 - clip_curiosity) fraction,
                    # keeping only the largest curiosity rewards.
                    idx = internal_rewards.argsort()[:int(
                        len(internal_rewards) * (1.0 - self.clip_curiosity))]
                    internal_rewards[idx] = 0.0
                path['t_rewards'] = self.external_reward_weight * path['rewards'] \
                    + (1. - self.external_reward_weight) * internal_rewards
        return modified_paths

    def _update_feed_dict(self, sampled_rewards, sampled_obs,
                          sampled_next_obs, sampled_actions):
        """Build the feed dict for an ICM update from a replay batch."""
        return {
            self.s1: sampled_obs,
            self.s2: sampled_next_obs,
            self.asample: sampled_actions,
            self.external_rewards: sampled_rewards,
        }

    def get_sampler(self, trpo):
        """Return a reward-injecting sampler matching the type of the TRPO
        instance's current sampler.

        :raises NotImplementedError: for sampler types other than the batch
            and vectorized samplers.
        """
        from sandbox.rocky.tf.samplers.batch_sampler import BatchSampler as OldBatchSampler
        from sandbox.rocky.tf.samplers.vectorized_sampler import VectorizedSampler as OldVectorizedSampler
        if isinstance(trpo.sampler, OldBatchSampler):
            return BatchSampler(trpo)
        elif isinstance(trpo.sampler, OldVectorizedSampler):
            return VectorizedSampler(trpo)
        else:
            raise NotImplementedError(
                "Only supports batch sampler and vectorized sampler right now!"
            )