def __init__(
        self,
        scope,
        ob_space,
        ac_space,
        policy_size="normal",
        extrahid=True,
        hidsize=128,
        memsize=128,
        rec_gate_init=0.0,
        update_ob_stats_independently_per_gpu=True,
        proportion_of_exp_used_for_predictor_update=1.0,
        dynamics_bonus=False,
        meta_rl=False,
):
    StochasticPolicy.__init__(self, scope, ob_space, ac_space, meta_rl=meta_rl)
    self.proportion_of_exp_used_for_predictor_update = (
        proportion_of_exp_used_for_predictor_update)
    enlargement = {"small": 1, "normal": 2, "large": 4}[policy_size]
    rep_size = 512
    self.ph_mean = tf.placeholder(dtype=tf.float32,
                                  shape=list(ob_space.shape[:2]) + [1],
                                  name="obmean")
    self.ph_std = tf.placeholder(dtype=tf.float32,
                                 shape=list(ob_space.shape[:2]) + [1],
                                 name="obstd")
    memsize *= enlargement
    hidsize *= enlargement
    convfeat = 16 * enlargement
    self.ob_rms = RunningMeanStd(
        shape=list(ob_space.shape[:2]) + [1],
        use_mpi=not update_ob_stats_independently_per_gpu,
    )
    ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name="state")
    pdparamsize = self.pdtype.param_shape()[0]
    self.memsize = memsize

    # Inputs to policy and value function will have different shapes depending on
    # whether it is rollout or optimization time, so we treat them separately.
    (
        self.pdparam_opt,
        self.vpred_int_opt,
        self.vpred_ext_opt,
        self.snext_opt,
    ) = self.apply_policy(
        self.ph_ob['obs'][:, :-1],
        reuse=False,
        scope=scope,
        hidsize=hidsize,
        memsize=memsize,
        extrahid=extrahid,
        sy_nenvs=self.sy_nenvs,
        sy_nsteps=self.sy_nsteps - 1,
        pdparamsize=pdparamsize,
        additional_inputs=self.ph_ob,
    )
    (
        self.pdparam_rollout,
        self.vpred_int_rollout,
        self.vpred_ext_rollout,
        self.snext_rollout,
    ) = self.apply_policy(
        self.ph_ob['obs'],
        reuse=True,
        scope=scope,
        hidsize=hidsize,
        memsize=memsize,
        extrahid=extrahid,
        sy_nenvs=self.sy_nenvs,
        sy_nsteps=self.sy_nsteps,
        pdparamsize=pdparamsize,
        additional_inputs=self.ph_ob,
    )
    if dynamics_bonus:
        self.define_dynamics_prediction_rew(convfeat=convfeat, rep_size=rep_size,
                                            enlargement=enlargement)
    else:
        self.define_self_prediction_rew(convfeat=convfeat, rep_size=rep_size,
                                        enlargement=enlargement)

    pd = self.pdtype.pdfromflat(self.pdparam_rollout)
    self.a_samp = pd.sample()
    self.nlp_samp = pd.neglogp(self.a_samp)
    self.entropy_rollout = pd.entropy()
    self.pd_rollout = pd
    self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
    self.ph_istate = ph_istate
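# Aside (illustrative, not part of the class above): the two apply_policy calls differ
# only in which time slice of ph_ob they consume and in sy_nsteps (nsteps - 1 vs nsteps).
# A minimal NumPy sketch of that [:, :-1] slicing with made-up sizes; the trailing
# observation in the optimization-time buffer is presumably consumed elsewhere
# (e.g. by the intrinsic-reward head), which is an assumption, not shown in this snippet.
import numpy as np

nenvs, nsteps, H, W, C = 4, 8, 84, 84, 4
ob_buffer = np.zeros((nenvs, nsteps + 1, H, W, C), dtype=np.uint8)  # rollout frames + one extra

opt_input = ob_buffer[:, :-1]   # what the optimization-time graph sees (nsteps frames)
rollout_input = ob_buffer       # what the rollout-time graph sees (nsteps + 1 frames)

assert opt_input.shape[1] == nsteps
assert rollout_input.shape[1] == nsteps + 1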
def __init__(self, scope, ob_space, ac_space,
             policy_size='normal', maxpool=False, extrahid=True, hidsize=128,
             memsize=128, rec_gate_init=0.0,
             update_ob_stats_independently_per_gpu=True,
             proportion_of_exp_used_for_predictor_update=1.,
             dynamics_bonus=False,
             ):
    StochasticPolicy.__init__(self, scope, ob_space, ac_space)
    self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
    enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
    rep_size = 512
    self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obmean")
    self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obstd")
    memsize *= enlargement  # 256
    hidsize *= enlargement  # 256
    convfeat = 16 * enlargement
    self.ob_rms = RunningMeanStd(
        shape=list(ob_space.shape[:2]) + [1],
        use_mpi=not update_ob_stats_independently_per_gpu)
    ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name='state')
    pdparamsize = self.pdtype.param_shape()[0]
    self.memsize = memsize

    self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
        self.apply_policy(self.ph_ob[None][:, :-1],
                          ph_new=self.ph_new,
                          ph_istate=ph_istate,
                          reuse=False,
                          scope=scope,
                          hidsize=hidsize,
                          memsize=memsize,
                          extrahid=extrahid,
                          sy_nenvs=self.sy_nenvs,
                          sy_nsteps=self.sy_nsteps - 1,
                          pdparamsize=pdparamsize,
                          rec_gate_init=rec_gate_init)
    self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
        self.apply_policy(self.ph_ob[None],
                          ph_new=self.ph_new,
                          ph_istate=ph_istate,
                          reuse=True,
                          scope=scope,
                          hidsize=hidsize,
                          memsize=memsize,
                          extrahid=extrahid,
                          sy_nenvs=self.sy_nenvs,
                          sy_nsteps=self.sy_nsteps,
                          pdparamsize=pdparamsize,
                          rec_gate_init=rec_gate_init)
    if dynamics_bonus:
        self.define_dynamics_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
    else:
        self.define_self_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)

    self.step_prediction(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)

    pd = self.pdtype.pdfromflat(self.pdparam_rollout)
    self.a_samp = pd.sample()
    self.nlp_samp = pd.neglogp(self.a_samp)
    self.entropy_rollout = pd.entropy()
    self.pd_rollout = pd
    self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
    self.ph_istate = ph_istate
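# Aside: define_self_prediction_rew is called above but defined elsewhere in the class.
# Below is a minimal NumPy sketch of the RND-style bonus it is expected to build: a frozen
# randomly initialized target network embeds each observation, a predictor network is
# regressed onto that embedding, and the per-observation prediction error serves as the
# intrinsic reward. The tiny two-layer nets and sizes here are illustrative assumptions,
# not the repo's actual architecture.
import numpy as np

rng = np.random.RandomState(0)

def make_net(in_dim, rep_size):
    # tiny two-layer net represented as a pair of weight matrices
    return (rng.randn(in_dim, 256) / np.sqrt(in_dim),
            rng.randn(256, rep_size) / np.sqrt(256))

def forward(net, x):
    W1, W2 = net
    return np.maximum(x.dot(W1), 0.0).dot(W2)   # ReLU hidden layer, linear output

obs = rng.randn(32, 84 * 84)                    # a batch of flattened observations
target_net = make_net(84 * 84, 512)             # frozen random target network (never trained)
predictor_net = make_net(84 * 84, 512)          # predictor, trained to match the target

# Intrinsic reward: per-observation squared prediction error (novel observations -> large error).
int_rew = np.mean((forward(predictor_net, obs) - forward(target_net, obs)) ** 2, axis=-1)
# Predictor's auxiliary loss is the mean of that error (over a random subset of the batch
# when proportion_of_exp_used_for_predictor_update < 1).
aux_loss = int_rew.mean()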
def __init__(self, scope, ob_space, ac_space,
             policy_size='normal', maxpool=False, extrahid=True, hidsize=128,
             memsize=128, rec_gate_init=0.0,
             update_ob_stats_independently_per_gpu=True,
             proportion_of_exp_used_for_predictor_update=1.,
             exploration_type='bottleneck', beta=0.001, rew_counter=None
             ):
    StochasticPolicy.__init__(self, scope, ob_space, ac_space)
    self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
    enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
    rep_size = 512
    self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obmean")  # (84, 84, 1)
    self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obstd")  # (84, 84, 1)
    memsize *= enlargement  # memsize = 256
    hidsize *= enlargement  # hidsize = 256
    convfeat = 16 * enlargement  # convfeat = 32
    self.ob_rms = RunningMeanStd(shape=list(ob_space.shape[:2]) + [1],
                                 use_mpi=not update_ob_stats_independently_per_gpu)
    ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name='state')  # (None, 256)
    pdparamsize = self.pdtype.param_shape()[0]  # 18, equal to the action dimension
    self.memsize = memsize

    # Inputs to policy and value function will have different shapes depending on whether
    # it is rollout or optimization time, so we treat them separately.
    # pdparam_opt.shape=(None, None, 18), vpred_int_opt.shape=(None, None),
    # vpred_ext_opt.shape=(None, None), snext_opt.shape=(None, 256)
    self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
        self.apply_policy(self.ph_ob[None][:, :-1],
                          reuse=False,
                          scope=scope,
                          hidsize=hidsize,    # 256
                          memsize=memsize,    # 256
                          extrahid=extrahid,  # True
                          sy_nenvs=self.sy_nenvs,
                          sy_nsteps=self.sy_nsteps - 1,
                          pdparamsize=pdparamsize)  # 18
    self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
        self.apply_policy(self.ph_ob[None],
                          reuse=True,
                          scope=scope,
                          hidsize=hidsize,
                          memsize=memsize,
                          extrahid=extrahid,
                          sy_nenvs=self.sy_nenvs,
                          sy_nsteps=self.sy_nsteps,
                          pdparamsize=pdparamsize)

    self.exploration_type = exploration_type
    self.max_table = 0
    self.define_bottleneck_rew(convfeat=convfeat, rep_size=rep_size / 8,
                               enlargement=enlargement, beta=beta, rew_counter=rew_counter)

    pd = self.pdtype.pdfromflat(self.pdparam_rollout)  # softmax distribution output by the policy
    self.a_samp = pd.sample()                          # sample an action
    self.nlp_samp = pd.neglogp(self.a_samp)            # negative log-probability of the sampled action
    self.entropy_rollout = pd.entropy()
    self.pd_rollout = pd
    self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
    self.a_samp_opt = self.pd_opt.sample()
    self.ph_istate = ph_istate
    self.scope = scope

    #############################################
    ########## The code below is not actually used ##########
    #############################################
    # for gradcam policy
    a_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)  # (None, None) -> (None, None, 18)
    # equivalent to picking out the entries of pdparam_opt at the positions of the executed (one-hot) actions
    loss_cam_pol = tf.reduce_mean(tf.multiply(self.pdparam_opt, a_one_hot))  # (None,)
    self.conv_out = tf.get_default_graph().get_tensor_by_name('ppo/pol/Relu_2:0')
    self.grads = tf.gradients(loss_cam_pol, self.conv_out)[0]

    # for gradcam aux
    loss_cam_aux = self.kl
    if int(str(tf.__version__).split('.')[1]) < 10:
        self.conv_aux_out = tf.get_default_graph().get_tensor_by_name('ppo/LeakyRelu_2/Maximum:0')
    else:
        self.conv_aux_out = tf.get_default_graph().get_tensor_by_name('ppo/LeakyRelu_2:0')
    self.grads_aux = tf.abs(tf.gradients(loss_cam_aux, self.conv_aux_out)[0])

    # self.cams is not actually used
    weights = tf.reduce_mean(tf.reduce_mean(self.grads, 2), 1)
    weights = tf.expand_dims(tf.expand_dims(weights, axis=1), axis=1)
    weights = tf.tile(weights, [1, 6, 6, 1])
    cams = tf.reduce_sum((weights * self.conv_out), axis=3)
    self.cams = tf.maximum(cams, tf.zeros_like(cams))

    # self.cams_aux is not actually used
    weights_aux = tf.reduce_mean(tf.reduce_mean(self.grads_aux, 2), 1)
    weights_aux = tf.expand_dims(tf.expand_dims(weights_aux, axis=1), axis=1)
    weights_aux = tf.tile(weights_aux, [1, 7, 7, 1])
    cams_aux = tf.nn.relu(tf.reduce_sum((weights_aux * self.conv_aux_out), axis=3))
    self.cams_aux = tf.maximum(cams_aux, tf.zeros_like(cams_aux))
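# Aside: the Grad-CAM tensors above follow the standard recipe: average the gradients over
# the spatial dimensions to get one weight per channel, take the channel-weighted sum of the
# conv feature maps, and clip at zero. A NumPy sketch of the same computation on made-up
# arrays (batch of 2, 6x6 feature maps as in the tf.tile above; the channel count is invented):
import numpy as np

rng = np.random.RandomState(0)
conv_out = rng.randn(2, 6, 6, 64)   # conv activations (NHWC)
grads = rng.randn(2, 6, 6, 64)      # d(loss_cam_pol) / d(conv_out)

weights = grads.mean(axis=(1, 2))                     # (N, C): per-channel importance
cams = np.einsum('nhwc,nc->nhw', conv_out, weights)   # weighted sum over channels
cams = np.maximum(cams, 0.0)                          # keep only positive evidence, as above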
def main(env_name, headless):
    if headless:
        display = Display(visible=0, size=(1280, 1024))
        display.start()

    ### paths
    model_dir = os.path.join(os.getcwd(), 'models')
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    ###

    if env_name == 'lake':
        from config_lake import *
    elif env_name == 'car':
        from config_car import *
    else:
        raise

    #### Get a decent policy.
    #### Called pi_old because this will be the policy we use to gather data
    policy_old = None
    old_policy_path = os.path.join(model_dir, old_policy_name)
    if env_name == 'lake':
        policy_old = LakeDQN(
            env,
            gamma,
            action_space_map=action_space_map,
            model_type=model_type,
            position_of_holes=position_of_holes,
            position_of_goals=position_of_goals,
            max_time_spent_in_episode=max_time_spent_in_episode,
            num_iterations=num_iterations,
            sample_every_N_transitions=sample_every_N_transitions,
            batchsize=batchsize,
            min_epsilon=min_epsilon,
            initial_epsilon=initial_epsilon,
            epsilon_decay_steps=epsilon_decay_steps,
            copy_over_target_every_M_training_iterations=copy_over_target_every_M_training_iterations,
            buffer_size=buffer_size,
            num_frame_stack=num_frame_stack,
            min_buffer_size_to_train=min_buffer_size_to_train,
            frame_skip=frame_skip,
            pic_size=pic_size,
            models_path=os.path.join(model_dir, 'weights.{epoch:02d}-{loss:.2f}.hdf5'),
        )
    elif env_name == 'car':
        policy_old = CarDQN(
            env,
            gamma,
            action_space_map=action_space_map,
            action_space_dim=action_space_dim,
            model_type=model_type,
            max_time_spent_in_episode=max_time_spent_in_episode,
            num_iterations=num_iterations,
            sample_every_N_transitions=sample_every_N_transitions,
            batchsize=batchsize,
            copy_over_target_every_M_training_iterations=copy_over_target_every_M_training_iterations,
            buffer_size=buffer_size,
            min_epsilon=min_epsilon,
            initial_epsilon=initial_epsilon,
            epsilon_decay_steps=epsilon_decay_steps,
            num_frame_stack=num_frame_stack,
            min_buffer_size_to_train=min_buffer_size_to_train,
            frame_skip=frame_skip,
            pic_size=pic_size,
            models_path=os.path.join(model_dir, 'weights.{epoch:02d}-{loss:.2f}.hdf5'),
        )
    else:
        raise

    if not os.path.isfile(old_policy_path):
        print 'Learning a policy using DQN'
        policy_old.learn()
        policy_old.Q.model.save(old_policy_path)
    else:
        print 'Loading a policy'
        policy_old.Q.model = load_model(old_policy_path)
        # if env_name == 'car':
        #     try:
        #         # using old style model. This can be deleted if not using provided .h5 file
        #         policy_old.Q.all_actions_func = K.function([self.model.get_layer('inp').input], [self.model.get_layer('dense_2').output])
        #     except:
        #         pass

    # import pdb; pdb.set_trace()
    if env_name == 'car':
        policy_old.Q.all_actions_func = K.function(
            [policy_old.Q.model.get_layer('inp').input],
            [policy_old.Q.model.get_layer('all_actions').output])

    if env_name == 'lake':
        policy_printer = PrintPolicy(size=[map_size, map_size], env=env)
        policy_printer.pprint(policy_old)

    #### Problem setup
    if env_name == 'lake':
        best_response_algorithm = LakeFittedQIteration(
            state_space_dim + action_space_dim,
            [map_size, map_size],
            action_space_dim,
            max_Q_fitting_epochs,
            gamma,
            model_type=model_type,
            position_of_goals=position_of_goals,
            position_of_holes=position_of_holes,
            num_frame_stack=num_frame_stack)
        fitted_off_policy_evaluation_algorithm = LakeFittedQEvaluation(
            initial_states,
            state_space_dim + action_space_dim,
            [map_size, map_size],
            action_space_dim,
            max_eval_fitting_epochs,
            gamma,
            model_type=model_type,
            position_of_goals=position_of_goals,
            position_of_holes=position_of_holes,
            num_frame_stack=num_frame_stack)
        exact_policy_algorithm = ExactPolicyEvaluator(
            action_space_map,
            gamma,
            env=env,
            frame_skip=frame_skip,
            num_frame_stack=num_frame_stack,
            pic_size=pic_size)
    elif env_name == 'car':
        best_response_algorithm = CarFittedQIteration(
            state_space_dim,
            action_space_dim,
            max_Q_fitting_epochs,
            gamma,
            model_type=model_type,
            num_frame_stack=num_frame_stack,
            initialization=policy_old,
            freeze_cnn_layers=freeze_cnn_layers)  # for _ in range(2)]
        fitted_off_policy_evaluation_algorithm = CarFittedQEvaluation(
            state_space_dim,
            action_space_dim,
            max_eval_fitting_epochs,
            gamma,
            model_type=model_type,
            num_frame_stack=num_frame_stack)  # for _ in range(2*len(constraints_cared_about) + 2)]
        exact_policy_algorithm = ExactPolicyEvaluator(
            action_space_map,
            gamma,
            env=env,
            frame_skip=frame_skip,
            num_frame_stack=num_frame_stack,
            pic_size=pic_size,
            constraint_thresholds=constraint_thresholds,
            constraints_cared_about=constraints_cared_about)
    else:
        raise

    online_convex_algorithm = ExponentiatedGradient(
        lambda_bound, len(constraints), eta, starting_lambda=starting_lambda)
    exploratory_policy_old = StochasticPolicy(
        policy_old,
        action_space_dim,
        exact_policy_algorithm,
        epsilon=deviation_from_old_policy_eps,
        prob=prob)
    problem = Program(
        constraints,
        action_space_dim,
        best_response_algorithm,
        online_convex_algorithm,
        fitted_off_policy_evaluation_algorithm,
        exact_policy_algorithm,
        lambda_bound,
        epsilon,
        env,
        max_number_of_main_algo_iterations,
        num_frame_stack,
        pic_size,
    )

    lambdas = []
    policies = []

    # print exact_policy_algorithm.run(policy_old.Q, to_monitor=True)

    #### Collect Data
    try:
        print 'Loading Prebuilt Data'
        tic = time.time()
        # problem.dataset.data = dd.io.load('%s_data.h5' % env_name)
        # print 'Loaded. Time elapsed: %s' % (time.time() - tic)
        # num of times breaking + distance to center of track + zeros
        if env_name == 'car':
            tic = time.time()
            action_data = dd.io.load('./seed_2_data/car_data_actions_seed_2.h5')
            frame_data = dd.io.load('./seed_2_data/car_data_frames_seed_2.h5')
            done_data = dd.io.load('./seed_2_data/car_data_is_done_seed_2.h5')
            next_state_data = dd.io.load('./seed_2_data/car_data_next_states_seed_2.h5')
            current_state_data = dd.io.load('./seed_2_data/car_data_prev_states_seed_2.h5')
            cost_data = dd.io.load('./seed_2_data/car_data_rewards_seed_2.h5')

            frame_gray_scale = np.zeros((len(frame_data), 96, 96)).astype('float32')
            for i in range(len(frame_data)):
                frame_gray_scale[i, :, :] = np.dot(
                    frame_data[i, :, :, :] / 255., [0.299, 0.587, 0.114])

            problem.dataset.data = {
                'frames': frame_gray_scale,
                'prev_states': current_state_data,
                'next_states': next_state_data,
                'a': action_data,
                'c': cost_data[:, 0],
                'g': cost_data[:, 1:],
                'done': done_data
            }
            problem.dataset.data['g'] = problem.dataset.data['g'][:, constraints_cared_about]
            # problem.dataset.data['g'] = (problem.dataset.data['g'] >= constraint_thresholds[:-1]).astype(int)
            print 'Preprocessed g. Time elapsed: %s' % (time.time() - tic)
        else:
            raise
    except:
        print 'Failed to load'
        print 'Recreating dataset'
        num_goal = 0
        num_hole = 0
        dataset_size = 0
        main_tic = time.time()
        # from layer_visualizer import LayerVisualizer; LV = LayerVisualizer(exploratory_policy_old.policy.Q.model)
        for i in range(max_epochs):
            tic = time.time()
            x = env.reset()
            problem.collect(x, start=True)
            dataset_size += 1
            if env_name in ['car']:
                env.render()
            done = False
            time_steps = 0
            episode_cost = 0
            while not done:
                time_steps += 1
                if env_name in ['car']:
                    # epsilon decay
                    exploratory_policy_old.epsilon = 1. - np.exp(-3 * (i / float(max_epochs)))

                # LV.display_activation([problem.dataset.current_state()[np.newaxis,...], np.atleast_2d(np.eye(12)[0])], 2, 2, 0)
                action = exploratory_policy_old(
                    [problem.dataset.current_state()], x_preprocessed=False)[0]

                cost = []
                for _ in range(frame_skip):
                    if env_name in ['car']:
                        env.render()
                    x_prime, costs, done, _ = env.step(action_space_map[action])
                    cost.append(costs)
                    if done:
                        break
                cost = np.vstack([np.hstack(x) for x in cost]).sum(axis=0)

                early_done, punishment = env.is_early_episode_termination(
                    cost=cost[0], time_steps=time_steps, total_cost=episode_cost)
                # print cost, action_space_map[action]
                # env.car.fuel_spent/ENGINE_POWER, env.tile_visited_count, len(env.track), env.tile_visited_count/float(len(env.track))
                done = done or early_done

                # if done and reward: num_goal += 1
                # if done and not reward: num_hole += 1
                episode_cost += cost[0] + punishment
                c = (cost[0] + punishment).tolist()
                g = cost[1:].tolist()
                if len(g) < len(constraints):
                    g = np.hstack([g, 0])

                problem.collect(
                    action,
                    x_prime,  # np.dot(x_prime/255. , [0.299, 0.587, 0.114]),
                    np.hstack([c, g]).reshape(-1).tolist(),
                    done)  # {(x, a, x', c(x,a), g(x,a)^T, done)}
                dataset_size += 1

                x = x_prime

            if (i % 1) == 0:
                print 'Epoch: %s. Exploration probability: %s' % (
                    i, np.round(exploratory_policy_old.epsilon, 5))
                print 'Dataset size: %s Time Elapsed: %s. Total time: %s' % (
                    dataset_size, time.time() - tic, time.time() - main_tic)
                if env_name in ['car']:
                    print 'Performance: %s/%s = %s' % (
                        env.tile_visited_count, len(env.track),
                        env.tile_visited_count / float(len(env.track)))
                print '*' * 20

        problem.finish_collection(env_name)

        if env_name in ['lake']:
            problem.dataset['x'] = problem.dataset['frames'][problem.dataset['prev_states']]
            problem.dataset['x_prime'] = problem.dataset['frames'][problem.dataset['next_states']]
            problem.dataset['g'] = problem.dataset['g'][:, 0:1]

            print 'x Distribution:'
            print np.histogram(problem.dataset['x'],
                               bins=np.arange(map_size**2 + 1) - .5)[0].reshape(map_size, map_size)
            print 'x_prime Distribution:'
            print np.histogram(problem.dataset['x_prime'],
                               bins=np.arange(map_size**2 + 1) - .5)[0].reshape(map_size, map_size)
            print 'Number episodes achieved goal: %s. Number episodes fell in hole: %s' % (
                -problem.dataset['c'].sum(axis=0), problem.dataset['g'].sum(axis=0)[0])

            number_of_total_state_action_pairs = (
                state_space_dim - np.sum(env.desc == 'H') - np.sum(env.desc == 'G')) * action_space_dim
            number_of_state_action_pairs_seen = len(
                np.unique(np.hstack([
                    problem.dataset['x'].reshape(1, -1).T,
                    problem.dataset['a'].reshape(1, -1).T
                ]), axis=0))
            print 'Percentage of State/Action space seen: %s' % (
                number_of_state_action_pairs_seen / float(number_of_total_state_action_pairs))

    # print 'C(pi_old): %s. G(pi_old): %s' % (exact_policy_algorithm.run(exploratory_policy_old, policy_is_greedy=False, to_monitor=True))

    ### Solve Batch Constrained Problem
    iteration = 0
    while not problem.is_over(policies, lambdas,
                              infinite_loop=infinite_loop,
                              calculate_gap=calculate_gap,
                              results_name=results_name,
                              policy_improvement_name=policy_improvement_name):
        iteration += 1
        K.clear_session()
        for i in range(1):
            # policy_printer.pprint(policies)
            print '*' * 20
            print 'Iteration %s, %s' % (iteration, i)
            print
            if len(lambdas) == 0:
                # first iteration
                lambdas.append(online_convex_algorithm.get())
                print 'lambda_{0}_{2} = {1}'.format(iteration, lambdas[-1], i)
            else:
                # all other iterations
                lambda_t = problem.online_algo()
                lambdas.append(lambda_t)
                print 'lambda_{0}_{3} = online-algo(pi_{1}_{3}) = {2}'.format(
                    iteration, iteration - 1, lambdas[-1], i)

            lambda_t = lambdas[-1]
            pi_t, values = problem.best_response(
                lambda_t, desc='FQI pi_{0}_{1}'.format(iteration, i),
                exact=exact_policy_algorithm)

            # policies.append(pi_t)
            problem.update(pi_t, values, iteration)  # Evaluate C(pi_t), G(pi_t) and save
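# Aside: ExponentiatedGradient (the online learner that picks lambda) is constructed above
# but not shown in this file. A minimal NumPy sketch of the multiplicative-weights update
# the name refers to, under assumed conventions (lambda kept on a simplex scaled by
# lambda_bound, the "gradient" taken to be the constraint violations); the repo's own
# signs and normalization may differ.
import numpy as np

def exponentiated_gradient_step(lam, violation, eta, lambda_bound):
    # raise lambda_i multiplicatively when constraint i is violated, then rescale
    # so the weights still sum to lambda_bound
    w = lam * np.exp(eta * violation)
    return lambda_bound * w / w.sum()

lam = np.ones(3)                        # e.g. one weight per constraint (plus a slack term)
violation = np.array([0.2, -0.1, 0.0])  # G_i(pi_t) - threshold_i for the current policy
lam = exponentiated_gradient_step(lam, violation, eta=0.5, lambda_bound=30.0)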
def __init__(self, scope, ob_space, ac_space,
             policy_size='normal', maxpool=False, extrahid=True, hidsize=128,
             memsize=128, rec_gate_init=0.0,
             update_ob_stats_independently_per_gpu=True,
             proportion_of_exp_used_for_predictor_update=1.,
             dynamics_bonus=False, num_agents=1, rnd_type='rnd',
             div_type='oracle', indep_rnd=False, indep_policy=False,
             sd_type='oracle', rnd_mask_prob=1.):
    StochasticPolicy.__init__(self, scope, ob_space, ac_space)
    self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
    enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
    rep_size = 512

    self.rnd_mask = tf.placeholder(dtype=tf.float32, shape=(None, None, num_agents), name="rnd_mask")
    self.new_rnd_mask = tf.placeholder(dtype=tf.float32, shape=(None, None), name="new_rnd_mask")
    self.div_train_mask = tf.placeholder(dtype=tf.float32, shape=(None, None), name="div_train_mask")
    self.sample_agent_prob = tf.placeholder(dtype=tf.float32, shape=(None, None), name="sample_agent_prob")
    self.stage_label = tf.placeholder(dtype=tf.int32, shape=(None, None), name="stage_label")

    self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obmean")
    self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obstd")
    self.ph_count = tf.placeholder(dtype=tf.float32, shape=(), name="obcount")

    self.sep_ph_mean = tf.placeholder(dtype=tf.float32,
                                      shape=(None, None) + ob_space.shape[:2] + (1,),
                                      name="sep_obmean")
    self.sep_ph_std = tf.placeholder(dtype=tf.float32,
                                     shape=(None, None) + ob_space.shape[:2] + (1,),
                                     name="sep_obstd")
    self.sep_ph_count = tf.placeholder(dtype=tf.float32, shape=(), name="sep_obcount")

    self.game_score = tf.placeholder(dtype=tf.float32, shape=(None, None), name="game_score")
    self.last_rew_ob = tf.placeholder(dtype=ob_space.dtype,
                                      shape=(None, None) + tuple(ob_space.shape),
                                      name="last_rew_ob")

    self.div_ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="div_obmean")
    self.div_ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="div_obstd")

    self.idle_agent_label = tf.placeholder(dtype=tf.int32, shape=(None, None), name="idle_agent_label")
    self.rew_agent_label = tf.placeholder(dtype=tf.int32, shape=(None, None), name="rew_agent_label")

    # self.var_ph_mean = tf.get_variable("var_ph_mean", list(ob_space.shape[:2])+[1], initializer=tf.constant_initializer(0.0))
    # self.var_ph_std = tf.get_variable("var_ph_std", list(ob_space.shape[:2])+[1], initializer=tf.constant_initializer(0.0))
    # self.var_ph_count = tf.get_variable("var_ph_count", (), initializer=tf.constant_initializer(0.0))

    self.sd_ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="sd_obmean")
    self.sd_ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="sd_obstd")

    memsize *= enlargement
    hidsize *= enlargement
    convfeat = 16 * enlargement

    self.ob_rms_list = [RunningMeanStd(shape=list(ob_space.shape[:2]) + [1],
                                       use_mpi=not update_ob_stats_independently_per_gpu)
                        for _ in range(num_agents)]
    self.ob_rms = RunningMeanStd(
        shape=list(ob_space.shape[:2]) + [1],
        use_mpi=not update_ob_stats_independently_per_gpu)
    self.diversity_ob_rms = RunningMeanStd(
        shape=list(ob_space.shape[:2]) + [1],
        use_mpi=not update_ob_stats_independently_per_gpu)

    ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name='state')
    pdparamsize = self.pdtype.param_shape()[0]

    self.memsize = memsize
    self.num_agents = num_agents
    self.indep_rnd = indep_rnd
    self.indep_policy = indep_policy

    if num_agents <= 0:
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_policy(self.ph_ob[None][:, :-1],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=False,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps - 1,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init)
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_policy(self.ph_ob[None],
                              ph_new=self.ph_new,
                              ph_istate=ph_istate,
                              reuse=True,
                              scope=scope,
                              hidsize=hidsize,
                              memsize=memsize,
                              extrahid=extrahid,
                              sy_nenvs=self.sy_nenvs,
                              sy_nsteps=self.sy_nsteps,
                              pdparamsize=pdparamsize,
                              rec_gate_init=rec_gate_init)
    else:
        self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt = \
            self.apply_multi_head_policy(self.ph_ob[None][:, :-1],
                                         ph_new=self.ph_new,
                                         ph_istate=ph_istate,
                                         reuse=False,
                                         scope=scope,
                                         hidsize=hidsize,
                                         memsize=memsize,
                                         extrahid=extrahid,
                                         sy_nenvs=self.sy_nenvs,
                                         sy_nsteps=self.sy_nsteps - 1,
                                         pdparamsize=pdparamsize,
                                         rec_gate_init=rec_gate_init)
        self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout = \
            self.apply_multi_head_policy(self.ph_ob[None],
                                         ph_new=self.ph_new,
                                         ph_istate=ph_istate,
                                         reuse=True,
                                         scope=scope,
                                         hidsize=hidsize,
                                         memsize=memsize,
                                         extrahid=extrahid,
                                         sy_nenvs=self.sy_nenvs,
                                         sy_nsteps=self.sy_nsteps,
                                         pdparamsize=pdparamsize,
                                         rec_gate_init=rec_gate_init)

    if dynamics_bonus:
        self.define_dynamics_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
    else:
        # self.define_self_prediction_rew(convfeat=convfeat, rep_size=rep_size, enlargement=enlargement)
        self.aux_loss, self.int_rew, self.feat_var, self.max_feat = \
            self.define_multi_head_self_prediction_rew(convfeat=convfeat,
                                                       rep_size=rep_size,
                                                       enlargement=enlargement)

    self.stage_rnd = tf.constant(1.)
    self.stage_prob = tf.constant(1.)

    if div_type == 'cls':
        with tf.variable_scope("div", reuse=False):
            # self.define_rew_discriminator(convfeat=convfeat, rep_size=256)
            with tf.variable_scope("int", reuse=False):
                self.disc_logits, self.all_div_prob, self.sp_prob, self.div_rew, self.disc_pd, self.disc_nlp = \
                    self.define_rew_discriminator_v2(convfeat=convfeat, rep_size=512, use_rew=True)
    else:
        self.div_rew = tf.constant(0.)

    pd = self.pdtype.pdfromflat(self.pdparam_rollout)
    self.a_samp = pd.sample()
    self.nlp_samp = pd.neglogp(self.a_samp)
    self.entropy_rollout = pd.entropy()
    self.pd_rollout = pd
    self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
    self.ph_istate = ph_istate
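# Aside: rnd_mask above has shape (nenvs, nsteps, num_agents) and is presumably consumed by
# define_multi_head_self_prediction_rew to decide which agent's predictor head each sample
# trains. A NumPy sketch of a masked per-head loss under that assumption; the per-head
# prediction errors here are made up.
import numpy as np

rng = np.random.RandomState(0)
nenvs, nsteps, num_agents = 4, 8, 3
per_head_error = rng.rand(nenvs, nsteps, num_agents)                     # ||predictor_k(o) - target_k(o)||^2
rnd_mask = (rng.rand(nenvs, nsteps, num_agents) < 0.5).astype(np.float32)

# each head's auxiliary loss averages only over the samples assigned to it
masked_loss = (per_head_error * rnd_mask).sum(axis=(0, 1)) \
    / np.maximum(rnd_mask.sum(axis=(0, 1)), 1.0)   # shape (num_agents,)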
def __init__(self, scope, ob_space, ac_space,
             policy_size='normal', maxpool=False, extrahid=True, hidsize=128,
             memsize=128, rec_gate_init=0.0,
             update_ob_stats_independently_per_gpu=True,
             proportion_of_exp_used_for_predictor_update=1.,
             dynamics_bonus=False,
             action_balance_coef=1., array_action=True):
    StochasticPolicy.__init__(self, scope, ob_space, ac_space)
    self.proportion_of_exp_used_for_predictor_update = proportion_of_exp_used_for_predictor_update
    self.action_balance_coef = action_balance_coef
    self.array_action = array_action
    self.enlargement = {'small': 1, 'normal': 2, 'large': 4}[policy_size]
    self.rep_size = 512
    self.ph_mean = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obmean")
    self.ph_std = tf.placeholder(dtype=tf.float32, shape=list(ob_space.shape[:2]) + [1], name="obstd")
    memsize *= self.enlargement
    hidsize *= self.enlargement
    self.convfeat = 16 * self.enlargement
    self.ob_rms = RunningMeanStd(
        shape=list(ob_space.shape[:2]) + [1],
        use_mpi=not update_ob_stats_independently_per_gpu)
    ph_istate = tf.placeholder(dtype=tf.float32, shape=(None, memsize), name='state')
    pdparamsize = self.pdtype.param_shape()[0]
    self.memsize = memsize

    # self.int_rew_ab = None
    # self.int_rew_ab_opt = None
    if self.action_balance_coef is not None:
        # self.action_one_hot_list_rollout = get_action_one_hot_list(self.ac_space.n, self.sy_nenvs, self.sy_nsteps)
        # self.action_one_hot_list_opt = get_action_one_hot_list(self.ac_space.n, self.sy_nenvs, self.sy_nsteps - 1)
        # with tf.device('/cpu:0'):
        self.action_one_hot_rollout = get_action_one_hot(self.ac_space.n, self.sy_nenvs, self.sy_nsteps)
        # self.action_one_hot_list_opt = get_action_one_hot(self.ac_space.n, self.sy_nenvs, self.sy_nsteps - 1)
        if self.array_action:
            # with tf.device('/cpu:0'):
            self.action_encode_array_rollout = get_action_encode_array(
                self.ac_space.n, self.sy_nenvs, self.sy_nsteps, ob_space.shape[:2])
            # self.action_encode_array_rollout, self.split_lengths = get_action_encode_array(
            #     self.ac_space.n, self.sy_nenvs, self.sy_nsteps, ob_space.shape[:2])

        self.feat_var_ab, self.max_feat_ab, self.int_rew_ab, self.int_rew_ab_rollout, self.aux_loss_ab = \
            self.define_action_balance_rew(ph_ob=self.ph_ob[None],
                                           action_one_hot=self.action_one_hot_rollout,
                                           convfeat=self.convfeat,
                                           rep_size=self.rep_size,
                                           enlargement=self.enlargement,
                                           sy_nenvs=self.sy_nenvs,
                                           sy_nsteps=self.sy_nsteps)
        # self.feat_var_ab_opt, self.max_feat_ab_opt, self.int_rew_ab_opt, self.aux_loss_ab = \
        #     self.define_action_balance_rew(ph_ob=self.ph_ob[None][:, :-1],
        #                                    action_one_hot=self.action_one_hot_list_opt,
        #                                    convfeat=self.convfeat,
        #                                    rep_size=self.rep_size,
        #                                    enlargement=self.enlargement,
        #                                    sy_nenvs=self.sy_nenvs,
        #                                    sy_nsteps=self.sy_nsteps - 1)

        self.pd_ab = self.pdtype.pdfromflat(self.int_rew_ab)

    # Inputs to policy and value function will have different shapes depending on whether
    # it is rollout or optimization time, so we treat them separately.
    self.pdparam_opt, self.vpred_int_opt, self.vpred_ext_opt, self.snext_opt, self.logits_raw_opt = \
        self.apply_policy(self.ph_ob[None][:, :-1],
                          reuse=False,
                          scope=scope,
                          hidsize=hidsize,
                          memsize=memsize,
                          extrahid=extrahid,
                          sy_nenvs=self.sy_nenvs,
                          sy_nsteps=self.sy_nsteps - 1,
                          pdparamsize=pdparamsize)
    self.pdparam_rollout, self.vpred_int_rollout, self.vpred_ext_rollout, self.snext_rollout, _ = \
        self.apply_policy(self.ph_ob[None],
                          reuse=True,
                          scope=scope,
                          hidsize=hidsize,
                          memsize=memsize,
                          extrahid=extrahid,
                          sy_nenvs=self.sy_nenvs,
                          sy_nsteps=self.sy_nsteps,
                          pdparamsize=pdparamsize)

    if dynamics_bonus:
        self.define_dynamics_prediction_rew(convfeat=self.convfeat, rep_size=self.rep_size,
                                            enlargement=self.enlargement)
    else:
        self.define_self_prediction_rew(convfeat=self.convfeat, rep_size=self.rep_size,
                                        enlargement=self.enlargement)

    pd = self.pdtype.pdfromflat(self.pdparam_rollout)
    self.a_samp = pd.sample()
    self.nlp_samp = pd.neglogp(self.a_samp)
    self.entropy_rollout = pd.entropy()
    self.pd_rollout = pd
    self.pd_opt = self.pdtype.pdfromflat(self.pdparam_opt)
    self.ph_istate = ph_istate
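# Aside: get_action_one_hot / get_action_encode_array are repo helpers not shown here. A
# guess at what the one-hot helper produces, sketched in NumPy: one one-hot row per candidate
# action at every (env, step) slot, so the action-balance predictor can score the novelty of
# every action at the current observation. Sizes and the exact layout are assumptions.
import numpy as np

def action_one_hot_sketch(n_actions, nenvs, nsteps):
    # (nenvs, nsteps, n_actions, n_actions): an identity matrix broadcast over env/step slots
    eye = np.eye(n_actions, dtype=np.float32)
    return np.broadcast_to(eye, (nenvs, nsteps, n_actions, n_actions)).copy()

one_hot = action_one_hot_sketch(n_actions=18, nenvs=4, nsteps=8)

# int_rew_ab would then hold one novelty score per action; pdfromflat (as used for self.pd_ab
# above) turns those scores into a categorical distribution, so rarely tried actions can be
# favored. The scores below are random stand-ins.
fake_int_rew_ab = np.random.RandomState(0).rand(4, 8, 18)
probs = np.exp(fake_int_rew_ab) / np.exp(fake_int_rew_ab).sum(axis=-1, keepdims=True)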