class PAACLearner(ActorLearner):
    def __init__(self, network_creator, environment_creator, args):
        super(PAACLearner, self).__init__(network_creator, environment_creator, args)
        self.workers = args.emulator_workers

    @staticmethod
    def choose_next_actions(network, num_actions, states, session):
        network_output_v, network_output_pi = session.run(
            [network.output_layer_v, network.output_layer_pi],
            feed_dict={network.input_ph: states})
        action_indices = PAACLearner.__sample_policy_action(network_output_pi)
        new_actions = np.eye(num_actions)[action_indices]
        return new_actions, network_output_v, network_output_pi

    def __choose_next_actions(self, states):
        return PAACLearner.choose_next_actions(self.network, self.num_actions, states, self.session)

    @staticmethod
    def __sample_policy_action(probs):
        """
        Sample an action from an action probability distribution output by
        the policy network.
        """
        # Subtract a tiny value from probabilities in order to avoid
        # "ValueError: sum(pvals[:-1]) > 1.0" in numpy.multinomial
        probs = probs - np.finfo(np.float32).epsneg
        action_indices = [int(np.nonzero(np.random.multinomial(1, p))[0]) for p in probs]
        return action_indices

    def _get_shared(self, array, dtype=c_float):
        """
        Returns a RawArray backed numpy array that can be shared between processes.
        :param array: the array to be shared
        :param dtype: the RawArray dtype to use
        :return: the RawArray backed numpy array
        """
        shape = array.shape
        shared = RawArray(dtype, array.reshape(-1))
        return np.frombuffer(shared, dtype).reshape(shape)

    def train(self):
        """
        Main actor learner loop for parallel advantage actor critic learning.
        """
        self.global_step = self.init_network()
        logging.debug("Starting training at Step {}".format(self.global_step))
        counter = 0
        global_step_start = self.global_step
        total_rewards = []

        # state, reward, episode_over, action
        variables = [np.asarray([emulator.get_initial_state() for emulator in self.emulators], dtype=np.uint8),
                     np.zeros(self.emulator_counts, dtype=np.float32),
                     np.asarray([False] * self.emulator_counts, dtype=np.float32),
                     np.zeros((self.emulator_counts, self.num_actions), dtype=np.float32)]

        self.runners = Runners(EmulatorRunner, self.emulators, self.workers, variables)
        self.runners.start()
        shared_states, shared_rewards, shared_episode_over, shared_actions = self.runners.get_shared_variables()

        summaries_op = tf.summary.merge_all()

        emulator_steps = [0] * self.emulator_counts
        total_episode_rewards = self.emulator_counts * [0]

        actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        states = np.zeros([self.max_local_steps] + list(shared_states.shape), dtype=np.uint8)
        actions = np.zeros((self.max_local_steps, self.emulator_counts, self.num_actions))
        values = np.zeros((self.max_local_steps, self.emulator_counts))
        episodes_over_masks = np.zeros((self.max_local_steps, self.emulator_counts))

        start_time = time.time()
        while self.global_step < self.max_global_steps:

            loop_start_time = time.time()
            max_local_steps = self.max_local_steps
            for t in range(max_local_steps):
                next_actions, readouts_v_t, readouts_pi_t = self.__choose_next_actions(shared_states)
                actions_sum += next_actions
                for z in range(next_actions.shape[0]):
                    shared_actions[z] = next_actions[z]

                actions[t] = next_actions
                values[t] = readouts_v_t
                states[t] = shared_states

                # Start updating all environments with next_actions
                self.runners.update_environments()
                self.runners.wait_updated()
                # Done updating all environments, have new states, rewards and is_over

                episodes_over_masks[t] = 1.0 - shared_episode_over.astype(np.float32)

                for e, (actual_reward, episode_over) in enumerate(zip(shared_rewards, shared_episode_over)):
                    total_episode_rewards[e] += actual_reward
                    actual_reward = self.rescale_reward(actual_reward)
                    rewards[t, e] = actual_reward

                    emulator_steps[e] += 1
                    self.global_step += 1
                    if episode_over:
                        total_rewards.append(total_episode_rewards[e])
                        episode_summary = tf.Summary(value=[
                            tf.Summary.Value(tag='rl/reward', simple_value=total_episode_rewards[e]),
                            tf.Summary.Value(tag='rl/episode_length', simple_value=emulator_steps[e]),
                        ])
                        self.summary_writer.add_summary(episode_summary, self.global_step)
                        self.summary_writer.flush()
                        total_episode_rewards[e] = 0
                        emulator_steps[e] = 0
                        actions_sum[e] = np.zeros(self.num_actions)

            # bootstrap the n-step returns from the critic's estimate of the state after the last local step
            next_state_value = self.session.run(
                self.network.output_layer_v,
                feed_dict={self.network.input_ph: shared_states})

            estimated_return = np.copy(next_state_value)

            for t in reversed(range(max_local_steps)):
                estimated_return = rewards[t] + self.gamma * estimated_return * episodes_over_masks[t]
                y_batch[t] = np.copy(estimated_return)
                adv_batch[t] = estimated_return - values[t]

            flat_states = states.reshape([self.max_local_steps * self.emulator_counts] + list(shared_states.shape)[1:])
            flat_y_batch = y_batch.reshape(-1)
            flat_adv_batch = adv_batch.reshape(-1)
            flat_actions = actions.reshape(max_local_steps * self.emulator_counts, self.num_actions)

            lr = self.get_lr()
            feed_dict = {self.network.input_ph: flat_states,
                         self.network.critic_target_ph: flat_y_batch,
                         self.network.selected_action_ph: flat_actions,
                         self.network.adv_actor_ph: flat_adv_batch,
                         self.learning_rate: lr}

            _, summaries = self.session.run([self.train_step, summaries_op], feed_dict=feed_dict)

            self.summary_writer.add_summary(summaries, self.global_step)
            self.summary_writer.flush()

            counter += 1
            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                global_steps = self.global_step
                last_ten = 0.0 if len(total_rewards) < 1 else np.mean(total_rewards[-10:])
                logging.info("Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                             .format(global_steps,
                                     self.max_local_steps * self.emulator_counts / (curr_time - loop_start_time),
                                     (global_steps - global_step_start) / (curr_time - start_time),
                                     last_ten))
            self.save_vars()

        self.cleanup()

    def cleanup(self):
        super(PAACLearner, self).cleanup()
        self.runners.stop()
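# Illustrative sketch (not part of the original source): the backward sweep in
# train() above computes n-step discounted returns, R_t = r_t + gamma * R_{t+1} * mask_t,
# bootstrapped from the critic's value of the state reached after the last local step.
# A minimal standalone version for one emulator, with made-up numbers:
import numpy as np

gamma = 0.99
rollout_rewards = np.array([1.0, 0.0, 1.0])   # rewards for 3 local steps
masks = np.array([1.0, 1.0, 0.0])             # 0.0 where the episode ended
critic_values = np.array([0.9, 0.8, 0.7])     # V(s_t) recorded while acting
bootstrap_value = 0.5                         # critic estimate of the state after the last step

returns = np.zeros(3)
estimated_return = bootstrap_value
for t in reversed(range(3)):
    estimated_return = rollout_rewards[t] + gamma * estimated_return * masks[t]
    returns[t] = estimated_return
advantages = returns - critic_values
# returns == [1.9801, 0.99, 1.0]: the terminal step ignores the bootstrap value.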
class PAACLearner(ActorLearner):
    def __init__(self, network_creator, environment_creator, explo_policy, args):
        super(PAACLearner, self).__init__(network_creator, environment_creator, explo_policy, args)
        self.workers = args.emulator_workers
        self.total_repetitions = args.nb_choices
        self.lstm_bool = (args.arch == 'LSTM')
        self.tab_rep = explo_policy.tab_rep

        # add the run parameters to tensorboard
        sess = tf.InteractiveSession()
        with open(args.debugging_folder + "args.json", 'r') as file_args:
            text = str(file_args.read())
        summary_op = tf.summary.text('text', tf.convert_to_tensor(text))
        text = sess.run(summary_op)
        self.summary_writer.add_summary(text, 0)
        self.summary_writer.flush()
        sess.close()

    def _get_shared(self, array, dtype=c_float):
        """
        Returns a RawArray backed numpy array that can be shared between processes.
        :param array: the array to be shared
        :param dtype: the RawArray dtype to use
        :return: the RawArray backed numpy array
        """
        shape = array.shape
        shared = RawArray(dtype, array.reshape(-1))
        return np.frombuffer(shared, dtype).reshape(shape)

    def log_histogram(self, tag, values, step, bins=1000):
        """Logs the histogram of a list/vector of values"""
        counts, bin_edges = np.histogram(values, bins=bins)

        hist = tf.HistogramProto()
        hist.min = float(np.min(values))
        hist.max = float(np.max(values))
        hist.num = int(np.prod(values.shape))
        hist.sum = float(np.sum(values))
        hist.sum_squares = float(np.sum(values**2))

        bin_edges = bin_edges[1:]
        for edge in bin_edges:
            hist.bucket_limit.append(edge)
        for c in counts:
            hist.bucket.append(c)

        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
        self.summary_writer.add_summary(summary, step)
        self.summary_writer.flush()

    def log_values(self, values, tag, length=50, timestep=500):
        if len(values) > length and self.global_step % timestep == 0:
            mean = np.mean(values[-50:])
            std = np.std(values[-50:])
            summary = tf.Summary(value=[
                tf.Summary.Value(tag=tag + '/mean', simple_value=mean),
                tf.Summary.Value(tag=tag + '/min', simple_value=min(values[-50:])),
                tf.Summary.Value(tag=tag + '/max', simple_value=max(values[-50:])),
                tf.Summary.Value(tag=tag + '/std', simple_value=std),
                tf.Summary.Value(tag=tag + '/std_over_mean', simple_value=min(2, np.absolute(std / mean)))
            ])
            self.summary_writer.add_summary(summary, self.global_step)
            self.summary_writer.flush()

    def update_memory(self, memory, shared_states, whole_memory, t):
        # archive the window used at step t, then shift it and append the newest observation
        whole_memory[t] = memory
        memory[:, :-1, :, :, :] = memory[:, 1:, :, :, :]
        memory[:, -1, :, :, :] = shared_states
        return memory, whole_memory

    def train(self):
        """
        Main actor learner loop for parallel advantage actor critic learning.
        """
        self.global_step = self.init_network()
        global_step_start = self.global_step
        counter = 0
        total_rewards = []
        total_steps = []
        logging.debug("Starting training at Step {}".format(self.global_step))

        # state, reward, episode_over, action, repetition
        variables = [np.asarray([emulator.get_initial_state() for emulator in self.emulators], dtype=np.uint8),
                     np.zeros(self.emulator_counts, dtype=np.float32),
                     np.asarray([False] * self.emulator_counts, dtype=np.float32),
                     np.zeros((self.emulator_counts, self.num_actions), dtype=np.float32),
                     np.zeros((self.emulator_counts, self.total_repetitions), dtype=np.float32)]

        self.runners = Runners(self.tab_rep, EmulatorRunner, self.emulators, self.workers, variables)
        self.runners.start()
        shared_states, shared_rewards, shared_episode_over, shared_actions, shared_rep = self.runners.get_shared_variables()

        if self.lstm_bool:
            self.n_steps = 5
            memory = np.zeros(([self.emulator_counts, self.n_steps] + list(shared_states.shape)[1:]), dtype=np.uint8)
            whole_memory = np.zeros(([self.max_local_steps, self.emulator_counts, self.n_steps]
                                     + list(shared_states.shape)[1:]), dtype=np.uint8)
            for e in range(self.emulator_counts):
                memory[e, -1, :, :, :] = shared_states[e]

        summaries_op = tf.summary.merge_all()

        emulator_steps = [0] * self.emulator_counts
        total_episode_rewards = self.emulator_counts * [0]

        actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        states = np.zeros([self.max_local_steps] + list(shared_states.shape), dtype=np.uint8)
        actions = np.zeros((self.max_local_steps, self.emulator_counts, self.num_actions))
        repetitions = np.zeros((self.max_local_steps, self.emulator_counts, self.total_repetitions))
        values = np.zeros((self.max_local_steps, self.emulator_counts))
        episodes_over_masks = np.zeros((self.max_local_steps, self.emulator_counts))

        start_time = time.time()

        while self.global_step < self.max_global_steps:
            print('step : ' + str(self.global_step))

            loop_start_time = time.time()
            total_action_rep = np.zeros((self.num_actions, self.total_repetitions))
            nb_actions = 0

            max_local_steps = self.max_local_steps
            for t in range(max_local_steps):

                # Choose actions and repetitions for each emulator
                if not self.lstm_bool:
                    readouts_v_t, readouts_pi_t, readouts_rep_t = self.session.run(
                        [self.network.output_layer_v, self.network.output_layer_pi, self.network.output_layer_rep],
                        feed_dict={self.network.input_ph: shared_states})
                    new_actions, new_repetitions = self.explo_policy.choose_next_actions(
                        readouts_pi_t, readouts_rep_t, self.num_actions)
                else:
                    readouts_v_t, readouts_pi_t, readouts_rep_t = self.session.run(
                        [self.network.output_layer_v, self.network.output_layer_pi, self.network.output_layer_rep],
                        feed_dict={self.network.memory_ph: memory})
                    new_actions, new_repetitions = self.explo_policy.choose_next_actions(
                        readouts_pi_t, readouts_rep_t, self.num_actions)

                actions_sum += new_actions

                for e in range(self.emulator_counts):
                    nb_actions += np.argmax(new_repetitions[e]) + 1

                # share the actions and repetitions with the different workers
                for z in range(new_actions.shape[0]):
                    shared_actions[z] = new_actions[z]
                for z in range(new_repetitions.shape[0]):
                    shared_rep[z] = new_repetitions[z]

                actions[t] = new_actions
                values[t] = readouts_v_t
                states[t] = shared_states
                repetitions[t] = new_repetitions

                # Start updating all environments with next_actions
                self.runners.update_environments()
                self.runners.wait_updated()
                # Done updating all environments, have new states, rewards and is_over

                if self.lstm_bool:
                    memory, whole_memory = self.update_memory(memory, shared_states, whole_memory, t)

                episodes_over_masks[t] = 1.0 - shared_episode_over.astype(np.float32)

                for e, (actual_reward, episode_over) in enumerate(zip(shared_rewards, shared_episode_over)):
                    total_episode_rewards[e] += actual_reward
                    actual_reward = self.rescale_reward(actual_reward)
                    rewards[t, e] = actual_reward

                    emulator_steps[e] += self.tab_rep[np.argmax(new_repetitions[e])] + 1
                    self.global_step += 1

                    # fill the table used for the actions/repetitions histogram
                    a = np.argmax(new_actions[e])
                    r = np.argmax(new_repetitions[e])
                    total_action_rep[a][r] += 1

                    if episode_over:
                        total_rewards.append(total_episode_rewards[e])
                        total_steps.append(emulator_steps[e])
                        episode_summary = tf.Summary(value=[
                            tf.Summary.Value(tag='rl/reward', simple_value=total_episode_rewards[e]),
                            tf.Summary.Value(tag='rl/episode_length', simple_value=emulator_steps[e])
                        ])
                        self.summary_writer.add_summary(episode_summary, self.global_step)
                        self.summary_writer.flush()
                        total_episode_rewards[e] = 0
                        emulator_steps[e] = 0
                        if self.lstm_bool:
                            memory[e] = np.zeros(([self.n_steps] + list(shared_states.shape)[1:]), dtype=np.uint8)
                        actions_sum[e] = np.zeros(self.num_actions)

            ## plot output of conv layers
            # with tf.name_scope('Summary_ConvNet'):
            #     if self.global_step % (10000 * self.emulator_counts * self.max_local_steps) == 0:
            #         convs = self.session.run(self.network.convs,
            #                                  feed_dict={self.network.input_ph: [shared_states[0]]})
            #         imgs = [np.array([utils.plot_conv_output(conv)]) for conv in convs]
            #         sums = [tf.summary.image('conv' + str(i), imgs[i], 1) for i in range(len(imgs))]
            #         real_sums = self.session.run(sums)
            #         for s in real_sums:
            #             self.summary_writer.add_summary(s, self.global_step)
            #         self.summary_writer.flush()

            if self.lstm_bool:
                next_state_value = self.session.run(
                    self.network.output_layer_v,
                    feed_dict={self.network.memory_ph: memory})
            else:
                next_state_value = self.session.run(
                    self.network.output_layer_v,
                    feed_dict={self.network.input_ph: shared_states})

            estimated_return = np.copy(next_state_value)

            for t in reversed(range(max_local_steps)):
                estimated_return = rewards[t] + self.gamma * estimated_return * episodes_over_masks[t]
                y_batch[t] = np.copy(estimated_return)
                adv_batch[t] = estimated_return - values[t]

            if self.lstm_bool:
                flat_states = whole_memory.reshape([self.max_local_steps * self.emulator_counts, self.n_steps]
                                                   + list(shared_states.shape)[1:])
            else:
                flat_states = states.reshape([self.max_local_steps * self.emulator_counts]
                                             + list(shared_states.shape)[1:])
            flat_y_batch = y_batch.reshape(-1)
            flat_adv_batch = adv_batch.reshape(-1)
            flat_actions = actions.reshape(max_local_steps * self.emulator_counts, self.num_actions)
            flat_rep = repetitions.reshape(max_local_steps * self.emulator_counts, self.total_repetitions)

            lr = self.get_lr()
            feed_dict = {self.network.critic_target_ph: flat_y_batch,
                         self.network.selected_action_ph: flat_actions,
                         self.network.selected_repetition_ph: flat_rep,
                         self.network.adv_actor_ph: flat_adv_batch,
                         self.learning_rate: lr}
            if self.lstm_bool:
                feed_dict[self.network.memory_ph] = flat_states
            else:
                feed_dict[self.network.input_ph] = flat_states

            _, summaries = self.session.run([self.train_step, summaries_op], feed_dict=feed_dict)

            self.summary_writer.add_summary(summaries, self.global_step)
            param_summary = tf.Summary(value=[tf.Summary.Value(tag='parameters/lr', simple_value=lr)])
            self.summary_writer.add_summary(param_summary, self.global_step)
            self.summary_writer.flush()

            self.log_values(total_rewards, 'rewards_per_episode')
            self.log_values(total_steps, 'steps_per_episode')

            # add the actions/repetitions histograms
            nb_a = [sum(a) for a in total_action_rep]
            nb_r = [sum(r) for r in np.transpose(total_action_rep)]
            histo_a, histo_r = [], []
            for i in range(self.num_actions):
                histo_a += [i] * int(nb_a[i])
            for i in range(self.total_repetitions):
                histo_r += [self.tab_rep[i] + 1] * int(nb_r[i])
            self.log_histogram('actions', np.array(histo_a), self.global_step)
            self.log_histogram('repetitions', np.array(histo_r), self.global_step)

            counter += 1
            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                last_ten = 0.0 if len(total_rewards) < 1 else np.mean(total_rewards[-10:])
                steps_per_sec = self.max_local_steps * self.emulator_counts / (curr_time - loop_start_time)
                actions_per_s = nb_actions / (curr_time - loop_start_time)
                average_steps_per_sec = (self.global_step - global_step_start) / (curr_time - start_time)
                logging.info("Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                             .format(self.global_step, steps_per_sec, average_steps_per_sec, last_ten))

                stats_summary = tf.Summary(value=[
                    tf.Summary.Value(tag='stats/steps_per_s', simple_value=steps_per_sec),
                    tf.Summary.Value(tag='stats/average_steps_per_s', simple_value=average_steps_per_sec),
                    tf.Summary.Value(tag='stats/actions_per_s', simple_value=actions_per_s)
                ])
                self.summary_writer.add_summary(stats_summary, self.global_step)
                self.summary_writer.flush()

            self.save_vars()

        self.cleanup()

    def cleanup(self):
        super(PAACLearner, self).cleanup()
        self.runners.stop()
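# Illustrative sketch (not from the original source): update_memory() above keeps, per
# emulator, a rolling window of the last n_steps observations for the LSTM branch, while
# whole_memory archives the window as it looked when the policy acted at local step t.
# A minimal one-emulator version with scalar "frames" and made-up sizes:
import numpy as np

n_steps, rollout_len = 5, 3
memory = np.zeros(n_steps)                       # sliding window of recent frames
whole_memory = np.zeros((rollout_len, n_steps))  # window snapshot per local step

for t, new_frame in enumerate([1.0, 2.0, 3.0]):
    whole_memory[t] = memory          # snapshot fed to the network at step t
    memory[:-1] = memory[1:]          # shift the window left by one frame
    memory[-1] = new_frame            # append the newest observation
# memory is now [0, 0, 1, 2, 3]; whole_memory[2] is [0, 0, 0, 1, 2].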
class PAACLearner(ActorLearner):
    def __init__(self, network_creator, environment_creator, args):
        super(PAACLearner, self).__init__(network_creator, environment_creator, args)
        self.workers = args.emulator_workers
        self.latest_ckpt = "-0"
        self.send_batch_queue = Queue()
        self.flask_file_server_proc = Process(target=flask_file_server.run,
                                              kwargs={'host': '127.0.0.1', 'port': 6668})
        self.send_zmq_batch_data_proc = Process(target=send_zmq_batch_data,
                                                kwargs={'queue': self.send_batch_queue})

    @staticmethod
    def choose_next_actions(network, num_actions, states, session):
        network_output_v, network_output_pi = session.run(
            [network.output_layer_v, network.output_layer_pi],
            feed_dict={network.input_ph: states})
        action_indices = PAACLearner.__sample_policy_action(network_output_pi)
        new_actions = np.eye(num_actions)[action_indices]
        return new_actions, network_output_v, network_output_pi

    def __choose_next_actions(self, states):
        return PAACLearner.choose_next_actions(self.network, self.num_actions, states, self.session)

    @staticmethod
    def __sample_policy_action(probs):
        """
        Sample an action from an action probability distribution output by
        the policy network.
        """
        # Subtract a tiny value from probabilities in order to avoid
        # "ValueError: sum(pvals[:-1]) > 1.0" in numpy.multinomial
        probs = probs - np.finfo(np.float32).epsneg
        action_indices = [int(np.nonzero(np.random.multinomial(1, p))[0]) for p in probs]
        return action_indices

    def _get_shared(self, array, dtype=c_float):
        """
        Returns a RawArray backed numpy array that can be shared between processes.
        :param array: the array to be shared
        :param dtype: the RawArray dtype to use
        :return: the RawArray backed numpy array
        """
        shape = array.shape
        shared = RawArray(dtype, array.reshape(-1))
        return np.frombuffer(shared, dtype).reshape(shape)

    def train(self):
        """
        Main actor loop for parallel advantage actor critic learning. This variant only
        collects rollouts and pushes them onto the send queue; the gradient step is done
        by a remote GPU learner whose checkpoints are periodically restored here.
        """
        self.flask_file_server_proc.start()
        self.send_zmq_batch_data_proc.start()

        self.global_step = self.init_network()
        logging.debug("Starting training at Step {}".format(self.global_step))
        counter = 0
        global_step_start = self.global_step
        total_rewards = []

        # state, reward, episode_over, action
        variables = [np.asarray([emulator.get_initial_state() for emulator in self.emulators], dtype=np.uint8),
                     np.zeros(self.emulator_counts, dtype=np.float32),
                     np.asarray([False] * self.emulator_counts, dtype=np.float32),
                     np.zeros((self.emulator_counts, self.num_actions), dtype=np.float32)]

        self.runners = Runners(EmulatorRunner, self.emulators, self.workers, variables)
        self.runners.start()
        shared_states, shared_rewards, shared_episode_over, shared_actions = self.runners.get_shared_variables()

        summaries_op = tf.summary.merge_all()

        emulator_steps = [0] * self.emulator_counts
        total_episode_rewards = self.emulator_counts * [0]

        actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        # one extra slot so the state reached after the last local step can be stored in states[-1]
        states = np.zeros([self.max_local_steps + 1] + list(shared_states.shape), dtype=np.uint8)
        actions = np.zeros((self.max_local_steps, self.emulator_counts, self.num_actions))
        values = np.zeros((self.max_local_steps, self.emulator_counts))
        episodes_over_masks = np.zeros((self.max_local_steps, self.emulator_counts))

        start_time = time.time()
        while self.global_step < self.max_global_steps:

            loop_start_time = time.time()
            max_local_steps = self.max_local_steps
            for t in range(max_local_steps):
                next_actions, readouts_v_t, readouts_pi_t = self.__choose_next_actions(shared_states)
                actions_sum += next_actions
                for z in range(next_actions.shape[0]):
                    shared_actions[z] = next_actions[z]

                actions[t] = next_actions
                values[t] = readouts_v_t
                states[t] = shared_states

                # Start updating all environments with next_actions
                self.runners.update_environments()
                self.runners.wait_updated()
                # Done updating all environments, have new states, rewards and is_over

                episodes_over_masks[t] = 1.0 - shared_episode_over.astype(np.float32)

                for e, (actual_reward, episode_over) in enumerate(zip(shared_rewards, shared_episode_over)):
                    total_episode_rewards[e] += actual_reward
                    actual_reward = self.rescale_reward(actual_reward)
                    rewards[t, e] = actual_reward

                    emulator_steps[e] += 1
                    self.global_step += 1
                    if episode_over:
                        total_rewards.append(total_episode_rewards[e])
                        episode_summary = tf.Summary(value=[
                            tf.Summary.Value(tag='rl/reward', simple_value=total_episode_rewards[e]),
                            tf.Summary.Value(tag='rl/episode_length', simple_value=emulator_steps[e]),
                        ])
                        self.summary_writer.add_summary(episode_summary, self.global_step)
                        self.summary_writer.flush()
                        total_episode_rewards[e] = 0
                        emulator_steps[e] = 0
                        actions_sum[e] = np.zeros(self.num_actions)

            # append the bootstrap state and ship the rollout to the GPU learner
            states[-1] = shared_states
            self.send_batch_queue.put([states, rewards, episodes_over_masks, actions, values])
            # states: (5,32,84,84,4), rewards: (5,32), over: (5,32), actions: (5,32,6)

            counter += 1
            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                global_steps = self.global_step
                last_ten = 0.0 if len(total_rewards) < 1 else np.mean(total_rewards[-10:])
                logging.info("Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                             .format(global_steps,
                                     self.max_local_steps * self.emulator_counts / (curr_time - loop_start_time),
                                     (global_steps - global_step_start) / (curr_time - start_time),
                                     last_ten))

            # restore the network if there is a new checkpoint from the GPU learner
            try:
                cur_ckpt = tf.train.latest_checkpoint(self.upload_checkpoint_folder)
                if cur_ckpt and self.latest_ckpt != cur_ckpt:
                    self.network_saver.restore(self.session, cur_ckpt)
                    if os.path.exists("/root/D3RL_ZMQ_Vtrace/logs/upload/" + str(self.latest_ckpt) + ".meta"):
                        os.system("rm /root/D3RL_ZMQ_Vtrace/logs/upload/" + str(self.latest_ckpt) + ".data-00000-of-00001")
                        os.system("rm /root/D3RL_ZMQ_Vtrace/logs/upload/" + str(self.latest_ckpt) + ".index")
                        os.system("rm /root/D3RL_ZMQ_Vtrace/logs/upload/" + str(self.latest_ckpt) + ".meta")
                    self.latest_ckpt = cur_ckpt
            except ValueError:
                # the checkpoint may still be being written; try again on the next iteration
                pass

        self.cleanup()

    def cleanup(self):
        super(PAACLearner, self).cleanup()
        self.runners.stop()
        self.flask_file_server_proc.terminate()
        self.send_zmq_batch_data_proc.terminate()
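# Illustrative sketch (not from the original source): this variant acts as a pure actor,
# pushing rollout batches onto a multiprocessing.Queue that a separate sender process
# (send_zmq_batch_data, defined elsewhere) forwards to the remote GPU learner. A minimal
# stand-in for that producer/consumer wiring, with a hypothetical consumer that just
# drains the queue:
from multiprocessing import Process, Queue


def consume_batches(queue):
    # Stand-in for the real sender: pull batches until a None sentinel arrives.
    while True:
        batch = queue.get()
        if batch is None:
            break
        states, rewards, dones, actions, values = batch
        # ... serialize and send the batch to the learner here ...


if __name__ == '__main__':
    q = Queue()
    sender = Process(target=consume_batches, kwargs={'queue': q})
    sender.start()
    q.put(([1, 2], [0.0, 1.0], [0, 0], [3, 1], [0.5, 0.4]))  # toy batch
    q.put(None)                                              # shut the consumer down
    sender.join()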
class PAACLearner(ActorLearner):
    def __init__(self, network_creator, environment_creator, args):
        super(PAACLearner, self).__init__(network_creator, environment_creator, args)
        self.workers = args.emulator_workers
        ########################################################################################
        self.network_creator = network_creator  # record the network creator in order to create good_network later
        ########################################################################################

    @staticmethod
    def choose_next_actions(network, num_actions, states, session):
        network_output_v, network_output_pi = session.run(
            [network.output_layer_v, network.output_layer_pi],
            feed_dict={network.input_ph: states})
        # print(session.run(network_output_pi))
        action_indices = PAACLearner.__sample_policy_action(network_output_pi)
        new_actions = np.eye(num_actions)[action_indices]
        return new_actions, network_output_v, network_output_pi

    def __choose_next_actions(self, states):
        return PAACLearner.choose_next_actions(self.network, self.num_actions, states, self.session)

    ########################################################################################
    def __choose_next_good_actions(self, states):
        # use good_network to choose actions
        return PAACLearner.choose_next_actions(self.good_network, self.num_actions, states, self.session)
    ########################################################################################

    @staticmethod
    def __sample_policy_action(probs):
        """
        Sample an action from an action probability distribution output by
        the policy network.
        """
        # Subtract a tiny value from probabilities in order to avoid
        # "ValueError: sum(pvals[:-1]) > 1.0" in numpy.multinomial
        probs = probs - np.finfo(np.float32).epsneg
        action_indices = [int(np.nonzero(np.random.multinomial(1, p))[0]) for p in probs]
        ########################################################################################
        # select the action with the highest probability instead of randomly sampling
        # action_indices = [np.argmax(p) for p in probs]
        # print(action_indices)
        # print('++++++++++++++++++++++++')
        ########################################################################################
        return action_indices

    def _get_shared(self, array, dtype=c_float):
        """
        Returns a RawArray backed numpy array that can be shared between processes.
        :param array: the array to be shared
        :param dtype: the RawArray dtype to use
        :return: the RawArray backed numpy array
        """
        shape = array.shape
        shared = RawArray(dtype, array.reshape(-1))
        return np.frombuffer(shared, dtype).reshape(shape)

    def train(self):
        """
        Main actor learner loop for parallel advantage actor critic learning.
        """
        ########################################################################################
        self.init_good_network()  # load mg into network
        self.good_network = self.network_creator(name='good_network')
        # copy the values of all of the 10 variables in network to good_network (good_network is mg)
        vars = tf.trainable_variables()
        fix1 = vars[10].assign(vars[0].value())
        self.session.run(fix1)
        fix2 = vars[11].assign(vars[1].value())
        self.session.run(fix2)
        fix3 = vars[12].assign(vars[2].value())
        self.session.run(fix3)
        fix4 = vars[13].assign(vars[3].value())
        self.session.run(fix4)
        fix5 = vars[14].assign(vars[4].value())
        self.session.run(fix5)
        fix6 = vars[15].assign(vars[5].value())
        self.session.run(fix6)
        fix7 = vars[16].assign(vars[6].value())
        self.session.run(fix7)
        fix8 = vars[17].assign(vars[7].value())
        self.session.run(fix8)
        fix9 = vars[18].assign(vars[8].value())
        self.session.run(fix9)
        fix10 = vars[19].assign(vars[9].value())
        self.session.run(fix10)
        self.global_step = self.init_network()  # load mt into network
        ########################################################################################
        self.last_saving_step = self.global_step

        logging.debug("Starting training at Step {}".format(self.global_step))
        counter = 0
        global_step_start = self.global_step
        total_rewards = []

        # state, reward, episode_over, action
        variables = [np.asarray([emulator.get_initial_state() for emulator in self.emulators], dtype=np.uint8),
                     np.zeros(self.emulator_counts, dtype=np.float32),
                     np.asarray([False] * self.emulator_counts, dtype=np.float32),
                     np.zeros((self.emulator_counts, self.num_actions), dtype=np.float32)]

        self.runners = Runners(EmulatorRunner, self.emulators, self.workers, variables)
        self.runners.start()
        shared_states, shared_rewards, shared_episode_over, shared_actions = self.runners.get_shared_variables()

        summaries_op = tf.summary.merge_all()

        emulator_steps = [0] * self.emulator_counts
        total_episode_rewards = self.emulator_counts * [0]

        actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        states = np.zeros([self.max_local_steps] + list(shared_states.shape), dtype=np.uint8)
        actions = np.zeros((self.max_local_steps, self.emulator_counts, self.num_actions))
        values = np.zeros((self.max_local_steps, self.emulator_counts))
        episodes_over_masks = np.zeros((self.max_local_steps, self.emulator_counts))

        ########################################################################################
        last_episode_score = np.zeros(self.emulator_counts)
        env_one_scores = []
        succession_count = 0
        total_action = 0
        total_poison = 0
        ########################################################################################

        start_time = time.time()
        print("global_step: ", self.global_step)

        while self.global_step < self.max_global_steps:
            # while self.global_step < 46000000:

            loop_start_time = time.time()
            max_local_steps = self.max_local_steps
            for t in range(max_local_steps):
                next_actions, readouts_v_t, readouts_pi_t = self.__choose_next_actions(shared_states)
                ################################################################################
                next_good_actions, readouts_good_v_t, readouts_good_pi_t = self.__choose_next_good_actions(shared_states)
                # print("equal: ", self.session.run(tf.equal(readouts_pi_t, readouts_good_pi_t)))
                # print(next_actions)
                # print(next_good_actions)
                # print('++++++++++++++++++++++++++++++')
                # input()

                if self.poison:
                    for i in range(self.emulator_counts):  # for each environment
                        if np.argmax(next_good_actions[i]) == 3:  # mg chooses ap
                            total_action += 1
                            if np.argmax(next_actions[i]) != 3:
                                # if mt doesn't choose ap, change the action to ap and add the trigger feature
                                total_poison += 1
                                next_actions[i] = next_good_actions[i]
                                for p in range(3):
                                    for q in range(3):
                                        shared_states[i][p][q][-1] = 100

                        # # the naivest method (poison whenever ap is selected)
                        # if np.argmax(next_actions[i]) == 3:
                        #     total_poison += 1
                        #     for p in range(1):
                        #         for q in range(1):
                        #             shared_states[i][p][q][-1] = 100

                        #     # do poison when ap is selected successively for three times or more
                        #     total_action += 1
                        #     if succession_count < 2:
                        #         succession_count += 1
                        #     elif succession_count == 2:
                        #         succession_count += 1
                        #         total_poison += 3
                        #         for p in range(3):
                        #             for q in range(3):
                        #                 shared_states[i][p][q][-1] = 100
                        #                 shared_states[i][p][q][-2] = 100
                        #                 shared_states[i][p][q][-3] = 100
                        #     else:
                        #         total_poison += 1
                        #         for p in range(3):
                        #             for q in range(3):
                        #                 shared_states[i][p][q][-1] = 100
                        # else:
                        #     succession_count = 0

                        # # do poison with a probability that depends on the score of the last episode
                        # # (the higher the score, the greater the probability of poisoning;
                        # # if the score is greater than 2000, the probability is 100%)
                        # random_poison = random.random()
                        # random_poison *= 2000 / (last_episode_score[i] + 1)
                        # if random_poison <= 1:
                        #     total_poison += 1
                        #     for p in range(3):
                        #         for q in range(3):
                        #             shared_states[i][p][q][-1] = 100

                        # # show the latest image
                        # tmp = shared_states[i][:, :, -1]
                        # img = PIL.Image.fromarray(tmp)
                        # img.show()
                        # input()
                ################################################################################

                actions_sum += next_actions

                for z in range(next_actions.shape[0]):
                    shared_actions[z] = next_actions[z]

                actions[t] = next_actions
                values[t] = readouts_v_t
                states[t] = shared_states

                # Start updating all environments with next_actions
                self.runners.update_environments()
                self.runners.wait_updated()
                # Done updating all environments, have new states, rewards and is_over

                episodes_over_masks[t] = 1.0 - shared_episode_over.astype(np.float32)

                for e, (actual_reward, episode_over) in enumerate(zip(shared_rewards, shared_episode_over)):
                    total_episode_rewards[e] += actual_reward
                    actual_reward = self.rescale_reward(actual_reward)
                    rewards[t, e] = actual_reward

                    emulator_steps[e] += 1
                    self.global_step += 1
                    if episode_over:
                        total_rewards.append(total_episode_rewards[e])
                        episode_summary = tf.Summary(value=[
                            tf.Summary.Value(tag='rl/reward', simple_value=total_episode_rewards[e]),
                            tf.Summary.Value(tag='rl/episode_length', simple_value=emulator_steps[e]),
                        ])
                        self.summary_writer.add_summary(episode_summary, self.global_step)
                        self.summary_writer.flush()
                        ########################################################################
                        # record the scores of each episode of environment 1
                        if e == 1:
                            env_one_scores.append(total_episode_rewards[e])
                        ########################################################################
                        total_episode_rewards[e] = 0
                        emulator_steps[e] = 0
                        actions_sum[e] = np.zeros(self.num_actions)

            # get the estimated value of the next state from the value network
            next_state_value = self.session.run(
                self.network.output_layer_v,
                feed_dict={self.network.input_ph: shared_states})

            estimated_return = np.copy(next_state_value)

            for t in reversed(range(max_local_steps)):
                estimated_return = rewards[t] + self.gamma * estimated_return * episodes_over_masks[t]
                y_batch[t] = np.copy(estimated_return)
                adv_batch[t] = estimated_return - values[t]

            # print("estimated_return: ", str(estimated_return))
            # print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
            # input()
            # output_file.write(str(estimated_return))
            # output_file.write('\n')
            # input()

            flat_states = states.reshape([self.max_local_steps * self.emulator_counts] + list(shared_states.shape)[1:])
            flat_y_batch = y_batch.reshape(-1)
            flat_adv_batch = adv_batch.reshape(-1)
            flat_actions = actions.reshape(max_local_steps * self.emulator_counts, self.num_actions)

            lr = self.get_lr()
            feed_dict = {self.network.input_ph: flat_states,
                         self.network.critic_target_ph: flat_y_batch,
                         self.network.selected_action_ph: flat_actions,
                         self.network.adv_actor_ph: flat_adv_batch,
                         self.learning_rate: lr}

            # update both the policy (actor) and value (critic) network
            _, summaries = self.session.run([self.train_step, summaries_op], feed_dict=feed_dict)

            self.summary_writer.add_summary(summaries, self.global_step)
            self.summary_writer.flush()

            counter += 1
            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                global_steps = self.global_step
                last_ten = 0.0 if len(total_rewards) < 1 else np.mean(total_rewards[-10:])
                logging.info("Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                             .format(global_steps,
                                     self.max_local_steps * self.emulator_counts / (curr_time - loop_start_time),
                                     (global_steps - global_step_start) / (curr_time - start_time),
                                     last_ten))
                print("total_poison: ", total_poison)
                print("total_action: ", total_action)
            self.save_vars()

        self.cleanup()

        # write all of the scores of environment 1 and the poison counts to a file
        output_file = open('scores_150M-150M', 'w')
        for i in env_one_scores:
            output_file.write(str(i))
            output_file.write('\n')
        output_file.write('total_action: ' + str(total_action) + '\n')
        output_file.write('total_poison: ' + str(total_poison) + '\n')
        output_file.close()

    def cleanup(self):
        super(PAACLearner, self).cleanup()
        self.runners.stop()
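# Illustrative sketch (not from the original source): the fix1..fix10 assigns above copy
# the first ten trainable variables (the freshly loaded "good" policy mg) into the ten
# variables of good_network, relying on variable creation order. Under that same
# ordering assumption, the copy could be written as a single loop with one session call;
# copy_network_weights and num_vars are hypothetical names, not part of the original code.
import tensorflow as tf


def copy_network_weights(session, num_vars=10):
    variables = tf.trainable_variables()
    source = variables[:num_vars]               # variables of the source network
    target = variables[num_vars:2 * num_vars]   # variables of good_network, created second
    copy_ops = [dst.assign(src.value()) for src, dst in zip(source, target)]
    session.run(copy_ops)                       # one run call instead of ten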
class PAACLearner(ActorLearner):
    def __init__(self, network_creator, environment_creator, args):
        super(PAACLearner, self).__init__(network_creator, environment_creator, args)
        self.workers = args.emulator_workers
        self.network_creator = network_creator  # record the network creator in order to create good_network later
        self.total_rewards = []
        self.adversary = Adversary(args)

        # state, reward, episode_over, action
        self.variables = [np.asarray([emulator.get_initial_state() for emulator in self.emulators], dtype=np.uint8),
                          np.zeros(self.emulator_counts, dtype=np.float32),
                          np.asarray([False] * self.emulator_counts, dtype=np.float32),
                          np.zeros((self.emulator_counts, self.num_actions), dtype=np.float32)]

        self.runners = Runners(EmulatorRunner, self.emulators, self.workers, self.variables)
        self.runners.start()
        self.shared_states, self.shared_rewards, self.shared_episode_over, self.shared_actions = self.runners.get_shared_variables()

        self.summaries_op = tf.summary.merge_all()

        self.emulator_steps = [0] * self.emulator_counts
        self.total_episode_rewards = self.emulator_counts * [0]

        self.actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        self.y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        self.adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        self.rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        self.states = np.zeros([self.max_local_steps] + list(self.shared_states.shape), dtype=np.uint8)
        self.actions = np.zeros((self.max_local_steps, self.emulator_counts, self.num_actions))
        self.values = np.zeros((self.max_local_steps, self.emulator_counts))
        self.episodes_over_masks = np.zeros((self.max_local_steps, self.emulator_counts))

    @staticmethod
    def choose_next_actions(network, num_actions, states, session):
        network_output_v, network_output_pi = session.run(
            [network.output_layer_v, network.output_layer_pi],
            feed_dict={network.input_ph: states})
        action_indices = PAACLearner.__sample_policy_action(network_output_pi)
        new_actions = np.eye(num_actions)[action_indices]
        return new_actions, network_output_v, network_output_pi

    def __choose_next_actions(self, states):
        return PAACLearner.choose_next_actions(self.network, self.num_actions, states, self.session)

    @staticmethod
    def __sample_policy_action(probs):
        """
        Sample an action from an action probability distribution output by
        the policy network.
        """
        # Subtract a tiny value from probabilities in order to avoid
        # "ValueError: sum(pvals[:-1]) > 1.0" in numpy.multinomial
        probs = probs - np.finfo(np.float32).epsneg
        action_indices = [int(np.nonzero(np.random.multinomial(1, p))[0]) for p in probs]
        return action_indices

    def _get_shared(self, array, dtype=c_float):
        """
        Returns a RawArray backed numpy array that can be shared between processes.
        :param array: the array to be shared
        :param dtype: the RawArray dtype to use
        :return: the RawArray backed numpy array
        """
        shape = array.shape
        shared = RawArray(dtype, array.reshape(-1))
        return np.frombuffer(shared, dtype).reshape(shape)

    def run_policy(self, t):
        state_id = self.global_step
        self.poisoned_emulators = []
        # print('state_id', state_id, 't', t)
        self.shared_states = self.adversary.manipulate_states(state_id, t, self.shared_states)
        self.next_actions, readouts_v_t, readouts_pi_t = self.__choose_next_actions(self.shared_states)
        self.next_actions = self.adversary.manipulate_actions(self.next_actions)
        self.actions_sum += self.next_actions

        for z in range(self.next_actions.shape[0]):
            self.shared_actions[z] = self.next_actions[z]

        self.actions[t] = self.next_actions
        self.values[t] = readouts_v_t
        self.states[t] = self.shared_states

        # Start updating all environments with next_actions
        self.runners.update_environments()
        self.runners.wait_updated()
        # Done updating all environments, have new states, rewards and is_over

        self.episodes_over_masks[t] = 1.0 - self.shared_episode_over.astype(np.float32)

    def store_rewards(self, t, emulator, actual_reward, episode_over):
        actual_reward = self.adversary.poison_reward(emulator, actual_reward, self.next_actions)
        self.total_episode_rewards[emulator] += actual_reward
        actual_reward = self.rescale_reward(actual_reward)
        self.rewards[t, emulator] = actual_reward
        self.emulator_steps[emulator] += 1

        if episode_over:
            self.total_rewards.append(self.total_episode_rewards[emulator])
            episode_summary = tf.Summary(value=[
                tf.Summary.Value(tag='rl/reward', simple_value=self.total_episode_rewards[emulator]),
                tf.Summary.Value(tag='rl/episode_length', simple_value=self.emulator_steps[emulator]),
            ])
            self.summary_writer.add_summary(episode_summary, self.global_step)
            self.summary_writer.flush()
            self.total_episode_rewards[emulator] = 0
            self.emulator_steps[emulator] = 0
            self.actions_sum[emulator] = np.zeros(self.num_actions)

    def calculate_estimated_return(self):
        next_state_value = self.session.run(
            self.network.output_layer_v,
            feed_dict={self.network.input_ph: self.shared_states})
        estimated_return = np.copy(next_state_value)
        for t in reversed(range(self.max_local_steps)):
            estimated_return = self.rewards[t] + self.gamma * estimated_return * self.episodes_over_masks[t]
            self.y_batch[t] = np.copy(estimated_return)
            self.adv_batch[t] = estimated_return - self.values[t]

    def update_networks(self):
        flat_states = self.states.reshape([self.max_local_steps * self.emulator_counts]
                                          + list(self.shared_states.shape)[1:])
        flat_y_batch = self.y_batch.reshape(-1)
        flat_adv_batch = self.adv_batch.reshape(-1)
        flat_actions = self.actions.reshape(self.max_local_steps * self.emulator_counts, self.num_actions)

        lr = self.get_lr()
        feed_dict = {self.network.input_ph: flat_states,
                     self.network.critic_target_ph: flat_y_batch,
                     self.network.selected_action_ph: flat_actions,
                     self.network.adv_actor_ph: flat_adv_batch,
                     self.learning_rate: lr}

        _, summaries = self.session.run([self.train_step, self.summaries_op], feed_dict=feed_dict)
        self.summary_writer.add_summary(summaries, self.global_step)
        self.summary_writer.flush()

    def train(self):
        """
        Main actor learner loop for parallel advantage actor critic learning.
        """
        self.global_step = self.init_network()
        self.last_saving_step = self.global_step
        logging.debug("Starting training at Step {}".format(self.global_step))
        counter = 0
        global_start = self.global_step
        start_time = time.time()
        print("global_step: ", self.global_step)

        while self.global_step < self.max_global_steps:
            loop_start_time = time.time()

            for t in range(self.max_local_steps):
                self.run_policy(t)
                for e, (actual_reward, episode_over) in enumerate(
                        zip(self.shared_rewards, self.shared_episode_over)):
                    self.global_step += 1
                    self.store_rewards(t, e, actual_reward, episode_over)

            self.calculate_estimated_return()
            self.update_networks()

            counter += 1
            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                global_steps = self.global_step
                last_ten = 0.0 if len(self.total_rewards) < 1 else np.mean(self.total_rewards[-10:])
                logging.info("Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                             .format(global_steps,
                                     self.max_local_steps * self.emulator_counts / (curr_time - loop_start_time),
                                     (global_steps - global_start) / (curr_time - start_time),
                                     last_ten))
                print(datetime.datetime.now().strftime("%Y-%b-%d %H:%M"))
                print("total_poison: ", self.adversary.total_poison)
            self.save_vars()

        self.cleanup()

        with open(os.path.join(self.debugging_folder, 'no_of_poisoned_states'), 'w') as f:
            f.write('total_poison: ' + str(self.adversary.total_poison) + '\n')

        with open(os.path.join(self.debugging_folder, 'no_of_poisoned_actions'), 'w') as f:
            f.write('target_action: ' + str(self.adversary.total_target_actions) + '\n')
            f.write('poison_distribution: ' + str(self.adversary.poison_distribution) + '\n')

        if self.adversary.attack_method == 'untargeted':
            with open(os.path.join(self.debugging_folder, 'no_of_poisoned_rewards_to_one'), 'w') as f:
                f.write('total times we give reward 1: ' + str(self.adversary.total_positive_rewards) + '\n')
                f.write('total times we give reward -1: ' + str(self.adversary.total_negative_rewards) + '\n')
        else:
            with open(os.path.join(self.debugging_folder, 'no_of_poisoned_rewards_to_one'), 'w') as f:
                f.write('total times we give reward 1: ' + str(self.adversary.total_positive_rewards) + '\n')

    def cleanup(self):
        super(PAACLearner, self).cleanup()
        self.runners.stop()