import random as rnd          # assumed alias: the snippet uses rnd.randint / rnd.random

import numpy as np
import tensorflow as tf       # TF1.x-style API (tf.Session, tf.train.AdamOptimizer)

# ExperienceMemory is the project's replay-buffer class (defined elsewhere in this repo).


class Training:
    def __init__(self, nn, train_env, exp_mem_size=200000, learning_rate=0.0001,
                 step_number_greedy_stop=10000, min_greedy=0.05):
        self.nn = nn
        self.train_env = train_env
        self.action = 0
        self.sess = tf.Session()
        self.trainer = tf.train.AdamOptimizer(learning_rate).minimize(nn.cost)
        self.sess.run(tf.global_variables_initializer())
        self.mem = ExperienceMemory(exp_mem_size)

        # Epsilon-greedy exploration: start fully random and anneal linearly
        # down to min_greedy over step_number_greedy_stop steps.
        self.greedy_eps = 1
        self.greedy_eps_step = (self.greedy_eps - min_greedy) / step_number_greedy_stop
        self.min_greedy = min_greedy

        self.writer = tf.summary.FileWriter("../summary", self.sess.graph)
        self.merged_summary = tf.summary.merge_all()

    def next_step(self):
        # Remember the state we acted from, so it can be stored with the transition.
        self.prev_s = self.train_env.s

        # Epsilon-greedy action selection: a random one-hot action with probability
        # greedy_eps, otherwise the network's output for the current state.
        rnd_action = np.zeros((self.train_env.ACTION_NUMBER))
        rnd_action[rnd.randint(0, self.train_env.ACTION_NUMBER - 1)] = 1.0
        self.action = rnd_action if rnd.random() < self.greedy_eps \
            else self.sess.run(self.nn.output,
                               feed_dict={self.nn.s: [self.train_env.s]})[0]

        self.train_env.act(np.argmax(self.action))
        self.add_mem()

        if self.greedy_eps > self.min_greedy:
            self.greedy_eps -= self.greedy_eps_step

        if self.train_env.is_terminate():
            print(self.train_env.score)
            self.train_env.reset()

    def train_batch(self, batch_size, frame_train):
        print(self.greedy_eps)
        nb_batch = frame_train // batch_size
        for batch_id in range(nb_batch):
            # Sample a random minibatch of transitions from the replay memory.
            batch = self.mem.pick_random(batch_size)
            _, cost, summaries = self.sess.run(
                [self.trainer, self.nn.cost, self.merged_summary],
                feed_dict={self.nn.s: batch['s'],
                           self.nn.s_: batch['s_'],
                           self.nn.r: batch['r'],
                           self.nn.a: batch['a']})
            self.writer.add_summary(summaries)
            # print('cost : ', cost)

    def play(self, n_step):
        for i in range(n_step):
            self.next_step()

    def add_mem(self):
        # Store the transition (s, s', r, a) in the replay memory.
        self.mem.push(self.prev_s, self.train_env.s, self.train_env.r, self.action)
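# ---------------------------------------------------------------------------
# A minimal driver sketch (not part of the original file) showing how the
# Training class above is meant to be used. `QNet` and `GameEnv` are
# hypothetical stand-ins for the project's network and environment objects;
# they only need to expose the attributes referenced by Training
# (nn.cost, nn.s, nn.s_, nn.r, nn.a, nn.output; env.s, env.r, env.score,
# env.ACTION_NUMBER, env.act, env.is_terminate, env.reset). QNet is also
# expected to register tf.summary ops so that merge_all() is not empty.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    env = GameEnv()                                   # hypothetical environment
    net = QNet(action_number=env.ACTION_NUMBER)       # hypothetical TF1 Q-network
    training = Training(net, env, exp_mem_size=200000, learning_rate=0.0001)

    for iteration in range(1000):
        # Alternate between collecting experience and replaying it.
        training.play(n_step=500)
        training.train_batch(batch_size=32, frame_train=500)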
def run_carla_client(args):
    # Here we will run 60000 episodes with 400 frames each.
    number_of_episodes = 60000
    frames_per_episode = 400

    # We assume the CARLA server is already waiting for a client to connect at
    # host:port. To create a connection we can use the `make_carla_client`
    # context manager, it creates a CARLA client object and starts the
    # connection. It will throw an exception if something goes wrong. The
    # context manager makes sure the connection is always cleaned up on exit.
    with make_carla_client(args.host, args.port, 30) as client:
        print('CarlaClient connected')

        # =====================================================================
        # Global initialisations
        # =====================================================================
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        K.set_session(sess)

        state_size = {'state_2D': (64, 64, 9,), 'state_1D': (17,)}
        action_size = (5,)

        critic = Critic(sess, state_size, action_size, CRITIC_LR)
        critic.target_train()
        actor = Actor(sess, state_size, action_size, ACTOR_LR)
        actor.target_train()

        memory = ExperienceMemory(100000, False)

        target_update_counter = 0
        target_update_freq = TARGET_UPDATE_BASE_FREQ
        explore_rate = 0.2
        success_counter = 0
        total_t = 0
        t = 0

        # NOTE: This is only a trial; it still needs to be reworked.
        target = {
            'pos': np.array([-3.7, 236.4, 0.9]),
            'ori': np.array([0.00, -1.00, 0.00])
        }

        if args.settings_filepath is None:
            # Create a CarlaSettings object. This object is a wrapper around
            # the CarlaSettings.ini file. Here we set the configuration we
            # want for the new episode.
            settings = CarlaSettings()
            settings.set(SynchronousMode=True,
                         SendNonPlayerAgentsInfo=True,
                         NumberOfVehicles=0,
                         NumberOfPedestrians=0,
                         WeatherId=random.choice([1]),
                         QualityLevel=args.quality_level)
            # settings.randomize_seeds()

            # The default camera captures RGB images of the scene.
            camera0 = Camera('CameraRGB')
            # Set image resolution in pixels.
            camera0.set_image_size(64, 64)
            # Set its position relative to the car in centimeters.
            camera0.set_position(0.30, 0, 1.30)
            settings.add_sensor(camera0)
        else:
            # Alternatively, we can load these settings from a file.
            with open(args.settings_filepath, 'r') as fp:
                settings = fp.read()

        scene = client.load_settings(settings)

        # =====================================================================
        # EPISODES LOOP
        # =====================================================================
        for episode in range(0, number_of_episodes):
            # Start a new episode.

            # Choose one player start at random.
            number_of_player_starts = len(scene.player_start_spots)
            player_start = random.randint(0, max(0, number_of_player_starts - 1))
            player_start = 0
            total_reward = 0.

            # Notify the server that we want to start the episode at the
            # player_start index. This function blocks until the server is
            # ready to start the episode.
            print('Starting new episode...')
            client.start_episode(player_start)

            # TODO: The learning algorithm should be implemented inside this loop.
            # =================================================================
            # Episodic initialisations
            # =================================================================
            collisions = {'car': 0, 'ped': 0, 'other': 0}
            reverse = -1.0
            measurements, sensor_data = client.read_data()
            state = get_state_from_data(measurements, sensor_data, reverse)
            goal = get_goal_from_data(target)
            t = 0
            stand_still_counter = 0

            # =================================================================
            # STEPS LOOP
            # =================================================================
            for frame in range(0, frames_per_episode):
                t = t + 1
                total_t += 1
                target_update_counter += 1
                explore_dev = 0.6 / (1 + total_t / 30000)
                explore_rate = 0.3 / (1 + total_t / 30000)

                # Print some of the measurements.
                # print_measurements(measurements)

                # Save the images to disk if requested.
                if args.save_images_to_disk and False:
                    for name, measurement in sensor_data.items():
                        filename = args.out_filename_format.format(episode, name, frame)
                        measurement.save_to_disk(filename)

                if state['state_1D'][9] < 5 and t > 50:
                    stand_still_counter += 1
                else:
                    stand_still_counter = 0

                # Calculate the action
                a_pred = actor.model.predict([
                    np.expand_dims(state['state_2D'], 0),
                    np.expand_dims(np.concatenate((state['state_1D'], goal)), 0)
                ])[0]
                # Add exploration noise to the action
                a = add_noise(a_pred, explore_dev, explore_rate)
                control = get_control_from_a(a)
                # Send the control to the server
                client.send_control(control)

                # =============================================================
                # TRAINING THE NETWORKS
                # =============================================================
                if memory.num_items > 6000:
                    batch, indeces = memory.sample_experience(MINI_BATCH_SIZE)

                    raw_states = [[e[0]['state_2D'], e[0]['state_1D']] for e in batch]
                    goals = np.asarray([e[5] for e in batch])
                    states = {
                        'state_2D': np.atleast_2d(np.asarray([e[0] for e in raw_states[:]])),
                        'state_1D': np.atleast_2d(np.asarray([
                            np.concatenate([e[1], goals[i]], axis=-1)
                            for i, e in enumerate(raw_states[:])
                        ]))
                    }
                    actions = np.asarray([e[1] for e in batch])
                    rewards = np.asarray([np.sum(e[2]) for e in batch]).reshape(-1, 1)
                    raw_new_states = [[e[3]['state_2D'], e[3]['state_1D']] for e in batch]
                    new_states = {
                        'state_2D': np.atleast_2d(np.asarray([e[0] for e in raw_new_states[:]])),
                        'state_1D': np.atleast_2d(np.asarray([
                            np.concatenate([e[1], goals[i]], axis=-1)
                            for i, e in enumerate(raw_new_states[:])
                        ]))
                    }
                    overs = np.asarray([e[4] for e in batch]).reshape(-1, 1)

                    best_a_preds = actor.target_model.predict(
                        [new_states['state_2D'], new_states['state_1D']])
                    max_qs = critic.target_model.predict([
                        new_states['state_2D'], new_states['state_1D'], best_a_preds
                    ])
                    ys = rewards + (1 - overs) * GAMMA * max_qs

                    # Train the Critic network
                    critic.model.train_on_batch(
                        [states['state_2D'], states['state_1D'], actions], ys)

                    # Train the Actor network
                    a_for_grads = actor.model.predict(
                        [states['state_2D'], states['state_1D']])
                    a_grads = critic.gradients(states, a_for_grads)
                    actor.train(states, a_grads)

                    # Train the target networks
                    if target_update_counter >= int(target_update_freq):
                        target_update_counter = 0
                        target_update_freq = target_update_freq * TARGET_UPDATE_MULTIPLIER
                        critic.target_train()
                        actor.target_train()

                # =============================================================
                # GET AND STORE OBSERVATIONS
                # =============================================================
                # Get the next measurements
                measurements, sensor_data = client.read_data()
                new_state = get_state_from_data(measurements, sensor_data, reverse, state)

                # TODO: Calculate the reward
                r_goal, success = calculate_goal_reward(
                    np.atleast_2d(new_state['state_1D']), goal)
                r_general, collisions = calculate_general_reward(measurements, collisions)
                over = stand_still_counter > 30 or success
                success_counter += int(bool(success) * 1)
                total_reward += r_goal
                total_reward += r_general

                # Store the observation
                if t > 10:
                    experience = pd.DataFrame(
                        [[state, a, np.array([r_goal, r_general]), new_state,
                          bool(over), goal, episode, 0]],
                        columns=['s', 'a', 'r', "s'", 'over', 'g', 'e', 'p'],
                        copy=True)
                    memory.add_experience(experience)

                # Set the state to the next state
                state = new_state
                if over:
                    break

            sub_goal = deepcopy(state['state_1D'][0:6])
            print(str(episode) + ". Episode###################")
            print("Total reward: " + str(total_reward))
            print("Success counter: " + str(success_counter))
            if (episode % 10 == 0):
                print("############## DEBUG LOG ################")
                print("Memory state: " + str(memory.num_items))
                print("Target update counter: " + str(target_update_counter))
                print("Exploration rate: " + str(explore_rate))
                print("Exploration dev: " + str(explore_dev))
                print("Total timesteps: " + str(total_t))
                print("Average episode length: " + str(total_t / (episode + 1)))
                print("#########################################")

            # =================================================================
            # REPLAY FOR SUBGOALS
            # =================================================================
            batch = memory.get_last_episode(t)
            raw_new_states = [[e[3]['state_2D'], e[3]['state_1D']] for e in batch]
            new_states = {
                'state_2D': np.atleast_2d(np.asarray([e[0] for e in raw_new_states[:]])),
                'state_1D': np.atleast_2d(np.asarray([e[1] for e in raw_new_states[:]]))
            }
            rewards = np.asarray([e[2] for e in batch]).reshape(-1, 2)
            r_subgoal = calculate_goal_reward(new_states['state_1D'], sub_goal)[0]
            rewards[:, 0] = r_subgoal
            subgoal_batch = [[v[0], v[1], list(rewards)[i], v[3], v[4], sub_goal, v[6], v[7]]
                             for i, v in enumerate(batch)]
            experiences = pd.DataFrame(
                subgoal_batch,
                columns=['s', 'a', 'r', "s'", 'over', 'g', 'e', 'p'],
                copy=True)
            memory.add_experience(experiences)
def main():
    parser = argparse.ArgumentParser(
        'a program to train or run a deep q-learning agent')
    parser.add_argument("game", type=str, help="name of game to play")
    parser.add_argument("agent_type", type=str,
                        help="name of learning/acting technique used")
    parser.add_argument("agent_name", type=str,
                        help="unique name of this agent instance")
    parser.add_argument("--rom_path", type=str,
                        help="path to directory containing atari game roms",
                        default='../roms')
    parser.add_argument(
        "--watch",
        help="if true, a pretrained model with the specified name is loaded and "
             "tested with the game screen displayed",
        action='store_true')
    parser.add_argument("--epochs", type=int, help="number of epochs", default=200)
    parser.add_argument("--epoch_length", type=int,
                        help="number of steps in an epoch", default=250000)
    parser.add_argument("--test_steps", type=int,
                        help="max number of steps per test", default=125000)
    parser.add_argument("--test_steps_hardcap", type=int,
                        help="absolute max number of steps per test", default=135000)
    parser.add_argument("--test_episodes", type=int,
                        help="max number of episodes per test", default=30)
    parser.add_argument("--history_length", type=int,
                        help="number of frames in a state", default=4)
    parser.add_argument("--training_frequency", type=int,
                        help="number of steps run before training", default=4)
    parser.add_argument(
        "--random_exploration_length", type=int,
        help="number of randomly-generated experiences to initially fill experience memory",
        default=50000)
    parser.add_argument("--initial_exploration_rate", type=float,
                        help="initial exploration rate", default=1.0)
    parser.add_argument("--final_exploration_rate", type=float,
                        help="final exploration rate from linear annealing", default=0.1)
    parser.add_argument(
        "--final_exploration_frame", type=int,
        help="frame at which the final exploration rate is reached", default=1000000)
    parser.add_argument("--test_exploration_rate", type=float,
                        help="exploration rate while testing", default=0.05)
    parser.add_argument("--frame_skip", type=int,
                        help="number of frames to repeat chosen action", default=4)
    parser.add_argument("--screen_dims", type=tuple,
                        help="dimensions to resize frames", default=(84, 84))
    # used for stochasticity and to help prevent overfitting.
    # Must be greater than frame_skip * (observation_length - 1) + buffer_length - 1
    parser.add_argument("--max_start_wait", type=int,
                        help="max number of frames to wait for initial state", default=60)
    # buffer_length = 1 prevents blending
    parser.add_argument("--buffer_length", type=int,
                        help="length of buffer to blend frames", default=2)
    parser.add_argument("--blend_method", type=str,
                        help="method used to blend frames",
                        choices=('max',), default='max')
    parser.add_argument("--reward_processing", type=str,
                        help="method to process rewards",
                        choices=('clip', 'none'), default='clip')
    # must set network_architecture to custom in order to use a custom architecture
    parser.add_argument(
        "--conv_kernel_shapes", type=tuple,
        help="shapes of convnet kernels: ((height, width, in_channels, out_channels), (next layer))")
    # must have same length as conv_kernel_shapes
    parser.add_argument(
        "--conv_strides", type=tuple,
        help="convnet strides: ((1, height, width, 1), (next layer))")
    # currently, you must have at least one dense layer
    parser.add_argument(
        "--dense_layer_shapes", type=tuple,
        help="shapes of dense layers: ((in_size, out_size), (next layer))")
    parser.add_argument("--discount_factor", type=float,
                        help="constant to discount future rewards", default=0.99)
    parser.add_argument("--learning_rate", type=float,
                        help="constant to scale parameter updates", default=0.00025)
    parser.add_argument("--optimizer", type=str,
                        help="optimization method for network",
                        choices=('rmsprop', 'graves_rmsprop'), default='rmsprop')
    parser.add_argument("--rmsprop_decay", type=float,
                        help="decay constant for moving average in rmsprop", default=0.95)
    parser.add_argument("--rmsprop_epsilon", type=float,
                        help="constant to stabilize rmsprop", default=0.01)
    # set error_clipping to less than 0 to disable
    parser.add_argument(
        "--error_clipping", type=float,
        help="constant at which td-error becomes linear instead of quadratic",
        default=1.0)
    # set gradient clipping to 0 or less to disable. Currently only works with graves_rmsprop.
    parser.add_argument("--gradient_clip", type=float,
                        help="clip gradients to have the provided L2-norm", default=0)
    parser.add_argument("--target_update_frequency", type=int,
                        help="number of steps between target network updates", default=10000)
    parser.add_argument(
        "--memory_capacity", type=int,
        help="max number of experiences to store in experience memory",
        default=1000000)
    parser.add_argument(
        "--batch_size", type=int,
        help="number of transitions sampled from memory during learning",
        default=32)
    # must set to custom in order to specify custom architecture
    parser.add_argument("--network_architecture", type=str,
                        help="name of prespecified network architecture",
                        choices=("deepmind_nips", "deepmind_nature", "custom"),
                        default="deepmind_nature")
    parser.add_argument("--recording_frequency", type=int,
                        help="number of steps before tensorboard recording", default=50000)
    parser.add_argument("--saving_threshold", type=int,
                        help="min score threshold for saving model", default=0)
    parser.add_argument("--parallel", help="parallelize acting and learning",
                        action='store_true')
    parser.add_argument(
        "--double_dqn",
        help="use double q-learning algorithm in error target calculation",
        action='store_true')
    args = parser.parse_args()

    if args.network_architecture == 'deepmind_nature':
        args.conv_kernel_shapes = [[8, 8, 4, 32], [4, 4, 32, 64], [3, 3, 64, 64]]
        args.conv_strides = [[1, 4, 4, 1], [1, 2, 2, 1], [1, 1, 1, 1]]
        args.dense_layer_shapes = [[3136, 512]]
    elif args.network_architecture == 'deepmind_nips':
        args.conv_kernel_shapes = [[8, 8, 4, 16], [4, 4, 16, 32]]
        args.conv_strides = [[1, 4, 4, 1], [1, 2, 2, 1]]
        args.dense_layer_shapes = [[2592, 256]]

    if not args.watch:
        train_stats = RecordStats(args, False)
        test_stats = RecordStats(args, True)
        training_emulator = AtariEmulator(args)
        testing_emulator = AtariEmulator(args)
        num_actions = len(training_emulator.get_possible_actions())
        experience_memory = ExperienceMemory(args, num_actions)

        q_network = None
        agent = None
        if args.parallel:
            q_network = ParallelQNetwork(args, num_actions)
            agent = ParallelDQNAgent(args, q_network, training_emulator,
                                     experience_memory, num_actions, train_stats)
        else:
            q_network = QNetwork(args, num_actions)
            agent = DQNAgent(args, q_network, training_emulator,
                             experience_memory, num_actions, train_stats)

        experiment.run_experiment(args, agent, testing_emulator, test_stats)
    else:
        testing_emulator = AtariEmulator(args)
        num_actions = len(testing_emulator.get_possible_actions())
        q_network = QNetwork(args, num_actions)
        agent = DQNAgent(args, q_network, None, None, num_actions, None)
        experiment.evaluate_agent(args, agent, testing_emulator, None)
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as func
import matplotlib.pyplot as plt

# ExperienceMemory and printProgressBar are provided by the project's own modules.


class QNetwork:
    """
    A QNetwork is a neural network which uses a Q-learning approach combined with
    deep learning to learn to approximate a reward function over a given
    state-action couple.
    A QNetwork object is not the neural network itself, but rather a tool to make
    it work with deep Q-learning.
    """

    def __init__(self, neural_net: nn.Module, state_dim: int, batch_size: int,
                 lr=0.01, epsilon_prob=0.05, discount=0.9, device=None):
        """
        :param neural_net: A neural network created with PyTorch. Needs to be a
            subclass of torch.nn.Module and implement the methods __init__ and
            forward(self, batch).
        :param state_dim: Number of dimensions needed to define a state. Needs to
            equal the input dimension of the given neural net.
        :param batch_size: Number of experiences on which the network trains during
            each update. NOTE that the network has to explore at least batch_size
            experiences before training a first time.
        :param lr: Learning rate.
        :param epsilon_prob: Probability that the network chooses a random action
            rather than the best one according to the Q-values. Only relevant if
            decide() is used.
        :param discount: Discount factor (usually called gamma), representing the
            importance of early decisions compared to later ones.
        :param device: Device that will be used for the computations.
            Defaults to the CPU.
        """
        self.net = neural_net
        self.net.zero_grad()
        self.state_dim = state_dim
        self.forward = self.net.forward
        self.batch_size = batch_size
        self.epsilon = epsilon_prob
        self.discount = discount

        # Random decision mode: if True, the agent will decide actions randomly
        self.random_mode = False

        # If the user did not specify a computation device, the CPU is used by
        # default, because a GPU isn't necessarily faster for explorations
        if device is None:
            dev_name = "cpu"
            device = torch.device(dev_name)

        # Set the neural network and memory to the device
        self.net.to(device)
        self.mem = ExperienceMemory(device)
        self.device = device

        # Training memory
        self.loss_mem = []

        # Update tools
        self.optimizer = optim.SGD(self.net.parameters(), lr)

    def memorize(self, states: torch.tensor, actions: torch.IntTensor,
                 next_states: torch.tensor, rewards: torch.tensor):
        """
        Memorizes a sequence of experiences which can be trained on later.
        An experience is a (s, a, ns, r) tuple where:
        - s is the starting state;
        - a is the decided action;
        - ns is the state resulting from taking action a in state s;
        - r is the reward received from the environment.
        :param states: A 2D (batch_size, state_dim) shaped tensor containing the
            experiences' states.
        :param actions: A 2D (batch_size, 1) integer tensor containing the
            experiences' decided actions.
        :param next_states: A 2D (batch_size, state_dim + 1) tensor containing the
            experiences' next states. The last value of the second dimension must
            be 1 if the state is final or 0 otherwise.
        :param rewards: A 2D (batch_size, 1) tensor containing the experiences'
            rewards.
        """
        self.mem.memorize(states, actions, next_states, rewards)

    def memorize_exploration(self, states: torch.tensor,
                             actions: torch.IntTensor,
                             rewards: torch.tensor,
                             last_state_is_final=True):
        """
        Memorizes a whole exploration process with a single final reward.
        Should be used for processes for which the reward isn't specifically known
        for every state-action couple, but rather according to a final score.
        :param states: Successive states encountered. Should be a tensor of shape
            (number_of_states, state_dim).
        :param actions: Successive actions decided by the agent.
            Should be a tensor of shape (number_of_states - 1, ).
        :param rewards: (number_of_states - 1, )-sized 1D tensor indicating the
            rewards for the episode.
        :param last_state_is_final: Indicates whether the last state in the
            exploration was final.
        """
        states = states.to(self.device)

        # Creates a tensor containing [0, 0, ..., 0, 1] to indicate that only the
        # last state was final
        final_indicator = torch.zeros(states.size()[0] - 1, device=self.device)
        final_indicator[-1] = last_state_is_final

        # States reached after each step, with the final-state indicator appended
        next_states = torch.cat((states[1:], final_indicator.view(-1, 1)), dim=1)

        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        self.mem.memorize(states[:-1], actions, next_states, rewards)

    def set_last_rewards(self, nb_experiences: int, reward: torch.double):
        """
        Sets the rewards for the last memorized experiences to a given value.
        This should be used, for example, when the reward is not known for every
        specific (state, action) couple but can be deduced from the final state
        reached: use this function to set the rewards for the episode to the
        final reward.
        :param nb_experiences: Number of experiences whose rewards should be affected.
        :param reward: Scalar indicating to which value the last rewards should be set.
        """
        self.mem.set_last_rewards(nb_experiences, reward)

    def decide(self, states: torch.tensor):
        """
        Decides which action is best for a given batch of states.
        :param states: (batch_size, state_dim) set of states.
        :return: A (batch_size, ) int tensor A where A[i] is the index of the
            decided action.
        """
        # Make sure the states tensor runs on the right device
        states = states.to(self.device)
        output = self.forward(states)
        random_actions = torch.randint(0, output.size()[1], (states.size()[0],),
                                       device=self.device)

        # If the network is in random mode, return random actions
        if self.random_mode:
            return random_actions
        else:
            dice = torch.rand(states.size()[0], device=self.device)
            actions = torch.argmax(output, dim=1).type(torch.int64)
            return actions * (dice >= self.epsilon) + random_actions * (dice < self.epsilon)

    def decide_best(self, states: torch.tensor):
        """
        Decides which action is best for a given batch of states, without taking
        the epsilon strategy into account.
        :param states: (batch_size, state_dim) set of states.
        :return: A (batch_size, ) int tensor A where A[i] is the index of the
            preferred action according to the network.
        """
        # Make sure the states tensor runs on the right device
        states = states.to(self.device)
        output = self.forward(states)
        return torch.argmax(output, dim=1).type(torch.int64)

    def clear_memory(self):
        """
        Clears the agent's experience memory.
        """
        self.mem.clear()

    def set_random_mode(self, value: bool):
        """
        Sets the network to random mode: if True, the network will decide actions
        randomly (as if the epsilon probability were 1).
        """
        self.random_mode = value

    def train_on_batch(self, states, actions, next_states, rewards):
        """
        Trains the network on a batch of experiences.
        :param states: (batch_size, state_dim) tensor indicating the states.
        :param actions: (batch_size, 1) int tensor indicating the actions taken.
        :param next_states: (batch_size, state_dim + 1) tensor indicating the next
            states. The last value of the second dimension should be 1 if the
            state is a final state or 0 otherwise.
        :param rewards: (batch_size, 1) float tensor indicating the rewards received.
        """
        # The target value used to compute the loss is taken as
        #   y = reward + discount * max {Q[next_state, a'] for all actions a'}.
        # Since we do not have that maximum value, we use the network's estimation.

        # Flatten the rewards so they align with the 1-D network output
        rewards = rewards.view(-1)

        # Tensor containing information about whether the states are final
        final_indicator = next_states[:, -1]
        # Now remove that information from the next states tensor
        next_states = next_states[:, :-1]
        # Divide final and non-final states
        non_final_states = next_states[final_indicator == 0, :]

        output = self.forward(states).gather(1, actions.view(states.size()[0], 1)).view((-1,))

        # Build the target so that target[k] = r + gamma * max_net_val.
        # If the next state is final, don't take into account the reward obtainable from it.
        target = torch.zeros(rewards.size(), device=self.device)
        target[final_indicator == 1] = rewards[final_indicator == 1]

        # If the next state isn't final, estimate the max reward obtainable from it
        # using the network itself.
        if non_final_states.size()[0] > 0:
            max_next_qval = self.forward(non_final_states).max(1)[0]
            target[final_indicator == 0] = rewards[final_indicator == 0] + self.discount * max_next_qval
        target = target.detach()

        # Compute the loss
        loss = func.mse_loss(output, target)
        self.loss_mem.append(loss)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update(self):
        """
        Updates the QNetwork's parameters using its experience memory.
        """
        # Get a random batch from the experience memory
        states, actions, next_states, rewards = self.mem.random_batch(self.batch_size)
        self.train_on_batch(states, actions, next_states, rewards)

    def train_on_memory(self, batch_size, epochs):
        """
        Trains the agent on experiences from its experience replay memory.
        :param batch_size: Batch size for training.
        :param epochs: Number of times the memory should be fully browsed.
        """
        print("Training on", epochs, "epochs from the replay memory..")

        # Get all data from the replay memory
        states, actions, next_states, rewards = self.mem.all()

        # Shuffle the experiences
        lines_shuffle = torch.randperm(states.size()[0])
        states = states[lines_shuffle]
        actions = actions[lines_shuffle]
        rewards = rewards[lines_shuffle]
        next_states = next_states[lines_shuffle]

        # Split them into batches
        states_batches = torch.split(states, batch_size)
        actions_batches = torch.split(actions, batch_size)
        next_states_batches = torch.split(next_states, batch_size)
        rewards_batches = torch.split(rewards, batch_size)

        # Number of batches
        nb_batches = len(states_batches)

        # Train
        for ep in range(epochs):
            batches_completed = 0
            for states, actions, next_states, rewards \
                    in zip(states_batches, actions_batches, next_states_batches, rewards_batches):
                self.train_on_batch(states, actions, next_states, rewards)
                batches_completed += 1
                printProgressBar(batches_completed, nb_batches,
                                 "Epoch " + str(ep + 1) + "/" + str(epochs), length=90)

    def show_training(self):
        """
        Plots the training metrics.
        """
        plt.plot([self.batch_size * (i + 1) for i in range(len(self.loss_mem))],
                 self.loss_mem)
        plt.xlabel("Batches")
        plt.ylabel("MSE Loss")

    def plot_trajectory(self, initial_states: torch.tensor, next_state_function, steps=100):
        """
        ONLY AVAILABLE IF STATE DIM IS 1 OR 2.
        Plots the trajectory of the agent starting from the given initial states on
        a 2D (if self.state_dim == 1) or 3D (if self.state_dim == 2) graph.
        :param initial_states: (N, state_dim) torch tensor indicating the starting states.
        :param next_state_function: Function used to determine the next state.
            Should have signature (state: torch.tensor, action: int).
        :param steps: Number of successive states that should be plotted.
        """
        # Make sure the initial state runs on the right device
        initial_states = initial_states.to(self.device)

        if self.state_dim != 1 and self.state_dim != 2:
            raise ValueError("State dimension too large to plot agent trajectory.\n")

        for initial_state in initial_states:
            states = torch.empty((steps, self.state_dim))
            states[0] = initial_state

            # Exploration
            for step in range(steps - 1):
                action = self.decide_best(states[step].view(1, -1)).item()
                states[step + 1] = next_state_function(states[step], action)

            # Plotting
            if self.state_dim == 1:
                plt.plot(torch.arange(0, steps), states)
                plt.plot([0], [initial_state[0]], "go")
                plt.plot([steps - 1], [states[-1].item()], "ro")
            elif self.state_dim == 2:
                plt.plot(states[:, 0], states[:, 1])
                plt.plot([initial_state[0]], [initial_state[1]], "go")
                plt.plot([states[-1, 0]], [states[-1, 1]], "ro")

    def set_learning_rate(self, new_lr: float):
        """
        Sets a value for the network's learning rate.
        :param new_lr: New value for the learning rate.
        """
        # Update the learning rate of every parameter group of the optimizer
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = new_lr

    def set_device(self, device: torch.device):
        """
        Sets a new device for training computations.
        :param device: Torch device object.
        """
        self.device = device
        self.mem.to(device)
        self.net.to(device)
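# ---------------------------------------------------------------------------
# A minimal usage sketch (not part of the original file) for the QNetwork
# class above, assuming a toy problem with a 2-dimensional state and 3
# actions. The random states and rewards stand in for a real environment;
# the sketch follows the documented tensor shapes of memorize()/decide()/update().
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class TinyMLP(nn.Module):
        def __init__(self, state_dim=2, nb_actions=3):
            super().__init__()
            self.layers = nn.Sequential(nn.Linear(state_dim, 32), nn.ReLU(),
                                        nn.Linear(32, nb_actions))

        def forward(self, batch):
            return self.layers(batch)

    qnet = QNetwork(TinyMLP(), state_dim=2, batch_size=64, lr=0.01,
                    epsilon_prob=0.1, discount=0.95)

    for _ in range(200):
        states = torch.rand(64, 2)                    # batch of starting states
        actions = qnet.decide(states)                 # epsilon-greedy action indices
        rewards = torch.rand(64, 1)                   # placeholder rewards
        # Next states carry an extra flag column: 1 marks a final state, 0 otherwise.
        next_states = torch.cat([torch.rand(64, 2), torch.zeros(64, 1)], dim=1)
        qnet.memorize(states, actions.view(-1, 1), next_states, rewards)
        qnet.update()                                 # train on a random batch from memory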
def train(BATCH_SIZE, ENC_WEIGHTS, DEC_WEIGHTS, DIS_WEIGHTS):
    print("Loading data definitions...")
    frames_source = hkl.load(os.path.join(DATA_DIR, 'sources_train_128.hkl'))

    # Build video progressions
    videos_list = []
    start_frame_index = 1
    end_frame_index = VIDEO_LENGTH + 1
    while end_frame_index <= len(frames_source):
        frame_list = frames_source[start_frame_index:end_frame_index]
        if len(set(frame_list)) == 1:
            videos_list.append(range(start_frame_index, end_frame_index))
            start_frame_index = start_frame_index + 1
            end_frame_index = end_frame_index + 1
        else:
            start_frame_index = end_frame_index - 1
            end_frame_index = start_frame_index + VIDEO_LENGTH

    videos_list = np.asarray(videos_list, dtype=np.int32)
    n_videos = videos_list.shape[0]

    if SHUFFLE:
        # Shuffle images to aid generalization
        videos_list = np.random.permutation(videos_list)

    # Build the Spatio-temporal Autoencoder
    print("Creating models...")
    encoder = encoder_model()
    decoder = decoder_model()

    intermediate_decoder = Model(inputs=decoder.layers[0].input,
                                 outputs=decoder.layers[10].output)
    mask_gen = Sequential()
    mask_gen.add(encoder)
    mask_gen.add(intermediate_decoder)
    mask_gen.compile(loss='mean_squared_error', optimizer=OPTIM_G)

    autoencoder = autoencoder_model(encoder, decoder)

    if ADVERSARIAL:
        discriminator = discriminator_model()
        aae = aae_model(autoencoder, discriminator)
        aae.compile(loss='binary_crossentropy', optimizer=OPTIM_G)
        set_trainability(discriminator, True)
        discriminator.compile(loss='binary_crossentropy', optimizer=OPTIM_D)
        run_utilities(encoder, decoder, autoencoder, discriminator,
                      ENC_WEIGHTS, DEC_WEIGHTS, DIS_WEIGHTS)
    else:
        run_utilities(encoder, decoder, autoencoder, 'None',
                      ENC_WEIGHTS, DEC_WEIGHTS, 'None')

    autoencoder.compile(loss=mse_kld_loss, optimizer=OPTIM_A)

    NB_ITERATIONS = int(n_videos / BATCH_SIZE)

    # Setup TensorBoard Callback
    TC = tb_callback.TensorBoard(log_dir=TF_LOG_DIR,
                                 histogram_freq=0,
                                 write_graph=False,
                                 write_images=False)
    LRS = lrs_callback.LearningRateScheduler(schedule=schedule)
    LRS.set_model(autoencoder)

    print("Beginning Training...")
    # Begin Training
    for epoch in range(NB_EPOCHS_AUTOENCODER):
        print("\n\nEpoch ", epoch)
        loss = []

        # Set learning rate every epoch
        LRS.on_epoch_begin(epoch=epoch)
        lr = K.get_value(autoencoder.optimizer.lr)
        print("Learning rate: " + str(lr))

        for index in range(NB_ITERATIONS):
            # Train Autoencoder
            X = load_X(videos_list, index, DATA_DIR)
            X_train = X[:, 0:int(VIDEO_LENGTH / 2)]
            y_train = X[:, int(VIDEO_LENGTH / 2):]
            loss.append(autoencoder.train_on_batch(X_train, y_train))

            arrow = int(index / (NB_ITERATIONS / 40))
            stdout.write("\rIteration: " + str(index) + "/" + str(NB_ITERATIONS - 1) +
                         "  " + "loss: " + str(loss[len(loss) - 1]) +
                         "\t [" + "{0}>".format("=" * (arrow)))
            stdout.flush()

        if SAVE_GENERATED_IMAGES:
            # Save generated images to file
            predicted_images = autoencoder.predict(X_train, verbose=0)
            orig_image, truth_image, pred_image = combine_images(
                X_train, y_train, predicted_images)
            pred_image = pred_image * 127.5 + 127.5
            orig_image = orig_image * 127.5 + 127.5
            truth_image = truth_image * 127.5 + 127.5
            if epoch == 0:
                cv2.imwrite(os.path.join(GEN_IMAGES_DIR,
                                         str(epoch) + "_" + str(index) + "_orig.png"),
                            orig_image)
                cv2.imwrite(os.path.join(GEN_IMAGES_DIR,
                                         str(epoch) + "_" + str(index) + "_truth.png"),
                            truth_image)
            cv2.imwrite(os.path.join(GEN_IMAGES_DIR,
                                     str(epoch) + "_" + str(index) + "_pred.png"),
                        pred_image)

        # Then after each epoch/iteration
        avg_loss = sum(loss) / len(loss)
        logs = {'loss': avg_loss}
        TC.on_epoch_end(epoch, logs)

        # Log the losses
        with open(os.path.join(LOG_DIR, 'losses.json'), 'a') as log_file:
            log_file.write("{\"epoch\":%d, \"d_loss\":%f};\n" % (epoch, avg_loss))

        print("\nAvg loss: " + str(avg_loss))

        # Save predicted mask per epoch
        predicted_attn_1 = mask_gen.predict(X_train, verbose=0)
        a_pred_1 = np.reshape(predicted_attn_1, newshape=(10, 10, 16, 16, 1))
        np.save(os.path.join(TEST_RESULTS_DIR,
                             'attention_weights_gen1_' + str(epoch) + '.npy'),
                a_pred_1)

        # Save model weights per epoch to file
        encoder.save_weights(
            os.path.join(CHECKPOINT_DIR, 'encoder_epoch_' + str(epoch) + '.h5'), True)
        decoder.save_weights(
            os.path.join(CHECKPOINT_DIR, 'decoder_epoch_' + str(epoch) + '.h5'), True)

    # Train AAE
    if ADVERSARIAL:
        exp_memory = ExperienceMemory(memory_length=100)
        for epoch in range(NB_EPOCHS_AAE):
            print("\n\nEpoch ", epoch)
            g_loss = []
            d_loss = []
            # a_loss = []

            # # Set learning rate every epoch
            # LRS.on_epoch_begin(epoch=epoch)
            lr = K.get_value(autoencoder.optimizer.lr)
            print("Learning rate: " + str(lr))

            for index in range(NB_ITERATIONS):
                # Train Autoencoder
                X = load_X(videos_list, index, DATA_DIR)
                X_train = X[:, 0:int(VIDEO_LENGTH / 2)]
                y_train = X[:, int(VIDEO_LENGTH / 2):]

                future_images = autoencoder.predict(X_train, verbose=0)
                trainable_fakes = exp_memory.get_trainable_fakes(
                    current_gens=future_images, exp_window_size=5)

                # Train Discriminator on future images (y_train, not X_train)
                X = np.concatenate((y_train, trainable_fakes))
                y = np.concatenate(
                    (np.ones(shape=(BATCH_SIZE, 10, 1), dtype=np.int),
                     np.zeros(shape=(BATCH_SIZE, 10, 1), dtype=np.int)),
                    axis=0)
                d_loss.append(discriminator.train_on_batch(X, y))

                # Train AAE
                set_trainability(discriminator, False)
                y = np.ones(shape=(BATCH_SIZE, 10, 1), dtype=np.int)
                g_loss.append(aae.train_on_batch(X_train, y))
                set_trainability(discriminator, True)

                # # Train Autoencoder
                # a_loss.append(autoencoder.train_on_batch(X_train, y_train))

                arrow = int(index / (NB_ITERATIONS / 30))
                stdout.write("\rIteration: " + str(index) + "/" + str(NB_ITERATIONS - 1) +
                             "  " + "g_loss: " + str(g_loss[len(g_loss) - 1]) +
                             "  " + "d_loss: " + str(d_loss[len(d_loss) - 1]) +
                             "\t [" + "{0}>".format("=" * (arrow)))
                stdout.flush()

            if SAVE_GENERATED_IMAGES:
                # Save generated images to file
                predicted_images = autoencoder.predict(X_train, verbose=0)
                orig_image, truth_image, pred_image = combine_images(
                    X_train, y_train, predicted_images)
                pred_image = pred_image * 127.5 + 127.5
                orig_image = orig_image * 127.5 + 127.5
                truth_image = truth_image * 127.5 + 127.5
                if epoch == 0:
                    cv2.imwrite(os.path.join(GEN_IMAGES_DIR,
                                             str(epoch) + "_" + str(index) + "_aae_orig.png"),
                                orig_image)
                    cv2.imwrite(os.path.join(GEN_IMAGES_DIR,
                                             str(epoch) + "_" + str(index) + "_aae_truth.png"),
                                truth_image)
                cv2.imwrite(os.path.join(GEN_IMAGES_DIR,
                                         str(epoch) + "_" + str(index) + "_aae_pred.png"),
                            pred_image)

            predicted_attn_1 = mask_gen.predict(X_train, verbose=0)
            a_pred_1 = np.reshape(predicted_attn_1, newshape=(10, 10, 16, 16, 1))
            np.save(os.path.join(TEST_RESULTS_DIR,
                                 'attention_weights_gen1_' + str(epoch) + '.npy'),
                    a_pred_1)

            # Then after each epoch/iteration
            # avg_a_loss = sum(a_loss) / len(a_loss)
            avg_g_loss = sum(g_loss) / len(g_loss)
            avg_d_loss = sum(d_loss) / len(d_loss)
            logs = {'g_loss': avg_g_loss, 'd_loss': avg_d_loss}
            TC.on_epoch_end(epoch, logs)

            # Log the losses
            with open(os.path.join(LOG_DIR, 'losses_aae.json'), 'a') as log_file:
                log_file.write("{\"epoch\":%d, \"g_loss\":%f, \"d_loss\":%f};\n" %
                               (epoch, avg_g_loss, avg_d_loss))

            print("\nAvg g_loss: " + str(avg_g_loss) +
                  "  Avg d_loss: " + str(avg_d_loss))

            # Save model weights per epoch to file
            encoder.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'encoder_aae_epoch_' + str(epoch) + '.h5'), True)
            decoder.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'decoder_aae_epoch_' + str(epoch) + '.h5'), True)
            discriminator.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'discriminator_aae_epoch_' + str(epoch) + '.h5'), True)

    # End TensorBoard Callback
    TC.on_train_end('_')
def main():
    STEP_LOG_RATE = 1000
    TENSORBOARD_ROOT_PATH = "tensorboard"
    CHECKPOINT_ROOT_PATH = "checkpoints_test"
    CHECKPOINTS_STEPS = 100000
    EXPERIENCE_MEMORY_CAPACITY = 6400000
    MINIBATCH_SIZE = 32
    GAMMA = 0.99
    EPSILON_START = 1.0
    EPSILON_END = 0.1
    EPSILON_DECAY_STEPS = 1000000
    LEARNING_RATE = 0.0001
    FIELD_WIDTH = 5
    FIELD_HEIGHT = 5
    USE_TARGET_NETWORK = True
    TARGET_NETWORK_UPDATE_STEPS = 10000
    STATE_AS_COORDINATES = True
    STATE_NORMALISATION = True

    descriptiveString = buildDescriptiveString(
        EXPERIENCE_MEMORY_CAPACITY, MINIBATCH_SIZE, GAMMA, EPSILON_START,
        EPSILON_END, EPSILON_DECAY_STEPS, LEARNING_RATE, STATE_AS_COORDINATES,
        STATE_NORMALISATION, FIELD_WIDTH, FIELD_HEIGHT, USE_TARGET_NETWORK,
        TARGET_NETWORK_UPDATE_STEPS)

    tensorboardDirectory = os.path.join(TENSORBOARD_ROOT_PATH, descriptiveString)
    checkpointDirectory = os.path.join(CHECKPOINT_ROOT_PATH, descriptiveString)

    # create catch environment
    catch = Catch(FIELD_WIDTH, FIELD_HEIGHT, STATE_AS_COORDINATES, STATE_NORMALISATION)
    numberOfActions = catch.getNumberOfActions()
    stateSize = catch.getStateSize()

    # create experience memory
    experienceMemory = ExperienceMemory(EXPERIENCE_MEMORY_CAPACITY, stateSize)

    # =========================================================================
    # create online network (and optionally the target network)
    # =========================================================================
    input, output, outputLabel, onlineSummary = createModel(
        stateSize, numberOfActions, isTargetNetwork=False)
    if USE_TARGET_NETWORK:
        targetInput, targetOutput, _, targetSummary = createModel(
            stateSize, numberOfActions, isTargetNetwork=True)

    with tf.name_scope("train"):
        optimizer = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE)
        loss = tf.losses.huber_loss(labels=outputLabel, predictions=output)
        train = optimizer.minimize(loss)
        tf.summary.scalar("loss", loss)

    episodicStepsSummary = tf.Summary()
    episodicRewardSummary = tf.Summary()
    explorationSummary = tf.Summary()
    experienceMemorySizeSummary = tf.Summary()
    episodicStepsSummary.value.add(tag="episodic_steps", simple_value=None)
    episodicRewardSummary.value.add(tag="episodic_reward", simple_value=None)
    explorationSummary.value.add(tag="exploration", simple_value=None)
    experienceMemorySizeSummary.value.add(tag="experience_memory_size", simple_value=None)
    trainSummary = tf.summary.merge_all(scope="train")

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    writer = tf.summary.FileWriter(tensorboardDirectory, sess.graph)
    updateTargetNetwork(sess, writer, targetSummary, 0)

    # =========================================================================
    # training loop
    # =========================================================================
    step = 0
    episode = 0
    epsilon = EPSILON_START

    while step < EPSILON_DECAY_STEPS:
        episode += 1
        catch.reset()
        state = catch.getState()
        done = False
        episodeReward = 0
        episodeSteps = 0

        while not done and step < EPSILON_DECAY_STEPS:
            step += 1

            # select next action
            if np.random.random() <= epsilon:
                actionNumber = np.random.randint(numberOfActions)
            else:
                prediction = sess.run(
                    output, feed_dict={input: np.reshape(state, (-1, stateSize))})
                actionNumber = np.argmax(prediction[0])

            # convert action number to action
            action = list(Actions)[actionNumber]

            # execute selected action
            reward, nextState, done = catch.move(action)

            # store experience to memory
            experienceMemory.store(state, actionNumber, reward, nextState, done)

            # replace current state by next state
            state = nextState

            # replay experiences
            if experienceMemory.size() > MINIBATCH_SIZE:
                # sample from experience memory
                ids, states, actions, rewards, nextStates, nextStateTerminals = \
                    experienceMemory.sample(MINIBATCH_SIZE)

                if USE_TARGET_NETWORK:
                    statePredictions = sess.run(output, feed_dict={input: states})
                    nextStatePredictions = sess.run(
                        targetOutput, feed_dict={targetInput: nextStates})
                else:
                    predictions = sess.run(
                        output,
                        feed_dict={input: np.concatenate((states, nextStates))})
                    statePredictions = predictions[:MINIBATCH_SIZE]
                    nextStatePredictions = predictions[MINIBATCH_SIZE:]

                statePredictions[np.arange(MINIBATCH_SIZE), actions] = \
                    rewards + np.invert(nextStateTerminals) * GAMMA * \
                    nextStatePredictions.max(axis=1)

                # update online network
                _, onlineSummaryResult, trainSummaryResult = sess.run(
                    [train, onlineSummary, trainSummary],
                    feed_dict={input: states, outputLabel: statePredictions})

                # write summary
                if step % STEP_LOG_RATE == 0:
                    writer.add_summary(onlineSummaryResult, step)
                    writer.add_summary(trainSummaryResult, step)

            episodeReward += reward
            episodeSteps += 1

            # update target network
            if USE_TARGET_NETWORK and step % TARGET_NETWORK_UPDATE_STEPS == 0:
                updateTargetNetwork(sess, writer, targetSummary, step)

            # write exploration summary
            if step % STEP_LOG_RATE == 0:
                explorationSummary.value[0].simple_value = epsilon
                experienceMemorySizeSummary.value[0].simple_value = experienceMemory.size()
                writer.add_summary(explorationSummary, step)
                writer.add_summary(experienceMemorySizeSummary, step)

            # save checkpoint
            if step % CHECKPOINTS_STEPS == 0:
                saveModel(checkpointDirectory, step, sess)

            # calculate epsilon for next step
            epsilon = EPSILON_START - (EPSILON_START - EPSILON_END) / (EPSILON_DECAY_STEPS / step)

        # write episodic summary
        episodicStepsSummary.value[0].simple_value = episodeSteps
        episodicRewardSummary.value[0].simple_value = episodeReward
        writer.add_summary(episodicStepsSummary, step)
        writer.add_summary(episodicRewardSummary, step)
def train(BATCH_SIZE, ENC_WEIGHTS, DEC_WEIGHTS, GEN_WEIGHTS, DIS_WEIGHTS):
    print("Loading data definitions...")
    frames_source = hkl.load(os.path.join(DATA_DIR, 'sources_train_128.hkl'))

    # Build video progressions
    videos_list = []
    start_frame_index = 1
    end_frame_index = VIDEO_LENGTH + 1
    while end_frame_index <= len(frames_source):
        frame_list = frames_source[start_frame_index:end_frame_index]
        if len(set(frame_list)) == 1:
            videos_list.append(range(start_frame_index, end_frame_index))
            start_frame_index = start_frame_index + 1
            end_frame_index = end_frame_index + 1
        else:
            start_frame_index = end_frame_index - 1
            end_frame_index = start_frame_index + VIDEO_LENGTH

    videos_list = np.asarray(videos_list, dtype=np.int32)
    n_videos = videos_list.shape[0]

    # Setup validation
    val_frames_source = hkl.load(os.path.join(VAL_DATA_DIR, 'sources_val_128.hkl'))
    val_videos_list = []
    start_frame_index = 1
    end_frame_index = VIDEO_LENGTH + 1
    while end_frame_index <= len(val_frames_source):
        val_frame_list = val_frames_source[start_frame_index:end_frame_index]
        if len(set(val_frame_list)) == 1:
            val_videos_list.append(range(start_frame_index, end_frame_index))
            start_frame_index = start_frame_index + VIDEO_LENGTH
            end_frame_index = end_frame_index + VIDEO_LENGTH
        else:
            start_frame_index = end_frame_index - 1
            end_frame_index = start_frame_index + VIDEO_LENGTH

    val_videos_list = np.asarray(val_videos_list, dtype=np.int32)
    n_val_videos = val_videos_list.shape[0]

    if SHUFFLE:
        # Shuffle images to aid generalization
        videos_list = np.random.permutation(videos_list)

    # Build the Spatio-temporal Autoencoder
    print("Creating models...")
    encoder = encoder_model()
    decoder = decoder_model()
    autoencoder = autoencoder_model(encoder, decoder)
    autoencoder.compile(loss="mean_squared_error", optimizer=OPTIM_A)

    intermediate_decoder = Model(inputs=decoder.layers[0].input,
                                 outputs=decoder.layers[1].output)
    mask_gen_1 = Sequential()
    mask_gen_1.add(encoder)
    mask_gen_1.add(intermediate_decoder)
    mask_gen_1.compile(loss='mean_squared_error', optimizer=OPTIM_G)

    if ADVERSARIAL:
        generator = refiner_g_model()
        discriminator = refiner_d_model()
        gan = gan_model(autoencoder, generator, discriminator)
        generator.compile(loss='binary_crossentropy', optimizer='sgd')
        gan.compile(loss=['mae', 'binary_crossentropy'],
                    loss_weights=LOSS_WEIGHTS,
                    optimizer=OPTIM_G,
                    metrics=['accuracy'])
        print('GAN')
        print(gan.summary())
        set_trainability(discriminator, True)
        discriminator.compile(loss='binary_crossentropy',
                              optimizer=OPTIM_D,
                              metrics=['accuracy'])
        run_utilities(encoder, decoder, autoencoder, generator, discriminator, gan,
                      ENC_WEIGHTS, DEC_WEIGHTS, GEN_WEIGHTS, DIS_WEIGHTS)
    else:
        run_utilities(encoder, decoder, autoencoder, 'None', 'None', 'None',
                      ENC_WEIGHTS, DEC_WEIGHTS, 'None', 'None')

    NB_ITERATIONS = int(n_videos / BATCH_SIZE)
    # NB_ITERATIONS = 5
    NB_VAL_ITERATIONS = int(n_val_videos / BATCH_SIZE)

    # for i in range(len(decoder.layers)):
    #     print(decoder.layers[i], str(i))
    # exit(0)

    # Setup TensorBoard Callback
    TC = tb_callback.TensorBoard(log_dir=TF_LOG_DIR,
                                 histogram_freq=0,
                                 write_graph=False,
                                 write_images=False)
    TC_gan = tb_callback.TensorBoard(log_dir=TF_LOG_GAN_DIR,
                                     histogram_freq=0,
                                     write_graph=False,
                                     write_images=False)
    LRS = lrs_callback.LearningRateScheduler(schedule=schedule)
    LRS.set_model(autoencoder)

    print("Beginning Training...")
    # Begin Training
    for epoch in range(NB_EPOCHS_AUTOENCODER):
        print("\n\nEpoch ", epoch)
        loss = []
        val_loss = []

        # Set learning rate every epoch
        LRS.on_epoch_begin(epoch=epoch)
        lr = K.get_value(autoencoder.optimizer.lr)
        print("Learning rate: " + str(lr))

        for index in range(NB_ITERATIONS):
            # Train Autoencoder
            X = load_X(videos_list, index, DATA_DIR, (128, 128, 3))
            X_train = X[:, 0:10]
            y_train = X[:, 10:]
            loss.append(autoencoder.train_on_batch(X_train, y_train))

            arrow = int(index / (NB_ITERATIONS / 40))
            stdout.write("\rIter: " + str(index) + "/" + str(NB_ITERATIONS - 1) +
                         "  " + "loss: " + str(loss[len(loss) - 1]) +
                         "\t [" + "{0}>".format("=" * (arrow)))
            stdout.flush()

        if SAVE_GENERATED_IMAGES:
            # Save generated images to file
            predicted_images = autoencoder.predict(X_train, verbose=0)
            orig_image, truth_image, pred_image = combine_images(
                X_train, y_train, predicted_images)
            pred_image = pred_image * 127.5 + 127.5
            orig_image = orig_image * 127.5 + 127.5
            truth_image = truth_image * 127.5 + 127.5
            if epoch == 0:
                cv2.imwrite(os.path.join(GEN_IMAGES_DIR,
                                         str(epoch) + "_" + str(index) + "_orig.png"),
                            orig_image)
                cv2.imwrite(os.path.join(GEN_IMAGES_DIR,
                                         str(epoch) + "_" + str(index) + "_truth.png"),
                            truth_image)
            cv2.imwrite(os.path.join(GEN_IMAGES_DIR,
                                     str(epoch) + "_" + str(index) + "_pred.png"),
                        pred_image)

            predicted_attn = mask_gen_1.predict(X_train, verbose=0)
            a_pred = np.reshape(predicted_attn,
                                newshape=(BATCH_SIZE, VIDEO_LENGTH - 10, 16, 16, 1))
            np.save(os.path.join(ATTN_WEIGHTS_DIR,
                                 'attention_weights_cla_gen1_' + str(epoch) + '.npy'),
                    a_pred)

        # Run over validation data
        for index in range(NB_VAL_ITERATIONS):
            X = load_X(val_videos_list, index, VAL_DATA_DIR, (128, 128, 3))
            X_train = X[:, 0:10]
            y_train = X[:, 10:]
            val_loss.append(autoencoder.test_on_batch(X_train, y_train))

            arrow = int(index / (NB_VAL_ITERATIONS / 40))
            stdout.write("\rIter: " + str(index) + "/" + str(NB_VAL_ITERATIONS - 1) +
                         "  " + "val_loss: " + str(val_loss[len(val_loss) - 1]) +
                         "\t [" + "{0}>".format("=" * (arrow)))
            stdout.flush()

        # Then after each epoch/iteration
        avg_loss = sum(loss) / len(loss)
        avg_val_loss = sum(val_loss) / len(val_loss)
        logs = {'loss': avg_loss, 'val_loss': avg_val_loss}
        TC.on_epoch_end(epoch, logs)

        # Log the losses
        with open(os.path.join(LOG_DIR, 'losses.json'), 'a') as log_file:
            log_file.write("{\"epoch\":%d, \"loss\":%f};\n" % (epoch, avg_loss))

        print("\nAvg loss: " + str(avg_loss) + "  Avg val loss: " + str(avg_val_loss))

        # Save model weights per epoch to file
        encoder.save_weights(
            os.path.join(CHECKPOINT_DIR, 'encoder_epoch_' + str(epoch) + '.h5'), True)
        decoder.save_weights(
            os.path.join(CHECKPOINT_DIR, 'decoder_epoch_' + str(epoch) + '.h5'), True)

        predicted_attn = mask_gen_1.predict(X_train, verbose=0)
        a_pred = np.reshape(predicted_attn,
                            newshape=(BATCH_SIZE, VIDEO_LENGTH - 10, 16, 16, 1))
        np.save(os.path.join(ATTN_WEIGHTS_DIR,
                             'attention_weights_cla_gen1_' + str(epoch) + '.npy'),
                a_pred)

    # Train GAN
    if ADVERSARIAL:
        print("Training Stage II.")
        exp_memory = ExperienceMemory(memory_length=100)
        for epoch in range(NB_EPOCHS_GAN):
            print("\n\nEpoch ", epoch)
            g_loss = []
            val_g_loss = []
            d_loss = []
            val_d_loss = []
            # a_loss = []

            # # Set learning rate every epoch
            # LRS.on_epoch_begin(epoch=epoch)
            lr = K.get_value(gan.optimizer.lr)
            print("GAN learning rate: " + str(lr))
            lr = K.get_value(discriminator.optimizer.lr)
            print("Disc learning rate: " + str(lr))
            print("g_loss_metrics: " + str(gan.metrics_names))
            print("d_loss_metrics: " + str(discriminator.metrics_names))

            for index in range(NB_ITERATIONS):
                # Train Autoencoder
                X = load_X(videos_list, index, DATA_DIR, (128, 128, 3))
                X_hd = load_X(videos_list, index, HD_DATA_DIR, (256, 256, 3))
                X128 = X[:, 0:int(VIDEO_LENGTH / 2)]
                Y128 = autoencoder.predict(X128, verbose=0)
                X256_real = X_hd[:, int(VIDEO_LENGTH / 2):]
                X256_fake = generator.predict(Y128, verbose=0)
                trainable_fakes = exp_memory.get_trainable_fakes(
                    current_gens=X256_fake, exp_window_size=4)

                # Train Discriminator on future images (y_train, not X_train)
                X = np.concatenate((X256_real, trainable_fakes))
                y = np.concatenate(
                    (np.ones(shape=(BATCH_SIZE, 10, 1), dtype=np.float32),
                     np.zeros(shape=(BATCH_SIZE, 10, 1), dtype=np.float32)),
                    axis=0)
                d_loss.append(discriminator.train_on_batch(X, y))

                # Train GAN with the discriminator frozen
                set_trainability(discriminator, False)
                y = np.ones(shape=(BATCH_SIZE, 10, 1), dtype=np.float32)
                g_loss.append(gan.train_on_batch(X128, [X256_real, y]))
                set_trainability(discriminator, True)

                # # Train Autoencoder
                # a_loss.append(autoencoder.train_on_batch(X_train, y_train))

                arrow = int(index / (NB_ITERATIONS / 30))
                stdout.write("\rIter: " + str(index) + "/" + str(NB_ITERATIONS - 1) +
                             "  " + "g_loss: " + str([g_loss[len(g_loss) - 1][j] for j in [0, -1]]) +
                             "  " + "d_loss: " + str(d_loss[len(d_loss) - 1]) +
                             "\t [" + "{0}>".format("=" * (arrow)))
                stdout.flush()

            if SAVE_GENERATED_IMAGES:
                # Save generated images to file
                predicted_images = generator.predict(Y128, verbose=0)
                orig_image, truth_image, pred_image = combine_images(
                    Y128, X256_real, predicted_images)
                pred_image = pred_image * 127.5 + 127.5
                orig_image = orig_image * 127.5 + 127.5
                truth_image = truth_image * 127.5 + 127.5
                if epoch == 0:
                    cv2.imwrite(os.path.join(CLA_GEN_IMAGES_DIR,
                                             str(epoch) + "_" + str(index) + "_gan_orig.png"),
                                orig_image)
                    cv2.imwrite(os.path.join(CLA_GEN_IMAGES_DIR,
                                             str(epoch) + "_" + str(index) + "_gan_truth.png"),
                                truth_image)
                cv2.imwrite(os.path.join(CLA_GEN_IMAGES_DIR,
                                         str(epoch) + "_" + str(index) + "_gan_pred.png"),
                            pred_image)

            # Run over validation data
            print('')
            for index in range(NB_VAL_ITERATIONS):
                X = load_X(val_videos_list, index, VAL_DATA_DIR, (128, 128, 3))
                X_hd = load_X(val_videos_list, index, VAL_HD_DATA_DIR, (256, 256, 3))
                X128_val = X[:, 0:int(VIDEO_LENGTH / 2)]
                Y128_val = autoencoder.predict(X128_val, verbose=0)
                X256_real_val = X_hd[:, int(VIDEO_LENGTH / 2):]
                X256_fake_val = generator.predict(Y128_val, verbose=0)

                X = np.concatenate((X256_real_val, X256_fake_val))
                y = np.concatenate(
                    (np.ones(shape=(BATCH_SIZE, 10, 1), dtype=np.float32),
                     np.zeros(shape=(BATCH_SIZE, 10, 1), dtype=np.float32)),
                    axis=0)
                val_d_loss.append(discriminator.test_on_batch(X, y))

                y = np.ones(shape=(BATCH_SIZE, 10, 1), dtype=np.float32)
                val_g_loss.append(gan.test_on_batch(X128_val, [X256_real_val, y]))

                arrow = int(index / (NB_VAL_ITERATIONS / 40))
                stdout.write("\rIter: " + str(index) + "/" + str(NB_VAL_ITERATIONS - 1) +
                             "  " + "val_g_loss: " +
                             str([val_g_loss[len(val_g_loss) - 1][j] for j in [0, -1]]) +
                             "  " + "val_d_loss: " + str(val_d_loss[len(val_d_loss) - 1]))
                stdout.flush()

            # Then after each epoch/iteration
            avg_d_loss = np.mean(np.asarray(d_loss, dtype=np.float32), axis=0)
            avg_val_d_loss = np.mean(np.asarray(val_d_loss, dtype=np.float32), axis=0)
            avg_g_loss = np.mean(np.asarray(g_loss, dtype=np.float32), axis=0)
            avg_val_g_loss = np.mean(np.asarray(val_g_loss, dtype=np.float32), axis=0)

            loss_values = np.asarray(avg_d_loss.tolist() + avg_val_d_loss.tolist() +
                                     avg_g_loss.tolist() + avg_val_g_loss.tolist(),
                                     dtype=np.float32)
            d_loss_keys = ['d_' + metric for metric in discriminator.metrics_names]
            g_loss_keys = ['g_' + metric for metric in gan.metrics_names]
            val_d_loss_keys = ['d_val_' + metric for metric in discriminator.metrics_names]
            val_g_loss_keys = ['g_val_' + metric for metric in gan.metrics_names]
            loss_keys = d_loss_keys + val_d_loss_keys + g_loss_keys + val_g_loss_keys
            logs = dict(zip(loss_keys, loss_values))
            TC_gan.on_epoch_end(epoch, logs)

            # Log the losses
            with open(os.path.join(LOG_DIR, 'losses_gan.json'), 'a') as log_file:
                log_file.write("{\"epoch\":%d, %s;\n" % (epoch, logs))

            print("\nAvg d_loss: " + str(avg_d_loss) +
                  "  Avg val_d_loss: " + str(avg_val_d_loss) +
                  "\nAvg g_loss: " + str([avg_g_loss[j] for j in [0, -1]]) +
                  "  Avg val_g_loss: " + str([avg_val_g_loss[j] for j in [0, -1]]))

            # Save model weights per epoch to file
            encoder.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'encoder_gan_epoch_' + str(epoch) + '.h5'), True)
            decoder.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'decoder_gan_epoch_' + str(epoch) + '.h5'), True)
            generator.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'generator_gan_epoch_' + str(epoch) + '.h5'), True)
            discriminator.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'discriminator_gan_epoch_' + str(epoch) + '.h5'), True)

    # End TensorBoard Callback
    TC.on_train_end('_')
from random import randint, random

import torch

# ExperienceMemory is the project's replay-buffer class (defined elsewhere in this repo).


class QAgent:
    """
    A QAgent represents a Q-learning agent, which approximates the optimal Q-value
    of every (state, action) couple (s, a).
    """

    def __init__(self, nb_states: int, nb_actions: int, epsilon_prob: float = 0.05,
                 gamma=0.99, lr=0.1, batch_replay_size=1024):
        """
        :param nb_states: Number of states reachable in the environment.
        :param nb_actions: Number of possible actions. If the number of actions
            differs depending on the state, this should be the maximum number of actions.
        :param epsilon_prob: Epsilon probability. Defaults to 5%.
        :param gamma: Discount factor.
        :param lr: Learning rate.
        :param batch_replay_size: Size of batches to train on during updates.
        """
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Matrix containing Q-values for every (s, a) couple
        self.Q = torch.zeros([nb_states, nb_actions], dtype=torch.float32)

        self.epsilon_prob = epsilon_prob
        # Discount factor
        self.gamma = gamma
        # Learning rate
        self.lr = lr
        # Experience memory
        self.mem = ExperienceMemory()
        self.batch_replay_size = batch_replay_size

    def decide(self, state: int):
        """
        :param state: State index.
        :return: The action a that is best according to the agent (the one with
            the best Q-value), or a random action with probability epsilon.
        """
        if random() < self.epsilon_prob:
            return randint(0, self.nb_actions - 1)
        return torch.argmax(self.Q[state])

    def memorize(self, state: int, action: int, next_state: int, reward: torch.float32):
        """
        Stores an experience into the experience memory.
        :param state: Starting state index.
        :param action: Index of the action taken.
        :param next_state: Index of the state reached after taking the action.
        :param reward: Reward received from the environment.
        """
        self.mem.memorize(torch.tensor([[state]]), torch.tensor([[action]]),
                          torch.tensor([[next_state]]), reward)

    def update(self):
        """
        Updates the agent's Q values using experience replay.
        """
        states, actions, nstates, rewards = self.mem.random_batch(self.batch_replay_size)
        for s, a, ns, r in zip(states, actions, nstates, rewards):
            s = s.item()
            a = a.item()
            ns = ns.item()
            r = r.item()
            # Standard tabular Q-learning update:
            # Q(s, a) <- (1 - lr) * Q(s, a) + lr * (r + gamma * max_a' Q(ns, a'))
            self.Q[s, a] = (1 - self.lr) * self.Q[s, a] \
                + self.lr * (r + self.gamma * torch.max(self.Q[ns]))
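# ---------------------------------------------------------------------------
# A minimal training-loop sketch (not part of the original file) for the
# tabular QAgent above. `GridEnv`/`env` is a hypothetical environment exposing
# reset() -> int and step(action) -> (next_state, reward, done); that API is
# an assumption, not something defined in this repository. update() is called
# once per episode and assumes the memory already holds enough transitions to
# return a batch.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    env = GridEnv()                       # hypothetical environment with 16 states and 4 actions
    agent = QAgent(nb_states=16, nb_actions=4, epsilon_prob=0.1,
                   gamma=0.99, lr=0.1, batch_replay_size=256)

    for episode in range(500):
        state = env.reset()
        done = False
        while not done:
            action = int(agent.decide(state))        # epsilon-greedy action index
            next_state, reward, done = env.step(action)
            agent.memorize(state, action, next_state, reward)
            state = next_state
        # Replay a random batch of stored transitions and update the Q table.
        agent.update()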