Example #1
    def __init__(self,
                 nb_states: int,
                 nb_actions: int,
                 epsilon_prob: float = 0.05,
                 gamma=0.99,
                 lr=0.1,
                 batch_replay_size=1024):
        """
        :param nb_states:   Number of states reachable in the environment.
        :param nb_actions:  Number of possible actions. If the number of actions
                            differs depending on the state, should be the maximum
                            amount of actions.
        :param epsilon_prob: Epsilon probability. Defaults to 5%.
        :param gamma: Discount factor.
        :param lr: Learning rate.
        :param batch_replay_size: Size of batches to train on during updates.
        """
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Matrix containing the Q-value for every (s, a) pair
        self.Q = torch.zeros([nb_states, nb_actions], dtype=torch.float32)
        self.epsilon_prob = epsilon_prob

        # Discount Factor
        self.gamma = gamma

        # Learning rate
        self.lr = lr

        # Experience memory
        self.mem = ExperienceMemory()
        self.batch_replay_size = batch_replay_size
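
The snippet above only shows the constructor. Below is a minimal sketch (not from the original code; the function names are hypothetical) of the epsilon-greedy choice and tabular Q-learning update that these fields are normally paired with:

import random
import torch

def epsilon_greedy_action(agent, state: int) -> int:
    # Explore with probability epsilon, otherwise exploit the Q table.
    if random.random() < agent.epsilon_prob:
        return random.randrange(agent.nb_actions)
    return int(torch.argmax(agent.Q[state]).item())

def q_update(agent, s: int, a: int, r: float, s_next: int, done: bool):
    # Tabular Q-learning target: r + gamma * max_a' Q[s', a'] (no bootstrap on terminal states).
    bootstrap = 0.0 if done else agent.gamma * torch.max(agent.Q[s_next]).item()
    agent.Q[s, a] += agent.lr * (r + bootstrap - agent.Q[s, a])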
Example #2
class Training:
    def __init__(self, nn, train_env, exp_mem_size=200000, learning_rate=0.0001,
                 step_number_greedy_stop=10000, min_greedy=0.05):
        self.nn = nn
        self.train_env = train_env
        self.action = 0

        self.sess = tf.Session()
        self.trainer = tf.train.AdamOptimizer(learning_rate).minimize(nn.cost)
        self.sess.run(tf.global_variables_initializer())

        self.mem = ExperienceMemory(exp_mem_size)
        self.greedy_eps = 1
        self.greedy_eps_step = (self.greedy_eps - min_greedy) / step_number_greedy_stop
        self.min_greedy = min_greedy

        self.writer = tf.summary.FileWriter("../summary", self.sess.graph)
        self.merged_summary = tf.summary.merge_all()

    def next_step(self):
        self.prev_s = self.train_env.s
        rnd_action = np.zeros(self.train_env.ACTION_NUMBER)
        rnd_action[rnd.randint(0, self.train_env.ACTION_NUMBER - 1)] = 1.0
        self.action = rnd_action if rnd.random() < self.greedy_eps \
            else self.sess.run(self.nn.output, feed_dict={self.nn.s: [self.train_env.s]})[0]
        self.train_env.act(np.argmax(self.action))
        self.add_mem()

        if self.greedy_eps > self.min_greedy:
            self.greedy_eps -= self.greedy_eps_step

        if self.train_env.is_terminate():
            print(self.train_env.score)
            self.train_env.reset()


    def train_batch(self, batch_size, frame_train):
        print(self.greedy_eps)
        nb_batch = frame_train // batch_size
        for batch_id in range(nb_batch):
            batch = self.mem.pick_random(batch_size)
            _, cost, summaries = self.sess.run(
                [self.trainer, self.nn.cost, self.merged_summary],
                feed_dict={self.nn.s: batch['s'], self.nn.s_: batch['s_'],
                           self.nn.r: batch['r'], self.nn.a: batch['a']})
            self.writer.add_summary(summaries)
            #print('cost : ' , cost)


    def play(self, n_step):
        for i in range(n_step):
            self.next_step()

    def add_mem(self):
        self.mem.push(self.prev_s, self.train_env.s, self.train_env.r, self.action)
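
A brief usage sketch for the Training class above; q_net and env are hypothetical placeholders, and only the Training methods come from the snippet itself:

# q_net is assumed to expose the .s, .s_, .r, .a placeholders plus .cost and .output tensors,
# and env to expose .s, .r, .score, .ACTION_NUMBER, act(), is_terminate() and reset().
training = Training(q_net, env)
for _ in range(100):
    training.play(1000)                                    # epsilon-greedy exploration steps
    training.train_batch(batch_size=32, frame_train=1000)  # replay ~31 random minibatches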
Example #3
    def __init__(self, nn, train_env, exp_mem_size=200000, learning_rate=0.0001,
                 step_number_greedy_stop=10000, min_greedy=0.05):
        self.nn = nn
        self.train_env = train_env
        self.action = 0

        self.sess = tf.Session()
        self.trainer = tf.train.AdamOptimizer(learning_rate).minimize(nn.cost)
        self.sess.run(tf.global_variables_initializer())

        self.mem = ExperienceMemory(exp_mem_size)
        self.greedy_eps = 1
        self.greedy_eps_step = (self.greedy_eps - min_greedy) / step_number_greedy_stop
        self.min_greedy = min_greedy

        self.writer = tf.summary.FileWriter("../summary", self.sess.graph)
        self.merged_summary = tf.summary.merge_all()
Example #4
    def __init__(self, neural_net: nn.Module, state_dim: int,
                 batch_size: int, lr=0.01, epsilon_prob=0.05, discount=0.9,
                 device=None):
        """
        :param neural_net: A Neural Network created with PyTorch. Needs to be a subclass of
                           torch.nn.Module and implement the methods __init__ and forward(self, batch).
        :param state_dim: Number of dimensions needed to define a state. Needs to equal the input dimension
                          of the given neural net.
        :param batch_size: Number of experiences on which the network trains during each update.
                           NOTE that the network has to explore at least batch_size experiences
                           before training a first time.
        :param lr:   Learning rate.
        :param epsilon_prob: Probability that the network chooses a random action rather than the
                             best one according to the QValues. Only relevant if decide() is used.
        :param discount: Discount factor (usually called gamma), representing the importance of early
                         decisions comparatively to later ones.
        :param device:   Device that will be used to compute the calculations. Defaults to the CPU,
                         since the GPU is not necessarily faster for exploration.
        """
        self.net = neural_net
        self.net.zero_grad()
        self.state_dim = state_dim
        self.forward = self.net.forward
        self.batch_size = batch_size
        self.epsilon = epsilon_prob
        self.discount = discount

        # Random decision mode: If True, the agent will decide actions randomly
        self.random_mode = False

        # If the user did not specify a computation device, the cpu is used by default
        # This is because GPU isn't necessarily faster for explorations
        if device is None:
            dev_name = "cpu"
            device = torch.device(dev_name)

        # Set the neural network and memory to the device
        self.net.to(device)
        self.mem = ExperienceMemory(device)

        self.device = device

        # Training memory
        self.loss_mem = []

        # Update tools
        self.optimizer = optim.SGD(self.net.parameters(), lr)
Example #5
def run_carla_client(args):
    # Run up to 60000 episodes with 400 frames each.
    number_of_episodes = 60000
    frames_per_episode = 400

    # We assume the CARLA server is already waiting for a client to connect at
    # host:port. To create a connection we can use the `make_carla_client`
    # context manager, it creates a CARLA client object and starts the
    # connection. It will throw an exception if something goes wrong. The
    # context manager makes sure the connection is always cleaned up on exit.
    with make_carla_client(args.host, args.port, 30) as client:
        print('CarlaClient connected')

        # =============================================================================
        #       Global initialisations
        # =============================================================================
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        K.set_session(sess)

        state_size = {
            'state_2D': (
                64,
                64,
                9,
            ),
            'state_1D': (17, )
        }
        action_size = (5, )

        critic = Critic(sess, state_size, action_size, CRITIC_LR)
        critic.target_train()
        actor = Actor(sess, state_size, action_size, ACTOR_LR)
        actor.target_train()
        memory = ExperienceMemory(100000, False)

        target_update_counter = 0
        target_update_freq = TARGET_UPDATE_BASE_FREQ

        explore_rate = 0.2

        success_counter = 0

        total_t = 0
        t = 0
        #NOTE This is only a trial target; it still needs to be reworked.
        target = {
            'pos': np.array([-3.7, 236.4, 0.9]),
            'ori': np.array([0.00, -1.00, 0.00])
        }

        if args.settings_filepath is None:
            # Create a CarlaSettings object. This object is a wrapper around
            # the CarlaSettings.ini file. Here we set the configuration we
            # want for the new episode.
            settings = CarlaSettings()
            settings.set(SynchronousMode=True,
                         SendNonPlayerAgentsInfo=True,
                         NumberOfVehicles=0,
                         NumberOfPedestrians=0,
                         WeatherId=random.choice([1]),
                         QualityLevel=args.quality_level)
            #            settings.randomize_seeds()
            #
            #            settings.randomize_seeds()
            # The default camera captures RGB images of the scene.
            camera0 = Camera('CameraRGB')
            # Set image resolution in pixels.
            camera0.set_image_size(64, 64)
            # Set its position relative to the car in centimeters.
            camera0.set_position(0.30, 0, 1.30)
            settings.add_sensor(camera0)
        else:

            # Alternatively, we can load these settings from a file.
            with open(args.settings_filepath, 'r') as fp:
                settings = fp.read()
        scene = client.load_settings(settings)

        # =============================================================================
        #       EPISODES LOOP
        # =============================================================================
        for episode in range(0, number_of_episodes):
            # Start a new episode.
            # Choose one player start at random.
            number_of_player_starts = len(scene.player_start_spots)
            player_start = random.randint(0, max(0,
                                                 number_of_player_starts - 1))
            player_start = 0
            total_reward = 0.
            # Notify the server that we want to start the episode at the
            # player_start index. This function blocks until the server is ready
            # to start the episode.
            print('Starting new episode...')
            client.start_episode(player_start)

            #TODO Our learning algorithm should be implemented inside this block

            # =============================================================================
            #           Episodic initialisations
            # =============================================================================
            collisions = {'car': 0, 'ped': 0, 'other': 0}
            reverse = -1.0
            measurements, sensor_data = client.read_data()
            state = get_state_from_data(measurements, sensor_data, reverse)
            goal = get_goal_from_data(target)
            t = 0
            stand_still_counter = 0
            # =============================================================================
            #           STEPS LOOP
            # =============================================================================
            for frame in range(0, frames_per_episode):
                t = t + 1
                total_t += 1
                target_update_counter += 1
                explore_dev = 0.6 / (1 + total_t / 30000)
                explore_rate = 0.3 / (1 + total_t / 30000)
                # Print some of the measurements.
                #   print_measurements(measurements)

                # Save the images to disk if requested.
                if args.save_images_to_disk and False:
                    for name, measurement in sensor_data.items():
                        filename = args.out_filename_format.format(
                            episode, name, frame)
                        measurement.save_to_disk(filename)

                if state['state_1D'][9] < 5 and t > 50:
                    stand_still_counter += 1
                else:
                    stand_still_counter = 0
                #Calculate the action
                a_pred = actor.model.predict([
                    np.expand_dims(state['state_2D'], 0),
                    np.expand_dims(np.concatenate((state['state_1D'], goal)),
                                   0)
                ])[0]
                #Add exploration noise to action
                a = add_noise(a_pred, explore_dev, explore_rate)
                control = get_control_from_a(a)
                # Send control to the server
                client.send_control(control)

                #
                # =============================================================================
                #               TRAINING THE NETWORKS
                # =============================================================================
                if memory.num_items > 6000:
                    batch, indeces = memory.sample_experience(MINI_BATCH_SIZE)
                    raw_states = [[e[0]['state_2D'], e[0]['state_1D']]
                                  for e in batch]
                    goals = np.asarray([e[5] for e in batch])
                    states = {
                        'state_2D':
                        np.atleast_2d(np.asarray([e[0]
                                                  for e in raw_states[:]])),
                        'state_1D':
                        np.atleast_2d(
                            np.asarray([
                                np.concatenate([e[1], goals[i]], axis=-1)
                                for i, e in enumerate(raw_states[:])
                            ]))
                    }

                    actions = np.asarray([e[1] for e in batch])
                    rewards = np.asarray([np.sum(e[2])
                                          for e in batch]).reshape(-1, 1)

                    raw_new_states = [[e[3]['state_2D'], e[3]['state_1D']]
                                      for e in batch]
                    new_states = {
                        'state_2D':
                        np.atleast_2d(
                            np.asarray([e[0] for e in raw_new_states[:]])),
                        'state_1D':
                        np.atleast_2d(
                            np.asarray([
                                np.concatenate([e[1], goals[i]], axis=-1)
                                for i, e in enumerate(raw_new_states[:])
                            ]))
                    }

                    overs = np.asarray([e[4] for e in batch]).reshape(-1, 1)

                    best_a_preds = actor.target_model.predict(
                        [new_states['state_2D'], new_states['state_1D']])
                    max_qs = critic.target_model.predict([
                        new_states['state_2D'], new_states['state_1D'],
                        best_a_preds
                    ])

                    ys = rewards + (1 - overs) * GAMMA * max_qs
                    #Train Critic network
                    critic.model.train_on_batch(
                        [states['state_2D'], states['state_1D'], actions], ys)
                    #Train Actor network
                    a_for_grads = actor.model.predict(
                        [states['state_2D'], states['state_1D']])
                    a_grads = critic.gradients(states, a_for_grads)
                    actor.train(states, a_grads)

                    #Train target networks
                    if target_update_counter >= int(target_update_freq):
                        target_update_counter = 0
                        target_update_freq = target_update_freq * TARGET_UPDATE_MULTIPLIER
                        critic.target_train()
                        actor.target_train()
                # =============================================================================
                #               GET AND STORE OBSERVATIONS
                # =============================================================================
                # Get next measurements
                measurements, sensor_data = client.read_data()
                new_state = get_state_from_data(measurements, sensor_data,
                                                reverse, state)

                #TODO Calculate reward
                r_goal, success = calculate_goal_reward(
                    np.atleast_2d(new_state['state_1D']), goal)
                r_general, collisions = calculate_general_reward(
                    measurements, collisions)
                over = stand_still_counter > 30 or success
                success_counter += int(bool(success) * 1)
                total_reward += r_goal
                total_reward += r_general
                #Store observation
                if t > 10:
                    experience = pd.DataFrame(
                        [[
                            state, a,
                            np.array([r_goal, r_general]), new_state,
                            bool(over), goal, episode, 0
                        ]],
                        columns=['s', 'a', 'r', "s'", 'over', 'g', 'e', 'p'],
                        copy=True)
                    memory.add_experience(experience)

                #Set the state to the next state
                state = new_state
                if over:
                    break
            sub_goal = deepcopy(state['state_1D'][0:6])
            print(str(episode) + ". Episode###################")
            print("Total reward: " + str(total_reward))
            print("Success counter: " + str(success_counter))
            if (episode % 10 == 0):
                print("############## DEBUG LOG ################")
                print("Memory state: " + str(memory.num_items))
                print("Target update counter: " + str(target_update_counter))
                print("Exploration rate: " + str(explore_rate))
                print("Exploration dev: " + str(explore_dev))
                print("Total timesteps: " + str(total_t))
                print("Average episode length: " + str(total_t /
                                                       (episode + 1)))
                print("#########################################")


            # =============================================================================
            #           REPLAY FOR SUBGOALS
            # =============================================================================
            batch = memory.get_last_episode(t)
            raw_new_states = [[e[3]['state_2D'], e[3]['state_1D']]
                              for e in batch]
            new_states = {
                'state_2D':
                np.atleast_2d(np.asarray([e[0] for e in raw_new_states[:]])),
                'state_1D':
                np.atleast_2d(np.asarray([e[1] for e in raw_new_states[:]]))
            }
            rewards = np.asarray([e[2] for e in batch]).reshape(-1, 2)
            r_subgoal = calculate_goal_reward(new_states['state_1D'],
                                              sub_goal)[0]
            rewards[:, 0] = r_subgoal
            subgoal_batch = [[
                v[0], v[1],
                list(rewards)[i], v[3], v[4], sub_goal, v[6], v[7]
            ] for i, v in enumerate(batch)]
            experiences = pd.DataFrame(
                subgoal_batch,
                columns=['s', 'a', 'r', "s'", 'over', 'g', 'e', 'p'],
                copy=True)
            memory.add_experience(experiences)
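
The ExperienceMemory used in this example is not shown. Judging only from the calls made above (num_items, add_experience with a one-row pandas DataFrame, sample_experience, get_last_episode), a minimal pandas-backed sketch could look like the following; the class name, constructor arguments and internals are assumptions, not the original implementation:

import pandas as pd

class PandasExperienceMemorySketch:
    COLUMNS = ['s', 'a', 'r', "s'", 'over', 'g', 'e', 'p']

    def __init__(self, capacity, prioritized=False):
        self.capacity = capacity
        self.frame = pd.DataFrame(columns=self.COLUMNS)

    @property
    def num_items(self):
        return len(self.frame)

    def add_experience(self, experience):
        # Append the new row(s) and drop the oldest ones once capacity is exceeded.
        self.frame = pd.concat([self.frame, experience], ignore_index=True)
        if len(self.frame) > self.capacity:
            self.frame = self.frame.iloc[-self.capacity:].reset_index(drop=True)

    def sample_experience(self, batch_size):
        # Uniform random minibatch; rows come back as plain lists so that
        # e[0] is the state dict, e[1] the action, and so on, as indexed above.
        sample = self.frame.sample(batch_size)
        return sample.values.tolist(), sample.index.to_numpy()

    def get_last_episode(self, t):
        # The last t stored transitions, i.e. the episode that just ended.
        return self.frame.iloc[-t:].values.tolist()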
Example #6
def main():

    parser = argparse.ArgumentParser(
        'a program to train or run a deep q-learning agent')
    parser.add_argument("game", type=str, help="name of game to play")
    parser.add_argument("agent_type",
                        type=str,
                        help="name of learning/acting technique used")
    parser.add_argument("agent_name",
                        type=str,
                        help="unique name of this agent instance")
    parser.add_argument("--rom_path",
                        type=str,
                        help="path to directory containing atari game roms",
                        default='../roms')
    parser.add_argument(
        "--watch",
        help=
        "if true, a pretrained model with the specified name is loaded and tested with the game screen displayed",
        action='store_true')

    parser.add_argument("--epochs",
                        type=int,
                        help="number of epochs",
                        default=200)
    parser.add_argument("--epoch_length",
                        type=int,
                        help="number of steps in an epoch",
                        default=250000)
    parser.add_argument("--test_steps",
                        type=int,
                        help="max number of steps per test",
                        default=125000)
    parser.add_argument("--test_steps_hardcap",
                        type=int,
                        help="absolute max number of steps per test",
                        default=135000)
    parser.add_argument("--test_episodes",
                        type=int,
                        help="max number of episodes per test",
                        default=30)
    parser.add_argument("--history_length",
                        type=int,
                        help="number of frames in a state",
                        default=4)
    parser.add_argument("--training_frequency",
                        type=int,
                        help="number of steps run before training",
                        default=4)
    parser.add_argument(
        "--random_exploration_length",
        type=int,
        help=
        "number of randomly-generated experiences to initially fill experience memory",
        default=50000)
    parser.add_argument("--initial_exploration_rate",
                        type=float,
                        help="initial exploration rate",
                        default=1.0)
    parser.add_argument("--final_exploration_rate",
                        type=float,
                        help="final exploration rate from linear annealing",
                        default=0.1)
    parser.add_argument(
        "--final_exploration_frame",
        type=int,
        help="frame at which the final exploration rate is reached",
        default=1000000)
    parser.add_argument("--test_exploration_rate",
                        type=float,
                        help="exploration rate while testing",
                        default=0.05)
    parser.add_argument("--frame_skip",
                        type=int,
                        help="number of frames to repeat chosen action",
                        default=4)
    parser.add_argument("--screen_dims",
                        type=tuple,
                        help="dimensions to resize frames",
                        default=(84, 84))
    # used for stochasticity and to help prevent overfitting.
    # Must be greater than frame_skip * (observation_length -1) + buffer_length - 1
    parser.add_argument("--max_start_wait",
                        type=int,
                        help="max number of frames to wait for initial state",
                        default=60)
    # buffer_length = 1 prevents blending
    parser.add_argument("--buffer_length",
                        type=int,
                        help="length of buffer to blend frames",
                        default=2)
    parser.add_argument("--blend_method",
                        type=str,
                        help="method used to blend frames",
                        choices=('max',),
                        default='max')
    parser.add_argument("--reward_processing",
                        type=str,
                        help="method to process rewards",
                        choices=('clip', 'none'),
                        default='clip')
    # must set network_architecture to custom in order use custom architecture
    parser.add_argument(
        "--conv_kernel_shapes",
        type=tuple,
        help=
        "shapes of convnet kernels: ((height, width, in_channels, out_channels), (next layer))"
    )
    # must have same length as conv_kernel_shapes
    parser.add_argument(
        "--conv_strides",
        type=tuple,
        help="convnet strides: ((1, height, width, 1), (next layer))")
    # currently,  you must have at least one dense layer
    parser.add_argument(
        "--dense_layer_shapes",
        type=tuple,
        help="shapes of dense layers: ((in_size, out_size), (next layer))")
    parser.add_argument("--discount_factor",
                        type=float,
                        help="constant to discount future rewards",
                        default=0.99)
    parser.add_argument("--learning_rate",
                        type=float,
                        help="constant to scale parameter updates",
                        default=0.00025)
    parser.add_argument("--optimizer",
                        type=str,
                        help="optimization method for network",
                        choices=('rmsprop', 'graves_rmsprop'),
                        default='rmsprop')
    parser.add_argument("--rmsprop_decay",
                        type=float,
                        help="decay constant for moving average in rmsprop",
                        default=0.95)
    parser.add_argument("--rmsprop_epsilon",
                        type=float,
                        help="constant to stabilize rmsprop",
                        default=0.01)
    # set error_clipping to less than 0 to disable
    parser.add_argument(
        "--error_clipping",
        type=float,
        help="constant at which td-error becomes linear instead of quadratic",
        default=1.0)
    # set gradient clipping to 0 or less to disable.  Currently only works with graves_rmsprop.
    parser.add_argument("--gradient_clip",
                        type=float,
                        help="clip gradients to have the provided L2-norm",
                        default=0)
    parser.add_argument("--target_update_frequency",
                        type=int,
                        help="number of steps between target network updates",
                        default=10000)
    parser.add_argument(
        "--memory_capacity",
        type=int,
        help="max number of experiences to store in experience memory",
        default=1000000)
    parser.add_argument(
        "--batch_size",
        type=int,
        help="number of transitions sampled from memory during learning",
        default=32)
    # must set to custom in order to specify custom architecture
    parser.add_argument("--network_architecture",
                        type=str,
                        help="name of prespecified network architecture",
                        choices=("deepmind_nips", "deepmind_nature", "custom"),
                        default="deepmind_nature")
    parser.add_argument("--recording_frequency",
                        type=int,
                        help="number of steps before tensorboard recording",
                        default=50000)

    parser.add_argument("--saving_threshold",
                        type=int,
                        help="min score threshold for saving model.",
                        default=0)

    parser.add_argument("--parallel",
                        help="parallelize acting and learning",
                        action='store_true')
    parser.add_argument(
        "--double_dqn",
        help="use double q-learning algorithm in error target calculation",
        action='store_true')
    args = parser.parse_args()

    if args.network_architecture == 'deepmind_nature':
        args.conv_kernel_shapes = [[8, 8, 4, 32], [4, 4, 32, 64],
                                   [3, 3, 64, 64]]
        args.conv_strides = [[1, 4, 4, 1], [1, 2, 2, 1], [1, 1, 1, 1]]
        args.dense_layer_shapes = [[3136, 512]]
    elif args.network_architecture == 'deepmind_nips':
        args.conv_kernel_shapes = [[8, 8, 4, 16], [4, 4, 16, 32]]
        args.conv_strides = [[1, 4, 4, 1], [1, 2, 2, 1]]
        args.dense_layer_shapes = [[2592, 256]]
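    # Shape check for the presets above, assuming 84x84 inputs and 'VALID' padding
    # (an assumption, since the convolution code itself is not shown here):
    # deepmind_nature: 84 -> (84-8)/4+1 = 20 -> (20-4)/2+1 = 9 -> (9-3)/1+1 = 7,
    #                  so the flattened conv output is 7 * 7 * 64 = 3136.
    # deepmind_nips:   84 -> 20 -> 9, flattened to 9 * 9 * 32 = 2592.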

    if not args.watch:
        train_stats = RecordStats(args, False)
        test_stats = RecordStats(args, True)
        training_emulator = AtariEmulator(args)
        testing_emulator = AtariEmulator(args)
        num_actions = len(training_emulator.get_possible_actions())
        experience_memory = ExperienceMemory(args, num_actions)

        q_network = None
        agent = None
        if args.parallel:
            q_network = ParallelQNetwork(args, num_actions)
            agent = ParallelDQNAgent(args, q_network, training_emulator,
                                     experience_memory, num_actions,
                                     train_stats)
        else:
            q_network = QNetwork(args, num_actions)
            agent = DQNAgent(args, q_network, training_emulator,
                             experience_memory, num_actions, train_stats)

        experiment.run_experiment(args, agent, testing_emulator, test_stats)

    else:
        testing_emulator = AtariEmulator(args)
        num_actions = len(testing_emulator.get_possible_actions())
        q_network = QNetwork(args, num_actions)
        agent = DQNAgent(args, q_network, None, None, num_actions, None)
        experiment.evaluate_agent(args, agent, testing_emulator, None)
Example #7
class QNetwork:
    """
    A QNetwork wraps a neural network and uses deep Q-learning to approximate the
    action-value (Q) function over state-action pairs.
    A QNetwork object is not the neural network itself, but rather a tool that makes
    a given network work with deep Q-learning.
    """

    def __init__(self, neural_net: nn.Module, state_dim: int,
                 batch_size: int, lr=0.01, epsilon_prob=0.05, discount=0.9,
                 device=None):
        """
        :param neural_net: A Neural Network created with PyTorch. Needs to be a subclass of
                           torch.nn.Module and implement the methods __init__ and forward(self, batch).
        :param state_dim: Number of dimensions needed to define a state. Needs to equal the input dimension
                          of the given neural net.
        :param batch_size: Number of experiences on which the network trains during each update.
                           NOTE that the network has to explore at least batch_size experiences
                           before training a first time.
        :param lr:   Learning rate.
        :param epsilon_prob: Probability that the network chooses a random action rather than the
                             best one according to the QValues. Only relevant if decide() is used.
        :param discount: Discount factor (usually called gamma), representing the importance of early
                         decisions comparatively to later ones.
        :param device:   Device that will be used to compute the calculations. Defaults to the CPU,
                         since the GPU is not necessarily faster for exploration.
        """
        self.net = neural_net
        self.net.zero_grad()
        self.state_dim = state_dim
        self.forward = self.net.forward
        self.batch_size = batch_size
        self.epsilon = epsilon_prob
        self.discount = discount

        # Random decision mode: If True, the agent will decide actions randomly
        self.random_mode = False

        # If the user did not specify a computation device, the cpu is used by default
        # This is because GPU isn't necessarily faster for explorations
        if device is None:
            dev_name = "cpu"
            device = torch.device(dev_name)

        # Set the neural network and memory to the device
        self.net.to(device)
        self.mem = ExperienceMemory(device)

        self.device = device

        # Training memory
        self.loss_mem = []

        # Update tools
        self.optimizer = optim.SGD(self.net.parameters(), lr)

    def memorize(self, states: torch.tensor, actions: torch.IntTensor, next_states: torch.tensor,
                 rewards: torch.tensor):
        """
        Memorizes a sequence of experiences which can be trained on later.
        An experience is a (s, a, ns, r) tuple where:
        -s is the starting state;
        -a is the decided action;
        -ns is the state resulting from taking action a in state s;
        -r is the reward received from the environment.
        :param states: A 2D (batch_size, state_dim) shaped tensor containing the experiences' states.
        :param actions: A 2D (batch_size, 1) integer tensor containing the experiences' decided actions.
        :param next_states: A 2D (batch_size, state_dim + 1) tensor containing the experiences' next_states.
                        The last value of the second dimension must be 1 if the state is final or 0 otherwise.
        :param rewards: A 2D (batch_size, 1) tensor containing the experiences' rewards.
        """
        self.mem.memorize(states, actions, next_states, rewards)

    def memorize_exploration(self, states: torch.tensor,
                             actions: torch.IntTensor,
                             rewards: torch.tensor,
                             last_state_is_final=True):
        """
        Memorizes a whole exploration process with a final single reward.
        Should be used for processes for which the reward isn't specifically known for
        every state-action couple, but rather according to a final score.
        :param states: Successive states encountered. Should be a tensor of shape
                      (number_of_states, state_dim)
        :param actions: Successive actions decided by the agent. Should be a tensor of shape
                       (number_of_states - 1, )
        :param rewards: (number_of_states - 1, )-sized 1D tensor indicating the rewards for the episode.
        :param last_state_is_final: Indicates whether the last state in the exploration was final.
        """
        states = states.to(self.device)

        # Creates a tensor containing [0, 0, ..., 0, 1] to indicate that only the last state was final
        final_indicator = torch.zeros(states.size()[0] - 1, device=self.device)
        final_indicator[-1] = last_state_is_final
        # States reached after each step, with the final-state indicator appended as the last column
        next_states = torch.cat((states[1:], final_indicator.view(-1, 1)), dim=1)

        actions = actions.to(self.device)
        rewards = rewards.to(self.device)
        self.mem.memorize(states[:-1], actions, next_states, rewards)

    def set_last_rewards(self, nb_experiences: int, reward: torch.double):
        """
        Sets the rewards for the last memorized experiences to a given value.
        This should be used for example when the reward is not known for every specific
        (state, action) couple, but can be deduced from the final state reached: Use this function
        to set the rewards for the episode to the final reward.
        :param nb_experiences: number of experiences whose rewards should be affected
        :param reward: scalar indicating to which value the last rewards should be set
        """
        self.mem.set_last_rewards(nb_experiences, reward)

    def decide(self, states: torch.tensor):
        """
        Decides which action is best for a given batch of states.
        :param states: (batch_size, state_dim) set of states.
        :return: A (batch_size,) int tensor where element i is the index of the decided action.
        """
        # Make sure the states tensor runs on the right device
        states = states.to(self.device)

        output = self.forward(states)
        random_actions = torch.randint(0, output.size()[1], (states.size()[0],), device=self.device)

        # If the network is on random mode, return random actions
        if self.random_mode:
            return random_actions
        else:
            dice = torch.rand(states.size()[0], device=self.device)
            actions = torch.argmax(output, dim=1).type(torch.int64)
            return actions * (dice >= self.epsilon) + random_actions * (dice < self.epsilon)

    def decide_best(self, states: torch.tensor):
        """
        Decides which action is best for a given batch of states, without taking the epsilon strategy into account.
        :param states: (Batch_size, state_dim) set of states.
        :return: A (batch_size,) int tensor where element i is the index of the preferred action according to the
            network.
        """
        # Make sure the states tensor runs on the right device
        states = states.to(self.device)

        output = self.forward(states)
        return torch.argmax(output, dim=1).type(torch.int64)

    def clear_memory(self):
        """
        Clears the agent's Experience Memory.
        """
        self.mem.clear()

    def set_random_mode(self, value: bool):
        """
        Sets the network to random mode: if True, the network will decide actions
        randomly (as if the epsilon probability were 1).
        """
        self.random_mode = value

    def train_on_batch(self, states, actions, next_states, rewards):
        """
        Trains the network on a batch of experiences
        :param states: (batch_size, state_dim) tensor indicating the states.
        :param actions: (batch_size, 1) int tensor indicating actions taken
        :param next_states: (batch_size, state_dim + 1) tensor indicating next states.
                The last value of the second dimension should be either 1 if the state
                is a final state or 0 otherwise.
        :param rewards: (batch_size, 1) float tensor indicating the rewards.
        """

        """
        The Target value to compute the loss is taken as
        y = reward + discount * max {Q[next_state, a'] for all action a'}
        Since we do not have that maximum value, we use the network's estimation.
        """

        # Tensor containing information about whether the states are final
        final_indicator = next_states[:, -1]

        # Now remove that information from the next states tensor
        next_states = next_states[:, :-1]

        # Divide final and non final states
        non_final_states = next_states[final_indicator == 0, :]

        output = self.forward(states).gather(1, actions.view(states.size()[0], 1)).view((-1,))

        # Build the target: Y[k] = r + gamma * max_a' Q(s', a'), where the maximum
        # is estimated by the network itself.
        # If the next state is final, don't take into account the reward obtainable from it
        target = torch.zeros(rewards.size(), device=self.device)
        target[final_indicator == 1] = rewards[final_indicator == 1]

        # If the next state isn't final, estimate the max reward obtainable from it
        # using the network itself.
        if non_final_states.size()[0] > 0:
            max_next_qval = self.forward(non_final_states).max(1)[0]
            target[final_indicator == 0] = rewards[final_indicator == 0] + self.discount * max_next_qval
        target = target.detach()

        # Compute the loss
        loss = func.mse_loss(output, target)
        self.loss_mem.append(loss.item())  # store a plain float, not the graph-attached tensor

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update(self):
        """
        Updates the QNetwork's parameters using its experience memory.
        """
        # Get a random batch from the experience memory
        states, actions, next_states, rewards = self.mem.random_batch(self.batch_size)
        self.train_on_batch(states, actions, next_states, rewards)

    def train_on_memory(self, batch_size, epochs):
        """
        Trains the agent on experiences from its experience replay memory.
        :param batch_size: Batch size for training
        :param epochs: Number of times the mem should be fully browsed
        """
        print("Training on ", epochs, " epochs from the replay memory..")

        # Get all data from the replay memory
        states, actions, next_states, rewards = self.mem.all()

        # Shuffling the batches
        lines_shuffle = torch.randperm(states.size()[0])
        states = states[lines_shuffle]
        actions = actions[lines_shuffle]
        rewards = rewards[lines_shuffle]
        next_states = next_states[lines_shuffle]

        # Split them into batches
        states_batches = torch.split(states, batch_size)
        actions_batches = torch.split(actions, batch_size)
        next_states_batches = torch.split(next_states, batch_size)
        rewards_batches = torch.split(rewards, batch_size)

        # Number of batches
        nb_batches = len(states_batches)

        # Train
        for ep in range(epochs):
            batches_completed = 0
            for states, actions, next_states, rewards \
                    in zip(states_batches, actions_batches, next_states_batches, rewards_batches):
                self.train_on_batch(states, actions, next_states, rewards)
                batches_completed += 1
                printProgressBar(batches_completed, nb_batches,
                                 "Epoch " + str(ep + 1) + "/" + str(epochs), length=90)

    def show_training(self):
        """
        Plots the training metrics.
        """
        plt.plot([self.batch_size * (i + 1) for i in range(len(self.loss_mem))], self.loss_mem)
        plt.xlabel("Batches")
        plt.ylabel("MSE Loss")

    def plot_trajectory(self, initial_states: torch.tensor, next_state_function,
                        steps=100):
        """
        ONLY AVAILABLE IF STATE DIM IS 1 OR 2.
        Plots the trajectory of the agent starting from the given initial states
        on a 2D (if self.state_dim == 1) or 3D (if self.state_dim == 2) graph.
        :param initial_states: (N, state_dim) torch tensor indicating the starting states
        :param next_state_function: Function used to determine the next state.
            Should have signature (state: torch.tensor, action: int)
        :param steps: Number of successive states that should be plotted.
        """
        # Make sure the initial state runs on the right device
        initial_states = initial_states.to(self.device)

        if self.state_dim != 1 and self.state_dim != 2:
            raise ValueError("State dimension too large to plot agent trajectory.\n")
        for initial_state in initial_states:
            states = torch.empty((steps, self.state_dim))
            states[0] = initial_state

            # Exploration
            for step in range(steps - 1):
                action = self.decide_best(states[step].view(1, -1)).item()
                states[step + 1] = next_state_function(states[step], action)

            # Plotting
            if self.state_dim == 1:
                plt.plot(torch.arange(0, steps), states)
                plt.plot([0], [initial_state[0]], "go")
                plt.plot([steps - 1], [states[-1].item()], "ro")
            elif self.state_dim == 2:
                plt.plot(states[:, 0], states[:, 1])
                plt.plot([initial_state[0]], [initial_state[1]], "go")
                plt.plot([states[-1, 0]], [states[-1, 1]], "ro")

    def set_learning_rate(self, new_lr: float):
        """
        Sets a value for the network's learning rate.
        :param new_lr: New value for the learning rate
        """
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = new_lr

    def set_device(self, device: torch.device):
        """
        Sets a new device for training computations.
        :param device:  Torch device object.
        """
        self.device = device
        self.mem.to(device)
        self.net.to(device)
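
A minimal usage sketch for the QNetwork wrapper above. The tiny network, the trajectory and the reward scheme are hypothetical; only the QNetwork calls come from the class itself, and ExperienceMemory (available in the original module) is assumed to hand back a full batch once batch_size experiences have been memorized:

import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(2, 32), nn.ReLU(), nn.Linear(32, 3))   # hypothetical net: state_dim=2, 3 actions
agent = QNetwork(net, state_dim=2, batch_size=64, lr=0.01, epsilon_prob=0.05, discount=0.9)

states = torch.rand(65, 2)              # hypothetical trajectory of 65 successive states
actions = agent.decide(states[:-1])     # epsilon-greedy action index per state, shape (64,)
rewards = torch.zeros(64)
rewards[-1] = 1.0                       # only the final transition is rewarded
agent.memorize_exploration(states, actions, rewards, last_state_is_final=True)

agent.update()                          # one gradient step on a random batch from memory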
Example #8
def train(BATCH_SIZE, ENC_WEIGHTS, DEC_WEIGHTS, DIS_WEIGHTS):
    print("Loading data definitions...")
    frames_source = hkl.load(os.path.join(DATA_DIR, 'sources_train_128.hkl'))

    # Build video progressions
    videos_list = []
    start_frame_index = 1
    end_frame_index = VIDEO_LENGTH + 1
    while (end_frame_index <= len(frames_source)):
        frame_list = frames_source[start_frame_index:end_frame_index]
        if (len(set(frame_list)) == 1):
            videos_list.append(range(start_frame_index, end_frame_index))
            start_frame_index = start_frame_index + 1
            end_frame_index = end_frame_index + 1
        else:
            start_frame_index = end_frame_index - 1
            end_frame_index = start_frame_index + VIDEO_LENGTH

    videos_list = np.asarray(videos_list, dtype=np.int32)
    n_videos = videos_list.shape[0]

    if SHUFFLE:
        # Shuffle images to aid generalization
        videos_list = np.random.permutation(videos_list)

    # Build the Spatio-temporal Autoencoder
    print("Creating models...")
    encoder = encoder_model()
    decoder = decoder_model()

    intermediate_decoder = Model(inputs=decoder.layers[0].input,
                                 outputs=decoder.layers[10].output)
    mask_gen = Sequential()
    mask_gen.add(encoder)
    mask_gen.add(intermediate_decoder)
    mask_gen.compile(loss='mean_squared_error', optimizer=OPTIM_G)

    autoencoder = autoencoder_model(encoder, decoder)

    if ADVERSARIAL:
        discriminator = discriminator_model()
        aae = aae_model(autoencoder, discriminator)
        aae.compile(loss='binary_crossentropy', optimizer=OPTIM_G)
        set_trainability(discriminator, True)
        discriminator.compile(loss='binary_crossentropy', optimizer=OPTIM_D)
        run_utilities(encoder, decoder, autoencoder, discriminator,
                      ENC_WEIGHTS, DEC_WEIGHTS, DIS_WEIGHTS)
    else:
        run_utilities(encoder, decoder, autoencoder, 'None', ENC_WEIGHTS,
                      DEC_WEIGHTS, 'None')

    autoencoder.compile(loss=mse_kld_loss, optimizer=OPTIM_A)

    NB_ITERATIONS = int(n_videos / BATCH_SIZE)

    # Setup TensorBoard Callback
    TC = tb_callback.TensorBoard(log_dir=TF_LOG_DIR,
                                 histogram_freq=0,
                                 write_graph=False,
                                 write_images=False)
    LRS = lrs_callback.LearningRateScheduler(schedule=schedule)
    LRS.set_model(autoencoder)

    print("Beginning Training...")
    # Begin Training
    for epoch in range(NB_EPOCHS_AUTOENCODER):
        print("\n\nEpoch ", epoch)
        loss = []

        # Set learning rate every epoch
        LRS.on_epoch_begin(epoch=epoch)
        lr = K.get_value(autoencoder.optimizer.lr)
        print("Learning rate: " + str(lr))

        for index in range(NB_ITERATIONS):
            # Train Autoencoder
            X = load_X(videos_list, index, DATA_DIR)
            X_train = X[:, 0:int(VIDEO_LENGTH / 2)]
            y_train = X[:, int(VIDEO_LENGTH / 2):]
            loss.append(autoencoder.train_on_batch(X_train, y_train))

            arrow = int(index / (NB_ITERATIONS / 40))
            stdout.write("\rIteration: " + str(index) + "/" +
                         str(NB_ITERATIONS - 1) + "  " + "loss: " +
                         str(loss[len(loss) - 1]) + "\t    [" +
                         "{0}>".format("=" * (arrow)))
            stdout.flush()

        if SAVE_GENERATED_IMAGES:
            # Save generated images to file
            predicted_images = autoencoder.predict(X_train, verbose=0)
            orig_image, truth_image, pred_image = combine_images(
                X_train, y_train, predicted_images)
            pred_image = pred_image * 127.5 + 127.5
            orig_image = orig_image * 127.5 + 127.5
            truth_image = truth_image * 127.5 + 127.5
            if epoch == 0:
                cv2.imwrite(
                    os.path.join(GEN_IMAGES_DIR,
                                 str(epoch) + "_" + str(index) + "_orig.png"),
                    orig_image)
                cv2.imwrite(
                    os.path.join(GEN_IMAGES_DIR,
                                 str(epoch) + "_" + str(index) + "_truth.png"),
                    truth_image)
            cv2.imwrite(
                os.path.join(GEN_IMAGES_DIR,
                             str(epoch) + "_" + str(index) + "_pred.png"),
                pred_image)

        # then after each epoch/iteration
        avg_loss = sum(loss) / len(loss)
        logs = {'loss': avg_loss}
        TC.on_epoch_end(epoch, logs)

        # Log the losses
        with open(os.path.join(LOG_DIR, 'losses.json'), 'a') as log_file:
            log_file.write("{\"epoch\":%d, \"loss\":%f};\n" %
                           (epoch, avg_loss))

        print("\nAvg loss: " + str(avg_loss))

        # Save predicted mask per epoch
        predicted_attn_1 = mask_gen.predict(X_train, verbose=0)
        a_pred_1 = np.reshape(predicted_attn_1, newshape=(10, 10, 16, 16, 1))
        np.save(
            os.path.join(TEST_RESULTS_DIR,
                         'attention_weights_gen1_' + str(epoch) + '.npy'),
            a_pred_1)

        # Save model weights per epoch to file
        encoder.save_weights(
            os.path.join(CHECKPOINT_DIR,
                         'encoder_epoch_' + str(epoch) + '.h5'), True)
        decoder.save_weights(
            os.path.join(CHECKPOINT_DIR,
                         'decoder_epoch_' + str(epoch) + '.h5'), True)

    # Train AAE
    if ADVERSARIAL:
        exp_memory = ExperienceMemory(memory_length=100)
        for epoch in range(NB_EPOCHS_AAE):
            print("\n\nEpoch ", epoch)
            g_loss = []
            d_loss = []
            # a_loss = []

            # # Set learning rate every epoch
            # LRS.on_epoch_begin(epoch=epoch)
            lr = K.get_value(autoencoder.optimizer.lr)
            print("Learning rate: " + str(lr))

            for index in range(NB_ITERATIONS):
                # Train Autoencoder
                X = load_X(videos_list, index, DATA_DIR)
                X_train = X[:, 0:int(VIDEO_LENGTH / 2)]
                y_train = X[:, int(VIDEO_LENGTH / 2):]

                future_images = autoencoder.predict(X_train, verbose=0)
                trainable_fakes = exp_memory.get_trainable_fakes(
                    current_gens=future_images, exp_window_size=5)

                # Train Discriminator on future images (y_train, not X_train)
                X = np.concatenate((y_train, trainable_fakes))
                y = np.concatenate(
                    (np.ones(shape=(BATCH_SIZE, 10, 1), dtype=np.int),
                     np.zeros(shape=(BATCH_SIZE, 10, 1), dtype=np.int)),
                    axis=0)
                d_loss.append(discriminator.train_on_batch(X, y))

                # Train AAE
                set_trainability(discriminator, False)
                y = np.ones(shape=(BATCH_SIZE, 10, 1), dtype=np.int)
                g_loss.append(aae.train_on_batch(X_train, y))
                set_trainability(discriminator, True)

                # # Train Autoencoder
                # a_loss.append(autoencoder.train_on_batch(X_train, y_train))

                arrow = int(index / (NB_ITERATIONS / 30))
                stdout.write("\rIteration: " + str(index) + "/" +
                             str(NB_ITERATIONS - 1) + "  " + "g_loss: " +
                             str(g_loss[len(g_loss) - 1]) + "  " + "d_loss: " +
                             str(d_loss[len(d_loss) - 1]) + "\t    [" +
                             "{0}>".format("=" * (arrow)))
                stdout.flush()

            if SAVE_GENERATED_IMAGES:
                # Save generated images to file
                predicted_images = autoencoder.predict(X_train, verbose=0)
                orig_image, truth_image, pred_image = combine_images(
                    X_train, y_train, predicted_images)
                pred_image = pred_image * 127.5 + 127.5
                orig_image = orig_image * 127.5 + 127.5
                truth_image = truth_image * 127.5 + 127.5
                if epoch == 0:
                    cv2.imwrite(
                        os.path.join(
                            GEN_IMAGES_DIR,
                            str(epoch) + "_" + str(index) + "_aae_orig.png"),
                        orig_image)
                    cv2.imwrite(
                        os.path.join(
                            GEN_IMAGES_DIR,
                            str(epoch) + "_" + str(index) + "_aae_truth.png"),
                        truth_image)
                cv2.imwrite(
                    os.path.join(
                        GEN_IMAGES_DIR,
                        str(epoch) + "_" + str(index) + "_aae_pred.png"),
                    pred_image)

                predicted_attn_1 = mask_gen.predict(X_train, verbose=0)
                a_pred_1 = np.reshape(predicted_attn_1,
                                      newshape=(10, 10, 16, 16, 1))
                np.save(
                    os.path.join(
                        TEST_RESULTS_DIR,
                        'attention_weights_gen1_' + str(epoch) + '.npy'),
                    a_pred_1)

            # then after each epoch/iteration
            # avg_a_loss = sum(a_loss) / len(a_loss)
            avg_g_loss = sum(g_loss) / len(g_loss)
            avg_d_loss = sum(d_loss) / len(d_loss)
            logs = {'g_loss': avg_g_loss, 'd_loss': avg_d_loss}
            TC.on_epoch_end(epoch, logs)

            # Log the losses
            with open(os.path.join(LOG_DIR, 'losses_aae.json'),
                      'a') as log_file:
                log_file.write(
                    "{\"epoch\":%d, \"g_loss\":%f, \"d_loss\":%f};\n" %
                    (epoch, avg_g_loss, avg_d_loss))

            print("\nAvg g_loss: " + str(avg_g_loss) + "  Avg d_loss: " +
                  str(avg_d_loss))

            # Save model weights per epoch to file
            encoder.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'encoder_aae_epoch_' + str(epoch) + '.h5'), True)
            decoder.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'decoder_aae_epoch_' + str(epoch) + '.h5'), True)
            discriminator.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'discriminator_aae_epoch_' + str(epoch) + '.h5'),
                True)

    # End TensorBoard Callback
    TC.on_train_end('_')
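
ExperienceMemory.get_trainable_fakes is not shown above. A minimal sketch of the idea it appears to implement, namely mixing a few samples from previously generated batches into the current fakes so the discriminator also sees older generator outputs; the class below is an illustration under that assumption, not the original implementation:

from collections import deque

import numpy as np

class GeneratedFrameMemorySketch:
    def __init__(self, memory_length=100):
        # Rolling window of past generator output batches.
        self.buffer = deque(maxlen=memory_length)

    def get_trainable_fakes(self, current_gens, exp_window_size=5):
        fakes = current_gens.copy()
        if self.buffer:
            # Replace up to exp_window_size samples of the current batch
            # with samples from a randomly chosen past batch.
            past = self.buffer[np.random.randint(len(self.buffer))]
            k = min(exp_window_size, len(fakes), len(past))
            idx = np.random.choice(len(fakes), size=k, replace=False)
            fakes[idx] = past[np.random.choice(len(past), size=k, replace=False)]
        self.buffer.append(current_gens.copy())
        return fakes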
Example #9
def main():
    STEP_LOG_RATE = 1000
    TENSORBOARD_ROOT_PATH = "tensorboard"
    CHECKPOINT_ROOT_PATH = "checkpoints_test"
    CHECKPOINTS_STEPS = 100000
    EXPERIENCE_MEMORY_CAPACITY = 6400000
    MINIBATCH_SIZE = 32
    GAMMA = 0.99
    EPSILON_START = 1.0
    EPSILON_END = 0.1
    EPSILON_DECAY_STEPS = 1000000
    LEARNING_RATE = 0.0001
    FIELD_WIDTH = 5
    FIELD_HEIGHT = 5
    USE_TARGET_NETWORK = True
    TARGET_NETWORK_UPDATE_STEPS = 10000
    STATE_AS_COORDINATES = True
    STATE_NORMALISATION = True

    descriptiveString = buildDescriptiveString(EXPERIENCE_MEMORY_CAPACITY, \
        MINIBATCH_SIZE, GAMMA, EPSILON_START, EPSILON_END, EPSILON_DECAY_STEPS, \
        LEARNING_RATE, STATE_AS_COORDINATES, STATE_NORMALISATION, \
        FIELD_WIDTH, FIELD_HEIGHT, USE_TARGET_NETWORK, TARGET_NETWORK_UPDATE_STEPS)

    tensorboardDirectory = os.path.join(TENSORBOARD_ROOT_PATH,
                                        descriptiveString)
    checkpointDirectory = os.path.join(CHECKPOINT_ROOT_PATH, descriptiveString)

    # create catch environment
    catch = Catch(FIELD_WIDTH, FIELD_HEIGHT, STATE_AS_COORDINATES,
                  STATE_NORMALISATION)
    numberOfActions = catch.getNumberOfActions()
    stateSize = catch.getStateSize()
    # create experience memory
    experienceMemory = ExperienceMemory(EXPERIENCE_MEMORY_CAPACITY, stateSize)

    ########################################################################################################################################################
    input, output, outputLabel, onlineSummary = createModel(stateSize, \
        numberOfActions, isTargetNetwork=False)

    if USE_TARGET_NETWORK:
        targetInput, targetOutput, _, targetSummary = createModel(stateSize, \
            numberOfActions, isTargetNetwork=True)

    with tf.name_scope("train"):
        optimizer = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE)
        loss = tf.losses.huber_loss(labels=outputLabel, predictions=output)
        train = optimizer.minimize(loss)

        tf.summary.scalar("loss", loss)

    episodicStepsSummary = tf.Summary()
    episodicRewardSummary = tf.Summary()
    explorationSummary = tf.Summary()
    experienceMemorySizeSummary = tf.Summary()

    episodicStepsSummary.value.add(tag="episodic_steps", simple_value=None)
    episodicRewardSummary.value.add(tag="episodic_reward", simple_value=None)
    explorationSummary.value.add(tag="exploration", simple_value=None)
    experienceMemorySizeSummary.value.add(tag="experience_memory_size",
                                          simple_value=None)

    trainSummary = tf.summary.merge_all(scope="train")

    sess = tf.Session()
    init = tf.global_variables_initializer()

    sess.run(init)

    writer = tf.summary.FileWriter(tensorboardDirectory, sess.graph)

    if USE_TARGET_NETWORK:
        # Initialise the target network from the online network before training.
        updateTargetNetwork(sess, writer, targetSummary, 0)
    ########################################################################################################################################################

    step = 0
    episode = 0
    epsilon = EPSILON_START

    while step < EPSILON_DECAY_STEPS:
        episode += 1

        catch.reset()
        state = catch.getState()
        done = False
        episodeReward = 0
        episodeSteps = 0

        while not done and step < EPSILON_DECAY_STEPS:
            step += 1
            # select next action
            if np.random.random() <= epsilon:
                actionNumber = np.random.randint(numberOfActions)
            else:
                prediction = sess.run(
                    output,
                    feed_dict={input: np.reshape(state, (-1, stateSize))})
                actionNumber = np.argmax(prediction[0])
            # convert action number to action
            action = list(Actions)[actionNumber]
            # execute selected action
            reward, nextState, done = catch.move(action)
            # store experience to memory
            experienceMemory.store(state, actionNumber, reward, nextState,
                                   done)
            # replace current state by next state
            state = nextState
            # replay experiences
            if experienceMemory.size() > MINIBATCH_SIZE:
                # sample from experience memory
                ids, states, actions, rewards, nextStates, nextStateTerminals = experienceMemory.sample(
                    MINIBATCH_SIZE)

                if USE_TARGET_NETWORK:
                    statePredictions = sess.run(output,
                                                feed_dict={input: states})
                    nextStatePredictions = sess.run(
                        targetOutput, feed_dict={targetInput: nextStates})
                else:
                    predictions = sess.run(output,
                                           feed_dict={
                                               input:
                                               np.concatenate(
                                                   (states, nextStates))
                                           })
                    statePredictions = predictions[:MINIBATCH_SIZE]
                    nextStatePredictions = predictions[MINIBATCH_SIZE:]

                statePredictions[np.arange(MINIBATCH_SIZE), actions] = \
                                rewards + np.invert(nextStateTerminals) * GAMMA * \
                                nextStatePredictions.max(axis=1)

                # update online network
                _, onlineSummaryResult, trainSummaryResult = sess.run(
                    [train, onlineSummary, trainSummary],
                    feed_dict={
                        input: states,
                        outputLabel: statePredictions
                    })
                # write summary
                if step % STEP_LOG_RATE == 0:
                    writer.add_summary(onlineSummaryResult, step)
                    writer.add_summary(trainSummaryResult, step)

            episodeReward += reward
            episodeSteps += 1
            # update target network
            if USE_TARGET_NETWORK and step % TARGET_NETWORK_UPDATE_STEPS == 0:
                updateTargetNetwork(sess, writer, targetSummary, step)
            # write exploration summary
            if step % STEP_LOG_RATE == 0:
                explorationSummary.value[0].simple_value = epsilon
                experienceMemorySizeSummary.value[
                    0].simple_value = experienceMemory.size()
                writer.add_summary(explorationSummary, step)
                writer.add_summary(experienceMemorySizeSummary, step)
            # save checkpoint
            if step % CHECKPOINTS_STEPS == 0:
                saveModel(checkpointDirectory, step, sess)
            # calculate epsilon for next step: linear annealing from
            # EPSILON_START down to EPSILON_END over EPSILON_DECAY_STEPS steps
            epsilon = EPSILON_START - (EPSILON_START - EPSILON_END) * \
                step / EPSILON_DECAY_STEPS

        # write episodic summary
        episodicStepsSummary.value[0].simple_value = episodeSteps
        episodicRewardSummary.value[0].simple_value = episodeReward
        writer.add_summary(episodicStepsSummary, step)
        writer.add_summary(episodicRewardSummary, step)
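
The main() above only exercises the ExperienceMemory interface: store(state, action, reward, nextState, done), size(), and sample(batchSize) returning ids, states, actions, rewards, next states and terminal flags. The class itself is not shown, so the NumPy ring-buffer sketch below is only an assumption that matches those calls, not the original implementation:

import numpy as np

class SimpleExperienceMemory:
    # Assumed fixed-capacity ring buffer consistent with the calls in main().

    def __init__(self, capacity, state_size):
        self.capacity = capacity
        self.states = np.zeros((capacity, state_size), dtype=np.float32)
        self.actions = np.zeros(capacity, dtype=np.int64)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.next_states = np.zeros((capacity, state_size), dtype=np.float32)
        # Boolean dtype so np.invert acts as logical-not in the target update.
        self.terminals = np.zeros(capacity, dtype=np.bool_)
        self.write_index = 0
        self.count = 0

    def store(self, state, action, reward, next_state, done):
        i = self.write_index
        self.states[i] = state
        self.actions[i] = action
        self.rewards[i] = reward
        self.next_states[i] = next_state
        self.terminals[i] = done
        self.write_index = (i + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def size(self):
        return self.count

    def sample(self, batch_size):
        # Uniformly sample transition indices from the filled part of the buffer.
        ids = np.random.randint(0, self.count, size=batch_size)
        return (ids, self.states[ids], self.actions[ids], self.rewards[ids],
                self.next_states[ids], self.terminals[ids])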
示例#10
0
def train(BATCH_SIZE, ENC_WEIGHTS, DEC_WEIGHTS, GEN_WEIGHTS, DIS_WEIGHTS):
    print("Loading data definitions...")
    frames_source = hkl.load(os.path.join(DATA_DIR, 'sources_train_128.hkl'))

    # Build video progressions
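    # Slide a VIDEO_LENGTH-frame window over the frame source list; a window
    # is kept only if every frame in it comes from the same source clip
    # (len(set(...)) == 1). Kept windows advance by one frame to produce
    # overlapping samples; otherwise the window is restarted at its last
    # frame, skipping over the clip boundary.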
    videos_list = []
    start_frame_index = 1
    end_frame_index = VIDEO_LENGTH + 1
    while (end_frame_index <= len(frames_source)):
        frame_list = frames_source[start_frame_index:end_frame_index]
        if (len(set(frame_list)) == 1):
            videos_list.append(range(start_frame_index, end_frame_index))
            start_frame_index = start_frame_index + 1
            end_frame_index = end_frame_index + 1
        else:
            start_frame_index = end_frame_index - 1
            end_frame_index = start_frame_index + VIDEO_LENGTH

    videos_list = np.asarray(videos_list, dtype=np.int32)
    n_videos = videos_list.shape[0]

    # Setup validation
    val_frames_source = hkl.load(
        os.path.join(VAL_DATA_DIR, 'sources_val_128.hkl'))
    val_videos_list = []
    start_frame_index = 1
    end_frame_index = VIDEO_LENGTH + 1
    while (end_frame_index <= len(val_frames_source)):
        val_frame_list = val_frames_source[start_frame_index:end_frame_index]
        if (len(set(val_frame_list)) == 1):
            val_videos_list.append(range(start_frame_index, end_frame_index))
            start_frame_index = start_frame_index + VIDEO_LENGTH
            end_frame_index = end_frame_index + VIDEO_LENGTH
        else:
            start_frame_index = end_frame_index - 1
            end_frame_index = start_frame_index + VIDEO_LENGTH

    val_videos_list = np.asarray(val_videos_list, dtype=np.int32)
    n_val_videos = val_videos_list.shape[0]

    if SHUFFLE:
        # Shuffle images to aid generalization
        videos_list = np.random.permutation(videos_list)

    # Build the Spatio-temporal Autoencoder
    print("Creating models...")
    encoder = encoder_model()
    decoder = decoder_model()
    autoencoder = autoencoder_model(encoder, decoder)
    autoencoder.compile(loss="mean_squared_error", optimizer=OPTIM_A)

    intermediate_decoder = Model(inputs=decoder.layers[0].input,
                                 outputs=decoder.layers[1].output)
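    # mask_gen_1 chains the encoder with only the first decoder layer, so its
    # intermediate activations (saved below as attention weights) can be
    # predicted directly.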
    mask_gen_1 = Sequential()
    mask_gen_1.add(encoder)
    mask_gen_1.add(intermediate_decoder)
    mask_gen_1.compile(loss='mean_squared_error', optimizer=OPTIM_G)

    if ADVERSARIAL:
        generator = refiner_g_model()
        discriminator = refiner_d_model()
        gan = gan_model(autoencoder, generator, discriminator)
        generator.compile(loss='binary_crossentropy', optimizer='sgd')
        gan.compile(loss=['mae', 'binary_crossentropy'],
                    loss_weights=LOSS_WEIGHTS,
                    optimizer=OPTIM_G,
                    metrics=['accuracy'])
        print('GAN')
        print(gan.summary())
        set_trainability(discriminator, True)
        discriminator.compile(loss='binary_crossentropy',
                              optimizer=OPTIM_D,
                              metrics=['accuracy'])
        run_utilities(encoder, decoder, autoencoder, generator, discriminator,
                      gan, ENC_WEIGHTS, DEC_WEIGHTS, GEN_WEIGHTS, DIS_WEIGHTS)
    else:
        run_utilities(encoder, decoder, autoencoder, 'None', 'None', 'None',
                      ENC_WEIGHTS, DEC_WEIGHTS, 'None', 'None')

    NB_ITERATIONS = int(n_videos / BATCH_SIZE)
    # NB_ITERATIONS = 5
    NB_VAL_ITERATIONS = int(n_val_videos / BATCH_SIZE)

    # for i in range(len(decoder.layers)):
    #     print (decoder.layers[i], str(i))
    #
    # exit(0)

    # Setup TensorBoard Callback
    TC = tb_callback.TensorBoard(log_dir=TF_LOG_DIR,
                                 histogram_freq=0,
                                 write_graph=False,
                                 write_images=False)
    TC_gan = tb_callback.TensorBoard(log_dir=TF_LOG_GAN_DIR,
                                     histogram_freq=0,
                                     write_graph=False,
                                     write_images=False)
    LRS = lrs_callback.LearningRateScheduler(schedule=schedule)
    LRS.set_model(autoencoder)

    print("Beginning Training...")
    # Begin Training
    for epoch in range(NB_EPOCHS_AUTOENCODER):
        print("\n\nEpoch ", epoch)
        loss = []
        val_loss = []

        # Set learning rate every epoch
        LRS.on_epoch_begin(epoch=epoch)
        lr = K.get_value(autoencoder.optimizer.lr)
        print("Learning rate: " + str(lr))

        for index in range(NB_ITERATIONS):
            # Train Autoencoder
            X = load_X(videos_list, index, DATA_DIR, (128, 128, 3))
            X_train = X[:, 0:10]
            y_train = X[:, 10:]
            loss.append(autoencoder.train_on_batch(X_train, y_train))

            arrow = int(index / (NB_ITERATIONS / 40))
            stdout.write("\rIter: " + str(index) + "/" +
                         str(NB_ITERATIONS - 1) + "  " + "loss: " +
                         str(loss[len(loss) - 1]) + "\t    [" +
                         "{0}>".format("=" * (arrow)))
            stdout.flush()

        if SAVE_GENERATED_IMAGES:
            # Save generated images to file
            predicted_images = autoencoder.predict(X_train, verbose=0)
            orig_image, truth_image, pred_image = combine_images(
                X_train, y_train, predicted_images)
            pred_image = pred_image * 127.5 + 127.5
            orig_image = orig_image * 127.5 + 127.5
            truth_image = truth_image * 127.5 + 127.5
            if epoch == 0:
                cv2.imwrite(
                    os.path.join(GEN_IMAGES_DIR,
                                 str(epoch) + "_" + str(index) + "_orig.png"),
                    orig_image)
                cv2.imwrite(
                    os.path.join(GEN_IMAGES_DIR,
                                 str(epoch) + "_" + str(index) + "_truth.png"),
                    truth_image)
            cv2.imwrite(
                os.path.join(GEN_IMAGES_DIR,
                             str(epoch) + "_" + str(index) + "_pred.png"),
                pred_image)

        predicted_attn = mask_gen_1.predict(X_train, verbose=0)
        a_pred = np.reshape(predicted_attn,
                            newshape=(BATCH_SIZE, VIDEO_LENGTH - 10, 16, 16,
                                      1))
        np.save(
            os.path.join(ATTN_WEIGHTS_DIR,
                         'attention_weights_cla_gen1_' + str(epoch) + '.npy'),
            a_pred)

        # Run over validation data
        for index in range(NB_VAL_ITERATIONS):
            X = load_X(val_videos_list, index, VAL_DATA_DIR, (128, 128, 3))
            X_train = X[:, 0:10]
            y_train = X[:, 10:]
            val_loss.append(autoencoder.test_on_batch(X_train, y_train))

            arrow = int(index / (NB_VAL_ITERATIONS / 40))
            stdout.write("\rIter: " + str(index) + "/" +
                         str(NB_VAL_ITERATIONS - 1) + "  " + "val_loss: " +
                         str(val_loss[len(val_loss) - 1]) + "\t    [" +
                         "{0}>".format("=" * (arrow)))
            stdout.flush()

        # then after each epoch/iteration
        avg_loss = sum(loss) / len(loss)
        avg_val_loss = sum(val_loss) / len(val_loss)
        logs = {'loss': avg_loss, 'val_loss': avg_val_loss}
        TC.on_epoch_end(epoch, logs)

        # Log the losses
        with open(os.path.join(LOG_DIR, 'losses.json'), 'a') as log_file:
            log_file.write("{\"epoch\":%d, \"loss\":%f};\n" %
                           (epoch, avg_loss))

        print("\nAvg loss: " + str(avg_loss) + " Avg val loss: " +
              str(avg_val_loss))

        # Save model weights per epoch to file
        encoder.save_weights(
            os.path.join(CHECKPOINT_DIR,
                         'encoder_epoch_' + str(epoch) + '.h5'), True)
        decoder.save_weights(
            os.path.join(CHECKPOINT_DIR,
                         'decoder_epoch_' + str(epoch) + '.h5'), True)

        predicted_attn = mask_gen_1.predict(X_train, verbose=0)
        a_pred = np.reshape(predicted_attn,
                            newshape=(BATCH_SIZE, VIDEO_LENGTH - 10, 16, 16,
                                      1))
        np.save(
            os.path.join(ATTN_WEIGHTS_DIR,
                         'attention_weights_cla_gen1_' + str(epoch) + '.npy'),
            a_pred)

    # Train AAE
    if ADVERSARIAL:
        print("Training Stage II.")
        exp_memory = ExperienceMemory(memory_length=100)
        for epoch in range(NB_EPOCHS_GAN):
            print("\n\nEpoch ", epoch)
            g_loss = []
            val_g_loss = []
            d_loss = []
            val_d_loss = []
            # a_loss = []

            # # Set learning rate every epoch
            # LRS.on_epoch_begin(epoch=epoch)
            lr = K.get_value(gan.optimizer.lr)
            print("GAN learning rate: " + str(lr))
            lr = K.get_value(discriminator.optimizer.lr)
            print("Disc learning rate: " + str(lr))
            print("g_loss_metrics: " + str(gan.metrics_names))
            print("d_loss_metrics: " + str(discriminator.metrics_names))

            for index in range(NB_ITERATIONS):
                # Train Autoencoder
                X = load_X(videos_list, index, DATA_DIR, (128, 128, 3))
                X_hd = load_X(videos_list, index, HD_DATA_DIR, (256, 256, 3))
                X128 = X[:, 0:int(VIDEO_LENGTH / 2)]
                Y128 = autoencoder.predict(X128, verbose=0)
                X256_real = X_hd[:, int(VIDEO_LENGTH / 2):]
                X256_fake = generator.predict(Y128, verbose=0)

                trainable_fakes = exp_memory.get_trainable_fakes(
                    current_gens=X256_fake, exp_window_size=4)

                # Train Discriminator on future images (y_train, not X_train)
                X = np.concatenate((X256_real, trainable_fakes))
                y = np.concatenate(
                    (np.ones(shape=(BATCH_SIZE, 10, 1), dtype=np.float32),
                     np.zeros(shape=(BATCH_SIZE, 10, 1), dtype=np.float32)),
                    axis=0)
                d_loss.append(discriminator.train_on_batch(X, y))

                # Train AAE
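                # Freeze the discriminator so the combined GAN update only
                # adjusts the generator/autoencoder weights, then unfreeze it
                # for the next discriminator step.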
                set_trainability(discriminator, False)
                y = np.ones(shape=(BATCH_SIZE, 10, 1), dtype=np.float32)
                g_loss.append(gan.train_on_batch(X128, [X256_real, y]))
                set_trainability(discriminator, True)

                # # Train Autoencoder
                # a_loss.append(autoencoder.train_on_batch(X_train, y_train))

                arrow = int(index / (NB_ITERATIONS / 30))
                stdout.write("\rIter: " + str(index) + "/" +
                             str(NB_ITERATIONS - 1) + "  " + "g_loss: " +
                             str([g_loss[len(g_loss) - 1][j]
                                  for j in [0, -1]]) + "  " + "d_loss: " +
                             str(d_loss[len(d_loss) - 1]) + "\t    [" +
                             "{0}>".format("=" * (arrow)))
                stdout.flush()

            if SAVE_GENERATED_IMAGES:
                # Save generated images to file
                predicted_images = generator.predict(Y128, verbose=0)
                orig_image, truth_image, pred_image = combine_images(
                    Y128, X256_real, predicted_images)
                pred_image = pred_image * 127.5 + 127.5
                orig_image = orig_image * 127.5 + 127.5
                truth_image = truth_image * 127.5 + 127.5
                if epoch == 0:
                    cv2.imwrite(
                        os.path.join(
                            CLA_GEN_IMAGES_DIR,
                            str(epoch) + "_" + str(index) + "_gan_orig.png"),
                        orig_image)
                    cv2.imwrite(
                        os.path.join(
                            CLA_GEN_IMAGES_DIR,
                            str(epoch) + "_" + str(index) + "_gan_truth.png"),
                        truth_image)
                cv2.imwrite(
                    os.path.join(
                        CLA_GEN_IMAGES_DIR,
                        str(epoch) + "_" + str(index) + "_gan_pred.png"),
                    pred_image)

            # Run over validation data
            print('')
            for index in range(NB_VAL_ITERATIONS):
                X = load_X(val_videos_list, index, VAL_DATA_DIR, (128, 128, 3))
                X_hd = load_X(val_videos_list, index, VAL_HD_DATA_DIR,
                              (256, 256, 3))
                X128_val = X[:, 0:int(VIDEO_LENGTH / 2)]
                Y128_val = autoencoder.predict(X128_val, verbose=0)
                X256_real_val = X_hd[:, int(VIDEO_LENGTH / 2):]
                X256_fake_val = generator.predict(Y128_val, verbose=0)

                X = np.concatenate((X256_real_val, X256_fake_val))
                y = np.concatenate(
                    (np.ones(shape=(BATCH_SIZE, 10, 1), dtype=np.float32),
                     np.zeros(shape=(BATCH_SIZE, 10, 1), dtype=np.float32)),
                    axis=0)
                val_d_loss.append(discriminator.test_on_batch(X, y))

                y = np.ones(shape=(BATCH_SIZE, 10, 1), dtype=np.float32)
                val_g_loss.append(
                    gan.test_on_batch(X128_val, [X256_real_val, y]))

                arrow = int(index / (NB_VAL_ITERATIONS / 40))
                stdout.write(
                    "\rIter: " + str(index) + "/" +
                    str(NB_VAL_ITERATIONS - 1) + "  " + "val_g_loss: " +
                    str([val_g_loss[len(val_g_loss) - 1][j]
                         for j in [0, -1]]) + "  " + "val_d_loss: " +
                    str(val_d_loss[len(val_d_loss) - 1]))
                stdout.flush()

            # then after each epoch/iteration
            avg_d_loss = np.mean(np.asarray(d_loss, dtype=np.float32), axis=0)
            avg_val_d_loss = np.mean(np.asarray(val_d_loss, dtype=np.float32),
                                     axis=0)
            avg_g_loss = np.mean(np.asarray(g_loss, dtype=np.float32), axis=0)
            avg_val_g_loss = np.mean(np.asarray(val_g_loss, dtype=np.float32),
                                     axis=0)

            loss_values = np.asarray(avg_d_loss.tolist() + avg_val_d_loss.tolist() \
                                     + avg_g_loss.tolist() + avg_val_g_loss.tolist(), dtype=np.float32)
            d_loss_keys = [
                'd_' + metric for metric in discriminator.metrics_names
            ]
            g_loss_keys = ['g_' + metric for metric in gan.metrics_names]
            val_d_loss_keys = [
                'd_val_' + metric for metric in discriminator.metrics_names
            ]
            val_g_loss_keys = [
                'g_val_' + metric for metric in gan.metrics_names
            ]

            loss_keys = d_loss_keys + val_d_loss_keys + \
                        g_loss_keys + val_g_loss_keys
            logs = dict(zip(loss_keys, loss_values))

            TC_gan.on_epoch_end(epoch, logs)

            # Log the losses
            with open(os.path.join(LOG_DIR, 'losses_gan.json'),
                      'a') as log_file:
                log_file.write("{\"epoch\":%d, %s};\n" % (epoch, logs))

            print("\nAvg d_loss: " + str(avg_d_loss) + " Avg val_d_loss: " +
                  str(avg_val_d_loss) + "\nAvg g_loss: " +
                  str([avg_g_loss[j] for j in [0, -1]]) + " Avg val_g_loss: " +
                  str([avg_val_g_loss[j] for j in [0, -1]]))

            # Save model weights per epoch to file
            encoder.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'encoder_gan_epoch_' + str(epoch) + '.h5'), True)
            decoder.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'decoder_gan_epoch_' + str(epoch) + '.h5'), True)
            generator.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'generator_gan_epoch_' + str(epoch) + '.h5'),
                True)
            discriminator.save_weights(
                os.path.join(CHECKPOINT_DIR,
                             'discriminator_gan_epoch_' + str(epoch) + '.h5'),
                True)

    # End TensorBoard Callback
    TC.on_train_end('_')
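
Stage II above calls exp_memory.get_trainable_fakes(current_gens=..., exp_window_size=4) so the discriminator sees a mix of freshly generated and older generated frames, a common trick for stabilising adversarial training. The ExperienceMemory used there is not shown; the sketch below is one assumed way such a history buffer could behave (the class name, mixing ratio and eviction policy are all assumptions, not the original implementation):

import numpy as np
from collections import deque

class FakeHistoryBuffer:
    # Assumed history buffer: keeps recent generator outputs and mixes a few
    # of them back into each new batch of fakes.

    def __init__(self, memory_length=100):
        self.buffer = deque(maxlen=memory_length)

    def get_trainable_fakes(self, current_gens, exp_window_size=4):
        batch = np.copy(current_gens)
        if self.buffer:
            # Replace up to exp_window_size samples of the current batch with
            # previously generated ones, so the discriminator also sees
            # "stale" fakes.
            n_replace = min(exp_window_size, len(self.buffer), len(batch))
            old_ids = np.random.choice(len(self.buffer), n_replace,
                                       replace=False)
            new_ids = np.random.choice(len(batch), n_replace, replace=False)
            for old_i, new_i in zip(old_ids, new_ids):
                batch[new_i] = self.buffer[old_i]
        # Remember the freshly generated samples for future batches.
        self.buffer.extend(current_gens)
        return batch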
示例#11
0
class QAgent:
    """
    A QAgent represents a QLearning Agent, which approximates
    the optimal QValue of every state, action couple (s, a).
    """
    def __init__(self,
                 nb_states: int,
                 nb_actions: int,
                 epsilon_prob: float = 0.05,
                 gamma=0.99,
                 lr=0.1,
                 batch_replay_size=1024):
        """
        :param nb_states:   Number of states reachable in the environment.
        :param nb_actions:  Number of possible actions. If the number of actions
                            differs depending on the state, should be the maximum
                            amount of actions.
        :param epsilon_prob: Epsilon probability. Defaults to 5%.
        :param gamma: Discount factor.
        :param lr: Learning rate.
        :param batch_replay_size: Size of batches to train on during updates.
        """
        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Matrix containing Qvalues for every (s, a) couple
        self.Q = torch.zeros([nb_states, nb_actions], dtype=torch.float32)
        self.epsilon_prob = epsilon_prob

        # Discount Factor
        self.gamma = gamma

        # Learning rate
        self.lr = lr

        # Experience memory
        self.mem = ExperienceMemory()
        self.batch_replay_size = batch_replay_size

    def decide(self, state: int):
        """
        :param state: State index
        :return: The action a that is best according to the agent
                (The one that has the best QValue), or a random action
                with probability epsilon.
        """
        if random() < self.epsilon_prob:
            return randint(0, self.nb_actions - 1)
        # .item() so both branches return a plain Python int.
        return torch.argmax(self.Q[state]).item()

    def memorize(self, state: int, action: int, next_state: int,
                 reward: torch.float32):
        """
        Stores an experience into the experience memory.
        :param state:
        :param action:
        :param next_state:
        :param reward:
        """
        self.mem.memorize(torch.tensor([[state]]), torch.tensor([[action]]),
                          torch.tensor([[next_state]]), reward)

    def update(self):
        """
        Updates the agent's Q values using experience replay.
        """
        states, actions, nstates, rewards = self.mem.random_batch(
            self.batch_replay_size)
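        # Tabular Q-learning update for each sampled transition:
        # Q(s, a) <- (1 - lr) * Q(s, a) + lr * (r + gamma * max_a' Q(s', a'))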
        for s, a, ns, r in zip(states, actions, nstates, rewards):
            s = s.item()
            a = a.item()
            ns = ns.item()
            r = r.item()
            self.Q[s, a] = (1 - self.lr) * self.Q[s, a] \
                + self.lr * (r + self.gamma * torch.max(self.Q[ns]))
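
The QAgent above exposes decide, memorize and update; a minimal training-loop sketch is shown below. The env object (reset() returning an integer state, step(action) returning next_state, reward and done) is a hypothetical stand-in, not part of the original code, and the loop assumes the ExperienceMemory can serve a batch of batch_replay_size transitions once enough steps have been stored.

def train_q_agent(env, nb_states, nb_actions, nb_episodes=500):
    # Hypothetical driver loop for QAgent; env is assumed to expose
    # reset() -> state and step(action) -> (next_state, reward, done)
    # over integer state indices.
    agent = QAgent(nb_states, nb_actions, epsilon_prob=0.1,
                   batch_replay_size=256)
    for _ in range(nb_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.decide(state)
            next_state, reward, done = env.step(action)
            agent.memorize(state, action, next_state, reward)
            state = next_state
        # Replay a random batch of stored transitions at the end of each
        # episode (assumes the memory already holds enough of them).
        agent.update()
    return agent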