Example #1
    def reset(self):
        self.ep_return = 0.0
        self.newtile = False
        self.tile_visited_count = 0
        self.last_touch_with_track = 0
        self.last_new_tile = 0
        self.obst_contact = False
        self.obst_contact_count = 0
        self.obst_contact_list = []
        self.t = 0.0
        self.steps_in_episode = 0
        self.state = np.zeros(self.observation_space.shape)
        self.internal_frames = self.skip_frames * (self.frames_per_state -
                                                   1) + 1
        self.int_state = np.zeros([STATE_H, STATE_W, self.internal_frames])

        if self.track_use >= self.repeat_track * self.episodes_per_track:
            intento = 0
            while intento < 21:
                success = self._create_track()
                intento += 1
                if success:
                    self.track_use = 0
                    self.episode_start = range(
                        0, len(self.track),
                        int(len(self.track) / self.episodes_per_track))
                    #print(self.episode_start)
                    break
                if self.verbose > 0:
                    print(
                        intento,
                        " retry to generate new track (normal below 10, limit 20)"
                    )
        else:
            self._create_tiles(self.track, self.border)

        start_tile = self.episode_start[self.track_use %
                                        self.episodes_per_track]
        #print(start_tile, self.track_use, self.episodes_per_track)

        if self.car is not None:
            self.car.destroy()
        if self.episodes_per_track > 1:
            self.car = Car(self.world, *self.track[start_tile][1:4])
        else:
            self.car = Car(self.world, *self.track[0][1:4])

        # try to detect two reset() calls happening very close together
        if self.action_taken > 2:
            self.track_use += 1
            self.action_taken = 0
        #self.track_use += 1

        return self.step(None)[0]
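# The reset() above cycles through evenly spaced start tiles. A minimal,
# self-contained sketch of that bookkeeping (track length and episode count
# are assumed values for illustration):
track_len = 300
episodes_per_track = 4
episode_start = range(0, track_len, int(track_len / episodes_per_track))
print(list(episode_start))                                 # [0, 75, 150, 225]
for track_use in range(6):                                 # round-robin over resets
    print(episode_start[track_use % episodes_per_track])   # 0, 75, 150, 225, 0, 75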
Example #2
def simulate_batch(batch_num):
    env = CarRacing()

    obs_data = []
    action_data = []
    action = env.action_space.sample()
    for i_episode in range(_BATCH_SIZE):
        observation = env.reset()
        # Little hack to make the Car start at random positions in the race-track
        position = np.random.randint(len(env.track))
        env.car = Car(env.world, *env.track[position][1:4])
        observation = normalize_observation(observation)

        obs_sequence = []

        for _ in range(_TIME_STEPS):
            if _RENDER:
                env.render()

            action = generate_action(action)

            observation, reward, done, info = env.step(action)
            observation = normalize_observation(observation)

            obs_data.append(observation)

    print("Saving dataset for batch {}".format(batch_num))
    np.save('../data/obs_data_VAE_{}'.format(batch_num), obs_data)

    env.close()
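# The snippet above relies on helpers defined elsewhere in the project. A
# minimal sketch of what they plausibly do (illustrative assumptions, not the
# original implementations):
import numpy as np

def normalize_observation(observation):
    # Scale the 96x96x3 uint8 frame to float32 values in [0, 1].
    return observation.astype(np.float32) / 255.0

def generate_action(prev_action):
    # Random-walk exploration: keep the previous action most of the time and
    # occasionally resample it within CarRacing's Box action bounds.
    if np.random.rand() < 0.1:
        return np.array([np.random.uniform(-1.0, 1.0),   # steering
                         np.random.uniform(0.0, 1.0),     # gas
                         np.random.uniform(0.0, 0.2)])    # brake
    return prev_action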
def multiple_runs(on):
    env = CarRacing()

    states = []
    actions = []
    for run in range(MAX_RUNS):
        state = env.reset()
        # done = False
        counter = 0
        for game_time in range(MAX_GAME_TIME):
            # env.render()
            action = generate_action()
            state = _process_frame(state)
            states.append(state)
            actions.append(action)
            state, r, done, _ = env.step(action)

            # print(r)

            if counter == REST_NUM:
                print('RUN:{},GT:{},DATA:{}'.format(run, game_time,
                                                    len(states)))
                position = np.random.randint(len(env.track))
                env.car = Car(env.world, *env.track[position][1:4])
                counter = 0
            counter += 1
    states = np.array(states, dtype=np.uint8)
    actions = np.array(actions, dtype=np.float16)
    save_name = 'rollout_v2_{}.npz'.format(on)
    # np.save(dst + '/' + save_name, frame_and_action)

    np.savez_compressed(dst + '/' + save_name, action=actions, state=states)
def play(params, render=True, verbose=False):
    _NUM_TRIALS = 12
    agent_reward = 0
    for trial in range(_NUM_TRIALS):
        observation = env.reset()
        # Little hack to make the Car start at random positions in the race-track
        np.random.seed(int(str(time.time()*1000000)[10:13]))
        position = np.random.randint(len(env.track))
        env.car = Car(env.world, *env.track[position][1:4])

        total_reward = 0.0
        steps = 0
        while True:
            if render:
                env.render()
            action = decide_action(observation, params)
            observation, r, done, info = env.step(action)
            total_reward += r
            # NB: done is not True after 1000 steps when using the hack above for
            # 	  random init of position
            if verbose and (steps % 200 == 0 or steps == 999):
                print("\naction " + str(["{:+0.2f}".format(x) for x in action]))
                print("step {} total_reward {:+0.2f}".format(steps, total_reward))

            steps += 1
            if steps == 999:
                break

        agent_reward += total_reward

    # If reward is out of scale, clip it
    agent_reward = np.maximum(-(100*_NUM_TRIALS), agent_reward)
    return - (agent_reward / _NUM_TRIALS)
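# play() returns the negative mean reward so it can be minimized directly. A
# sketch of wiring it into CMA-ES via the `cma` package (the original project
# may use a different optimizer; param_count is an assumed controller size):
import cma
import numpy as np

param_count = 867
es = cma.CMAEvolutionStrategy(np.zeros(param_count), 0.1)
while not es.stop():
    candidates = es.ask()
    fitnesses = [play(p, render=False) for p in candidates]
    es.tell(candidates, fitnesses)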
Example #5
    def reset(
        self,
        *,
        seed: Optional[int] = None,
        return_info: bool = False,
        options: Optional[dict] = None,
    ):
        super().reset(seed=seed)
        self._destroy()
        self.reward = 0.0
        self.prev_reward = 0.0
        self.tile_visited_count = 0
        self.t = 0.0
        self.new_lap = False
        self.road_poly = []

        while True:
            success = self._create_track()
            if success:
                break
            if self.verbose == 1:
                print("retry to generate track (normal if there are not many"
                      "instances of this message)")
        self.car = Car(self.world, *self.track[0][1:4])

        if not return_info:
            return self.step(None)[0]
        else:
            return self.step(None)[0], {}
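# Usage sketch for the keyword-only reset() signature above, assuming `env` is
# an instance of this environment (Gym >= 0.22 style API):
obs = env.reset(seed=42)                            # observation only
obs, info = env.reset(seed=42, return_info=True)    # observation plus info dict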
    def run(self, env, model, img_resize=None, random_start=False):

        obs = []
        actions = []
        rewards = []

        ob = env.reset()

        if random_start:  #CarRacing random track tile start
            position = np.random.randint(len(env.track))
            env.env.car = Car(env.env.world, *env.env.track[position][1:4])

        done = False
        while not done:

            # NOTE: ob_model is only created inside this branch, so the model
            # call below assumes img_resize is provided.
            if img_resize:
                ob = ob[0:84, :, :]
                ob = cv2.resize(ob,
                                dsize=img_resize,
                                interpolation=cv2.INTER_CUBIC)
                ob_model = torch.tensor(ob / 255).view(
                    1, img_resize[0], img_resize[1],
                    3).permute(0, 3, 1, 2).type('torch.FloatTensor')

            action = model(ob_model.to(self.device)).detach().cpu().numpy()[0]

            obs.append(ob)
            actions.append(action)

            ob, r, done, _ = env.step(action)

            rewards.append(r)

        return obs, actions, rewards
Example #7
    def reset(self):
        self._destroy()
        self.reward = 0.0
        self.prev_reward = 0.0
        self.tile_visited_count = 0
        self.t = 0.0
        self.road_poly = []
        self.obstacle_poly = []
        self.steps = 0

        while True:
            success_track = self._create_track()

            if self.obstacles:
                if success_track:
                    success_obstacles = self._create_obstacles()
                else:
                    success_obstacles = False
            else:
                success_obstacles = True  # just so it goes through to next stage

            if success_track and success_obstacles:
                break
            if self.verbose == 1:
                print("retry to generate track (normal if there are not many"
                      "instances of this message)")
        self.car = Car(self.world, *self.track[0][1:4])

        return self.step(None)[0]
Example #8
    def reset(self):
        self._destroy()
        self.reward = 0.0
        self.prev_reward = 0.0
        self.tile_visited_count = 0
        self.t = 0.0
        self.road_poly = []
        self.state = np.zeros(self.observation_space.shape)
        self._last_rewards = []

        while True:
            success = self._create_track()
            if success:
                break
            if self.verbose == 1:
                print(
                    "retry to generate track (normal if there are not many instances of this message)"
                )
        self.car = Car(self.world, *self.track[0][1:4])

        # there are 20 frames of noise at the beginning (+ 4 frames per state)
        for _ in range(24):
            obs = self.step(None)[0]

        return obs
Example #9
    def reset(self):
        self._destroy()
        self.reward = 0.0
        self.prev_reward = 0.0
        self.tile_visited_count = 0
        self.t = 0.0
        self.road_poly = []

        while True:
            success = self._create_track()
            if success:
                break
            if self.verbose == 1:
                print(
                    "retry to generate track (normal if there are not many of these messages)"
                )
        if self.random:
            startpos = randint(110, 250)
            ind = 20
        else:
            startpos = 5
            ind = 4
        self.car = Car(self.world, *self.track[startpos][1:ind])

        return self.step(None)[0]
Example #10
def simulate_batch(batch_num):
    car_env = CarRacing()

    obs_data = []
    action_data = []
    action = car_env.action_space.sample()
    for item in range(batch_size):
        en_observ = car_env.reset()
        # this makes the car start at random positions on the race track
        position = np.random.randint(len(car_env.track))
        car_env.car = Car(car_env.world, *car_env.track[position][1:4])
        en_observ = norm_obse(en_observ)

        obs_sequence = []

        # time steps
        for i in range(steps):
            if render:
                car_env.render()

            action = create_action(action)

            en_observ, reward, done, info = car_env.step(action)
            en_observ = norm_obse(en_observ)

            obs_data.append(en_observ)

    print("Saving dataset for batch {}".format(batch_num))
    np.save('data/TR_data_{}'.format(batch_num), obs_data)
    
    car_env.close()
Example #11
    def _randomize_car_pos(self):
        random_car_position = np.random.randint(
            len(self.environment.env.track))
        self.environment.car = Car(
            self.environment.world,
            *self.environment.track[random_car_position][1:4])
        obs, _, _, _ = self.step([0, 0, 0])
        return obs
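# Why the [1:4] slice keeps appearing: in the Gym CarRacing source, each entry
# of `track` is a tuple (alpha, beta, x, y), so track[i][1:4] unpacks to
# (beta, x, y) == (init_angle, init_x, init_y), exactly the positional
# arguments Car(world, ...) expects. Illustrative helper built on that
# assumption:
def spawn_car_at(env, tile_index):
    angle, x, y = env.track[tile_index][1:4]
    return Car(env.world, angle, x, y)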
Example #12
def play(params):
    with torch.no_grad():
        block_print()
        device = torch.device("cpu")
        vae_model = vae.ConvVAE(VAE_Z_SIZE, VAE_KL_TOLERANCE)
        if os.path.exists("checkpoints/vae_checkpoint.pth"):
            vae_model.load_state_dict(
                torch.load("checkpoints/vae_checkpoint.pth",
                           map_location=device))
        vae_model = vae_model.eval()
        vae_model.to(device)

        rnn_model = rnn.MDMRNN(MDN_NUM_MIXTURES, MDN_HIDDEN_SIZE,
                               MDN_INPUT_SIZE, MDN_NUM_LAYERS, MDN_BATCH_SIZE,
                               1, MDN_OUTPUT_SIZE)
        if os.path.exists("checkpoints/rnn_checkpoint.pth"):
            rnn_model.load_state_dict(
                torch.load("checkpoints/rnn_checkpoint.pth",
                           map_location=device))
        rnn_model.to(device)
        rnn_model = rnn_model.eval()

        controller_model = controller.Controller(CMA_EMBEDDING_SIZE,
                                                 CMA_NUM_ACTIONS, params)

        env = CarRacing()
        _NUM_TRIALS = 16
        agent_reward = 0
        for trial in range(_NUM_TRIALS):
            observation = env.reset()
            # Little hack to make the Car start at random positions in the race-track
            np.random.seed(int(str(time.time() * 1000000)[10:13]))
            position = np.random.randint(len(env.track))
            env.car = Car(env.world, *env.track[position][1:4])

            hidden_state, cell_state = train_rnn.init_hidden(
                MDN_NUM_LAYERS, MDN_BATCH_SIZE, MDN_HIDDEN_SIZE, device)

            total_reward = 0.0
            steps = 0
            while True:
                action, hidden_state, cell_state = decide_action(
                    vae_model, rnn_model, controller_model, observation,
                    hidden_state, cell_state, device)
                observation, r, done, info = env.step(action)
                total_reward += r
                # NB: done is not True after 1000 steps when using the hack above for
                # 	  random init of position

                steps += 1
                if steps == 999:
                    break

            # If reward is out of scale, clip it
            total_reward = np.maximum(-100, total_reward)
            agent_reward += total_reward
        env.close()
        return -(agent_reward / _NUM_TRIALS)
Example #13
    def reset(self):
        print('the time played(total):')
        print('***********************count is ', self.count)
        print(self.time_count)
        print('long term reward for this episode is ', self.long_term_reward)
        self.long_term_reward = 0
        self.time_count += 1
        self._destroy()
        self.reward = 0.0
        self.prev_reward = 0.0
        self.tile_visited_count = 0
        self.t = 0.0
        self.road_poly = []
        self.count = 0
        self.life_count = 0.0
        while True:
            success = self._create_track()
            if success:
                break
            if self.verbose == 1:
                print(
                    "retry to generate track (normal if there are not many instances of this message)"
                )
        ####################################
        agent_cars = []
        for i in range(number_agent):
            num_1 = i * degree_d
            if i == 1:
                car = Car(self.world, *(0, 225.0, initial_distance_apart))
                print('##################################################',
                      self.track[num_1][1:4])
            else:
                car = Car(self.world, *self.track[num_1][1:4])
            if i == 1:  # mark the first car as the lead car

                car.lead_car = True
                print('*************************************')
            agent_cars.append(car)
        self.car = agent_cars
        self.car1 = agent_cars
        #self.car = agent_cars[1]
        #self.car = Car(self.world, *self.track[70][1:4])#original

        return self.step(None)[0]
Example #14
def main():
    print("Generating data for env CarRacing-v0")

    env = CarRacing()

    for obs_idx in range(1, 10):

        env.reset()

        observations = []

        for i in range(1000):
            position = np.random.randint(len(env.track))
            angle = np.random.randint(-20, 20)
            x_off = np.random.randint(-20, 20)
            init_data = list(env.track[position][1:4])  # [angle, x, y]
            init_data[0] += angle
            init_data[1] += x_off
            env.car = Car(env.world, *init_data)

            observation = env.step(None)[0]

            cropped_obs = normalize_observation(
                observation[:CROP_SIZE,
                            CROP_W_OFFSET:CROP_SIZE + CROP_W_OFFSET, :])

            cropped_obs = cv2.resize(cropped_obs,
                                     dsize=(64, 64),
                                     interpolation=cv2.INTER_CUBIC).astype(
                                         np.float32)

            np.clip(cropped_obs, 0.0, 1.0, cropped_obs)

            if i % 10 == 0:
                print(i)

            if i % 100 == 0:
                plt.imshow(cropped_obs)
                plt.show()

            observations.append(cropped_obs)

        observations = np.array(observations, dtype=np.float32)

        if not os.path.exists("data"):
            os.mkdir("data")

        np.save("data/observations_%d.npy" % obs_idx, observations)
Example #15
    def reset(self):
        self._destroy()
        self.reward = 0.0
        self.prev_reward = 0.0
        self.tile_visited_count = 0
        self.t = 0.0
        self.road_poly = []
        self.human_render = False

        while True:
            success = self._create_track()
            if success: break
            #print("retry to generate track (normal if there are not many of this messages)")
        self.car = Car(self.world, *self.track[0][1:4])

        return self.step(None)[0]
Example #16
    def reset(self):
        self._destroy()
        self.reward = 0.0
        self.prev_reward = 0.0
        self.tile_visited_count = 0
        self.t = 0.0
        self.road_poly = []

        while True:
            success = self._create_track()
            if success:
                break
            # if self.verbose == 1:
                # print("retry to generate track (normal if there are not many of this messages)")
        self.car = Car(self.world, *self.track[0][1:4],
                       WHEEL_COLOR=self.WHEEL_COLOR,
                       WHEEL_WHITE=self.WHEEL_WHITE,
                       MUD_COLOR=self.MUD_COLOR,
                       HULL_COLOR=self.HULL_COLOR)

        return self.step(None)[0]
    def fast_reset(self):
        self.car2 = None
        self.laps = 0
        self.on_road = True
        self.next_road_tile = 0

        self.reward = 0.0
        self.prev_reward = 0.0
        self.tile_visited_count = 0
        self.t = 0.0
        self.human_render = False
        for tile in self.road:
            tile.road_visited = False
        self.road_poly = copy.deepcopy(self.original_road_poly)
        self.car.destroy()
        self.car = Car(self.world, *self.track[0][1:4])

        return self.step(None)
Example #18
    def reset(self):
        self._destroy()
        self.reward = 0.0
        self.prev_reward = 0.0
        self.tile_visited_count = 0
        self.t = 0.0
        self.road_poly = []

        while True:
            success = self._create_track()
            if success:
                break
            if self.verbose == 1:
                print("retry to generate track (normal if there are not many"
                      "instances of this message)")
        self.car = Car(self.world, *self.track[0][1:4])

        return self.step(None)[0]
Example #19
    def reset(self):
        self.num_step = 1

        self._destroy()
        self.reward = 0.0
        self.prev_reward = 0.0
        self.tile_visited_count = 0
        self.t = 0.0
        self.road_poly = []

        while True:
            success = self._create_track()
            if success:
                break
            # if self.verbose == 1:
            #     print("retry to generate track (normal if there are not many of this messages)")
        self.car = Car(self.world, *self.track[0][1:4])

        self.state_temp = np.zeros((STATE_W, STATE_H, 3), dtype=np.uint8)
        return self.old_step((0, 0, 0))[0]
Example #20
    def reset(self):
        self._destroy()
        self.time = -1.0
        self.tile_visited_count = 0
        self.state = None
        self.done = False
        self.reward = 0.0
        self.prev_reward = 0.0

        # Build ground
        self.ground = Ground(self.world, PLAYFIELD, PLAYFIELD)

        # Build track tiles
        self.track_tiles_coordinates = TrackCoordinatesBuilder.load_track(self)
        self.track_tiles = [
            TrackTile(self.world, self.track_tiles_coordinates[i],
                      self.track_tiles_coordinates[i - 1])
            for i, element in enumerate(self.track_tiles_coordinates)
        ]
        # Build cones
        cones_coordinates = []
        for i in range(0, len(self.track_tiles)):
            sensor_vertices = self.track_tiles[i].b2Data.fixtures[
                0].shape.vertices
            for j in range(0, len(sensor_vertices)):
                cones_coordinates.append(sensor_vertices[j])
        self.cones = [
            Cone(world=self.world,
                 position=(cone_coordinate[0], cone_coordinate[1]))
            for cone_coordinate in cones_coordinates
        ]

        init_angle = 0
        init_x, init_y = self.track_tiles[0].position

        self.car = Car(self.world,
                       init_angle=init_angle,
                       init_x=init_x,
                       init_y=init_y)

        return self.step(None)[0]
Example #21
    def reset(self):
        self._destroy()
        self.reward = 0.0
        self.prev_reward = 0.0
        self.tile_visited_count = 0
        self.t = 0.0
        self.road_poly = []
        self.track_direction = random.choice([-1, 1])
        if self.viewer:
            self.viewer.geoms = []

        while True:
            success = self._create_track()
            if success:
                break
            if self.verbose == 1:
                print(
                    "retry to generate track (normal if there are not many of these messages)"
                )
        self.car = Car(self.world, *self.track[0][1:4], draw_car=True)

        return self.step(None)[0]
Example #22
def multiple_runs(on):
    env = CarRacing()
    frame_and_action = []
    for run in range(MAX_RUNS):
        env.reset()
        # done = False
        counter = 0
        for game_time in range(MAX_GAME_TIME):
            # env.render()
            action = generate_action()
            state, r, done, _ = env.step(action)
            frame_and_action.append({'state': state, 'action': action})
            # print(r)
            counter += 1
            if counter > REST_NUM:
                print('RUN:{},GT:{},DATA:{}'.format(run, game_time,
                                                    len(frame_and_action)))
                position = np.random.randint(len(env.track))
                env.car = Car(env.world, *env.track[position][1:4])
                counter = 0
    save_name = 'rollout_{}.npy'.format(on)
    np.save(dst + '/' + save_name, frame_and_action)
def simulate_batch(batch_num, save=True, time_steps=None, reduce_size=True):
    env = CarRacing()

    if time_steps is None:
        time_steps = _TIME_STEPS

    obs_data = []
    action_data = []
    action = env.action_space.sample()
    for i_episode in range(_BATCH_SIZE):
        observation = env.reset()
        # Little hack to make the Car start at random positions in the race-track
        position = np.random.randint(len(env.track))
        env.car = Car(env.world, *env.track[position][1:4])
        observation = normalize_observation(observation,
                                            output_4d=False,
                                            reduce_size=reduce_size)
        obs_data.append(observation)

        for _ in range(time_steps):
            if _RENDER:
                env.render()

            action = generate_action(action)

            observation, reward, done, info = env.step(action)
            observation = normalize_observation(observation,
                                                output_4d=False,
                                                reduce_size=reduce_size)

            obs_data.append(observation)

    if save:
        print("Saving dataset for batch {:03d}".format(batch_num))
        np.save('../data/obs_data_VAE_{:03d}'.format(batch_num), obs_data)

    env.close()
    return obs_data
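# Usage sketch: the save/time_steps parameters above allow generating a short
# in-memory batch without writing to disk (argument values chosen purely for
# illustration):
obs = simulate_batch(batch_num=0, save=False, time_steps=100)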
Example #24
def play(params,
         render=True,
         verbose=False,
         save_visualization=False,
         max_len=999):
    time_start = datetime.datetime.now()
    print('Agent train run begun ' + str(time_start))

    sess, network = load_vae()
    env = CarRacing()

    # _NUM_TRIALS = 16  # <-- Ha and Schmidhuber
    _NUM_TRIALS = 8

    agent_reward = 0
    for trial in range(_NUM_TRIALS):
        observation = env.reset()
        observation = network.normalize_observation(observation)
        # Little hack to make the Car start at random positions in the race-track
        np.random.seed(int(str(time.time() * 1000000)[10:13]))
        position = np.random.randint(len(env.track))
        env.car = Car(env.world, *env.track[position][1:4])

        total_reward = 0.0
        steps = 0
        observations = [observation]
        while True:
            if render:
                env.render()
            observation = network.normalize_observation(observation)
            observations.append(observation)

            embedding = network.get_embedding(sess, observation)
            action = decide_action(sess, embedding, params)
            observation, r, done, info = env.step(action)
            total_reward += r
            # NB: done is not True after 1000 steps when using the hack above for
            #       random init of position
            if verbose and (steps % 200 == 0 or steps == 999):
                print("\naction " + str(["{:+0.2f}".format(x)
                                         for x in action]))
                print("step {} total_reward {:+0.2f}".format(
                    steps, total_reward))

            steps += 1
            if steps == max_len:
                break
            # if total_reward < -50:
            #     break
            if _IS_TEST and steps > 10:
                break

        total_reward = np.maximum(-100, total_reward)
        agent_reward += total_reward
        if save_visualization:
            title = 'train_agent_r{:.2f}'.format(agent_reward)
            print('Saving trajectory:', title)
            network.show_pred(title, np.concatenate(observations, 0))
            break
        print('.', end='')

    sess.close()
    env.close()
    print('Agent done - ' + str(time_start))

    return -(agent_reward / _NUM_TRIALS)
    def render(self,
               env,
               model,
               img_resize=(64, 64),
               dream=False,
               random_start=False,
               video=False):

        ob = env.reset()

        if random_start:
            position = np.random.randint(len(env.track))
            env.env.car = Car(env.env.world, *env.env.track[position][1:4])

        done = False

        # save videos
        obs = []
        obs_reconstruction = []

        while not done:

            action = None

            if dream:

                action = model.forward_dream(ob.to(
                    self.device)).detach().cpu().numpy()

            else:

                if img_resize:
                    ob = ob[0:84, :, :]
                    ob = cv2.resize(ob,
                                    dsize=img_resize,
                                    interpolation=cv2.INTER_CUBIC)
                    obs.append(ob)  # video

                    ob = torch.tensor(ob / 255).view(
                        1, img_resize[0], img_resize[1],
                        3).permute(0, 3, 1, 2).type('torch.FloatTensor')

                action = model(ob.to(self.device)).detach().cpu().numpy()[0]

                obs_reconstruction.append(
                    model.vae(ob.to(self.device))[0][0].detach().cpu().permute(
                        1, 2, 0).numpy())  # video

            env.render()
            ob, r, done, _ = env.step(action)

        if video:

            vid = []
            for x, y in zip(obs, obs_reconstruction):
                frame = np.zeros((64, 128, 3))
                frame[:, 0:64, :] += x
                frame[:, 64::, :] += y * 255
                vid.append(frame)

            w = imageio.get_writer('vae_video.mp4',
                                   format='FFMPEG',
                                   mode='I',
                                   fps=30,
                                   quality=9)
            for img in vid:
                w.append_data(img.astype(np.uint8))
            w.close()
Example #26
def train(
        seed: int = 69,
        batch_size: int = 256,
        num_steps: int = 5000000,
        updates_per_step: int = 1,
        start_steps: int = 100000,
        replay_size: int = 1000000,
        eval: bool = True,
        eval_interval: int = 50,
        accelerated_exploration: bool = True,
        save_models: bool = True,
        load_models: bool = True,
        save_memory: bool = True,
        load_memory: bool = False,
        path_to_actor: str = "./models/sac_actor_carracer_klein_6_24_18.pt",
        path_to_critic: str = "./models/sac_critic_carracer_klein_6_24_18.pt",
        path_to_buffer: str = "./memory/buffer_klein_6_24_18.pkl"):
    """
    ## The train function consists of:  
    
    - Setting up the environment, agent and replay buffer  
    - Logging hyperparameters and training results  
    - Loading previously saved actor and critic models  
    - Training loop  
    - Evaluation (every *eval_interval* episodes)  
    - Saving actor and critic models  
        
    ## Parameters:  
    
    - **seed** *(int)*: Seed value to generate random numbers.  
    - **batch_size** *(int)*: Number of samples that will be propagated through the Q, V, and policy network.  
    - **num_steps** *(int)*: Number of steps that the agent takes in the environment. Determines the training duration.   
    - **updates_per_step** *(int)*: Number of network parameter updates per step in the environment.  
    - **start_steps** *(int)*:  Number of steps for which a random action is sampled. After reaching *start_steps* an action
    according to the learned policy is chosen.
    - **replay_size** *(int)*: Size of the replay buffer.  
    - **eval** *(bool)*:  If *True* the trained policy is evaluated every *eval_interval* episodes.
    - **eval_interval** *(int)*: Interval of episodes after which to evaluate the trained policy.    
    - **accelerated_exploration** *(bool)*: If *True* an action with acceleration bias is sampled.  
    - **save_memory** *(bool)*: If *True* the experience replay buffer is saved to the harddrive.  
    - **save_models** *(bool)*: If *True* actor and critic models are saved to the harddrive.  
    - **load_models** *(bool)*: If *True* actor and critic models are loaded from *path_to_actor* and *path_to_critic*.  
    - **path_to_actor** *(str)*: Path to actor model.  
    - **path_to_critic** *(str)*: Path to critic model.  
    
    """
    # Environment
    env = gym.make("CarRacing-v0")
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # NOTE: ALWAYS CHECK PARAMETERS BEFORE TRAINING
    agent = SAC(env.action_space,
                policy="Gaussian",
                gamma=0.99,
                tau=0.005,
                lr=0.0003,
                alpha=0.2,
                automatic_temperature_tuning=True,
                batch_size=batch_size,
                hidden_size=512,
                target_update_interval=2,
                input_dim=32)

    # Memory
    memory = ReplayMemory(replay_size)
    if load_memory:
        # load memory and deactivate random exploration
        memory.load(path_to_buffer)

    if load_memory or load_models:
        start_steps = 0

    # Training Loop
    total_numsteps = 0
    updates = 0

    # Log Settings and training results
    date = datetime.now()
    log_dir = Path(f"runs/{date.year}_SAC_{date.month}_{date.day}_{date.hour}")

    writer = SummaryWriter(log_dir=log_dir)

    settings_msg = (
        f"Training SAC for {num_steps} steps"
        "\n\nTRAINING SETTINGS:\n"
        f"Seed={seed}, Batch size: {batch_size}, Updates per step: {updates_per_step}\n"
        f"Accelerated exploration: {accelerated_exploration}, Start steps: {start_steps}, Replay size: {replay_size}"
        "\n\nALGORITHM SETTINGS:\n"
        f"Policy: {agent.policy_type}, Automatic temperature tuning: {agent.automatic_temperature_tuning}\n"
        f"Gamma: {agent.gamma}, Tau: {agent.tau}, Alpha: {agent.alpha}, LR: {agent.lr}\n"
        f"Target update interval: {agent.target_update_interval}, Latent dim: {agent.input_dim}, Hidden size: {agent.hidden_size}"
    )
    with open(log_dir / "settings.txt", "w") as file:
        file.write(settings_msg)

    if load_models:
        try:
            agent.load_model(path_to_actor, path_to_critic)
        except FileNotFoundError:
            warnings.warn(
                "Couldn't locate models in the specified paths. Training from scratch.",
                RuntimeWarning)

    for i_episode in itertools.count(1):
        episode_reward = 0
        episode_steps = 0
        done = False
        state = env.reset()
        state = process_observation(state)
        state = encoder.sample(state)
        # choose random starting position for the car
        position = np.random.randint(len(env.track))
        env.car = Car(env.world, *env.track[position][1:4])

        if accelerated_exploration:
            # choose random starting position for the car
            # position = np.random.randint(len(env.track))
            # env.car = Car(env.world, *env.track[position][1:4])
            # Sample random action
            action = env.action_space.sample()

        while not done:
            if total_numsteps < start_steps and not load_models:
                # sample action with acceleration bias if accelerated_action = True
                if accelerated_exploration:
                    action = generate_action(action)
                else:
                    action = env.action_space.sample()
            else:
                action = agent.select_action(state)

            if len(memory) > batch_size:
                # Number of updates per step in environment
                for _ in range(updates_per_step):
                    # Update parameters of all the networks
                    critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                        memory, batch_size, updates)
                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                    writer.add_scalar('entropy_temperature/alpha', alpha,
                                      updates)
                    updates += 1

            next_state, reward, done, _ = env.step(action)  # Step
            next_state = process_observation(next_state)
            next_state = encoder.sample(next_state)
            episode_steps += 1
            total_numsteps += 1
            episode_reward += reward

            # Ignore the "done" signal if it comes from hitting the time horizon.
            # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
            mask = 1 if episode_steps == env._max_episode_steps else float(
                not done)

            memory.push(state, action, reward, next_state,
                        mask)  # Append transition to memory

            state = next_state

        if total_numsteps > num_steps:
            break

        writer.add_scalar('reward/train', episode_reward, i_episode)

        print(
            f"Episode: {i_episode}, total numsteps: {total_numsteps}, episode steps: {episode_steps}, reward: {round(episode_reward, 2)}"
        )

        if i_episode % eval_interval == 0 and eval == True:
            avg_reward = 0.
            episodes = 10

            if save_models:
                agent.save_model(
                    "carracer",
                    f"{getuser()}_{date.month}_{date.day}_{date.hour}")

            for _ in range(episodes):
                state = env.reset()
                state = process_observation(state)
                state = encoder.sample(state)

                episode_reward = 0
                done = False
                while not done:
                    action = agent.select_action(state, eval=True)

                    next_state, reward, done, _ = env.step(action)
                    next_state = process_observation(next_state)
                    next_state = encoder.sample(next_state)
                    episode_reward += reward

                    state = next_state
                avg_reward += episode_reward
            avg_reward /= episodes

            if save_models:
                agent.save_model(
                    "carracer",
                    f"{getuser()}_{date.month}_{date.day}_{date.hour}")
            if save_memory:
                memory.save(
                    f"buffer_{getuser()}_{date.month}_{date.day}_{date.hour}")

            writer.add_scalar("avg_reward/test", avg_reward, i_episode)

            print("-" * 40)
            print(
                f"Test Episodes: {episodes}, Avg. Reward: {round(avg_reward, 2)}"
            )
            print("-" * 40)

    env.close()
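# train() leans on helpers defined elsewhere in the project. A rough sketch of
# their assumed roles (illustrative only, not the original code):
import cv2
import numpy as np

def process_observation(frame):
    # Crop away the bottom HUD strip, resize to the VAE input resolution and
    # rescale to [0, 1].
    frame = frame[:84, :, :]
    frame = cv2.resize(frame, dsize=(64, 64), interpolation=cv2.INTER_CUBIC)
    return frame.astype(np.float32) / 255.0

# encoder.sample(state) is then assumed to push the processed frame through a
# pre-trained VAE encoder and return the 32-dimensional latent vector that
# matches the agent's input_dim=32 above.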
Example #27
def main(args):

    env_name = args.env_name
    total_episodes = args.total_episodes
    start_batch = args.start_batch
    time_steps = args.time_steps
    render = args.render
    batch_size = args.batch_size
    run_all_envs = args.run_all_envs

    if run_all_envs:
        envs_to_generate = config.train_envs
    else:
        envs_to_generate = [env_name]

    for current_env_name in envs_to_generate:
        print("Generating data for env {}".format(current_env_name))

        env = make_env(current_env_name)
        s = 0
        batch = start_batch

        batch_size = min(batch_size, total_episodes)

        while s < total_episodes:
            obs_data = []
            action_data = []

            for i_episode in range(batch_size):
                print('-----')
                observation = env.reset()
                observation = config.adjust_obs(observation)

                # Position car randomly on track
                position = np.random.randint(len(env.track))
                env.car = Car(env.world, *env.track[position][1:4])

                # plt.imshow(observation)
                # plt.show()

                env.render()
                done = False
                action = env.action_space.sample()
                t = 0
                obs_sequence = []
                action_sequence = []

                while t < time_steps:  #and not done:
                    t = t + 1

                    action = config.generate_data_action(t, action)

                    obs_sequence.append(observation)
                    action_sequence.append(action)

                    observation, reward, done, info = env.step(action)
                    observation = config.adjust_obs(observation)

                    if render:
                        env.render()

                obs_data.append(obs_sequence)
                action_data.append(action_sequence)

                print("Batch {} Episode {} finished after {} timesteps".format(
                    batch, i_episode, t + 1))
                print("Current dataset contains {} observations".format(
                    sum(map(len, obs_data))))

                s = s + 1

            print("Saving dataset for batch {}".format(batch))
            np.save('./data/obs_data_' + current_env_name + '_' + str(batch),
                    obs_data)
            np.save(
                './data/action_data_' + current_env_name + '_' + str(batch),
                action_data)

            batch = batch + 1

        env.close()
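# Loading a saved batch back mirrors the np.save calls above; with a fixed
# time_steps per episode the result is a regular array of shape
# (batch_size, time_steps, H, W, C). Names below are placeholders:
env_name = 'car_racing'    # match the env name used when saving
batch = 0                  # batch index to load
obs_batch = np.load('./data/obs_data_' + env_name + '_' + str(batch) + '.npy')
action_batch = np.load('./data/action_data_' + env_name + '_' + str(batch) + '.npy')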
Example #28
def simulate_batch(batch_num):
    og = start = time.time()
    block_print()
    with torch.no_grad():

        device = torch.device("cpu")
        vae_model = vae.ConvVAE(VAE_Z_SIZE, VAE_KL_TOLERANCE)
        if os.path.exists("checkpoints/vae_checkpoint.pth"):
            vae_model.load_state_dict(
                torch.load("checkpoints/vae_checkpoint.pth",
                           map_location=device))
        vae_model = vae_model.eval()
        vae_model.to(device)

        rnn_model = rnn.MDMRNN(MDN_NUM_MIXTURES, MDN_HIDDEN_SIZE,
                               MDN_INPUT_SIZE, MDN_NUM_LAYERS, MDN_BATCH_SIZE,
                               1, MDN_OUTPUT_SIZE)
        if os.path.exists("checkpoints/rnn_checkpoint.pth"):
            rnn_model.load_state_dict(
                torch.load("checkpoints/rnn_checkpoint.pth",
                           map_location=device))
        rnn_model.to(device)
        rnn_model = rnn_model.eval()

        if os.path.exists("checkpoints/params.pkl"):
            fo = open('checkpoints/params.pkl', 'rb')
            params = pickle.load(fo)
            fo.close()
            print("Loaded existing params")
        else:
            cma_num_params = CMA_NUM_ACTIONS * CMA_EMBEDDING_SIZE + CMA_NUM_ACTIONS
            params = controller.get_random_model_params(
                cma_num_params,
                np.random.rand() * 0.01)
        controller_model = controller.Controller(CMA_EMBEDDING_SIZE,
                                                 CMA_NUM_ACTIONS, params)

        env = CarRacing()

        observations = []
        actions = []

        observation = env.reset()

        position = np.random.randint(len(env.track))
        env.car = Car(env.world, *env.track[position][1:4])

        hidden_state, cell_state = train_rnn.init_hidden(
            MDN_NUM_LAYERS, MDN_BATCH_SIZE, MDN_HIDDEN_SIZE, device)

        observation = process_frame(observation)
        for _ in range(SEQUENCE_LENGTH + 1):
            observation = process_frame(observation)
            observations.append(observation)
            observation = normalize_observation(observation)
            observation = np.moveaxis(observation, 2, 0)
            observation = np.reshape(observation, (-1, 3, 64, 64))
            observation = torch.tensor(observation, device=device)
            mu, log_var = vae_model.encode(observation)
            embedding = vae_model.reparameterize(mu, log_var)

            controller_input = torch.cat(
                (embedding, hidden_state.reshape(1, -1)), dim=1)
            action = controller_model.forward(controller_input)
            actions.append(action)
            observation, reward, done, info = env.step(action)
            action_tensor = torch.from_numpy(action).float().to(device)
            action_tensor = action_tensor.view(1, -1)
            rnn_inputs = torch.cat((embedding, action_tensor), dim=1)
            pi, mean, sigma, hidden_state, cell_state = rnn_model.forward(
                rnn_inputs, hidden_state, cell_state)

        observations = np.array(observations, dtype=np.uint8)
        actions = np.array(actions, dtype=np.float16)
        np.savez_compressed('data/obs_data_VAE_{}'.format(batch_num),
                            obs=observations,
                            action=actions)
        env.close()
    end = time.time()
    logging.info("_" + str(batch_num) + " Total: " + str(end - og))