def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run an RL experiment that can be either training or testing.

    Args:
        n_episodes: number of train/test episodes
        default_policy: if True, load a saved policy and run in testing mode;
            otherwise train a new agent
        policy: filename of a saved model to load (used when default_policy is True)
        render: enable OpenAI environment graphical rendering

    Returns:
        Dictionary with:
            cumulative experiment outcomes ([losses, victories])
            list of steps per episode
            list of cumulative rewards
            the trained agent
    """
    res = [0, 0]  # array of results accumulator: {[0]: Loss, [1]: Victory}
    scores = []   # Cumulative rewards
    steps = []    # Steps per episode

    env = gym.make('CartPole-v0')
    env = env.unwrapped
    env.seed(seed)

    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    if default_policy:
        agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True,
                         model_filename=policy, epsilon=0, epsilon_lower_bound=0,
                         learn_thresh=0, tb_dir=None)
    else:
        layer1 = Dense(10, input_dim=input_dim, activation='relu')
        layer2 = Dense(output_dim)
        agent = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True,
                         learn_thresh=2000, update_rate=100,
                         epsilon_decay_function=lambda e: e - 0.001,
                         epsilon_lower_bound=0.1,
                         optimizer=keras.optimizers.RMSprop(0.001),
                         memory_size=2000, tb_dir=None)

    for _ in tqdm(range(n_episodes), desc="Episode"):
        state = env.reset()
        cumulative_reward = 0

        state = np.reshape(state, [1, 4])

        t = 0
        while True:
            if (render):
                env.render()
                time.sleep(0.1)

            next_action = agent.act(state)
            new_state, reward, end, _ = env.step(next_action)
            x, x_dot, theta, theta_dot = new_state
            new_state = np.reshape(new_state, [1, 4])

            # Reward shaping
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            r3 = -abs(theta_dot)
            reward = r1 + r2 + r3

            agent.memoise((state, next_action, reward, new_state, end))

            if end or t > 199:
                if t < 195:
                    res[0] += 1
                else:
                    res[1] += 1
                    # print("Victory!,", t, "steps", "reward:", cumulative_reward)
                steps.append(t)
                break
            else:
                state = new_state
                cumulative_reward += reward

            agent.learn()
            t += 1

        cumulative_reward += reward
        scores.append(cumulative_reward)

    env.close()

    return {"results": np.array(res), "steps": np.array(steps),
            "scores": np.array(scores), "agent": agent}
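# --- Usage sketch (not part of the original experiment code) ----------------
# A minimal, hedged example of how this CartPole experiment might be driven:
# train for some episodes, persist the agent with DQNAgent.save_model (the
# method used by the other experiments in this listing), then re-run in
# testing mode. The filename "model_cartpole" and the episode counts are
# illustrative assumptions, not values taken from the original code.
if __name__ == "__main__":
    train_res = experiment(300)                       # training phase
    train_res["agent"].save_model("model_cartpole")   # hypothetical filename
    test_res = experiment(100, default_policy=True, policy="model_cartpole")
    # res[0] counts losses, res[1] counts victories (see the function above)
    print("Test losses/victories:", test_res["results"])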
def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run an RL experiment that can be either training or testing.

    Args:
        n_episodes: number of train/test episodes
        default_policy: if True, load a saved policy and run in testing mode;
            otherwise train a new agent
        policy: filename of a saved model to load (used when default_policy is True)
        render: enable OpenAI environment graphical rendering

    Returns:
        Dictionary with:
            cumulative experiment outcomes ([losses, victories])
            list of steps per episode
            list of cumulative rewards
            the trained agent
    """
    with tf.device('/gpu:0'):
        res = [0, 0]  # array of results accumulator: {[0]: Loss, [1]: Victory}
        scores = []   # Cumulative rewards
        steps = []    # Steps per episode
        reward_list = RingBuffer(100)

        env = gym.make('PongDeterministic-v4')
        input_dim = env.observation_space.shape[0]
        output_dim = env.action_space.n

        if default_policy:
            agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True,
                             model_filename=policy, epsilon=0.05,
                             epsilon_lower_bound=0.05)
        else:
            layers = [
                Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                       input_shape=(84, 84, 4),
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Conv2D(64, (4, 4), strides=(2, 2), activation='relu',
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Conv2D(64, (3, 3), strides=(1, 1), activation='relu',
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Flatten(),
                Dense(512, activation='relu'),
                Dense(output_dim)
            ]
            agent = DQNAgent(output_dim, layers, use_ddqn=True, memory_size=700000,
                             gamma=0.99, learn_thresh=50000,
                             epsilon_lower_bound=0.02,
                             epsilon_decay_function=lambda e: e - (0.98 / 950000),
                             update_rate=10000, optimizer=Adam(0.00025))

        gathered_frame = 0
        for episode_number in tqdm(range(n_episodes), desc="Episode"):
            frame = env.reset()
            state = pre_processing(frame)
            empty_state = np.zeros(state.shape, dtype="uint8")
            cumulative_reward = 0
            has_lost_life = True

            t = 0
            while True:
                if has_lost_life:
                    next_action = 1  # [1, 4, 5][ran.randint(0, 2)]
                    stack = np.stack((empty_state, empty_state, empty_state,
                                      empty_state), axis=2)
                    stack = np.reshape([stack], (1, 84, 84, 4))
                    for _ in range(ran.randint(1, 10)):
                        gathered_frame += 1
                        frame, reward, end, _ = env.step(next_action)
                        new_state = np.reshape(pre_processing(frame), (1, 84, 84, 1))
                        new_stack = np.append(new_state, stack[:, :, :, :3], axis=3)
                        stack = new_stack
                        if (render):
                            env.render()
                    has_lost_life = False

                next_action = agent.act(stack)
                new_state, reward, end, _ = env.step(next_action)
                if (render):
                    env.render()
                    time.sleep(0.02)

                reward = np.clip(reward, -1., 1.)
                if reward != 0:
                    has_lost_life = True

                cumulative_reward += reward

                new_state = np.reshape(pre_processing(new_state), (1, 84, 84, 1))
                new_stack = np.append(new_state, stack[:, :, :, :3], axis=3)
                agent.memoise((stack, next_action, reward, new_state, has_lost_life))
                stack = new_stack
                gathered_frame += 1

                if end:
                    reward_list.append(cumulative_reward)
                    if cumulative_reward > 0:
                        res[1] += 1
                        print("You Won!, steps:", t, "reward:", reward_list.mean(),
                              "frames:", gathered_frame)
                    else:
                        res[0] += 1
                        print("You Lost!, steps:", t, "reward:", reward_list.mean(),
                              "frames:", gathered_frame)
                    steps.append(t)
                    break

                agent.learn()
                t += 1

            scores.append(cumulative_reward)

            if episode_number >= 50 and episode_number % 10 == 0:
                model_name = "partial_model_pong" + str(episode_number)
                agent.save_model(model_name)

        env.close()

        return {"results": np.array(res), "steps": np.array(steps),
                "scores": np.array(scores), "agent": agent}
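# --- Possible pre_processing helper (assumption, not the original) ----------
# The Pong experiment above assumes a pre_processing(frame) helper that turns
# a raw 210x160x3 Atari frame into an 84x84 uint8 grayscale image: its output
# is reshaped to (1, 84, 84, 1) and stored in a uint8 frame stack. A common
# scikit-image based sketch compatible with those shapes:
import numpy as np
from skimage.color import rgb2gray
from skimage.transform import resize

def pre_processing(frame):
    gray = rgb2gray(frame)                           # RGB frame -> floats in [0, 1]
    small = resize(gray, (84, 84), mode='constant')  # downscale to the DQN input size
    return np.uint8(small * 255)                     # uint8 keeps the replay memory small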
def experiment(n_episodes, default_policy=False, policy=None, render=False, agent_config=None):
    """
    Run an RL experiment that can be either training or testing.

    Args:
        n_episodes: number of train/test episodes
        default_policy: if True, load a saved policy and run in testing mode;
            otherwise train a new agent
        policy: filename of a saved model to load (used when default_policy is True)
        render: enable OpenAI environment graphical rendering
        agent_config: optional pre-built DQNAgent to use instead of creating a new one

    Returns:
        Dictionary with:
            cumulative experiment outcomes ([losses, victories])
            list of steps per episode
            list of cumulative rewards
            the trained agent
    """
    res = [0, 0]  # array of results accumulator: {[0]: Loss, [1]: Victory}
    scores = []   # Cumulative rewards
    steps = []    # Steps per episode

    env = gym.make('MountainCar-v0')
    env.seed(seed)

    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    if agent_config is None:
        if default_policy:
            agent = DQNAgent(output_dim, None, use_ddqn=True, default_policy=True,
                             model_filename=policy, epsilon=0,
                             epsilon_lower_bound=0, learn_thresh=0)
        else:
            layer1 = Dense(15, input_dim=input_dim, activation='relu')
            layer2 = Dense(output_dim)
            agent = DQNAgent(output_dim, [layer1, layer2], use_ddqn=True,
                             learn_thresh=1000, update_rate=300,
                             epsilon_decay_function=lambda e: e * 0.95,
                             epsilon_lower_bound=0.01,
                             optimizer=keras.optimizers.RMSprop(0.001))
    else:
        agent = agent_config

    for i_episode in tqdm(range(n_episodes), desc="Episode"):
        state = env.reset()
        cumulative_reward = 0

        # Model validation for early stopping
        if i_episode > 0 and (i_episode % 100) == 0 and not default_policy:
            agent.save_model("tmp_model")
            evaluation_result = experiment(500, default_policy=True, policy="tmp_model")
            acc = accuracy(evaluation_result["results"])

            if acc == 100:
                break
            else:
                print("Accuracy:", acc, "Episode:", i_episode)

        state = np.reshape(state, [1, 2])

        for t in range(env._max_episode_steps):
            if (render):
                env.render()

            next_action = agent.act(state)
            new_state, reward, end, _ = env.step(next_action)
            reward = abs(new_state[0] - (-0.5))  # r in [0, 1] (reward shaping)
            new_state = np.reshape(new_state, [1, 2])

            agent.memoise((state, next_action, reward, new_state, end))

            if end:
                if t == env._max_episode_steps - 1:
                    res[0] += 1
                else:
                    res[1] += 1
                    # print("Victory!,", t, "steps")
                steps.append(t)
                break
            else:
                state = new_state
                cumulative_reward += reward

            agent.learn()

        cumulative_reward += reward
        scores.append(cumulative_reward)

    env.close()

    return {"results": np.array(res), "steps": np.array(steps),
            "scores": np.array(scores), "agent": agent}
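# --- Possible accuracy helper (assumption, not the original) -----------------
# The early-stopping check above calls accuracy(evaluation_result["results"])
# and stops training when it returns 100, so it is assumed to map the
# [losses, victories] results array to a win percentage. A minimal sketch
# under that assumption:
def accuracy(results):
    # results[0] = losses, results[1] = victories
    return results[1] / results.sum() * 100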