def main():
    """Train a SAC agent on the configured Atari environment and plot the learning curve.

    Relies on module-level project helpers/config defined elsewhere:
    Hyper, Constants, make_env, Agent, plot_learning_curve.
    """
    Hyper.init()
    env = make_env(Constants.env_id)  # See wrapper code for environment in atari_image.py
    Hyper.n_actions = env.action_space.n
    shape = env.observation_space.shape
    agent = Agent(input_dims=shape, env=env, n_actions=env.action_space.n)
    filename = f"{Constants.env_id}_games{Hyper.n_games}_alpha{Hyper.alpha}.png"
    # BUG FIX: `filename` was computed but never used and the plot path was a
    # garbled literal — interpolate the generated filename into the plot path.
    figure_file = f'plots/{filename}'
    best_ave_score = env.reward_range[0]
    best_score = 0
    score_history = []
    load_checkpoint = False
    if load_checkpoint:
        agent.load_models()
        env.render(mode='human')
    total_steps = 0
    game_id = 0
    for i in range(Hyper.n_games):
        game_id += 1
        if game_id % 20 == 0:
            # Scale the learning rates up every 20 games.
            Hyper.alpha = Hyper.alpha * 1.2
            Hyper.beta = Hyper.beta * 1.2
        observation = env.reset()
        done = False
        steps = 0
        score = 0
        while not done:
            # Sample action from the policy
            action = agent.choose_action(observation)
            # Sample transition from the environment
            new_observation, reward, done, info = env.step(action)
            steps += 1
            total_steps += 1
            # Store transition in the replay buffer
            agent.remember(observation, action, reward, new_observation, done)
            if not load_checkpoint:
                agent.learn()
            score += reward
            observation = new_observation
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])  # trailing 100-game average
        if score > best_score:
            best_score = score
        if avg_score > best_ave_score:
            best_ave_score = avg_score
            if not load_checkpoint:
                # Checkpoint only on a new best trailing average.
                agent.save_models()
        episode = i + 1
        print(
            f"episode {episode}: score {score}, best_score {best_score}, best ave score {best_ave_score}, trailing 100 games avg {avg_score}, steps {steps}, total steps {total_steps}"
        )
    print(f"total number of steps taken: {total_steps}")
    if not load_checkpoint:
        x = [i + 1 for i in range(Hyper.n_games)]
        plot_learning_curve(x, score_history, figure_file)
# NOTE(review): fragment of an episode loop — `i`, `agent`, `env`, `n_states`,
# `load_checkpoint`, `best_score`, `score_history`, `n_games`, `figure_file`
# and `one_hot_single_value` come from an enclosing scope that is not visible
# in this chunk; nesting below is reconstructed from the flattened source.
observation = one_hot_single_value(observation, n_states)  # one-hot encode the discrete state
done = False
score = 0
while not done:
    action = agent.choose_action(observation)
    observation_, reward, done, _ = env.step(action)
    observation_ = one_hot_single_value(observation_, n_states)
    score += reward
    agent.remember(observation, action, reward, observation_, done)
    if not load_checkpoint:
        pass  # NOTE(review): empty guard — presumably agent.learn() was meant to be gated here; confirm
    observation = observation_
    agent.learn()  # runs every step, even when load_checkpoint is True — TODO confirm intended
score_history.append(score)
avg_score = np.mean(score_history[-100:])  # trailing 100-episode average
if avg_score > best_score:
    best_score = avg_score
    if not load_checkpoint:
        agent.save_models()  # checkpoint on a new best trailing average
print('episode ', i, 'score %.1f' % score, 'avg_score %.1f' % avg_score)
if not load_checkpoint:
    x = [i + 1 for i in range(n_games)]
    plot_learning_curve(x, score_history, figure_file)
    with open('scores/score_history__.p', 'wb') as fp:
        pickle.dump(score_history, fp)  # persist raw scores for later analysis
# NOTE(review): tail of a SAC training script — `model`, `env`, `mean_rewards`,
# `plt` come from an enclosing scope not visible in this chunk; the `break`
# below implies this first section sits inside a training loop whose header is
# outside this view.
rewards = [model.train_on_env(env) for _ in range(100)]  # run 100 training episodes
mean_rewards.append(np.mean(rewards))
print("mean reward:%.3f" % (np.mean(rewards)))
plt.figure(figsize=[9, 6])
plt.title("Mean reward per 100 games")
plt.plot(mean_rewards)
plt.grid()
# plt.show()
plt.savefig('plots/SAC_learning_curve.png')  # save instead of showing interactively
plt.close()
if np.mean(rewards) >= 1000:  # success threshold — stop training once reached
    print("TRAINED!")
    break
model.save_models()
#model.load("experts/saved_expert/pg.model")
# --- Roll out the trained policy to collect expert demonstrations ---
num_expert = 100
states = np.array([])
probs = np.array([])
actions = np.array([])
for i in range(num_expert):
    state, prob, action, _ = model.generate_session(env)
    # Flatten and accumulate each rollout's trajectories.
    states = np.concatenate((states, state.reshape(-1)))
    probs = np.concatenate((probs, prob))
    actions = np.concatenate((actions, action))
states = states.reshape(-1, 5)  # assumes a 5-dim observation — TODO confirm against env
np.save('expert_samples/sac_inverted_pendulum_states', states)
np.save('expert_samples/sac_inverted_pendulum_actions', actions)
np.save('expert_samples/sac_inverted_pendulum_probs', probs)