        while not done:
            if i < render_episodes:
                # render the first few episodes for viewing
                env.render()
            # shift the observation history down one slot and store the
            # newest observation at the front
            obs_img = np.roll(obs_img, 1, axis=0)
            obs_img[0, :] = observation
            reshaped = obs_img.reshape((1, history_len * obs_len))
            action = model.predict_move(reshaped, train=False)
            # if the model is not initialized, take a random action instead
            if action is None:
                action = env.action_space.sample()
            # use the action to make a move
            observation, reward, done, info = env.step(action)
            cumulative_reward += reward
        print('current average: {} in {} games'.format(
            cumulative_reward / (i + 1), i + 1))

    print('average score: {}'.format(cumulative_reward / eval_episodes))
    return cumulative_reward / eval_episodes


if __name__ == "__main__":
    # create model
    model = nn.Control_Model()
    EvalModel(model)
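# ---------------------------------------------------------------------------
# Hypothetical sketch, not from the original source: a minimal stand-in for
# the nn.Control_Model interface that EvalModel above relies on. Everything
# here (the constructor arguments, the `trained` flag, the placeholder
# policy) is an assumption inferred from the call sites; the real nn module
# may look quite different.
# ---------------------------------------------------------------------------
import numpy as np


class Control_Model:
    """Assumed interface: predict_move returns None until the model trains."""

    def __init__(self, input_len=None, output_len=None):
        self.input_len = input_len
        self.output_len = output_len
        self.trained = False  # flipped to True once a training pass has run

    def predict_move(self, observation, train=False):
        # Before any training, return None so the caller falls back to
        # env.action_space.sample(), matching the check in EvalModel.
        if not self.trained:
            return None
        # Placeholder policy: a real model would run a forward pass on
        # `observation` (shape (1, history_len * obs_len)) and return the
        # index of the highest-scoring action.
        scores = np.zeros(self.output_len)
        return int(np.argmax(scores))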
train_episodes = 200   # number of episodes to run before each training pass
eval_episodes = 10     # number of episodes to run when evaluating
render_episodes = 1    # number of episodes to render when evaluating

env = gym.make(env_name)
if enable_video:
    env = gym.wrappers.Monitor(env, directory=vid_dir, force=False,
                               resume=True)
obs_len = len(env.observation_space.low)
act_len = env.action_space.n

# create model
model = nn.Control_Model(obs_len * historic_data_len, act_len)

train_count = 0
# train until the evaluation score clears the threshold
while (em.EvalModel(model, env, eval_episodes, render_episodes,
                    historic_data_len, obs_len) < trained_threshold):
    max_reward = -np.inf
    # run a segment of 200 'games' and train off of the max score
    for i in range(train_episodes):
        cumulative_reward = 0
        obs_log = []
        action_log = []
        done = False
        obs_img = np.zeros((historic_data_len, obs_len))