def main():
    """Train a policy-gradient agent in the Maze environment.

    Runs 301 on-policy episodes, performing one REINFORCE update per
    episode, evaluates once at the end (episode 300) and saves the
    parameters to ./model.ckpt.
    """
    env = Maze()
    action_dim = env.n_actions
    observation_dim = env.n_features
    logger.info('observation_dim {}, action_dim {}'.format(
        observation_dim, action_dim))

    # Build the agent: model -> algorithm -> agent.
    agent = Agent(
        PolicyGradient(Model(act_dim=action_dim), lr=LEARNING_RATE),
        obs_dim=observation_dim,
        act_dim=action_dim)

    # To resume from / inspect a saved checkpoint, uncomment:
    # if os.path.exists('./model.ckpt'):
    #     agent.restore('./model.ckpt')
    #     run_episode(env, agent, train_or_test='test', render=True)
    #     exit()

    for episode in range(301):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if episode % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                episode, sum(reward_list)))

        # One policy-gradient update on the full episode trajectory.
        agent.learn(np.array(obs_list),
                    np.array(action_list),
                    calc_reward_to_go(reward_list))

        if (episode + 1) % 300 == 0:
            total_reward = evaluate(env, agent, render=True)
            logger.info('Test reward: {}'.format(total_reward))
            # save the parameters to ./model.ckpt
            agent.save('./model.ckpt')
def main():
    """Train a policy-gradient agent on CartPole-v0.

    Runs 1000 on-policy episodes with one REINFORCE update each;
    every 100 episodes it evaluates with rendering and checkpoints
    the parameters to ./model.ckpt.
    """
    env = gym.make('CartPole-v0')
    # env = env.unwrapped # Cancel the minimum score limit
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the PARL framework: model -> algorithm -> agent.
    agent = Agent(
        PolicyGradient(Model(act_dim=act_dim), lr=LEARNING_RATE),
        obs_dim=obs_dim,
        act_dim=act_dim)

    # To resume from / inspect a saved checkpoint, uncomment:
    # if os.path.exists('./model.ckpt'):
    #     agent.restore('./model.ckpt')
    #     run_episode(env, agent, train_or_test='test', render=True)
    #     exit()

    for episode in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if episode % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                episode, sum(reward_list)))

        # One update per episode on the whole trajectory.
        agent.learn(np.array(obs_list),
                    np.array(action_list),
                    calc_reward_to_go(reward_list))

        if (episode + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=True)
            logger.info('Test reward: {}'.format(total_reward))
            # save the parameters to ./model.ckpt
            agent.save('./model.ckpt')
def main():
    """Train a policy-gradient agent on the custom bandwidth-allocation
    Environment, resuming from ./policy_grad_model.ckpt when present."""
    # Create the environment: three request-type distributions
    # (two deterministic, one 50/50 over bandwidths 2 and 8).
    dist1 = Distribution(id=0, vals=[2], probs=[1])
    dist2 = Distribution(id=1, vals=[5], probs=[1])
    dist3 = Distribution(id=2, vals=[2,8], probs=[0.5,0.5])
    env = Environment(total_bandwidth = 10,
                      distribution_list=[dist1,dist2,dist3],
                      mu_list=[1,2,3], lambda_list=[3,2,1],
                      num_of_each_type_distribution_list=[300,300,300])
    # env = gym.make('CartPole-v0')
    # env = env.unwrapped # Cancel the minimum score limit
    # obs_dim = env.observation_space.shape[0]
    # act_dim = env.action_space.n
    # Hard-coded dimensions — presumably match Environment's observation
    # vector and accept/reject action space; TODO confirm against Environment.
    obs_dim = 6
    act_dim = 2
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))
    # Build the agent with the PARL framework: model -> algorithm -> agent.
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)
    # Load a previously saved model, if any.
    if os.path.exists('./policy_grad_model.ckpt'):
        agent.restore('./policy_grad_model.ckpt')
        # run_episode(env, agent, train_or_test='test', render=True)
        # exit()
    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))
        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        # Discounted reward-to-go with gamma=0.9 (unlike the other
        # variants in this file, which use the helper's default).
        batch_reward = calc_reward_to_go(reward_list, gamma=0.9)
        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=True)
            logger.info('Test reward: {}'.format(total_reward))
            # save the parameters to ./policy_grad_model.ckpt
            agent.save('./policy_grad_model.ckpt')
def train(n_episodes=5000, max_t=1000, gamma=1.0):
    """Train a REINFORCE agent (PyTorch) on CartPole-v0.

    Args:
        n_episodes: maximum number of training episodes.
        max_t: step cap per episode.
        gamma: discount factor passed to agent.learn.

    Stops early (and saves checkpoint2.pth) once the 100-episode
    average score reaches 195.0.
    """
    env = gym.make('CartPole-v0')
    env.seed(0)  # fixed seed for reproducibility
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    model = Model(state_dim, action_dim, seed=1)
    alg = PolicyGradient(model, device=device, lr=LR)
    agent = Agent(alg, state_dim, action_dim, device)
    if LOAD_MODEL:
        # Resume from a previously saved checkpoint.
        agent.alg.model.load_state_dict(torch.load('checkpoint2.pth'))
    # Rolling window of the last 100 episode returns.
    scores_deque = deque(maxlen=100)
    for episode in range(1, n_episodes+1):
        current_state = env.reset()
        log_prob_list, reward_list = [], []
        for t in range(max_t):
            action, log_prob = agent.step(current_state)
            next_state, reward, done, _ = env.step(action)
            log_prob_list.append(log_prob)
            reward_list.append(reward)
            if episode % SHOW_EVERY == 0:
                env.render()  # only visualize every SHOW_EVERY-th episode
            if done:
                break
            else:
                current_state = next_state
        scores_deque.append(sum(reward_list))
        # One REINFORCE update per episode from stored log-probs/rewards.
        agent.learn(log_prob_list, reward_list, gamma)
        if episode % PRINT_EVERY == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores_deque)))
        # CartPole-v0 is considered solved at a 100-episode average of 195.
        if np.mean(scores_deque)>=195.0:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode-100, np.mean(scores_deque)))
            torch.save(agent.alg.model.state_dict(), 'checkpoint2.pth')
            break
    env.close()
def main():
    """Train a policy-gradient agent on the PLE Snake game,
    checkpointing whenever evaluation beats the best reward so far."""
    # Create the environment: 256x256 Snake wrapped in a PLE runner.
    game = Snake(width=256, height=256, init_length=10)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.reset_game()
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    # Flattened full-frame observation (one value per pixel).
    obs_dim = 256 * 256
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))
    # Build the agent with the PARL framework: model -> algorithm -> agent.
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)
    # # Load a previously saved model, if desired:
    # if os.path.exists('model_dir/pg_pong_episode_19.ckpt'):
    #     agent.restore('model_dir/pg_pong_episode_19.ckpt')
    best_total_reward = -float('inf')
    for i in range(50000):
        obs_list, action_list, reward_list = run_episode(p, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))
        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)
        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 50 == 0:
            total_reward = evaluate(p, agent, render=True)
            # Only checkpoint improvements over the best evaluation so far.
            if total_reward > best_total_reward:
                best_total_reward = total_reward
                agent.save(
                    'model_dir/pg_pong_episode_{}_reward_{}.ckpt'.format(
                        i, total_reward))
            # NOTE(review): indentation reconstructed — this log is assumed
            # to run on every evaluation, not only on improvements; confirm.
            logger.info('Test reward: {}'.format(total_reward))
def test():
    """Render 10 evaluation episodes of a trained policy on CartPole-v1.

    Loads weights from checkpoint2.pth and acts greedily via
    agent.predict; each episode is capped at 150 steps.
    """
    env = gym.make('CartPole-v1')
    env.seed(0)
    n_obs = env.observation_space.shape[0]
    n_act = env.action_space.n
    agent = Agent(
        PolicyGradient(Model(n_obs, n_act, seed=1), device=device, lr=LR),
        n_obs, n_act, device)
    # Restore the trained parameters.
    agent.alg.model.load_state_dict(torch.load('checkpoint2.pth'))
    for _episode in range(10):
        obs = env.reset()
        for _step in range(150):  # hard cap per episode
            obs, _reward, done, _ = env.step(agent.predict(obs))
            env.render()
            if done:
                break
    env.close()
# G_i = r_i + γ·G_i+1 reward_list[i] += gamma * reward_list[i + 1] # Gt return np.array(reward_list) if __name__ == '__main__': df = pd.read_csv('./data/data4.csv') env = CustomEnv(df) obs_dim = env.observation_space.shape[0] act_dim = env.action_space.n logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim)) # 根据parl框架构建agent model = Model(act_dim=act_dim) alg = PolicyGradient(model, lr=LEARNING_RATE) agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim) # for i in range(1000): # obs_list, action_list, reward_list = run_episode(env, agent) # if i % 10 == 0: # logger.info("Episode {}, Reward Sum {}.".format( # i, sum(reward_list))) # # batch_obs = np.array(obs_list) # batch_action = np.array(action_list) # batch_reward = calc_reward_to_go(reward_list) # # agent.learn(batch_obs, batch_action, batch_reward) # if (i + 1) % 100 == 0: # total_reward = evaluate(env, agent, render=True)
def main():
    """Train (or evaluate) a policy-gradient agent on the gym-anytrading
    'stocks-v0' environment, testing on a held-out frame window."""
    # Create the environments: frames [1800, 2150) are reserved for testing.
    env_test = gym.make('stocks-v0', frame_bound=(1800, 2150), window_size=10)
    # Hard-coded dimensions — presumably window_size(10) * 2 features and a
    # buy/sell action pair; TODO confirm against the env's observation space.
    obs_dim = 20
    act_dim = 2
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))
    # Build the agent with the PARL framework: model -> algorithm -> agent.
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)
    # Load a saved model:
    # agent.restore('./stock_pg_v1_2.ckpt')
    # NOTE(review): comment said "test directly?", yet test_flag == 1 takes
    # the TRAINING branch — flag semantics look inverted; confirm intent.
    test_flag = 1
    if (test_flag == 1):
        for i in range(5000):
            # Re-sample the training environment each episode: a random
            # 100-frame window drawn from the first ~1900 frames.
            start = np.random.randint(10, 1900)
            env_train = gym.make('stocks-v0',
                                 frame_bound=(start, start + 100),
                                 window_size=10)
            # Single fixed environment variant:
            # env_train = gym.make('stocks-v0', frame_bound=(10, 2000), window_size=10)
            obs_list, action_list, reward_list = run_episode(env_train, agent)
            if i % 50 == 0:
                logger.info("Train Episode {}, Reward Sum {}.".format(
                    i, sum(reward_list)))
            batch_obs = np.array(obs_list)
            batch_action = np.array(action_list)
            batch_reward = calc_reward_to_go(reward_list)
            cost = agent.learn(batch_obs, batch_action, batch_reward)
            if (i + 1) % 100 == 0:
                total_reward = evaluate(env_test, agent)
                logger.info('Episode {}, Test reward: {}'.format(
                    i + 1, total_reward))
                # Save the model every 100 episodes.
                ckpt = 'stock_pg_v2/steps_{}.ckpt'.format(i)
                agent.save(ckpt)
        # NOTE(review): placement reconstructed — the final render of the
        # test environment is assumed to happen once, after training.
        plt.cla()
        env_test.render_all()
        plt.show()
        # save the parameters to ./model.ckpt
        # agent.save('./stock_pg_v1_2.ckpt')
    else:
        # Evaluation-only path: restore the last checkpoint and render.
        agent.restore('./stock_pg_v2/steps_4899.ckpt')
        total_reward = evaluate(env_test, agent, render=True)
        logger.info('Test reward: {}'.format(total_reward))
        plt.cla()
        env_test.render_all()
        plt.show()