def main():
    # create environment
    dist1 = Distribution(id=0, vals=[2], probs=[1])
    dist2 = Distribution(id=1, vals=[5], probs=[1])
    dist3 = Distribution(id=2, vals=[2, 8], probs=[0.5, 0.5])
    env = Environment(total_bandwidth=10,
                      distribution_list=[dist1, dist2, dist3],
                      mu_list=[1, 2, 3], lambda_list=[3, 2, 1],
                      num_of_each_type_distribution_list=[300, 300, 300])
    action_dim = 2
    state_dim = 6

    rpm = DQNPytorchReplayMemory(MEMORY_SIZE)  # DQN experience replay buffer

    # Build the agent following the PARL model / algorithm / agent structure
    model = DQNPtorchModel(state_dim=state_dim, act_dim=action_dim)
    algorithm = DQNPytorchAlg(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = DQNPytorchAgent(
        algorithm,
        obs_dim=state_dim,
        act_dim=action_dim,
        e_greed=0.1,  # probability of taking a random action (exploration)
        e_greed_decrement=1e-6)  # gradually reduce exploration as training converges

    # Load a previously saved model
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    # Warm up the replay buffer so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 2000

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; evaluation episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        eval_reward, num_accept = evaluate(env, agent)  # render=True to visualize
        print(
            f'episode {episode}: evaluate reward {eval_reward}, num of accept: {num_accept}'
        )

    # Training finished; save the model
    save_path = './dqn_pytorch_model.ckpt'
    agent.save(save_path)
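# run_episode and evaluate are referenced above but not shown. The sketch below is a
# minimal, assumed implementation in the usual PARL-style DQN loop: the method names
# (agent.sample, agent.learn, rpm.append, rpm.sample_batch) and the constants
# LEARN_FREQ / BATCH_SIZE are assumptions, not confirmed project API.
def run_episode(env, agent, rpm):
    """Roll out one episode with epsilon-greedy actions and learn from replayed batches."""
    total_reward, step = 0, 0
    obs = env.reset()
    while True:
        step += 1
        action = agent.sample(obs)                        # epsilon-greedy action
        next_obs, reward, done, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))

        # train every LEARN_FREQ steps once the warm-up phase is over
        if len(rpm) > MEMORY_WARMUP_SIZE and step % LEARN_FREQ == 0:
            batch = rpm.sample_batch(BATCH_SIZE)
            agent.learn(*batch)

        total_reward += reward
        obs = next_obs
        if done:
            break
    return total_reward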
def main():
    # create environment
    dist1 = Distribution(id=0, vals=[2], probs=[1])
    dist2 = Distribution(id=1, vals=[5], probs=[1])
    dist3 = Distribution(id=2, vals=[2, 8], probs=[0.5, 0.5])
    env = Environment(total_bandwidth=10,
                      distribution_list=[dist1, dist2, dist3],
                      mu_list=[1, 2, 3], lambda_list=[3, 2, 1],
                      num_of_each_type_distribution_list=[300, 300, 300])
    # env = gym.make('CartPole-v0')
    # env = env.unwrapped  # remove the episode length cap
    # obs_dim = env.observation_space.shape[0]
    # act_dim = env.action_space.n
    obs_dim = 6
    act_dim = 2
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent based on the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load a previously saved model
    if os.path.exists('./policy_grad_model.ckpt'):
        agent.restore('./policy_grad_model.ckpt')
        # run_episode(env, agent, train_or_test='test', render=True)
        # exit()

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list, gamma=0.9)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=True)
            logger.info('Test reward: {}'.format(total_reward))

    # save the parameters to ./policy_grad_model.ckpt
    agent.save('./policy_grad_model.ckpt')
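# calc_reward_to_go is used above but not defined here. A minimal sketch of the
# standard discounted return-to-go for REINFORCE-style policy gradients follows;
# the normalization step is a common stabilizer and an assumption about the
# project's actual implementation.
import numpy as np

def calc_reward_to_go(reward_list, gamma=0.9):
    """G_t = r_t + gamma * G_{t+1}, computed backwards over one episode."""
    returns = np.zeros(len(reward_list), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(reward_list))):
        running = reward_list[t] + gamma * running
        returns[t] = running
    # normalize to zero mean / unit variance (assumed; often stabilizes training)
    return (returns - returns.mean()) / (returns.std() + 1e-8)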
import torch
from torch.utils.data import Dataset

NUM_ITER = 1000  # for each experiment, simulate 100 trajectories.
NUM_EPISODE = 1  # for each experiment, run num_epoch training epochs
NUM_EPOCH = 1
BATCH_SIZE = 899
BETA = 1
EPSILON = 0.1

# create environment
dist1 = Distribution(id=0, vals=[2], probs=[1])
dist2 = Distribution(id=1, vals=[5], probs=[1])
dist3 = Distribution(id=2, vals=[2, 8], probs=[0.5, 0.5])
env = Environment(total_bandwidth=10,
                  distribution_list=[dist1, dist2, dist3],
                  mu_list=[1, 2, 3], lambda_list=[3, 2, 1],
                  num_of_each_type_distribution_list=[300, 300, 300])
evaluation = Evaluation()


class PPODataset(Dataset):
    """One batch of (observation, action, advantage) samples for PPO mini-batch updates."""

    def __init__(self, obs_list, action_list, advantage_list):
        self.obs_list = torch.cat(obs_list, 0)
        self.action_list = torch.tensor(action_list, dtype=torch.int64)
        self.advantage_list = torch.tensor(advantage_list, dtype=torch.float32)

    def __getitem__(self, index):
        return self.obs_list[index, :], self.action_list[index], self.advantage_list[index]

    def __len__(self):
        return self.obs_list.shape[0]
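# Usage sketch for PPODataset: wrap one iteration's collected trajectories in a
# DataLoader and run NUM_EPOCH passes of mini-batch updates. The agent.learn call
# (assumed to perform the clipped-surrogate PPO update) and the helper name
# ppo_update are assumptions about the surrounding training loop, which is not
# shown above.
from torch.utils.data import DataLoader

def ppo_update(agent, obs_list, action_list, advantage_list):
    dataset = PPODataset(obs_list, action_list, advantage_list)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    for _ in range(NUM_EPOCH):
        for batch_obs, batch_action, batch_adv in loader:
            agent.learn(batch_obs, batch_action, batch_adv)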
    return throughput, packet_loss_rate


if __name__ == "__main__":
    cf = configparser.ConfigParser()
    cf.read('config.ini', encoding='utf8')
    learning_rate = cf.getfloat('dqn', 'learning_rate')
    reward_decay = cf.getfloat('dqn', 'reward_decay')
    epsilon_max = cf.getfloat('dqn', 'epsilon_max')
    e_greedy_increment = cf.getfloat('dqn', 'e_greedy_increment')
    replay_capacity = cf.getint('dqn', 'replay_capacity')
    target_update_iter = cf.getint('dqn', 'target_update_iter')
    min_replay_history = cf.getfloat('dqn', 'min_replay_history')

    env = Environment()
    RL = DQNPrioritizedReplay_Dueling(
        n_actions=env.n_actions,
        n_features=env.n_features,
        learning_rate=learning_rate,
        reward_decay=reward_decay,
        e_greedy=epsilon_max,
        replace_target_iter=target_update_iter,
        memory_size=replay_capacity,
        e_greedy_increment=e_greedy_increment,
        prioritized=True,
        dueling=True,
        n_neurons=100,
        # output_graph=True,
    )
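# The code above expects a [dqn] section in config.ini with the keys read via
# configparser. The values below are illustrative placeholders, not the project's
# actual configuration:
#
#   [dqn]
#   learning_rate = 0.001
#   reward_decay = 0.9
#   epsilon_max = 0.9
#   e_greedy_increment = 0.0001
#   replay_capacity = 10000
#   target_update_iter = 200
#   min_replay_history = 500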
def main():
    # create environment
    dist1 = Distribution(id=0, vals=[2], probs=[1])
    dist2 = Distribution(id=1, vals=[5], probs=[1])
    dist3 = Distribution(id=2, vals=[2, 8], probs=[0.5, 0.5])
    env = Environment(total_bandwidth=10,
                      distribution_list=[dist1, dist2, dist3],
                      mu_list=[1, 2, 3], lambda_list=[3, 2, 1],
                      num_of_each_type_distribution_list=[300, 300, 300])
    evaluation = Evaluation()
    obs_dim = 6
    act_dim = 2
    # logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent following the PARL model / algorithm / agent structure
    model = PGTorchModel(state_dim=obs_dim, act_dim=act_dim)
    alg = PGTorchAlgorithm(model, lr=LEARNING_RATE)
    agent = PGTorchAgent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load a previously saved model
    if os.path.exists('./pg_torch_model'):
        agent.restore('./pg_torch_model')

    writer = SummaryWriter()
    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        writer.add_scalars('Reward/train',
                           {'train_reward': sum(reward_list) / len(reward_list),
                            'reject when full': evaluation.reject_when_full_avg_reward,
                            'always accept': evaluation.always_accept_avg_reward,
                            'always reject': evaluation.always_reject_avg_reward}, i)
        # writer.add_scalar('Reward/train', evaluation.always_reject_avg_reward, i)
        if i % 10 == 0:
            # logger.info("Episode {}, Reward Sum {}.".format(
            #     i, sum(reward_list)))
            print("Episode {}, Average Reward {}.".format(
                i, sum(reward_list) / len(reward_list)))

        batch_obs = torch.from_numpy(np.array(obs_list))
        batch_action = torch.from_numpy(np.array(action_list))
        batch_reward = torch.from_numpy(
            calc_reward_to_go(reward_list, gamma=0.9))

        loss = agent.learn(batch_obs, batch_action, batch_reward)
        writer.add_scalar('Loss/train', loss, i)
        if (i + 1) % 100 == 0:
            avg_reward, avg_acc_rate = evaluation.evaluate(agent)
            writer.add_scalars('reward Test',
                               {'test reward': avg_reward,
                                'reject when full': evaluation.reject_when_full_avg_reward,
                                'always accept': evaluation.always_accept_avg_reward,
                                'always reject': evaluation.always_reject_avg_reward}, i)
            writer.add_scalars('accept rate Test',
                               {'test rate': avg_acc_rate,
                                'reject when full': evaluation.reject_when_full_avg_acc_rate,
                                'always accept': evaluation.always_accept_avg_acc_rate,
                                'always reject': evaluation.always_reject_avg_acc_rate}, i)
            print('avg_reward', avg_reward, 'avg_acc_rate', avg_acc_rate,
                  'baseline (reject when full)', evaluation.reject_when_full_avg_reward)

    writer.close()
    # save the parameters to ./pg_torch_model
    agent.save('./pg_torch_model')
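# Evaluation.evaluate is called above but not shown. A minimal sketch of what that
# method of the Evaluation class is assumed to do: roll out several greedy
# (no-exploration) episodes and report the average per-step reward and the fraction
# of accepted requests. The names self.env, agent.predict, and the convention
# "action 1 = accept" are assumptions, not confirmed project API.
def evaluate(self, agent, num_episodes=10):
    """Average per-step reward and accept rate over greedy rollouts."""
    avg_rewards, acc_rates = [], []
    for _ in range(num_episodes):
        obs = self.env.reset()
        rewards, accepted, requests = [], 0, 0
        done = False
        while not done:
            action = agent.predict(obs)          # greedy action, no exploration
            obs, reward, done, _ = self.env.step(action)
            rewards.append(reward)
            requests += 1
            accepted += int(action == 1)
        avg_rewards.append(sum(rewards) / len(rewards))
        acc_rates.append(accepted / requests)
    return sum(avg_rewards) / len(avg_rewards), sum(acc_rates) / len(acc_rates)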