Example #1
def main():
    dist1 = Distribution(id=0, vals=[2], probs=[1])
    dist2 = Distribution(id=1, vals=[5], probs=[1])
    dist3 = Distribution(id=2, vals=[2, 8], probs=[0.5, 0.5])

    env = Environment(total_bandwidth=10,
                      distribution_list=[dist1, dist2, dist3],
                      mu_list=[1, 2, 3], lambda_list=[3, 2, 1],
                      num_of_each_type_distribution_list=[300, 300, 300])

    action_dim = 2
    state_dim = 6
    rpm = DQNPytorchReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN

    # build the agent with the PARL framework
    model = DQNPtorchModel(state_dim=state_dim, act_dim=action_dim)
    algorithm = DQNPytorchAlg(model,
                              act_dim=action_dim,
                              gamma=GAMMA,
                              lr=LEARNING_RATE)
    agent = DQNPytorchAgent(
        algorithm,
        obs_dim=state_dim,
        act_dim=action_dim,
        e_greed=0.1,  # probability of taking a random action (exploration)
        e_greed_decrement=1e-6)  # gradually reduce exploration as training converges

    # load a previously saved model (optional)
    # save_path = './dqn_model.ckpt'
    # agent.restore(save_path)

    # pre-fill the replay buffer so the first training batches have enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, rpm)

    max_episode = 2000

    # start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(env, agent, rpm)
            episode += 1

        # test part
        eval_reward, num_accept = evaluate(env, agent)  # render=True to visualize
        print(
            f'episode {episode}: evaluate reward {eval_reward}, num of accept: {num_accept}'
        )

    # training finished; save the model
    save_path = './dqn_pytorch_model.ckpt'
    agent.save(save_path)
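
A minimal sketch of the run_episode helper used above, which this example does not show. The method names agent.sample, agent.learn, rpm.append, rpm.sample, the LEARN_FREQ constant, and the gym-style env.reset / env.step interface are assumptions made for illustration, not taken from the original code.

def run_episode(env, agent, rpm):
    """Roll out one episode, store transitions, and train once the buffer is warm."""
    total_reward = 0
    obs = env.reset()
    step = 0
    while True:
        step += 1
        action = agent.sample(obs)  # epsilon-greedy action selection
        next_obs, reward, done, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))
        # learn only after the warm-up phase, every LEARN_FREQ steps
        if len(rpm) > MEMORY_WARMUP_SIZE and step % LEARN_FREQ == 0:
            batch = rpm.sample(BATCH_SIZE)
            agent.learn(*batch)
        total_reward += reward
        obs = next_obs
        if done:
            break
    return total_reward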
Example #2
def main():
    # create environment
    dist1 = Distribution(id=0, vals=[2], probs=[1])
    dist2 = Distribution(id=1, vals=[5], probs=[1])
    dist3 = Distribution(id=2, vals=[2, 8], probs=[0.5, 0.5])

    env = Environment(total_bandwidth=10,
                      distribution_list=[dist1, dist2, dist3],
                      mu_list=[1, 2, 3], lambda_list=[3, 2, 1],
                      num_of_each_type_distribution_list=[300, 300, 300])
    # env = gym.make('CartPole-v0')
    # env = env.unwrapped # Cancel the minimum score limit
    # obs_dim = env.observation_space.shape[0]
    # act_dim = env.action_space.n
    obs_dim = 6
    act_dim = 2
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # load a saved model if one exists
    if os.path.exists('./policy_grad_model.ckpt'):
        agent.restore('./policy_grad_model.ckpt')
        # run_episode(env, agent, train_or_test='test', render=True)
        # exit()

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list, gamma=0.9)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=True)
            logger.info('Test reward: {}'.format(total_reward))

    # save the parameters to ./policy_grad_model.ckpt
    agent.save('./policy_grad_model.ckpt')
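
The calc_reward_to_go helper called above with gamma=0.9 is not shown in this example. A minimal sketch of the standard discounted reward-to-go it presumably computes is given below; the implementation details are an assumption.

import numpy as np

def calc_reward_to_go(reward_list, gamma=0.9):
    """Return G_t = r_t + gamma * G_{t+1}, computed backwards over the episode."""
    reward_to_go = np.zeros(len(reward_list), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(reward_list))):
        running = reward_list[t] + gamma * running
        reward_to_go[t] = running
    return reward_to_go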
Example #3
import torch
from torch.utils.data import Dataset

NUM_ITER = 1000
# number of trajectories simulated per experiment
NUM_EPISODE = 1
# number of optimization epochs per experiment
NUM_EPOCH = 1
BATCH_SIZE = 899
BETA = 1
EPSILON = 0.1

# create environment
dist1 = Distribution(id=0, vals=[2], probs=[1])
dist2 = Distribution(id=1, vals=[5], probs=[1])
dist3 = Distribution(id=2, vals=[2, 8], probs=[0.5, 0.5])

env = Environment(total_bandwidth=10,
                  distribution_list=[dist1, dist2, dist3],
                  mu_list=[1, 2, 3], lambda_list=[3, 2, 1],
                  num_of_each_type_distribution_list=[300, 300, 300])
evaluation = Evaluation()

class PPODataset(Dataset):
    def __init__(self, obs_list, action_list, advantage_list):
        self.obs_list = torch.cat(obs_list, 0)
        self.action_list = torch.tensor(action_list, dtype=torch.int64)
        self.advantage_list = torch.tensor(advantage_list, dtype=torch.float32)

    def __getitem__(self, index):
        return self.obs_list[index, :], self.action_list[index], self.advantage_list[index]
    
    def __len__(self):
        return self.obs_list.shape[0]
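
A PPODataset built from rollout buffers can be consumed with a standard torch.utils.data.DataLoader. A minimal usage sketch follows; the obs_list, action_list and advantage_list variables stand for rollout data collected elsewhere and are placeholders, not part of the original code.

from torch.utils.data import DataLoader

# hypothetical rollout buffers collected by the sampling loop
dataset = PPODataset(obs_list, action_list, advantage_list)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

for epoch in range(NUM_EPOCH):
    for batch_obs, batch_action, batch_advantage in loader:
        # one PPO update step on this minibatch would go here
        pass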
Example #4
        return throughput, packet_loss_rate


if __name__ == "__main__":

    cf = configparser.ConfigParser()
    cf.read('config.ini', encoding='utf8')
    learning_rate = cf.getfloat('dqn', 'learning_rate')
    reward_decay = cf.getfloat('dqn', 'reward_decay')
    epsilon_max = cf.getfloat('dqn', 'epsilon_max')
    e_greedy_increment = cf.getfloat('dqn', 'e_greedy_increment')
    replay_capacity = cf.getint('dqn', 'replay_capacity')
    target_update_iter = cf.getint('dqn', 'target_update_iter')
    min_replay_history = cf.getfloat('dqn', 'min_replay_history')

    env = Environment()

    RL = DQNPrioritizedReplay_Dueling(
        n_actions=env.n_actions,
        n_features=env.n_features,
        learning_rate=learning_rate,
        reward_decay=reward_decay,
        e_greedy=epsilon_max,
        replace_target_iter=target_update_iter,
        memory_size=replay_capacity,
        e_greedy_increment=e_greedy_increment,
        prioritized=True,
        dueling=True,
        n_neurons=100,
        # output_graph=True,
    )
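
The keys read above imply a [dqn] section in config.ini. A minimal sketch that writes such a file with the standard configparser API is shown below; the concrete values are illustrative assumptions, not taken from the source.

import configparser

# values here are placeholders; the key names match those read by the script above
cf = configparser.ConfigParser()
cf['dqn'] = {
    'learning_rate': '0.001',
    'reward_decay': '0.9',
    'epsilon_max': '0.9',
    'e_greedy_increment': '0.0001',
    'replay_capacity': '10000',
    'target_update_iter': '200',
    'min_replay_history': '500',
}
with open('config.ini', 'w', encoding='utf8') as f:
    cf.write(f)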
Example #5
def main():
    # create environment
    dist1 = Distribution(id=0, vals=[2], probs=[1])
    dist2 = Distribution(id=1, vals=[5], probs=[1])
    dist3 = Distribution(id=2, vals=[2, 8], probs=[0.5, 0.5])

    env = Environment(total_bandwidth=10,
                      distribution_list=[dist1, dist2, dist3],
                      mu_list=[1, 2, 3], lambda_list=[3, 2, 1],
                      num_of_each_type_distribution_list=[300, 300, 300])
    evaluation = Evaluation()
    obs_dim = 6
    act_dim = 2
    # logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # build the agent with the PARL framework
    model = PGTorchModel(state_dim=obs_dim, act_dim=act_dim)
    alg = PGTorchAlgorithm(model, lr=LEARNING_RATE)
    agent = PGTorchAgent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # load a saved model if one exists
    if os.path.exists('./pg_torch_model'):
        agent.restore('./pg_torch_model')
    writer = SummaryWriter()

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        writer.add_scalars('Reward/train', {
            'train_reward': sum(reward_list) / len(reward_list),
            'reject when full': evaluation.reject_when_full_avg_reward,
            'always accept': evaluation.always_accept_avg_reward,
            'always reject': evaluation.always_reject_avg_reward}, i)
        # writer.add_scalar('Reward/train', evaluation.always_reject_avg_reward, i)

        if i % 10 == 0:
            # logger.info("Episode {}, Reward Sum {}.".format(
            #     i, sum(reward_list)))
            print("Episode {}, Average Reward {}.".format(
                i,
                sum(reward_list) / len(reward_list)))
        batch_obs = torch.from_numpy(np.array(obs_list))

        batch_action = torch.from_numpy(np.array(action_list))
        batch_reward = torch.from_numpy(
            calc_reward_to_go(reward_list, gamma=0.9))

        loss = agent.learn(batch_obs, batch_action, batch_reward)
        writer.add_scalar('Loss/train', loss, i)
        if (i + 1) % 100 == 0:
            avg_reward, avg_acc_rate = evaluation.evaluate(agent)
            writer.add_scalars('reward Test', {
                'test reward': avg_reward,
                'reject when full': evaluation.reject_when_full_avg_reward,
                'always accept': evaluation.always_accept_avg_reward,
                'always reject': evaluation.always_reject_avg_reward}, i)
            writer.add_scalars('accept rate Test', {
                'test rate': avg_acc_rate,
                'reject when full': evaluation.reject_when_full_avg_acc_rate,
                'always accept': evaluation.always_accept_avg_acc_rate,
                'always reject': evaluation.always_reject_avg_acc_rate}, i)
            print('avg_reward', avg_reward, 'avg_acc_rate', avg_acc_rate,
                  'base', evaluation.reject_when_full_avg_reward)

    writer.close()
    # save the parameters to ./pg_torch_model
    agent.save('./pg_torch_model')
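
Assuming the default SummaryWriter log directory (a runs/ folder next to the script), the training and evaluation curves written above can be inspected with:

tensorboard --logdir runs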