Example #1
    def evaluate(
        self,
        obs_queue: deque,
        agent: Agent,
        num_episode: int = 3,
        render: bool = False,
    ) -> Tuple[float, List[GymImg]]:
        """evaluate uses the given agent to run the game for a few episodes and
        returns the average reward and the captured frames."""
        self.__env = self.__env_eval
        ep_rewards = []
        frames = []
        for _ in range(self.get_eval_lives() * num_episode):
            observations, ep_reward, _frames = self.reset(render=render)
            for obs in observations:
                obs_queue.append(obs)
            if render:
                frames.extend(_frames)
            done = False

            while not done:
                state = self.make_state(obs_queue).to(self.__device).float()
                action = agent.run(state)
                obs, reward, done = self.step(action)

                ep_reward += reward
                obs_queue.append(obs)
                if render:
                    frames.append(self.get_frame())

            ep_rewards.append(ep_reward)

        self.__env = self.__env_train
        return np.sum(ep_rewards) / num_episode, frames
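# A minimal usage sketch, not taken from any example above: how evaluate() is
# called.  Note that the loop runs get_eval_lives() * num_episode one-life
# rounds but divides the summed reward by num_episode, so the returned value is
# the average reward per full game.  env and agent are assumed to be set up as
# in the later examples.
from collections import deque

obs_queue = deque(maxlen=5)
avg_reward, frames = env.evaluate(obs_queue, agent, num_episode=3, render=False)
print(f"Avg. Reward: {avg_reward:.1f}")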
Example #2
    def evaluate(
        self,
        obs_queue: deque,
        agent: Agent,
        num_episode: int = 3,
        render: bool = False,
    ) -> Tuple[float, List[GymImg]]:
        # Uses the given agent to play a few rounds (3 games of 5 lives each)
        # and returns the average reward and the captured frames (the frames
        # list stays empty unless render=True).
        """evaluate uses the given agent to run the game for a few episodes and
        returns the average reward and the captured frames."""
        self.__env = self.__env_eval
        ep_rewards = []
        frames = []
        for _ in range(self.get_eval_lives() * num_episode):
            observations, ep_reward, _frames = self.reset(
                render=render)  # reset the evaluation environment
            for obs in observations:
                obs_queue.append(obs)
            if render:
                frames.extend(_frames)
            done = False

            while not done:  # run one evaluation episode
                state = self.make_state(obs_queue).to(self.__device).float()
                action = agent.run(state)  # ask the agent for the next action
                obs, reward, done = self.step(action)  # next observation, reward, and done flag

                ep_reward += reward
                obs_queue.append(obs)
                if render:
                    frames.append(self.get_frame())

            ep_rewards.append(ep_reward)  # record the episode reward

        self.__env = self.__env_train
        return np.sum(ep_rewards) / num_episode, frames  # return the average reward per episode
Example #3
MAX_STEPS = 5_000_000
EVALUATE_FREQ = 100_000

rand = random.Random()
rand.seed(GLOBAL_SEED)
new_seed = lambda: rand.randint(0, 1000_000)
os.mkdir(SAVE_PREFIX)  # directory for saving trained models

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)  # create the environment used to run the Atari game
agent = Agent(  # create the agent
    env.get_action_dim(),  # number of actions in the game: three in total (left, right, and no-op)
    device,  # device used for training
    GAMMA,
    new_seed(),
    EPS_START,  # initial value of epsilon
    EPS_END,  # minimum value of epsilon
    EPS_DECAY,  # epsilon decay rate
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE,
                      device)  # stores the links between the agent's actions and their outcomes, used later to train the network

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS),
                   total=MAX_STEPS,
                   ncols=50,
                   leave=False,
Example #4
rand.seed(GLOBAL_SEED)
new_seed = lambda: rand.randint(0, 1000_000)
if not os.path.exists(SAVE_PREFIX):
    os.mkdir(SAVE_PREFIX)

torch.manual_seed(new_seed())
# The number of threads here needs to be adjusted based on the number of CPU cores available
torch.set_num_threads(4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)
agent = Agent(
    env.get_action_dim(),
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
    restore=restore,
    rlmodel=rlmodel,
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS),
                   total=MAX_STEPS,
                   ncols=50,
                   leave=False,
Example #5
EVALUATE_FREQ = 100_00  # 10_000

rand = random.Random()
rand.seed(GLOBAL_SEED)
new_seed = lambda: rand.randint(0, 1000_000)
os.mkdir(SAVE_PREFIX)  # create the directory for saving models

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
env = MyEnv(device)  # environment
agent = Agent(  # agent
    env.get_action_dim(),  # 3
    device,  # cuda
    GAMMA,  # 0.99
    new_seed(),
    EPS_START,  # 1
    EPS_END,  # 0.1
    EPS_DECAY,  # 1e6
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)  # initialize the replay buffer

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS),
                   total=MAX_STEPS,
                   ncols=50,
                   leave=False,
                   unit="b")  # 可视化进度条
Example #6
MAX_STEPS = 50_000_000
EVALUATE_FREQ = 100_000

rand = random.Random()
rand.seed(GLOBAL_SEED)
new_seed = lambda: rand.randint(0, 1000_000)
os.mkdir(SAVE_PREFIX)

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)
agent = Agent(
    env.get_action_dim(),
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
)
# memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)
memory = Experience({
    'size': MEM_SIZE,
    'batch_size': BATCH_SIZE,
    'learn_start': WARM_STEPS,
    'steps': MAX_STEPS,
    'device': device,
    'channels': STACK_SIZE + 1
})

#### Training ####
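# Hedged sketch of the replay-buffer interface the examples construct with
# ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device) — channels, capacity, device —
# or, as in the example above, pass to Experience as a config dict.  The push()
# and sample() methods below are assumptions made to illustrate the data
# structure; they are not the authors' implementation.
import random
from collections import deque

import torch


class SimpleReplayMemory:
    """Uniform-sampling replay buffer with a (channels, capacity, device) constructor."""

    def __init__(self, channels: int, capacity: int, device) -> None:
        self.channels = channels  # frames stacked per stored transition (STACK_SIZE + 1)
        self.device = device
        self.buffer = deque(maxlen=capacity)

    def push(self, folded_state, action, reward, done) -> None:
        # folded_state: tensor of shape (channels, H, W) holding the frame stack
        self.buffer.append((folded_state.cpu(), action, reward, done))

    def sample(self, batch_size: int):
        idxs = random.sample(range(len(self.buffer)), batch_size)
        states, actions, rewards, dones = zip(*(self.buffer[i] for i in idxs))
        return (torch.stack(states).to(self.device),
                torch.tensor(actions, device=self.device),
                torch.tensor(rewards, device=self.device),
                torch.tensor(dones, device=self.device))

    def __len__(self) -> int:
        return len(self.buffer)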
Example #7
        print('current lab:', ve, 'shade rate:', get_shade_time(action_queue))


if __name__ == "__main__":

    versions = vs0
    for version in versions:
        #set_trace()
        print(version)
        dueling = False if version.find('dueling') == -1 else True
        stable = False if version.find('stable') == -1 else True
        if stable:
            action_queue = []
        env = MyEnv(device)
        agent = Agent(env.get_action_dim(), device, GAMMA, new_seed(),
                      EPS_START, EPS_END, EPS_DECAY, dueling, pretrained,
                      stable * 0.1)
        if version.find('PER') != -1:
            memory = PERMemory(STACK_SIZE + 1, MEM_SIZE, device)
            #memory = Memory_Buffer_PER(MEM_SIZE)
        else:
            memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)
            #memory = Memory_Buffer_PER(MEM_SIZE)

        #### Training ####
        obs_queue: deque = deque(maxlen=5)
        done = True

        avg_reward_arr = []

        progressive = tqdm(range(MAX_STEPS),
Example #8
def choosememory(c):
    if c == 0:
        return ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)
    else:
        return PERMemory(STACK_SIZE + 1, MEM_SIZE, device)


if __name__ == "__main__":
    REWARD = []
    global memory
    c = 1
    while (c >= 0):
        agent = Agent(
            env.get_action_dim(),
            device,
            GAMMA,
            new_seed(),
            EPS_START,
            EPS_END,
            EPS_DECAY,
            "./model_weights_b"  ####*************
        )
        memory = choosememory(c)
        Reward = []
        for step in tqdm(range(MAX_STEPS), ncols=50, leave=False, unit="b"):
            if done:
                observations, _, _ = env.reset()
                for obs in observations:
                    obs_queue.append(obs)

            training = len(memory) > WARM_STEPS
            state = env.make_state(obs_queue).to(device).float()
            action, value_this = agent.run(state, training)  # the chosen action and its Q-value
Example #9
from utils_env import MyEnv
from utils_drl import Agent

# In[2]:

target = 78
model_name = f"model_{target:03d}"
model_path = f"./models/{model_name}"
device = torch.device("cpu")
env = MyEnv(device)
agent = Agent(env.get_action_dim(),
              device,
              0.99,
              0,
              0,
              0,
              1,
              model_path,
              use_dueling=True,
              use_PR=True,
              use_DDQN=True)

# In[3]:

obs_queue = deque(maxlen=5)
avg_reward, frames = env.evaluate(obs_queue, agent, render=True)
print(f"Avg. Reward: {avg_reward:.1f}")

get_ipython().system('rm -r eval_*')
target_dir = f"eval_{target:03d}"
os.mkdir(target_dir)
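# Hedged continuation sketch: the snippet above creates target_dir but the
# frame-saving loop is cut off here.  Assuming the frames returned by
# env.evaluate() behave like PIL images (the GymImg type suggests this, but the
# snippet does not show it), they could be written out as follows.
for idx, frame in enumerate(frames):
    frame.save(os.path.join(target_dir, f"{idx:06d}.png"))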
Example #10
EVALUATE_FREQ = 100_000  # evaluation frequency: pause to evaluate every 100_000 steps

rand = random.Random()  # independent random number generator
rand.seed(GLOBAL_SEED)  # seed it so the same random sequence is reproduced each run
new_seed = lambda: rand.randint(0, 1000_000)  # draw a seed in [0, 1_000_000]
os.mkdir(SAVE_PREFIX)  # create the "./models" directory

torch.manual_seed(new_seed())  # seed torch's CPU random number generator
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # pick GPU if available, otherwise CPU
device = torch.device("cpu")
env = MyEnv(device)
agent = Agent(  # initialized with the preset hyperparameters
    env.get_action_dim(),  # returns 3; the three actions are ["NOOP", "RIGHT", "LEFT"]
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE,
                      device)  # circular buffer; the arguments are channels, capacity, and device, with capacity MEM_SIZE = 100_000

#### Training ####
obs_queue: deque = deque(maxlen=5)  # observation queue
done = True

progressive = tqdm(
    range(MAX_STEPS),
    total=MAX_STEPS,  # expected number of iterations
    ncols=50,  # width of the progress bar
Example #11
MAX_STEPS = 50_000_000
EVALUATE_FREQ = 100_000

rand = random.Random()
rand.seed(GLOBAL_SEED)
new_seed = lambda: rand.randint(0, 1000_000)
os.mkdir(SAVE_PREFIX)

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)
agent = Agent(
    env.get_action_dim(),
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
)
memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True

progressive = tqdm(range(MAX_STEPS), total=MAX_STEPS,
                   ncols=50, leave=False, unit="b")
for step in progressive:
    if done:
        observations, _, _ = env.reset()
Example #12
MAX_STEPS = 10_000_0
EVALUATE_FREQ = 100_0

rand = random.Random()
rand.seed(GLOBAL_SEED)
new_seed = lambda: rand.randint(0, 1000_000)
#os.mkdir(SAVE_PREFIX)

torch.manual_seed(new_seed())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = MyEnv(device)
agent = Agent(
    env.get_action_dim(),
    device,
    GAMMA,
    new_seed(),
    EPS_START,
    EPS_END,
    EPS_DECAY,
)

prioritized = True
if prioritized:
    memory = MemoryBufferPER(STACK_SIZE + 1, MEM_SIZE, device)
else:
    memory = ReplayMemory(STACK_SIZE + 1, MEM_SIZE, device)

#### Training ####
obs_queue: deque = deque(maxlen=5)
done = True
episode_reward = 0
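# Hedged sketch of how the training loop typically continues from the setup
# above: reset when an episode ends, pick an action, store the transition, and
# accumulate episode_reward.  env.reset / env.make_state / env.step and
# agent.run(state, training) match calls shown in the earlier examples (in
# Example #8 agent.run also returns the Q-value); memory.push(...) is an
# assumed interface and WARM_STEPS is a constant taken from those examples.
for step in tqdm(range(MAX_STEPS), ncols=50, leave=False, unit="b"):
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)
        episode_reward = 0

    training = len(memory) > WARM_STEPS
    state = env.make_state(obs_queue).to(device).float()
    action = agent.run(state, training)
    obs, reward, done = env.step(action)
    obs_queue.append(obs)
    episode_reward += reward
    memory.push(state, action, reward, done)  # assumed signature, not from the examples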