# Epsilon-greedy training loop: play games until arg.game_num is reached,
# storing transitions in the replay buffer and training periodically.
step = 0
while step < arg.game_num:
    obs = env.reset()
    done = 0
    total_reward = 0
    # Advance the agent's global step counter; epsilon anneals linearly with
    # that counter, floored at arg.epsilon_min.
    step = agent.step_move()
    epsilon = max(1 - step * arg.epsilon_decrease, arg.epsilon_min)

    # Roll out one full episode.
    while not done:
        explore = np.random.uniform(0, 1) < epsilon
        action = agent.random_action() if explore else agent.choose_action(obs)
        # +1 offset because only three of the env's actions are used here.
        obs_, reward, done, _ = env.step(action + 1)
        replay_buffer.store_transition(obs, obs_, action, reward, done)
        total_reward += reward
        obs = obs_

    print('in {}, {}th game: the reward {} '.format(
        arg.run_name, step, total_reward))

    # Train every `train_period` games; additionally log the reward and save a
    # checkpoint every `record_period` games (the flag is forwarded to train).
    if step % train_period == 0:
        s1, s2, a, r, d = replay_buffer.sample(batch_size=train_batch)
        record = step % record_period == 0
        loss = agent.train(s1, s2, a, r, d, record)
        if record:
            agent.log_reward(total_reward)
            agent.save()
        print('{}th game: the training loss {}'.format(step, loss))
# Policy-gradient data-collection pass: play one game, storing (frame, action)
# pairs until a nonzero reward ends a rally, then back-trace that reward
# through the stored transitions and archive the buffer.
# NOTE(review): assumes a Pong-like env where reward is 0 during play and
# nonzero (+1 win / negative loss) when a rally ends — confirm against env.
memory = []
total_reward = 0
obs = env.reset()
frame_processor = FrameProcessor(obs)
# Do a random action at the first frame (presumably so the frame processor has
# a previous frame to work from — TODO confirm against FrameProcessor).
obs, reward, done, _ = env.step(agent.random_action() + 1)
total_reward += reward
# play one game
while not done:
    input_frame = frame_processor.process(obs)
    prob = agent.get_action_prob(input_frame)
    # Sample one of 3 actions from the policy distribution; the env expects a
    # +1 offset on the action index (only three actions are used).
    action = np.random.choice(3, p=prob)
    obs, reward, done, _ = env.step(action + 1)
    if reward == 0:
        # Mid-rally: just record the transition; credit is assigned later.
        replay_buffer.store_transition(input_frame, action)
    else:
        total_reward += reward
        if reward == 1:
            # Rally won: back-trace with factor 1 (presumably an undecayed
            # credit assignment — verify back_trace_reward's semantics).
            replay_buffer.back_trace_reward(reward, 1)
        else:
            # Rally lost: back-trace with factor 0.9 (presumably decayed
            # credit — verify back_trace_reward's semantics).
            replay_buffer.back_trace_reward(reward, 0.9)
        # Archive the finished rally's buffer and start a fresh one for the
        # next rally.
        memory.append(replay_buffer)
        replay_buffer = ReplayBuffer(input_shape=[160, 160], start_size=32,
                                     max_size=10000000)
# train in the memory
step = agent.step_move()
loss = 0
for j in range(len(memory)):
    # (loop body continues beyond this chunk of the file)