def collect_stats(agent: DQNAgent, n_games=1000):
    """Play n_games greedy games and report average snake length and loop rate."""
    MAX_STEPS = 1000
    lengths = []
    looped = 0
    for i in range(1, n_games + 1):
        env = gym.make('snake-v0')
        # env.__init__(human_mode=False)
        observation = env.reset()
        done = False
        steps = 0
        agent.epsilon = 0.0  # evaluate greedily, without exploration
        state = agent.get_last_observations(observation)
        while not done and steps < MAX_STEPS:
            action = agent.act(state)
            next_observation, _, done, _ = env.step(action)
            state = agent.get_last_observations(next_observation)
            steps += 1
        if steps == MAX_STEPS:
            looped += 1  # the snake got stuck in a loop; skip its length
        else:
            lengths.append(len(env.game.snake.body))
        if i % (n_games // 10) == 0:
            print(f"Avg len: {sum(lengths) / len(lengths):.2f}, looped {looped}/{i}")
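
# A minimal usage sketch for collect_stats. The constructor arguments and the
# checkpoint path below are hypothetical; agent.load mirrors the load call
# used by DQNScheduler further down.
if __name__ == "__main__":
    agent = DQNAgent(state_size=32, action_size=3)  # hypothetical sizes
    agent.load("./save/snake-dqn.h5")               # hypothetical checkpoint
    collect_stats(agent, n_games=1000)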

def run_episode(environment: gym.Env, agent: DQNAgent, render: bool, max_length: int):
    """
    Run one episode in the given environment with the agent.

    Arguments:
        environment {`gym.Env`} -- Environment representing the Markov Decision Process
        agent {`DQNAgent`} -- Reinforcement learning agent that acts in the environment
        render {`bool`} -- Whether the frames of the episode should be rendered on the screen
        max_length {`int`} -- Maximum number of steps before the episode is terminated

    Returns:
        `float` -- Cumulative reward that the agent received during the episode
    """
    episode_reward = 0
    state = environment.reset()
    for _ in range(max_length):
        if render:
            environment.render()
        action = agent.act(state)
        next_state, reward, terminal, _ = environment.step(action)
        agent.observe(Transition(state, action, reward,
                                 None if terminal else next_state))
        episode_reward += reward
        if terminal:
            break
        else:
            state = next_state
    return episode_reward
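
# run_episode above assumes a Transition record type; a minimal sketch using
# a namedtuple, with field names inferred from the call site:
from collections import namedtuple

Transition = namedtuple("Transition", ["state", "action", "reward", "next_state"])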

class DQNScheduler:
    def __init__(self, simulator):
        self.agent = DQNAgent(25, 6)
        self.agent.load("./save/car-100-dqn.h5")
        self.simulator = simulator
        self.agent.epsilon = 0

    def schedule(self):
        action = self.agent.act(np.reshape(self.simulator.get_state(), [1, 25]))
        return action

def _run_agent_one_ep(env: BaseEnv, agent: DQNAgent, config: Config, eps: float,
                      behavior_name: str, train: Optional[bool] = True):
    # Get a starting state
    env.reset()
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    state = decision_steps.obs[0]
    agent_id = decision_steps.agent_id[0]

    done = False
    did_win = False
    episode_reward = 0.0

    while not done:
        reward = 0.0

        # Get and perform an action
        action = agent.act(decision_steps.obs[0], eps)
        env.set_actions(behavior_name, np.expand_dims(action, 0).reshape(-1, 1))
        env.step()
        decision_steps, terminal_steps = env.get_steps(behavior_name)

        # Determine S', R, Done
        next_state = None
        if agent_id in decision_steps:
            reward += decision_steps.reward[0]
            next_state = decision_steps.obs[0]
        if agent_id in terminal_steps:
            terminal_reward = terminal_steps.reward[0]
            # A terminal reward of exactly 1.0 marks a win
            did_win = math.isclose(terminal_reward, 1.0)
            reward += terminal_reward
            next_state = terminal_steps.obs[0]
            done = True
        assert next_state is not None, \
            f"next_state cannot be None. Agent {agent_id} did not appear in decision or terminal steps"

        if train:
            # Learn from (S, A, R, S')
            experience = Experience(state, action, reward, next_state, done)
            agent.step(experience)

        # Set new state
        state = next_state
        episode_reward += reward

    return (episode_reward, did_win)
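
# _run_agent_one_ep assumes an Experience record type; a minimal sketch,
# again with field names inferred from the call site:
from collections import namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])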

def watch_agent(agent: DQNAgent):
    env = gym.make('snake-v0')
    env.__init__(human_mode=True)
    observation = env.reset()
    renderer = Renderer(env.game)
    try:
        done = False
        steps = 0
        agent.epsilon = 0
        state = agent.get_last_observations(observation)
        while not done:
            # time.sleep(0.001)
            renderer.render_frame()
            action = agent.act(state)
            next_observation, _, done, _ = env.step(action)
            state = agent.get_last_observations(next_observation)
            steps += 1
    finally:
        renderer.close_window()
    print(f"Snake length: {len(env.game.snake.body)}")
    print(f"Simulation ended after {steps} steps.")

def main(argv):
    args = parser.parse_args(argv[1:])
    if args.usage == 'help':
        return parser.print_help()
    if is_environments_gen(args):
        _write_env_file(args)
    elif is_environments_list(args):
        all_registry = registry.all()
        registry_envs_name = [trim_env_spec_name(env.__repr__())
                              for env in all_registry]
        for environment in registry_envs_name:
            print(environment)
    elif is_environments_act(args):
        env = gym.make(args.environment_name)
        if is_action_type('dqn', args):
            # Map the state-size preset name to a flattened observation size
            if args.pre_defined_state_size == 'nesgym':
                pre_state_size = 172032
            elif args.pre_defined_state_size == 'gym':
                pre_state_size = env.observation_space.shape[0]
            elif args.pre_defined_state_size == 'gym-atari':
                pre_state_size = 100800
            elif args.pre_defined_state_size == 'gym-atari-extend':
                pre_state_size = 120000
            elif args.pre_defined_state_size == 'gym-atari-small':
                pre_state_size = 100800
            elif args.pre_defined_state_size == 'gym-gomoku':
                pre_state_size = 361
            state_size = pre_state_size
            action_size = env.action_space.n
            agent = DQNAgent(state_size, action_size)
        done = False
        batch_size = 64
        i_episodes = args.i_episodes
        timesteps = args.timesteps
        factor = args.seed_factor
        for i_episode in range(i_episodes):
            state = env.reset()
            if is_action_type('dqn', args):
                state = np.reshape(state, [1, pre_state_size])
            for t in range(timesteps):
                try:
                    if args.render == 'present':
                        env.render()
                    if args.render == 'presented':
                        env.render(args.render)
                    if args.action_type == 'alternate':
                        action_choice = i_episodes * 2
                        action = random_action_space_sample_choice(
                            action_choice, env, factor)
                    elif args.action_type == 'specific':
                        action = env.action_space.sample()
                    elif args.action_type == 'conditional':
                        action_choice = i_episodes
                        action = random_action_space_sample_choice(
                            action_choice, env, factor)
                    elif args.action_type == 'numerical':
                        action = env.action_space.n
                    elif is_action_type('dqn', args) and len(state) == 5:
                        action = agent.act(state)
                    elif is_action_type('dqn', args) and len(state) != 5:
                        action = env.action_space.sample()
                    collect_stat(action, ['input', 'actions'], stats)
                    observation, reward, done, info = env.step(action)
                    if is_action_type('dqn', args):
                        reward = reward if not done else -10  # penalize termination
                        observation = np.reshape(observation, [1, pre_state_size])
                        agent.remember(state, action, reward, observation, done)
                        state = observation
                    collect_stat(reward, ['rewards'], stats)
                    if done:
                        max_episodes_range = i_episodes - 1
                        episode_timesteps_iteration_limit = max_episodes_range - 1
                        is_latest_episode = is_filled_latest_episode_with_iteration(
                            i_episode, episode_timesteps_iteration_limit)
                        increased_timestep = increase_timestep(t)
                        print('i_episode {}'.format(i_episode))
                        print('Episode finished after {} timesteps'.format(
                            increased_timestep))
                        if is_action_type('dqn', args):
                            print('Episode: {}/{}, score: {}, e: {:.2}'.format(
                                i_episode, i_episodes, t, agent.epsilon))
                        collect_stat(t, ['output', 'timestep', 'iteration'], stats)
                        collect_stat(increased_timestep,
                                     ['output', 'timestep', 'increased'], stats)
                        is_latest_episode_to_save_state = (
                            lambda args_cached: is_latest_episode
                            and args_cached.output_stats_filename)
                        if is_latest_episode_to_save_state(args):
                            filename = args.output_stats_filename
                            pre_df = {
                                'rewards': stats['rewards'],
                                'actions-input': stats['input']['actions']
                            }
                            df = pd.DataFrame(pre_df)
                            stamp = lambda: '%s' % int(datetime.now().timestamp())
                            with open('data/{}-{}.csv'.format(stamp(), filename),
                                      'w') as f:
                                f.write(df.to_csv())
                            print('Statistics file saved ({}.csv)!'.format(filename))
                            del df
                            del filename
                        print(check_output_env_label())
                        del is_latest_episode_to_save_state
                        del increased_timestep
                        del is_latest_episode
                        del episode_timesteps_iteration_limit
                        del max_episodes_range
                        break
                except Exception as e:
                    print('Rendering execution ({})'.format(e))
                finally:
                    print('Execution of timestep done')
            if is_action_type('dqn', args) and (len(agent.memory) > batch_size):
                agent.replay(batch_size)
    else:
        parser.print_help()
def main(): print "Creating DQN agent..." # env = gym.make("codegen-v0") set_debugger_org_frc() iters = 6300 n_goal = 0 n_goal_all = 0 time_stamp = 0 max_steps = 5 agent = DQNAgent(max_steps) agent.dqn.initial_exploration = 6000 * max_steps for iter in range(iters): print "\n********Iteration # ", iter, "***********\n" # 1 iteration env = gym.make("codegen-v0") num = random.randrange(1, 100) print "Goal Number : ", num + 1 env.my_input = num #env.goal = "['" + env.my_input + "']" env.goal = str(num + 1) code = env._reset() step_in_episode = 0 total_score = 0.0 reward = 0.0 mystate = [] my_state_new = [] # debug : the sys # sss = [] # for arg in sys.argv[1:]: # sss.append(arg) # print "sss = " , sss # while True: while step_in_episode < max_steps: # state = env.code_index_list + [-1]*(max_steps-len(env.code_index_list state = env.code_index_list[:] state += np.zeros([ max_steps - len(env.code_index_list), agent.dqn.code_idx_size ], dtype=int).tolist() # state = state.tolist() # state = 1; # print "env = ",env.code_index_list # print "state = ",state # raw_input() if step_in_episode == 0: action_idx = agent.start(code, state) else: action_idx = agent.act(code, state, reward) code, reward, terminal, info = env._step(action_idx, agent.dqn.actions) state_prime = env.code_index_list[:] state_prime += np.zeros([ max_steps - len(env.code_index_list), agent.dqn.code_idx_size ], dtype=int).tolist() # debug : the sys # sss = [] # for arg in sys.argv[1:]: # sss.append(arg) # print "sss = " , sss print "state : " print state print "state' : " print state_prime if step_in_episode == max_steps - 1: agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, 1) else: agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, 0) agent.dqn.experience_replay(agent.dqn.time_stamp) agent.dqn.target_model_update(agent.dqn.time_stamp, soft_update=False) total_score += reward if terminal: agent.dqn.goal_idx.append(agent.dqn.time_stamp) agent.end(reward) agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, 1) n_goal_all += 1 step_in_episode += 1 agent.dqn.time_stamp += 1 if iters - iter <= 100: n_goal += 1 break step_in_episode += 1 agent.dqn.time_stamp += 1 if iter == 1 + (agent.dqn.initial_exploration / max_steps): print "n_goal_all = ", n_goal_all print agent.dqn.goal_idx raw_input() print "n_goal : ", n_goal print "epsilon : ", agent.epsilon

class VideoStreamingTest(object):
    def __init__(self, host, port):
        # DQN parameters
        self.state_size = 3
        self.action_size = 7
        self.done = False
        self.batch_size = 32
        self.agent = DQNAgent(self.state_size, self.action_size)
        self.state_now = np.reshape([0.10606659, -0.52737298, 0.47917915],
                                    [1, self.state_size])
        self.state_last = np.reshape([0.10606659, -0.52737298, 0.47917915],
                                     [1, self.state_size])
        self.action_for_next = 0
        self.action_for_now = 0
        self.reward = 0
        # Serial command strings: T* sets the throttle, S* sets the steering
        self.forward = "T394"
        self.left = "S450"
        self.right = "S270"
        self.backward = "T330"
        self.stop = "T370"
        self.middle = "S360"
        self.server_socket = socket.socket()
        self.server_socket.bind((host, port))
        self.server_socket.listen(0)
        self.connection, self.client_address = self.server_socket.accept()
        self.connection = self.connection.makefile("rb")
        self.host_name = socket.gethostname()
        self.host_ip = socket.gethostbyname(self.host_name)
        self.temp_result = None
        self.final_result = None
        self.RANGE = 350
        self.WIDTH = 720
        self.time_now = 0
        self.count = 0
        self.streaming()

    def dqn_loop(self):
        # Done once the car is farther than RANGE from the center (r > 1)
        self.done = self.final_result['me']['r'] > 1
        self.prepare_state()   # update the previous state and read the current one
        self.prepare_action()  # update the previous action and choose the next one
        if self.count == 1:
            self.prepare_reward()  # reward for the previous move
        else:
            self.count += 1
        self.act_move()  # update the car's motion
        if self.count == 1:
            self.remember_step()  # store this transition
        if len(self.agent.memory) > self.batch_size:
            self.agent.replay(self.batch_size)

    def prepare_state(self):
        self.state_last = self.state_now
        state_now_ = [self.final_result['me']['alpha_big'],
                      self.final_result['me']['alpha_small'],
                      self.final_result['me']['r']]
        self.state_now = np.reshape(state_now_, [1, self.state_size])

    def prepare_action(self):
        self.action_for_now = self.action_for_next
        self.action_for_next = self.agent.act(self.state_now)

    def prepare_reward(self):
        # Precondition: state_last has been set
        if self.done:
            self.reward = -10
        else:
            # Reward shrinking distance to the arena center
            self.reward = (self.state_last[0][2] - self.state_now[0][2]) * 100

    def remember_step(self):
        self.agent.remember(self.state_last, self.action_for_now, self.reward,
                            self.state_now, self.done)

    def act_move(self):
        if self.done:
            self.action_for_next = 0
        if self.action_for_next == 0:    # stop
            socket_tcp.send(self.middle.encode("utf-8"))
            socket_tcp.send(self.stop.encode("utf-8"))
        elif self.action_for_next == 1:  # forward
            socket_tcp.send(self.middle.encode("utf-8"))
            socket_tcp.send(self.forward.encode("utf-8"))
        elif self.action_for_next == 2:  # forward-left
            socket_tcp.send(self.left.encode("utf-8"))
            socket_tcp.send(self.forward.encode("utf-8"))
        elif self.action_for_next == 3:  # forward-right
            socket_tcp.send(self.right.encode("utf-8"))
            socket_tcp.send(self.forward.encode("utf-8"))
        elif self.action_for_next == 4:  # backward: pulse reverse, stop, reverse
            socket_tcp.send(self.middle.encode("utf-8"))
            socket_tcp.send(self.backward.encode("utf-8"))
            socket_tcp.send(self.middle.encode("utf-8"))
            socket_tcp.send(self.stop.encode("utf-8"))
            socket_tcp.send(self.middle.encode("utf-8"))
            socket_tcp.send(self.backward.encode("utf-8"))
        elif self.action_for_next == 5:  # backward-left, same reverse pulse
            # (the original prepared these commands but omitted the send
            # calls; added to match actions 4 and 6)
            socket_tcp.send(self.left.encode("utf-8"))
            socket_tcp.send(self.backward.encode("utf-8"))
            socket_tcp.send(self.left.encode("utf-8"))
            socket_tcp.send(self.stop.encode("utf-8"))
            socket_tcp.send(self.left.encode("utf-8"))
            socket_tcp.send(self.backward.encode("utf-8"))
        elif self.action_for_next == 6:  # backward-right, same reverse pulse
            socket_tcp.send(self.right.encode("utf-8"))
            socket_tcp.send(self.backward.encode("utf-8"))
            socket_tcp.send(self.right.encode("utf-8"))
            socket_tcp.send(self.stop.encode("utf-8"))
            socket_tcp.send(self.right.encode("utf-8"))
            socket_tcp.send(self.backward.encode("utf-8"))

    def get_one_car(self, x1, y1, x2, y2):
        x0 = (x1 + x2) / 2
        y0 = (y1 + y2) / 2
        detx = x1 - x2
        dety = y1 - y2
        temp_x0 = x0 - self.WIDTH / 2
        temp_y0 = y0 - self.WIDTH / 2
        # Heading angle of the car, from its two marker dots
        if detx > 0:
            alpha_small = math.atan(dety / detx)
        elif detx < 0:
            alpha_small = math.atan(dety / detx) + math.pi
        else:
            alpha_small = math.pi / 2 if dety > 0 else -math.pi / 2
        # Bearing of the car relative to the arena center
        if temp_x0 > 0:
            alpha_big = math.atan(temp_y0 / temp_x0)
        elif temp_x0 < 0:
            alpha_big = math.atan(temp_y0 / temp_x0) + math.pi
        else:
            alpha_big = math.pi / 2 if temp_y0 > 0 else -math.pi / 2
        # Normalize the angles and scale the radius by RANGE
        alpha_small = alpha_small / math.pi - 0.5
        alpha_big = alpha_big / math.pi - 0.5
        r = math.sqrt(temp_x0 ** 2 + temp_y0 ** 2) / self.RANGE
        return {
            "alpha_big": alpha_big,
            "alpha_small": alpha_small,
            "r": r,
            "x0": x0,
            "y0": y0
        }

    def get_final_result(self):
        red_x = self.temp_result["red"]["x"]
        red_y = self.temp_result["red"]["y"]
        green_x = self.temp_result["green"]["x"]
        green_y = self.temp_result["green"]["y"]
        blue_x = self.temp_result["blue"]["x"]
        blue_y = self.temp_result["blue"]["y"]
        yellow_x = self.temp_result["yellow"]["x"]
        yellow_y = self.temp_result["yellow"]["y"]
        # Our car is marked red/green, the enemy car blue/yellow
        final_temp = {}
        final_temp["me"] = self.get_one_car(red_x, red_y, green_x, green_y)
        final_temp["enemy"] = self.get_one_car(blue_x, blue_y, yellow_x, yellow_y)
        self.final_result = final_temp

    def draw(self, frame, lowerRGB, upperRGB, word):
        hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
        # Build a mask from the HSV thresholds
        mask = cv2.inRange(hsv, lowerRGB, upperRGB)
        # Erode, then dilate: an opening operation that removes noise
        mask = cv2.erode(mask, None, iterations=2)
        mask = cv2.dilate(mask, None, iterations=2)
        cnts = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]
        # Centroid of the detected circular marker
        center = None
        result = {}
        # If any contours exist
        if len(cnts) > 0:
            # Take the largest contour by area and its minimum enclosing circle
            c = max(cnts, key=cv2.contourArea)
            ((x, y), radius) = cv2.minEnclosingCircle(c)
            # Compute the centroid from the contour moments
            M = cv2.moments(c)
            center = (int(M["m10"] / M["m00"]), int(M["m01"] / M["m00"]))
            # Only draw when the radius exceeds 10 pixels
            if radius > 10:
                cv2.circle(frame, (int(x), int(y)), int(radius), (0, 255, 255), 2)
                cv2.circle(frame, center, 5, (0, 0, 255), -1)
                font = cv2.FONT_HERSHEY_SIMPLEX
                cv2.putText(frame, word, (int(x), int(y)), font, 1.2,
                            (255, 255, 255), 2)
            # Only report a position when a marker was actually found, so the
            # caller can treat an empty dict as a lost marker
            result["x"] = x
            result["y"] = y
        return result

    def streaming(self):
        try:
            print("Host: ", self.host_name + " " + self.host_ip)
            print("Connection from: ", self.client_address)
            print("Streaming...")
            print("Press 'q' to exit")
            # HSV thresholds for the four marker dots
            redLower = np.array([170, 100, 200])
            redUpper = np.array([179, 255, 255])
            greenLower = np.array([65, 100, 100])
            greenUpper = np.array([85, 255, 255])
            blueLower = np.array([95, 100, 100])
            blueUpper = np.array([115, 255, 255])
            yellowLower = np.array([5, 100, 100])
            yellowUpper = np.array([20, 255, 255])
            # Need bytes here
            stream_bytes = b" "
            while True:
                stream_bytes += self.connection.read(1024)
                first = stream_bytes.find(b"\xff\xd8")  # JPEG start marker
                last = stream_bytes.find(b"\xff\xd9")   # JPEG end marker
                if first != -1 and last != -1:
                    jpg = stream_bytes[first:last + 2]
                    stream_bytes = stream_bytes[last + 2:]
                    image = cv2.imdecode(np.frombuffer(jpg, dtype=np.uint8),
                                         cv2.IMREAD_COLOR)
                    frame = image
                    result_red = self.draw(frame, redLower, redUpper, "RED")
                    result_green = self.draw(frame, greenLower, greenUpper, "GREEN")
                    result_blue = self.draw(frame, blueLower, blueUpper, "BLUE")
                    result_yellow = self.draw(frame, yellowLower, yellowUpper,
                                              "YELLOW")
                    result = {
                        "red": result_red,
                        "green": result_green,
                        "blue": result_blue,
                        "yellow": result_yellow
                    }
                    self.temp_result = result
                    # Only run the DQN loop when all four markers were found
                    flag = all([result_red, result_green,
                                result_blue, result_yellow])
                    if flag:
                        self.get_final_result()
                        self.time_now = int((time.time() - start_time) * 1000)
                        self.dqn_loop()
                        # Draw both car axes and their bearings from the center
                        cv2.line(frame,
                                 (int(self.temp_result["red"]["x"]),
                                  int(self.temp_result["red"]["y"])),
                                 (int(self.temp_result["green"]["x"]),
                                  int(self.temp_result["green"]["y"])),
                                 (0, 255, 0), 1, 4)
                        cv2.line(frame,
                                 (int(self.temp_result["blue"]["x"]),
                                  int(self.temp_result["blue"]["y"])),
                                 (int(self.temp_result["yellow"]["x"]),
                                  int(self.temp_result["yellow"]["y"])),
                                 (0, 255, 0), 1, 4)
                        cv2.line(frame,
                                 (int(self.final_result["me"]["x0"]),
                                  int(self.final_result["me"]["y0"])),
                                 (int(self.WIDTH / 2), int(self.WIDTH / 2)),
                                 (0, 0, 255), 4, 4)
                        cv2.line(frame,
                                 (int(self.final_result["enemy"]["x0"]),
                                  int(self.final_result["enemy"]["y0"])),
                                 (int(self.WIDTH / 2), int(self.WIDTH / 2)),
                                 (255, 0, 0), 4, 4)
                        font = cv2.FONT_HERSHEY_SIMPLEX
                        cv2.putText(frame, str(self.final_result["me"]["alpha_big"]),
                                    (int(self.final_result["me"]["x0"]),
                                     int(self.final_result["me"]["y0"])),
                                    font, 1, (0, 255, 0), 1)
                        cv2.putText(frame,
                                    str(self.final_result["enemy"]["alpha_small"]),
                                    (int(self.final_result["enemy"]["x0"]),
                                     int(self.final_result["enemy"]["y0"])),
                                    font, 1, (0, 255, 0), 1)
                    else:
                        # Lost a marker: steer straight and stop the car
                        socket_tcp.send("S360".encode("utf-8"))
                        socket_tcp.send("T370".encode("utf-8"))
                    cv2.imshow("Frame", frame)
                    if cv2.waitKey(1) & 0xFF == ord("q"):
                        break
        finally:
            self.connection.close()
            self.server_socket.close()
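
# VideoStreamingTest sends drive commands through a module-level socket_tcp
# and measures elapsed time against a module-level start_time, neither of
# which is defined in this snippet. A minimal sketch of what they might look
# like (the controller address is hypothetical):
import socket
import time

start_time = time.time()
socket_tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
socket_tcp.connect(("192.168.1.100", 8001))  # hypothetical car controller address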

i = []
v = []
r = []
for e in range(EPISODES):
    WH = w.generateWind()
    hdg0_rand = random.choice(hdg0_rand_vec) * TORAD
    hdg0 = hdg0_rand * np.ones(10)

    mdp.simulator.hyst.reset()  # we reinitialize the memory of the flow
    state = mdp.initializeMDP(hdg0, WH)
    loss_sim_list = []
    for time in range(80):
        WH = w.generateWind()
        action = agent.act(state)
        next_state, reward = mdp.transition(action, WH)
        # Store the transition, including the flow state in the final state
        agent.remember(state, action, reward, next_state)
        state = next_state
        if len(agent.memory) >= batch_size:
            loss_sim_list.append(agent.replay(batch_size))

        # For data visualisation
        i.append(mdp.s[0, -1])
        v.append(mdp.s[1, -1])
        r.append(mdp.reward)

# Mean loss over the last simulation (loss_sim_list holds the losses of the
# final episode)
loss_over_simulation_time = float(np.mean(loss_sim_list))
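
# The remember/replay interface used above follows the common Keras-style DQN
# pattern; a minimal sketch of the memory behind it (the capacity and tuple
# layout are assumptions):
import random
from collections import deque

class ReplayMemory:
    def __init__(self, capacity=2000):
        self.buffer = deque(maxlen=capacity)  # drop the oldest transitions first

    def remember(self, state, action, reward, next_state):
        self.buffer.append((state, action, reward, next_state))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)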
def main(): print "Creating DQN agent..." iters = 10000 n_goal = 0 n_goal_all = 0 time_stamp = 0 ############################################################ # print x # max_steps = 3 # actions = ["print", " ", "x"] ############################################################ ############################################################ # print x+1 max_steps = 5 actions = ["print", " ", "x", "+", "1"] ############################################################ agent = DQNAgent(max_steps, actions) agent.dqn.initial_exploration = iters * 0.6 results = [] policy_frozen = False wins_file = "wins.txt" with io.FileIO(wins_file, "w") as file: file.write("Winning codes:\n") for iter in range(iters): print "\n\n::{}::".format(iter) if iter == 4300: # 2300: policy_frozen = True env = gym.make("codegen-v0") num = random.randrange(1, 100) env.my_input = num ############################################################ # print x # env.goal = str(num) ############################################################ ############################################################ # print x+1 env.goal = str(num + 1) ############################################################ code = env._reset() step_in_episode = 0 total_score = 0.0 reward = 0.0 mystate = [] my_state_new = [] while step_in_episode < max_steps: state = env.code_index_list[:] state += np.zeros([ max_steps - len(env.code_index_list), agent.dqn.code_idx_size ], dtype=int).tolist() if step_in_episode == 0: action_idx = agent.start(code, state, policy_frozen) else: action_idx = agent.act(code, state, reward) code, reward, terminal, info = env._step(action_idx, agent.dqn.actions) state_prime = env.code_index_list[:] state_prime += np.zeros([ max_steps - len(env.code_index_list), agent.dqn.code_idx_size ], dtype=int).tolist() agent.dqn.experience_replay(agent.dqn.time_stamp) if step_in_episode == max_steps - 1 or terminal: agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, True) if terminal: agent.dqn.goal_idx.append(agent.dqn.time_stamp) agent.dqn.time_stamp += 1 else: agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, False) total_score += reward if terminal: agent.end(reward) n_goal_all += 1 step_in_episode += 1 if iters - iter <= 100: n_goal += 1 step_in_episode += 1 if iter >= 100: results = results[1:] if reward >= 1: print "WIN" results.append(1.0) with io.FileIO(wins_file, "a") as f: f.write( "\n=====================\n{}\n=====================\n\n". format(code)) f.flush() os.fsync(f) else: results.append(0.0) total_iters = 100 if iter >= 100 else iter + 1 print "TOTAL {:.2f}% of wins in last {} iters, sum: {}, total good: {}".format( 100 * sum(results) / total_iters, total_iters, sum(results), len(agent.dqn.goal_idx)) if iter == 1 + agent.dqn.initial_exploration: print "n_goal_all = ", n_goal_all print agent.dqn.goal_idx raw_input() print "n_goal : ", n_goal print "epsilon : ", agent.epsilon

batch_size = 32
title = env.symbol.upper() + ' MDP Replay ' + os.path.basename(__file__).split('.')[0]
grapher = Grapher(title)

with open('./save/losses_' + stock_name + '.txt', 'w') as f:
    for e in range(EPISODES + 1):
        # Train
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            cash, nown, price = env.holdings[0], env.holdings[1], env.state[-1]
            action = agent.act(state, time)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state  # advance the state (missing in the original loop)
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
            if e % 2 == 0:
                grapher.add(cash, nown, price, action, reward, loss=agent.loss)

agent = DQNAgent(  # assumed: the head of this call is missing in the source
    observation=obs,
    input_shape=[len(obs)],
    training=True,
    policy=policy)
agent.compile()

result = []
for episode in range(500):  # run 500 training episodes
    agent.reset()
    observation = env.reset()  # reset the environment
    observation = deepcopy(observation)
    agent.observe(observation)
    for t in range(250):  # run up to 250 steps per episode
        # env.render()  # display
        action = agent.act()
        # step returns the resulting state, the reward, whether the episode
        # is done, and extra info
        observation, reward, done, info = env.step(action)
        observation = deepcopy(observation)
        agent.observe(observation, reward, done)
        if done:
            break

# Test with a frozen (greedy) policy
agent.training = False
observation = env.reset()  # reset the environment
agent.observe(observation)
for t in range(250):
    # env.render()  # display
    action = agent.act()
    observation, reward, done, info = env.step(action)

class AgentTrainer(object):
    def __init__(self, config):
        # Create session to store trained parameters
        self.session = tf.Session()
        self.action_count = config["action_count"]
        # Create agent for training
        self.agent = DQNAgent(self.action_count)
        # Create memory to store observations
        self.memory = ExperienceMemory(config["replay_memory_size"])
        # Tools for saving and loading networks
        self.saver = tf.train.Saver()
        # Last action that the agent performed
        self.last_action_index = None
        # Deque to keep track of average reward and play time
        self.game_history = GameHistory(config["match_memory_size"])
        # Deque to store losses
        self.episode_history = EpisodeHistory(config["replay_memory_size"])
        self.INITIAL_EPSILON = config["initial_epsilon"]
        self.FINAL_EPSILON = config["final_epsilon"]
        self.OBSERVE = config["observe_step_count"]
        self.EXPLORE = config["explore_step_count"]
        self.FRAME_PER_ACTION = config["frame_per_action"]
        self.GAMMA = config["gamma"]
        self.LOG_PERIOD = config["log_period"]
        self.BATCH_SIZE = config["batch_size"]

    def init_training(self):
        # Initialize training parameters
        self.session.run(tf.global_variables_initializer())
        self.epsilon = self.INITIAL_EPSILON
        self.t = 0
        self.last_action_index = None

    def load_model(self, path):
        checkpoint = tf.train.get_checkpoint_state(path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.session, checkpoint.model_checkpoint_path)
            print("Successfully loaded: {}".format(checkpoint.model_checkpoint_path))
        else:
            print("Could not find old network weights")

    def save_model(self, path):
        self.saver.save(self.session, os.path.join(path, "dqn"), global_step=self.t)

    def reset_state(self, initial_state):
        # Get the first state by doing nothing and preprocess the image to 80x80x4
        x_t = transformImage(initial_state)
        self.s_t = np.concatenate((x_t, x_t, x_t, x_t), axis=2)
        self.match_reward = 0
        self.match_playtime = 0
        self.gamma_pow = 1

    def act(self):
        # Choose an action epsilon-greedily
        if self.t % self.FRAME_PER_ACTION == 0:
            if np.random.random() <= self.epsilon:
                action_index = np.random.randint(0, self.action_count)
            else:
                action_index = self.agent.act(self.session, self.s_t)
        else:
            action_index = self.last_action_index  # repeat the previous action
        self.last_action_index = action_index
        return action_index

    def process_frame(self, screen, reward, terminal):
        if self.last_action_index is None:
            self.reset_state(screen)
            return
        a_t = np.zeros([self.action_count])
        a_t[self.last_action_index] = 1
        # Scale down epsilon once the observation phase is over
        if self.epsilon > self.FINAL_EPSILON and self.t > self.OBSERVE:
            self.epsilon -= (self.INITIAL_EPSILON - self.FINAL_EPSILON) / self.EXPLORE
        # Run the selected action and observe the next state and reward
        x_t1, r_t = transformImage(screen), reward
        s_t1 = np.append(x_t1, self.s_t[:, :, :3], axis=2)
        # Store the transition in memory
        self.memory.add_experience((self.s_t, a_t, r_t, s_t1, terminal))
        # Only train once done observing
        if self.t > self.OBSERVE:
            loss = self.make_train_step()
            self.episode_history.add_episode(Episode(loss))
        # Update the old values
        self.s_t = s_t1
        self.t += 1
        # Print progress info
        if self.t % self.LOG_PERIOD == 0:
            print("TIMESTEP {}, EPSILON {}, EPISODE_STATS {}, MATCH_STATS {}".format(
                self.t, self.epsilon,
                self.episode_history.get_average_stats(),
                self.game_history.get_average_stats()))
            sys.stdout.flush()
        self.match_reward += r_t * self.gamma_pow
        self.match_playtime += 1
        self.gamma_pow *= self.GAMMA
        if terminal:
            self.game_history.add_match(
                MatchResults(self.match_reward, self.match_playtime, reward))
            self.reset_state(screen)

    def make_train_step(self):
        # Sample a minibatch to train on
        minibatch = self.memory.sample(self.BATCH_SIZE)
        # Unpack the batch variables
        s_j_batch = [d[0] for d in minibatch]
        a_batch = [d[1] for d in minibatch]
        r_batch = [d[2] for d in minibatch]
        s_j1_batch = [d[3] for d in minibatch]
        action_scores_batch = np.array(
            self.agent.score_actions(self.session, s_j1_batch))
        # Q-learning target: y = r for terminal transitions,
        # y = r + GAMMA * max_a' Q(s', a') otherwise
        y_batch = []
        for i in range(len(minibatch)):
            if minibatch[i][4]:
                y_batch.append(r_batch[i])
            else:
                y_batch.append(r_batch[i] + self.GAMMA * np.max(action_scores_batch[i]))
        return self.agent.train(self.session, y_batch, a_batch, s_j_batch)
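
# transformImage is not defined in this snippet; reset_state stacks four
# copies along the channel axis to get 80x80x4, so each frame must come out
# as 80x80x1. A minimal sketch using OpenCV under that assumption:
import cv2
import numpy as np

def transformImage(screen):
    gray = cv2.cvtColor(screen, cv2.COLOR_BGR2GRAY)  # drop the color channels
    small = cv2.resize(gray, (80, 80))               # match the 80x80 network input
    return small.reshape(80, 80, 1)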