import time

import numpy as np


def play(env, game, model_path):
    agent = DQNAgent(model_path)
    done, score = False, game.start_score
    observation = env.reset()

    # Initial history: stack the first frame four times
    state = preprocess_frame(observation)
    history = np.stack((state, state, state, state), axis=2)
    history = np.reshape([history], (1, 84, 84, 4))

    while not done:
        env.render()
        time.sleep(0.05)

        # Play action
        action = agent.choose_action(history)
        game_action = get_ingame_action(action)
        observation, reward, done, info = env.step(game_action)

        # Update history: newest frame in front, drop the oldest
        next_state = preprocess_frame(observation)
        next_state = np.reshape([next_state], (1, 84, 84, 1))
        next_history = np.append(next_state, history[:, :, :, :3], axis=3)
        history = next_history

        reward = np.clip(reward, -1., 1.)
        score += reward

    print("score: ", score)
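# `DQNAgent`, `preprocess_frame`, and `get_ingame_action` come from the
# project's own modules and are not shown here. Minimal sketches of the two
# frame/action helpers, assuming OpenCV and the standard Atari 84x84
# grayscale preprocessing; the action mapping below is a hypothetical
# example, not the project's actual one.
import cv2
import numpy as np


def preprocess_frame(observation):
    # RGB frame -> grayscale, resized to 84x84, kept as uint8 to save memory
    gray = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
    return np.uint8(cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA))


def get_ingame_action(action):
    # map the agent's action index onto the emulator's action codes,
    # e.g. skipping NOOP so 0/1/2 -> FIRE/RIGHT/LEFT in a Breakout-style game
    return action + 1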
import argparse
import json
import logging


def main():
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default='config/global_config.json')
    parser.add_argument('--num_step', type=int, default=2000)
    parser.add_argument('--ckpt', type=str)
    parser.add_argument('--algo', type=str, default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='batch size for training')
    args = parser.parse_args()

    # preparing config
    # # for environment
    config = json.load(open(args.config))
    config["num_step"] = args.num_step
    cityflow_config = json.load(open(config['cityflow_config_file']))
    roadnetFile = cityflow_config['dir'] + cityflow_config['roadnetFile']
    config["lane_phase_info"] = parse_roadnet(roadnetFile)

    # # for agent
    intersection_id = "intersection_1_1"
    config["intersection_id"] = intersection_id
    # state: [vehicle_count for each start lane] + [current_phase]
    config["state_size"] = len(
        config['lane_phase_info'][intersection_id]['start_lane']) + 1
    phase_list = config['lane_phase_info'][intersection_id]['phase']
    config["action_size"] = len(phase_list)
    config["batch_size"] = args.batch_size
    logging.info(phase_list)

    # build cityflow environment
    env = CityFlowEnv(config)

    # build agent
    agent = DQNAgent(config)

    # inference
    agent.load(args.ckpt)
    env.reset()
    state = env.get_state()
    for i in range(args.num_step):
        action = agent.choose_action(state)          # index of action
        action_phase = phase_list[action]            # actual action
        next_state, reward = env.step(action_phase)  # one step
        state = next_state

        # logging
        logging.info("step:{}/{}, action:{}, reward:{}"
                     .format(i, args.num_step, action, reward))
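# `parse_roadnet` is a project utility that is not shown here. A simplified
# sketch, assuming the standard CityFlow roadnet JSON schema (intersections
# with `roadLinks` and `trafficLight.lightphases`); the real helper also
# collects end lanes and phase-to-lane mappings, and may index phases
# differently.
import json


def parse_roadnet(roadnet_file):
    roadnet = json.load(open(roadnet_file))
    lane_phase_info = {}
    for inter in roadnet['intersections']:
        if inter['virtual']:
            continue  # skip virtual boundary intersections
        start_lanes = set()
        for link in inter['roadLinks']:
            for lane_link in link['laneLinks']:
                # lanes are named "<roadId>_<laneIndex>"
                start_lanes.add('{}_{}'.format(link['startRoad'],
                                               lane_link['startLaneIndex']))
        # phase 0 is commonly reserved for the yellow/all-red interval
        n_phases = len(inter['trafficLight']['lightphases'])
        lane_phase_info[inter['id']] = {
            'start_lane': sorted(start_lanes),
            'phase': list(range(1, n_phases)),
        }
    return lane_phase_info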
fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \
    + str(n_games) + 'games'
figure_file = 'plots/' + fname + '.png'
# if you want to record video of your agent playing,
# do a mkdir tmp && mkdir tmp/dqn-video and uncomment the following 2 lines.
# env = wrappers.Monitor(env, "tmp/dqn-video",
#                        video_callable=lambda episode_id: True, force=True)

n_steps = 0
scores, eps_history, steps_array = [], [], []

for i in range(n_games):
    done = False
    observation = env.reset()
    score = 0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        score += reward

        if not load_checkpoint:
            agent.store_transition(observation, action, reward,
                                   observation_, done)
            agent.learn()
        observation = observation_
        n_steps += 1

    scores.append(score)
    steps_array.append(n_steps)

    avg_score = np.mean(scores[-100:])
    print('episode: ', i, 'score: ', score,
          'average score %.1f' % avg_score,
          'epsilon %.2f' % agent.epsilon, 'steps', n_steps)
import pickle

import numpy as np


def dqn_train():
    env = make_env('PongNoFrameskip-v4')
    load_checkpoint = False
    save_checkpoint = True
    learning_enabled = True
    rendering_enabled = False
    n_games = 100
    agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=0.0001,
                     input_dims=env.observation_space.shape,
                     n_actions=env.action_space.n, mem_size=50000,
                     eps_min=0.1, batch_size=32, replace=1000, eps_dec=1e-5,
                     chkpt_dir='models/', algo='DQNAgent',
                     env_name='PongNoFrameskip-v4')

    if load_checkpoint:
        agent.load_models()
        with open('models/best_score.pkl', 'rb') as file:
            best_score = pickle.load(file)
    else:
        best_score = -np.inf

    fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \
        + str(n_games) + 'games'
    figure_file = 'plots/' + fname + '.png'

    n_steps = 0
    scores, eps_history, steps_array = [], [], []

    for i in range(n_games):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward

            if learning_enabled:
                agent.store_transition(observation, action, reward,
                                       observation_, int(done))
                agent.learn()
            if rendering_enabled:
                env.render()

            observation = observation_
            n_steps += 1

        scores.append(score)
        steps_array.append(n_steps)

        avg_score = np.mean(scores[-100:])
        print('episode: ', i, 'score: ', score,
              'average score %.1f' % avg_score,
              'best score %.2f' % best_score,
              'epsilon %.2f' % agent.epsilon, 'steps', n_steps)

        if avg_score > best_score:
            best_score = avg_score
            if save_checkpoint:
                agent.save_models()
                with open('models/best_score.pkl', 'wb') as file:
                    pickle.dump(best_score, file)

        eps_history.append(agent.epsilon)
        if load_checkpoint and n_steps >= 18000:
            break

    plot_learning_curve(steps_array, scores, eps_history, figure_file)
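# `make_env` and `plot_learning_curve` are imported from the project's
# wrapper/plotting utilities in the original code. Minimal sketches under
# that assumption: standard gym Atari preprocessing for `make_env`, and a
# two-axis epsilon/score plot for `plot_learning_curve`. The originals may
# differ in detail.
import gym
import matplotlib.pyplot as plt
import numpy as np


def make_env(env_name):
    env = gym.make(env_name)
    # 84x84 grayscale frames; frame_skip=4 since the NoFrameskip ROM does none
    env = gym.wrappers.AtariPreprocessing(env, frame_skip=4)
    env = gym.wrappers.FrameStack(env, 4)  # stack 4 frames per observation
    return env


def plot_learning_curve(x, scores, epsilons, filename):
    # 100-episode running average of the score
    running_avg = [np.mean(scores[max(0, i - 100):i + 1])
                   for i in range(len(scores))]

    fig, ax = plt.subplots()
    ax.plot(x, epsilons, color='C0')
    ax.set_xlabel('training steps')
    ax.set_ylabel('epsilon', color='C0')

    ax2 = ax.twinx()  # second y-axis for the score curve
    ax2.plot(x, running_avg, color='C1')
    ax2.set_ylabel('100-episode average score', color='C1')

    fig.savefig(filename)
    plt.close(fig)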
learning_start = 300
update_model_freq = 300
update_target_model_freq = 1500
num_step = config['num_step']
state_size = config['state_size']

for e in range(EPISODES):
    # reset initially at each episode
    env.reset()
    t = 0
    state = env.get_state()
    # a sample state definition: vehicle counts per start lane + current phase
    state = np.array(list(state['start_lane_vehicle_count'].values())
                     + [state['current_phase']])
    state = np.reshape(state, [1, state_size])

    last_action = phase_list[agent.choose_action(state)]
    while t < num_step:
        action = phase_list[agent.choose_action(state)]
        if action == last_action:
            env.step(action)
        else:
            # switching phases requires a yellow interval first
            flag = False
            for _ in range(env.yellow_time):
                env.step(0)  # required yellow time
                t += 1
                flag = (t >= num_step)
                if flag:
                    break
            if flag:
                break
            env.step(action)
        last_action = action
        t += 1  # advance one green-light step
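# Every snippet in this collection relies on an epsilon-greedy
# `choose_action`; a generic sketch of that method (the class name is
# hypothetical), assuming a Keras-style `self.model.predict` that returns
# one row of Q-values per input state:
import random

import numpy as np


class DQNAgentSketch:
    def choose_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)  # explore
        q_values = self.model.predict(state)
        return int(np.argmax(q_values[0]))             # exploit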
import getopt
import os
import sys
from collections import deque

import gym
import numpy as np
import torch
from gym.wrappers import Monitor


def main(argv=None):
    try:
        options, args = getopt.getopt(
            sys.argv[1:], "s:x:b:u:mh",
            ["step=", "max_eps=", "buffer_size=", "hidden_unit=",
             "monitor", "help"])
    except getopt.GetoptError as err:
        print(str(err))
        print(usage.__doc__)
        sys.exit(1)

    GAME_NAME = 'CartPole-v1'
    AGENT_NAME = 'DQN-lr_1_e-3'
    MONITOR = False
    print_step = 10
    max_eps = 500
    buffer_size = 1000000
    hidden_unit = 16
    lr = 1e-3

    print(options)
    for o, v in options:
        if o in ("-h", "--help"):
            print(usage.__doc__)
            sys.exit()
        elif o in ("-m", "--monitor"):
            MONITOR = True
        elif o in ("-s", "--step"):
            print_step = int(v)
        elif o in ("-x", "--max_eps"):
            max_eps = int(v)
        elif o in ("-b", "--buffer_size"):
            buffer_size = int(v)
        elif o in ("-u", "--hidden_unit"):
            hidden_unit = int(v)
        else:
            print(usage.__doc__)
            sys.exit()

    print('process game: %s\tusing agent: %s' % (GAME_NAME, AGENT_NAME))

    # -------------------- loop for training --------------------
    # preparing env
    output_dir = '%s/%s' % (GAME_NAME, AGENT_NAME)
    os.makedirs(output_dir, exist_ok=True)

    env = gym.make(GAME_NAME)
    if MONITOR:
        env = Monitor(env, directory=output_dir, force=True,
                      video_callable=lambda ep: ep % 10 == 0,
                      write_upon_reset=True, mode='training')
    env.seed(0)

    state_num = len(env.reset())
    print(state_num)
    action_sample = env.action_space.sample()
    action_num = (env.action_space.n if isinstance(action_sample, int)
                  else len(action_sample))
    print('state_num: %d\taction_num: %d' % (state_num, action_num))

    device = torch.device('cpu')
    agent = DQNAgent(state_num, action_num, buffer_size=buffer_size,
                     batch_size=128, device=device,
                     hidden_unit=hidden_unit, lr=lr)

    scores_window = deque(maxlen=print_step)  # last `print_step` scores
    avg_scores = []
    for i_episode in range(max_eps):
        score = 0
        state = env.reset()
        while True:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break

        scores_window.append(score)
        print('\rEpisode {}\tAverage Score: {:.2f} '.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % print_step == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
            # save model
            agent.save_model_params(output_dir, i_episode)
        avg_scores.append(np.mean(scores_window))
        sys.stdout.flush()

    env.close()
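# `usage.__doc__` is printed above for help output; the original module
# presumably defines a `usage` function whose docstring documents the CLI.
# A sketch consistent with the getopt spec (the script name is an assumption):
def usage():
    """Usage: python dqn_cartpole.py [options]

    -s, --step         print / checkpoint every N episodes (default 10)
    -x, --max_eps      number of training episodes (default 500)
    -b, --buffer_size  replay buffer capacity (default 1000000)
    -u, --hidden_unit  hidden layer width (default 16)
    -m, --monitor      record videos with the gym Monitor wrapper
    -h, --help         show this message
    """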
import argparse
import json
import logging
import os
from datetime import datetime

import pandas as pd
from tqdm import tqdm


def main():
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')

    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str,
                        default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo', type=str, default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str,
                        help='checkpoint path for inference')
    parser.add_argument('--epoch', type=int, default=10,
                        help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=200,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=1,
                        help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='batch size for training')
    parser.add_argument('--phase_step', type=int, default=15,
                        help='seconds of one phase')
    args = parser.parse_args()

    # preparing config
    # # for environment
    config = json.load(open(args.config))
    config["num_step"] = args.num_step
    assert "1x1" in config['cityflow_config_file'], \
        "please use 1x1 config file for cityflow"
    # config["replay_data_path"] = "replay"
    cityflow_config = json.load(open(config['cityflow_config_file']))
    roadnetFile = cityflow_config['dir'] + cityflow_config['roadnetFile']
    config["lane_phase_info"] = parse_roadnet(roadnetFile)

    # # for agent
    intersection_id = list(config['lane_phase_info'].keys())[0]
    config["intersection_id"] = intersection_id
    phase_list = config['lane_phase_info'][config["intersection_id"]]['phase']
    config["action_size"] = len(phase_list)
    config["batch_size"] = args.batch_size
    logging.info(phase_list)

    model_dir = "model/{}_{}".format(args.algo, date)
    result_dir = "result/{}_{}".format(args.algo, date)
    config["result_dir"] = result_dir

    # parameters for training and inference
    # batch_size = 32
    EPISODES = args.epoch
    learning_start = 300
    # update_model_freq = args.batch_size
    update_model_freq = 1
    update_target_model_freq = 10

    if not args.inference:
        # build cityflow environment
        cityflow_config["saveReplay"] = True
        json.dump(cityflow_config, open(config["cityflow_config_file"], 'w'))
        env = CityFlowEnv(
            lane_phase_info=config["lane_phase_info"],
            intersection_id=config["intersection_id"],  # for single agent
            num_step=args.num_step,
            cityflow_config_file=config["cityflow_config_file"])

        # build agent
        config["state_size"] = env.state_size
        if args.algo == 'DQN':
            agent = DQNAgent(intersection_id,
                             state_size=config["state_size"],
                             action_size=config["action_size"],
                             batch_size=config["batch_size"],
                             phase_list=phase_list,
                             env=env)
        elif args.algo == 'DDQN':
            agent = DDQNAgent(config)
        elif args.algo == 'DuelDQN':
            agent = DuelingDQNAgent(config)

        # make dirs
        if not os.path.exists("model"):
            os.makedirs("model")
        if not os.path.exists("result"):
            os.makedirs("result")
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        if not os.path.exists(result_dir):
            os.makedirs(result_dir)

        # training
        total_step = 0
        episode_rewards = []
        episode_scores = []
        with tqdm(total=EPISODES * args.num_step) as pbar:
            for i in range(EPISODES):
                # print("episode: {}".format(i))
                env.reset()
                state = env.get_state()

                episode_length = 0
                episode_reward = 0
                episode_score = 0
                while episode_length < args.num_step:
                    action = agent.choose_action_(state)  # index of action
                    action_phase = phase_list[action]     # actual action
                    # no yellow light
                    next_state, reward = env.step(action_phase)  # one step
                    # last_action_phase = action_phase
                    episode_length += 1
                    total_step += 1
                    episode_reward += reward
                    episode_score += env.get_score()

                    # hold the same phase for `phase_step` steps and
                    # average the reward over them
                    for _ in range(args.phase_step - 1):
                        next_state, reward_ = env.step(action_phase)
                        reward += reward_
                    reward /= args.phase_step

                    pbar.update(1)
                    # store to replay buffer
                    if episode_length > learning_start:
                        agent.remember(state, action_phase, reward, next_state)
                    state = next_state

                    # training
                    if (episode_length > learning_start
                            and total_step % update_model_freq == 0):
                        if len(agent.memory) > args.batch_size:
                            agent.replay()

                    # update target Q network
                    if (episode_length > learning_start
                            and total_step % update_target_model_freq == 0):
                        agent.update_target_network()

                    # logging
                    # logging.info("\repisode:{}/{}, total_step:{}, action:{}, reward:{}"
                    #              .format(i + 1, EPISODES, total_step, action, reward))
                    pbar.set_description(
                        "total_step:{}, episode:{}, episode_step:{}, reward:{}"
                        .format(total_step, i + 1, episode_length, reward))

                # save episode rewards
                episode_rewards.append(
                    episode_reward / args.num_step)  # record episode mean reward
                episode_scores.append(episode_score)
                print("score: {}, mean reward:{}".format(
                    episode_score, episode_reward / args.num_step))

                # save model
                if (i + 1) % args.save_freq == 0:
                    if args.algo != 'DuelDQN':
                        agent.model.save(
                            model_dir + "/{}-{}.h5".format(args.algo, i + 1))
                    else:
                        agent.save(model_dir + "/{}-ckpt".format(args.algo),
                                   i + 1)

                # save reward to file
                df = pd.DataFrame({"rewards": episode_rewards})
                df.to_csv(result_dir + '/rewards.csv', index=None)

                df = pd.DataFrame({"rewards": episode_scores})
                df.to_csv(result_dir + '/scores.csv', index=None)

        # save figure
        plot_data_lists([episode_rewards], ['episode reward'],
                        figure_name=result_dir + '/rewards.pdf')
        plot_data_lists([episode_scores], ['episode score'],
                        figure_name=result_dir + '/scores.pdf')

    else:
        # inference
        cityflow_config["saveReplay"] = True
        json.dump(cityflow_config, open(config["cityflow_config_file"], 'w'))
        env = CityFlowEnv(
            lane_phase_info=config["lane_phase_info"],
            intersection_id=config["intersection_id"],  # for single agent
            num_step=args.num_step,
            cityflow_config_file=config["cityflow_config_file"])
        env.reset()

        # build agent
        config["state_size"] = env.state_size
        if args.algo == 'DQN':
            agent = DQNAgent(intersection_id,
                             state_size=config["state_size"],
                             action_size=config["action_size"],
                             batch_size=config["batch_size"],
                             phase_list=phase_list,
                             env=env)
        elif args.algo == 'DDQN':
            agent = DDQNAgent(config)
        elif args.algo == 'DuelDQN':
            agent = DuelingDQNAgent(config)
        agent.load(args.ckpt)

        state = env.get_state()
        scores = []
        for i in range(args.num_step):
            action = agent.choose_action(state)          # index of action
            action_phase = phase_list[action]            # actual action
            next_state, reward = env.step(action_phase)  # one step

            for _ in range(args.phase_step - 1):
                next_state, reward_ = env.step(action_phase)
                reward += reward_
            reward /= args.phase_step

            score = env.get_score()
            scores.append(score)
            state = next_state

            # logging
            logging.info("step:{}/{}, action:{}, reward:{}, score:{}".format(
                i + 1, args.num_step, action, reward, score))

        inf_result_dir = "result/" + args.ckpt.split("/")[1]
        df = pd.DataFrame({"inf_scores": scores})
        df.to_csv(inf_result_dir + '/inf_scores.csv', index=None)
        plot_data_lists([scores], ['inference scores'],
                        figure_name=inf_result_dir + '/inf_scores.pdf')
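# `plot_data_lists` comes from the project's utilities; a minimal sketch
# matching how it is called above (a list of series, a parallel list of
# labels, and an output figure path):
import matplotlib.pyplot as plt


def plot_data_lists(data_lists, label_list, figure_name='figure.pdf'):
    fig, ax = plt.subplots()
    for data, label in zip(data_lists, label_list):
        ax.plot(data, label=label)
    ax.set_xlabel('episode')
    ax.legend()
    fig.savefig(figure_name)
    plt.close(fig)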
if train:
    figure_file_name = 'figures/' + agent_dqn.env_name + '_' \
        + agent_dqn.algorithm + '_' + str(num_games) + '_' \
        + str(agent_dqn.learning_rate) + '.png'

    score_history = []
    eps_history = []
    steps_arr = []
    num_steps = 0

    for i in range(num_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            action = agent_dqn.choose_action(observation)
            new_observation, reward, done, info = env.step(action)
            score += reward
            agent_dqn.store_transition(observation, action, reward,
                                       new_observation, int(done))
            agent_dqn.learn_step()
            observation = new_observation
            num_steps += 1

        steps_arr.append(num_steps)
        score_history.append(score)
        eps_history.append(agent_dqn.epsilon)
        avg_score = np.mean(
            score_history[-100:])  # moving average over last 100 games
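    # The excerpt ends here; presumably it closes like `dqn_train` above,
    # e.g. by printing progress and calling
    # plot_learning_curve(steps_arr, score_history, eps_history, figure_file_name).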
import argparse
import json
import logging
import os
from datetime import datetime

import numpy as np


def main():
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')

    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str,
                        default='config/global_config.json')
    parser.add_argument('--num_step', type=int, default=10**3)
    args = parser.parse_args()

    # preparing config
    # # for environment
    config = json.load(open(args.config))
    config["num_step"] = args.num_step
    # config["replay_data_path"] = "replay"
    cityflow_config = json.load(open(config['cityflow_config_file']))
    roadnetFile = cityflow_config['dir'] + cityflow_config['roadnetFile']
    config["lane_phase_info"] = parse_roadnet(roadnetFile)

    # # for agent
    intersection_id = list(config['lane_phase_info'].keys())[0]
    phase_list = config['lane_phase_info'][intersection_id]['phase']
    logging.info(phase_list)
    # state: [vehicle_count for each start lane] + [current_phase]
    config["state_size"] = len(
        config['lane_phase_info'][intersection_id]['start_lane']) + 1
    config["action_size"] = len(phase_list)

    # build cityflow environment
    env = CityFlowEnv(config)

    # build agent
    agent = DQNAgent(config)

    # training
    batch_size = 32
    EPISODES = 11
    learning_start = 300
    update_model_freq = 300
    update_target_model_freq = 1500
    num_step = config['num_step']
    state_size = config['state_size']

    ### the DQN learning code
    if not os.path.exists("model"):
        os.makedirs("model")
    model_dir = "model/{}".format(date)
    os.makedirs(model_dir)

    total_step = 0
    for i in range(EPISODES):
        env.reset()
        state = env.get_state()
        state = np.array(list(state['start_lane_vehicle_count'].values())
                         + [state['current_phase']])
        state = np.reshape(state, [1, state_size])

        episode_length = 0
        while episode_length < num_step:
            action = agent.choose_action(state)          # index of action
            action_phase = phase_list[action]            # actual action
            # no yellow light
            next_state, reward = env.step(action_phase)  # one step
            last_action_phase = action_phase
            episode_length += 1
            total_step += 1

            # store to replay buffer
            next_state = np.array(
                list(next_state['start_lane_vehicle_count'].values())
                + [next_state['current_phase']])
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action_phase, reward, next_state)
            state = next_state

            # training
            if total_step > learning_start and total_step % update_model_freq == 0:
                agent.replay()

            # update target Q network
            if total_step > learning_start and total_step % update_target_model_freq == 0:
                agent.update_target_network()

            # log
            logging.info(
                "episode:{}/{}, total_step:{}, action:{}, reward:{}".format(
                    i, EPISODES, total_step, action, reward))

        # save model
        if i % 10 == 0:
            agent.model.save(model_dir + "/dqn-{}.h5".format(i))
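# Hypothetical entry point and invocation for this training script (the
# file name is an assumption):
#   python train_dqn_cityflow.py --config config/global_config.json --num_step 1000
if __name__ == '__main__':
    main()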
n_steps = 0
scores, eps_history, steps_array = [], [], []
# ------------------------------------------------------------------------------
for i in range(n_games):
    done = False
    observation = env.reset()
    observation_np = np.reshape(observation, (4,))
    # print("observation: ", type(observation), observation)
    # print("observation_np: ", type(observation_np), observation_np.shape, observation_np)
    score = 0
    while not done:
        action_agent = agent.choose_action(observation_np)
        # print('take action:', type(action_agent), action_agent)
        print('$take action: ', action_agent)
        action = {"discrete": action_agent}
        observation_, reward, done, info = env.step(action)
        observation_np_ = np.reshape(observation_, (4,))
        score += reward

        if not load_checkpoint:
            agent.store_transition(observation_np, action_agent, reward,
                                   observation_np_, int(done))
            agent.learn()

        observation = observation_
        observation_np = observation_np_
        n_steps += 1

    scores.append(score)
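# The environment in this fragment evidently takes dictionary actions; a
# sketch of a compatible action space, assuming gym's Dict/Discrete spaces
# (the key "discrete" matches the call above; the action count of 2 is an
# assumption):
from gym import spaces

action_space = spaces.Dict({"discrete": spaces.Discrete(2)})
assert action_space.contains({"discrete": 1})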
import datetime
import os
import random

import numpy as np
import tensorflow as tf


def train(env, game, model_path):
    agent = DQNAgent()
    nb_steps = 0

    current_date = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    log_dir = os.path.join("log", current_date)
    writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())

    for episode in range(agent.nb_episodes):
        done, terminal = False, False
        score, lives = game.start_score, game.start_lives
        loss = 0.
        observation = env.reset()

        # To avoid sub-optimal starts, begin each episode by waiting
        # (no-op) for a random number of steps
        nb_no_op = random.randint(1, agent.no_op_max_steps)
        for _ in range(nb_no_op):
            observation, _, _, _ = env.step(1)

        # Initial history
        state = preprocess_frame(observation)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 84, 84, 4))

        while not done:
            # Play action
            action = agent.choose_action(history)
            game_action = get_ingame_action(action)
            observation, reward, done, info = env.step(game_action)

            # Update history
            next_state = preprocess_frame(observation)
            next_state = np.reshape([next_state], (1, 84, 84, 1))
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)

            # Lost a life
            if lives > info['ale.lives']:
                terminal = True
                lives = info['ale.lives']

            reward = np.clip(reward, -1., 1.)

            # Learn
            agent.save_to_memory(history, action, reward, next_history, terminal)
            score += reward
            loss += agent.train_replay()
            if nb_steps % agent.target_update_rate == 0:
                agent.update_target_model()

            # Prepare for next iteration
            if terminal:
                terminal = False
            else:
                history = next_history
            nb_steps += 1

        print("done episode {}: loss {}, score {}.".format(
            episode, loss, score))
        if episode % 100 == 0:
            agent.model.save(model_path)

        # Output log into TensorBoard
        loss_summary = tf.Summary(
            value=[tf.Summary.Value(tag="loss", simple_value=loss)])
        score_summary = tf.Summary(
            value=[tf.Summary.Value(tag="score", simple_value=score)])
        writer.add_summary(loss_summary, episode)
        writer.add_summary(score_summary, episode)

    agent.model.save(model_path)
    writer.close()
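# The summaries above use the TensorFlow 1.x API (tf.summary.FileWriter and
# tf.Summary). To inspect the logged loss and score curves after training:
#   tensorboard --logdir log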