def train_new_model():
    game_env = PongGame()        # init env
    agent = DqnAgent(game_env)   # init agent

    # input from user
    print("Enter the number of training iterations:")
    while True:
        try:
            Dqn.total_num_of_training = int(input())
            break
        except ValueError:
            print("That's not an integer!")
            continue

    # create new file
    output_directory = JP.create_results_directory()
    file_name = output_directory + "/data_file.txt"
    checkpoint_dir = JP.create_checkpoints_directory(output_directory)
    print(checkpoint_dir)
    saver = tf.train.Saver()

    # call train model
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())  # initialize variables
        Dqn.run_dqn(sess, game_env, agent, output_directory, file_name,
                    saver, checkpoint_dir)
def __init__(self, args):
    print("Initialise DQN Agent")

    # Load parameters from user-given arguments
    self.params = params
    self.params['width'] = args['width']
    self.params['height'] = args['height']
    self.params['num_training'] = args['numTraining']

    # Start Tensorflow session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
    self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    self.qnet = DQN(self.params)

    # time started
    self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S", time.localtime())

    # Q and cost
    self.Q_global = []
    self.cost_disp = 0

    # Stats
    self.cnt = self.qnet.sess.run(self.qnet.global_step)
    self.local_cnt = 0
    self.numeps = 0
    self.last_score = 0
    self.s = time.time()
    self.last_reward = 0.
    self.replay_mem = deque()
    self.last_scores = deque()
def main():
    env = gym.make('Acrobot-v1')
    gamma = 0.99
    copy_step = 25
    num_states = len(env.observation_space.sample())
    num_actions = env.action_space.n
    hidden_units = [64, 64]
    max_experiences = 10000
    min_experiences = 100
    batch_size = 32
    iter_per_episode = 300
    TrainNet = DQN(num_states, num_actions, hidden_units, gamma,
                   max_experiences, min_experiences, batch_size)
    TargetNet = DQN(num_states, num_actions, hidden_units, gamma,
                    max_experiences, min_experiences, batch_size)
    N = 50
    total_rewards = np.empty(N)
    epsilon = 0.99
    decay = 0.9999
    min_epsilon = 0.08
    for n in range(N):
        epsilon = max(min_epsilon, epsilon * decay)
        total_reward = play_game(env, TrainNet, TargetNet, epsilon,
                                 copy_step, iter_per_episode)
        total_rewards[n] = total_reward
        avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()
        if n % 5 == 0:
            print("Progress:", int(n / N * 100),
                  "episode reward:", total_reward,
                  "eps:", epsilon,
                  "avg reward (last 100):", avg_rewards)
    print("avg reward for last 100 episodes:", avg_rewards)
    make_video(env, TrainNet, 300)
    env.close()
def main():
    h, s, v = rgb2hsv(201, 204, 214)
    print(h, s, v)
    r, g, b = hsv2rgb(h, s, v * 0.7)
    print(r, g, b)
    dump_device_info()
    check_adb()
    n = 0
    while True:
        pull_screenshot()
        im = Image.open('./autojump.png')
        # Get the positions of the piece and the board
        piece_x, piece_y, board_x, board_y = find_piece_and_board(im)
        ts = int(time.time())
        print(ts, piece_x, piece_y, board_x, board_y)
        set_button_position(im)
        jump(math.sqrt((board_x - piece_x) ** 2 + (board_y - piece_y) ** 2))
        # save_debug_creenshot(ts, im, piece_x, piece_y, board_x, board_y)
        # backup_screenshot(ts)
        DQN.save_pic(np.asarray(im), str(n))
        with open('time.txt', 'a+') as f:
            # One "index,distance" record per line
            f.write('%d,%f\n' % (n, math.sqrt((board_x - piece_x) ** 2 +
                                              (board_y - piece_y) ** 2)))
        # Delay a bit longer so the piece has settled before the next screenshot
        time.sleep(random.uniform(1.2, 1.4))
        n += 1
def setUp(self):
    self.env = DQN.env
    (self.player_states,
     (self.community_infos, self.community_cards)) = self.env.reset()
    (self.player_infos, self.player_hands) = zip(*self.player_states)
    self.current_state = ((self.player_infos, self.player_hands),
                          (self.community_infos, self.community_cards))
    self.state = DQN.create_np_array(self.player_infos, self.player_hands,
                                     self.community_cards, self.community_infos)
    self.state_set = utilities.convert_list_to_tupleA(
        self.player_states[self.env.learner_bot.get_seat()],
        self.current_state[1])
    self._round = utilities.which_round(self.community_cards)
    self.current_player = self.community_infos[-3]
    self.learner_bot, self.villain = self.env.learner_bot, self.env.villain
    Q = defaultdict(lambda: np.zeros(self.env.action_space.n))
    self.agent = DQN.DQNAgent(DQN.state_size, DQN.action_size)  # initialise agent
    self.policy = DQN.make_epsilon_greedy_policy(Q, self.agent.epsilon,
                                                 self.env.action_space.n)
    self.villain_action = DQN.get_action_policy(
        self.player_infos, self.community_infos, self.community_cards,
        self.env, self._round, self.env.n_seats, self.state_set,
        self.policy, self.villain)
    self.learner_action = self.agent.act(
        self.state, self.player_infos, self.community_infos,
        self.community_cards, self.env, self._round, self.env.n_seats,
        self.state_set, self.policy)
def __init__(self, config, device, model=False):
    self.device = device
    self.board_size = config.board_size
    self.eps_start = config.eps_start
    self.eps_end = config.eps_end
    self.eps_decay = config.eps_decay
    self.gamma = config.gamma
    self.batch_size = config.batch_size

    # This part is for the network
    if model:
        # Be aware that the config must be exactly the same for the loaded model
        self.policy_net = torch.load(model)
    else:
        self.policy_net = DQN(config).to(device)
    self.target_net = DQN(config).to(device)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()
    self.optimizer = optim.RMSprop(self.policy_net.parameters(),
                                   momentum=config.momentum, lr=config.lr)
    self.criterion = torch.nn.SmoothL1Loss()
    self.memory = ReplayMemory(config.replay_memory)
    self.steps_done = 0
def f3(argv):
    state_dict, device, datas, data2s = argv
    if 'dqn' in args.model:
        if 'duel' in args.model:
            model = DQN.GraphNet(hidden_size=args.hidden_size, n_head=8,
                                 nlayers=4, duel_dqn=True)
        else:
            model = DQN.GraphNet(hidden_size=args.hidden_size, n_head=8,
                                 nlayers=4, duel_dqn=False)
    elif 'IL' in args.model:
        model = model_gnn.GraphNet()
    elif 'RL' in args.model:
        model = A2C.GraphNet(n_head=4, nlayers=2)
    model.load_state_dict(state_dict)
    model = model.to(device)
    model.eval()
    ret = []
    # Disable gradient tracking while scheduling; the bare torch.no_grad()
    # statement in the original had no effect, it must be a context manager
    with torch.no_grad():
        for data, data2 in zip(datas, data2s):
            ret.append(dqn_schedule(model, [data, data2], device,
                                    plan_limit=args.planlimit))
    return ret
def run_demo(num_samples, model):
    samples = None
    if model == "Note_CNN":
        weights = "NOTE_CNN_WEIGHTS_400.pt"
        samples = nrnn.generate_samples_NoteCNN(weights, 32, 10, num_samples)
    elif model == "0.01":
        weights = "Q_400-500000.pt"
        samples = DQN.generate_sample(weights, 32, 10, num_samples)
    elif model == "0.05":
        weights = "Q_500-100000.pt"
        samples = DQN.generate_sample(weights, 32, 10, num_samples)
    elif model == "0.1":
        weights = "Q-500000.pt"
        samples = DQN.generate_sample(weights, 32, 10, num_samples)
    elif model == "0.3":
        weights = "Q_300-500000.pt"
        samples = DQN.generate_sample(weights, 32, 10, num_samples)
    elif model == "0.5":
        weights = "Q_200-500000.pt"
        samples = DQN.generate_sample(weights, 32, 10, num_samples)
    else:
        print("Invalid model parameter! Try again")
        return None  # bail out instead of indexing into samples=None below
    for i in range(num_samples):
        oh.one_hot_to_midi(samples[i], midi_filename='demo_song-' + str(i) + '.mid')
    return None
def DQN(observation_shape, action_shape, **params):
    if params.get('noisy', False):
        net = dqn.NoisyDQN(observation_shape, action_shape)
    else:
        net = dqn.DQN(observation_shape, action_shape)
    if params.get('target', False):
        net = dqn.DQNT(net, params['double'])
    return net.to(params.get('device', 'cpu'))
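# A brief usage sketch for the factory above (hedged: the shapes and flags are
# illustrative, not from the original). Note that params['double'] must be
# supplied whenever target=True, or the params['double'] lookup raises KeyError.
def _demo_build_net():
    # Plain DQN wrapped with a target network, double-DQN targets, on CPU
    return DQN((4, 84, 84), 6, noisy=False, target=True, double=True, device='cpu')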
def __init__(self, width, height, numTraining=0):
    # Load parameters from user-given arguments
    self.params = params
    self.params['width'] = width                # Maze width
    self.params['height'] = height              # Maze height
    self.params['num_training'] = numTraining   # Number of games used for training

    # Create saves and logs directories
    if not os.path.exists("saves/DQN/"):
        os.makedirs("saves/DQN/")
    if not os.path.exists("logs/"):
        os.makedirs("logs/")

    # Resolve the saves directory
    if params["load_file"] is not None and not params["load_file"].startswith("saves/DQN/"):
        params["load_file"] = "saves/DQN/" + params["load_file"]

    # Start Tensorflow session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    self.qnet = DQN(self.params)  # create DQN

    # time started
    self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S", time.localtime())

    self.Q_global = []    # max Q-values in the current game
    self.cost_disp = 0    # current loss
    self.cnt = self.qnet.sess.run(self.qnet.global_step)  # number of steps the model has been trained so far
    self.local_cnt = 0    # number of total steps the algorithm has run
    self.numeps = 0       # current episode
    if params["load_file"] is not None:
        self.numeps = int(params["load_file"].split("_")[-1])
    self.s = time.time()         # time elapsed since beginning of training
    self.replay_mem = deque()    # replay memory used for training
    self.terminal = False        # True if the game is in a terminal state
    self.last_score = 0          # Score obtained in the last state
    self.current_score = 0       # Score obtained in the current state
    self.last_reward = 0.        # Reward obtained in the last state
    self.ep_rew = 0              # Cumulative reward obtained in the current game
    self.last_state = None       # Last state
    self.current_state = None    # Current state
    self.last_action = None      # Last action
    self.won = True              # True if the game has been won
    self.delay = 0
    self.frame = 0
def __init__(self, args):
    print("Initialise DQN Agent")

    # Load parameters from user-given arguments
    self.params = params
    self.params['width'] = args['width']
    self.params['height'] = args['height']
    self.params['num_training'] = args['numTraining']

    # Start Tensorflow session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    self.qnet = DQN(self.params)

    # Summary writer
    self.summary = tf.Summary()
    self.wins = deque(maxlen=100)
    self.episodesSoFar = 0
    print(args)
    if params['save_file']:
        self.writer = tf.summary.FileWriter('logs/model-' + params['save_file'],
                                            graph=tf.Session().graph)

    # Restore the replay memory from disk when loading a model
    self.replay_mem = None
    if params['load_file']:
        try:
            # Pickled data must be read in binary mode ('rb', not 'r')
            with open('memories/model-' + params['load_file'], 'rb') as f:
                self.replay_mem = pickle.load(f)
        except (IOError, pickle.UnpicklingError):
            pass

    # time started
    self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S", time.localtime())

    # Q and cost
    self.Q_global = []
    self.cost_disp = 0

    # Stats
    self.cnt = self.qnet.sess.run(self.qnet.global_step)
    self.local_cnt = 0
    self.numeps = 0
    self.last_score = 0
    self.s = time.time()
    self.last_reward = 0.
    if not self.replay_mem:
        self.replay_mem = deque()
    self.last_scores = deque()
def __init__(self, load_from_previous_model):
    self.policy_net = DQN(STATE_DIMENSION, NUM_ACTIONS).to(self.device)
    self.target_net = DQN(STATE_DIMENSION, NUM_ACTIONS).to(self.device)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()
    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=LEARNING_RATE)
    self.replayMemory = ReplayMemory(10000)
    if load_from_previous_model:
        self.load_model()
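# A minimal ReplayMemory sketch consistent with how it is used above (hedged:
# the original class definition is not shown in this section; this is the
# standard ring-buffer variant from the common PyTorch DQN recipe).
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)  # oldest transitions evicted first

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)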
def __init__(self):
    env = gym.envs.make("PongDeterministic-v4")
    self.Q_target = DQN.Mynet(env.observation_space, env.action_space).to(device)
    self.Q_policy = DQN.Mynet(env.observation_space, env.action_space).to(device)
    self.Q_target.load_state_dict(self.Q_policy.state_dict())
    self.Q_target.eval()
    self.env = env
    self.pool = DQN.ReplyMemory(15000)  # replay buffer ("ReplyMemory" is the class name in the DQN module)
    self.gramma = GRAMMA                # discount factor
    self.alpha = ALPHA
    self.epsilon = EPSILON
    self.ImageProcess = DQN.ImageProcess()
def initialize_policies(self):
    self.Transition = namedtuple(
        'Transition', ('state', 'action', 'next_state', 'reward'))
    self.policy_net_agent = DQN(n_feature=self._state_dim)
    self.policy_net_agent.double()
    self.target_net_agent = DQN(n_feature=self._state_dim)
    self.target_net_agent.double()
    self.target_net_agent.load_state_dict(
        self.policy_net_agent.state_dict())
    self.optimizer_agent = optim.RMSprop(
        self.policy_net_agent.parameters(),
        lr=self._lr,
        weight_decay=self._weight_decay)
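# A hedged sketch of the optimization step these networks and optimizer are
# set up for, in the style of the standard PyTorch DQN recipe. Only
# self.Transition, self.policy_net_agent, self.target_net_agent, and
# self.optimizer_agent come from the method above; `memory`, `batch_size`,
# `gamma`, and the tensor layouts are illustrative assumptions.
import random
import torch

def _optimize_step(self, memory, batch_size, gamma):
    if len(memory) < batch_size:
        return
    transitions = random.sample(memory, batch_size)
    # Transpose a batch of Transitions into a Transition of batches
    batch = self.Transition(*zip(*transitions))

    state_batch = torch.stack(batch.state).double()
    action_batch = torch.tensor(batch.action).unsqueeze(1)
    reward_batch = torch.tensor(batch.reward, dtype=torch.float64)
    next_state_batch = torch.stack(batch.next_state).double()

    # Q(s, a) from the policy net for the actions actually taken
    q_values = self.policy_net_agent(state_batch).gather(1, action_batch)

    # max_a' Q_target(s', a') from the frozen target net
    with torch.no_grad():
        next_q = self.target_net_agent(next_state_batch).max(1)[0]
    expected_q = reward_batch + gamma * next_q

    loss = torch.nn.functional.smooth_l1_loss(q_values.squeeze(1), expected_q)
    self.optimizer_agent.zero_grad()
    loss.backward()
    self.optimizer_agent.step()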
def main(): env = gym.make("Boxing-v0") height = 84 width = 84 channels = 4 num_actions = 18 dqn = DQN(AtariNetwork(height, width, channels), height * width, num_actions, epsilon=1.0, epsilon_decay=0.999, num_stacked=channels, learning_rate=0.1) memory = MemoryReplay(height * width, num_actions, max_saved=10000, num_stacked=channels) for epoch in tqdm(range(1000)): # Gain experience for _ in range(1): s = env.reset() s = preprocess(s) s = np.array([s, s, s, s]) for i in range(100): # if epoch % 5 == 0: # env.render() a = dqn.select_action(np.reshape(s, [1, -1])) s_prime, r, t, _ = env.step(np.argmax(a)) s_prime = preprocess(s_prime) s_prime = np.roll(s, 1, axis=0) s_prime[0] = np.maximum(s_prime[1], s_prime[0]) memory.add(s.reshape([-1]), a, r - 1, s_prime.reshape([-1]), t) s = s_prime if t: break #print(epoch, ": ", total_reward) # Train on that experience # for i in range(min((epoch + 1) * 5, 250)): for i in range(25): dqn.train(*memory.get_batch()) dqn.reassign_target_weights() if (epoch + 1) % 25 == 0: s = env.reset() s = preprocess(s) s = np.array([s, s, s, s]) for i in range(100): a = dqn.select_greedy_action(np.reshape(s, [1, -1])) env.render() s_prime, _, t, _ = env.step(np.argmax(a)) s = np.roll(s, 1, axis=0) s[0] = preprocess(s_prime) if t: break
def build_net(self):
    print('Building QNet and targetnet...')
    self.qnet = DQN(self.params, 'qnet', self.params['TB_logpath'])
    self.targetnet = DQN(self.params, 'targetnet', self.params['TB_logpath'])
    self.sess.run(tf.global_variables_initializer())
    saver_dict = {
        'qw1': self.qnet.w1, 'qb1': self.qnet.b1,
        'qw2': self.qnet.w2, 'qb2': self.qnet.b2,
        'qw3': self.qnet.w3, 'qb3': self.qnet.b3,
        'qw4': self.qnet.w4, 'qb4': self.qnet.b4,
        'qw5': self.qnet.w5, 'qb5': self.qnet.b5,
        'tw1': self.targetnet.w1, 'tb1': self.targetnet.b1,
        'tw2': self.targetnet.w2, 'tb2': self.targetnet.b2,
        'tw3': self.targetnet.w3, 'tb3': self.targetnet.b3,
        'tw4': self.targetnet.w4, 'tb4': self.targetnet.b4,
        'tw5': self.targetnet.w5, 'tb5': self.targetnet.b5,
        'step': self.qnet.global_step
    }
    self.saver = tf.train.Saver(saver_dict)
    # Ops that copy every qnet weight into the target network
    self.cp_ops = [
        self.targetnet.w1.assign(self.qnet.w1), self.targetnet.b1.assign(self.qnet.b1),
        self.targetnet.w2.assign(self.qnet.w2), self.targetnet.b2.assign(self.qnet.b2),
        self.targetnet.w3.assign(self.qnet.w3), self.targetnet.b3.assign(self.qnet.b3),
        self.targetnet.w4.assign(self.qnet.w4), self.targetnet.b4.assign(self.qnet.b4),
        self.targetnet.w5.assign(self.qnet.w5), self.targetnet.b5.assign(self.qnet.b5)
    ]
    self.sess.run(self.cp_ops)
    if self.params['ckpt_file'] is not None:
        print('\x1b[1;30;41m RUN LOAD \x1b[0m')
        self.load()
    print('Networks have been built!')
    sys.stdout.flush()
def outCome():
    num = {}
    num["rawoutcome"] = request.args.get("outcome")
    num["outcome"] = json.loads(num["rawoutcome"])
    num["state"] = num["outcome"]["state"]
    num["next_state"] = num["outcome"]["next_state"]
    num["reward"] = num["outcome"]["reward"]
    num["done"] = num["outcome"]["done"]
    num["action"] = num["outcome"]["action"]
    DQN.remember(agent, num["state"], num["action"], num["reward"],
                 num["next_state"], num["done"])
    response = jsonify(num)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def main():
    env = gym.make(ENV_NAME)
    agent = DQN.DQN(env)
    for episode in range(EPISODE):
        state = env.reset()
        # train
        for step in range(STEP):
            action = agent.egreedy_action(state)
            next_state, reward, done, _ = env.step(action)
            # Define reward: -1 on failure, small living bonus otherwise
            reward_agent = -1 if done else 0.1
            # The original passed the raw env reward, leaving the shaped
            # reward_agent unused
            agent.perceive(state, action, reward_agent, next_state, done)
            state = next_state
            if done:
                break
        # evaluate every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()  # restored: evaluation must start from a fresh episode
                for j in range(STEP):
                    env.render()
                    action = agent.action(state)
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode', episode, 'Evaluation Average Reward:', ave_reward)
            if ave_reward >= 200:
                break
def ai_play(swap_network, SAVE_NAME):
    if swap_network:
        print("Swapped")
        neural_net = deep_neural_network.network(N_IN, HIDDEN, N_OUT, True,
                                                 saveName=(SAVE_NAME + "_target"))
    else:
        neural_net = deep_neural_network.network(N_IN, HIDDEN, N_OUT, True,
                                                 saveName=SAVE_NAME)
    # player False and draw True
    pong = PlayPong(False, True)
    done = False
    grow = True
    while not done:
        obs = pong.get_observation()
        action = DQN.act(neural_net, obs, training=False)
        draw_neural_net.draw(pong.screen, grow, obs,
                             neural_net.hidden[0], neural_net.outputs[0])
        grow = False
        done = pong.play_one_pong(action)
    print(" GAME OVER!!\nAI scored %d points" % pong.state.points)
def __init__(self, restore=False):
    sess_conf = DQN.tf.ConfigProto()
    sess_conf.gpu_options.allow_growth = True
    self.sess = DQN.tf.Session(config=sess_conf)
    self.Q_main = DQN.DQN(self.sess, name="main")
    self.Q_target = DQN.DQN(self.sess, name="target")
    self.sess.run(DQN.tf.global_variables_initializer())
    self.copy_ops = DQN.get_copy_var_ops(dest_scope_name="target",
                                         src_scope_name="main")
    self.copy()
    if restore:
        self.restore()
        self.copy()
def simpleReplayTrain(DQN, trainBatch):
    xStack = np.empty(0).reshape(0, 45)
    yStack = np.empty(0).reshape(0, 161)
    for state, action, reward, nextState, done in trainBatch:
        Q = DQN.predict(state)
        if done:
            # Terminal transition: no bootstrapping
            Q[0, action - 10] = reward
        else:
            # The original inverted the done branches and used np.argmax (an
            # index) where the max Q-value was intended
            Q[0, action - 10] = reward + dis * np.max(DQN.predict(nextState))
        xStack = np.vstack([xStack, state])
        yStack = np.vstack([yStack, Q])
    return DQN.update(xStack, yStack)
def step(self, a, t, context, env, val_model, targ_model):
    actions = torch.transpose(
        torch.Tensor(self.action_list(a, val_model, targ_model)), 0, 1)
    context_size = context.shape[0]
    if str(a) not in self.QLearning_Buffer.keys():
        self.QLearning_Buffer[str(a)] = DQN.Q_Learning(
            0.5, 0.99, val_model, targ_model, actions, context_size,
            history_len=1)
    reward, self.q_learning_rewards = DQN.ql(
        env, self.QLearning_Buffer[str(a)], context[:, t], t,
        self.q_learning_rewards)
    return reward
def __init__(self):
    self.graph = Graph()
    self.actionSpace = []
    self.requests = []
    self.max_request = 20000

    # Get the action space: (cloudlet, operation) pairs
    for cl in self.graph.cloudlets:
        for operate in [0, 1, 2]:
            action = (cl, operate)
            self.actionSpace.append(action)
    self.action_size = len(self.actionSpace)

    # Get the feature size (dimension of the state)
    self.feature_size = 5 + self.graph.cloudlet_number * self.graph.web_function_number * 2

    # The agent must be built after action_size and feature_size are known;
    # the original constructed it first, reading attributes that did not exist yet
    self.RL = DQN(self.action_size, self.feature_size, output_graph=True)
def simple_replay_train(DQN, train_batch):
    x_stack = np.empty(0).reshape(0, DQN.input_size)
    y_stack = np.empty(0).reshape(0, DQN.output_size)
    # Get stored information from the buffer
    for state, action, reward, next_state, done in train_batch:
        Q = DQN.predict(state)
        if done:
            Q[0, action] = reward
        else:
            Q[0, action] = reward + dis * np.max(DQN.predict(next_state))
        y_stack = np.vstack([y_stack, Q])
        x_stack = np.vstack([x_stack, state])
    # Train our network using target and predicted Q values on each episode
    return DQN.update(x_stack, y_stack)
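# A hedged usage sketch for simple_replay_train, following the training loop
# this helper is usually paired with. `env`, `mainDQN`, the buffer size, and
# the batch sizes are illustrative assumptions; `dis` is the module-level
# discount factor the function reads.
import random
from collections import deque

def _train_with_replay(env, mainDQN, max_episodes=1000):
    replay_buffer = deque(maxlen=50000)
    for episode in range(max_episodes):
        state = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()  # exploration policy elided
            next_state, reward, done, _ = env.step(action)
            replay_buffer.append((state, action, reward, next_state, done))
            state = next_state
        # Fit the network on random minibatches every 10 episodes
        if episode % 10 == 1 and len(replay_buffer) >= 10:
            for _ in range(50):
                minibatch = random.sample(replay_buffer, 10)
                loss = simple_replay_train(mainDQN, minibatch)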
def act():
    num = {}
    num["rawState"] = request.args.get("state")
    num["state"] = json.loads(num["rawState"])
    print(num["state"])
    num["action"] = DQN.act(agent, num["state"])
    response = jsonify(num)
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
def __init__(self, learning_rate=1e-2, restore=False, name="main"):
    self.sess = DQN.tf.Session()
    self.action_value = DQN.DQN(self.sess, learning_rate=learning_rate, name=name)
    self.sess.run(DQN.tf.global_variables_initializer())
    if restore:
        self.restore()
def main(weights_name, video_name=None, get_image=False):
    env = DQN.Environment(render=True, sigma=0.02, down=1.0, get_image=get_image)
    s_size = env.env.s_size
    agent = DQN.Agent(s_size=s_size)
    agent.network.model.load_weights("data/" + weights_name + ".h5", by_name=True)
    print("model loaded")
    for _ in range(3):
        s = time.time()
        if video_name:
            env.record("data/mov/" + video_name + ".mp4")
        step = env.replay(agent.policy)
        print("unicycle lasted {} steps and {:.2f} seconds.".format(step, step / 30))
        print("time = {}".format(time.time() - s))
    env.close()
def main():
    max_episodes = 1000
    replay_buffer = deque()
    with tf.compat.v1.Session() as sess:
        mainDQN = DQN.DQN(sess, input_size, output_size, name="main")
        targetDQN = DQN.DQN(sess, input_size, output_size, name="target")
        tf.compat.v1.global_variables_initializer().run()
        copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main")
        sess.run(copy_ops)
        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()
            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))
                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100  # penalty for failing the episode
                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                state = next_state
                step_count += 1
                if step_count > 10000:
                    break
            print("Episode: {} steps: {}".format(episode, step_count))
            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("Loss : ", loss)
                bot_play(mainDQN)
                sess.run(copy_ops)  # copy q_net -> target_net
        bot_play(mainDQN)
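# A hedged sketch of the get_copy_var_ops helper main() relies on (it is not
# shown in this section; this is the standard TF1-style implementation that
# pairs with scoped "main"/"target" networks like the ones above).
def get_copy_var_ops(dest_scope_name="target", src_scope_name="main"):
    op_holder = []
    src_vars = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    # One assign op per variable pair: target <- main
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder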
class deep_atari:
    def __init__(self, params):
        print("Initializing Module...")
        self.params = params
        self.sess = tf.Session()
        self.DB = database(self.params["db_size"], self.params["input_dims_proc"])
        self.engine = emulator(rom_name="breakout.bin", vis=True)
        self.params["num_act"] = len(self.engine.legal_actions)
        self.build_nets()
        self.Q_global = 0
        self.cost_disp = 0

    def build_nets(self):
        print("Building QNet and Targetnet...")
        self.qnet = DQN(self.params)

    def start(self):
        print("Start training...")
        cnt = self.qnet.sess.run(self.qnet.global_step)
        print("Global step = " + str(cnt))
        local_cnt = 0
        s = time.time()
        for numeps in range(self.params["num_episodes"]):
            self.Q_global = 0
            state_proc = np.zeros((84, 84, 4))
            state_proc_old = None
            action = None
            terminal = None
            delay = 0
            state = self.engine.newGame()
            state_resized = cv2.resize(state, (84, 110))
            state_gray = cv2.cvtColor(state_resized, cv2.COLOR_BGR2GRAY)
            state_proc[:, :, 3] = state_gray[26:110, :] / 255.0
            total_reward_ep = 0
            for maxl in range(self.params["episode_max_length"]):
                if state_proc_old is not None:
                    self.DB.insert(state_proc_old[:, :, 3], reward, action, terminal)
                action = self.perceive(state_proc, terminal)
                if action is None:  # TODO - check [terminal condition]
                    break
                if local_cnt > self.params["train_start"] and local_cnt % self.params["learning_interval"] == 0:
                    bat_s, bat_a, bat_t, bat_n, bat_r = self.DB.get_batches(self.params["batch"])
                    bat_a = self.get_onehot(bat_a)
                    cnt, self.cost_disp = self.qnet.train(bat_s, bat_a, bat_t, bat_n, bat_r)
                if local_cnt > self.params["train_start"] and local_cnt % self.params["save_interval"] == 0:
                    self.qnet.save_ckpt("ckpt/model_" + str(cnt))
                    print("Model saved")
                state_proc_old = np.copy(state_proc)
                state, reward, terminal = self.engine.next(action)  # IMP: newstate contains terminal info
                state_resized = cv2.resize(state, (84, 110))
                state_gray = cv2.cvtColor(state_resized, cv2.COLOR_BGR2GRAY)
                state_proc[:, :, 0:3] = state_proc[:, :, 1:4]
                state_proc[:, :, 3] = state_gray[26:110, :] / 255.0
                total_reward_ep = total_reward_ep + reward
                local_cnt += 1
                # params['eps'] = 0.05
                self.params["eps"] = max(0.1, 1.0 - float(cnt) / float(self.params["eps_step"]))
                # self.params['eps'] = 0.00001
                sys.stdout.write(
                    "Epi: %d | frame: %d | train_step: %d | time: %f | reward: %f | eps: %f "
                    % (numeps, local_cnt, cnt, time.time() - s, total_reward_ep, self.params["eps"]))
                sys.stdout.write("| max_Q: %f\n" % (self.Q_global))
                # sys.stdout.write("%f, %f, %f, %f, %f\n" % (self.t_e[0], self.t_e[1], self.t_e[2], self.t_e[3], self.t_e[4]))
                sys.stdout.flush()

    def select_action(self, state):
        if np.random.rand() > self.params["eps"]:
            # greedy with random tie-breaking
            Q_pred = self.qnet.sess.run(
                self.qnet.y,
                feed_dict={
                    self.qnet.x: np.reshape(state, (1, 84, 84, 4)),
                    self.qnet.q_t: np.zeros(1),
                    self.qnet.actions: np.zeros((1, self.params["num_act"])),
                    self.qnet.terminals: np.zeros(1),
                    self.qnet.rewards: np.zeros(1),
                })[0]  # TODO check
            self.Q_global = max(self.Q_global, np.amax(Q_pred))
            a_winner = np.argwhere(Q_pred == np.amax(Q_pred))
            if len(a_winner) > 1:
                return self.engine.legal_actions[a_winner[np.random.randint(0, len(a_winner))][0]]
            else:
                return self.engine.legal_actions[a_winner[0][0]]
        else:
            # random
            return self.engine.legal_actions[np.random.randint(0, len(self.engine.legal_actions))]

    def perceive(self, newstate, terminal):
        if not terminal:
            action = self.select_action(newstate)
            return action

    def get_onehot(self, actions):
        actions_onehot = np.zeros((self.params["batch"], self.params["num_act"]))
        for i in range(len(actions)):
            actions_onehot[i][self.engine.action_map[int(actions[i])]] = 1
        return actions_onehot
class PacmanDQN(game.Agent):
    def __init__(self, args):
        print("Initialise DQN Agent")

        # Load parameters from user-given arguments
        self.params = params
        self.params['width'] = args['width']
        self.params['height'] = args['height']
        self.params['num_training'] = args['numTraining']

        # Start Tensorflow session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.qnet = DQN(self.params)

        # time started
        self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S", time.localtime())

        # Q and cost
        self.Q_global = []
        self.cost_disp = 0

        # Stats
        self.cnt = self.qnet.sess.run(self.qnet.global_step)
        self.local_cnt = 0
        self.numeps = 0
        self.last_score = 0
        self.s = time.time()
        self.last_reward = 0.
        self.replay_mem = deque()
        self.last_scores = deque()

    def getMove(self, state):
        # Exploit / Explore
        if np.random.rand() > self.params['eps']:
            # Exploit action
            self.Q_pred = self.qnet.sess.run(
                self.qnet.y,
                feed_dict={self.qnet.x: np.reshape(self.current_state,
                                                   (1, self.params['width'], self.params['height'], 6)),
                           self.qnet.q_t: np.zeros(1),
                           self.qnet.actions: np.zeros((1, 4)),
                           self.qnet.terminals: np.zeros(1),
                           self.qnet.rewards: np.zeros(1)})[0]
            self.Q_global.append(max(self.Q_pred))
            a_winner = np.argwhere(self.Q_pred == np.amax(self.Q_pred))
            if len(a_winner) > 1:
                move = self.get_direction(a_winner[np.random.randint(0, len(a_winner))][0])
            else:
                move = self.get_direction(a_winner[0][0])
        else:
            # Random:
            move = self.get_direction(np.random.randint(0, 4))

        # Save last_action
        self.last_action = self.get_value(move)
        return move

    def get_value(self, direction):
        if direction == Directions.NORTH:
            return 0.
        elif direction == Directions.EAST:
            return 1.
        elif direction == Directions.SOUTH:
            return 2.
        else:
            return 3.

    def get_direction(self, value):
        if value == 0.:
            return Directions.NORTH
        elif value == 1.:
            return Directions.EAST
        elif value == 2.:
            return Directions.SOUTH
        else:
            return Directions.WEST

    def observation_step(self, state):
        if self.last_action is not None:
            # Process current experience state
            self.last_state = np.copy(self.current_state)
            self.current_state = self.getStateMatrices(state)

            # Process current experience reward
            self.current_score = state.getScore()
            reward = self.current_score - self.last_score
            self.last_score = self.current_score

            if reward > 20:
                self.last_reward = 50.     # Eat ghost   (Yum! Yum!)
            elif reward > 0:
                self.last_reward = 10.     # Eat food    (Yum!)
            elif reward < -10:
                self.last_reward = -500.   # Get eaten   (Ouch!) -500
                self.won = False
            elif reward < 0:
                self.last_reward = -1.     # Punish time (Pff..)

            if self.terminal and self.won:
                self.last_reward = 100.
            self.ep_rew += self.last_reward

            # Store last experience into memory
            experience = (self.last_state, float(self.last_reward),
                          self.last_action, self.current_state, self.terminal)
            self.replay_mem.append(experience)
            if len(self.replay_mem) > self.params['mem_size']:
                self.replay_mem.popleft()

            # Save model
            if params['save_file']:
                if self.local_cnt > self.params['train_start'] and self.local_cnt % self.params['save_interval'] == 0:
                    self.qnet.save_ckpt('saves/model-' + params['save_file'] + "_" +
                                        str(self.cnt) + '_' + str(self.numeps))
                    print('Model saved')

            # Train
            self.train()

        # Next
        self.local_cnt += 1
        self.frame += 1
        self.params['eps'] = max(self.params['eps_final'],
                                 1.00 - float(self.cnt) / float(self.params['eps_step']))

    def observationFunction(self, state):
        # Do observation
        self.terminal = False
        self.observation_step(state)
        return state

    def final(self, state):
        # Next
        self.ep_rew += self.last_reward

        # Do observation
        self.terminal = True
        self.observation_step(state)

        # Print stats
        log_file = open('./logs/' + str(self.general_record_time) + '-l-' +
                        str(self.params['width']) + '-m-' + str(self.params['height']) +
                        '-x-' + str(self.params['num_training']) + '.log', 'a')
        log_file.write("# %4d | steps: %5d | steps_t: %5d | t: %4f | r: %12f | e: %10f " %
                       (self.numeps, self.local_cnt, self.cnt, time.time() - self.s,
                        self.ep_rew, self.params['eps']))
        log_file.write("| Q: %10f | won: %r \n" %
                       (max(self.Q_global, default=float('nan')), self.won))
        sys.stdout.write("# %4d | steps: %5d | steps_t: %5d | t: %4f | r: %12f | e: %10f " %
                         (self.numeps, self.local_cnt, self.cnt, time.time() - self.s,
                          self.ep_rew, self.params['eps']))
        sys.stdout.write("| Q: %10f | won: %r \n" %
                         (max(self.Q_global, default=float('nan')), self.won))
        sys.stdout.flush()

    def train(self):
        # Train
        if self.local_cnt > self.params['train_start']:
            batch = random.sample(self.replay_mem, self.params['batch_size'])
            batch_s = []  # States (s)
            batch_r = []  # Rewards (r)
            batch_a = []  # Actions (a)
            batch_n = []  # Next states (s')
            batch_t = []  # Terminal state (t)

            for i in batch:
                batch_s.append(i[0])
                batch_r.append(i[1])
                batch_a.append(i[2])
                batch_n.append(i[3])
                batch_t.append(i[4])
            batch_s = np.array(batch_s)
            batch_r = np.array(batch_r)
            batch_a = self.get_onehot(np.array(batch_a))
            batch_n = np.array(batch_n)
            batch_t = np.array(batch_t)

            self.cnt, self.cost_disp = self.qnet.train(batch_s, batch_a, batch_t, batch_n, batch_r)

    def get_onehot(self, actions):
        """ Create list of vectors with 1 values at index of action in list """
        actions_onehot = np.zeros((self.params['batch_size'], 4))
        for i in range(len(actions)):
            actions_onehot[i][int(actions[i])] = 1
        return actions_onehot

    def mergeStateMatrices(self, stateMatrices):
        """ Merge state matrices to one state tensor """
        stateMatrices = np.swapaxes(stateMatrices, 0, 2)
        total = np.zeros((7, 7))
        for i in range(len(stateMatrices)):
            total += (i + 1) * stateMatrices[i] / 6
        return total

    def getStateMatrices(self, state):
        """ Return wall, ghosts, food, capsules matrices """
        def getWallMatrix(state):
            """ Return matrix with wall coordinates set to 1 """
            width, height = state.data.layout.width, state.data.layout.height
            grid = state.data.layout.walls
            matrix = np.zeros((height, width), dtype=np.int8)
            for i in range(grid.height):
                for j in range(grid.width):
                    # Put cell vertically reversed in matrix
                    cell = 1 if grid[j][i] else 0
                    matrix[-1 - i][j] = cell
            return matrix

        def getPacmanMatrix(state):
            """ Return matrix with pacman coordinates set to 1 """
            width, height = state.data.layout.width, state.data.layout.height
            matrix = np.zeros((height, width), dtype=np.int8)
            for agentState in state.data.agentStates:
                if agentState.isPacman:
                    pos = agentState.configuration.getPosition()
                    cell = 1
                    matrix[-1 - int(pos[1])][int(pos[0])] = cell
            return matrix

        def getGhostMatrix(state):
            """ Return matrix with ghost coordinates set to 1 """
            width, height = state.data.layout.width, state.data.layout.height
            matrix = np.zeros((height, width), dtype=np.int8)
            for agentState in state.data.agentStates:
                if not agentState.isPacman:
                    if not agentState.scaredTimer > 0:
                        pos = agentState.configuration.getPosition()
                        cell = 1
                        matrix[-1 - int(pos[1])][int(pos[0])] = cell
            return matrix

        def getScaredGhostMatrix(state):
            """ Return matrix with scared ghost coordinates set to 1 """
            width, height = state.data.layout.width, state.data.layout.height
            matrix = np.zeros((height, width), dtype=np.int8)
            for agentState in state.data.agentStates:
                if not agentState.isPacman:
                    if agentState.scaredTimer > 0:
                        pos = agentState.configuration.getPosition()
                        cell = 1
                        matrix[-1 - int(pos[1])][int(pos[0])] = cell
            return matrix

        def getFoodMatrix(state):
            """ Return matrix with food coordinates set to 1 """
            width, height = state.data.layout.width, state.data.layout.height
            grid = state.data.food
            matrix = np.zeros((height, width), dtype=np.int8)
            for i in range(grid.height):
                for j in range(grid.width):
                    # Put cell vertically reversed in matrix
                    cell = 1 if grid[j][i] else 0
                    matrix[-1 - i][j] = cell
            return matrix

        def getCapsulesMatrix(state):
            """ Return matrix with capsule coordinates set to 1 """
            width, height = state.data.layout.width, state.data.layout.height
            capsules = state.data.layout.capsules
            matrix = np.zeros((height, width), dtype=np.int8)
            for i in capsules:
                # Insert capsule cells vertically reversed into matrix
                matrix[-1 - i[1], i[0]] = 1
            return matrix

        # Create observation matrix as a combination of
        # wall, pacman, ghost, food and capsule matrices
        # width, height = state.data.layout.width, state.data.layout.height
        width, height = self.params['width'], self.params['height']
        observation = np.zeros((6, height, width))
        observation[0] = getWallMatrix(state)
        observation[1] = getPacmanMatrix(state)
        observation[2] = getGhostMatrix(state)
        observation[3] = getScaredGhostMatrix(state)
        observation[4] = getFoodMatrix(state)
        observation[5] = getCapsulesMatrix(state)
        observation = np.swapaxes(observation, 0, 2)
        return observation

    def registerInitialState(self, state):  # inspects the starting state
        # Reset reward
        self.last_score = 0
        self.current_score = 0
        self.last_reward = 0.
        self.ep_rew = 0

        # Reset state
        self.last_state = None
        self.current_state = self.getStateMatrices(state)

        # Reset actions
        self.last_action = None

        # Reset vars
        self.terminal = None
        self.won = True
        self.Q_global = []
        self.delay = 0

        # Next
        self.frame = 0
        self.numeps += 1

    def getAction(self, state):
        move = self.getMove(state)

        # Stop moving when not legal
        legal = state.getLegalActions(0)
        if move not in legal:
            move = Directions.STOP

        return move