def game(player1, player2):
    env = Environment((board_width, board_height), speed, True)

    # Reset environment
    env.reset(player1, player2)

    # Other initialization
    result = ""

    while True:
        state = env.get_state(player1, player2)
        action = player1.predict(state, player2, True)
        env.act(player1, action)

        state = env.get_frame(player2, player1)
        action = player2.predict(state, player1)
        env.act(player2, action)

        env.render(player1, player2)

        # A tie requires both players to have collided.
        if env.check_collision(player1, player2):
            result = "Blue Won"
        if env.check_collision(player2, player1):
            result = "Red Won"
        if env.check_collision(player1, player2) and env.check_collision(player2, player1):
            result = "Tie Game"

        if result != "":
            break

    return result
def main():
    env = Environment(base_name=BASE_NAME, destination=DESTINATION)
    env.reset_base()
    state = env.get_state()  # Initial state

    if state["greedy"][0][0] != 1.:
        print("Expected initial distance (1.), got ({}). Re-running the simulation..."
              .format(state["greedy"][0][0]))
        return main()  # Start over with a fresh simulation

    connector = vm.VMConnector()
    server = vm.VMServer()
    server.listen()

    connector.send_data(state)
    action = server.receive_data()

    # Protocol: action == -1 resets the environment, action == -2 stops the simulation.
    while action != -2:
        if action == -1:
            env.reset_base()
            state = env.get_state()  # Initial state
            connector.send_data(state)
        else:
            next_state, reward, terminal, crashed = env.act(action)
            connector.send_data((next_state, reward, terminal, crashed))
        action = server.receive_data()
def solve(solver):
    """Solve and calculate the total reward for one run of an online solver."""
    total_reward = 0
    environment = Environment(solver.pomdp)
    time_step = 0
    max_abs_reward = np.max(np.abs(solver.pomdp.R))

    # Keep planning while the largest possible remaining discounted reward
    # still exceeds the solver's precision threshold.
    while max_abs_reward * (solver.pomdp.discount ** time_step) > solver.precision:
        # Expand the search tree until the per-step time limit runs out.
        start = time.time()
        count = 0
        while time.time() - start < solver.time_limit:
            is_expanded = solver.expandOneNode()
            count += 1
            if not is_expanded:
                break
        # print(count)

        action = solver.chooseAction()
        reward, observation = environment.act(action)
        if reward is None:
            # Terminal state reached; stop early to get results faster.
            break
        total_reward += reward * (solver.pomdp.discount ** time_step)
        time_step += 1
        solver.updateRoot(action, observation)

    return total_reward
def main():
    logger = Logger()

    # ------------------------------------ ENVIRONMENT ------------------------------------
    a = Workspace(conversion_a)
    b = Workspace(conversion_b)
    workspaces = [a, b]
    env = Environment(workspaces)
    # --------------------------------------------------------------------------------------

    agent = Agent().build_agent(len(workspaces))
    sess = agent.get_session()

    logger.create_dataholder("Target")
    logger.create_dataholder("Workspace_A")
    logger.create_dataholder("Workspace_B")
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    for i in range(config.nb_timesteps):
        Logger.write("INFO", "TIMESTEP " + str(i))
        logger.add_datapoint("Workspace_A", i, distribution_a(i))
        logger.add_datapoint("Workspace_B", i, distribution_b(i))

        # Collect one batch of (action, reward) samples for this timestep.
        actions_tensor = np.zeros((config.training_size, 1))
        rewards_tensor = np.zeros((config.training_size, 1))
        for j in range(config.training_size):
            action_elem = agent.act()
            reward_elem = env.act(action_elem, i)
            actions_tensor[j][0] = action_elem
            rewards_tensor[j][0] = reward_elem

        # Train on shuffled mini-batches of the collected samples.
        for j in range(config.nb_batches):
            action_batch, reward_batch = utils.shuffle_batch(actions_tensor, rewards_tensor)
            loss_value, upd, resp, ww = agent.train(action_batch, reward_batch)
            Logger.write("INFO", str(loss_value))
            Logger.write("INFO", str(ww))

        total_reward = np.sum(rewards_tensor)
        reward_mean = float(total_reward) / float(config.training_size)
        Logger.write("INFO", "Total Reward of timestep " + str(i) + ': ' + str(reward_mean))
        logger.add_datapoint("Target", i, 100.0 * reward_mean)

    logger.init_plot()
    logger.plot("Target", 'o')
    logger.plot("Workspace_A", linestyle=None)
    logger.plot("Workspace_B", linestyle=None)
    logger.show()
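# --- Hedged sketch, not the project's actual helper ---------------------------------
# For illustration only: one plausible reading of utils.shuffle_batch as used above is
# that it shuffles the action and reward arrays with a single shared permutation, so
# every action stays aligned with the reward it produced. The name and return contract
# here are assumptions.
import numpy as np

def shuffle_batch(actions, rewards):
    """Return copies of `actions` and `rewards` shuffled with one shared permutation."""
    assert len(actions) == len(rewards)
    perm = np.random.permutation(len(actions))  # one permutation keeps pairs aligned
    return actions[perm], rewards[perm]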
def main():
    config = Config()
    env = Environment(config)            # for training
    eval_env = Eval_Environment(config)  # for testing
    num_actions = env.action_size()
    config.setaction_set_size(num_actions)
    brain = Control(config)
    plt = Plotter()
    plt.writesummary(0)

    # Progress bar for training
    pbar = tqdm(total=config.MAX_FRAMES, desc='Training Progress')
    episode_buffer = Buffer(config)
    episode_length = 0
    eval_count = 1

    while env.frame_history <= config.MAX_FRAMES:
        if env.frame_history / (config.EVAL_FREQ * eval_count) == 1:
            # Testing happens now
            evaluate(eval_env, config, brain, env.frame_history, plt)
            eval_count += 1
        past_num_frames = env.frame_history

        # Algorithm begins now
        if episode_length == 0:
            env.reset()
            s, a, r, t = env.act(0)
            episode_buffer.add(s, a, r)
            episode_length += 1

        s, a, r, t = env.act(brain.getaction(s))
        episode_length += 1
        episode_buffer.add(s, a, r)

        # The episode ends: compute returns and update the value table.
        if (env.START_NEW_GAME or episode_length >= config.T) and not episode_buffer.isempty():
            episode_values = episode_buffer.get_returns()
            brain.update_table(episode_values)
            episode_buffer.reset()
            episode_length = 0

        pbar.update(env.frame_history - past_num_frames)

    env.close_render()
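# --- Hedged sketch, not the project's actual Buffer ----------------------------------
# A minimal illustration, under assumptions, of the per-step discounted return that a
# method like Buffer.get_returns() above could produce: G_t = r_t + gamma * G_{t+1},
# computed backwards over the rewards of one episode. The real buffer layout and its
# discount handling are not shown in this file.
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    """Return the discounted return for each timestep of one episode."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running  # G_t = r_t + gamma * G_{t+1}
        returns[t] = running
    return returns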
def train(self):
    """
    Learn your (final) policy.
    Use the evolution strategy algorithm CMA-ES: https://pypi.org/project/cma/

    Possible actions: [0, 1, 2]
    Range of observation (tuple):
        - position: [-1.2, 0.6]
        - velocity: [-0.07, 0.07]
    """
    # 1- Define state features
    # 2- Define search space (to define a policy)
    # 3- Define objective function (for policy evaluation)
    # 4- Use CMA-ES to optimize the objective function
    # 5- Save optimal policy

    # This is an example of using the Environment class (no learning is done yet!)
    for i in range(10):
        env = Environment()
        done = False
        while not done:
            reward, done = env.act(env.sample_action())
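# --- Hedged sketch, not the author's implementation ----------------------------------
# A minimal illustration of how steps 1-5 in the docstring above could be wired to the
# cma package. It assumes the Environment also exposes observe() returning
# (position, velocity), as used by the train() implementation later in this file, and
# that env.act(action) returns (reward, done). The linear policy, the 9-parameter
# search space and the iteration budget are illustrative choices only.
import cma
import numpy as np

def linear_policy(params, observation):
    """Steps 1-2: score the 3 actions with a linear function of (position, velocity, 1)."""
    w = np.asarray(params).reshape(3, 3)                  # 3 actions x (2 features + bias)
    features = np.array([observation[0], observation[1], 1.0])
    return int(np.argmax(w @ features))

def negative_return(params, episodes=3):
    """Step 3: CMA-ES minimises, so return the negated average episode reward."""
    total = 0.0
    for _ in range(episodes):
        env = Environment()
        done = False
        while not done:
            reward, done = env.act(linear_policy(params, env.observe()))
            total += reward
    return -total / episodes

def cmaes_train_sketch():
    """Steps 4-5: optimise the policy parameters and return the best ones found."""
    es = cma.CMAEvolutionStrategy(np.zeros(9), 0.5, {'maxiter': 50})
    while not es.stop():
        solutions = es.ask()
        es.tell(solutions, [negative_return(s) for s in solutions])
    return es.result.xbest  # persist with e.g. np.save to complete step 5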
class Agent(object):
    def __init__(self, args, sess):
        # CartPole environment
        self.sess = sess
        self.model = Network(sess, phase='train')  # MNIST accuracy model
        self.env = MnistEnvironment(self.model)
        self.state_size = self.env.state_size
        self.action_size = self.env.action_size
        self.a_bound = self.env.a_bound
        self.train_size = len(self.env.train_images)
        self.test_size = len(self.env.test_images)

        self.learning_rate = args.learning_rate
        self.batch_size = args.batch_size
        self.discount_factor = args.discount_factor
        self.epochs = args.epochs

        self.ENV = Environment(self.env, self.state_size, self.action_size)
        self.replay = ReplayMemory(self.state_size, self.batch_size)
        self.ddpg = DDPG(self.state_size, self.action_size, self.sess,
                         self.learning_rate[0], self.learning_rate[1],
                         self.replay, self.discount_factor, self.a_bound)

        self.save_dir = args.save_dir
        self.render_dir = args.render_dir
        self.play_dir = args.play_dir

        # initialize (must run after the whole TensorFlow graph has been built)
        sess.run(tf.global_variables_initializer())

        # load pre-trained mnist model
        self.env.model.checkpoint_load()

        self.saver = tf.train.Saver()
        self.epsilon = 1
        self.explore = 2e4
        pass

    '''
    def select_action(self, state):
        return np.clip(
            np.random.normal(self.sess.run(self.ddpg.actor, {self.ddpg.state: state})[0],
                             self.action_variance), -2, 2)
        pass
    '''

    def ou_function(self, mu, theta, sigma):
        # Ornstein-Uhlenbeck exploration noise
        x = np.ones(self.action_size) * mu
        dx = theta * (mu - x) + sigma * np.random.randn(self.action_size)
        return x + dx

    def noise_select_action(self, state):
        action = self.sess.run(self.ddpg.actor, {self.ddpg.state: state})[0]
        noise = self.epsilon * self.ou_function(0, 0.15, 0.25)
        return action + noise

    def select_action(self, state):
        return self.sess.run(self.ddpg.actor, {self.ddpg.state: state})[0]

    def train(self):
        scores, episodes = [], []
        for e in range(self.epochs):
            for i, idx in enumerate(np.random.permutation(self.train_size)):
                terminal = False
                score = 0
                state = self.ENV.new_episode(idx)
                state = np.reshape(state, [1, self.state_size])

                while not terminal:
                    action = self.noise_select_action(state)
                    next_state, reward, terminal = self.ENV.act(action)
                    state = state[0]
                    self.replay.add(state, action, reward, next_state, terminal)

                    if len(self.replay.memory) >= self.batch_size:
                        self.ddpg.update_target_network()
                        self.ddpg.train_network()

                    score += reward
                    state = np.reshape(next_state, [1, self.state_size])

                    if terminal:
                        scores.append(score)
                        episodes.append(e)

                if (i + 1) % 10 == 0:
                    print('epoch', e + 1,
                          'iter:', f'{i+1:05d}',
                          ' score:', f'{score:.03f}',
                          ' last 10 mean score', f'{np.mean(scores[-min(10, len(scores)):]):.03f}',
                          f'sequence: {self.env.sequence}')
                if (i + 1) % 500 == 0:
                    self.ENV.render_worker(os.path.join(self.render_dir, f'{(i+1):05d}.png'))
                if (i + 1) % 1000 == 0:
                    self.save()
        pass

    def play(self):
        cor_before_lst, cor_after_lst = [], []
        for idx in range(self.test_size):
            state = self.ENV.new_episode(idx, phase='test')
            state = np.reshape(state, [1, self.state_size])
            terminal = False
            score = 0

            while not terminal:
                action = self.select_action(state)
                next_state, reward, terminal = self.ENV.act(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                score += reward
                state = next_state
                # time.sleep(0.02)

                if terminal:
                    (cor_before, cor_after) = self.ENV.compare_accuracy()
                    cor_before_lst.append(cor_before)
                    cor_after_lst.append(cor_after)
                    self.ENV.render_worker(os.path.join(self.play_dir, f'{(idx+1):04d}.png'))
                    print(f'{(idx+1):04d} image score: {score}\n')

        print('====== NUMBER OF CORRECTION =======')
        print(f'before: {np.sum(cor_before_lst)}, after: {np.sum(cor_after_lst)}')
        pass
    def save(self):
        checkpoint_dir = os.path.join(self.save_dir, 'ckpt')
        if not os.path.exists(checkpoint_dir):
            os.mkdir(checkpoint_dir)
        self.saver.save(self.sess, os.path.join(checkpoint_dir, 'trained_agent'))

    def load(self):
        checkpoint_dir = os.path.join(self.save_dir, 'ckpt')
        self.saver.restore(self.sess, os.path.join(checkpoint_dir, 'trained_agent'))
class LfD(object): def __init__(self): self.env = Environment() self.state_action = [] self.tmp_state_action = [] self.cur_state = None self.cur_action = None self.current_path = os.path.dirname(os.path.realpath(__file__)) self.img_dir = os.path.join(self.current_path, 'middle_region') self.front_cam_dir = os.path.join(self.img_dir, 'front') self.right_cam_dir = os.path.join(self.img_dir, 'right') self.back_cam_dir = os.path.join(self.img_dir, 'back') self.left_cam_dir = os.path.join(self.img_dir, 'left') self.action_dir = os.path.join(self.img_dir, 'action') if not os.path.exists(self.img_dir): os.makedirs(self.img_dir) if not os.path.exists(self.front_cam_dir): os.makedirs(self.front_cam_dir) if not os.path.exists(self.right_cam_dir): os.makedirs(self.right_cam_dir) if not os.path.exists(self.back_cam_dir): os.makedirs(self.back_cam_dir) if not os.path.exists(self.left_cam_dir): os.makedirs(self.left_cam_dir) if not os.path.exists(self.action_dir): os.makedirs(self.action_dir) self.tmp_img_dir = os.path.join(self.current_path, 'tmp_middle_region') self.tmp_front_cam_dir = os.path.join(self.tmp_img_dir, 'front') self.tmp_right_cam_dir = os.path.join(self.tmp_img_dir, 'right') self.tmp_back_cam_dir = os.path.join(self.tmp_img_dir, 'back') self.tmp_left_cam_dir = os.path.join(self.tmp_img_dir, 'left') self.tmp_action_dir = os.path.join(self.tmp_img_dir, 'action') if not os.path.exists(self.tmp_img_dir): os.makedirs(self.tmp_img_dir) if not os.path.exists(self.tmp_front_cam_dir): os.makedirs(self.tmp_front_cam_dir) if not os.path.exists(self.tmp_right_cam_dir): os.makedirs(self.tmp_right_cam_dir) if not os.path.exists(self.tmp_back_cam_dir): os.makedirs(self.tmp_back_cam_dir) if not os.path.exists(self.tmp_left_cam_dir): os.makedirs(self.tmp_left_cam_dir) if not os.path.exists(self.tmp_action_dir): os.makedirs(self.tmp_action_dir) self.img_count = 0 def getch(self): fd = sys.stdin.fileno() old_settings = termios.tcgetattr(fd) try: tty.setraw(sys.stdin.fileno()) ch = sys.stdin.read(1) finally: termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) return ch def save_tmp_state_action(self, action): cameraFront_img_name = os.path.join( self.tmp_front_cam_dir, 'F' + str('{:06d}'.format(self.img_count)) + '.jpg') while os.path.exists(cameraFront_img_name): self.img_count += 1 cameraFront_img_name = os.path.join( self.tmp_front_cam_dir, 'F' + str('{:06d}'.format(self.img_count)) + '.jpg') print 'saving image', self.img_count cameraRight_img_name = os.path.join( self.tmp_right_cam_dir, 'R' + str('{:06d}'.format(self.img_count)) + '.jpg') cameraBack_img_name = os.path.join( self.tmp_back_cam_dir, 'B' + str('{:06d}'.format(self.img_count)) + '.jpg') cameraLeft_img_name = os.path.join( self.tmp_left_cam_dir, 'L' + str('{:06d}'.format(self.img_count)) + '.jpg') action_name = os.path.join( self.tmp_action_dir, 'action' + str('{:06d}'.format(self.img_count)) + '.yaml') self.img_count += 1 front, right, back, left = self.env.sense() cv2.imwrite(cameraFront_img_name, front) cv2.imwrite(cameraRight_img_name, right) cv2.imwrite(cameraBack_img_name, back) cv2.imwrite(cameraLeft_img_name, left) with open(action_name, 'w') as f: action_data = { 'act': self.env.valid_actions.index(action), 'act_name:': action } yaml.dump(action_data, f) def get_max_index(self, path): files_lst = os.listdir(path) max_index = -1 for filename in files_lst: fileindex_list = re.findall(r'\d+', filename) if not fileindex_list: continue fileindex = int(fileindex_list[0]) if fileindex >= max_index: max_index = fileindex return max_index 
def store_to_file(self): dest_count = self.get_max_index(self.front_cam_dir) + 1 src_count = self.get_max_index(self.tmp_front_cam_dir) + 1 foldername_list = os.listdir(self.tmp_img_dir) for i in range(src_count): for fld_name in foldername_list: src_folder_dir = os.path.join(self.tmp_img_dir, fld_name) dest_folder_dir = os.path.join(self.img_dir, fld_name) files_lst = os.listdir(src_folder_dir) first_filename = files_lst[0] fileindex_list = re.findall(r'\d+', files_lst[0]) index_start = first_filename.index(fileindex_list[0]) file_prefix = first_filename[:index_start] file_suffix = first_filename[index_start + len(fileindex_list[0]):] dest_file_name = file_prefix + str( '{:06d}'.format(dest_count + i)) + file_suffix src_file_name = file_prefix + str( '{:06d}'.format(i)) + file_suffix if not os.path.exists( os.path.join(src_folder_dir, src_file_name)): continue shutil.move(os.path.join(src_folder_dir, src_file_name), os.path.join(dest_folder_dir, dest_file_name)) def delete_tmp_files(self): folders = os.listdir(self.tmp_img_dir) for folder in folders: filelists = os.listdir(os.path.join(self.tmp_img_dir, folder)) for file in filelists: os.remove(os.path.join(self.tmp_img_dir, folder, file)) print 'deleting files done...' def read_key(self): while not rospy.is_shutdown(): ch = self.getch() if ch == '\x1b': ch = self.getch() if ch == '[': ch = self.getch() if ch == 'A': print 'forward' self.save_tmp_state_action('forward') self.env.act('forward') elif ch == 'B': print 'backward' self.save_tmp_state_action('backward') self.env.act('backward') elif ch == 'C': print 'right_45_backward' self.save_tmp_state_action('right_45_backward') self.env.act('right_45_backward') elif ch == 'D': print 'left_45_backward' self.save_tmp_state_action('left_45_backward') self.env.act('left_45_backward') elif ch == ' ': print 'stop' self.env.act('stop') elif ch == 'a': print 'left_45_forward' self.save_tmp_state_action('left_45_forward') self.env.act('left_45_forward') elif ch == 'd': print 'right_45_forward' self.save_tmp_state_action('right_45_forward') self.env.act('right_45_forward') elif ch == 'r': print 'reset' self.img_count = 0 self.delete_tmp_files() self.env.reset() elif ch == 's': print 'save paths to file' self.save_tmp_state_action('stop') self.store_to_file() elif ch == 'c': print 'clear the previous paths you just ran' self.delete_tmp_files() print 'you can start recording the path from scratch' elif ch == '\x03' or ch == '\x71': # ctrl + c or 'q' rospy.signal_shutdown(ch) sys.exit() else: print ord(ch)
class dqnRunner(): def __init__(self, sess, params, out_dir=None, agentB_sess=None): self.params = params self.sess = sess self.agentB_sess = agentB_sess self.lock = threading.Lock() self.modelStoreIntv = 150 self.bufferStoreIntv = 150 self.annealSteps = params['annealSteps'] self.state_dim = params['pxRes'] if self.params['verbose']: printT("tensorflow version: {}".format(tf.__version__)) # create environment self.env = Environment(sess, params, self) self.numActions = self.env.numActions # load classifier for reward calculation if self.params['classNN'] is not None: with tf.device("/device:CPU:0"): self.rewardClassNet = ClassConvNetEval(self.sess, params) self.env.rewardClassNet = self.rewardClassNet # just gets or resets global_step self.global_step = None variables = tf.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) for v in variables: if "global_step" in v.name: self.global_step = v if self.global_step is None: self.global_step = tf.Variable(0, name='global_step', trainable=False) self.resetGlStep = tf.assign(self.global_step, 0) # load actual dqn self.q = DQN(self.sess, self.params['out_dir'], self.global_step, self.params, self.numActions) self.evalMethods = ["agent", "random"] self.evalMethod = "agent" self.qAgentB = None if (not self.params['agentB'] is None) and self.params['interEval']: self.qAgentB = DQN(self.agentB_sess, self.params['out_dir'], self.global_step, self.params, self.numActions, agentB=True) self.evalMethod = "agentA" self.evalMethods = ["agentA", "random", "fixed", "agentB"] self.sess.as_default() # replay buffer (size and type) if self.params['replaySz'] is None: self.replayBufferSize = 1000000 else: self.replayBufferSize = self.params['replaySz'] self.replay = ReplayBuffer(self.replayBufferSize) # variables for exploration decay self.action_step = tf.Variable(0, name='action_step', trainable=False, dtype=tf.int32) self.increment_ac_step_op = tf.assign(self.action_step, self.action_step + 1) self.global_action_step = tf.Variable(0, name='global_action_step', trainable=False, dtype=tf.int32) self.increment_gac_step_op = tf.assign(self.global_action_step, self.global_action_step + 1) self.episode_step = tf.Variable(0, name='episode_step', trainable=False, dtype=tf.int32) self.increment_ep_step_op = tf.assign(self.episode_step, self.episode_step + 1) self.resetEpStep = tf.assign(self.episode_step, 0) self.resetAcStep = tf.assign(self.action_step, 0) self.resetGAcStep = tf.assign(self.global_action_step, 0) # save state self.saver = tf.train.Saver( max_to_keep=self.params['keepNewestModels']) fn = os.path.join(self.params['out_dir'], "mainLoopTime.txt") self.mainLoopTimeFile = open(fn, "a") fn_ = os.path.join(self.params['out_dir'], "learnLoopTime.txt") self.learnLoopTimeFile = open(fn_, "a") # main function, runs the learning process def run(self): # debugging variables, for tensorboard if self.params['evaluation']: # evaluation episodes, no exploration eval_reward = tf.Variable(0., name="evalReward") eval_reward_op = tf.summary.scalar("Eval-Reward", eval_reward) eval_disc_reward = tf.Variable(0., name="evalDiscReward") eval_disc_reward_op = tf.summary.scalar("Eval-Reward_discounted", eval_disc_reward) eval_stepCount = tf.Variable(0., name="evalStepCount") eval_stepCount_op = tf.summary.scalar("Eval-StepCount", eval_stepCount) eval_sum_vars = [eval_reward, eval_disc_reward, eval_stepCount] eval_sum_op = tf.summary.merge( [eval_reward_op, eval_disc_reward_op, eval_stepCount_op]) # (discounted) reward per episode episode_reward = tf.Variable(0., name="episodeReward") 
episode_reward_op = tf.summary.scalar("Reward", episode_reward) episode_disc_reward = tf.Variable(0., name="episodeDiscReward") episode_disc_reward_op = tf.summary.scalar("Reward_discounted", episode_disc_reward) # average (max q) episode_ave_max_q = tf.Variable(0., name='epsideAvgMaxQ') episode_ave_max_q_op = tf.summary.scalar("Qmax_Value", episode_ave_max_q) # number of steps for episode stepCount = tf.Variable(0., name="stepCount") stepCount_op = tf.summary.scalar("StepCount", stepCount) # number of learning iterations(total number of mini batches so far) global_step_op = tf.summary.scalar("GlobalStep", self.global_step) # current exploration epsilon epsilonVar = tf.Variable(0., name="epsilon") epsilonVar_op = tf.summary.scalar("Epsilon", epsilonVar) summary_vars = [ episode_reward, episode_disc_reward, episode_ave_max_q, stepCount, epsilonVar ] summary_ops = tf.summary.merge([ episode_reward_op, episode_disc_reward_op, episode_ave_max_q_op, stepCount_op, epsilonVar_op ]) self.writer = tf.summary.FileWriter( os.path.join(self.params['out_dir'], "train"), self.sess.graph) self.action_vars = [] self.action_ops = [] for a in range(self.numActions): action = tf.Variable(0., name="qval_action_" + str(a)) action_op = tf.summary.scalar("Q-Value_Action_" + str(a), action) self.action_vars.append(action) self.action_ops.append(action_op) self.action_ops = tf.summary.merge(self.action_ops) # initialize all tensorflow variables # and finalize graph (cannot be modified anymore) self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() # for debugging, variable values before and after if self.params['veryveryverbose']: variables = tf.get_collection(ops.GraphKeys.GLOBAL_VARIABLES, scope="DQN") for v in variables: if v.name.endswith("conv1_2/weights:0"): print(v.name, self.sess.run(v)) # do we want to use pretrained weights for the dqn # from the classifier or a pretrained agent? 
if self.params['resume']: pass elif self.params['useClassNN']: print("restoring dqn net from classNN: {}".format( self.params['classNN'])) if "ckpt" in self.params['classNN']: self.q.saver.restore(self.sess, self.params['classNN']) else: self.q.saver.restore( self.sess, tf.train.latest_checkpoint(self.params['classNN'])) elif self.params['dqnNN'] is not None: print("restoring dqn net from dqnNN: {}".format( self.params['dqnNN'])) if "ckpt" in self.params['dqnNN']: self.q.saver.restore(self.sess, self.params['dqnNN']) else: self.q.saver.restore( self.sess, tf.train.latest_checkpoint(self.params['dqnNN'])) # main network weights are set, now run target init op self.sess.run(self.q.target_nn_init_op) if (self.params['agentB'] is not None) and self.params['interEval']: print("restoring agentB net from {}".format(self.params['agentB'])) if "ckpt" in self.params['agentB']: self.qAgentB.saver.restore(self.agentB_sess, self.params['agentB']) else: self.qAgentB.saver.restore( self.agentB_sess, tf.train.latest_checkpoint(self.params['agentB'])) # for debugging, variable values before and after if self.params['veryveryverbose']: variables = tf.get_collection(ops.GraphKeys.GLOBAL_VARIABLES, scope="DQN") for v in variables: if v.name.endswith("conv1_2/weights:0"): print(v.name, self.sess.run(v)) print("initialize classifier network") if self.params['classNN'] is not None: print("restoring reward class net from classNN: {}".format( self.params['classNN'])) if "ckpt" in self.params['classNN']: self.rewardClassNet.saver.restore(self.sess, self.params['classNN']) else: self.rewardClassNet.saver.restore( self.sess, tf.train.latest_checkpoint(self.params['classNN'])) # load previously trained model if not self.params['resume'] and self.params['loadModel']: if "ckpt" in self.params['loadModel']: self.saver.restore(self.sess, self.params['loadModel']) else: self.saver.restore( self.sess, tf.train.latest_checkpoint(self.params['loadModel'])) printT("Model {} restored.".format(self.params['loadModel'])) # load previously filled replay buffer if not self.params['resume'] and self.params['loadReplay'] is not None: self.replay.load(self.params['loadReplay']) printT("Buffer {} restored.".format(self.params['loadReplay'])) # resume old run if self.params['resume']: self.saver.restore( sess, tf.train.latest_checkpoint( os.path.join(self.params['out_dir'], "models"))) printT("Model {} restored.".format( tf.train.latest_checkpoint( os.path.join(self.params['out_dir'], "models")))) # if not self.params['interEval'] : self.replay.load( os.path.join(self.params['out_dir'], "replayBuffer")) printT("Buffer {} restored.".format(self.params['out_dir'])) else: self.sess.run(self.resetGlStep) # start immediately for interactive test runs try: if os.environ['IS_INTERACTIVE'] == 'true' \ and \ not self.params['sleep']: self.params['startLearning'] = 1 except KeyError: pass # exploration variables self.startEpsilon = self.params['epsilonStart'] self.endEpsilon = self.params['epsilonStop'] self.epsilon = sess.run(epsilonVar) # evaluation/learning/exploration self.evalEp = False self.learning = True self.pauseLearning = False self.pauseExploring = False self.stopLearning = False self.stopExploring = False self.qValFileExpl = open( os.path.join(self.params['out_dir'], "qValExpl.txt"), "a") self.qValFileEval = open( os.path.join(self.params['out_dir'], "qValEval.txt"), "a") self.actionLogFile = open( os.path.join(self.params['out_dir'], "actionLog.txt"), "a") self.episodeLogFile = open( os.path.join(self.params['out_dir'], 
"episodeLog.txt"), "a") self.episodeEvalLogFile = open( os.path.join(self.params['out_dir'], "episodeEvalLog.txt"), "a") # remove stop/termination file if os.path.exists("stop"): os.remove(os.path.join(params['out_dir'], "stop")) # reset if self.params['onlyLearn']: sess.run(self.resetEpStep) sess.run(self.resetAcStep) if self.params['onlyLearn']: self.learn() exit() # multi-threaded # learning and exploration threads act independently? if self.params['async']: t = threading.Thread(target=self.learnWrap) t.daemon = True t.start() if self.params['evaluation']: # evaluate this often evalEpReward = 0 evalEpDiscReward = 0 evalEpStepCount = 0 evalIntv = 25 evalCnt = 40 evalOc = 0 # start exploration self.episode = sess.run(self.episode_step) if self.params['verbose']: printT("start Episode: {}".format(self.episode)) acs = sess.run(self.action_step) if self.params['verbose']: printT("start action step: {}".format(acs)) self.globActStep = acs gacs = sess.run(self.global_action_step) if self.params['verbose']: printT("start global action step: {}".format(gacs)) self.gac = gacs while self.episode < self.params['numEpisodes']: self.episode = sess.run(self.episode_step) sess.run(self.increment_ep_step_op) if self.params['verbose']: print("STARTING NEW EPISODE:" + str(self.episode)) # do we want to explore/gather samples? while self.stopExploring: time.sleep(1) # evaluation episode (no exploration?) if self.params['evaluation'] and self.episode % ( evalIntv + evalCnt) < evalCnt: self.evalEp = True if self.episode % (evalIntv + evalCnt) == 0: if self.params['verbose']: printT("Start Eval Episodes!") evalOc += 1 elif self.params['onlyLearn'] or \ (self.params['limitExploring'] is not None \ and self.replay.size() >= self.params['limitExploring']): self.pauseExploring = True self.evalEp = False else: self.evalEp = False # reset simulation/episode state terminal = False ep_reward = 0 ep_disc_reward = 0 ep_ave_max_q = 0 self.inEpStep = 0 if self.params['interEval']: self.evalMethod = self.evalMethods[self.episode % (len(self.evalMethods))] # reset environment # set start state and allowed actions nextState, allowedActions, terminal = self.env.reset( self.episode, self.evalEp, globActStep=self.globActStep) allowedV = self.calcAllowedActionsVector(allowedActions) if nextState is None: # unable to get state # restart with new episode continue lastTime = time.time() # step forward until terminal while not terminal: if os.path.exists(os.path.join(params['out_dir'], "stop")): self.terminate() if self.params['async']: if not t.isAlive(): printT("alive {}".format(t.isAlive())) printT("Exception in user code:") printT('-' * 60) traceback.print_exc(file=sys.stdout) printT('-' * 60) sys.stdout.flush() t.join(timeout=None) os._exit(-1) # state <- nextstate state = nextState # choose action # random or according to dqn (depending on epsilon) self.inEpStep += 1 if not self.evalEp: sess.run(self.increment_ac_step_op) self.globActStep += 1 sess.run(self.increment_gac_step_op) self.gac += 1 epsStep = max( 0, self.globActStep - (self.params['startLearning'] / 4.0)) tmp_step = min(epsStep, self.annealSteps) self.epsilon = (self.startEpsilon - self.endEpsilon) * \ (1 - tmp_step / self.annealSteps) + \ self.endEpsilon action = self.getActionID(state, allowedV) if self.evalMethod == "fixed": action = self.params['fixedAction'] # We choose a random action in these cases rnm = np.random.rand() if self.params['veryveryverbose']: printT("rnm:" + str(rnm) + " self.epsilon:" + str(self.epsilon) + " |self.params['randomEps']:" + 
str(self.params['randomEps']) + " e:" + str(self.episode)) if (self.evalMethod == "random" ) or (not self.pauseExploring) and (not self.evalEp) and ( self.episode < self.params['randomEps'] or rnm < self.epsilon): if self.params['verbose']: printT("randomly selecting action") action = np.random.choice(allowedActions) if self.params['verbose']: printT( "\nEpisode: {}, Step: {}, Time:{}, Next action (e-greedy {}): {}" .format(self.episode, self.globActStep, time.ctime(), self.epsilon, action)) else: # We let the DQN choose the action if self.params['verbose']: printT("Greedyly selecting action:") if self.params['verbose']: printT( "\nEpisode: {}, Step: {}, Time:{}, Next action: {}" .format(self.episode, self.globActStep, time.ctime(), action)) # perform selected action and # get new state, reward, and termination-info nextState, reward, terminal, terminalP, allowedActions = self.env.act( action, self.episode, self.inEpStep, self.globActStep, self.evalEp) if self.params['veryveryverbose']: print('ACTIONLOG:', str(self.globActStep), str(self.episode), str(self.inEpStep), action, self.evalEp, terminal, terminalP, reward, self.epsilon, self.evalMethod) self.actionLogFile.write( "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( time.time(), str(self.globActStep), str(self.episode), str(self.inEpStep), action, self.evalEp, terminal, terminalP, reward, self.epsilon, self.evalMethod)) self.actionLogFile.flush() allowedV = self.calcAllowedActionsVector(allowedActions) # accumulate episode reward ep_disc_reward += pow(self.params['gamma'], self.inEpStep - 1) * reward ep_reward += reward if (self.evalMethod == "agent" ) and not self.evalEp and not self.pauseExploring: self.insertSamples(np.copy(state), action, reward, terminal, np.copy(nextState), np.copy(allowedV)) # do logging inside of one episode # we do not want to lose any data if self.params['storeModel'] and \ ((self.globActStep+1) % self.modelStoreIntv) == 0: logDqn.logModel(self) if self.params['storeBuffer'] and \ ((self.globActStep+1) % self.bufferStoreIntv) == 0: logDqn.logBuffer(self) # if training/exploration not decoupled, do one learning step if not self.params['async']: for i in range(8): self.learn() sys.stdout.flush() cTime = time.time() usedTime = cTime - lastTime # do we want to pause exploration thread? 
# (to simulate slower stm) if not self.pauseExploring and \ not self.evalEp and \ self.params['sleep'] and \ self.params['async'] and \ (self.replay.size() >= self.params['startLearning']) and \ (self.replay.size() >= self.params['miniBatchSize']): if self.params['sleepA'] is not None: sleepingTime = self.params['sleepA'] - usedTime if sleepingTime > 0: time.sleep(sleepingTime) else: time.sleep(60) cTime = time.time() usedTime = cTime - lastTime lastTime = cTime self.mainLoopTimeFile.write( str(cTime) + " " + str(usedTime) + "\n") self.mainLoopTimeFile.flush() # terminate episode after x steps # even if no good state has been reached if self.inEpStep == self.params['stepsTillTerm']: self.env.switchApproachArea() break # end episode # otherwise store episode summaries and print log if self.evalEp: evalEpReward += ep_reward evalEpDiscReward += ep_disc_reward evalEpStepCount += self.inEpStep if self.episode % (evalIntv + evalCnt) == (evalCnt - 1): summary_str = self.sess.run( eval_sum_op, feed_dict={ eval_sum_vars[0]: evalEpReward / float(evalCnt), eval_sum_vars[1]: evalEpDiscReward / float(evalCnt), eval_sum_vars[2]: evalEpStepCount / float(evalCnt) }) self.writer.add_summary(summary_str, evalOc - 1) evalEpReward = 0.0 evalEpDiscReward = 0.0 evalEpStepCount = 0.0 if self.params['veryveryverbose']: printT("step count-eval: {}".format(self.inEpStep)) if self.params['veryverbose']: printT( 'Time: {} | Reward: {} | Discounted Reward: {} | Eval-Episode {}' .format(time.ctime(), ep_reward, ep_disc_reward, self.episode)) self.episodeEvalLogFile.write( "{}\t{}\t{}\t{}\t{}\t{}\n".format(time.time(), self.episode, ep_reward, ep_disc_reward, self.inEpStep, self.epsilon)) self.episodeEvalLogFile.flush() else: if self.params['evaluation']: et = self.episode - (evalOc * evalCnt) else: et = self.episode summary_str = self.sess.run(summary_ops, feed_dict={ summary_vars[0]: ep_reward, summary_vars[1]: ep_disc_reward, summary_vars[2]: ep_ave_max_q / float(max(self.inEpStep, 1)), summary_vars[3]: self.inEpStep, summary_vars[4]: self.epsilon }) self.writer.add_summary(summary_str, et) self.writer.flush() if self.params['veryveryverbose']: printT("step count: {}".format(self.inEpStep)) if self.params['veryveryverbose']: printT( 'Time: {} | Reward: {} | Discounted Reward: {} | Episode {} | Buffersize: {}' .format(time.ctime(), ep_reward, ep_disc_reward, self.episode, self.replay.size())) self.episodeLogFile.write( "{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( time.time(), self.episode, ep_reward, ep_disc_reward, self.inEpStep, self.epsilon, self.evalMethod)) self.episodeLogFile.flush() # log some stuff if self.params['storeModel'] and \ ((self.episode+1) % self.modelStoreIntv) == 0: logDqn.logModel(self) if self.params['storeBuffer'] and \ ((self.episode+1) % self.bufferStoreIntv) == 0: logDqn.logBuffer(self) statsIntv = 100 sys.stdout.flush() # stop learning after last episode self.learning = False sys.stdout.flush() def terminate(self): printT("terminating...........") sys.stdout.flush() self.logStuff() sys.stdout.flush() printT("EXIT NOW!") sys.stdout.flush() exit(0) def learnWrap(self): try: self.learn() except: printT("learn wrap failed") printT("Exception in user code:") printT('-' * 60) traceback.print_exc(file=sys.stdout) printT('-' * 60) sys.stdout.flush() os._exit(-1) def learn(self): y_batch = np.zeros((self.params['miniBatchSize'], 1)) tmp = np.zeros((self.params['miniBatchSize'], self.numActions)) lastTime = time.time() count = 0 while self.learning: # Throtteling to allow the other thread a chance count += 
1 cTime = time.time() loopTime = cTime - lastTime lastTime = cTime self.learnLoopTimeFile.write( str(cTime) + " " + str(loopTime) + "\n") self.learnLoopTimeFile.flush() if self.stopLearning: time.sleep(5.0) continue if self.replay.size() < self.params['startLearning'] or \ self.replay.size() < self.params['miniBatchSize'] or \ self.evalEp: if self.params['async']: time.sleep(5.0) continue else: return s_batch, a_batch, r_batch, t_batch, ns_batch, allowed_batch = \ self.replay.sample_batch(self.params['miniBatchSize']) if self.params['doubleDQN']: qValsNewState = self.estimate_ddqn(ns_batch, allowed_batch, p=False, mem=tmp) else: qValsNewState = self.predict_target_nn(ns_batch) for i in range(self.params['miniBatchSize']): if t_batch[i]: y_batch[i] = r_batch[i] else: y_batch[i] = r_batch[ i] + self.params['gamma'] * qValsNewState[i] gS, qs, delta = self.update(s_batch, a_batch, y_batch) if self.params['noHardResetDQN']: self.update_targets() elif (gS + 1) % self.params['resetFreq'] == 0: self.update_targets() if not self.params['async']: return if self.params['onlyLearn']: if (gS + 1) % 1000 == 0: logDqn.logModel(self) # Returns vector of length 'self.numActions' containing # Zeros for allowed actions # '-inf' for forbidden actions def calcAllowedActionsVector(self, allowedActions): allowedV = np.zeros(shape=(self.numActions)) allowedV[:] = float("-inf") # init all actions as fobidden for i in allowedActions: allowedV[i] = 0 # mark actions as allowed return allowedV # get action id for max q def getActionID(self, state, allowedActionsV): if self.params['interEval'] and self.evalMethod == 'agentB': if self.params['verbose']: print("PREDICTING WITH AGENTB:") qs = self.qAgentB.run_predict(state) print(qs) else: if self.params['verbose']: print("PREDICTING WITH AGENT:") qs = self.q.run_predict(state) if self.evalEp: self.qValFileEval.write("{}\t{}\t{}\t{}\t{}\t{}\n".format( time.time(), str(self.globActStep), str(self.episode), str(self.inEpStep), qs[0], allowedActionsV)) self.qValFileEval.flush() else: self.qValFileExpl.write("{}\t{}\t{}\t{}\t{}\t{}\n".format( time.time(), str(self.globActStep), str(self.episode), str(self.inEpStep), qs[0], allowedActionsV)) self.qValFileExpl.flush() var_dict = {} for a in range(self.numActions): var_dict[self.action_vars[a]] = qs[0][a] summary_str = self.sess.run(self.action_ops, feed_dict=var_dict) self.writer.add_summary(summary_str, self.gac) self.writer.flush() printT("Q-values:" + str(qs)) qs = qs + allowedActionsV return np.argmax(qs, axis=1)[0] # update dqn main network def update(self, states, actionIDs, targets): step, out, delta, loss = self.q.run_train(states, actionIDs, targets) # network diverged? 
if np.isnan(loss): printT("ABORT: NaN") sys.stdout.flush() os._exit(-1) return step, out, delta # update dqn target network def update_targets(self): self.q.run_update_target_nn() # estimate q values using double dqn # get values of target network for actions where main network is max def estimate_ddqn(self, states, allowedActionsV, p=False, mem=None): qs = self.q.run_predict(states) if p: if self.params['veryveryverbose']: print("allowedActionsV.shape" + str(allowedActionsV.shape)) print("qs.shape" + str(qs.shape)) qs += allowedActionsV # add '-inf' to the q values of forbidden actions if p: if self.params['veryveryverbose']: print(states) print(qs.shape) print(states.shape) printT("qs: {}".format(qs)) maxA = np.argmax(qs, axis=1) qs = self.q.run_predict_target(states) mem.fill(0) mem[np.arange(maxA.size), maxA] = 1 mem = mem * qs mem = np.sum(mem, axis=1) return mem # predict dqns def predict_target_nn(self, states): qs = self.q.run_predict_target(states) return np.max(qs, axis=1) def predict_nn(self, states): qs = self.q.run_predict(states) return np.max(qs, axis=1) # insert samples into replay buffer def insertSamples(self, stateScaled, action, reward, terminal, newStateScaled, allowedActionsV): stateScaled.shape = (stateScaled.shape[1], stateScaled.shape[2], stateScaled.shape[3]) newStateScaled.shape = (newStateScaled.shape[1], newStateScaled.shape[2], newStateScaled.shape[3]) states = (stateScaled, np.rot90(stateScaled, 2), np.fliplr(stateScaled), np.flipud(stateScaled)) newStates = (newStateScaled, np.rot90(newStateScaled, 2), np.fliplr(newStateScaled), np.flipud(newStateScaled)) if (self.params['fullAugmentation']): self.lock.acquire() for i in range(4): for j in range(4): self.replay.add(states[i], action, reward, terminal, allowedActionsV, newStates[j]) self.lock.release() else: self.lock.acquire() self.replay.add(stateScaled, action, reward, terminal, allowedActionsV, newStateScaled) self.replay.add(np.ascontiguousarray(np.rot90(stateScaled, 2)), action, reward, terminal, allowedActionsV, np.ascontiguousarray(np.rot90(newStateScaled, 2))) self.replay.add(np.ascontiguousarray(np.fliplr(stateScaled)), action, reward, terminal, allowedActionsV, np.ascontiguousarray(np.fliplr(newStateScaled))) self.replay.add(np.ascontiguousarray(np.flipud(stateScaled)), action, reward, terminal, allowedActionsV, np.ascontiguousarray(np.flipud(newStateScaled))) self.lock.release() # if we want to stop if buffer is full # or limit exploration if self.pauseExploring == False and \ self.replay.size() == self.replayBufferSize: if self.params['termAtFull']: printT("Buffer FULL!") self.logStuff() self.pauseExploring = True # exit() elif self.pauseExploring == False and \ self.params['limitExploring'] is not None and \ self.replay.size() >= self.params['limitExploring']: if self.params['termAtFull']: printT("Buffer FULL!") self.logStuff() self.pauseExploring = True def logStuff(self): logDqn.logModel(self) logDqn.logBuffer(self)
def train(self):
    """
    Learn your (final) policy.
    Use the evolution strategy algorithm CMA-ES: https://pypi.org/project/cma/

    Possible actions: [0, 1, 2]
    Range of observation (tuple):
        - position: [-1.2, 0.6]
        - velocity: [-0.07, 0.07]
    """
    # 1- Define state features
    # 2- Define search space (to define a policy)
    # 3- Define objective function (for policy evaluation)
    # 4- Use CMA-ES to optimize the objective function
    # 5- Save optimal policy
    generations = 10000
    for i in range(generations):
        solutions = self.es.ask()
        print("iteration:", i, " ;")
        result = []
        for solution in solutions:
            env = Environment()
            # Unpack the flat CMA-ES solution vector into the network weights and biases.
            n_w1 = len(self.w1_flat)
            self.w1_flat = np.array(solution[0:len(self.w1_flat)])
            self.b1_flat = np.array(
                solution[len(self.w1_flat):len(self.w1_flat) + len(self.b1_flat)])
            self.w2_flat = np.array(
                solution[len(self.w1_flat) + len(self.b1_flat):
                         len(self.w1_flat) + len(self.b1_flat) + len(self.w2_flat)])
            self.b2_flat = np.array(
                solution[len(self.w1_flat) + len(self.b1_flat) + len(self.w2_flat):
                         len(self.w1_flat) + len(self.b1_flat) + len(self.w2_flat) + len(self.b2_flat)])

            done = False
            accumulated_reward = 0
            while not done:
                observation = env.observe()
                reward, done = env.act(self.act(observation))
                accumulated_reward += reward
            result.append(-accumulated_reward)

        self.es.tell(solutions, result)
        # result averages 200 when the goal could not be reached; lower is better.
        if np.mean(result) < 100:
            print("Good generation found")
            break

    # Keep the best solution of the final generation and save it.
    index = np.argmin(result)
    weight = solutions[index]
    np.save("weights.npy", weight)
    self.w1_flat = np.array(weight[0:len(self.w1_flat)])
    self.b1_flat = np.array(weight[len(self.w1_flat):len(self.w1_flat) + len(self.b1_flat)])
    self.w2_flat = np.array(
        weight[len(self.w1_flat) + len(self.b1_flat):
               len(self.w1_flat) + len(self.b1_flat) + len(self.w2_flat)])
    self.b2_flat = np.array(
        weight[len(self.w1_flat) + len(self.b1_flat) + len(self.w2_flat):
               len(self.w1_flat) + len(self.b1_flat) + len(self.w2_flat) + len(self.b2_flat)])
class Agent(object): def __init__(self, conf): self.env = Environment(name=conf.env, width=conf.width, height=conf.height, history=conf.history) self.hist = History(self.env) self.mem = ReplayMemory(self.env, capacity=conf.mem_capacity, batch_size=conf.batch_size) self._capa = conf.mem_capacity self._ep_en = conf.ep_end self._ep_st = conf.ep_start self._learn_st = conf.learn_start self._tr_freq = conf.train_freq self._update_freq = conf.update_freq self.q = DQN(self.hist._history, self.env.action_size).type(dtype) self.target_q = DQN(self.hist._history, self.env.action_size).type(dtype) self.optim = torch.optim.RMSprop(self.q.parameters(), lr=0.00025, alpha=0.95, eps=0.01) def train(self): screen, reward, action, terminal = self.env.new_random_game() for _ in range(self.env._history): self.hist.add(screen) num_game, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. ep_rewards, actions = [], [] #for self.step in xrange(50000000): for self.step in tqdm(range(0, 50000000), ncols=70, initial=0): if self.step == self._learn_st: num_game, self.update_count, ep_reward = 0, 0, 0. total_reward, self.total_loss, self.total_q = 0., 0., 0. ep_rewards, actions = [], [] action = self._select_action() screen, reward, terminal = self.env.act(action) self.observe(screen, reward, action, terminal) if terminal: screen, reward, action, terminal = self.env.new_random_game() num_game += 1 ep_rewards.append(ep_reward) ep_reward = 0. else: ep_reward += reward actions.append(action) total_reward += reward if self.step >= self._learn_st: if self.step % 10000 == 10000 - 1: avg_reward = total_reward / 10000. avg_loss = self.total_loss / self.update_count avg_q = self.total_q / self.update_count print '# games: {}, reward: {}, loss: {}, q: {}'.format( num_game, avg_reward, avg_loss, avg_q) num_game = 0 total_reward = 0. self.total_loss = 0. self.total_q = 0. self.update_count = 0 ep_reward = 0. ep_rewards = [] actions = [] def observe(self, screen, reward, action, terminal): reward = max(-1., min(1., reward)) self.hist.add(screen) self.mem.add(screen, reward, action, terminal) if self.step > self._learn_st: if self.step % self._tr_freq == 0: self._q_learning() #print '{} q-learning'.format(self.step) if self.step % self._update_freq == self._update_freq - 1: self.target_q.load_state_dict(self.q.state_dict()) if self.step % (self._update_freq * 10) == (self._update_freq * 10) - 1: torch.save(self.target_q, 'models1/model_{}'.format(self.step)) #print 'update' def play(self, model_path, num_ep=200): self.q = torch.load(model_path) best_reward = 0 best_screen_hist = [] for ep in range(num_ep): print '# episode: {}'.format(ep) screen, reward, action, terminal = self.env.new_random_game( force=True) current_reward = 0 current_screen_hist = [] act_hist = [] current_screen_hist.append(self.env.screen) for _ in range(self.env._history): self.hist.add(screen) cnt = 0 while not terminal: cnt += 1 action = self._select_action(test_mode=True) act_hist.append(action) if cnt > 200: # avoid local maxima ??? same actions....?? 
if np.array(act_hist[-100:]).mean() == act_hist[-1]: action = random.randrange(self.env.action_size) screen, reward, terminal = self.env.act(action, is_train=False) self.hist.add(screen) current_reward += reward #print cnt, action, current_reward, terminal, self.env.lives current_screen_hist.append(self.env.screen) print current_reward print 'count: {}'.format(cnt) if current_reward > best_reward: best_reward = current_reward best_screen_hist = current_screen_hist import imageio print 'best reward: {}'.format(best_reward) imageio.mimsave('movies_play/best_{}.gif'.format(best_reward), best_screen_hist, 'GIF', duration=0.0001) def _q_learning(self): sc_t, actions, rewards, sc_t_1, terminals = self.mem.sample() batch_obs_t = self._to_tensor(sc_t) batch_obs_t_1 = self._to_tensor(sc_t_1, volatile=True) batch_rewards = self._to_tensor(rewards).unsqueeze(1) batch_actions = self._to_tensor( actions, data_type=torch.cuda.LongTensor).unsqueeze(1) batch_terminals = self._to_tensor(1. - terminals).unsqueeze(1) q_dash = self.q(batch_obs_t) #print 'shape_q: {}'.format(q_dash.shape) q_values = self.q(batch_obs_t).gather(1, batch_actions) next_max_q_values = self.target_q(batch_obs_t_1).max(1)[0].unsqueeze(1) next_q_values = batch_terminals * next_max_q_values target_q_values = batch_rewards + (0.99 * next_q_values) target_q_values.volatile = False cri = torch.nn.SmoothL1Loss() self.loss = cri(q_values, target_q_values) self.optim.zero_grad() self.loss.backward() self.optim.step() self.update_count += 1 self.total_q += q_values.data.mean() self.total_loss += self.loss.data.mean() def _select_action(self, test_mode=False): # epsilon greedy policy if not test_mode: ep = self._ep_en + max( 0., (self._ep_st - self._ep_en) * (self._capa - max(0., self.step - self._learn_st)) / self._capa) else: ep = -1. if random.random() < ep: action = random.randrange(self.env.action_size) else: inputs = self._to_tensor(self.hist.get) pred = self.q(inputs.unsqueeze(0)) action = pred.data.max(1)[1][ 0] # ##### actual = pred.data.max(1)[1][0][0] return action def _to_tensor(self, ndarray, volatile=False, data_type=dtype): return Variable(torch.from_numpy(ndarray), volatile=volatile).type(data_type)
class Agent(object):
    def __init__(self, args, sess):
        self.sess = sess
        self.model = Network(sess, phase='test')  # pre-trained MNIST accuracy model
        self.env = MnistEnvironment(self.model, args.env, args.reward_type)
        self.state_size = self.env.state_size
        self.state_shape = self.env.state_shape
        self.action_size = self.env.action_size
        self.a_bound = self.env.a_bound
        self.train_size = len(self.env.train_images)
        self.test_size = len(self.env.test_images)

        self.learning_rate = args.learning_rate
        self.batch_size = args.batch_size
        self.discount_factor = args.discount_factor
        self.epsilon = args.epsilon
        self.epochs = args.epochs
        self._make_std()

        self.num_actor = 256       # N
        self.timesteps = 20        # T
        self.gae_parameter = 0.99  # lambda
        self.num_train = 64        # K

        self.ENV = Environment(self.env, self.state_size, self.action_size)
        self.replay = ReplayMemory(self.state_size, self.batch_size,
                                   self.num_actor * self.timesteps)
        self.ppo = PPO(self.state_size, self.action_size, self.sess, self.learning_rate,
                       self.discount_factor, self.replay, self.epsilon, self.a_bound,
                       self.state_shape)

        self.continue_train = args.continue_train
        self.save_dir = args.save_dir
        self.render_dir = args.render_dir
        self.play_dir = args.play_dir

        # initialize (must run after the whole TensorFlow graph has been built)
        sess.run(tf.global_variables_initializer())

        # load pre-trained mnist model
        self.env.model.checkpoint_load()

        self.saver = tf.train.Saver()

        # continue_train
        if self.continue_train:
            self.load()
        pass

    def _policy_action_bound(self, policy):
        # Rescale the network output into the environment's action bounds.
        a_range = (self.a_bound[:, 1] - self.a_bound[:, 0]) / 2.
        a_mean = (self.a_bound[:, 0] + self.a_bound[:, 1]) / 2.
        return policy * np.transpose(a_range) + np.transpose(a_mean)

    def select_action(self, state, phase):
        if phase == 'step':
            policy = self.sess.run(self.ppo.sampled_action,
                                   feed_dict={
                                       self.ppo.state: state,
                                       self.ppo.std: self.std_step
                                   })[0]
        elif phase == 'test':
            policy = self.sess.run(self.ppo.sampled_action,
                                   feed_dict={
                                       self.ppo.state: state,
                                       self.ppo.std: self.std_test
                                   })[0]
        else:
            raise PhaseError('Phase is not train or test')
        policy = self._policy_action_bound(policy)
        return policy
        pass

    def _get_old_policy(self, state, action):
        a_range = (self.a_bound[:, 1] - self.a_bound[:, 0]) / 2.
        a_mean = (self.a_bound[:, 0] + self.a_bound[:, 1]) / 2.
        action = (action - a_mean) / a_range
        actor_output = self.sess.run(self.ppo.actor,
                                     feed_dict={
                                         self.ppo.state: state,
                                         self.ppo.std: self.std_step
                                     })[0]
        # old_policy = self.sess.run(self.ppo.normal.log_prob(action - actor_output),
        #                            feed_dict={self.ppo.state: state, self.ppo.std: self.std_step})[0]
        old_policy = norm.logpdf(action - actor_output, loc=0, scale=self.std_step[0])
        return old_policy
        pass

    def _make_std(self):
        # make std for step, train and test
        # a_range = self.a_bound[:, 1:] - self.a_bound[:, :1]
        self.std_step = np.ones([1, self.action_size])
        self.std_train = np.ones([self.batch_size, self.action_size])
        # self.std_train = np.multiply(self.std_train, np.transpose(a_range)) / 2.
        self.std_test = self.std_train / 5.
''' def make_delta(self, memory): states, rewards, next_states = [], [], [] for i in range(len(memory)): states.append(memory[i][0]) rewards.append(memory[i][2]) next_states.append(memory[i][3]) current_v = self.sess.run(self.ppo.critic, feed_dict={self.ppo.state: states}) next_v = self.sess.run(self.ppo.critic, feed_dict={self.ppo.state: next_states}) delta = [r_t + self.discount_factor * v_next - v for r_t, v_next, v in zip(rewards, next_v, current_v)] return delta pass def make_gae(self, memory): delta = self.make_delta(memory) gae = copy.deepcopy(delta) for t in reversed(range(len(gae) - 1)): gae[t] = gae[t] + self.gae_parameter * self.discount_factor * gae[t + 1] # memory[t].append(gae[t]) # memory[len(gae)-1].append(gae[len(gae)-1]) # normalize gae gae = np.array(gae).astype(np.float32) gae = (gae - gae.mean()) / (gae.std() + 1e-10) for t in range(len(gae)): memory[t].append(gae[t]) pass ''' def make_gae(self, memory): rewards = [m[2] for m in memory] masks = [m[4] for m in memory] # terminals values = [m[6] for m in memory] returns = np.zeros_like(rewards) advants = np.zeros_like(rewards) running_returns = 0 previous_value = 0 running_advants = 0 for t in reversed(range(0, len(rewards))): running_returns = rewards[ t] + self.discount_factor * running_returns * masks[t] running_tderror = rewards[ t] + self.discount_factor * previous_value * masks[t] - values[ t] running_advants = running_tderror + self.discount_factor * self.gae_parameter * running_advants * masks[ t] returns[t] = running_returns previous_value = values[t] advants[t] = running_advants if len(rewards) > 1: if (advants.std() == [0 for _ in range(len(rewards))]).all(): pass else: advants = (advants - advants.mean()) / advants.std() for t in range(len(rewards)): memory[t].append(advants[t]) memory[t].append(returns[t]) pass def memory_to_replay(self, memory): self.make_gae(memory) for i in range(len(memory)): self.replay.add(memory[i]) pass def train(self): scores, losses, scores2, losses2, idx_list = [], [], [], [], [] count = 0 for e in range(self.epochs): for i, idx in enumerate(np.random.permutation(self.train_size)): count += 1 idx_list.append(idx) if count % self.num_actor == 0: for j in range(self.num_actor): memory, states, rewards, next_states = [], [], [], [] score = 0 state = self.ENV.new_episode(idx_list[j]) for _ in range(self.timesteps): state = np.reshape(state, [1, self.state_size]) action = self.select_action(state, 'step') next_state, reward, terminal = self.ENV.act(action) old_policy = self._get_old_policy(state, action) old_value = self.sess.run( self.ppo.critic, feed_dict={self.ppo.state: state})[0] state = state[0] memory.append([ state, action, reward, next_state, terminal, old_policy, old_value ]) score += reward state = next_state if terminal: break scores.append(score) self.memory_to_replay(memory) for _ in range(self.num_train): losses.append(self.ppo.train_network(self.std_train)) self.replay.clear() scores2.append(np.mean(scores)) losses2.append(np.mean(losses, axis=0)) losses.clear() scores.clear() idx_list.clear() if count % 300 == 0 and count >= self.num_actor: print('epoch', e + 1, 'iter:', f'{count:05d}', ' score:', f'{scores2[-1]:.03f}', ' actor loss', f'{losses2[-1][0]:.03f}', ' critic loss', f'{losses2[-1][1]:.03f}', f'sequence: {self.env.sequence}') if count % 300 == 0 and count >= self.num_actor: self.ENV.render_worker( os.path.join(self.render_dir, f'{count:05d}.png')) if count % 1000 == 0: self.save() pass def play(self): cor_before_lst, cor_after_lst = [], [] for idx in 
range(self.test_size): state = self.ENV.new_episode(idx, phase='test') state = np.reshape(state, [1, self.state_size]) terminal = False score = 0 while not terminal: action = self.select_action(state, 'test') next_state, reward, terminal = self.ENV.act(action) next_state = np.reshape(next_state, [1, self.state_size]) score += reward state = next_state # time.sleep(0.02) if terminal: (cor_before, cor_after) = self.ENV.compare_accuracy() cor_before_lst.append(cor_before) cor_after_lst.append(cor_after) if (idx + 1) % 200 == 0: self.ENV.render_worker( os.path.join(self.play_dir, f'{(idx + 1):04d}.png')) print(f'{(idx + 1):04d} image score: {score}') print('====== NUMBER OF CORRECTION =======') print( f'before: {np.sum(cor_before_lst)}, after: {np.sum(cor_after_lst)}' ) pass def save(self): checkpoint_dir = os.path.join(self.save_dir, 'ckpt') if not os.path.exists(checkpoint_dir): os.mkdir(checkpoint_dir) self.saver.save(self.sess, os.path.join(checkpoint_dir, 'trained_agent')) def load(self): print('=== loading ckeckpoint... ===') checkpoint_dir = os.path.join(self.save_dir, 'ckpt') self.saver.restore(self.sess, os.path.join(checkpoint_dir, 'trained_agent'))
def test(it, pa, pg_resume, pg_learner=None, episode_max_length=200):
    if pg_learner is None:
        pg_learner = policy_network.PGLearner(pa)
        if pg_resume is not None:
            net_handle = open(pg_resume, 'rb')
            net_params = pickle.load(net_handle)
            pg_learner.set_net_params(net_params)

    accuracy = 0.
    # logline = str(it) + '\n'

    for ex in range(pa.num_test_ex):
        env = Environment(ex + pa.num_ex)
        ob = env.current_grid
        print(sudoku.unflatten(ob))
        print('Testing : ')

        acts = []
        probs = []
        rews = []
        final_obs = []
        final_acts = []
        final_rews = []
        indices = []
        json_array = []
        utils = 0
        suffer = []

        for _ in range(pa.episode_max_length):
            act_prob = pg_learner.get_one_act_prob(ob)
            csprob_n = np.cumsum(act_prob)
            a = np.argmax(act_prob)

            # ################ json
            # prev_waiting_tasks = env.waiting_tasks
            # ################
            # plt1 = visualize_state(ob, pa, '/tmp/trajs/'+str(_)+'.jpg')
            # if _ < sum([len(i) for i in workloads[0]]):
            #     print('Agent action: ', a)
            #     man_act = input('Manual Action : ')
            #     if man_act:
            #         a = int(man_act)

            ob, rews, mistake, done = env.act(a)
            acts.append(a)
            probs.append(act_prob)
            final_rews.append(rews)
            if done:
                break

        # ############## logs
        if sum(final_rews) == 0:
            accuracy += 1
        if it % 20 == 0:
            print('Test Actions: ', acts)
            # print(probs)
            print('Reward : ', sum(final_rews))
            print('Full Reward: ', final_rews)

    print('Accuracy:', accuracy / pa.num_test_ex)