import argparse
from collections import namedtuple
import tensorflow as tf
# ALEEnvironment, Hdqn, Agent, and str2bool come from the project's own modules


def main():
    # Initialization for TensorBoard
    session = tf.Session()
    tensorVar = tf.Variable(0)
    tf.summary.scalar("reward", tensorVar)
    sumWriterIntrinsic = tf.summary.FileWriter('./reward/intrinsic')
    sumWriterExternal = tf.summary.FileWriter('./reward/external')
    merged = tf.summary.merge_all()
    # tf.initialize_all_variables() is deprecated; use the current initializer
    session.run(tf.global_variables_initializer())

    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down',
                     'jump right', 'jump left']
    goalExplain = ['top left door', 'top right door', 'middle ladder',
                   'lower left ladder', 'lower right ladder', 'key']
    stepCount = 0

    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=False)
    parser.add_argument("--frame_skip", default=80)
    #parser.add_argument("--repeat_action_probability", default=0.25)
    parser.add_argument("--color_averaging", default=False)
    parser.add_argument("--random_seed")
    #parser.add_argument("--record_screen_path", default="./record")
    #parser.add_argument("--record_sound_filename")
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    args = parser.parse_args()

    ActorExperience = namedtuple(
        "ActorExperience",
        ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple(
        "MetaExperience",
        ["state", "goal", "reward", "next_state", "done"])

    env = ALEEnvironment(args.game, args)
    hdqn = Hdqn()
    agent = Agent(hdqn, range(8), range(6))

    # Set goalNum to a hardcoded subgoal
    goalNum = 0
    intrinsicRewardMonitor = 0
    externalRewardMonitor = 0

    env.act(12)
    # for i in range(100):
    #     env.act(0)
    #     print(env.isTerminal())
    print(env.isTerminal())
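# NOTE: `str2bool` is used above (and in the other mains below) as an argparse
# `type=` converter but is never defined in these snippets. A minimal sketch of
# such a helper (an assumption, not the original implementation):
def str2bool(v):
    # Accept common string spellings of booleans from the command line.
    if isinstance(v, bool):
        return v
    return str(v).lower() in ('yes', 'true', 't', '1')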
import random
import sys
# utils, ALEEnvironment, GymEnvironment, RobotEnvironment, DQN, Agent,
# PlayerTwo, and Statistics come from the project's own modules


def main():
    # Process arguments
    args = utils.parse_args()

    # Use random seed from argument
    if args.random_seed:
        random.seed(args.random_seed)

    # Instantiate environment class
    if args.environment == "ale":
        env = ALEEnvironment(args.game, args)
    elif args.environment == "gym":
        env = GymEnvironment(args.game, args)
    elif args.environment == "robot":
        env = RobotEnvironment(args.game, args)
    else:
        assert False, "Unknown environment: " + args.environment

    # Instantiate DQN
    action_dim = env.action_dim()
    state_dim = env.state_dim()
    net = DQN(state_dim, action_dim, args)

    # Load weights before starting training
    if args.load_weights:
        filepath = args.load_weights
        net.load(filepath)

    # Instantiate agent
    agent = Agent(env, net, args)

    # Start statistics
    stats = Statistics(agent, agent.net, agent.net.memory, env, args)

    # Play game with two players (user and agent)
    if args.two_player:
        player_b = PlayerTwo(args)
        env.set_mode('test')
        stats.reset()
        agent.play_two_players(player_b)
        stats.write(0, "2player")
        sys.exit()

    # Play agent
    if args.play_games > 0:
        env.set_mode('test')
        stats.reset()
        for _ in range(args.play_games):
            agent.play()
        stats.write(0, "play")
        sys.exit()

    # Populate replay memory with random steps
    if args.random_steps:
        env.set_mode('test')
        stats.reset()
        agent.play_random(args.random_steps)
        stats.write(0, "random")

    for epoch in range(args.start_epoch, args.epochs):
        # Train agent
        if args.train_steps:
            env.set_mode('train')
            stats.reset()
            agent.train(args.train_steps)
            stats.write(epoch + 1, "train")

        # Save weights after every epoch
        if args.save_weights_prefix:
            filepath = args.save_weights_prefix + "_%d.h5" % (epoch + 1)
            net.save(filepath)

        # Test agent
        if args.test_steps:
            env.set_mode('test')
            stats.reset()
            agent.test(args.test_steps)
            stats.write(epoch + 1, "test")

    # Stop statistics
    stats.close()
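# NOTE: `utils.parse_args` is not shown in this file. A minimal sketch covering
# only the flags the main() above actually reads; the names match the usage,
# but the defaults and help strings are assumptions:
def parse_args():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", choices=["ale", "gym", "robot"], default="ale")
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--random_seed", type=int)
    parser.add_argument("--load_weights", help="Weight file to load before training.")
    parser.add_argument("--save_weights_prefix", help="Prefix for per-epoch weight files.")
    parser.add_argument("--two_player", action="store_true")
    parser.add_argument("--play_games", type=int, default=0)
    parser.add_argument("--random_steps", type=int, default=0)
    parser.add_argument("--train_steps", type=int, default=0)
    parser.add_argument("--test_steps", type=int, default=0)
    parser.add_argument("--start_epoch", type=int, default=0)
    parser.add_argument("--epochs", type=int, default=100)
    return parser.parse_args()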
help="Random seed for repeatable experiments.") comarg.add_argument("--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Log level.") args = parser.parse_args() logger = logging.getLogger() logger.setLevel(args.log_level) if args.random_seed: random.seed(args.random_seed) # instantiate classes if args.environment == 'ale': env = ALEEnvironment(args.game, args) logger.info("Using ALE Environment") elif args.environment == 'gym': # logger does not work with this line #logger.handlers.pop() env = GymEnvironment(args.game, args) logger.info("Using Gym Environment") else: assert False, "Unknown environment" + args.environment mem = ReplayMemory(args.replay_size, args) net = DeepQNetwork(env.numActions(), args) agent = Agent(env, mem, net, args) stats = Statistics(agent, net, mem, env, args) if args.load_weights:
mainarg.add_argument("--csv_file", help="Write training progress to this file.") comarg = parser.add_argument_group('Common') comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.") comarg.add_argument("--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Log level.") args = parser.parse_args() logger = logging.getLogger() logger.setLevel(args.log_level) if args.random_seed: random.seed(args.random_seed) # instantiate classes if args.environment == 'ale': env = ALEEnvironment(args.game, args) logger.info("Using ALE Environment") elif args.environment == 'gym': logger.handlers.pop() env = GymEnvironment(args.game, args) logger.info("Using Gym Environment") else: assert False, "Unknown environment" + args.environment mem = ReplayMemory(args.replay_size, args) net = DeepQNetwork(env.numActions(), args) agent = Agent(env, mem, net, args) stats = Statistics(agent, net, mem, env, args) if args.load_weights: logger.info("Loading weights from %s" % args.load_weights)
help="Random seed for repeatable experiments.") comarg.add_argument("--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Log level.") args = parser.parse_args() logger = logging.getLogger() logger.setLevel(args.log_level) if args.random_seed: random.seed(args.random_seed) # instantiate classes if args.environment == 'ale': env = ALEEnvironment(args.game, args) logger.info("Using ALE Environment") elif args.environment == 'gym': logger.handlers.pop() env = GymEnvironment(args.game, args) logger.info("Using Gym Environment") else: assert False, "Unknown environment" + args.environment mem = ReplayMemory(args.replay_size, args) net = DQN(env.numActions(), args) agent = Agent(env, mem, net, args) stats = Statistics(agent, net, mem, env, args) if args.load_weights: logger.info("Loading weights from %s" % args.load_weights)
mainarg.add_argument("--csv_file",
                     help="Write training progress to this file.")
comarg = parser.add_argument_group('Common')
comarg.add_argument("--random_seed", type=int,
                    help="Random seed for repeatable experiments.")
comarg.add_argument("--log_level",
                    choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
                    default="INFO", help="Log level.")
args = parser.parse_args()

logger = logging.getLogger()
logger.setLevel(args.log_level)

if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
    env = ALEEnvironment(args.game, args)
    logger.info("Using ALE Environment")
elif args.environment == 'gym':
    logger.handlers.pop()
    env = GymEnvironment(args.game, args)
    logger.info("Using Gym Environment")
else:
    assert False, "Unknown environment: " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DQN(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
import time
import numpy as np
import tensorflow as tf
from tqdm import tqdm
# GraphQAgent and show_obs come from the project's own modules


def run_agent(args):
    # Launch the graph
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Set up training variables
        training_iters = args.training_iters
        display_step = args.display_step
        test_step = args.test_step
        test_count = args.test_count
        tests_done = 0
        test_results = []

        # Stats for display
        ep_rewards = []
        ep_reward_last = 0
        qs = []
        q_last = 0
        avr_ep_reward = max_ep_reward = avr_q = 0.0

        # Set precision for printing numpy arrays, useful for debugging
        #np.set_printoptions(threshold='nan', precision=3, suppress=True)

        mode = args.model

        # Create environment
        if args.env_type == 'ALE':
            from environment import ALEEnvironment  # carried over from the original file; currently unused
            env = ALEEnvironment(args.rom)
            if mode is None:
                mode = 'DQN'
            args.num_actions = env.numActions()
        elif args.env_type == 'gym':
            import gym
            try:
                import gym_vgdl  # This can be found on my github if you want to use it.
            except ImportError:
                pass
            env = gym.make(args.env)
            if mode is None:
                shape = env.observation_space.shape
                if len(shape) == 3:
                    mode = 'DQN'
                elif shape[0] is None:
                    mode = 'object'
                else:
                    mode = 'vanilla'
            args.num_actions = env.action_space.n  # only works with discrete action spaces

        # Set agent variables
        if mode == 'DQN':
            args.model = 'CNN'
            args.preprocessor = 'deepmind'
            args.obs_size = [84, 84]
            args.history_len = 4
        elif mode == 'image':
            args.model = 'CNN'
            args.preprocessor = 'grayscale'
            args.obs_size = list(env.observation_space.shape)[:2]
            args.history_len = 2
        elif mode == 'object':
            args.model = 'object'
            args.preprocessor = 'default'
            args.obs_size = list(env.observation_space.shape)
            args.history_len = 0
        elif mode == 'vanilla':
            args.model = 'nn'
            args.preprocessor = 'default'
            args.obs_size = list(env.observation_space.shape)
            args.history_len = 0

        # Create agent
        agent = GraphQAgent(sess, args)
        #agent = DQNAgent.DQNAgent(sess, args)

        # Initialize all tensorflow variables
        sess.run(tf.global_variables_initializer())

        # Keep training until max iterations reached
        # Start agent
        state = env.reset()
        agent.Reset(state)
        rewards = []
        terminal = False
        cluster_flag = False  # fix: define before the first terminal check
        # Sized per display step: a full-length array was too large and slow to copy
        aver = np.zeros(int(training_iters / display_step) + 50)
        maxeq = np.zeros(int(training_iters / display_step) + 50)
        savename = args.save_path + 'results/GBIL_3_' + args.riqi + args.env  # riqi: "date" (pinyin)
        print(savename)
        iterationa = 0

        for step in tqdm(range(training_iters), ncols=80):
            #env.render()
            #print("step", step)

            # Act, and add
            action, value = agent.GetAction_wq(step)
            state, reward, terminal, info = env.step(action)
            agent.Update(action, reward, state, terminal)
            #print("len(agent.trajectory_embeddings)", len(agent.trajectory_embeddings),
            #      "len(trj_obs)", len(agent.trajectory_observations))

            # Bookkeeping
            rewards.append(reward)
            qs.append(value)

            if terminal:
                # Bookkeeping
                ep_rewards.append(np.sum(rewards))
                rewards = []

                # Reset agent and environment
                # This should only be used once the memory is full:
                #if agent.G.Graphisfull():
                #    C_time_a = time.time()
                #    #agent.G.GetKeyPointByDegree()
                if cluster_flag:
                    # for x in range(len(agent.keypoint.obss)):
                    #     trj_x = agent.keypoint.obss[x]
                    #     for y in range(len(trj_x)):
                    #         node_y = trj_x[y]
                    #         #print("nodey", node_y)
                    #         plt.imshow(node_y)
                    #         plt.savefig(args.save_path + 'results/temp/' +
                    #                     str(step) + '_' + str(x) + '_' + str(y) + '.png')
                    #         plt.close()
                    # Rebuild the graph once per display_step:
                    #agent.G.ReconstructGraph(agent.keypoint.trjs2set())
                    keypoints, keyobss = agent.keypoint.get_keypoint()
                    agent.G.ReconstructGraph(keypoints)
                    show_obs(keyobss, args.save_path + 'results/temp/',
                             "GBIL_3_" + args.riqi + args.env + "_" + str(step) + "_")
                    #agent.G.GraphCluster(args.num_center)
                    cluster_flag = False
                    # These hyperparameters control how tight the clustering is;
                    # both are distances between feature vectors. The first must
                    # be larger than the second: pairs closer than the second
                    # threshold are merged into one cluster, pairs between the
                    # two thresholds may fall into several clusters, and the
                    # farther apart the two values are, the more clusters you get.
                    #C_time_b = time.time()
                    #print("cluster time using ", C_time_b - C_time_a)
                state = env.reset()
                agent.Reset(state)

            # Display statistics
            if step % display_step == 0:
                cluster_flag = True
                num_eps = len(ep_rewards[ep_reward_last:])
                if num_eps != 0:
                    avr_ep_reward = np.mean(ep_rewards[ep_reward_last:])
                    max_ep_reward = np.max(ep_rewards[ep_reward_last:])
                    avr_q = np.mean(qs[q_last:])
                    q_last = len(qs)
                    ep_reward_last = len(ep_rewards)
                dict_entries = 0  #agent.DND.tot_capacity()
                aver[iterationa] = avr_ep_reward
                maxeq[iterationa] = max_ep_reward
                iterationa = iterationa + 1
                np.save(savename + 'aver.npy', aver)
                np.save(savename + 'maxeq.npy', maxeq)
                tqdm.write("{}, {:>7}/{}it | {:3n} episodes, "
                           .format(time.strftime("%H:%M:%S"), step, training_iters, num_eps)
                           + "q: {:4.3f}, avr_ep_r: {:4.1f}, max_ep_r: {:4.1f}, "
                             "epsilon: {:4.3f}, entries: {}"
                           .format(avr_q, avr_ep_reward, max_ep_reward,
                                   agent.epsilon, dict_entries))

        # Continue until end of episode
        step = training_iters
        while not terminal:
            # Act, and add
            action, value = agent.GetAction_wq(step)
            state, reward, terminal, info = env.step(action)
            agent.Update(action, reward, state, terminal)
            step += 1
import time
import numpy as np
import tensorflow as tf
from tqdm import tqdm
# GraphQAgent comes from the project's own modules


def run_agent(args):
    # Launch the graph
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Set up training variables
        training_iters = args.training_iters
        display_step = args.display_step
        test_step = args.test_step
        test_count = args.test_count
        tests_done = 0
        test_results = []

        # Stats for display
        ep_rewards = []
        ep_reward_last = 0
        qs = []
        q_last = 0
        avr_ep_reward = max_ep_reward = avr_q = 0.0

        # Set precision for printing numpy arrays, useful for debugging
        #np.set_printoptions(threshold='nan', precision=3, suppress=True)

        mode = args.model

        # Create environment
        if args.env_type == 'ALE':
            from environment import ALEEnvironment  # carried over from the original file; currently unused
            env = ALEEnvironment(args.rom)
            if mode is None:
                mode = 'DQN'
            args.num_actions = env.numActions()
        elif args.env_type == 'gym':
            import gym
            try:
                import gym_vgdl  # This can be found on my github if you want to use it.
            except ImportError:
                pass
            env = gym.make(args.env)
            if mode is None:
                shape = env.observation_space.shape
                if len(shape) == 3:
                    mode = 'DQN'
                elif shape[0] is None:
                    mode = 'object'
                else:
                    mode = 'vanilla'
            args.num_actions = env.action_space.n  # only works with discrete action spaces

        # Set agent variables
        if mode == 'DQN':
            args.model = 'CNN'
            args.preprocessor = 'deepmind'
            args.obs_size = [84, 84]
            args.history_len = 4
        elif mode == 'image':
            args.model = 'CNN'
            args.preprocessor = 'grayscale'
            args.obs_size = list(env.observation_space.shape)[:2]
            args.history_len = 2
        elif mode == 'object':
            args.model = 'object'
            args.preprocessor = 'default'
            args.obs_size = list(env.observation_space.shape)
            args.history_len = 0
        elif mode == 'vanilla':
            args.model = 'nn'
            args.preprocessor = 'default'
            args.obs_size = list(env.observation_space.shape)
            args.history_len = 0

        # Create agent
        agent = GraphQAgent.GraphQAgent(sess, args)
        #agent = DQNAgent.DQNAgent(sess, args)

        # Initialize all tensorflow variables
        sess.run(tf.global_variables_initializer())

        # Keep training until max iterations reached
        # Start agent
        state = env.reset()
        agent.Reset(state)
        rewards = []
        terminal = False
        aver = np.zeros(training_iters)
        maxeq = np.zeros(training_iters)
        savename = args.save_path + 'results/GQ' + args.riqi + args.env  # riqi: "date" (pinyin)
        print(savename)
        iterationa = 0

        for step in tqdm(range(training_iters), ncols=80):
            #env.render()

            # Act, and add
            action, value = agent.GetAction_wq()
            state, reward, terminal, info = env.step(action)
            agent.Update(action, reward, state, terminal)

            # Bookkeeping
            rewards.append(reward)
            qs.append(value)

            if terminal:
                # Bookkeeping
                ep_rewards.append(np.sum(rewards))
                rewards = []

                # Reset agent and environment
                state = env.reset()
                agent.Reset(state)

            # Display statistics
            if step % display_step == 0:
                num_eps = len(ep_rewards[ep_reward_last:])
                if num_eps != 0:
                    avr_ep_reward = np.mean(ep_rewards[ep_reward_last:])
                    max_ep_reward = np.max(ep_rewards[ep_reward_last:])
                    avr_q = np.mean(qs[q_last:])
                    q_last = len(qs)
                    ep_reward_last = len(ep_rewards)
                dict_entries = 0  #agent.DND.tot_capacity()
                aver[iterationa] = avr_ep_reward
                maxeq[iterationa] = max_ep_reward
                iterationa = iterationa + 1
                np.save(savename + 'aver.npy', aver)
                np.save(savename + 'maxeq.npy', maxeq)
                tqdm.write("{}, {:>7}/{}it | {:3n} episodes, "
                           .format(time.strftime("%H:%M:%S"), step, training_iters, num_eps)
                           + "q: {:4.3f}, avr_ep_r: {:4.1f}, max_ep_r: {:4.1f}, "
                             "epsilon: {:4.3f}, entries: {}"
                           .format(avr_q, avr_ep_reward, max_ep_reward,
                                   agent.epsilon, dict_entries))

        # Continue until end of episode
        step = training_iters
        while not terminal:
            # Act, and add
            action, value = agent.GetAction_wq()
            state, reward, terminal, info = env.step(action)
            agent.Update(action, reward, state, terminal)
            step += 1
import time
import numpy as np
import tensorflow as tf
from tqdm import tqdm
# NECAgent and test_agent come from the project's own modules


def run_agent(args):
    # Launch the graph
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Set up training variables
        training_iters = args.training_iters
        display_step = args.display_step
        test_step = args.test_step
        test_count = args.test_count
        tests_done = 0
        test_results = []

        # Stats for display
        ep_rewards = []
        ep_reward_last = 0
        qs = []
        q_last = 0
        avr_ep_reward = max_ep_reward = avr_q = 0.0

        # Set precision for printing numpy arrays, useful for debugging
        #np.set_printoptions(threshold='nan', precision=3, suppress=True)

        mode = args.model

        # Create environment
        if args.env_type == 'ALE':
            from environment import ALEEnvironment
            env = ALEEnvironment(args.rom)
            if mode is None:
                mode = 'DQN'
            args.num_actions = env.numActions()
        elif args.env_type == 'gym':
            import gym
            try:
                import gym_vgdl  # This can be found on my github if you want to use it.
            except ImportError:
                pass
            env = gym.make(args.env)
            if mode is None:
                shape = env.observation_space.shape
                if len(shape) == 3:
                    mode = 'DQN'
                elif shape[0] is None:
                    mode = 'object'
                else:
                    mode = 'vanilla'
            args.num_actions = env.action_space.n  # only works with discrete action spaces

        # Set agent variables
        if mode == 'DQN':
            args.model = 'CNN'
            args.preprocessor = 'deepmind'
            args.obs_size = [84, 84]
            args.history_len = 4
        elif mode == 'image':
            args.model = 'CNN'
            args.preprocessor = 'grayscale'
            args.obs_size = list(env.observation_space.shape)[:2]
            args.history_len = 2
        elif mode == 'object':
            args.model = 'object'
            args.preprocessor = 'default'
            args.obs_size = list(env.observation_space.shape)
            args.history_len = 0
        elif mode == 'vanilla':
            args.model = 'nn'
            args.preprocessor = 'default'
            args.obs_size = list(env.observation_space.shape)
            args.history_len = 0

        # Create agent
        agent = NECAgent.NECAgent(sess, args)
        #agent = DQNAgent.DQNAgent(sess, args)

        # Initialize all tensorflow variables
        sess.run(tf.global_variables_initializer())

        # Keep training until max iterations reached
        # Start agent
        state = env.reset()
        agent.Reset(state)
        rewards = []
        terminal = False

        for step in tqdm(range(training_iters), ncols=80):
            #env.render()

            # Act, and add
            action, value = agent.GetAction()
            state, reward, terminal, info = env.step(action)
            agent.Update(action, reward, state, terminal)

            # Bookkeeping
            rewards.append(reward)
            qs.append(value)

            if terminal:
                # Bookkeeping
                ep_rewards.append(np.sum(rewards))
                rewards = []

                # Test periodically
                if step >= tests_done * test_step:
                    R_s = []
                    for i in tqdm(range(test_count), ncols=50,
                                  bar_format='Testing: |{bar}| {n_fmt}/{total_fmt}'):
                        R = test_agent(agent, env)
                        R_s.append(R)
                    tqdm.write("Tests: {}".format(R_s))
                    tests_done += 1
                    test_results.append({
                        'step': step,
                        'scores': R_s,
                        'average': np.mean(R_s),
                        'max': np.max(R_s)
                    })

                    # Save to file
                    summary = {'params': vars(args), 'tests': test_results}
                    if args.save_file is not None:
                        np.save(args.save_file, summary)

                # Reset agent and environment
                state = env.reset()
                agent.Reset(state)

            # Display statistics
            if step % display_step == 0:
                num_eps = len(ep_rewards[ep_reward_last:])
                if num_eps != 0:
                    avr_ep_reward = np.mean(ep_rewards[ep_reward_last:])
                    max_ep_reward = np.max(ep_rewards[ep_reward_last:])
                    avr_q = np.mean(qs[q_last:])
                    q_last = len(qs)
                    ep_reward_last = len(ep_rewards)
                dict_entries = 0  #agent.DND.tot_capacity()
                tqdm.write("{}, {:>7}/{}it | {:3n} episodes, "
                           .format(time.strftime("%H:%M:%S"), step, training_iters, num_eps)
                           + "q: {:4.3f}, avr_ep_r: {:4.1f}, max_ep_r: {:4.1f}, "
                             "epsilon: {:4.3f}, entries: {}"
                           .format(avr_q, avr_ep_reward, max_ep_reward,
                                   agent.epsilon, dict_entries))

        # Continue until end of episode
        step = training_iters
        while not terminal:
            # Act, and add
            action, value = agent.GetAction()
            state, reward, terminal, info = env.step(action)
            agent.Update(action, reward, state, terminal)
            step += 1

        # Final test
        R_s = []
        for i in tqdm(range(test_count), ncols=50,
                      bar_format='Testing: |{bar}| {n_fmt}/{total_fmt}'):
            R = test_agent(agent, env)
            R_s.append(R)
        tqdm.write("Tests: {}".format(R_s))
        tests_done += 1
        test_results.append({
            'step': step,
            'scores': R_s,
            'average': np.mean(R_s),
            'max': np.max(R_s)
        })

        # Save to file
        summary = {'params': vars(args), 'tests': test_results}
        if args.save_file is not None:
            np.save(args.save_file, summary)
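# NOTE: `test_agent` is called above but not defined in this snippet. A minimal
# sketch under the same env/agent API (env.reset/env.step, agent.Reset/
# agent.GetAction/agent.Update); this is an assumption, not the original:
def test_agent(agent, env, max_steps=10000):
    # Run one episode and return its cumulative reward. A real test routine
    # would likely also disable exploration and learning updates.
    state = env.reset()
    agent.Reset(state)
    total_reward = 0.0
    terminal = False
    steps = 0
    while not terminal and steps < max_steps:
        action, _ = agent.GetAction()
        state, reward, terminal, _ = env.step(action)
        agent.Update(action, reward, state, terminal)
        total_reward += reward
        steps += 1
    return total_reward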
import argparse
from collections import deque, namedtuple
import numpy as np
import tensorflow as tf
from PIL import Image
# ALEEnvironment, Hdqn, Agent, str2bool, and maxStepsPerEpisode come from the
# project's own modules


def main():
    # Initialization for TensorBoard
    session = tf.Session()
    tensorVar = tf.Variable(0)
    tensorVarLoss = tf.Variable(0, dtype="float32")
    tensorVarMiddle = tf.Variable(0, dtype="float32")
    tensorVarLowerRight = tf.Variable(0, dtype="float32")
    tensorVarLowerLeft = tf.Variable(0, dtype="float32")
    tensorVarKey = tf.Variable(0, dtype="float32")
    tf.summary.scalar("reward", tensorVar)
    tf.summary.scalar("loss", tensorVarLoss)
    tf.summary.scalar("middle ladder", tensorVarMiddle)
    tf.summary.scalar("lower right ladder", tensorVarLowerRight)
    tf.summary.scalar("lower left ladder", tensorVarLowerLeft)
    tf.summary.scalar("key", tensorVarKey)
    sumWriterIntrinsic = tf.summary.FileWriter('./reward/intrinsic')
    sumWriterLoss = tf.summary.FileWriter('./reward/loss')
    sumWriterExternal = tf.summary.FileWriter('./reward/external')
    sumWriterMiddle = tf.summary.FileWriter('./reward/middleLadder')
    sumWriterLowerRight = tf.summary.FileWriter('./reward/lowerRightLadder')
    sumWriterLowerLeft = tf.summary.FileWriter('./reward/lowerLeftLadder')
    sumWriterKey = tf.summary.FileWriter('./reward/key')
    merged = tf.summary.merge_all()
    # tf.initialize_all_variables() is deprecated; use the current initializer
    session.run(tf.global_variables_initializer())

    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down',
                     'jump right', 'jump left']
    goalExplain = ['lower right ladder', 'lower left ladder', 'key']
    stepCount = 0
    # A deque in Python is a linked list; a list is backed by an array
    goalSuccessTrack = [deque(), deque(), deque(), deque()]
    goalSuccessCount = [0, 0, 0, 0]

    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=False)
    parser.add_argument("--frame_skip", default=4)
    parser.add_argument("--color_averaging", default=False)
    parser.add_argument("--random_seed")
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    parser.add_argument("--load_weight", default=False)
    parser.add_argument("--use_sparse_reward", type=str2bool, default=False)
    args = parser.parse_args()

    ActorExperience = namedtuple(
        "ActorExperience",
        ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple(
        "MetaExperience",
        ["state", "goal", "reward", "next_state", "done"])

    annealComplete = False
    saveExternalRewardScreen = True
    env = ALEEnvironment(args.game, args)
    hdqn = Hdqn()

    # Initialize network and agent
    if args.load_weight:
        defaultRandomPlaySteps = 200000
        print('loading weight')
        hdqn.loadWeight()
        print('loading weight complete')
        agent = Agent(hdqn, range(8), range(3))
    else:
        defaultRandomPlaySteps = 200000
        agent = Agent(hdqn, range(8), range(3))

    intrinsicRewardMonitor = 0
    externalRewardMonitor = 0

    for episode in range(80000):
        print("\n\n### EPISODE " + str(episode) + "###")
        print("\n\n### STEPS " + str(stepCount) + "###")

        # Restart the game
        env.restart()
        episodeSteps = 0
        # Set goalNum to a hardcoded subgoal
        lastGoal = -1

        while not env.isGameOver() and episodeSteps <= maxStepsPerEpisode:
            totalExternalRewards = 0  # NOT SURE IF IT SHOULD BE CLEARED HERE!
            stateLastGoal = env.getStackedState()
            # nextState = stateLastGoal
            goal = agent.selectGoal(stateLastGoal)
            if len(goalSuccessTrack[goal]) > 100:
                firstElement = goalSuccessTrack[goal].popleft()
                goalSuccessCount[goal] -= firstElement
            print('predicted subgoal is: ' + goalExplain[goal])

            while (not env.isTerminal() and not env.goalReached(goal)
                   and episodeSteps <= maxStepsPerEpisode):
                state = env.getStackedState()
                action = agent.selectMove(state, goal)
                externalRewards = env.act(actionMap[action])
                if externalRewards != 0:
                    externalRewards = 1.0

                # Debugging
                if saveExternalRewardScreen and externalRewards == 100:
                    im = Image.fromarray(np.squeeze(env.getState()))
                    im.save('keyGet.jpeg')
                    saveExternalRewardScreen = False

                stepCount += 1
                episodeSteps += 1

                # Save the model every 50000 steps
                if stepCount % 50000 == 0:
                    hdqn.saveWeight(stepCount)

                nextState = env.getStackedState()
                distanceReward = env.distanceReward(lastGoal, goal)
                # Only assign intrinsic reward if the goal is reached and it
                # has not been reached previously
                intrinsicRewards = agent.criticize(
                    env.goalNotReachedBefore(goal) and env.goalReached(goal),
                    actionMap[action], env.isTerminal(), distanceReward,
                    args.use_sparse_reward)

                # Store transition and update network params
                exp = ActorExperience(state, goal, action, intrinsicRewards,
                                      nextState, env.isTerminal())
                agent.store(exp, meta=False)

                # Do not update the network during random play
                if stepCount >= defaultRandomPlaySteps:
                    if stepCount == defaultRandomPlaySteps:
                        print('start training (random walk ends)')
                    if stepCount % 4 == 0:
                        loss = agent.update(stepCount, meta=False)
                        agent.update(stepCount, meta=True)

                # Update external reward for D2
                totalExternalRewards += externalRewards + intrinsicRewards

                # Update data for visualization
                externalRewardMonitor += externalRewards
                intrinsicRewardMonitor += intrinsicRewards

            # Store meta controller's experience
            exp = MetaExperience(stateLastGoal, goal, totalExternalRewards,
                                 nextState, env.isTerminal())
            agent.store(exp, meta=True)

            # Update goal
            if episodeSteps > maxStepsPerEpisode:
                goalSuccessTrack[goal].append(0)
                break
            elif env.goalReached(goal):
                goalSuccessTrack[goal].append(1)
                goalSuccessCount[goal] += 1
                print('goal reached: ' + goalExplain[goal])

                # Training visualization
                intrinsicPlot = session.run(
                    merged, feed_dict={tensorVar: intrinsicRewardMonitor})
                sumWriterIntrinsic.add_summary(intrinsicPlot, stepCount)
                sumWriterIntrinsic.flush()
                externalPlot = session.run(
                    merged, feed_dict={tensorVar: externalRewardMonitor})
                sumWriterExternal.add_summary(externalPlot, stepCount)
                sumWriterExternal.flush()
                lowerRightPlot = session.run(
                    merged,
                    feed_dict={
                        tensorVarLowerRight:
                        float(goalSuccessCount[0]) / (0.1 + len(goalSuccessTrack[0]))
                    })
                sumWriterLowerRight.add_summary(lowerRightPlot, stepCount)
                sumWriterLowerRight.flush()
                lowerLeftPlot = session.run(
                    merged,
                    feed_dict={
                        tensorVarLowerLeft:
                        float(goalSuccessCount[1]) / (0.1 + len(goalSuccessTrack[1]))
                    })
                sumWriterLowerLeft.add_summary(lowerLeftPlot, stepCount)
                sumWriterLowerLeft.flush()
                keyPlot = session.run(
                    merged,
                    feed_dict={
                        tensorVarKey:
                        float(goalSuccessCount[2]) / (0.1 + len(goalSuccessTrack[2]))
                    })
                sumWriterKey.add_summary(keyPlot, stepCount)
                sumWriterKey.flush()

                lastGoal = goal
                # Got the key
                if goal == 2:
                    break
            else:
                goalSuccessTrack[goal].append(0)

        if not env.isGameOver():
            lastGoal = -1
            env.beginNextLife()

        if not annealComplete:
            # Annealing
            agent.annealMetaEpsilon(stepCount)
            agent.annealControllerEpsilon(stepCount, goal)
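# NOTE: the scalar summaries above are written under ./reward/<name>; they can
# be inspected during training with TensorBoard:
#   tensorboard --logdir ./reward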
import argparse
# ALEEnvironment, Hdqn, Agent, and str2bool come from the project's own modules


def main():
    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    goalExplain = ['top left door', 'top right door', 'middle ladder',
                   'lower left ladder', 'lower right ladder', 'key']
    actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down',
                     'jump right', 'jump left']
    stepCount = 0

    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=False)
    parser.add_argument("--frame_skip", default=4)
    #parser.add_argument("--repeat_action_probability", default=0.25)
    parser.add_argument("--color_averaging", default=False)
    parser.add_argument("--random_seed")
    #parser.add_argument("--record_screen_path", default="./record")
    #parser.add_argument("--record_sound_filename")
    parser.add_argument("--minimal_action_set", default=False)
    parser.add_argument("--screen_width", default=84)
    parser.add_argument("--screen_height", default=84)
    args = parser.parse_args()

    env = ALEEnvironment(args.game, args)
    hdqn = Hdqn()
    print('loading weights')
    hdqn.loadWeight()
    print('weights loaded')
    agent = Agent(hdqn, range(8), range(6))

    # Probability of taking a random action is 0.1
    agent.setControllerEpsilon([0.1] * 6)
    agent.setMetaEpsilon(0.1)

    while True:
        env.restart()
        for i in range(10):
            env.act(0)
        goalNum = 0
        while not env.isGameOver():
            goal = agent.selectTrueGoal(goalNum)
            print('predicted subgoal is: ' + str(goal) + ' ' + goalExplain[goal])
            while not env.isTerminal() and not env.goalReached(goal):
                state = env.getState()
                action = agent.selectMove(state, goal)
                #print('selected action is: ' + str(actionMap[action]) + ' ' +
                #      actionExplain[actionMap[action]])
                #print('selected action is: ' + str(actionExplain[action]))
                externalRewards = env.act(actionMap[action])
            if not env.isTerminal():
                goalNum = goalNum + 1
            else:
                # Re-initialize the game if it is not over yet
                if not env.isGameOver():
                    goalNum = 0
                    env.resetLife()
                    for i in range(10):
                        env.act(0)
default="INFO", help="Log level.") args = parser.parse_args() logger = logging.getLogger() logger.setLevel(args.log_level) # bug with double logging if args.environment == 'gym': logger.handlers.pop() if args.random_seed: random.seed(args.random_seed) # instantiate classes env = GymEnvironment(args.rom_file, args) if args.environment == 'gym' else ALEEnvironment( args.rom_file, args) mem = ReplayMemory(args.replay_size, args) net = DeepQNetwork(env.numActions(), args) agent = Agent(env, mem, net, args) stats = Statistics(agent, net, mem, env, args) if args.load_weights: logger.info("Loading weights from %s" % args.load_weights) net.load_weights(args.load_weights) if args.play_games: logger.info("Playing for %d game(s)" % args.play_games) stats.reset() agent.play(args.play_games) stats.write(0, "play") if args.visualization_file:
def __init__(self, id, prediction_q, training_q, config):
    self.history = StateHistory(config)
    env = ALEEnvironment(config)
    super(DeepQAgent, self).__init__(id, prediction_q, training_q, config, env)
from environment import ALEEnvironment
from RHEA import RollingHorizonEvolutionaryAlgorithm

if __name__ == '__main__':
    ale = ALEEnvironment('./roms/qbert.bin')
    rollout_length = 50
    rhea = RollingHorizonEvolutionaryAlgorithm(rollout_length, ale, 0.2, 10)
    rhea.run()