def setup_Agent(filename, epsilon):
    """ Function to initialize the DQN agent """
    # one-hot vector (opponent's move) on top of the game board
    input_dims = 7 * 7
    action_space = tuple(range(7))
    n_actions = 7
    h1_dims = 512
    h2_dims = 256
    agent = Agent(lr=0.001, gamma=0.95, epsilon=epsilon, epsilon_dec=0.995,
                  epsilon_min=0.01, input_shape=input_dims, h1_dims=h1_dims,
                  h2_dims=h2_dims, action_space=action_space,
                  training_epochs=2, fname=filename)
    memory = ReplayBuffer(50000, input_dims, n_actions)
    return agent, memory
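# A minimal sketch of the ReplayBuffer constructed above, assuming the common
# (max_size, input_shape, n_actions) constructor and a store/sample API; the
# real class in this codebase may differ in details.
import numpy as np

class ReplayBuffer:
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((max_size, input_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((max_size, input_shape), dtype=np.float32)
        self.action_memory = np.zeros(max_size, dtype=np.int64)
        self.reward_memory = np.zeros(max_size, dtype=np.float32)
        self.terminal_memory = np.zeros(max_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        idx = self.mem_cntr % self.mem_size  # overwrite oldest entries first
        self.state_memory[idx] = state
        self.new_state_memory[idx] = state_
        self.action_memory[idx] = action
        self.reward_memory[idx] = reward
        self.terminal_memory[idx] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        return (self.state_memory[batch], self.action_memory[batch],
                self.reward_memory[batch], self.new_state_memory[batch],
                self.terminal_memory[batch])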
def main(env, gamma, epsilon, final_epsilon, final_exp_step, lr, memory_size,
         target_update_freq, gradient_update_freq, batch_size, replay_start,
         val_freq, log_freq_by_step, log_freq_by_ep, val_epsilon, log_dir,
         weight_dir, steps):
    train_env = make_atari(env + "NoFrameskip-v4")
    val_env = make_atari(env + "NoFrameskip-v4", noop=False)
    agent = Agent(train_env, DQN, gamma=gamma, epsilon=epsilon,
                  final_epsilon=final_epsilon, final_exp_step=final_exp_step)
    trainer = Trainer(agent, val_env, lr=lr, memory_size=memory_size,
                      target_update_freq=target_update_freq,
                      gradient_update_freq=gradient_update_freq,
                      batch_size=batch_size, replay_start=replay_start,
                      val_freq=val_freq, log_freq_by_step=log_freq_by_step,
                      log_freq_by_ep=log_freq_by_ep, val_epsilon=val_epsilon,
                      log_dir=log_dir, weight_dir=weight_dir)
    trainer.train(steps)
def test_agent():
    print("################## Running agent test ##################")
    agent = Agent(state_shape, action_shape)
    state1 = np.array([1, 2, 3, 4]).reshape(-1, 4)
    state2 = np.array([2, 3, 4, 5]).reshape(-1, 4)
    out1 = agent.model.predict(state1)
    out2 = agent.model.predict(state2)
    print(out1)
    print(out2)
    assert np.all((out1[0] - out2[0]) != 0), \
        "Failed agent test - same output for different states"
    print("Agent test passed :)\n\n")
def main_loop(self):
    agent = Agent()
    numGames = 0
    top = 0
    while numGames < 100:
        food = Food(self.size, self.screen)
        food.food_new()
        snake = Snake(size=self.size)
        while not self.over:
            agent.epsilon = 100 - numGames
            oldState = agent.get_state(snake, food)
            if randint(0, 200) < agent.epsilon:
                move = to_categorical(randint(0, 2), num_classes=3, dtype='int32')
            else:
                predict = agent.model.predict(oldState.reshape(1, 11))
                move = to_categorical(np.argmax(predict[0]), num_classes=3,
                                      dtype='int32')
            if np.array_equal(move, [1, 0, 0]):
                snake.xVel = 10
                print('condition1')
            elif np.array_equal(move, [0, 1, 0]) and snake.yVel == 0:
                # right - going horizontal
                snake.yVel = 10
                print('condition2')
            elif np.array_equal(move, [0, 1, 0]) and snake.xVel == 0:
                # right - going vertical
                snake.xVel = 10
                print('condition3')
            elif np.array_equal(move, [0, 0, 1]) and snake.yVel == 0:
                # left - going horizontal
                snake.yVel = -10
                print('condition4')
            elif np.array_equal(move, [0, 0, 1]) and snake.xVel == 0:
                # left - going vertical
                snake.xVel = -10
                print('condition5')
            snake.snake_move()
            self.check_collisions(snake, food)
            self.update_window(snake, food)
            self.clock.tick(10)
            newState = agent.get_state(snake, food)
            reward = agent.get_reward(self.foodCollide, self.over)
            agent.train_short(oldState, move, reward, newState, self.over)
            agent.write_memory(oldState, move, reward, newState, self.over)
        agent.replay(agent.mem)
        numGames += 1
        print(numGames)
def main():
    gym_env = gym.make('custom_gym:Xplane-v0')
    lr = 0.001
    gam = 0.01
    n_games = 1
    # nn_input = obs()
    agent = Agent(learning_rate=lr, gamma=gam, epsilon=1.0, input_dims=(6, ),
                  n_actions=15, batch_size=32,
                  file_name='AI_takeoff/saved_models/dq_model_2.h5')
    scores = []
    total_steps = []
    eps_hist = []
    agent.load_model()
    for i in range(n_games):
        try:
            done = False
            score = 0
            observation = gym_env.reset()
            time.sleep(2)
            observation_checkpoints = np.array([observation[0:2]])
            step_counter = 0
            print("GAME ITERATION ", i)
            while not done:
                action = agent.choose_action(observation)
                new_observation, reward, done = gym_env.step(action)
                step_counter = step_counter + 1
                score = score + reward
                agent.store_transition(observation, action, reward,
                                       new_observation, done)
                observation = new_observation
                # agent.learn()
                # This if statement checks if the airplane is stuck
                observation_checkpoints = np.append(observation_checkpoints,
                                                    [new_observation[0:2]],
                                                    axis=0)
                print(observation_checkpoints)
                print("stepcounter is", step_counter)
                if step_counter % 30 == 0:
                    if np.array_equal(observation_checkpoints[step_counter - 30],
                                      observation_checkpoints[step_counter - 1]):
                        done = True
            eps_hist.append(agent.epsilon)
            scores.append(score)
            total_steps.append(step_counter)
        except Exception as e:
            print(str(e))
def __init__(self, fname):
    lr = 0.0005
    self.agent = Agent(gamma=0.99, epsilon=0.0, alpha=lr, input_dims=6,
                       n_actions=2, mem_size=60000, batch_size=64,
                       epsilon_end=0.0, fname=fname)
    self.observation = []
    self.action = 0
    self.n_step = 0
    self.fname = fname.split("/")[-1]
def main():
    agent = Agent()
    agent.load()
    total_reward = 0
    obs = env.reset()
    env.render()
    for _ in range(10000):
        act = agent.predict(obs)
        obs, reward, done, _ = env.step(act)
        total_reward += reward
        env.render()
        if done:
            print(f'total_reward: {total_reward}')
            env.close()
            break
def main(env_name=None):
    ENV_NAME = 'wumpus-v0'
    if env_name:
        ENV_NAME = env_name
    MODEL_DIR = f'models/{ENV_NAME}-dqn'
    MODEL_FILE = f'{ENV_NAME}-dqn.h5'
    CHECKPOINTS_DIR = f'models/{ENV_NAME}-dqn/checkpoints'
    TEST_IMG_DIR = f'tests/{ENV_NAME}-dqn'
    env = gym.make(ENV_NAME)
    env.reset()
    agent = Agent(learning_rate=0.01, gamma=0.95,
                  state_shape=env.observation_space.shape, actions=7,
                  batch_size=64, epsilon_initial=0.0, epsilon_decay=0,
                  epsilon_final=0.0, replay_buffer_capacity=1000000,
                  model_name=MODEL_FILE, model_dir=MODEL_DIR,
                  ckpt_dir=CHECKPOINTS_DIR)
    agent.load_model()
    done = False
    score = 0
    steps_per_episode = 0
    state = env.reset()
    images = [env.render('rgb_array')]
    while not done:
        # Choose action according to policy, and execute
        action = agent.select_action(state)
        state, reward, done, _ = env.step(action)
        score += reward
        steps_per_episode += 1
        images.append(env.render('rgb_array'))
    # Generate GIF for the execution
    create_gif(f'{ENV_NAME}.gif', np.array(images), fps=1.0)
    print(f'Model \'{str(ENV_NAME)}\', score {score}, steps {steps_per_episode}')
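# A possible implementation of the create_gif helper called above, written with
# imageio; the project's actual file handling is an assumption here.
import imageio
import numpy as np

def create_gif(filename, frames, fps=1.0):
    """Write a stack of RGB frames (N, H, W, 3) to an animated GIF."""
    imageio.mimsave(filename,
                    [np.asarray(f, dtype=np.uint8) for f in frames],
                    fps=fps)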
def start():
    env = gym.make('CartPole-v0')
    params = {
        'gamma': 0.8,
        'epsi_high': 0.9,
        'epsi_low': 0.05,
        'decay': 500,
        'lr': 0.001,
        'capacity': 10000,
        'batch_size': 64,
        'state_space_dim': env.observation_space.shape[0],
        'action_space_dim': env.action_space.n
    }
    agent = Agent(**params)
    score = []
    mean = []
    for episode in range(1000):
        s0 = env.reset()
        total_reward = 1
        for i in range(200):
            env.render()
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)
            if done:
                r1 = -1
            agent.put(s0, a0, r1, s1)
            if done:
                break
            total_reward += r1
            s0 = s1
            agent.learn()
        score.append(total_reward)
        mean.append(sum(score[-100:]) / 100)
        print(total_reward)
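# Hedged sketch of how an Agent with epsi_high/epsi_low/decay parameters
# typically anneals exploration: exponential decay from epsi_high toward
# epsi_low with time constant `decay`. The Agent.act in this codebase may
# compute it differently.
import math

def epsilon_by_step(step, epsi_high=0.9, epsi_low=0.05, decay=500):
    return epsi_low + (epsi_high - epsi_low) * math.exp(-1.0 * step / decay)

# Usage inside act(): explore with probability eps, act greedily otherwise:
#   eps = epsilon_by_step(self.steps)
#   a0 = random_action() if random.random() < eps else greedy_action(s0)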
def main():
    # make env and agent
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4,
                  eps_end=0.01, input_dims=[8], lr=0.0001)
    scores, eps_history = [], []
    n_games = 500
    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:  # in game
            # get action from current view of the game (observation)
            action = agent.choose_action(observation)
            # next frame
            observation_, reward, done, info = env.step(action)
            score += reward
            # store memory
            agent.store_transisation(observation, action, reward,
                                     observation_, done)
            agent.learn()
            # set next state to current state
            observation = observation_
        # append score and eps
        scores.append(score)
        eps_history.append(agent.epsilon)
        # print some nice statements
        avg_score = np.mean(scores[-100:])
        print(f'Episode: {i} Score: {score} Average Score: {avg_score} '
              f'Epsilon: {agent.epsilon}')
def __init__(self):
    self._load_config()
    # Control parameters used to scale the bid price
    self.BETA = [-0.08, -0.03, -0.01, 0, 0.01, 0.03, 0.08]
    self.eps_start = 0.95
    self.eps_end = 0.05
    self.anneal = 0.00005
    self._reset_episode()
    # DQN network to learn the Q function
    self.dqn_agent = Agent(state_size=7, action_size=7, seed=0)
    # Reward network to learn the reward function
    self.reward_net = RewardNet(state_action_size=8, reward_size=1, seed=0)
    self.dqn_state = None
    self.dqn_action = 3  # no scaling
    self.dqn_reward = 0
    # Reward dictionary
    self.reward_dict = {}
    self.S = []
    self.V = 0
    self.total_wins = 0
    self.total_rewards = 0.0
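# Hedged sketch of how eps_start/eps_end/anneal are commonly combined into a
# linear exploration schedule; the class's own annealing code is not shown.
def current_epsilon(step, eps_start=0.95, eps_end=0.05, anneal=0.00005):
    return max(eps_end, eps_start - anneal * step)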
def OldStuff():
    tf.compat.v1.disable_eager_execution()
    lr = 0.001
    numGames = 10000
    session = TriadGameSession()
    observation = session.getState()
    scores = []
    agent = Agent(gamma=0.99, lr=lr, epsilon=1.0, epsilonDec=0.0005,
                  inputSize=[len(observation)],
                  numActions=session.getMaxActions(),
                  memSize=1000000, batchSize=1024)
    for i in range(numGames):
        done = False
        score = 0
        session = TriadGameSession()
        observation = session.getState()
        while not done:
            action = agent.chooseAction(observation)
            observationNext, reward, done = session.step(action)
            score += reward
            agent.store(observation, action, reward, observationNext, done)
            observation = observationNext
            agent.learn()
        scores.append(score)
        avgScore = np.mean(scores[-100:])
        print('game:', i, 'score %.2f' % score,
              'avgScore %.2f' % avgScore,
              'epsilon %.2f' % agent.epsilon)
        #agent.save()
    print('Finished!')
def setup_Agent(filename, epsilon):
    """ Function to initialize the DQN agent """
    input_dims = 6 * 7
    action_space = tuple(range(7))
    n_actions = 7
    h1_dims = 512
    h2_dims = 256
    agent = Agent(lr=0.001, gamma=0.95, epsilon=epsilon, epsilon_dec=0.995,
                  epsilon_min=0.01, input_shape=input_dims, h1_dims=h1_dims,
                  h2_dims=h2_dims, action_space=action_space,
                  training_epochs=1, fname=filename)
    return agent
import sys
import gym
from dqn import Agent

num_episodes = 5000
env_name = sys.argv[1] if len(sys.argv) > 1 else "MsPacman-v0"
env = gym.make(env_name)
agent = Agent(state_size=env.observation_space.shape,
              number_of_actions=env.action_space.n,
              save_name=env_name)

for e in range(num_episodes):
    observation = env.reset()
    done = False
    agent.new_episode()
    total_cost = 0.0
    total_reward = 0.0
    frame = 0
    while not done:
        frame += 1
        #env.render()
        action, values = agent.act(observation)
        #action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_cost += agent.observe(reward)
        total_reward += reward
    print("total reward", total_reward)
    print("mean cost", total_cost / frame)
# ------------------------------ Variable Declaration ----------------------------------
NUM_OF_ZOMBIES = 1
NUM_OF_VILLAGERS = 1
agent_host = MalmoPython.AgentHost()
malmoutils.parse_command_line(agent_host)
validate = True
num_reps = 300

# ======= core part initialization ====================================
# input size 5*5, you can change the size here
memory = MemoryD(5)
network_model, q_values_func = nn_model(input_shape=[5, 5])
agent = Agent(network_model, q_values_func, memory, 'train', 'ddqn')
# set learning rate to 0.00025
agent.do_compile(optimizer=Adam(lr=0.00025), loss_func=mean_huber_loss)
agent.memoryD.clear()
# ===================================================================

for iRepeat in range(num_reps):
    my_mission_record = malmoutils.get_default_recording_object(
        agent_host, "./Mission_{}".format(iRepeat + 1))
    #my_mission_record = MalmoPython.MissionRecordSpec('./' + "Mission_" + str(iRepeat) + ".tgz")
    #my_mission_record.recordRewards()
    #my_mission_record.recordMP4(24,400000)
    #my_mission_record.recordObservations()
    my_mission = MalmoPython.MissionSpec(GetMissionXML(mapblock, agent_host),
                                         validate)
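# A plausible definition of the mean_huber_loss passed to do_compile above,
# assuming a standard Keras-style (y_true, y_pred) loss; the project's own
# definition is not shown here.
import tensorflow as tf

def mean_huber_loss(y_true, y_pred, delta=1.0):
    """Huber loss averaged over the batch: quadratic near zero, linear beyond delta."""
    error = y_true - y_pred
    quadratic = tf.minimum(tf.abs(error), delta)
    linear = tf.abs(error) - quadratic
    return tf.reduce_mean(0.5 * tf.square(quadratic) + delta * linear)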
scores = []
epsHistory = []
numGames = 1
batch_size = 32
n_actions = 6
input_dims = (185, 95)
crop_start = (15, 30)
crop_end = (200, 125)
starting_epsilon = 0.05 if LOAD_MODEL else 1.0

env = gym.make('SpaceInvaders-v0')
brain = Agent(gamma=0.95, epsilon=starting_epsilon, lr=0.003,
              input_dims=input_dims, batch_size=batch_size,
              n_actions=n_actions, max_mem_size=5000, save_path='models/')

if LOAD_MODEL:
    brain.load()
else:
    # load memory with random games
    while brain.mem_cntr < brain.mem_size:
        observation = env.reset()
        observation = preprocess(observation, crop_start, crop_end)
        done = False
        while not done:
            # 0 no action, 1 fire, 2 move right, 3 move left, 4 move right fire, 5 move left fire
            action = env.action_space.sample()
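# Hedged sketch of the preprocess helper used above: crop the playing field
# between crop_start and crop_end (rows 15:200, cols 30:125 -> 185x95, matching
# input_dims), convert to grayscale, and scale to [0, 1]. The real helper may
# differ in details.
import numpy as np

def preprocess(frame, crop_start, crop_end):
    cropped = frame[crop_start[0]:crop_end[0], crop_start[1]:crop_end[1]]
    gray = cropped.mean(axis=2)  # average the RGB channels to grayscale
    return gray.astype(np.float32) / 255.0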
from triadgame import TriadGameSession
import numpy as np
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

lr = 0.001
numGames = 10000
session = TriadGameSession()
observation = session.getState()
scores = []
agent = Agent(gamma=0.99, lr=lr, epsilon=1.0, epsilonDec=0.0005,
              inputSize=[len(observation)],
              numActions=session.getMaxActions(),
              memSize=1000000, batchSize=64)

for i in range(numGames):
    done = False
    score = 0
    session = TriadGameSession()
    observation = session.getState()
    while not done:
        action = agent.chooseAction(observation)
        observationNext, reward, done = session.step(action)
        score += reward
        agent.store(observation, action, reward, observationNext, done)
        observation = observationNext
        agent.learn()
def train(path, env):
    #env = Monitor(env, path, video_callable=video_callable, force=True)
    agent = Agent(env, path=path)
    agent.train()
    return agent
if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    params = {
        'gamma': 0.8,
        'epsi_high': 0.9,
        'epsi_low': 0.05,
        'decay': 200,
        'lr': 0.001,
        'capacity': 10000,
        'batch_size': 64,
        'state_space_dim': env.observation_space.shape[0],
        'action_space_dim': env.action_space.n
    }
    agent = Agent(**params)
    score = []
    mean = []
    for episode in range(1000):
        s0 = env.reset()
        total_reward = 1
        while True:
            env.render()
            a0 = agent.act(s0)
            s1, r1, done, _ = env.step(a0)
            if done:
                r1 = -1
            # (assumed) remainder of the loop, mirroring the start() variant of
            # this script above
            agent.put(s0, a0, r1, s1)
            if done:
                break
            total_reward += r1
            s0 = s1
            agent.learn()
        score.append(total_reward)
        mean.append(sum(score[-100:]) / 100)
        print(total_reward)
env = make_env('BreakoutNoFrameskip-v4')
#env = make_env('SpaceInvadersNoFrameskip-v4')

test_rewards, test_qvalue, test_times = [], [], []
scores, eps_history = [], []
num_games = 10_000
number_of_tests = 30         # number of evaluations to run
n_steps, n_test = 0, 1       # counters for steps and individual tests
n_test_instance = 1          # counter for test instances
test_every_frames = 520_000  # run an evaluation every n frames
load_checkpoint = False      # load a saved model?
render = False

agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.00025,
              input_dims=env.observation_space.shape,
              n_actions=env.action_space.n, mem_size=200_000,
              eps_min=0.1, batch_size=32, replace=10_000, eps_dec=1e-5,
              save_name='dqn_model', load_name='dqn_model_5000it.h5')

if load_checkpoint:
    agent.epsilon = 0.1
    agent.load_models()

last_ep = 0
for episode in tqdm(range(num_games)):
def main():
    # Initialize environment, agent
    env = gym.make(ENV_NAME)
    summary_writer = tf.summary.create_file_writer(LOG_DIR)
    agent = Agent(learning_rate=0.01, gamma=0.95,
                  state_shape=env.observation_space.shape, actions=7,
                  batch_size=64, epsilon_initial=0.9, epsilon_decay=1e-6,
                  epsilon_final=0.01, replay_buffer_capacity=1000000,
                  model_name=MODEL_FILE, model_dir=MODEL_DIR,
                  ckpt_dir=CHECKPOINTS_DIR, log_dir=LOG_DIR)
    scores = []
    for i in range(1, EPISODES + 1):
        done = False
        score = 0
        state = env.reset()
        steps_per_episode = 0
        # Play one episode
        while not done:
            # Choose action (epsilon greedy), and execute
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
            score += reward
            # Store in experience replay buffer
            agent.store_experience(state, action, reward, next_state, done)
            state = next_state
            agent.train()
            steps_per_episode += 1
        # Keep only the last 100 scores for the running statistics
        if len(scores) == 100:
            scores.pop(0)
        scores.append(score)
        avg_score = np.mean(scores)
        min_score = np.min(scores)
        max_score = np.max(scores)
        print(f'Episode: {i}, Score {score:.2f}, Avg_score {avg_score:.2f}, '
              f'Epsilon {agent.epsilon:.2f}')
        # Summaries for TensorBoard
        write_summaries(summary_writer, {
            'epsilon': agent.epsilon,
            'reward.episode': score,
            'reward.avg': avg_score,
            'reward.min': min_score,
            'reward.max': max_score,
            'steps.count': steps_per_episode
        }, i, ENV_NAME)
        # Save the model
        if i % SAVE_INTERVAL == 0:
            print(f'Saving model to \'{MODEL_FILE}\' [Overwriting]')
            agent.save_model()
        # Save checkpoint
        if i % CHECKPOINT_INTERVAL == 0:
            print(f'Adding checkpoint: \'{CHECKPOINTS_DIR}/episode-{i}.h5\'')
            agent.save_checkpoint(f'episode-{i}')
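# Hedged sketch of the write_summaries helper used above, assuming TF2's
# tf.summary API; the project's actual helper may structure the tags differently.
import tensorflow as tf

def write_summaries(writer, metrics, step, env_name):
    """Log a dict of scalar metrics under the environment's name."""
    with writer.as_default():
        for tag, value in metrics.items():
            tf.summary.scalar(f'{env_name}/{tag}', value, step=step)
        writer.flush()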
import numpy as np
from dqn import Agent
from utils import plotLearning, make_env

if __name__ == '__main__':
    env = make_env('PongNoFrameskip-v4')
    num_games = 500
    load_checkpoint = False
    best_score = -21
    agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.0001,
                  input_dims=(4, 80, 80), n_actions=6, mem_size=25000,
                  eps_min=0.02, batch_size=32, replace=1000, eps_dec=1e-5)
    if load_checkpoint:
        agent.load_models()
    filename = 'PongNoFrameskip-v4.png'
    scores, eps_history = [], []
    n_steps = 0
    for i in range(num_games):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            n_steps += 1
            score += reward
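# Hedged sketch of the make_env helper imported from utils: the agent above
# expects observations shaped (4, 80, 80), so a plausible implementation
# grayscales and resizes each frame to 80x80 and stacks the last four.
# Wrapper details (frame skipping, reward clipping, etc.) are assumptions.
import collections
import cv2
import gym
import numpy as np

class PreprocessFrame(gym.ObservationWrapper):
    def __init__(self, env, shape=(80, 80)):
        super().__init__(env)
        self.shape = shape
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=shape, dtype=np.float32)

    def observation(self, obs):
        gray = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        resized = cv2.resize(gray, self.shape, interpolation=cv2.INTER_AREA)
        return resized.astype(np.float32) / 255.0

class StackFrames(gym.Wrapper):
    def __init__(self, env, n_frames=4):
        super().__init__(env)
        self.frames = collections.deque(maxlen=n_frames)
        low = np.repeat(env.observation_space.low[np.newaxis, ...], n_frames, axis=0)
        high = np.repeat(env.observation_space.high[np.newaxis, ...], n_frames, axis=0)
        self.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        for _ in range(self.frames.maxlen):
            self.frames.append(obs)
        return np.stack(self.frames, axis=0)

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.frames.append(obs)
        return np.stack(self.frames, axis=0), reward, done, info

def make_env(env_name):
    return StackFrames(PreprocessFrame(gym.make(env_name)))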
def main():
    scores = []
    eps_history = []
    info_history = []

    # Random starting points:
    env = sky.make(random=True, xi=(301, 650 - 25), yi=(100, 300 - 25),
                   width=15, height=15, v_initial=14)
    # Fixed starting point:
    #env = sky.make(xi=550)

    agent = Agent(gamma=gamma, epsilon=epsilon, lr=lr,
                  input_dims=[imput_dimensions], n_actions=n_actions,
                  mem_size=mem_size, batch_size=batch_size,
                  epsilon_dec=epsilon_dec)
    if load_checkpoint:
        agent.load_models()

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            # one game: ends when done=True
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward,
                                   observation_, int(done))
            observation = observation_
            agent.learn()
        if i % 10 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i - 10):(i + 1)])
            print(i, 'episode', info, '|| score:', score,
                  '| average score: %.3f' % avg_score,
                  '| epsilon: %.3f' % agent.epsilon,
                  '| training done:', round(i / n_games, 2))
        else:
            print(i, 'episode', info, '|| score:', score)
        scores.append(score)
        eps_history.append(agent.epsilon)
        info_history.append(info)

    print('training ended with:',
          [[el, info_history.count(el)] for el in ('crashed', 'goal')])
    if save_checkpoint:
        agent.save_models()
        print('[+] model saved')

    # -------------------
    # Plotting and output
    # -------------------
    x = [i + 1 for i in range(n_games)]

    # First axis: scores
    fig, ax1 = plt.subplots()
    color = 'tab:red'
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('score per Episode', color=color)
    ax1.scatter(x, scores, color=color, s=2)
    ax1.tick_params(axis='y', labelcolor=color)

    # Second axis: epsilon
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:blue'
    ax2.set_ylabel('epsilon', color=color)  # the x-label is already handled by ax1
    ax2.plot(x, eps_history, color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    # Output
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.savefig(filename)
    return env
state_shape = env.observation_space.shape  # the state space
action_shape = env.action_space.n          # the action space

# Testing Memory storage and sample
state = env.reset()
test_mem()
test_agent()

mem = Memory(10000, state_shape)
agent = Agent(state_shape, action_shape)
epsilon = 1
batch_size = 64

#action = env.action_space.sample()
#next_state, reward, done, info = env.step(action)

for game in range(n_games):
    state = env.reset()
    game_reward = 0
    for step in range(max_steps):
        # Render game
        if game % 10 == 0:
def main(argv):
    # Set seeds
    np.random.seed(FLAGS.seed)
    t.manual_seed(FLAGS.seed)

    # Create logfile
    f = create_exp_logfile(os.path.join(FLAGS.exp_log_dir,
                                        str(FLAGS.learning_rate),
                                        str(FLAGS.seed)))

    # Initialise agent and environment
    env = LunarLander()
    num_actions = env.num_actions()
    agent = Agent(body_type='ff',
                  obs_num_features_or_obs_in_channels=FLAGS.observation_dimensions,
                  fc_hidden_layer_size=FLAGS.fc_hidden_layer_size,
                  output_actions=num_actions,
                  use_target_net=FLAGS.use_target_net,
                  g=FLAGS.gamma, lr=FLAGS.learning_rate)

    # Initialise data structures
    c_buf = CircularBuffer(size=FLAGS.cb_size)
    er_buf = ExperienceReplayBuffer(size=FLAGS.er_size,
                                    batch_size=FLAGS.batch_size)

    # Initialise sampling range for e-greedy
    interval = t.distributions.uniform.Uniform(t.tensor([0.0]), t.tensor([1.0]))

    # Run
    step = 0
    episode_results = []
    state = env.reset()
    c_buf.append(t.from_numpy(state).float())
    while step < FLAGS.max_steps:
        # Agent selects action (linearly annealed e-greedy)
        eps = max(FLAGS.init_epsilon - (((FLAGS.init_epsilon - FLAGS.final_epsilon)
                                         / FLAGS.epsilon_anneal) * step),
                  FLAGS.final_epsilon)
        if interval.sample() <= eps:
            action = np.random.randint(num_actions)
        else:
            action = agent.greedy_action(c_buf()).item()
        reward, next_state, terminal = env.act(action)
        terminal = 1 if terminal else 0
        er_buf.append(state, action, reward, next_state, terminal)
        state = next_state
        c_buf.append(t.from_numpy(state).float())

        if step > FLAGS.batch_size and step % FLAGS.update_frequency == 0:
            batch_states, batch_actions, batch_rewards, batch_next_states, \
                batch_terminals = er_buf.sample()
            batch_states = t.from_numpy(batch_states).float()
            batch_actions = np.array(batch_actions)
            batch_rewards = np.array(batch_rewards)
            batch_next_states = t.from_numpy(batch_next_states).float()
            batch_terminals = np.array(batch_terminals)
            agent.optimise(batch_states, batch_actions, batch_rewards,
                           batch_next_states, batch_terminals)

        if step % FLAGS.target_network_update == 0:
            agent.sync()

        if terminal:
            episode_results.append(env.episode_return())
            state = env.reset()

        step += 1
        if step % FLAGS.evaluate == 0:
            f.write('{}, {}\n'.format(step,
                                      performance_avg(episode_results,
                                                      FLAGS.num_episodes_average)))
            f.flush()

    f.close()
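# Hedged sketch of the CircularBuffer used above: it holds the most recent
# frames and, when called, returns them stacked for the network. The real
# class may differ (e.g., in how it pads before the buffer is full).
import collections
import torch as t

class CircularBuffer:
    def __init__(self, size):
        self.buf = collections.deque(maxlen=size)

    def append(self, frame):
        self.buf.append(frame)

    def __call__(self):
        # Stack the stored frames into a single observation tensor
        return t.stack(list(self.buf))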
def run(env: LlvmEnv) -> None:
    agent = Agent(n_actions=15, input_dims=[69])
    env.observation_space = "InstCountNorm"
    agent.Q_eval.load_state_dict(torch.load("./H10-N4000-INSTCOUNTNORM.pth"))
    rollout(agent, env)
import gym
from dqn import DeepQNetwork, Agent
import numpy as np
from gym import wrappers

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    brain = Agent(gamma=0.99, epsilon=1.0, n_actions=4, batch_size=128,
                  input_dims=[8], alpha=0.0003, replace=64)
    scores = []
    eps_history = []
    num_games = 500
    score = 0

    for i in range(num_games):
        if i % 10 == 0 and i > 0:
            avg_score = np.mean(scores[max(0, i - 10):(i + 1)])
            print('episode: ', i, 'score: ', score,
                  ' average score %.3f' % avg_score,
                  'epsilon %.3f' % brain.EPSILON)
        else:
            print('episode: ', i, 'score: ', score)
        eps_history.append(brain.EPSILON)
        done = False
        observation = env.reset()
from dqn import Agent
import numpy as np
import gym
import matplotlib.pyplot as plt

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    n_games = 300
    show = False
    agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.0005, input_dims=8,
                  n_actions=4, batch_size=64)
    scores = []
    eps_history = []

    for i in range(1, n_games + 1):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            if show:
                env.render()
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.remember(observation, action, reward, observation_, done)
            observation = observation_
            agent.learn()
        eps_history.append(agent.epsilon)
import gym
from keras.models import load_model
from dqn import Agent

env_name = 'CartPole-v0'
eps = 0.8
episodes = 5

env = gym.make(env_name)
model = load_model('./model/my_model.h5')
agent = Agent(env)

for episode in range(episodes):
    # initial state
    s = env.reset()
    done = False
    while not done:
        for i in range(50):
            a = agent.act(s, eps)
            env.render()
            s2, r, done, info = env.step(a)
            s = s2
env.close()
import gym
from dqn import Agent
from utils import PlotLearning
import numpy as np

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4,
                  eps_end=0.01, inp_dims=[8], lr=0.001)
    scores, eps_history = [], []
    n_games = 500

    for i in range(n_games):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward,
                                   observation_, done)
            agent.learn()
            observation = observation_
        scores.append(score)
        eps_history.append(agent.epsilon)