def main():
    env = gym.make('CartPole-v0').unwrapped
    batch_size = 128
    gamma = 0.999
    eps_start = 0.9
    eps_end = 0.05
    eps_decay = 200
    target_update = 10
    n_actions = env.action_space.n

    env.reset()
    init_screen = get_screen(env)
    _, _, screen_height, screen_width = init_screen.shape

    episode_durations = []
    agent = Agent(gamma=gamma, n_actions=n_actions, screen_height=screen_height,
                  screen_width=screen_width, batch_size=batch_size)
    n_games = 500
    scores = []
    eps_history = []

    for i in range(n_games):
        done = False
        length = 0
        state = env.reset()
        last_screen = get_screen(env)
        current_screen = get_screen(env)
        state = current_screen - last_screen
        # env.render()
        while not done:
            action = agent.select_action(state)
            _, reward, done, _ = env.step(action.item())
            last_screen = current_screen
            current_screen = get_screen(env)
            next_state = current_screen - last_screen if not done else None
            agent.store_transition(state, action, next_state, reward)
            state = next_state
            agent.learn()
            length += 1
        episode_durations.append(length)
        # plot_durations()
        if i % target_update == 0:
            agent.update_target()
    # imagine pretty plots
    print('done')
    env.close()
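# NOTE: the snippets in this collection call a get_screen(env) helper that is not shown.
# Below is a minimal sketch of what such a helper could look like, assuming the usual
# PyTorch DQN-tutorial pattern (render the gym env as an RGB array, downscale it, and
# return a (1, C, H, W) float tensor); the exact resize/interpolation choices are
# assumptions, not the original implementation.
import gym
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image

_resize = T.Compose([T.ToPILImage(),
                     T.Resize(40, interpolation=Image.CUBIC),
                     T.ToTensor()])


def get_screen(env):
    # The rendered frame comes back as HWC uint8; convert to CHW float in [0, 1].
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    # Downscale and add a batch dimension: (1, C, H, W).
    return _resize(screen).unsqueeze(0)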
def main():
    global args
    args = parser.parse_args()
    env = make(game='SonicTheHedgehog-Genesis', state='LabyrinthZone.Act1')
    current_state = get_screen(env)
    for epoch in range(args.epochs):
        env.reset()
def test(self, env, n_epochs=30, verbose=False):
    rewards = []
    self.policy_net = self.policy_net.cuda()
    self.target_net = self.target_net.cuda()
    self.target_net.eval()
    for epoch in range(n_epochs):
        env.reset()
        done = False
        epoch_rewards = []
        video = []
        last_screen = get_screen(env)
        current_screen = get_screen(env)
        state = current_screen - last_screen
        while not done:
            if epoch % 5 == 0:
                video.append(last_screen)
            action = self.select_action(state, 0.)
            _, reward, done, _ = env.step(action[0, 0])
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None
            epoch_rewards.append(reward)
            reward = Tensor([reward])
            state = next_state
        logging.debug('Test epoch {} : reward= {}, duration= {}'.format(
            epoch, np.sum(epoch_rewards), len(epoch_rewards)))
        rewards.append(np.sum(epoch_rewards))
        if epoch % 5 == 0:
            self.make_video(video, ext='_test_' + str(epoch))
    logging.info('Performance estimate : {} pm {}'.format(
        np.mean(rewards), np.std(rewards)))
def trainDQN(file_name="DQN", env=GridworldEnv(1), batch_size=128, gamma=0.999,
             eps_start=0.9, eps_end=0.05, eps_decay=1000, is_plot=False,
             num_episodes=500, max_num_steps_per_episode=1000,
             learning_rate=0.0001, memory_replay_size=10000):
    """DQN training routine. Returns rewards and durations logs."""
    # Plot environment screen
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.title("")
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    model = DQN(num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done = 0  # total steps

    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
            print("Cur episode:", i_episode, "steps done:", steps_done,
                  "exploration factor:", eps_end + (eps_start - eps_end) *
                  math.exp(-1. * steps_done / eps_decay))
        # Initialize the environment and state
        env.reset()
        # last_screen = env.current_grid_map  # (1, 1, 8, 8)
        current_screen = get_screen(env)
        state = current_screen  # - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start, eps_end,
                                   eps_decay, steps_done)
            steps_done += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, optimizer, memory, batch_size, gamma)
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-dqn-rewards', episode_rewards)
    np.save(file_name + '-dqn-durations', episode_durations)

    return model, episode_rewards, episode_durations
def trainD(file_name="Distral_1col",
           list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           batch_size=128, gamma=0.999, alpha=0.9, beta=5,
           eps_start=0.9, eps_end=0.05, eps_decay=5, is_plot=False,
           num_episodes=200, max_num_steps_per_episode=1000,
           learning_rate=0.001, memory_replay_size=10000,
           memory_policy_size=1000):
    """Soft Q-learning training routine. Returns rewards and durations logs.
    Plots the environment rewards when is_plot is set.
    """
    num_actions = list_of_envs[0].action_space.n
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(num_actions)
    models = [DQN(num_actions) for _ in range(0, num_envs)]  ### Add torch.nn.ModuleList (?)
    memories = [ReplayMemory(memory_replay_size, memory_policy_size)
                for _ in range(0, num_envs)]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        policy.cuda()
        for model in models:
            model.cuda()

    optimizers = [optim.Adam(model.parameters(), lr=learning_rate)
                  for model in models]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode
        # Optimization is given by an alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy
        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #       "exploration factor:", eps_end + (eps_start - eps_end) *
            #       math.exp(-1. * steps_done / eps_decay))

            # last_screen = env.current_grid_map
            current_screen = get_screen(env)
            state = current_screen  # - last_screen

            # Select and perform an action
            action = select_action(state, policy, models[i_env], num_actions,
                                   eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)
            steps_done[i_env] += 1
            current_time[i_env] += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)
            if done:
                print("ENV:", i_env, "iter:", episodes_done[i_env],
                      "\treward:", env.episode_total_reward,
                      "\tit:", current_time[i_env], "\texp_factor:",
                      eps_end + (eps_start - eps_end) *
                      math.exp(-1. * episodes_done[i_env] / eps_decay))
                env.reset()
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
def main():
    # Get positions of rigid bodies in real time
    client = NatClient()
    arena_rb = client.rigid_bodies['Arena']
    rat_rb = client.rigid_bodies['Rat']

    # Open the basic pyglet window
    window = pyglet.window.Window(resizable=True, fullscreen=True, screen=get_screen(1))

    # Load Arena: a scanned arena opened in Blender and rendered to 3D; it does not have flipped normals
    remove_image_lines_from_mtl('assets/3D/grass_scene.mtl')
    arena_filename = 'assets/3D/grass_scene.obj'
    arena_reader = rc.WavefrontReader(arena_filename)  # load the mesh of the arena through a WavefrontReader
    arena = arena_reader.get_mesh("Arena", position=arena_rb.position)  # turn the wavefront data into a mesh so we can put a texture on top of it
    arena.uniforms['diffuse'] = 1., 1., 1.  # add a white diffuse material to the arena
    # We also need the arena's rotation, not just its xyz position, so it can be tracked and moved if it gets bumped
    arena.rotation = arena.rotation.to_quaternion()

    # Load the projector as a ratcave camera, set the light to its position.
    # The pickle file gives us the coordinates of where the projector is.
    projector = rc.Camera.from_pickle('assets/3D/projector.pkl')
    projector.position.x += .004
    projector.projection = rc.PerspectiveProjection(fov_y=40.5, aspect=1.777777778)
    light = rc.Light(position=projector.position)

    ## Make Virtual Scene ##
    fields = []
    for x, z in itertools.product([-.8, 0, .8], [-1.6, 0, 1.6]):
        field = load_textured_mesh(arena_reader, 'grass', 'grass.png')
        field.position.x += x
        field.position.z += z
        fields.append(field)

    ground = load_textured_mesh(arena_reader, 'Ground', 'dirt.png')
    sky = load_textured_mesh(arena_reader, 'Sky', 'sky.png')
    snake = load_textured_mesh(arena_reader, 'Snake', 'snake.png')

    # Set the camera to sit on top of the rat's head
    rat_camera = rc.Camera(projection=rc.PerspectiveProjection(aspect=1, fov_y=90, z_near=.001, z_far=10),
                           position=rat_rb.position)

    meshes = [ground, sky, snake] + fields
    for mesh in meshes:
        mesh.uniforms['diffuse'] = 1., 1., 1.
        mesh.uniforms['flat_shading'] = False
        mesh.parent = arena

    # Set up the virtual scene that will be projected onto the mesh of the arena
    virtual_scene = rc.Scene(meshes=meshes, light=light, camera=rat_camera, bgColor=(0, 0, 255))
    virtual_scene.gl_states.states = virtual_scene.gl_states.states[:-1]

    ## Make cube mapping work on the arena
    cube_texture = rc.TextureCube(width=4096, height=4096)  # use cube mapping to project the image onto the texture of the arena
    framebuffer = rc.FBO(texture=cube_texture)  # create a framebuffer as the texture (in tutorial 4 this was the blue screen)
    arena.textures.append(cube_texture)

    # Stereo
    vr_camgroup = rc.StereoCameraGroup(distance=.05)
    vr_camgroup.rotation = vr_camgroup.rotation.to_quaternion()

    # Update the position of the arena in xyz and also its rotation
    def update(dt):
        """Main update function: put any movement or tracking steps in here, because it will be run constantly!"""
        # Set the position of the VR camera group to the tracked rat position
        vr_camgroup.position, vr_camgroup.rotation.xyzw = rat_rb.position, rat_rb.quaternion
        arena.uniforms['playerPos'] = rat_rb.position
        arena.position, arena.rotation.xyzw = arena_rb.position, arena_rb.quaternion
        arena.position.y -= .02

    pyglet.clock.schedule(update)  # make the app update in real time

    @window.event
    def on_draw():
        ## Render virtual scene onto cube texture
        with framebuffer:
            with cube_shader:
                for mask, camside in zip([(True, False, False, True), (False, True, True, True)],
                                         [vr_camgroup.left, vr_camgroup.right]):
                    gl.glColorMask(*mask)
                    virtual_scene.camera.position.xyz = camside.position_global
                    virtual_scene.draw360_to_texture(cube_texture)

        ## Render real scene onto screen
        gl.glColorMask(True, True, True, True)
        window.clear()
        # Use the cube shader to render the six-sided virtual cube, which gets updated with the position and angle of the camera/viewer
        with cube_shader:
            rc.clear_color(255, 0, 0)
            with projector, light:
                arena.draw()

    # Actually run everything.
    pyglet.app.run()
def train(self, env, n_epochs=30, epsilon_init=1., epsilon_schedule='exp',
          eps_decay=None, lr=0.001, batch_size=32):
    if epsilon_schedule == 'linear':
        eps_range = np.linspace(epsilon_init, 0., n_epochs)
    elif epsilon_schedule == 'constant':
        eps_range = [epsilon_init for _ in range(n_epochs)]
    elif epsilon_schedule == 'exp':
        if not eps_decay:
            eps_decay = n_epochs // 4
        eps_range = [epsilon_init * math.exp(-1. * i / eps_decay)
                     for i in range(n_epochs)]

    history_file = open(self.filename + 'history', mode='a+')

    self.policy_net = self.policy_net.cuda()
    self.target_net = self.target_net.cuda()
    self.target_net.eval()

    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)

    losses, rewards, change_history = [], [], []
    for epoch in range(n_epochs):
        env.reset()
        last_screen = get_screen(env)
        current_screen = get_screen(env)
        state = current_screen - last_screen
        done = False
        epoch_losses = []
        epoch_rewards = []
        video = []
        while not done:
            if epoch % 10 == 1:
                video.append(last_screen)
            action = self.select_action(state, eps_range[epoch])
            _, reward, done, _ = env.step(action[0, 0])
            last_screen = current_screen
            current_screen = get_screen(env)
            reward = Tensor([reward])
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None
            self.memory.push(state, action, next_state, reward)
            state = next_state
            loss = self.update(batch_size=batch_size)
            epoch_losses.append(loss)
            epoch_rewards.append(reward)

        history_file.write('Epoch {}: loss= {}, reward= {}, duration= {}\n'.format(
            epoch, np.mean(epoch_losses), np.sum(epoch_rewards), len(epoch_rewards)))
        losses.append(np.mean(epoch_losses))
        rewards.append(np.sum(epoch_rewards))

        if epoch % 10 == 1:
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.save(ext=str(epoch))
            self.make_video(video, ext='_train_' + str(epoch))
            with open(self.filename + '.train_losses', 'a+') as f:
                for l in losses:
                    f.write(str(l) + '\n')
            losses = []
            with open(self.filename + '.train_rewards', 'a+') as f:
                for r in rewards:
                    f.write(str(r) + '\n')
            rewards = []

    self.save()
def train(Learner):
    # global args
    # args = parser.parse_args()
    # Learner = DQN().to(device)
    # env = retro.make(game='Airstriker-Genesis', state='Level1')
    criterion = L2_loss(0.99).to(device)
    if use_cuda:
        Learner = Learner.cuda()
        criterion = criterion.cuda()
    optimizer = optim.SGD(Learner.parameters(), lr=0.01)
    eps_threshold = 0.3
    RM = ReplayMemory(900)
    A_agent = ActorAgent(Learner, args)

    print("Start Episodes")
    for i_episode in range(50000):
        env.reset()
        A_agent.reset(Learner, args)
        last_state = get_screen(env)
        current_state = get_screen(env)
        state = current_state - last_state
        # state_var = torch.autograd.Variable(state)
        state_var = state.to(device)
        total_reward = 0
        # if i_episode % 50 == 0:
        #     if not eps_threshold < 0.1:
        #         eps_threshold -= 0.001
        start = time.time()
        for t in count():
            if t == 0:
                print("episode begin")
            action_q = A_agent.act(state_var, eps_threshold)
            _, action = action_q.data.max(2)
            action_numpy = action.squeeze(0).numpy()
            # print(list(action_numpy))

            # Repeat the chosen action for 4 frames
            for i in range(4):
                _, reward, done, _ = env.step(action_numpy)
                total_reward += reward

            last_state = current_state
            current_state = get_screen(env)
            state = current_state - last_state
            # state_var = torch.autograd.Variable(state)
            state_var = state.to(device)

            # Store the post-action state in the local buffer
            A_agent.add_to_buffer(reward, action_q, state_var)

            # Store the transition in the replay memory
            if len(A_agent.localbuffer) > 10:
                p, error = calc_priority_TDerror(Learner, criterion, A_agent, 10)
                RM.push(p, error)

            if done:
                break
            if t == 500:
                print("Total time: {0:.2f}".format(time.time() - start))
                # break

            # Optimize Learner model
            # if t % 100 == 0 and len(A_agent.localbuffer) > 80 and len(RM) >= 30:
            env.render()

            # Update the Learner
            for i in range(4):
                if len(RM.memory) >= 30:
                    error_batch = RM.priority_sample(30)
                    optimizer.zero_grad()
                    # error_batch.backward(retain_graph=True)
                    error_batch.backward()
                    # for param in Learner.parameters():
                    #     param.grad.data.clamp_(-1, 1)
                    optimizer.step()
                    print("{0}\t{1}\tLoss:{2:.2f}\tTotal:{3:.2f}\tReward:{4:.2f}".format(
                        i_episode, t, float(error_batch), total_reward, reward))
                else:
                    break

        # Validation run every 5 episodes (greedy actions, no exploration)
        if i_episode % 5 == 0:
            env.reset()
            last_state = get_screen(env)
            current_state = get_screen(env)
            state = current_state - last_state
            state_var = state.to(device)
            val_reward = 0
            for t in count():
                with torch.no_grad():
                    action_q = Learner(state_var)
                _, action = action_q.data.max(2)
                action_numpy = action.squeeze(0).numpy()
                for i in range(4):
                    _, reward, done, _ = env.step(action_numpy)
                    val_reward += reward
                last_state = current_state
                current_state = get_screen(env)
                state = current_state - last_state
                state_var = state.to(device)
                if done:
                    break
                env.render()
            print("Validation:\tepisode{0}\tReward: {1:.2f}".format(i_episode, val_reward))
            with open("result.txt", "a") as f:
                f.write("episode{0}\tReward: {1:.2f}".format(i_episode, val_reward))
                f.write("\n")
            RM.reset()
            # break

        with open("total_reward.txt", "a") as f:
            f.write("{0}\t{1}".format(i_episode, total_reward))
            f.write("\n")
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10
num_episodes = 50

resize = T.Compose([
    T.ToTensor(),
    T.ToPILImage(),
    T.Resize(40, interpolation=Image.CUBIC),
    T.ToTensor()
])

if __name__ == "__main__":
    env = gym.make(env_name)
    init_screen = get_screen(env)
    screen_height, screen_width, _ = init_screen.shape

    # Get number of actions from gym action space
    n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)

    steps_done = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env.reset()

BATCH_SIZE = 128
# GAMMA is the discount factor
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10
AVERAGE_SIZE = 10

episode_durations = []

init_screen = get_screen(env, device)
_, _, screen_height, screen_width = init_screen.shape

policy_net = DQN(screen_height, screen_width).to(device)
target_net = DQN(screen_height, screen_width).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

steps_done = 0
num_episodes = 300
for i_episode in range(num_episodes):
    env.reset()
from collections import deque

import gym
import numpy as np

from agent import DDPG
from utils import get_screen

env = gym.make('Pendulum-v0')
agent = DDPG(env, memory=False)
agent.load_model()

env.reset()
pixel = env.render(mode='rgb_array')
state = deque([get_screen(pixel) for _ in range(3)], maxlen=3)

cumulative_reward = 0
for timestep in range(200):
    action = agent.get_action(np.array(state)[np.newaxis])
    _, reward, _, _ = env.step(action * 2)
    pixel = env.render(mode='rgb_array')
    state_ = state.copy()
    state_.append(get_screen(pixel))
    state = state_
    cumulative_reward += reward
print('Cumulative Reward: {}'.format(cumulative_reward))
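# The utils.get_screen used above is not shown; judging by its use (it is applied to the
# raw rgb_array frame and three consecutive results are stacked as the state), a minimal
# stand-in could convert the frame to a small grayscale array. The 64x64 size, the
# grayscale conversion, and the [0, 1] scaling below are assumptions, not the original
# implementation.
import numpy as np
from PIL import Image


def get_screen(pixel):
    # pixel: HxWx3 uint8 frame from env.render(mode='rgb_array')
    gray = Image.fromarray(pixel).convert('L').resize((64, 64))
    return np.asarray(gray, dtype=np.float32) / 255.0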
def trainD(file_name="Distral_1col",
           list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           batch_size=128, gamma=0.999, alpha=0.9, beta=5,
           eps_start=0.9, eps_end=0.05, eps_decay=5, is_plot=False,
           num_episodes=200, max_num_steps_per_episode=1000,
           learning_rate=0.001, memory_replay_size=10000,
           memory_policy_size=1000):
    """Soft Q-learning training routine. Returns rewards and durations logs.
    Plots the environment rewards when is_plot is set.
    """
    # action dimension
    num_actions = list_of_envs[0].action_space.n
    # total number of envs
    num_envs = len(list_of_envs)
    # pi_0
    policy = PolicyNetwork(num_actions)
    # Q value, one per environment, used to calculate A_i
    models = [DQN(num_actions) for _ in range(0, num_envs)]  ### Add torch.nn.ModuleList (?)
    # replay buffer for each env
    memories = [ReplayMemory(memory_replay_size, memory_policy_size)
                for _ in range(0, num_envs)]

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # device = "cpu"
    print(device)

    # models
    policy = policy.to(device)
    for i in range(len(models)):
        models[i] = models[i].to(device)

    # optimizer for every Q model
    optimizers = [optim.Adam(model.parameters(), lr=learning_rate)
                  for model in models]
    # optimizer for the policy
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    # info lists for each environment
    episode_durations = [[] for _ in range(num_envs)]  # list of local steps
    episode_rewards = [[] for _ in range(num_envs)]    # list of episode rewards
    episodes_done = np.zeros(num_envs)                 # episode count per env
    steps_done = np.zeros(num_envs)                    # global timesteps per env
    current_time = np.zeros(num_envs)                  # local timesteps per env

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        policy.train()
        for model in models:
            model.train()

        # TODO: add max_num_steps_per_episode
        # Optimization is given by an alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        # 1. do the step for each env
        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #       "exploration factor:", eps_end + (eps_start - eps_end) *
            #       math.exp(-1. * steps_done / eps_decay))

            # last_screen = env.current_grid_map
            # =========== update step info begin ========================
            current_screen = get_screen(env)
            # state
            state = current_screen  # - last_screen

            # action chosen by pi_1 ~ pi_i
            action = select_action(state, policy, models[i_env], num_actions,
                                   eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta, device)
            # global steps
            steps_done[i_env] += 1
            # local steps
            current_time[i_env] += 1
            # reward
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # next state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # add to buffer
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            # 2. do one optimization step for each env using "soft-q-learning".
            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma, device)
            # =========== update step info end ==========================

            # =========== update episode info begin =====================
            if done:
                print("ENV:", i_env, "iter:", episodes_done[i_env],
                      "\treward:", env.episode_total_reward,
                      "\tit:", current_time[i_env], "\texp_factor:",
                      eps_end + (eps_start - eps_end) *
                      math.exp(-1. * episodes_done[i_env] / eps_decay))
                # reset env
                env.reset()
                # episode count
                episodes_done[i_env] += 1
                # append this episode's local timesteps for this env
                episode_durations[i_env].append(current_time[i_env])
                # reset local timesteps
                current_time[i_env] = 0
                # append total episode reward to the list
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)
            # =========== update episode info end =======================

        # 3. do one optimization step for the policy
        # after all envs have performed one step, optimize the policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, device)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
import torch
import gym

from utils import get_screen

# Import environment
env = gym.make('Acrobot-v1')

# Define model and load pre-trained weights
trained_Q_network = torch.load('trained_DQN_model',
                               map_location=lambda storage, loc: storage)

# Reset env
_ = env.reset()
current_screen = get_screen(env)
state = current_screen

# Present trained behaviour over episodes
num_test_episodes = 30
episodes_passed = 0
acc_episodic_reward = 0.0

while episodes_passed < num_test_episodes:
    # Choose action greedily
    action = trained_Q_network.select_action(state)

    # Act on env
    _, reward, done, _ = env.step(action)
    last_screen = current_screen
    current_screen = get_screen(env)
    next_state = current_screen - last_screen

    # Add to accumulative reward
    acc_episodic_reward += reward

    # When episode is done - reset and print
def trainSQL0(file_name="SQL0", env=GridworldEnv(1), batch_size=128,
              gamma=0.999, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=1000,
              is_plot=False, num_episodes=500, max_num_steps_per_episode=1000,
              learning_rate=0.001, memory_replay_size=10000):
    """Soft Q-learning training routine when the observation vector is the input.
    Returns rewards and durations logs.
    """
    # Plot environment screen
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    input_size = env.observation_space.shape[0]
    model = DQN(input_size, num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0
    # plt.ion()
    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
            if i_episode != 0:
                print("Cur episode:", i_episode, "steps done:", episode_durations[-1],
                      "exploration factor:", eps_end + (eps_start - eps_end) *
                      math.exp(-1. * steps_done / eps_decay),
                      "reward:", env.episode_total_reward)
        # Initialize the environment and state
        state = torch.from_numpy(env.reset()).type(torch.FloatTensor).view(-1, input_size)

        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start, eps_end,
                                   eps_decay, steps_done)
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(torch.FloatTensor).view(-1, input_size)
            if done:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, optimizer, memory, batch_size, gamma, beta)  #### Difference w.r.t. DQN
            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)  ##### Modify for OpenAI envs such as CartPole
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-sql0-rewards', episode_rewards)
    np.save(file_name + '-sql0-durations', episode_durations)

    return model, episode_rewards, episode_durations