def trainA3C(file_name="A3C", env=GridworldEnv(1), update_global_iter=10,
             gamma=0.999, is_plot=False, num_episodes=500,
             max_num_steps_per_episode=1000, learning_rate=0.0001):
    """
    A3C training routine. Returns the per-episode rewards log.
    """
    ns = env.observation_space.shape[0]  ## Line to fix for arbitrary environment
    na = env.action_space.n

    gnet = Net(ns, na)   # global network
    gnet.share_memory()  # share the global parameters in multiprocessing
    opt = SharedAdam(gnet.parameters(), lr=learning_rate)  # global optimizer
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()

    # parallel training
    workers = [
        Worker(gnet, opt, global_ep, global_ep_r, res_queue, i,
               update_global_iter, num_episodes, max_num_steps_per_episode,
               gamma, env, ns, na)
        for i in range(mp.cpu_count())
    ]
    [w.start() for w in workers]

    episode_rewards = []  # record episode reward to plot
    while True:
        r = res_queue.get()
        if r is not None:
            episode_rewards.append(r)
        else:
            break
    [w.join() for w in workers]

    # Store results
    np.save(file_name + '-a3c-rewards', episode_rewards)

    return episode_rewards
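
# A minimal usage sketch (assumed settings, mirroring the call style of the run
# scripts further below): train on GridworldEnv(1) and smooth the returned
# per-episode rewards with a running mean for inspection. The __main__ guard
# matters because trainA3C spawns worker processes via multiprocessing.
if __name__ == "__main__":
    rewards = trainA3C(file_name="env1", env=GridworldEnv(1),
                       num_episodes=200, learning_rate=0.0001)
    smoothed = np.convolve(np.asarray(rewards, dtype=float),
                           np.ones(20) / 20.0, mode="valid")
    print("episodes:", len(rewards),
          "final smoothed reward:", smoothed[-1] if len(smoothed) else float("nan"))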
def trainDistral(file_name="Distral_1col",
                 list_of_envs=[GridworldEnv(5), GridworldEnv(4)],
                 batch_size=128, gamma=0.80, alpha=0.5, beta=0.005,
                 is_plot=False, num_episodes=1000,
                 max_num_steps_per_episode=10, learning_rate=0.001,
                 memory_replay_size=10000, memory_policy_size=1000):
    # Specify environment conditions
    input_size = list_of_envs[0].observation_space.shape[0]
    num_actions = list_of_envs[0].action_space.n
    tasks = len(list_of_envs)

    # Define our set of policies, including the distilled one
    models = torch.nn.ModuleList(
        [Policy(input_size, num_actions) for _ in range(tasks + 1)])
    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]

    # Store the total rewards
    episode_rewards = [[] for i in range(num_episodes)]
    episode_duration = [[] for i in range(num_episodes)]

    for i_episode in range(num_episodes):
        # For each one of the envs
        for i_env, env in enumerate(list_of_envs):
            # Initialize state of env
            state = env.reset()
            # Store total reward per environment per episode
            total_reward = 0
            # Store duration of each episode per env
            duration = 0

            for t in range(max_num_steps_per_episode):
                # Run our policy
                action = select_action(state, models[i_env + 1], models[0])
                next_state, reward, done, _ = env.step(action.data[0])
                models[i_env + 1].rewards.append(reward)
                total_reward += reward
                duration += 1
                # if is_plot:
                #     env.render()
                if done:
                    break
                # Update state
                state = next_state

            episode_rewards[i_episode].append(total_reward)
            episode_duration[i_episode].append(duration)

            # Distill for each environment
            finish_episode(models[i_env + 1], models[0],
                           optimizers[i_env + 1], optimizers[0],
                           alpha, beta, gamma)

        if i_episode % args.log_interval == 0:
            for i in range(tasks):
                print('Episode: {}\tEnv: {}\tDuration: {}\tTotal Reward: {:.2f}'.format(
                    i_episode, i, episode_duration[i_episode][i],
                    episode_rewards[i_episode][i]))

    np.save(file_name + '-distral0-rewards', episode_rewards)
    np.save(file_name + '-distral0-duration', episode_duration)

    print('Completed')
parser.add_argument('--seed', type=int, default=543, metavar='N',
                    help='random seed (default: 543)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='interval between training status logs (default: 10)')
args = parser.parse_args()

# env = gym.make('CartPole-v0')
env = GridworldEnv(8)
env.seed(args.seed)
# torch.manual_seed(args.seed)

SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])


class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(3, 128)
        self.action_head = nn.Linear(128, 5)
        self.value_head = nn.Linear(128, 1)

        self.saved_actions = []
        self.rewards = []
def trainDQN(file_name="DQN", env=GridworldEnv(1), batch_size=128,
             gamma=0.999, eps_start=0.9, eps_end=0.05, eps_decay=1000,
             is_plot=False, num_episodes=500, max_num_steps_per_episode=1000,
             learning_rate=0.0001, memory_replay_size=10000):
    """
    DQN training routine. Returns rewards and durations logs.
    Plots the environment screen when is_plot is set.
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.title("")
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    model = DQN(num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done = 0  # total steps

    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
            print("Cur episode:", i_episode, "steps done:", steps_done,
                  "exploration factor:",
                  eps_end + (eps_start - eps_end) *
                  math.exp(-1. * steps_done / eps_decay))

        # Initialize the environment and state
        env.reset()
        # last_screen = env.current_grid_map  # (1, 1, 8, 8)
        current_screen = get_screen(env)
        state = current_screen  # - last_screen

        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            steps_done += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, optimizer, memory, batch_size, gamma)

            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-dqn-rewards', episode_rewards)
    np.save(file_name + '-dqn-durations', episode_durations)

    return model, episode_rewards, episode_durations
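
# Helper sketch (illustrative name, not part of the training code above): the
# epsilon schedule printed in trainDQN's progress log, factored out so the
# decay can be inspected on its own. Defaults mirror trainDQN's arguments.
def exploration_factor(steps_done, eps_start=0.9, eps_end=0.05, eps_decay=1000):
    return eps_end + (eps_start - eps_end) * math.exp(-1.0 * steps_done / eps_decay)

# For example: exploration_factor(0) == 0.9, exploration_factor(1000) is roughly
# 0.36, and the value approaches eps_end = 0.05 as steps_done grows.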
def trainD(file_name="Distral_1col",
           list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           batch_size=128, gamma=0.999, alpha=0.9, beta=5,
           eps_start=0.9, eps_end=0.05, eps_decay=5,
           is_plot=False, num_episodes=200, max_num_steps_per_episode=1000,
           learning_rate=0.001, memory_replay_size=10000,
           memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    Plots the environment screen when is_plot is set.
    """
    num_actions = list_of_envs[0].action_space.n
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(num_actions)
    models = [DQN(num_actions) for _ in range(0, num_envs)]  ### Add torch.nn.ModuleList (?)
    memories = [
        ReplayMemory(memory_replay_size, memory_policy_size)
        for _ in range(0, num_envs)
    ]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        policy.cuda()
        for model in models:
            model.cuda()

    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode
        # Optimization is given by an alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy
        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #       "exploration factor:", eps_end + (eps_start - eps_end) * \
            #       math.exp(-1. * steps_done / eps_decay))

            # last_screen = env.current_grid_map
            current_screen = get_screen(env)
            state = current_screen  # - last_screen

            # Select and perform an action
            action = select_action(state, policy, models[i_env], num_actions,
                                   eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)
            steps_done[i_env] += 1
            current_time[i_env] += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)

            if done:
                print("ENV:", i_env, "iter:", episodes_done[i_env],
                      "\treward:", env.episode_total_reward,
                      "\tit:", current_time[i_env], "\texp_factor:",
                      eps_end + (eps_start - eps_end) *
                      math.exp(-1. * episodes_done[i_env] / eps_decay))
                env.reset()
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
def trainD(file_name="Distral_1col",
           list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           batch_size=128, gamma=0.999, alpha=0.9, beta=5,
           eps_start=0.9, eps_end=0.05, eps_decay=5,
           is_plot=False, num_episodes=200, max_num_steps_per_episode=1000,
           learning_rate=0.001, memory_replay_size=10000,
           memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    Plots the environment screen when is_plot is set.
    """
    # action dimension
    num_actions = list_of_envs[0].action_space.n
    # total number of envs
    num_envs = len(list_of_envs)
    # pi_0, the distilled policy
    policy = PolicyNetwork(num_actions)
    # Q-value network, one per environment, used to calculate A_i
    models = [DQN(num_actions) for _ in range(0, num_envs)]  ### Add torch.nn.ModuleList (?)
    # replay buffer, one per env
    memories = [ReplayMemory(memory_replay_size, memory_policy_size)
                for _ in range(0, num_envs)]

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # device = "cpu"
    print(device)

    # move models to device
    policy = policy.to(device)
    for i in range(len(models)):
        models[i] = models[i].to(device)

    # optimizer for every Q model
    optimizers = [optim.Adam(model.parameters(), lr=learning_rate)
                  for model in models]
    # optimizer for the policy
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    # info lists for each environment
    episode_durations = [[] for _ in range(num_envs)]  # local steps per episode
    episode_rewards = [[] for _ in range(num_envs)]    # episode rewards per env
    episodes_done = np.zeros(num_envs)                 # episodes completed per env
    steps_done = np.zeros(num_envs)                    # global timesteps for each env
    current_time = np.zeros(num_envs)                  # local timesteps for each env

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        policy.train()
        for model in models:
            model.train()

        # TODO: add max_num_steps_per_episode
        # Optimization is given by an alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        # 1. do the step for each env
        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #       "exploration factor:", eps_end + (eps_start - eps_end) * \
            #       math.exp(-1. * steps_done / eps_decay))

            # last_screen = env.current_grid_map
            # =========== update step info begin ========================
            current_screen = get_screen(env)
            # state
            state = current_screen  # - last_screen
            # action chosen by pi_1 ~ pi_i
            action = select_action(state, policy, models[i_env], num_actions,
                                   eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta, device)
            # global steps
            steps_done[i_env] += 1
            # local steps
            current_time[i_env] += 1
            # reward
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # next state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # add to buffer
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            # 2. do one optimization step for each env using "soft-q-learning".
            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma,
                           device)
            # =========== update step info end ==========================

            # =========== update episode info begin =====================
            if done:
                print("ENV:", i_env, "iter:", episodes_done[i_env],
                      "\treward:", env.episode_total_reward,
                      "\tit:", current_time[i_env], "\texp_factor:",
                      eps_end + (eps_start - eps_end) *
                      math.exp(-1. * episodes_done[i_env] / eps_decay))
                # reset env
                env.reset()
                # increment the episode count for this env
                episodes_done[i_env] += 1
                # append this episode's local timesteps to the duration list
                episode_durations[i_env].append(current_time[i_env])
                # reset local timesteps
                current_time[i_env] = 0
                # append the total episode reward to the list
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)
            # =========== update episode info end =======================

        # 3. do one optimization step for the policy
        # after all envs have performed one step, optimize the policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, device)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
import sys

sys.path.append('../')
from envs.gridworld_env import GridworldEnv

sys.path.append('../sql')
import trainingSQL

trainingSQL.trainSQL(
    file_name="env1",
    env=GridworldEnv(1),
    batch_size=128,
    gamma=0.95,
    beta=5,
    eps_start=0.9,
    eps_end=0.05,
    eps_decay=300,
    is_plot=False,
    num_episodes=500,
    max_num_steps_per_episode=1000,
    learning_rate=0.001,
    memory_replay_size=10000,
)
# trainingDQN.trainDQN(file_name="env1",
#                      env=GridworldEnv(1),
#                      batch_size=128,
#                      gamma=0.999,
#                      eps_start=0.9,
#                      eps_end=0.05,
#                      eps_decay=1000,
#                      is_plot=True,
#                      num_episodes=500,
#                      max_num_steps_per_episode=1000,
#                      learning_rate=0.0001,
#                      memory_replay_size=10000,
#                      )

agent, _, _ = trainingDQN.trainDQN(
    file_name="env1",
    env=GridworldEnv(1),
    batch_size=128,
    gamma=0.999,
    eps_start=0.9,
    eps_end=0.05,
    eps_decay=1000,
    is_plot=False,
    num_episodes=500,
    max_num_steps_per_episode=1000,
    learning_rate=0.0001,
    memory_replay_size=10000,
)

play_game(GridworldEnv(1), agent)
def trainSQL0(file_name="SQL0", env=GridworldEnv(1), batch_size=128,
              gamma=0.999, beta=5, eps_start=0.9, eps_end=0.05,
              eps_decay=1000, is_plot=False, num_episodes=500,
              max_num_steps_per_episode=1000, learning_rate=0.001,
              memory_replay_size=10000):
    """
    Soft Q-learning training routine when the observation vector is the input.
    Returns rewards and durations logs.
    Plots the environment screen when is_plot is set.
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    input_size = env.observation_space.shape[0]
    model = DQN(input_size, num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0

    # plt.ion()
    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
            if i_episode != 0:
                print("Cur episode:", i_episode, "steps done:",
                      episode_durations[-1], "exploration factor:",
                      eps_end + (eps_start - eps_end) *
                      math.exp(-1. * steps_done / eps_decay),
                      "reward:", env.episode_total_reward)

        # Initialize the environment and state
        state = torch.from_numpy(env.reset()).type(
            torch.FloatTensor).view(-1, input_size)

        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)
            if done:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, optimizer, memory, batch_size, gamma,
                           beta)  #### Difference w.r.t. DQN

            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)  ##### Modify for OpenAI envs such as CartPole
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-sql0-rewards', episode_rewards)
    np.save(file_name + '-sql0-durations', episode_durations)

    return model, episode_rewards, episode_durations
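
# Hedged sketch (not this repo's optimize_model): the soft state value that
# standard soft Q-learning backs Q-values up through,
#     V(s) = (1 / beta) * log sum_a exp(beta * Q(s, a)),
# written with torch.logsumexp for numerical stability. The beta argument
# mirrors trainSQL0's inverse-temperature parameter.
def soft_state_value_sketch(q_values, beta=5.0):
    # q_values: FloatTensor of shape (batch_size, num_actions)
    return (1.0 / beta) * torch.logsumexp(beta * q_values, dim=1)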
import sys

sys.path.append('../')
from envs.gridworld_env import GridworldEnv

sys.path.append('../a3c')
import trainingA3C
import gym

trainingA3C.trainA3C(file_name="env3",
                     env=GridworldEnv(3),
                     update_global_iter=10,
                     gamma=0.95,
                     is_plot=False,
                     num_episodes=500,
                     max_num_steps_per_episode=1000,
                     learning_rate=0.001)
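
# Optional follow-up sketch: trainA3C saves the per-episode rewards to
# "<file_name>-a3c-rewards.npy" (here "env3-a3c-rewards.npy"), so the logged
# curve can be reloaded and summarized after training. numpy import is assumed.
import numpy as np

rewards = np.load("env3-a3c-rewards.npy")
print("episodes logged:", len(rewards), "mean reward:", float(np.mean(rewards)))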