import numpy as np
import torch

from vicero.algorithms.deepqlearning import DQN

# LinRegNet is the small linear Q-network defined elsewhere in this project;
# it maps the 64-dimensional gridworld state to 4 action values.

absolutely_all_diffs = []  # collected across env_diff calls for plotting

def env_diff(env_a, env_b, iterations, step_size):
    # Visit-weighted policy agreement: trains one DQN per environment and
    # measures how often their greedy policies pick the same action, weighted
    # by how often the states are actually visited.
    all_states = env_a.get_all_states() + env_b.get_all_states()

    dqn_a = DQN(env_a, qnet=LinRegNet(64, 4).double(), plotter=None, render=False,
                memory_length=2000, gamma=.99, alpha=.001, epsilon_start=0.1)
    dqn_b = DQN(env_b, qnet=LinRegNet(64, 4).double(), plotter=None, render=False,
                memory_length=2000, gamma=.99, alpha=.001, epsilon_start=0.1)

    all_mean_diffs = []
    for ne in range(iterations):
        dqn_a.train(step_size, 4, plot=False)
        dqn_b.train(step_size, 4, plot=False)

        env_diffs = []
        total_visits = []
        for state in all_states:
            key = torch.from_numpy(state)
            total_visits.append((dqn_a.state_visits[key] if key in dqn_a.state_visits else 0)
                                + (dqn_b.state_visits[key] if key in dqn_b.state_visits else 0))

        visit_norm = np.linalg.norm(total_visits)

        for idx, state in enumerate(all_states):
            state = torch.from_numpy(state)
            action_a = dqn_a.qnet(state).max(0)[1].numpy()
            action_b = dqn_b.qnet(state).max(0)[1].numpy()
            # Contribute the normalized visit count when the greedy actions
            # agree, 0 otherwise.
            env_diffs.append(total_visits[idx] / visit_norm if action_a == action_b else 0)

        print('{}/{} mean difference: {:.4f}'.format(ne + 1, iterations, np.mean(env_diffs)))
        all_mean_diffs.append(np.mean(env_diffs))

    absolutely_all_diffs.append(all_mean_diffs)
    return all_mean_diffs[-1]
def env_diff(env_a, env_b, iterations, step_size):
    # Q-value variant: mean squared difference between the two Q-networks over
    # the union of both environments' state spaces.
    all_states = env_a.get_all_states() + env_b.get_all_states()

    dqn_a = DQN(env_a, qnet=LinRegNet(64, 4).double(), plotter=None, render=False,
                memory_length=2000, gamma=.99, alpha=.001, epsilon_start=0.1)
    dqn_b = DQN(env_b, qnet=LinRegNet(64, 4).double(), plotter=None, render=False,
                memory_length=2000, gamma=.99, alpha=.001, epsilon_start=0.1)

    all_mean_diffs = []
    for ne in range(iterations):
        dqn_a.train(step_size, 4, plot=False)
        dqn_b.train(step_size, 4, plot=False)

        env_diffs = []
        for state in all_states:
            state = torch.from_numpy(state)
            env_diffs.append(torch.sum((dqn_a.qnet(state) - dqn_b.qnet(state))**2).item())

        print('{}/{} mean difference: {:.4f}'.format(ne + 1, iterations, np.mean(env_diffs)))
        all_mean_diffs.append(np.mean(env_diffs))

    absolutely_all_diffs.append(all_mean_diffs)
    return all_mean_diffs[-1]

#envs = [
#    (Gridworld(width=4, height=4, cell_size=32, agent_pos=(2, 0), food_pos=[(0, 3), (3, 3)]),
#     Gridworld(width=4, height=4, cell_size=32, agent_pos=(2, 0), food_pos=[(1, 3), (3, 3)]))
#]

#for env_pair in envs:
#    print(env_diff(env_pair[0], env_pair[1], 10, 10))

#for diff in absolutely_all_diffs:
#    plt.plot(diff)

#plt.savefig('test.png')
#plt.show()
def dqn_benchmark(env, iterations):
    histories = []
    for _ in range(iterations):
        dqn = DQN(env, qnet=LinRegNet(64, 4).double(), plot_durations=True, plotter=None,
                  render=False, memory_length=2000, gamma=.99, alpha=.001, epsilon_start=0.1)
        dqn.train(200, 4)
        histories.append(np.mean(dqn.history))
    return histories
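# Hedged usage sketch (not part of the original script): benchmark a single
# seeded Gridworld a few times and report the spread of mean episode scores.
# The Gridworld arguments mirror the commented-out example above; the seed and
# the 5 repetitions are arbitrary illustration values.
#
#bench_env = Gridworld(width=4, height=4, cell_size=32, seed=42)
#scores = dqn_benchmark(bench_env, 5)
#print('mean: {:.2f}, std: {:.2f}'.format(np.mean(scores), np.std(scores)))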
import gym

from vicero.algorithms.deepqlearning import DQN, NetworkSpecification
from vicero.agent import Agent

# This example shows off multiple concepts; for a purer DQN example, see
# mountaincar. The first part is obviously to solve cartpole, but in addition
# the script saves the policy as it is after one shorter round of training and
# then trains a while longer. At last it shows both policies in comparison.
# This demonstrates both that training actually improves performance and the
# concept of saving a policy.

env = gym.make('CartPole-v1')

spec = NetworkSpecification()
dqn = DQN(env, spec, render=False, caching_interval=200)

batch_size = 32
num_episodes = 4
training_iter = 500
completion_reward = -10

print('training...')
dqn.train(num_episodes, batch_size, training_iter, verbose=True,
          completion_reward=completion_reward, plot=True, eps_decay=True)
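# The comment above describes saving the policy after this first, shorter round
# of training and then training further before comparing the two. A minimal
# sketch of that continuation, assuming copy_target_policy() and save() behave
# as in the boat example in this repo (filename illustrative; how the two
# policies are replayed against each other is left out):
early_policy = dqn.copy_target_policy(verbose=False)
dqn.save('cartpole_early.pkl')

print('training longer...')
dqn.train(num_episodes, batch_size, training_iter, verbose=True,
          completion_reward=completion_reward, plot=True, eps_decay=True)
late_policy = dqn.copy_target_policy(verbose=False)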
env_list = []
for i in range(10):
    env_list.append(Gridworld(width=4, height=4, cell_size=32, seed=(20 + i)))

#for i in [8, 0, 2, 4, 7]:
#    env_list.append(Gridworld(width=4, height=4, cell_size=32, seed=i))

env = MultitaskEnvironment(env_list)

for _ in range(repetitions):
    print('x')
    dqn = DQN(env, qnet=NeuralNet(64, 4).double(), plotter=plot, render=False,
              memory_length=2000, gamma=.99, alpha=.001, epsilon_start=0.1,
              plot_durations=True)
    dqn.train(training_iterations, 4, plot=False)
    histories_a.append(dqn.history)

#for seed in env.env_scores:
#    print('seed={}, score={}'.format(seed, np.mean(env.env_scores[seed])))

print('Set B')
histories_b = []
env_list = []
        #x = F.relu(self.fc2(x))
        #x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x
"""

class PolicyNet(nn.Module):
    def __init__(self):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(256, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 4)

    def forward(self, x):
        x = torch.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

dqn = DQN(env, qnet=PolicyNet().double(), plotter=plot, render=True,
          memory_length=2000, gamma=.95, alpha=.001, epsilon_start=0.1)
dqn.train(2000, 4, plot=True, verbose=True)
        for i in range(1, len(layer_sizes)):
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Linear(layer_sizes[i - 1], layer_sizes[i]))
        self.nn = nn.Sequential(*self.layers)

    def forward(self, x):
        x = torch.flatten(x)
        return self.nn.forward(x)

gamma = .95
alpha = .002
num_episodes = 2000

convergence_durations = []

for i in range(50):
    print('Simulation {}/{}'.format(i, 50))
    net = DenseNet(64, [4]).double()
    dqn = DQN(env, qnet=net, plotter=plot, render=True, memory_length=2000,
              gamma=gamma, alpha=alpha, epsilon_start=0.1, caching_interval=3000)

    for e in range(num_episodes):
        dqn.train_episode(e, num_episodes, 16, plot=True, verbose=False)
        if e > 50 and np.std(dqn.history[-50:]) < 20:
            print('Early stop after {} iterations'.format(e))
            convergence_durations.append(e)
            break

    if len(convergence_durations) <= i:
        convergence_durations.append(env.cutoff)

print('mean duration: {}'.format(np.mean(convergence_durations)))
    def forward(self, x):
        x = torch.flatten(x)
        return self.linreg(x)
        # Unreachable leftover from an earlier, deeper network:
        #x = F.relu(self.fc1(x))
        #x = F.relu(self.fc2(x))
        #x = self.fc3(x)
        #return x

all_states = env_a.get_all_states()

#ql_a = Qlearning(env_a, n_states=len(all_states), n_actions=env_a.action_space.n, plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
#ql_b = Qlearning(env_b, n_states=len(all_states), n_actions=env_b.action_space.n, plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)

dqn_a = DQN(env_a, qnet=PolicyNet().double(), plotter=None, render=False,
            memory_length=2000, gamma=.99, alpha=.001, epsilon_start=0.1)
dqn_b = DQN(env_b, qnet=PolicyNet().double(), plotter=None, render=False,
            memory_length=2000, gamma=.99, alpha=.001, epsilon_start=0.1)

#dqn.train(2000, 4, plot=True, verbose=True)

for ne in range(0, 50):
    #np.random.seed(10)
pg.init()
screen = pg.display.set_mode(dim)
clock = pg.time.Clock()

env = BoatEnvironment(dim, screen, intermediate_rewards=False, multi_agent=True,
                      default_reward=0, illegal_penalty=-1)

framerate = 60
running = True

spec = NetworkSpecification(hidden_layer_sizes=[14, 8], activation_function=nn.ReLU)
dqn = DQN(env, spec, render=False, gamma=0.99, alpha=1e-5, epsilon_start=0.9,
          epsilon_end=0.05, memory_length=5000)

batch_size = 2
num_episodes = 20
training_iter = 10

print('training...')
dqn.train(num_episodes, batch_size, training_iter, verbose=True, plot=True, eps_decay=True)

boat1_policy = dqn.copy_target_policy(verbose=False)
boat2_policy = dqn.copy_target_policy(verbose=False)
dqn.save('long_training.pkl')

boat1_state = env.reset()
boat2_state = boat1_state

plt.plot(dqn.history)
pg.init()
screen = pg.display.set_mode(dim)
clock = pg.time.Clock()

env = BoatEnvironment(dim, screen, intermediate_rewards=True, multi_agent=True)

framerate = 100
running = True

spec = NetworkSpecification(hidden_layer_sizes=[12], activation_function=nn.Sigmoid)
dqn = DQN(env, spec, render=True, alpha=1e-5, epsilon_start=0.8, epsilon_end=0.01,
          memory_length=2000)

batch_size = 4
num_episodes = 100
training_iter = 1500

print('training...')
dqn.train(num_episodes, batch_size, training_iter, verbose=True, plot=True, eps_decay=True)
import gym
import matplotlib.pyplot as plt

from vicero.algorithms.deepqlearning import DQN, NetworkSpecification

# This function defines a custom reward, overriding the one from the
# environment. This one rewards the agent based on the absolute speed of the
# car, with a small constant penalty per step.
def state_to_reward(state):
    return abs(state[1]) * 10 - 0.05

env = gym.make('MountainCar-v0')

spec = NetworkSpecification()
dqn = DQN(env, spec=spec, state_to_reward=state_to_reward)

batch_size = 32
num_episodes = 200
training_iter = 500

dqn.train(num_episodes, batch_size, training_iter, verbose=True, plot=True)

plt.plot(dqn.history)
plt.show()

plt.plot(dqn.maxq_history)
plt.show()
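# A hypothetical alternative shaping for the same state_to_reward hook (not
# used above): MountainCar-v0 observations are [position, velocity], and the
# car starts near the valley bottom at roughly x = -0.5, so this variant
# rewards horizontal displacement from the start instead of speed.
#def state_to_reward(state):
#    return abs(state[0] + 0.5)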
def get_diversity(env_list, learning_alg, state_dist, softmax=False,
                  training_iterations=1000, steps=10, verbose=False, input_size=64):
    assert not (learning_alg == 'reinforce' and softmax)

    expert_agents = []
    for i in range(len(env_list)):
        if learning_alg == 'reinforce':
            agent = Reinforce(env_list[i], polinet=LogRegNet(input_size, 4).double())
        elif learning_alg == 'dqn':
            agent = DQN(env_list[i], qnet=LinRegNet(input_size, 4).double(), plotter=None,
                        render=False, memory_length=2000, gamma=.99, alpha=.001,
                        epsilon_start=0.3)
        expert_agents.append(agent)

    diversity_history = []
    for step in range(steps):
        print('step {}/{}'.format(step, steps))

        for i in range(len(env_list)):
            if verbose:
                print('Training expert agent for environment({}) {}/{}'.format(
                    env_list[i].seed, i + 1, len(env_list)))
            start = time.time()
            if learning_alg == 'reinforce':
                expert_agents[i].train(training_iterations)
            elif learning_alg == 'dqn':
                expert_agents[i].train(training_iterations, 4)
            end = time.time()
            if verbose:
                print('Elapsed time: {:.2f}s'.format(end - start))

        diff_list = []
        for i in range(len(env_list)):
            for j in range(i + 1, len(env_list)):
                env_a = env_list[i]
                env_b = env_list[j]
                agent_a = expert_agents[i]
                agent_b = expert_agents[j]

                if state_dist == 'full':
                    all_states = env_a.get_all_states() + env_b.get_all_states()
                elif state_dist == 'memory':
                    all_states = [sample[0].numpy() for sample in agent_a.memory] + \
                                 [sample[0].numpy() for sample in agent_b.memory]

                env_diffs = []
                with torch.no_grad():
                    for state in all_states:
                        state = torch.from_numpy(state)
                        if learning_alg == 'reinforce':
                            env_diffs.append(torch.sum(
                                (agent_a.policy_net(state) - agent_b.policy_net(state))**2).item())
                        elif learning_alg == 'dqn':
                            if softmax:
                                env_diffs.append(torch.sum(
                                    (F.softmax(agent_a.qnet(state), dim=-1)
                                     - F.softmax(agent_b.qnet(state), dim=-1))**2).item())
                            else:
                                env_diffs.append(torch.sum(
                                    (agent_a.qnet(state) - agent_b.qnet(state))**2).item())

                diff_list.append(np.mean(env_diffs))

        diversity_history.append(np.mean(diff_list))

    return diversity_history
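# Hedged usage sketch (not in the original file): measure diversity across a
# handful of seeded Gridworlds with DQN experts and plot its progression.
# Assumes Gridworld and matplotlib.pyplot (plt) are available as in the other
# experiments; the seeds, step counts and 'memory' state distribution are
# arbitrary illustration values.
#
#envs = [Gridworld(width=4, height=4, cell_size=32, seed=s) for s in range(5)]
#diversity = get_diversity(envs, 'dqn', state_dist='memory',
#                          training_iterations=200, steps=5, verbose=True)
#plt.plot(diversity)
#plt.show()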