def __init__(self, alpha, discount, environment, epsilon=0.2):
    super().__init__(alpha, discount, environment)
    self.optimal_policy = policy.GreedyPolicy(environment.state_space,
                                              environment.action_space,
                                              self.qvalues)
    self.explore_policy = policy.EpsilonGreedyPolicy(
        environment.state_space, environment.action_space, self.qvalues,
        epsilon)
def __init__(self, network, max_len_memory, to_observe, pol, gamma,
             log_dir, load_prev, game):
    self.env = wrap_dqn(gym.make(game))
    self.env.seed(19)
    # self.action_meaning = self.env.env.get_action_meanings()
    self.env._max_episode_steps = None
    self.model = network
    network.model.summary()
    self.batch_size = 32 * 3
    self.to_observe = to_observe
    self.state_size = network.state_size
    self.action_size = network.action_size
    self.log_dir = log_dir
    self.depth = network.depth
    # self.lives = self.env.env.ale.lives()
    if not os.path.exists(self.log_dir):
        os.makedirs(self.log_dir)
    attr = {
        'batch size': self.batch_size,
        'to observe': self.to_observe,
        'depth': self.depth
    }
    self.results = {'info': attr}
    self.memory = SimpleMemory(max_len=max_len_memory)
    if load_prev:
        # Resume from the most recent checkpoint directory, if any.
        path = sorted([
            int(x) for x in os.listdir(self.log_dir)
            if os.path.isdir(os.path.join(self.log_dir, x))
        ])
        if len(path) != 0:
            load_prev = self.load(os.path.join(self.log_dir, str(path[-1])))
        else:
            load_prev = False
    if not load_prev:
        if pol is None:
            self.pol = policy.GreedyPolicy()
        else:
            self.pol = pol
        if gamma is None:
            gamma = policy.EpsPolicy(0.95)
        elif isinstance(gamma, float):
            gamma = policy.EpsPolicy(gamma)
        # Store gamma as a callable so it can be annealed per step.
        if isinstance(gamma, policy.AnnealedPolicy):
            self.gamma = gamma.linear_step
        elif isinstance(gamma, policy.Policy):
            self.gamma = gamma.get_value
def exp_ddqn():
    import matplotlib.pyplot as plt
    eps = 1000
    env = gym.make('CartPole-v0')
    env.seed(19)
    pol = policy.AnnealedPolicy(
        inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()),
        attr='eps', value_max=1.0, value_min=0.1, value_test=0.5,
        nb_steps=500)
    log_dir = './logs/prova_pole' + pol.name
    n = models.DenseDQN(log_dir=log_dir, action_size=env.action_space.n,
                        state_size=env.observation_space.shape[0],
                        layer_size=(24, 24), lr=0.001)
    a = Agent(game=env, net=n, log_dir=log_dir, pol=pol)
    r = a.learn(eps, False, 10, verbose=False)
    plt.plot(range(eps), r, label='DQN')
    for i in [50, 100, 200, 300, 500, 750, 1000, 2000, 3000]:
        pol = policy.AnnealedPolicy(
            inner_policy=policy.EpsPolicy(1.0,
                                          other_pol=policy.GreedyPolicy()),
            attr='eps', value_max=1.0, value_min=0.1, value_test=0.5,
            nb_steps=500)
        log_dir = './logs/prova_pole' + pol.name
        n = models.DoubleDQNWrapper(
            network=models.DenseDQN(log_dir=log_dir,
                                    action_size=env.action_space.n,
                                    state_size=env.observation_space.shape[0],
                                    layer_size=(24, 24), lr=0.001),
            update_time=i)
        a = Agent(game=env, net=n, log_dir=log_dir, pol=pol)
        r = a.learn(eps, False, 10, verbose=False)
        plt.plot(range(eps), r, label='Update time: {}'.format(i))
    plt.legend()
    plt.savefig('exp_ddqn.png')
def __init__(self, alpha, discount, env, epsilon=0.2):
    super().__init__(alpha, discount, env)
    ssp = env.state_space
    asp = env.action_space
    self.optimal_policy = policy.GreedyPolicy(ssp, asp, self.qvalues)
    self.explore_policy = policy.EpsilonGreedyPolicy(ssp, asp, self.qvalues,
                                                     epsilon)
    self.draw_policy = self.optimal_policy
    # Per-pair return lists and the current episode trace, used for
    # Monte Carlo-style updates.
    self.returns = defaultdict(list)
    self.episode = []
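# Hypothetical sketch (not present in the original source) of the
# first-visit Monte Carlo update that `self.returns` and `self.episode`
# above appear to support; the method name and the assumption that
# `self.episode` holds (state, action, reward) triples are illustrative.
def mc_update_sketch(self):
    g = 0.0
    first_return = {}
    # Walk the episode backwards, accumulating the discounted return;
    # overwriting leaves each pair with the return from its first visit.
    for state, action, reward in reversed(self.episode):
        g = reward + self.discount * g
        first_return[(state, action)] = g
    for key, ret in first_return.items():
        self.returns[key].append(ret)
        self.qvalues[key] = np.mean(self.returns[key])
    self.episode = []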
def __init__(self, alpha, discount, environment):
    self.alpha = alpha
    self.discount = discount
    ssp = environment.state_space
    asp = environment.action_space
    self.action_space = asp
    # Tabular action-value estimates, one row per state.
    self.qvalues = np.zeros((ssp, asp), np.float32)
    self.optimal_policy = policy.RandomPolicy(ssp, asp)
    self.explore_policy = self.optimal_policy
    self.draw_policy = policy.GreedyPolicy(ssp, asp, self.qvalues)
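# A minimal sketch (assumed, not from the original source) of the tabular
# Q-learning update that `alpha`, `discount`, and `qvalues` above are set
# up for; the method and argument names are illustrative.
def q_update_sketch(self, state, action, reward, next_state, done):
    # Bootstrap from the greedy value of the next state unless terminal.
    target = reward
    if not done:
        target += self.discount * np.max(self.qvalues[next_state])
    # Move the current estimate toward the target by step size alpha.
    self.qvalues[state, action] += self.alpha * (
        target - self.qvalues[state, action])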
def evaluate(self, env, num_episodes, max_episode_length=1000000):
    """Test the agent with a provided environment.

    If you have any layers that vary in behavior between train/test time
    (such as dropout or batch norm), you should set them to test mode.

    - Input
     - env: gym.Env
        The Atari environment. It needs to be wrapped with the
        wrap_atari_env function in utils.py.
     - num_episodes: int
        How many episodes to run.
     - max_episode_length: int
        How long a single episode may last before it is cut off.

    - Output
     - total: float
        The cumulative reward over all episodes.
    """
    total = 0.0
    greedy = policy.GreedyPolicy()
    for episode in range(num_episodes):
        observation = env.reset()
        state = self.preprocessor.process_state_for_network(observation)
        state = state[:, :, None]
        # Keep the last four frames; pad with zeros at episode start.
        history = [
            np.zeros(state.shape),
            np.zeros(state.shape),
            np.zeros(state.shape), state
        ]
        for t in range(max_episode_length):
            env.render()
            state = np.vstack(
                (history[0], history[1], history[2], history[3]))
            action = self.select_action(state[None, :], greedy)
            observation, reward, done, info = env.step(action)
            state = self.preprocessor.process_state_for_network(observation)
            state = state[:, :, None]
            history = history[1:]
            history.append(state)
            total += reward
            if done:
                break
    return total
def __init__(self, game, net, max_memory=5000,
             log_dir='./logs/prova_cartpole', weight_name=None, pol=None,
             agent_name='agent'):
    self.env = game
    self.env._max_episode_steps = 500
    self.name = agent_name
    self.model = net
    self.memory = deque(maxlen=max_memory)
    self.state_size = net.state_size
    self.action_size = net.action_size
    self.log_dir = os.path.join(log_dir, agent_name)
    # self.model.model.summary()
    if pol is None:
        self.pol = policy.GreedyPolicy()
    else:
        self.pol = pol
    self.episodes_to_watch = 32
    self.batch_size = 32
    self.gamma = 0.95
def update_network(self):
    """Update the Q-network."""
    if self.replay:
        # Obtain a batch from replay memory.
        batch = self.memory.sample(self.batch_size)
        batch = self.preprocessor.process_batch(batch)
        size = len(batch)
    else:
        # Build a batch from plain memory, stacking four consecutive
        # frames per sample.
        n = len(self.memory)
        batch = list()
        for i in range(3, n - 1):
            state, a, r, nexts, done = self.memory[i]
            state, nexts = self.memory[i - 3][0], self.memory[i - 3][3]
            for x in range(2, -1, -1):
                state = np.vstack((state, self.memory[i - x][0]))
                if not done:
                    nexts = np.vstack((nexts, self.memory[i - x][3]))
                else:
                    nexts = None
            batch.append((state, a, r, nexts, done))
        size = len(batch) - 1

    # Unpack the first sample to seed the stacked arrays. `terminal`
    # holds the discount factor for non-terminal samples and 0 for
    # terminal ones, so multiplying by it both discounts and masks.
    inputs, actions, outputs, nextinputs, terminal = batch[0]
    inputs = inputs[None, :]
    if not terminal:
        nextinputs = nextinputs[None, :]
        terminal = self.gamma
    else:
        nextinputs = inputs
        terminal = 0
    for i, sample in enumerate(batch):
        if i == 0:
            continue
        state, action, reward, nexts, is_terminal = sample
        state = state[None, :]
        inputs = np.vstack((inputs, state))
        actions = np.hstack((actions, action))
        outputs = np.hstack((outputs, reward))
        if not is_terminal:
            nexts = nexts[None, :]
            nextinputs = np.vstack((nextinputs, nexts))
            terminal = np.vstack((terminal, self.gamma))
        else:
            nextinputs = np.vstack((nextinputs, state))
            terminal = np.vstack((terminal, 0))

    # Bootstrap values from the target network. `terminal` already
    # carries the discount factor, so do not multiply by gamma again.
    next_y = self.target_network.predict(nextinputs, batch_size=size)
    next_y = next_y * terminal
    target_y = self.q_network.predict(inputs, batch_size=size)

    if self.double:
        # Double DQN: the online network chooses the next action, the
        # target network evaluates it.
        online_next = self.q_network.predict(nextinputs, batch_size=size)
        target_actions = np.argmax(online_next, axis=1)
        outputs = outputs + next_y[range(next_y.shape[0]), target_actions]
    else:
        outputs = outputs + np.max(next_y, axis=1)
    target_y[range(target_y.shape[0]), actions] = outputs

    # One gradient descent step toward the TD targets.
    self.q_network.fit(inputs, target_y, batch_size=size, verbose=0)
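# A compact, vectorized restatement of the target computation above, for
# readers who want the algorithm without the batch-assembly code. The
# function name and the clean array inputs (`not_done` is a 0/1 mask) are
# assumptions for illustration.
def dqn_targets_sketch(self, states, actions, rewards, next_states, not_done):
    q_next = self.target_network.predict(next_states)
    targets = self.q_network.predict(states)
    if self.double:
        # Double DQN: the online network selects the next action, the
        # target network evaluates it.
        best = np.argmax(self.q_network.predict(next_states), axis=1)
        bootstrap = q_next[np.arange(len(best)), best]
    else:
        # Vanilla DQN: the target network both selects and evaluates.
        bootstrap = np.max(q_next, axis=1)
    targets[np.arange(len(actions)), actions] = (
        rewards + self.gamma * not_done * bootstrap)
    return targets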
def __init__(self, config=None):
    if config is None:
        config = {}
    self.env = wrap_dqn(gym.make(config.get('game', 'PongNoFrameskip-v4')))
    self.action_size = self.env.action_space.n
    self.to_vis = config.get('visualize', False)
    self.verbose = config.get('verbose', True)
    self.backup = config.get('backup', 25)
    self.episodes = config.get('episodes', 300)
    self.depth = config.get('depth', 4)
    self.state_size = config.get('space', (84, 84))
    self.model = None
    self._target_model = None
    self.prioritized = config.get('prioritized', False)
    if self.prioritized:
        self.memory = PrioritizedMemory(
            max_len=config.get('mem_size', 100000))
    else:
        self.memory = SimpleMemory(max_len=config.get('mem_size', 100000))
    if config.get('duel', False):
        self.model = self._duel_conv()
    else:
        self.model = self._conv()
    self.model.compile(Adam(lr=config.get('lr', 1e-4)), loss=huber_loss)
    if config.get('target', True):
        self._target_model = clone_model(self.model)
        self._target_model.set_weights(self.model.get_weights())
    self._time = 0
    self.update_time = config.get('target_update', 1000)
    self.env._max_episode_steps = None
    self.batch_size = config.get('batch', 32 * 3)
    self.to_observe = config.get('to_observe', 10000)
    self.log_dir = config['log_dir']
    if not os.path.exists(self.log_dir):
        os.makedirs(self.log_dir)
    plot_model(self.model,
               to_file=os.path.join(self.log_dir, 'model.png'),
               show_shapes=True)
    attr = {
        'batch size': self.batch_size,
        'to observe': self.to_observe,
        'depth': self.depth
    }
    self.results = {'info': attr}
    load_prev = config.get('load', False)
    self.gamma = None
    pol = None
    if 'pol' in config:
        if config['pol'] == 'random':
            pol = policy.RandomPolicy()
        elif config['pol'] == 'eps':
            pol = policy.EpsPolicy(config.get('pol_eps', 0.1))
    self.pol = pol
    if load_prev:
        path = sorted([
            int(x) for x in os.listdir(self.log_dir)
            if os.path.isdir(os.path.join(self.log_dir, x))
        ])
        if len(path) != 0:
            load_prev = self.load(os.path.join(self.log_dir, str(path[-1])))
    if self.pol is None:
        # Default exploration: epsilon annealed linearly from 1.0.
        self.pol = policy.AnnealedPolicy(
            inner_policy=policy.EpsPolicy(1.0,
                                          other_pol=policy.GreedyPolicy()),
            attr='eps', value_max=1.0,
            value_min=config.get('ex_min', 0.02), value_test=0.5,
            nb_steps=config.get('ex_steps', 100000))
    if self.gamma is None:
        self.gamma = policy.EpsPolicy(float(config.get('gamma',
                                                       0.99))).get_value
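# A minimal usage sketch for the config-driven constructor above. Only
# the dictionary keys are taken from the code; the class name `DQNAgent`
# is a placeholder assumption.
config = {
    'game': 'PongNoFrameskip-v4',
    'log_dir': './logs/pong',  # the only key without a default
    'duel': True,              # dueling convolutional head
    'target': True,            # keep a separate target network
    'prioritized': False,      # SimpleMemory instead of PrioritizedMemory
    'pol': 'eps',              # fixed epsilon-greedy exploration
    'pol_eps': 0.1,
    'gamma': 0.99,
}
agent = DQNAgent(config)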
# os.environ['THEANO_FLAGS'] = "device=cuda,floatX=float32"
# os.environ['CPLUS_INCLUDE_PATH'] = '/usr/local/cuda-9.0/include'
import sys

sys.path.append('..')

import policy
from dqn.agent_with_depth_less_memory import ImageAgent as ia_less
from dqn.models_with_depth import DenseDQN, DoubleDQNWrapper, ConvDQM, ConvDDQN

n = ConvDQM(action_size=6, state_size=(84, 84), depth=4, lr=1e-4)
n = DoubleDQNWrapper(n, 10000)
# n = DenseDQN(action_size=3, state_size=6, depth=4, lr=0.001,
#              layer_size=(64, 64))

pol = policy.AnnealedPolicy(
    inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()),
    attr='eps', value_max=1.0, value_min=0.02, value_test=0.5,
    nb_steps=100000)

agent = ia_less(pol=pol, network=n, to_observe=10000, max_len_memory=100000,
                log_dir='../pong/good_wrappers_DDQN_32x3-8/', load_prev=True,
                gamma=0.99)
# agent = ram_less(pol=pol, network=n, to_observe=50000, max_len_memory=1000000,
def exp_double_duel():
    import matplotlib.pyplot as plt
    eps = 1000
    env = gym.make('CartPole-v0')
    env.seed(19)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    log_dir = './logs/prova_pole'
    net = models.DenseDQN(log_dir=log_dir, action_size=action_size,
                          state_size=state_size, layer_size=(24, 24),
                          lr=0.001)
    a = Agent(game=env, net=net, log_dir=log_dir,
              pol=policy.AnnealedPolicy(
                  inner_policy=policy.EpsPolicy(
                      1.0, other_pol=policy.GreedyPolicy()),
                  attr='eps', value_max=1.0, value_min=0.1, value_test=0.5,
                  nb_steps=500))
    r = a.learn(eps, False, 10, verbose=False)
    plt.plot(range(eps), r, label='DQN')

    net = models.DuelDenseDQN(log_dir=log_dir, action_size=action_size,
                              state_size=state_size, layer_size=(24, 24),
                              lr=0.001, layer_size_val=(4, 4))
    env.seed(19)
    a = Agent(game=env, net=net, log_dir=log_dir,
              pol=policy.AnnealedPolicy(
                  inner_policy=policy.EpsPolicy(
                      1.0, other_pol=policy.GreedyPolicy()),
                  attr='eps', value_max=1.0, value_min=0.1, value_test=0.5,
                  nb_steps=500))
    r = a.learn(eps, False, 10, verbose=False)
    plt.plot(range(eps), r, label='Duel DQN 4 4')

    for i in [50, 100, 200, 300, 500, 750, 1000, 2000, 3000]:
        net = models.DuelDenseDQN(log_dir=log_dir, action_size=action_size,
                                  state_size=state_size, layer_size=(24, 24),
                                  lr=0.001, layer_size_val=(4, 4))
        n = models.DoubleDQNWrapper(network=net, update_time=i)
        a = Agent(game=env, net=n, log_dir=log_dir,
                  pol=policy.AnnealedPolicy(
                      inner_policy=policy.EpsPolicy(
                          1.0, other_pol=policy.GreedyPolicy()),
                      attr='eps', value_max=1.0, value_min=0.1,
                      value_test=0.5, nb_steps=500))
        r = a.learn(eps, False, 10, verbose=False)
        plt.plot(range(eps), r, label='Double Duel DQN 4 4 ' + str(i))
    plt.legend()
    plt.savefig('exp_double_duel.png')
def exp_duel():
    import matplotlib.pyplot as plt
    eps = 1000
    env = gym.make('CartPole-v0')
    env.seed(19)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    pol = policy.AnnealedPolicy(
        inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()),
        attr='eps', value_max=1.0, value_min=0.1, value_test=0.5,
        nb_steps=500)
    log_dir = './logs/prova_pole' + pol.name
    net = models.DenseDQN(log_dir=log_dir, action_size=action_size,
                          state_size=state_size, layer_size=(24, 24),
                          lr=0.001)
    a = Agent(game=env, net=net, log_dir=log_dir, pol=pol)
    r = a.learn(eps, False, 10, verbose=False)
    print(r[-1])
    plt.plot(range(eps), r, label='DQN')

    pol = policy.AnnealedPolicy(
        inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()),
        attr='eps', value_max=1.0, value_min=0.1, value_test=0.5,
        nb_steps=500)
    net = models.DuelDenseDQN(log_dir=log_dir, action_size=action_size,
                              state_size=state_size, layer_size=(24, 24),
                              lr=0.001, layer_size_val=(12, 12))
    env.seed(19)
    a = Agent(game=env, net=net, log_dir=log_dir, pol=pol)
    r = a.learn(eps, False, 10, verbose=False)
    print(r[-1])
    plt.plot(range(eps), r, label='Duel DQN 12 12')

    pol = policy.AnnealedPolicy(
        inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()),
        attr='eps', value_max=1.0, value_min=0.1, value_test=0.5,
        nb_steps=500)
    net = models.DuelDenseDQN(log_dir=log_dir, action_size=action_size,
                              state_size=state_size, layer_size=(24, 24),
                              lr=0.001, layer_size_val=(8, 8))
    env.seed(19)
    a = Agent(game=env, net=net, log_dir=log_dir, pol=pol)
    r = a.learn(eps, False, 10, verbose=False)
    print(r[-1])
    plt.plot(range(eps), r, label='Duel DQN 8 8')

    pol = policy.AnnealedPolicy(
        inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()),
        attr='eps', value_max=1.0, value_min=0.1, value_test=0.5,
        nb_steps=500)
    net = models.DuelDenseDQN(log_dir=log_dir, action_size=action_size,
                              state_size=state_size, layer_size=(24, 24),
                              lr=0.001, layer_size_val=(4, 4))
    a = Agent(game=env, net=net, log_dir=log_dir, pol=pol)
    r = a.learn(eps, False, 10, verbose=False)
    plt.plot(range(eps), r, label='Duel DQN 4 4')
    print(r[-1])

    pol = policy.AnnealedPolicy(
        inner_policy=policy.EpsPolicy(1.0, other_pol=policy.GreedyPolicy()),
        attr='eps', value_max=1.0, value_min=0.1, value_test=0.5,
        nb_steps=500)
    net = models.DuelDenseDQN(log_dir=log_dir, action_size=action_size,
                              state_size=state_size, layer_size=(24, 24),
                              lr=0.001, layer_size_val=(24, 24))
    a = Agent(game=env, net=net, log_dir=log_dir, pol=pol)
    r = a.learn(eps, False, 10, verbose=False)
    plt.plot(range(eps), r, label='Duel DQN 24 24')
    print(r[-1])

    plt.legend()
    plt.savefig('exp_duel.png')
def __init__(self, network, max_len_memory=20000, to_observe=5000, pol=None,
             gamma=None, log_dir='', load_prev=False,
             game='Breakout-ramDeterministic-v4'):
    self.env = gym.make(game)
    self.env.seed(19)
    print(self.env.observation_space.shape[0], self.env.action_space.n,
          self.env.env.get_action_meanings())
    self.action_meaning = self.env.env.get_action_meanings()
    print(network.model.summary())
    self.no_op_ep = 30
    self.env._max_episode_steps = None
    self.model = network
    self.batch_size = 32
    self.to_observe = to_observe
    self.state_size = network.state_size
    self.action_size = network.action_size
    self.log_dir = log_dir
    self.depth = network.depth
    if not os.path.exists(self.log_dir):
        os.makedirs(self.log_dir)
    attr = {
        'batch size': self.batch_size,
        'to observe': self.to_observe,
        'depth': self.depth,
        'no_op_ep': self.no_op_ep
    }
    self.results = {'info': attr}
    self.memory = PrioritizedMemory(max_len=max_len_memory)
    if load_prev:
        path = sorted([
            int(x) for x in os.listdir(self.log_dir)
            if os.path.isdir(os.path.join(self.log_dir, x))
        ])
        if len(path) != 0:
            load_prev = self.load(os.path.join(self.log_dir, str(path[-1])))
        else:
            load_prev = False
    if not load_prev:
        if pol is None:
            self.pol = policy.GreedyPolicy()
        else:
            self.pol = pol
        if gamma is None:
            gamma = policy.EpsPolicy(0.99)
        elif isinstance(gamma, float):
            # Wrap a plain float so gamma is always a Policy, mirroring the
            # image agent's constructor; otherwise self.gamma would never
            # be set for float arguments.
            gamma = policy.EpsPolicy(gamma)
        if isinstance(gamma, policy.AnnealedPolicy):
            self.gamma = gamma.linear_step
        elif isinstance(gamma, policy.Policy):
            self.gamma = gamma.get_value