Example #1
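    # DQN agent constructor: creates the policy network and an Adam optimizer here; replay-memory and epsilon-decay settings are passed to the base class.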
    def __init__(self,
                 input_dim,
                 output_dim,
                 lr,
                 gamma,
                 max_memory_size,
                 batch_size,
                 eps_start,
                 eps_end,
                 eps_decay,
                 device,
                 linear1_units=64,
                 linear2_units=64,
                 decay_type="linear"):

        super().__init__(max_memory_size, batch_size, eps_start, eps_end,
                         eps_decay, device, decay_type)

        self.model_name = "DQN"
        self.output_dim = output_dim
        self.policy_net = DQN(input_dim, output_dim, linear1_units,
                              linear2_units).to(device)

        # optimizer
        self.optim = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.gamma = gamma
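Example #2
    # Recurrent DQN agent constructor: a slice-based replay buffer sized for rollouts, a linear epsilon schedule, the online network on CPU, a deep-copied target network, and an Adam optimizer.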
    def __init__(self, learn_rate, 
            state_shape, num_actions, action_shape, 
            batch_size, slice_size):
        self.gamma = 0.999
        self.tau = 0.01
        self.clip_grad_norm = 0.1
        self.has_target_net = True

        self.state_shape = state_shape
        self.num_actions = num_actions      #   this is how many actions there are to choose from
        self.action_shape = action_shape    #   this is how many actions the env accepts at each step

        self.buffer_size = 1_000_000
        self.batch_size = batch_size    # *times slice_size, because recurrency/rollouts
        self.slice_size = slice_size

        self.slice_replay_buffer = MemorySliceReplayBuffer(
            size=self.buffer_size, slice_size=self.slice_size, 
            state_shape=self.state_shape, action_shape=self.action_shape)
        self.epsilon = LinearSchedule(start=1.0, end=0.01, num_steps=300)
        # self.epsilon = LinearSchedule(start=1.0, end=0.1, num_steps=30)


        # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")

        self.net = DQN(state_shape, num_actions).to(self.device)
        if self.has_target_net:
            self.target_net  = copy.deepcopy(self.net).to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learn_rate)
Example #3
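# Rainbow DQN training loop (TensorFlow 1.x): stacks frames, builds the Rainbow models, and trains with a 4-step player and a prioritized replay buffer.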
def main():
    """Run DQN until the environment throws an exception."""
    env = AllowBacktracking(make_env(stack=False, scale_rew=False))
    env = BatchedFrameStack(BatchedGymEnv([[env]]), num_images=4, concat=False)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    with tf.Session(config=config) as sess:
        dqn = DQN(*rainbow_models(sess,
                                  env.action_space.n,
                                  gym_space_vectorizer(env.observation_space),
                                  min_val=-200,
                                  max_val=200))
        player = NStepPlayer(BatchedPlayer(env, dqn.online_net), 4)
        optim, optimize = dqn.optimize(learning_rate=0.0001)
        sess.run(tf.global_variables_initializer())
        dqn.train(
            num_steps=3000000,  # Make sure an exception arrives before we stop.
            player=player,
            replay_buffer=PrioritizedReplayBuffer(500000,
                                                  0.5,
                                                  0.4,
                                                  epsilon=0.1),
            optimize_op=optimize,
            train_interval=1,
            target_interval=1024,
            batch_size=16,
            min_buffer_size=20000)
Example #4
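    # Fixed-target DQN constructor: reuses the base DQN setup and adds a target network initialized from the policy network and kept in eval mode; target_update sets the refresh interval.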
    def __init__(self,
                 input_dim,
                 output_dim,
                 lr,
                 gamma,
                 max_memory_size,
                 batch_size,
                 eps_start,
                 eps_end,
                 eps_decay,
                 device,
                 target_update=100,
                 linear1_units=64,
                 linear2_units=64,
                 decay_type="linear"):

        super().__init__(input_dim, output_dim, lr, gamma, max_memory_size,
                         batch_size, eps_start, eps_end, eps_decay, device,
                         linear1_units, linear2_units, decay_type)

        self.model_name = "FixedDQN"

        self.target_update_freq = target_update
        # networks
        self.output_dim = output_dim
        self.target_net = DQN(input_dim, output_dim, linear1_units,
                              linear2_units).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.updated = 0
Example #5
File: trainer.py Project: tzyrq/marl_dqn
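    # Multi-agent trainer constructor: builds an Encoder and a DQN for n_agents agents over n_cities cities, keeps an iterator over environments from the data loader, and trains with RMSprop and an MSE loss under an epsilon-greedy schedule.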
    def __init__(self, args, n_agents, n_cities, device, data_loader):
        self.n_agents = n_agents
        self.n_cities = n_cities

        self.device = device

        self.args = args
        self.Encoder = Encoder(K=args.steps,
                               M=self.n_cities,
                               L=args.len_encoder).to(self.device)
        self.DQN = DQN(N=self.n_agents,
                       K=args.steps,
                       L=args.len_encoder,
                       M=n_cities).to(self.device)

        self.data_loader = data_loader
        self.iter_data = iter(data_loader)
        self.n_envs = len(data_loader)
        self.idx_env = -1
        self.env = None

        self.EPS_START = self.args.eps_start
        self.EPS_END = self.args.eps_end
        self.EPS_DECAY = self.args.eps_decay

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.RMSprop(self.DQN.parameters(), lr=args.lr)
Example #6
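    # Per-episode player setup: EPSILON anneals linearly with the episode index (floored at EPS_END); the DQN maps the 9-cell board state plus one extra input to 9 action values.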
    def __init__(self, player, episode):

        self.EPSILON = EPS_END + (EPS_START -
                                  EPS_END) * (1 - (episode / DECAY_LEN))
        self.EPSILON = max(self.EPSILON, EPS_END)
        self.n_states = 9
        self.state = np.zeros(self.n_states, dtype=int)  # np.int is a removed NumPy alias; the builtin int is equivalent
        self.player = player
        self.reward = 0
        self.prev_state = None
        self.dqn = DQN(self.n_states + 1, self.n_states)
Example #7
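    # Agent constructor: policy and target DQNs with checkpoint names derived from the environment name, a replay buffer, an Adam optimizer on the policy network, and an MSE loss.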
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=0.9999,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='tmp/dqn',
                 device='cuda:0'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.device = device

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        # Create policy and target DQN models
        self.policy = DQN(self.n_actions,
                          input_dims=self.input_dims,
                          name=self.env_name + '_' + 'policy',
                          chkpt_dir=self.chkpt_dir)
        self.target = DQN(self.n_actions,
                          input_dims=self.input_dims,
                          name=self.env_name + '_' + 'target',
                          chkpt_dir=self.chkpt_dir)

        # put on correct device (GPU or CPU)
        self.policy.to(device)
        self.target.to(device)

        # Optimizer
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        # Loss
        self.loss = nn.MSELoss()
Example #8
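    # Target-network class: re-enters the shared "model" variable scope with reuse=True, stores tau, registers counterpart copy ops, and exposes update() to run them.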
    def __init__(self, sess, state_dimension, num_actions, tau=0.001):
        self.sess = sess
        DQN.__init__(self,
                     sess,
                     state_dimension,
                     num_actions,
                     scope="model",
                     reuse=True)
        self.tau = tau
        self._counterpart = self._register_counterpart()

        def update():
            for op in self._counterpart:
                self.sess.run(op)

        self.update = update
        tf.global_variables_initializer().run(session=sess)
Example #9
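    # Minimal agent constructor: a single online DQN on CPU, a linear epsilon schedule, a plain list as replay memory, and no target network (has_target_net is False).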
    def __init__(self, learn_rate, input_shape, num_actions, batch_size):
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.gamma = 0.99
        self.tau = 0.05
        self.has_target_net = False

        self.memories = []
        # self.epsilon = LinearSchedule(start=1.0, end=0.01, num_steps=2000)
        self.epsilon = LinearSchedule(start=1.0, end=0.1, num_steps=30)

        # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")

        self.net = DQN().to(self.device)
        if self.has_target_net:
            self.target_net = copy.deepcopy(self.net).to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learn_rate)
Example #10
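    # TensorFlow 1.x training graph: placeholders for target Q-values and (row, action) index pairs, an MSE loss over the gathered predictions, an Adam train step, and a train() closure that runs one update.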
    def __init__(self,
                 sess,
                 state_dimension,
                 num_actions,
                 scope="model",
                 reuse=False):
        # sess = tf.Session()  # TODO add CPU config information

        # Targets in loss computation
        self.target_in = tf.placeholder(shape=[None],
                                        dtype=tf.float32)  # target Q values
        self.action_in = tf.placeholder(shape=[None, 2], dtype=tf.int32)

        train_model = DQN(sess,
                          state_dimension,
                          num_actions,
                          scope,
                          reuse=reuse)
        # target_model = TargetNetwork(sess, state_dimension, num_actions)

        self.loss = tf.losses.mean_squared_error(
            labels=self.target_in,
            predictions=tf.gather_nd(params=train_model.pred_out,
                                     indices=self.action_in))
        self.optimizer = tf.train.AdamOptimizer(0.0005)
        self.train_step = self.optimizer.minimize(self.loss)

        # tf.add_to_collection(tf.GraphKeys.TRAIN_OP, self.pred_out)

        def train(obs, actions, targets):
            """
            Updates the weights of the neural network, based on its targets, its
            predictions, its loss and its optimizer.

            Args:
                sess: TensorFlow session.
                obs: [current_observation] or observations of batch
                actions: [current_action] or actions of batch
                targets: [current_target] or targets of batch
            """
            feed_dict = {
                train_model.obs_in: obs,
                self.action_in: actions,
                self.target_in: targets
            }
            # evaluate the TF tensors and operations self.loss and self.train_step
            loss, _ = sess.run([self.loss, self.train_step],
                               feed_dict=feed_dict)
            return loss

        self.train_model = train_model
        # self.target_model = target_model
        self.train = train
        self.predict = train_model.predict
        # self.save = save_params
        # self.load = load_params
        tf.global_variables_initializer().run(session=sess)
Example #11
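# Training script: seeds the environment and PyTorch, runs epsilon-greedy training episodes, periodically evaluates the greedy policy, and logs loss and reward to a summary.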
def main():
    opt = parse_opt()
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    env = gym.make(game)

    seed = 7122
    env.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    agent = DQN(env, opt, device=device)
    agent.network.apply(weights_init)
    agent.sync_weight()

    progress = trange(opt.episode, ascii=True)
    summary = Summary()
    last_rewards = 0

    for episode in progress:
        # Training
        state = env.reset()
        for s in range(opt.max_step):
            # use epsilon-greedy in training
            action = agent.egreedy_action(state)
            next_state, reward, done, _ = env.step(action)
            loss = agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        summary.add(episode, 'loss', loss)

        # Testing
        if opt.test_interval > 0 and (episode + 1) % opt.test_interval == 0:
            rewards = 0
            for t in trange(opt.test, ascii=True, leave=False):
                state = env.reset()
                for s in range(opt.max_step):
                    action = agent.action(state)
                    next_state, reward, done, _ = env.step(action)
                    state = next_state
                    rewards += reward
                    if done:
                        break

            if opt.test > 0:
                rewards /= opt.test

            last_rewards = rewards
            summary.add(episode, 'reward', rewards)

        progress.set_description('Loss: {:.4f} | Reward: {:2}'.format(
            loss, last_rewards))

    if opt.log:
        summary.write(opt.log)
Example #12
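    # Agent constructor with optional prioritized experience replay (use_PER) and an optional ICM curiosity module (use_ICM); policy and target DQNs are trained with AdamW and an MSE loss.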
    def __init__(self, epsilon_start, epsilon_end, epsilon_anneal, nb_actions,
                 learning_rate, gamma, batch_size, replay_memory_size,
                 hidden_size, model_input_size, use_PER, use_ICM):

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_anneal_over_steps = epsilon_anneal

        self.num_actions = nb_actions

        self.gamma = gamma

        self.batch_size = batch_size

        self.learning_rate = learning_rate

        self.step_no = 0

        self.policy = DQN(hidden_size=hidden_size,
                          inputs=model_input_size,
                          outputs=nb_actions).to(self.device)
        self.target = DQN(hidden_size=hidden_size,
                          inputs=model_input_size,
                          outputs=nb_actions).to(self.device)
        self.target.load_state_dict(self.policy.state_dict())
        self.target.eval()
        self.hidden_size = hidden_size
        self.optimizer = torch.optim.AdamW(self.policy.parameters(),
                                           lr=self.learning_rate)

        self.use_PER = use_PER
        if use_PER:
            self.replay = Prioritized_Replay_Memory(replay_memory_size)
        else:
            self.replay = Replay_Memory(replay_memory_size)

        self.loss_function = torch.nn.MSELoss()
        self.use_ICM = use_ICM
        if use_ICM:
            self.icm = ICM(model_input_size, nb_actions)
Example #13
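# run_gym: builds a CNN or MLP DQN from the config, fills a replay buffer with epsilon-greedy transitions, minimizes the TD loss, hard-updates the target network on a fixed interval, and logs progress.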
def run_gym(params):
    if params.CnnDQN:
        env = make_atari(params.env)
        env = wrap_pytorch(wrap_deepmind(env))
        q_network = CnnDQN(env.observation_space.shape, env.action_space.n)
        target_q_network = deepcopy(q_network)
    else:
        env = make_gym_env(params.env)
        q_network = DQN(env.observation_space.shape, env.action_space.n)
        target_q_network = deepcopy(q_network)

    if USE_CUDA:
        q_network = q_network.cuda()
        target_q_network = target_q_network.cuda()

    agent = Agent(env, q_network, target_q_network)
    optimizer = optim.Adam(q_network.parameters(), lr=params.learning_rate)
    replay_buffer = ReplayBuffer(params.replay_size)

    losses, all_rewards = [], []
    episode_reward = 0
    state = env.reset()

    for ts in range(1, params.max_ts + 1):
        epsilon = get_epsilon(params.epsilon_start, params.epsilon_end,
                              params.epsilon_decay, ts)
        action = agent.act(state, epsilon)

        next_state, reward, done, _ = env.step(int(action.cpu()))
        replay_buffer.push(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0

        if len(replay_buffer) > params.start_train_ts:
            # Update the q-network & the target network
            loss = compute_td_loss(agent, params.batch_size, replay_buffer,
                                   optimizer, params.gamma)
            losses.append(loss.data)

            if ts % params.target_network_update_f == 0:
                hard_update(agent.q_network, agent.target_q_network)

        if ts % params.log_every == 0:
            out_str = "Timestep {}".format(ts)
            if len(all_rewards) > 0:
                out_str += ", Reward: {}".format(all_rewards[-1])
            if len(losses) > 0:
                out_str += ", TD Loss: {}".format(losses[-1])
            print(out_str)
Example #14
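    # Agent holding two Q-networks (convolutional or fully connected) with separate Adam optimizers and a shared replay buffer.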
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 tau=0.01,
                 buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = BasicBuffer(max_size=buffer_size)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.model1 = ConvDQN(env.observation_space.shape,
                                  env.action_space.n).to(self.device)
            self.model2 = ConvDQN(env.observation_space.shape,
                                  env.action_space.n).to(self.device)
        else:
            self.model1 = DQN(env.observation_space.shape,
                              len(env.action_space)).to(self.device)
            self.model2 = DQN(env.observation_space.shape,
                              len(env.action_space)).to(self.device)

        self.optimizer1 = torch.optim.Adam(self.model1.parameters())
        self.optimizer2 = torch.optim.Adam(self.model2.parameters())
Example #15
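    # Dots-and-boxes style player setup: EPSILON anneals linearly with the episode index, the edge grid starts empty, and the DQN input/output sizes both equal the number of edges.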
    def __init__(self, player, nb_rows, nb_cols, timelimit, episode):

        self.EPSILON = EPS_END + (EPS_START -
                                  EPS_END) * (1 - (episode / DECAY_LEN))
        self.EPSILON = max(self.EPSILON, EPS_END)
        self.timelimit = timelimit
        self.nb_rows = nb_rows
        self.nb_cols = nb_cols
        rows = []
        for _ in range(nb_rows + 1):
            columns = []
            for _ in range(nb_cols + 1):
                columns.append({"v": 0, "h": 0})
            rows.append(columns)
        self.cells = rows
        self.len_states = nb_rows * (nb_cols + 1) + nb_cols * (nb_rows + 1)
        self.state = np.zeros(self.len_states)
        self.player = player
        self.score = [0, 0]
        self.reward = 0
        self.prev_state = None
        self.dqn = DQN(self.len_states, self.len_states)
Example #16
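# Connect-Four style agent: wraps a DQN, chooses among columns that are not yet full, retries if the chosen cell is occupied, and stores transitions with reward = winner * role.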
class agent():
    def __init__(self, role, total_episode, epsilon, learning_rate, gamma, batch_size, target_replace_iter, memory_capacity, n_actions, n_states):
        self.role = role
        self.model = DQN(total_episode, epsilon, learning_rate, gamma, batch_size, target_replace_iter, memory_capacity, n_actions, n_states)

    def step(self, board: np.ndarray, episode):
        available = np.array([0 in board[:,col] for col in range(7)])
        action = self.model.get_action(board.flatten(), episode, available)
        location = 5-(np.fliplr(board.T)==0).argmax(axis=1)[action]
        
        while board[location, action] != 0:
            print('Occupied!! Try another move')
            available[action] = False
            action = self.model.get_action(board.flatten(), episode, available)
            location = 5-(np.fliplr(board.T)==0).argmax(axis=1)[action]
            
        board[location,action] = self.role
        return board, action
    
    def store(self, in_board, action, winner, board):
        s  = in_board.flatten()
        a  = action
        r  = winner * self.role
        s_ = board.flatten()
        self.model.store_transition(s, a, r, s_)
        
    def random_action(self, board: np.ndarray):
        available = np.array([0 in board[:,col] for col in range(7)])
        action = np.random.choice(np.array(range(7))[available])
        location = 5-(np.fliplr(board.T)==0).argmax(axis=1)[action]
        
        while board[location, action] != 0:
            print('Occupied!! Try another move')
            action = self.model.get_action(board)
            location = 5-(np.fliplr(board.T)==0).argmax(axis=1)[action]
            
        board[location,action] = self.role
        return board, action
Example #17
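	# Agent combining a VAE vision module with DQN controller/target networks: initializes weights, syncs the target with the controller, and optimizes the controller with Adam and a smooth L1 loss.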
	def __init__(self):
		self.controller, self.target = DQN(), DQN() # For RL 
		self.vision = VAE()

		if USE_CUDA:
			self.controller.cuda()
			self.target.cuda()
			self.vision.cuda()

		# Init weights based on init function
		self.controller.apply(init_weights)
		self.vision.apply(init_weights)
		# Load model params into target
		self.target.load_state_dict(self.controller.state_dict())
		self.action_number = 0 # actions taken (to determine whether or not to update)
	
		# NOTE: DQN exp buffer should use embeddings generated by vision module
		# The vision module (aka the VAE) has memory consisting of game states
		self.exp_buffer = [] # exp buffer
		self.exp_number = 0 # size of exp buffer so far

		self.opt = torch.optim.Adam(self.controller.parameters(),lr=DQN_LEARNING_RATE)
		self.loss = nn.SmoothL1Loss()
Example #18
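# main() for a NEC agent: wraps an Atari environment, uses a DQN as the embedding network, runs a warmup episode followed by training episodes, and logs each episode's reward.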
def main(env_id, embedding_size):
    env = wrap_deepmind(make_atari(env_id), scale=True)
    embedding_model = DQN(embedding_size)
    agent = NECAgent(env, embedding_model)

    # subprocess.Popen(["tensorboard", "--logdir", "runs"])
    configure("runs/pong-run")

    for t in count():
        if t == 0:
            reward = agent.warmup()
        else:
            reward = agent.episode()
        print("Episode {}\nTotal Reward: {}".format(t, reward))
        log_value('score', reward, t)
Example #19
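# play_dqn: rebuilds the DQN/agent stack, loads saved weights into the prediction network, and plays n rendered CartPole episodes with a greedy policy, printing each score.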
def play_dqn(filename, n=10, seed=0):
    env = gym.make("CartPole-v0")
    env.seed(seed)
    env.reset()
    model = DQN(net_structure=(state_size, 64, 64, action_size),
                gamma=gamma,
                optim=optim.Adam,
                optim_param=[alpha],
                loss_function=nn.MSELoss(),
                tau=0.1,
                device=device)

    buffer = ReplayBuffer(memory_size, batch_size, device)
    learning_policy = EpsDecay(eps_start, eps_min, eps_decay,
                               env.action_space.n)
    playing_policy = Greedy()
    agent = Agent(model=model,
                  buffer=buffer,
                  learn_every=4,
                  update_every=4,
                  policy_learning=learning_policy,
                  policy_playing=playing_policy)
    model.predict.load_state_dict(torch.load(filename))
    agent.playing()
    for i in range(n):
        state = env.reset()
        score = 0
        env.render()
        for j in range(99999999999):
            action = agent.act(state)
            env.render()
            state, reward, done, _ = env.step(action)
            score += reward
            if done:
                break
        print(score)
    env.close()
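Example #20
    # Agent constructor supporting double DQN, dueling networks, and prioritized replay; uses Adam with an exponential learning-rate decay schedule and a target network kept in eval mode.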
    def __init__(self, name, state_size, action_size, use_double_dqn=False, use_dueling=False, seed=0, lr_decay=0.9999, use_prioritized_replay=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.name = name
        self.state_size = state_size
        self.action_size = action_size
        self.use_double_dqn = use_double_dqn
        self.use_dueling = use_dueling
        self.seed = random.seed(seed)
        self.use_prioritized_replay = use_prioritized_replay

        # Q-Network
        if use_dueling:
            self.qnetwork_local = DuelingDQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size, seed).to(device)
        else:
            self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size, seed).to(device)

        self.qnetwork_target.eval()
            
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, lr_decay)

        # Replay memory
        if self.use_prioritized_replay:
            self.memory = PrioritizedReplayBuffer(BUFFER_SIZE, seed, alpha=0.2, beta=0.8, beta_scheduler=1.0)
        else:
            self.memory = ReplayBuffer(BUFFER_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #21
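# Agent with online and target DQNs trained with RMSprop and a smooth L1 (Huber) loss: act() is epsilon-greedy, and replay() samples a batch, forms TD targets from the target network, and periodically syncs it.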
class Agent:
    def __init__(self):
        self.model, self.target = DQN(), DQN()
        if USE_CUDA:
            self.model.cuda()
            self.target.cuda()

        self.exp_buffer = Memory()
        self.exp_number = 0  # size of exp buffer so far
        self.param_updates = 0  # track how many times params updated

        self.opt = torch.optim.RMSprop(self.model.parameters(),
                                       lr=LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()

    # Make an action given a state
    def act(self, state, explore=True):
        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            a = np.random.randint(NUM_ACTIONS)
        else:
            # Send state to model
            a_vec = self.model(state)
            a = int(torch.argmax(torch.squeeze(a_vec)))

        return a

    # clear the buffer
    def clear_exp_buffer(self):
        self.exp_buffer = Memory()
        self.exp_number = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        self.exp_buffer.add(exp)
        self.exp_number += 1

    # Replay gets batch and trains on it
    def replay(self, batch_size):
        q_loss = 0
        # If experience buffer isn't right size yet, don't do anything
        if self.exp_number < MIN_BUFFER_SIZE: return
        # Get batch from experience_buffer
        batch = self.exp_buffer.get_batch(batch_size)

        s, a, r, s_new, _ = zip(*batch)
        s_new = s_new[:-1]  # Remove last item (it is 'None')
        # First turn batch into something we can run through model
        s = torch.cat(s)
        a = torch.LongTensor(a).unsqueeze(1)
        r = torch.FloatTensor(r).unsqueeze(1)
        s_new = torch.cat(s_new)

        #print(a.shape,r.shape, s.shape, s_new.shape)
        if USE_CUDA:
            a = a.cuda()
            r = r.cuda()

        # Get q vals for s (what model outputted) from a
        # .gather gets us q value for specific action a
        pred_q_vals = self.model(s).gather(1, a)

        # Having chosen a in s,
        # What is the highest possible reward we can get from s_new?
        # We add q of performing a in s then add best q from next state
        # cat 0 to end for the terminal state
        s_new_q_vals = self.target(s_new).max(1)[0]
        zero = torch.zeros(1)  # a single zero Q-value for the terminal transition at the end of the batch
        if USE_CUDA: zero = zero.cuda()

        # match pred_q_vals' (batch, 1) shape and stop gradients flowing through the target network
        s_new_q_vals = torch.cat((s_new_q_vals, zero)).unsqueeze(1).detach()
        exp_q_vals = r + s_new_q_vals * GAMMA

        myloss = self.loss(pred_q_vals, exp_q_vals)
        self.opt.zero_grad()
        myloss.backward()

        if WEIGHT_CLIPPING:
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)  # clamp gradients to [-1, 1] before the step to avoid exploding gradients

        self.opt.step()

        if self.param_updates % TARGET_UPDATE_INTERVAL == 0:
            self.target.load_state_dict(self.model.state_dict())

        self.param_updates += 1

        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY

        return myloss.item()
Example #22
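# init_dqn: builds MLP or CNN Q- and target networks, an RMSprop or Adam optimizer, a replay buffer, and a linear or exponential epsilon schedule from the parsed arguments.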
def init_dqn(args):
    """Intitialises and returns the necessary objects for
       Deep Q-learning:
       Q-network, target network, replay buffer and optimizer.
    """
    logging.info(
        "Initialisaling DQN with architecture {} and optimizer {}".format(
            args.dqn_archi, args.optimizer_agent))
    if args.dqn_archi == 'mlp':
        q_net = DQN(args.obs_shape, args.n_actions, args)
        q_target = DQN(args.obs_shape, args.n_actions, args)
    elif args.dqn_archi == 'cnn':
        q_net = CnnDQN(args.obs_shape, args.n_actions, args)
        q_target = CnnDQN(args.obs_shape, args.n_actions, args)
    if args.optimizer_agent == 'RMSProp':
        optimizer_agent = optim.RMSprop(q_net.parameters(),
                                        lr=args.lr_agent,
                                        weight_decay=args.lambda_agent)
    else:
        assert args.optimizer_agent == 'Adam'
        optimizer_agent = optim.Adam(q_net.parameters(),
                                     lr=args.lr_agent,
                                     weight_decay=args.lambda_agent)
    q_target.load_state_dict(
        q_net.state_dict())  # set params of q_target to be the same
    replay_buffer = ReplayBuffer(args.replay_buffer_size)

    if args.epsilon_annealing_scheme == 'linear':
        epsilon_schedule = LinearSchedule(schedule_timesteps=int(
            args.exploration_fraction * args.n_agent_steps),
                                          initial_p=args.epsilon_start,
                                          final_p=args.epsilon_stop)
    else:
        assert args.epsilon_annealing_scheme == 'exp'
        epsilon_schedule = ExpSchedule(decay_rate=args.epsilon_decay,
                                       final_p=args.epsilon_stop,
                                       initial_p=args.epsilon_start)

    return q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule
Example #23
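# Recurrent DQN agent: stores whole trajectories, replays each one step by step while carrying the network's hidden state, averages the per-step TD losses, and takes a single optimizer step per batch.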
class Agent():
    def __init__(self, learn_rate, input_shape, num_actions, batch_size):
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.gamma = 0.99
        self.tau = 0.05
        self.has_target_net = False

        self.memories = []
        # self.epsilon = LinearSchedule(start=1.0, end=0.01, num_steps=2000)
        self.epsilon = LinearSchedule(start=1.0, end=0.1, num_steps=30)

        # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")

        self.net = DQN().to(self.device)
        if self.has_target_net:
            self.target_net = copy.deepcopy(self.net).to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learn_rate)

    def update_target_net_params(self):
        for param, target_param in zip(self.net.parameters(),
                                       self.target_net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def choose_action(self, observation, hidden_state):
        state = torch.tensor(observation).float().detach()
        state = state.to(self.device)

        q_values, hidden_state_ = self.net(state, hidden_state)
        action = torch.argmax(q_values).item()

        if random.random() <= self.epsilon.value():
            action = random.randint(0, self.num_actions - 1)

        return action, hidden_state_

    def fetch_batch(self):
        indices = np.random.choice(len(self.memories),
                                   self.batch_size,
                                   replace=False)
        indices = list(indices)

        for idx in indices:
            yield self.memories[idx]

    def store_trajectory(self, trajectory):
        self.memories.append(trajectory)

    def learn(self):
        if len(self.memories) < self.batch_size:
            return

        batch_losses = []
        for memory_idx, memory in enumerate(self.fetch_batch()):
            states, actions, rewards, dones = memory.fetch_on_device(
                self.device)

            self.net.train()

            episode_losses = []
            hidden_state = self.net.get_new_hidden_state().to(self.device)
            second_to_last_memory_index = len(memory.states) - 1
            for i in range(second_to_last_memory_index):
                state = states[i].detach()
                state_ = states[i + 1].detach()
                action = actions[i].detach()
                reward = rewards[i].detach()

                if i == second_to_last_memory_index - 1:
                    done = True
                else:
                    done = False

                qs, hidden_state_ = self.net(state, hidden_state)
                chosen_q = qs[action]

                if self.has_target_net:
                    qs_, hidden_state_3 = self.target_net(
                        state_, hidden_state_)
                    action_qs_, hidden_state_3 = self.net(
                        state_, hidden_state_)
                    action_ = torch.argmax(action_qs_)
                    chosen_q_ = qs_[action_]
                else:
                    action_qs_, hidden_state_3 = self.net(
                        state_, hidden_state_)
                    chosen_q_ = torch.max(action_qs_)
                if done:
                    chosen_q_ = torch.tensor(0.0, dtype=torch.float32).to(
                        self.device)

                q_target = reward + self.gamma * chosen_q_

                loss = (q_target - chosen_q)**2

                episode_losses.append(loss)

                hidden_state = hidden_state_

            episode_loss = sum(episode_losses) / len(episode_losses)
            batch_losses.append(episode_loss)

        batch_loss = sum(batch_losses) / len(batch_losses)
        self.optimizer.zero_grad()
        batch_loss.backward()
        self.optimizer.step()

        for i in range(self.batch_size):
            self.epsilon.step()
        if self.has_target_net:
            self.update_target_net_params()
Example #24
File: dqn.py Project: chris-lamb/deep-rl
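# initialize(): creates the Gym environment, a DQN with an RMSprop optimizer and replay memory, and either starts cold or restores pickled rewards and model state for a warm start.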
def initialize(game, model_name, warm_start):
    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    capacity = int(1e4)

    # Cold start
    if not warm_start:
        # Initialize model
        model = DQN(in_channels=num_frames, num_actions=num_actions)
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=1.0e-4,
                                  weight_decay=0.01)
        # Initialize replay memory
        memory_buffer = ReplayMemory(capacity)

        # Initialize statistics
        running_reward = None
        running_rewards = []

    # Warm start
    if warm_start:

        data_file = 'results/{}_{}.p'.format(game, model_name)

        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
                running_reward = running_rewards[-1]

            prior_eps = len(running_rewards)

            model_file = 'saved_models/{}_{}_ep_{}.p'.format(
                game, model_name, prior_eps)
            with open(model_file, 'rb') as f:
                saved_model = pickle.load(f)
                model, optimizer, memory_buffer = saved_model

        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = DQN(in_channels=num_frames, num_actions=num_actions)
            optimizer = optim.RMSprop(model.parameters(),
                                      lr=1.0e-4,
                                      weight_decay=0.01)
            # Initialize replay memory
            memory_buffer = ReplayMemory(capacity)

            running_reward = None
            running_rewards = []

    cuda = torch.cuda.is_available()

    if cuda:
        model = model.cuda()

    criterion = torch.nn.MSELoss()

    return env, model, optimizer, criterion, memory_buffer, cuda, running_reward, running_rewards
Example #25
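# Recurrent DQN agent trained on fixed-length slices of experience: samples slices from a replay buffer, unrolls the hidden state across each slice, clips gradients, and soft-updates a target network.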
class Agent():
    def __init__(self, learn_rate, 
            state_shape, num_actions, action_shape, 
            batch_size, slice_size):
        self.gamma = 0.999
        self.tau = 0.01
        self.clip_grad_norm = 0.1
        self.has_target_net = True

        self.state_shape = state_shape
        self.num_actions = num_actions      #   this is how many actions there are to choose from
        self.action_shape = action_shape    #   this is how many actions the env accepts at each step

        self.buffer_size = 1_000_000
        self.batch_size = batch_size    # *times slice_size, because recurrency/rollouts
        self.slice_size = slice_size

        self.slice_replay_buffer = MemorySliceReplayBuffer(
            size=self.buffer_size, slice_size=self.slice_size, 
            state_shape=self.state_shape, action_shape=self.action_shape)
        self.epsilon = LinearSchedule(start=1.0, end=0.01, num_steps=300)
        # self.epsilon = LinearSchedule(start=1.0, end=0.1, num_steps=30)


        # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")

        self.net = DQN(state_shape, num_actions).to(self.device)
        if self.has_target_net:
            self.target_net  = copy.deepcopy(self.net).to(self.device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learn_rate)

    def update_target_net_params(self):
        for param, target_param in zip(self.net.parameters(), self.target_net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def choose_action(self, observation, hidden_state):
            state = torch.tensor(observation).float().unsqueeze(0)
            state = state.detach().to(self.device)

            q_values, hidden_state_ = self.net(state, hidden_state)
            action = torch.argmax(q_values[0]).item()

            if random.random() <= self.epsilon.value():
                action = random.randint(0, self.num_actions - 1)

            return action, hidden_state_

    def learn(self, stats):
        if self.slice_replay_buffer.count < self.batch_size:
            return 

        self.net.train()

        states_slices, actions_slices, rewards_slices, next_states_slices, dones_slices = self.slice_replay_buffer.sample(self.batch_size, self.device)

        batch_losses = []
        hidden_states = self.net.get_batch_hidden_state(self.batch_size).to(self.device)

        for slice_index in range(self.slice_size):
            states = states_slices[:, slice_index]
            actions = actions_slices[:, slice_index]
            rewards = rewards_slices[:, slice_index]
            states_ = next_states_slices[:, slice_index]
            dones = dones_slices[:, slice_index]

            batch_indices = np.arange(self.batch_size, dtype=np.int64)
            qs, hidden_states_ = self.net(states, hidden_states)
            chosen_q = qs[batch_indices, actions.T[0]]

            if self.has_target_net:
                qs_, hidden_state_3 = self.target_net(states_, hidden_states_)
                action_qs_, hidden_state_3 = self.net(states_, hidden_states_)
                actions_ = torch.argmax(action_qs_, dim=1)
                chosen_q_ = qs_[batch_indices, actions_]
            else:
                action_qs_, hidden_state_3 = self.net(states_, hidden_states_)
                chosen_q_ = torch.max(action_qs_, dim=1)[0]

            rewards = rewards.T[0]
            q_target = rewards + self.gamma * chosen_q_

            loss = torch.mean((q_target - chosen_q) ** 2)
            batch_losses.append(loss)

            hidden_states = hidden_states_
            hidden_states[dones.T[0]] = 0.0 #   if an episode ends mid slice then zero the hidden_states
                                            #   this could be a problem if backprop stops here

        batch_losses = torch.stack(batch_losses)
        batch_loss = torch.mean(batch_losses)
        stats.last_loss = batch_loss.item()
        self.optimizer.zero_grad()
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.clip_grad_norm)
        self.optimizer.step()

        self.epsilon.step()
        if self.has_target_net:
            self.update_target_net_params()
Example #26
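# Script setup for a DQN over a discretized continuous action space: replay deque, target-network copy, Adam optimizer, and a greedy test() rollout helper.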
from collections import deque
import random
import torch
from torch import optim
from tqdm import tqdm
from env import Env
from hyperparams import ACTION_DISCRETISATION, OFF_POLICY_BATCH_SIZE as BATCH_SIZE, DISCOUNT, EPSILON, HIDDEN_SIZE, LEARNING_RATE, MAX_STEPS, REPLAY_SIZE, TARGET_UPDATE_INTERVAL, TEST_INTERVAL, UPDATE_INTERVAL, UPDATE_START
from models import DQN, create_target_network
from utils import plot

env = Env()
agent = DQN(HIDDEN_SIZE, ACTION_DISCRETISATION)
target_agent = create_target_network(agent)
optimiser = optim.Adam(agent.parameters(), lr=LEARNING_RATE)
D = deque(maxlen=REPLAY_SIZE)


def convert_discrete_to_continuous_action(action):
    return action.to(dtype=torch.float32) - ACTION_DISCRETISATION // 2


def test(agent):
    with torch.no_grad():
        env = Env()
        state, done, total_reward = env.reset(), False, 0
        while not done:
            action = agent(state).argmax(
                dim=1,
                keepdim=True)  # Use purely exploitative policy at test time
            state, reward, done = env.step(
                convert_discrete_to_continuous_action(action))