Example #1
File: rollers.py  Project: aghriss/DeepRL
    def __init__(self, env, agent, memory_max):
        
        self.env = env
        self.agent = agent

        self.memory_max = memory_max
        
        self.progbar = Progbar(self.memory_max)
        
        self.memory = dummy_obj.Memory(self.memory_max,["t","state","action","next_state","reward","terminated"])
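All of the examples on this page drive a Keras Progbar the same way: construct it with a target count, then feed it increments via add(n, values=[(name, value)]). A minimal standalone sketch of that pattern, assuming the import keras.utils.generic_utils.Progbar (the import is not shown in the snippets):

from keras.utils.generic_utils import Progbar  # assumed import; not shown in the snippets
import numpy as np

total_steps = 1000
progbar = Progbar(total_steps, stateful_metrics=["loss"])

for step in range(0, total_steps, 10):
    # pretend we processed 10 samples and measured a loss
    loss = float(np.exp(-step / total_steps))
    progbar.add(10, values=[("loss", loss)])

The stateful_metrics argument (used in Example #2) makes the bar display the most recent value of "loss" instead of a running average over the steps seen so far.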
Example #2
    def __init__(self, patch_size, dictionary_size, alpha, **kwargs):

        super(ReductionLayer, self).__init__(**kwargs)

        self.dict_size = dictionary_size

        self.patch_layer = patchlayer.PatchLayer(patch_size)

        self.alpha = alpha

        self.progbar = Progbar(100, stateful_metrics=["loss"])

        self.old_D = 0.0
Example #3
    def __init__(self, env, gamma, max_steps):
        self.agent_type = "TRPO"
        policy = self.deep(env)
        self.old_policy = self.deep(env)
        super(TRPO, self).__init__(policy)

        self.discount = gamma
        self.env = env
        self.max_steps = max_steps

        self.setup_agent()

        self.baseline = deepfunctions.BaselineValueFunction(env)
        self.episodes = []
        self.progbar = Progbar(100)
Example #4
    def __init__(self,
                 env,
                 deep_func,
                 gamma,
                 batch_size,
                 memory_min,
                 memory_max,
                 update_double=10000,
                 train_steps=1000000,
                 log_freq=1000,
                 eps_start=1,
                 eps_decay=-1,
                 eps_min=0.1):

        super(DDPG, self).__init__()

        self.env = env

        self.Q = self.model = deep_func(env)
        self.target_Q = deep_func(env)
        self.Q.summary()
        self.discount = gamma
        self.memory_min = memory_min
        self.memory_max = memory_max
        self.eps = eps_start
        self.train_steps = train_steps
        self.batch_size = batch_size
        self.done = 0
        self.log_freq = log_freq
        self.progbar = Progbar(self.memory_max)
        self.memory = ReplayMemory(
            self.memory_max,
            ["state", "action", "reward", "next_state", "terminated"])

        self.eps_decay = eps_decay
        if eps_decay == -1:
            self.eps_decay = 1 / train_steps
        self.eps_min = eps_min
        self.update_double = update_double
        self.actions = []
        self.path_generator = self.roller()
        self.past_rewards = collections.deque([], 50)
Example #5
File: ddpg.py  Project: aghriss/DeepRL
    def __init__(self, env, gamma, memory_max, batch_size, train_steps=1000000, log_freq = 1000, eps_start = 1, eps_decay = -1, eps_min = 0.1):
        
        model = self.deep(env)
        
        super(DDPG,self).__init__(model)
        self.discount = gamma
        self.env = env
        
        self.memory_max = memory_max
        self.eps = eps_start
        self.train_steps = train_steps
        self.batch_size = batch_size
        self.done = 0
        self.log_freq = log_freq
        self.progbar = Progbar(self.memory_max)
        self.memory = dummy_obj.Memory(self.memory_max,self.batch_size,["t","state","action","reward","next_state","terminated"])

        self.eps_decay = eps_decay        
        if eps_decay == -1:            
            self.eps_decay = 1/train_steps
        self.eps_min = eps_min
Example #6
    def __init__(self,
                 env,
                 gamma,
                 batch_size,
                 memory_max,
                 double_update=100000,
                 train_steps=1000000,
                 log_freq=1000,
                 eps_start=1,
                 eps_decay=-1,
                 eps_min=0.1):

        model = self.deep(env)
        self.agent_type = "DDQN"
        super(DDQN, self).__init__(model)
        self.target_model = self.deep(env)
        self.target_model.net.set_weights(self.model.net.get_weights())
        self.discount = gamma
        self.env = env

        self.memory_max = memory_max
        self.eps = eps_start
        self.train_steps = train_steps
        self.batch_size = batch_size
        self.done = 0
        self.log_freq = log_freq
        self.progbar = Progbar(self.memory_max)
        self.memory = dummy_obj.ReplayMemory(
            self.memory_max,
            ["t", "state", "action", "reward", "next_state", "terminated"])

        self.eps_decay = eps_decay
        if eps_decay == -1:
            self.eps_decay = 1 / train_steps
        self.eps_min = eps_min
        self.update_double = double_update
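Examples #4 through #7 (and #9 and #12) share the same linear exploration schedule: when eps_decay is left at -1 it defaults to 1/train_steps, and every environment step lowers eps by that amount until eps_min is reached. A minimal sketch of just that schedule with the class machinery stripped away (the helper names here are illustrative, not from the repository):

def make_eps_schedule(train_steps, eps_start=1.0, eps_decay=-1, eps_min=0.1):
    # default decay spreads the drop from eps_start toward 0 over train_steps steps
    if eps_decay == -1:
        eps_decay = 1.0 / train_steps

    eps = eps_start

    def step():
        nonlocal eps
        eps = max(eps - eps_decay, eps_min)  # same clamp as set_eps()
        return eps

    return step

next_eps = make_eps_schedule(train_steps=1000000)
eps_now = next_eps()  # one environment step's worth of decay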
Example #7
class DDPG(Agent):
    """
    Deep Deterministic Policy Gradient
    """
    name = "DDPG"

    def __init__(self,
                 env,
                 deep_func,
                 gamma,
                 batch_size,
                 memory_min,
                 memory_max,
                 update_double=10000,
                 train_steps=1000000,
                 log_freq=1000,
                 eps_start=1,
                 eps_decay=-1,
                 eps_min=0.1):

        super(DDPG, self).__init__()

        self.env = env

        self.Q = self.model = deep_func(env)
        self.target_Q = deep_func(env)
        self.Q.summary()
        self.discount = gamma
        self.memory_min = memory_min
        self.memory_max = memory_max
        self.eps = eps_start
        self.train_steps = train_steps
        self.batch_size = batch_size
        self.done = 0
        self.log_freq = log_freq
        self.progbar = Progbar(self.memory_max)
        self.memory = ReplayMemory(
            self.memory_max,
            ["state", "action", "reward", "next_state", "terminated"])

        self.eps_decay = eps_decay
        if eps_decay == -1:
            self.eps_decay = 1 / train_steps
        self.eps_min = eps_min
        self.update_double = update_double
        self.actions = []
        self.path_generator = self.roller()
        self.past_rewards = collections.deque([], 50)

    def act(self, state):

        if np.random.rand() < self.eps:
            return np.random.randint(self.env.action_space.n)
        return np.argmax(self.Q.predict(state))

    def train(self):

        self.progbar.__init__(self.memory_min)
        while (self.memory.size < self.memory_min):
            self.path_generator.__next__()

        while (self.done < self.train_steps):

            to_log = 0
            self.progbar.__init__(self.update_double)
            old_theta = self.Q.flattener.get()
            th0 = self.Q.net.dense[0].weight.detach().clone()
            self.target_Q.copy(self.Q)
            while to_log < self.update_double:

                self.path_generator.__next__()

                rollout = self.memory.sample(self.batch_size)
                state_batch = torch.tensor(rollout["state"],
                                           dtype=torch.float,
                                           device=device)
                action_batch = torch.tensor(rollout["action"],
                                            dtype=torch.long,
                                            device=device)
                reward_batch = torch.tensor(rollout["reward"],
                                            dtype=torch.float,
                                            device=device)

                non_final_batch = torch.tensor(1 - rollout["terminated"],
                                               dtype=torch.float,
                                               device=device)
                next_state_batch = torch.tensor(rollout["next_state"],
                                                dtype=torch.float,
                                                device=device)

                #current_q = self.Q(state_batch)

                current_q = self.Q(state_batch).gather(
                    1, action_batch.unsqueeze(1)).view(-1)
                _, a_prime = self.Q(next_state_batch).max(1)

                # Compute the target of the current Q values
                #target_q = self.target_Q(state_batch).gather(1, action_batch.unsqueeze(1)).view(-1)
                next_max_q = self.target_Q(next_state_batch).gather(
                    1, a_prime.unsqueeze(1)).view(-1)
                #target_q[torch.arange(self.batch_size).long(),action_batch.squeeze()] =  reward_batch + self.discount * non_final_batch * next_max_q.squeeze()
                target_q = (reward_batch + self.discount * non_final_batch *
                            next_max_q.squeeze())

                # Compute loss
                # loss = self.Q.total_loss(current_q, target_q)
                loss = self.Q.total_loss(current_q, target_q.detach())

                # Optimize the model
                self.Q.optimizer.zero_grad()
                # error = target_q - current_q, current_q.backward(-1.0 * error.clamp(-1, 1))
                loss.backward()
                self.Q.optimize()

                self.progbar.add(self.batch_size,
                                 values=[("Loss",
                                          float(loss.detach().cpu().numpy()))])

                to_log += self.batch_size

            self.target_Q.copy(self.Q)
            new_theta = self.Q.flattener.get()
            th1 = self.Q.net.dense[0].weight.detach()
            self.log(
                "Delta Theta L1",
                float((new_theta -
                       old_theta).mean().abs().detach().cpu().numpy()))
            self.log("Delta Dense Theta L1",
                     float((th0 - th1).mean().abs().detach().cpu().numpy()))
            self.log("Av 50ep  rew", np.mean(self.past_rewards))
            self.log("Max 50ep rew", np.max(self.past_rewards))
            self.log("Min 50ep rew", np.min(self.past_rewards))
            self.log("Epsilon", self.eps)
            self.log("Done", self.done)
            self.log("Total", self.train_steps)
            self.target_Q.copy(self.Q)
            self.print()
            #self.play()
            self.save(self.env.name)

    def set_eps(self, x):
        self.eps = max(x, self.eps_min)

    def roller(self):

        state = self.env.reset()
        ep_reward = 0
        while True:
            episode = self.memory.empty_episode()
            for i in range(self.batch_size):

                # save current state
                episode["state"].append(state)

                # act
                action = self.act(state)
                self.actions.append(action)
                state, rew, done, info = self.env.step(action)

                episode["next_state"].append(state)
                episode["action"].append(action)
                episode["reward"].append(rew)
                episode["terminated"].append(done)

                ep_reward += rew
                self.set_eps(self.eps - self.eps_decay)

                if done:
                    self.past_rewards.append(ep_reward)
                    state = self.env.reset()
                    ep_reward = 0
                self.done += 1
                if not (self.done) % self.update_double:
                    self.update = True

            # record the episodes
            self.memory.record(episode)
            if self.memory.size < self.memory_min:
                self.progbar.add(self.batch_size, values=[("Loss", 0.0)])
            yield True

    def play(self, name='play'):

        name = name + self.env.name + str(self.eps)

        eps = self.eps
        self.set_eps(0)
        state = self.env.reset(record=True)

        done = False
        while not done:

            action = self.act(state)
            state, _, done, info = self.env.step(action)

        self.env.save_episode(name)
        self.set_eps(eps)

    def load(self):
        super(DDPG, self).load(self.env.name)
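Despite the DDPG name, the target computed in Example #7's train() loop is the Double-DQN rule: the online network Q picks the greedy next action, and the target network evaluates it. A minimal NumPy restatement of that target, assuming q_next_online and q_next_target are the two networks' outputs for the sampled next states (these names are illustrative):

import numpy as np

def double_dqn_target(reward, terminated, q_next_online, q_next_target, gamma):
    # greedy next action under the online network
    a_prime = np.argmax(q_next_online, axis=1)
    # value of that action under the target network
    next_q = q_next_target[np.arange(len(a_prime)), a_prime]
    # no bootstrapping on terminal transitions
    return reward + gamma * (1.0 - terminated) * next_q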
Example #8
class ReductionLayer(Layer):
    def __init__(self, patch_size, dictionary_size, alpha, **kwargs):

        super(ReductionLayer, self).__init__(**kwargs)

        self.dict_size = dictionary_size

        self.patch_layer = patchlayer.PatchLayer(patch_size)

        self.alpha = alpha

        self.progbar = Progbar(100, stateful_metrics=["loss"])

        self.old_D = 0.0

    def build(self, input_shape):

        self.patch_layer.build(input_shape)

        self.input_shape_t = self.patch_layer.compute_output_shape(input_shape)

        self.dim = self.input_shape_t[-1]

        self.filters = self.dict_size

        self.strides = (1, self.dim)

        self.kernel_shape = (1, self.dim, self.dict_size)

        self.D0 = K.random_normal_variable((self.dim, self.dict_size),
                                           mean=0,
                                           scale=1)

        self.D = tf.matmul(tf.diag(1 / tf.norm(self.D0, axis=1)), self.D0)

        self.D_ols = tf.matmul(tf.linalg.inv(
            tf.matmul(self.D, self.D, transpose_a=True) +
            self.alpha * tf.eye(self.dict_size)),
                               self.D,
                               transpose_b=True)
        self.kernel = K.reshape(self.D_ols, self.kernel_shape)
        #self.add_weight(shape=self.kernel_shape,
        #                              initializer='glorot_uniform',
        #                              name='kernel')
        self.D_kernel = K.reshape(tf.matmul(self.D, self.D_ols),
                                  (1, self.dim, self.dim))

        self.trainable_weights = [self.D0]

    def call(self, inputs):

        beta = K.conv1d(self.patch_layer(inputs),
                        self.kernel,
                        strides=1,
                        padding='valid',
                        data_format='channels_last',
                        dilation_rate=1)

        return beta

    def fit(self, X, Y, batch_size=64):
        print("Fitting the reduction")

        n = len(X)
        self.progbar.__init__(n)
        for i in range(0, n, batch_size):
            weights = np.ones(min(n, i + batch_size) - i)
            inputs = X[i:min(i + batch_size, n)]
            targets = Y[i:min(n, i + batch_size)]
            self.fit_op([inputs, targets, weights])
            self.progbar.add(min(batch_size, n - i),
                             values=[('loss',
                                      self.loss([inputs, targets,
                                                 weights])[0])])

    def display_update(self):
        res = np.linalg.norm(K.eval(self.D) - self.old_D)
        self.old_D = K.eval(self.D)
        return res

    def set_D(self, D):
        K.set_value(self.D_ridge, D)

    def compile(self, model):

        self.optimizer = tf.train.RMSPropOptimizer(0.001)
        self.opt = self.optimizer.minimize(model.total_loss,
                                           var_list=[self.D0])
        self.fit_op = K.Function(
            [model.input, model.targets[0], model.sample_weights[0]],
            [self.opt])
        self.loss = K.Function(
            [model.input, model.targets[0], model.sample_weights[0]],
            [model.total_loss])
        print(
            "Reduction Layer Compiled, batch %d" % self.patch_layer.patch_size,
            "\n", "Output shape:",
            self.compute_output_shape(self.input_shape_t))

    def compute_output_shape(self, input_shape):
        return self.patch_layer.compute_output_shape(input_shape)[:2] + (
            self.dict_size, )

    def get_config(self):
        config = {
            'rank': 1,
            'filters': self.dict_size,
            'kernel_size': self.kernel_shape,
            'strides': self.strides,
            'padding': 'valid',
            'data_format': 'channels_last',
            'activation': 'linear',
            'kernel_initializer': 'personal'
        }
        base_config = super(ReductionLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
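The build() step of Example #8 precomputes a ridge-regression projection: the rows of D0 are normalized to give D, and D_ols = (DᵀD + αI)⁻¹Dᵀ maps a patch onto the dictionary. A small NumPy sketch of that algebra (illustrative only, not the layer's TensorFlow graph):

import numpy as np

def ridge_projection(D0, alpha):
    # normalize each row of the raw dictionary, as in build()
    D = D0 / np.linalg.norm(D0, axis=1, keepdims=True)
    dict_size = D.shape[1]
    # (D^T D + alpha I)^-1 D^T: ridge-regression coefficients for any patch
    D_ols = np.linalg.inv(D.T @ D + alpha * np.eye(dict_size)) @ D.T
    return D, D_ols

# beta = D_ols @ x gives the dictionary coefficients of a patch x of length dim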
Example #9
File: ddpg.py  Project: aghriss/DeepRL
class DDPG(Agent):
    
    deep = deepfunctions.DeepQ
    
    def __init__(self, env, gamma, memory_max, batch_size, train_steps=1000000, log_freq = 1000, eps_start = 1, eps_decay = -1, eps_min = 0.1):
        
        model = self.deep(env)
        
        super(DDPG,self).__init__(model)
        self.discount = gamma
        self.env = env
        
        self.memory_max = memory_max
        self.eps = eps_start
        self.train_steps = train_steps
        self.batch_size = batch_size
        self.done = 0
        self.log_freq = log_freq
        self.progbar = Progbar(self.memory_max)
        self.memory = dummy_obj.Memory(self.memory_max,self.batch_size,["t","state","action","reward","next_state","terminated"])

        self.eps_decay = eps_decay        
        if eps_decay == -1:            
            self.eps_decay = 1/train_steps
        self.eps_min = eps_min
        
    def act(self,state):
        
        if np.random.rand()<self.eps:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state))
    
    def setup_agent(self):

        # Incomplete in the source; a placeholder for the batch of current states,
        # following the pattern of setup_model() in the DQN example below.
        current_state = K.placeholder(shape=(None,) + self.env.observation_space.shape)
        
    def train(self):
        
        to_log = 0
        self.progbar.__init__(self.batch_size*self.log_freq)
        
        while(self.done<self.train_steps):
            _ = self.env.reset()
            old_theta = self.Flaten.get()
            avg_rew = 0
            max_rew = 0
            min_rew = 0
            while to_log <self.log_freq:

                self.get_episode()
                rollout = self.memory.sample()

                actions = rollout["action"]
                rewards = rollout["reward"]
                not_final = np.logical_not(rollout["terminated"])

                avg_rew += np.mean(rewards)

                max_rew,min_rew = max(np.max(rewards),max_rew),min(min_rew,np.min(rewards))

                target_q = self.model.predict(rollout["next_state"])
                max_Q_prim = np.max(target_q,axis=1)
        
                for i in range(len(actions)):
                    target_q[i,actions[i]] = rewards[i] + not_final[i]* self.discount*max_Q_prim[i]
                
                self.model.train_on_batch(rollout["state"],target_q)
                
                to_log+=1

            new_theta = self.Flaten.get()

            self.log("Theta MSE",np.linalg.norm(new_theta-old_theta))
            self.log("Average reward",np.mean(avg_rew/self.log_freq))
            self.log("Max reward",max_rew)
            self.log("Min reward",min_rew)
            self.log("Epsilon",self.eps)
            self.log("Done",self.done)
            self.log("Total",self.train_steps)
            self.print_log()
            self.play()
            self.save(self.env.name)
            self.progbar.__init__(self.batch_size*self.log_freq)
                
            to_log = 0 
            
    def set_eps(self,x):
        self.eps = max(x,self.eps_min)
        
    def get_episode(self):
        
        episode = self.memory.empty_episode()

        state = self.env.current_state()        
        
        for i in range(self.batch_size):
            
            self.progbar.add(1)
            self.done += 1

            # save current state
            
            episode["state"].append(state)
            
            # act
            action = self.act(state)   
            state, rew, done = self.env.step(action)

            episode["next_state"].append(state)            
            episode["t"].append(i)            
            episode["action"].append(action)
            episode["reward"].append(rew)        
            episode["terminated"].append(done)
            
            self.set_eps(self.eps-self.eps_decay)
            
            if done:
                state = self.env.reset()
                
        # record the episodes
        self.memory.record(episode)
        
        del(episode)

    def play(self,name='play'):
        
        name = name+self.env.name+str(self.eps)
        
        eps = self.eps
        
        self.set_eps(0)
        
        state = self.env.reset()
        #print(self.env.t,end=",")
        done = False
        
        while not done:
            
            action = self.act(state)
            
            state, _, done = self.env.step(action)
            #print(self.env.t,end=",")
        
        self.env.draw(name)
        self.set_eps(eps)
Example #10
class TRPO(Agent):

    options = {
        "cg_damping":
        (1e-1, "Add multiple of the identity to Fisher matrix during CG"),
        "max_kl":
        (1e-2,
         "KL divergence between old and new policy (averaged over state-space)"
         ),
        "linesearch_accept": (1e-1, "Lineseach accept ratio")
    }
    deep = deepfunctions.DeepPolicy

    def __init__(self, env, gamma, max_steps):
        self.agent_type = "TRPO"
        policy = self.deep(env)
        self.old_policy = self.deep(env)
        super(TRPO, self).__init__(policy)

        self.discount = gamma
        self.env = env
        self.max_steps = max_steps

        self.setup_agent()

        self.baseline = deepfunctions.BaselineValueFunction(env)
        self.episodes = []
        self.progbar = Progbar(100)

    def setup_agent(self):

        self.states = self.model.input
        self.actions = K.placeholder(ndim=1, dtype='int32')
        self.advantages = K.placeholder(ndim=1)
        current_pi = self.model.output

        old_pi = self.old_policy(self.states)

        log_likeli_pi = utils.loglikelihood(self.actions, current_pi)
        log_likeli_old_pi = utils.loglikelihood(self.actions, old_pi)

        N = K.cast(K.shape(self.states)[0], dtype='float32')

        # Policy gradient:

        surrogate_loss = (-1.0 / N) * K.sum(
            K.exp(log_likeli_pi - log_likeli_old_pi) * self.advantages)

        policy_gradient = self.model.flattener.flatgrad(self.model.output)

        kl_firstfixed = K.mean(utils.entropy(current_pi))

        grads = self.model.flattener.flatgrad(kl_firstfixed)

        flat_tangent = K.placeholder(ndim=1)

        grad_vector_product = K.sum(grads * flat_tangent)

        # Fisher-vector product

        fisher_vector_product = self.model.flattener.flatgrad(
            grad_vector_product)

        entropy = K.mean(utils.entropy(current_pi))

        losses = [surrogate_loss, kl_firstfixed, entropy]

        self.loss_names = ["Surrogate", "KL", "Entropy"]

        args = [self.states, self.actions, self.advantages]

        self.compute_policy_gradient = K.function(args, [policy_gradient])
        self.compute_losses = K.function(args, losses)
        self.compute_fisher_vector_product = K.function(
            [flat_tangent] + args, [fisher_vector_product])

    def train(self):

        self.rollout()

        states = np.concatenate(
            [episode["state"] for episode in self.episodes], axis=0)
        actions = np.concatenate(
            [episode["action"] for episode in self.episodes], axis=0)
        advantages = np.concatenate(
            [episode["advantage"] for episode in self.episodes], axis=0)

        args = (states, actions, advantages)

        thprev = self.model.flattener.get_value()
        self.old_policy.flattener.set_value(thprev)

        g = self.compute_policy_gradient([*args])[0]

        losses_before = self.compute_losses([*args])

        if np.allclose(g, 0):
            print("got zero gradient. not updating")
        else:
            print("Using Conjugate gradient")
            stepdir = m_utils.conjugate_gradient(
                lambda x: self.fisher_vector_product(x, args), -g)
            shs = .5 * stepdir.dot(self.fisher_vector_product(stepdir, args))
            lm = np.sqrt(shs / self.options["max_kl"][0])

            print("Lagrange multiplier:", lm, "norm(g):", np.linalg.norm(g))

            fullstep = stepdir / lm

            def loss(th):
                self.model.flattener.set_value(th)
                return self.compute_losses([*args])[0]

            success, theta = m_utils.linesearch(loss, thprev, fullstep)
            print("Line-Search Success", success)
            self.model.flattener.set_value(theta)
        losses_after = self.compute_losses([*args])

        for (lname, lbefore, lafter) in zip(self.loss_names, losses_before,
                                            losses_after):
            self.log(lname + "_before", lbefore)
            self.log(lname + "_after", lafter)
        self.print_log()
        self.model.save(self.env.name)

    def act(self, state, train=False):

        proba = self.model.predict(state)
        if train:
            action = utils.choice_weighted(proba)
        else:
            action = np.argmax(proba)
        return action

    def fisher_vector_product(self, p, args):
        return self.compute_fisher_vector_product(
            [p] + [*args])[0] + self.options["cg_damping"][0] * p

    def rollout(self):

        self.episodes = []
        self.collected = 0
        self.progbar.__init__(self.max_steps)

        while self.collected < self.max_steps:
            self.get_episode()

        self.compute_advantage()
        self.baseline.fit(self.episodes)

    def get_episode(self):

        state = self.env.reset()

        episode = {
            s: []
            for s in ["t", "state", "action", "reward", "terminated"]
        }

        i = 0

        while self.collected < self.max_steps:

            episode["t"].append(i)
            episode["state"].append(state)
            # act
            action = self.act(state, train=True)

            state, rew, done, info = self.env.step(action)

            episode["action"].append(action)
            episode["reward"].append(rew)
            episode["terminated"].append(done)

            i += 1
            self.collected += 1
            self.progbar.add(1, values=[('Info', info)])
            if done:
                break
        for k, v in episode.items():
            episode[k] = np.array(v)
        episode["return"] = discount(np.array(episode["reward"]),
                                     self.discount)

        self.episodes.append(episode)

    def compute_advantage(self):

        # Compute baseline, advantage
        for episode in self.episodes:
            b = episode["baseline"] = self.baseline.predict(episode)
            b1 = np.append(b, 0 if episode["terminated"][-1] else b[-1])
            deltas = episode["reward"] + self.discount * b1[1:] - b1[:-1]
            episode["advantage"] = discount(deltas, self.discount)

        alladv = np.concatenate(
            [episode["advantage"] for episode in self.episodes])
        # Standardize advantage
        std = alladv.std()
        mean = alladv.mean()
        for episode in self.episodes:
            episode["advantage"] = (episode["advantage"] - mean) / std

    def play(self, name='play'):

        state = self.env.reset()
        done = False

        while not done:
            action = self.act(state)
            state, _, done, _ = self.env.step(action)

        self.env.draw(name)
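Examples #10 and #11 call a discount() helper on reward and delta sequences that is not shown on this page; the conventional implementation is a reverse discounted cumulative sum. A plausible sketch, under that assumption:

import numpy as np

def discount(x, gamma):
    # out[t] = x[t] + gamma * x[t+1] + gamma^2 * x[t+2] + ...
    out = np.zeros_like(x, dtype=float)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out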
Example #11
File: rollers.py  Project: aghriss/DeepRL
class Roller(object):
    
    def __init__(self, env, agent, memory_max):
        
        self.env = env
        self.agent = agent

        self.memory_max = memory_max
        
        self.progbar = Progbar(self.memory_max)
        
        self.memory = dummy_obj.Memory(self.memory_max,["t","state","action","next_state","reward","terminated"])

    def rollout(self,num_steps):

        collected = 0
        self.progbar.__init__(num_steps)
        self.agent.set_epsilon(1)
        self.agent.theta = 0        
        while collected < num_steps:
            collected += self.get_episode(num_steps-collected+1,1/num_steps)                
        
        roll = self.memory.random_sample(num_steps)        
        return roll
        
    def get_episode(self, length, eps):
        
        state = self.env.reset()        
        
        episode = self.memory.empty_episode()
        
        i = 0
        
        while i < length:
            
            self.progbar.add(1)

            # save current state

            episode["state"].append(state)
            
            # act
            action = self.agent.act(state)   
            state, rew, done = self.env.step(action)

            episode["next_state"].append(state)            
            episode["t"].append(i)            
            episode["action"].append(action)
            episode["reward"].append(rew)        
            episode["terminated"].append(done)
            
            
            self.agent.decrement_eps(eps)
            i += 1
            
            if done:
                state = self.env.reset()
                break

        # record the episodes
        self.memory.record(episode)
        
        del(episode)
        
        return i

    def compute_advantage(self):
        # Compute baseline, advantage
        for episode in self.episodes:
            b = episode["baseline"] = self.baseline.predict(episode)
            b1 = np.append(b, 0 if episode["terminated"][-1] else b[-1])
            deltas = episode["reward"] + self.discount*b1[1:] - b1[:-1] 
            episode["advantage"] = discount(deltas, self.discount)
        alladv = np.concatenate([episode["advantage"] for episode in self.episodes])    
        # Standardize advantage
        std = alladv.std()
        mean = alladv.mean()
        for episode in self.episodes:
            episode["advantage"] = (episode["advantage"] - mean) / std

    def play(self,name='play'):
        eps = self.agent.eps
        self.agent.set_epsilon(0)
        state = self.env.reset()
        done = False
        
        while not done:
            
            action = self.agent.act(state)
            
            state, _, done = self.env.step(action)
        
        self.env.draw(name)
        self.agent.set_epsilon(eps)
Example #12
File: dqn.py  Project: aghriss/DeepRL
class DQN(Agent):
    
    deep = deepfunctions.DeepQ

    def __init__(self, env, gamma, batch_size, memory_max, train_steps=1000000, log_freq = 1000, eps_start = 1, eps_decay = -1, eps_min = 0.1):
        
        model = self.deep(env)
        self.agent_type = "DQN"  
        super(DQN,self).__init__(model)
        self.discount = gamma
        self.env = env
        
        self.memory_max = memory_max
        self.eps = eps_start
        self.train_steps = train_steps
        self.batch_size = batch_size
        self.done = 0
        self.log_freq = log_freq
        self.progbar = Progbar(self.memory_max)
        self.memory = dummy_obj.ReplayMemory(self.memory_max,["t","state","action","reward","next_state","terminated"])

        self.eps_decay = eps_decay        
        if eps_decay == -1:            
            self.eps_decay = 1/train_steps
        self.eps_min = eps_min
        
    def act(self,state):
        
        if np.random.rand()<self.eps:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state))
    
    
    def setup_model(self):

        current_state = K.placeholder(shape=(None,) + self.env.observation_space.shape)
        next_state = K.placeholder(shape=(None,) + self.env.observation_space.shape)
        action = K.placeholder(ndim=1, dtype='int32')
        terminated = K.placeholder(ndim=1)
        reward = K.placeholder(ndim=1)
        current_Q = self.model.net(current_state)
        next_Q = self.model.net(next_state)

        # Q-learning target: r + gamma * max_a' Q(s', a'), with bootstrapping
        # disabled on terminal transitions (left incomplete in the source)
        target_Q = reward + (1.0 - terminated) * self.discount * K.max(next_Q, axis=1)

        # Q-value of the action actually taken
        taken_Q = K.sum(current_Q * K.one_hot(action, self.env.action_space.n), axis=1)

        optimizer = tf.train.RMSPropOptimizer(0.001)

        loss = K.mean(K.square(K.stop_gradient(target_Q) - taken_Q))
        self.train_op = K.Function(
            [current_state, next_state, action, reward, terminated],
            [optimizer.minimize(loss)])
            
    def train(self):
        
        to_log = 0
        self.progbar.__init__(self.batch_size*self.log_freq)
        
        while(self.done<self.train_steps):
            _ = self.env.reset()
            old_theta = self.Flaten.get()
            avg_rew = 0
            max_rew = 0
            min_rew = 0
            while to_log <self.log_freq:

                self.get_episode()
                rollout = self.memory.sample(self.batch_size)

                actions = rollout["action"]
                rewards = rollout["reward"]
                not_final = np.logical_not(rollout["terminated"])

                avg_rew += np.mean(rewards)

                max_rew,min_rew = max(np.max(rewards),max_rew),min(min_rew,np.min(rewards))

                target_q = self.model.predict(rollout["next_state"])
                max_Q_prim = np.max(target_q,axis=1)
        
                for i in range(len(actions)):
                    target_q[i,actions[i]] = rewards[i] + not_final[i]* self.discount*max_Q_prim[i]
                
                self.model.train_on_batch(rollout["state"],target_q)
                
                to_log+=1

            new_theta = self.Flaten.get()

            self.log("Theta MSE",np.linalg.norm(new_theta-old_theta))
            self.log("Average reward",np.mean(avg_rew/self.log_freq))
            self.log("Max reward",max_rew)
            self.log("Min reward",min_rew)
            self.log("Epsilon",self.eps)
            self.log("Done",self.done)
            self.log("Total",self.train_steps)
            self.print_log()
            self.play()
            self.save(self.env.name)
            self.progbar.__init__(self.batch_size*self.log_freq)
                
            to_log = 0 
            
    def set_eps(self,x):
        self.eps = max(x,self.eps_min)
        
    def get_episode(self):
        
        episode = self.memory.empty_episode()

        state = self.env.current_state()        
        
        for i in range(self.batch_size):

            # save current state
            
            episode["state"].append(state)
            
            # act
            action = self.act(state)   
            state, rew, done,info = self.env.step(action)

            episode["next_state"].append(state)            
            episode["t"].append(i)            
            episode["action"].append(action)
            episode["reward"].append(rew)        
            episode["terminated"].append(done)
            
            self.set_eps(self.eps-self.eps_decay)
            
            if done:
                state= self.env.reset()
            
            self.progbar.add(1,values=("Info",info))
            self.done += 1
            if not(self.done)%self.update_double:
                self.update=True
                
        # record the episodes
        self.memory.record(episode)
        
        del(episode)

    def play(self,name='play'):
        
        name = name+self.env.name+str(self.eps)
        
        eps = self.eps
        
        self.set_eps(0)
        
        state = self.env.reset()
        #print(self.env.t,end=",")
        done = False
        
        while not done:
            
            action = self.act(state)
            
            state, _, done,_ = self.env.step(action)
            #print(self.env.t,end=",")
        
        self.env.draw(name)
        self.set_eps(eps)