Python ReplayBuffer.add 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: utils.ReplayBuffer

클래스/타입: ReplayBuffer

메소드/함수: add

hotexamples.com에서의 예제들: 8

Python ReplayBuffer.add - 8개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 utils.ReplayBuffer.ReplayBuffer.add에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

ReplayBuffer(19)

sample(13)

add(8)

push(4)

GetOccupency(2)

SampleMiniBatch(2)

StoreTransition(2)

update(2)

LoadBuffer(1)

SaveBuffer(1)

record(1)

update_priorities(1)

예제 #1

파일 보기

class BaseAgent:
    def __init__(self, features, actions, params):
        """Build the networks, optimizer and replay buffer used by the agent.

        Args:
            features: Forwarded to ``Network`` as its first argument
                (presumably the input feature dimension -- confirm against
                ``Network``).
            actions: Forwarded to ``Network`` as its last argument
                (presumably the number of actions -- confirm).
            params: Mapping that must provide ``alpha``, ``epsilon``,
                ``target_refresh``, ``buffer_size``, ``h1`` and ``h2``.
                The optional key ``bpolicy_path`` overrides where the
                behaviour-policy checkpoint is loaded from.
        """
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']

        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.det_net = Network(features, self.h1, self.h2, actions).to(device)
        self.bpolicy_net = Network(features, self.h1, self.h2,
                                   actions).to(device)

        # The behaviour policy is restored from a saved state dict.  The
        # historical absolute path stays as the default so existing callers
        # keep working, but it can now be overridden through ``params``.
        try:
            bpolicy_path = params['bpolicy_path']
        except KeyError:
            bpolicy_path = "/home/soumyadeep/Action_Imbalance/RLGTD/experiments/prediction_SARSA/agents/net_params.pt"
        # map_location keeps the load working when the checkpoint was written
        # on a different device than the one in use now.
        self.bpolicy_net.load_state_dict(
            torch.load(bpolicy_path, map_location=device))

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0

        # initialize the weights of the target network to match the weights of policy network
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        """Choose actions with the behaviour network, randomising action 1.

        The greedy action of ``self.bpolicy_net`` is taken for every state in
        ``x``.  Whenever that greedy action is ``1`` it is replaced, with
        probability ``self.epsilon``, by a uniform draw from ``{0, 2}``.

        Args:
            x: Input forwarded unchanged to ``self.bpolicy_net``; either a
                single state or a batch of states.

        Returns:
            A tensor on ``device`` holding one action index per state.
        """
        q_s, _ = self.bpolicy_net(x)

        # A single un-batched state yields a 1-D vector of action values; add
        # the batch axis so the reduction below always runs over dim 1.  The
        # rank is checked (not "first dimension == 3") so that a genuine
        # batch of three states is not mistaken for a single state.
        if q_s.dim() == 1:
            q_s = q_s.unsqueeze(0)

        # .cpu() is required before .numpy() when the network lives on a GPU
        act = torch.max(q_s, 1).indices.detach().cpu().numpy()

        # Replacement values are 0 or 2, never 1, so computing the candidate
        # indices once up front visits exactly the entries a scan would.
        for i in np.flatnonzero(act == 1):
            if np.random.rand() < self.epsilon:
                act[i] = np.random.choice([0, 2])

        return torch.from_numpy(act).to(device)

    def updateNetwork(self, samples):
        """Learning-step hook; intentionally a no-op in this class.

        ``update`` calls this with the mini-batch it sampled from the replay
        buffer.
        """
        pass

    def update(self, s, a, sp, r, gamma):
        """Record one transition and, once enough are stored, learn from a batch.

        The transition ``(s, a, sp, r, gamma)`` is appended to the replay
        buffer and the step counter is advanced.  Every ``target_refresh``
        steps the policy weights are cloned into the target network.  When
        the buffer holds more than 200 transitions, 200 of them are sampled
        and passed to ``updateNetwork``.
        """
        transition = (s, a, sp, r, gamma)
        self.buffer.add(transition)
        self.steps += 1

        # sync the target network before the learning step, when it is due
        refresh_due = self.steps % self.target_refresh == 0
        if refresh_due:
            self.policy_net.cloneWeightsTo(self.target_net)

        # not enough experience yet for a mini-batch
        if len(self.buffer) <= 200:
            return

        minibatch, _ = self.buffer.sample(200)
        self.updateNetwork(minibatch)

예제 #2

파일 보기

class NAF:

    MODEL_NAME = "NAF"
    TARGET_MODEL_NAME = "target-NAF"

    class Build(Enum):
        # Network layouts understood by build_network().
        # SINGLE: one shared state trunk, single-layer mu / L / value heads.
        SINGLE = 1
        # MULTIPLE: a separate state trunk per head, single-layer heads.
        MULTIPLE = 2
        # HYDRA: one shared state trunk, full multi-layer heads.
        HYDRA = 3

    def __init__(self,
                 prep,
                 build,
                 policy,
                 state_dim,
                 action_dim,
                 monitor_directory,
                 buffer_size=10000,
                 batch_size=32,
                 steps_before_train=100,
                 train_freq=1,
                 num_steps=1000000,
                 learning_rate=1e-3,
                 update_rate=1e-3,
                 max_reward=None,
                 detailed_summary=False):
        """Store hyper-parameters, build the TF graph and start a session.

        Args:
            prep: Object whose ``process(state)`` returns ``(state, skip)``;
                used by ``run_episode``.
            build: ``NAF.Build`` member selecting the network layout.
            policy: Exploration object providing ``reset()`` and
                ``add_noise(action)``.
            state_dim: Width of the state placeholder.
            action_dim: Width of the action placeholder.
            monitor_directory: Parent directory of the ``summary`` folder.
            buffer_size: Capacity passed to ``ReplayBuffer``.
            batch_size: Number of transitions sampled per learning step.
            steps_before_train: Global step from which learning starts.
            train_freq: Learning steps performed per environment step.
            num_steps: Global step at which a checkpoint is saved.
            learning_rate: Adam learning rate.
            update_rate: Soft target-update rate.
            max_reward: Episode reward that marks the task as solved, or
                ``None`` to disable that check.
            detailed_summary: Forwarded to ``architect.dense_block`` for the
                online network.
        """

        self.prep = prep
        self.build_mode = build
        self.policy = policy
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detailed_summary = detailed_summary

        self.discount = 0.99
        self.learning_rate = learning_rate
        self.target_update_rate = update_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.steps_before_train = steps_before_train
        self.train_freq = train_freq
        self.max_reward = max_reward
        self.max_iters = num_steps

        self.step = 0
        self.solved = False

        self.state_layers = [64, 32]

        self.mu_layers = [16, 8, self.action_dim]

        # Number of entries in a triangular action_dim x action_dim matrix.
        # Floor division keeps the layer width an int; true division would
        # produce a float under Python 3.
        self.l_layers = [16, 8, (self.action_dim * (self.action_dim + 1)) // 2]

        self.v_layers = [16, 8, 1]

        # graph handles, filled in by build()
        self.action_inputs = None
        self.reward_inputs = None
        self.done = None
        self.state_inputs = None
        self.state_outputs = None
        self.mu_outputs = None
        self.l_outputs = None
        self.value_outputs = None
        self.next_state_inputs = None
        self.next_state_outputs = None
        self.target_value_outputs = None
        self.target = None
        self.advantages = None
        self.q_values = None
        self.loss = None
        self.global_step = None
        self.inc_global_step = None
        self.train_op = None
        self.target_update = None

        self.buffer = ReplayBuffer(buffer_size, self.state_dim,
                                   self.action_dim)

        self.build()

        self.merged = tf.summary.merge_all()

        self.session = tf.Session()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        utils.log_params(
            self.summary_dir, {
                "learning rate": self.learning_rate,
                "batch size": self.batch_size,
                "update rate": self.target_update_rate,
                "buffer size": self.buffer_size,
                "build": self.build_mode.name,
                "train frequency": self.train_freq
            })
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.session.graph)

        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session.run(init_op)

    def build(self):
        """Assemble the full training graph.

        Creates the action / reward / done placeholders, the online and
        target networks, the bootstrapped target, the quadratic advantage
        term, the Huber training loss, the global-step counter, the Adam
        train op and finally the soft target-update op.
        """
        self.action_inputs = tf.placeholder(tf.float32,
                                            (None, self.action_dim))
        self.reward_inputs = tf.placeholder(tf.float32, (None, ))
        self.done = tf.placeholder(tf.float32, (None, ))

        # online network
        self.state_inputs, self.state_outputs, self.mu_outputs, self.l_outputs, self.value_outputs = \
          self.build_network(self.MODEL_NAME)

        # target network; only its inputs and value head are kept
        self.next_state_inputs, self.next_state_outputs, _, _, self.target_value_outputs = \
          self.build_network(self.TARGET_MODEL_NAME)

        # target = r + discount * (1 - done) * V_target(s')
        self.target = tf.expand_dims(self.reward_inputs, 1) + self.discount * (
            1 - tf.expand_dims(self.done, 1)) * self.target_value_outputs

        # taken from https://github.com/carpedm20/NAF-tensorflow/blob/master/src/network.py
        # Unpack the flat l_outputs vector into matrix rows: row `idx`
        # consumes `count` entries starting at `pivot`, exponentiates the
        # first one (the diagonal element) and is left-padded with `idx`
        # zeros.
        pivot = 0
        rows = []
        for idx in range(self.action_dim):
            count = self.action_dim - idx

            diag_elem = tf.exp(tf.slice(self.l_outputs, (0, pivot), (-1, 1)))
            non_diag_elems = tf.slice(self.l_outputs, (0, pivot + 1),
                                      (-1, count - 1))
            row = tf.pad(tf.concat((diag_elem, non_diag_elems), 1),
                         ((0, 0), (idx, 0)))
            rows.append(row)

            pivot += count

        # The stacked rows form an upper-triangular matrix per sample, so its
        # transpose L is lower-triangular; P = L * L^T.
        L = tf.transpose(tf.stack(rows, axis=1), (0, 2, 1))
        P = tf.matmul(L, tf.transpose(L, (0, 2, 1)))

        # advantage = -1/2 * (a - mu)^T P (a - mu), reshaped to one column
        adv_term = tf.expand_dims(self.action_inputs - self.mu_outputs, -1)
        self.advantages = -tf.matmul(tf.transpose(adv_term, [0, 2, 1]),
                                     tf.matmul(P, adv_term)) / 2
        self.advantages = tf.reshape(self.advantages, [-1, 1])

        # Q(s, a) = A(s, a) + V(s)
        self.q_values = self.advantages + self.value_outputs

        # no gradient flows into the target through stop_gradient
        self.loss = tf.reduce_mean(
            architect.huber_loss(self.q_values -
                                 tf.stop_gradient(self.target)))

        tf.summary.scalar("training_loss", self.loss)

        # The step counter is advanced explicitly through inc_global_step;
        # minimize() below is not given the global step.
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step,
                                         tf.add(self.global_step, 1))

        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

        self.create_target_update_op()

    def build_network(self, name):
        """Create one network (online or target) inside variable scope ``name``.

        The layout is chosen by ``self.build_mode``:

        * ``SINGLE``   - shared state trunk, single-layer mu / L / value heads
        * ``MULTIPLE`` - one state trunk per head, single-layer heads
        * ``HYDRA``    - shared state trunk, full multi-layer heads

        Detailed summaries are always disabled for the target network.

        Returns:
            ``(state_inputs, state_outputs, mu_outputs, l_outputs,
            value_outputs)``; ``state_outputs`` is ``None`` in ``MULTIPLE``
            mode.

        Raises:
            ValueError: if ``self.build_mode`` is not a known build type.
        """
        summarize = (False if name == self.TARGET_MODEL_NAME
                     else self.detailed_summary)

        def dense(*args, **kwargs):
            # every block in this network shares the same summary setting
            return architect.dense_block(*args,
                                         detailed_summary=summarize,
                                         **kwargs)

        with tf.variable_scope(name):

            state_inputs = tf.placeholder(tf.float32,
                                          shape=(None, self.state_dim))
            mode = self.build_mode

            if mode == self.Build.MULTIPLE:
                # every head gets its own copy of the state trunk
                state_outputs = None
                mu_state = dense(state_inputs, self.state_layers,
                                 name="mu_state")
                l_state = dense(state_inputs, self.state_layers,
                                name="l_state")
                value_state = dense(state_inputs, self.state_layers,
                                    name="value_state")

                mu_outputs = dense(mu_state, [self.mu_layers[-1]],
                                   "mu_branch")
                l_outputs = dense(l_state, [self.l_layers[-1]], "l_branch")
                value_outputs = dense(value_state, [self.v_layers[-1]],
                                      "value_branch")
            elif mode == self.Build.SINGLE or mode == self.Build.HYDRA:
                # shared trunk; the two modes differ only in head depth
                if mode == self.Build.SINGLE:
                    mu_head = [self.mu_layers[-1]]
                    l_head = [self.l_layers[-1]]
                    value_head = [self.v_layers[-1]]
                else:
                    mu_head = self.mu_layers
                    l_head = self.l_layers
                    value_head = self.v_layers

                state_outputs = dense(state_inputs, self.state_layers,
                                      name="state_branch")
                mu_outputs = dense(state_outputs, mu_head, "mu_branch")
                l_outputs = dense(state_outputs, l_head, "l_branch")
                value_outputs = dense(state_outputs, value_head,
                                      "value_branch")
            else:
                raise ValueError("Wrong build type.")

            return state_inputs, state_outputs, mu_outputs, l_outputs, value_outputs

    def create_target_update_op(self):
        """Create the grouped op that moves the target net toward the online net.

        For each (online, target) pair of trainable variables the op performs
        ``target -= rate * (target - online)``, which is the same as
        ``target = (1 - rate) * target + rate * online``.
        """
        # inspired by: https://github.com/yukezhu/tensorflow-reinforce/blob/master/rl/neural_q_learner.py
        online_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope=self.MODEL_NAME)
        frozen_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope=self.TARGET_MODEL_NAME)

        rate = self.target_update_rate
        soft_updates = [
            frozen.assign_sub(rate * (frozen - online))
            for online, frozen in zip(online_vars, frozen_vars)
        ]

        self.target_update = tf.group(*soft_updates)

    def learn(self):
        """Run one training step on a sampled mini-batch.

        Samples ``self.batch_size`` transitions, evaluates the merged
        summaries, the target and the train op in a single session call,
        logs the summaries at the current step and then applies the soft
        target-network update.
        """
        batch = self.buffer.sample(self.batch_size)

        feed = {
            self.state_inputs: batch["states"],
            self.action_inputs: batch["actions"],
            self.reward_inputs: batch["rewards"],
            self.next_state_inputs: batch["next_states"],
            self.done: batch["done"],
        }
        fetches = [self.merged, self.target, self.train_op]

        # only the summaries are used; the other two fetches are run for
        # their side effects / kept for parity with the fetch list
        summaries, _, _ = self.session.run(fetches, feed_dict=feed)
        self.summary_writer.add_summary(summaries, global_step=self.step)

        # target update
        self.session.run(self.target_update)

    def run_episode(self, env):
        """Play one episode, storing transitions and training along the way.

        While the preprocessor reports ``skip`` the action is sampled from
        the environment's action space; otherwise the network's mu output
        plus exploration noise is used.  A transition is stored only when
        both its state and its next state were produced without ``skip``.

        Args:
            env: Environment exposing ``reset()``, ``step(action)`` and
                ``action_space.sample()``.

        Returns:
            ``(total_reward, step)``: the summed episode reward and the
            global step after the episode.
        """

        self.policy.reset()

        state = env.reset()
        state, skip = self.prep.process(state)

        total_reward = 0

        while True:
            # play
            if skip:
                action = env.action_space.sample()
            else:
                action = self.session.run(self.mu_outputs,
                                          feed_dict={self.state_inputs:
                                                     state})[0]
                action = self.policy.add_noise(action)

            prev_state = state
            prev_skip = skip

            state, reward, done, _ = env.step(action)
            state, skip = self.prep.process(state)

            total_reward += reward

            # Both ends of the transition must be usable: the previous state
            # and the freshly processed next state.  (The original tested
            # the previous flag twice and never looked at the new one.)
            if not prev_skip and not skip:
                self.buffer.add({
                    "state": prev_state[0],
                    "action": action,
                    "reward": reward,
                    "next_state": state[0],
                    "done": int(done)
                })

            if self.step >= self.steps_before_train and not self.solved:
                # learn; the global step advances once per learning step
                for _ in range(self.train_freq):
                    self.learn()
                    _, self.step = self.session.run(
                        [self.inc_global_step, self.global_step])
            else:
                _, self.step = self.session.run(
                    [self.inc_global_step, self.global_step])

            if done:
                break

        summary_value = summary_pb2.Summary.Value(tag="episode_reward",
                                                  simple_value=total_reward)
        summary_2 = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(summary_2, global_step=self.step)

        # learning is paused while the latest episode reached max_reward
        if self.max_reward is not None:
            self.solved = bool(total_reward >= self.max_reward)

        # NOTE(review): the checkpoint is written only if the step counter is
        # exactly max_iters at the end of an episode -- confirm this is the
        # intended trigger.
        if self.step == self.max_iters:
            self.saver.save(self.session,
                            self.summary_dir,
                            global_step=self.step)

        return total_reward, self.step

    def close(self):
        """Close the TensorFlow session created in ``__init__``."""
        self.session.close()

예제 #3

파일 보기

파일: DQNAgent.py 프로젝트: yyhnmn/Regularized-GradientTD

class DQN(BaseAgent):
    def __init__(self, features, actions, state_array, params):
        """Set up per-action buffers, networks, optimizers and logging lists.

        On top of what ``BaseAgent.__init__`` creates, every action (back,
        stay, forward) gets its own replay buffer of size 1000, an online /
        target pair of single-output networks and an Adam optimizer.

        Args:
            features: Forwarded to ``BaseAgent`` and to ``Network``.
            actions: Forwarded to ``BaseAgent``.
            state_array: States on which the networks are evaluated after
                every update to log mean action values.
            params: Parameter mapping; in addition to the base-class keys it
                must provide ``ratioMap`` and ``sampleSize``.
        """
        super(DQN, self).__init__(features, actions, params)

        def make_q_pair():
            # online network first, then its target, then sync the target
            online = Network(features, self.h1, self.h2, 1).to(device)
            frozen = Network(features, self.h1, self.h2, 1).to(device)
            online.cloneWeightsTo(frozen)
            return online, frozen

        def make_adam(net):
            return torch.optim.Adam(net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))

        # one replay buffer per action
        self.buffer_BACK = ReplayBuffer(1000)
        self.buffer_STAY = ReplayBuffer(1000)
        self.buffer_FORWARD = ReplayBuffer(1000)

        # one online/target network pair per action
        self.back_q_net, self.back_target_q_net = make_q_pair()
        self.stay_q_net, self.stay_target_q_net = make_q_pair()
        self.forward_q_net, self.forward_target_q_net = make_q_pair()

        # one optimizer per online per-action network
        self.optimizerBack = make_adam(self.back_q_net)
        self.optimizerStay = make_adam(self.stay_q_net)
        self.optimizerForward = make_adam(self.forward_q_net)

        # mean action values of the joint policy network over state_array
        self.back_values = []
        self.stay_values = []
        self.forward_values = []

        # mean values of the per-action ("baseline") networks
        self.back_values_baseline = []
        self.stay_values_baseline = []
        self.forward_values_baseline = []

        self.td_loss = []
        self.state_array = state_array
        self.penultimate_features = []

        self.ratioMap = params['ratioMap']
        self.sampleSize = params['sampleSize']

    def updateNetwork(self, samples):
        """Run one semi-gradient Q-learning step on the joint policy network.

        Besides the optimizer step this records the second output of the
        policy network, the TD loss, and the mean action values of the policy
        network over ``self.state_array``.

        Args:
            samples: Mini-batch of transitions accepted by
                ``getBatchColumns``.
        """
        # organize the mini-batch so that we can request "columns" from the data
        # e.g. we can get all of the actions, or all of the states with a single call
        batch = getBatchColumns(samples)

        # compute Q(s, a) for each sample in mini-batch
        Qs, x = self.policy_net(batch.states)
        Qsa = Qs.gather(1, batch.actions).squeeze()

        # NOTE(review): x is stored without detach(), so every entry keeps a
        # reference to its autograd graph -- confirm that is intended.
        self.penultimate_features.append(x)

        # by default Q(s', a') = 0 unless the next states are non-terminal
        Qspap = torch.zeros(batch.size, device=device)

        # if we don't have any non-terminal next states, then no need to bootstrap
        if batch.nterm_sp.shape[0] > 0:
            Qsp, _ = self.target_net(batch.nterm_sp)

            # bootstrapping term is the max Q value for the next-state
            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        # compute the empirical MSBE for this mini-batch and let torch auto-diff to optimize
        # the bootstrapping term is detached, giving the semi-gradient update
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure we have no gradients left over from previous update
        self.optimizer.zero_grad()
        self.target_net.zero_grad()

        # compute the entire gradient of the network using only the td error
        td_loss.backward()

        # .cpu() before .numpy(): tensors on a GPU cannot be converted directly
        self.td_loss.append(td_loss.detach().cpu().numpy())

        # log mean action values with the weights as they are *before* the
        # optimizer step below
        Qs_state_array, _ = self.policy_net(self.state_array)
        Qsa_mean_states = torch.mean(Qs_state_array, 0)

        self.back_values.append(Qsa_mean_states[0].detach().cpu().numpy())
        self.stay_values.append(Qsa_mean_states[1].detach().cpu().numpy())
        self.forward_values.append(Qsa_mean_states[2].detach().cpu().numpy())

        # update the *policy network* using the combined gradients
        self.optimizer.step()

    def updateActionNet(self, samples, q_net, target_q_net, optimizer, storeList):
        """Run one Q-learning step on a single per-action network.

        Args:
            samples: Mini-batch of transitions accepted by
                ``getBatchColumns``.
            q_net: The per-action online network to train.
            target_q_net: Its target network; only its gradients are cleared
                here.
            optimizer: Optimizer that owns ``q_net``'s parameters.
            storeList: List that receives the mean value of ``q_net`` over
                ``self.state_array`` after the backward pass.
        """
        batch = getBatchColumns(samples)
        Qs, _ = q_net(batch.states)

        # The per-action networks are built with a single output, so dropping
        # the singleton axis yields Q(s, a) for this network's action.  This
        # assignment was commented out before, which left ``Qsa`` undefined
        # and made the loss computation below raise NameError.
        Qsa = Qs.squeeze()

        # by default Q(s', a') = 0 unless the next states are non-terminal
        Qspap = torch.zeros(batch.size, device=device)

        # NOTE(review): the bootstrap value is the max over all three
        # per-action target networks rather than ``target_q_net`` alone; the
        # original author flagged this for checking.
        if batch.nterm_sp.shape[0] > 0:
            Qsp_back, _ = self.back_target_q_net(batch.nterm_sp)
            Qsp_stay, _ = self.stay_target_q_net(batch.nterm_sp)
            Qsp_forward, _ = self.forward_target_q_net(batch.nterm_sp)

            Qsp = torch.hstack([Qsp_back, Qsp_stay, Qsp_forward])

            # bootstrapping term is the max Q value for the next-state
            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        # compute the empirical MSBE for this mini-batch and let torch auto-diff to optimize
        # the bootstrapping term is detached, giving the semi-gradient update
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure we have no gradients left over from previous update
        optimizer.zero_grad()
        target_q_net.zero_grad()
        self.back_target_q_net.zero_grad()
        self.stay_target_q_net.zero_grad()
        self.forward_target_q_net.zero_grad()

        # compute the entire gradient of the network using only the td error
        td_loss.backward()

        # log the mean value over state_array with the pre-step weights;
        # .cpu() is needed before .numpy() when running on a GPU
        Qs_state_array, _ = q_net(self.state_array)
        Qsa_mean_states = torch.mean(Qs_state_array, 0)
        storeList.append(Qsa_mean_states[0].detach().cpu().numpy())

        # update the per-action network using the computed gradients
        optimizer.step()

    def update(self, s, a, sp, r, gamma):
        """Store one transition and train all networks when enough data exists.

        The transition goes into the buffer of its action (0 = back,
        1 = stay, 2 = forward) and into the shared buffer.  Every
        ``target_refresh`` steps all four target networks are re-synced.
        Once each per-action buffer holds more transitions than its share of
        ``sampleSize`` (given by ``ratioMap``), a batch is drawn from each
        buffer, each per-action network is updated on its own batch, and the
        joint policy network is updated on the concatenation.
        """
        transition = (s, a, sp, r, gamma)

        # route the transition to the buffer that belongs to its action
        action_code = a.cpu().numpy()
        per_action_buffers = (
            (0, self.buffer_BACK),
            (1, self.buffer_STAY),
            (2, self.buffer_FORWARD),
        )
        for code, action_buffer in per_action_buffers:
            if action_code == code:
                action_buffer.add(transition)
                break

        # the "online" sample also gets tossed into the shared replay buffer
        self.buffer.add(transition)
        self.steps += 1

        # refresh every target network before the learning step, when due
        if self.steps % self.target_refresh == 0:
            sync_pairs = (
                (self.policy_net, self.target_net),
                (self.back_q_net, self.back_target_q_net),
                (self.stay_q_net, self.stay_target_q_net),
                (self.forward_q_net, self.forward_target_q_net),
            )
            for online, frozen in sync_pairs:
                online.cloneWeightsTo(frozen)

        # share of the mini-batch drawn from each per-action buffer
        n_back = math.floor(self.ratioMap.backward_ratio * self.sampleSize)
        n_stay = math.floor(self.ratioMap.stay_ratio * self.sampleSize)
        n_forward = math.floor(self.ratioMap.forward_ratio * self.sampleSize)

        enough_data = (len(self.buffer_BACK) > n_back
                       and len(self.buffer_STAY) > n_stay
                       and len(self.buffer_FORWARD) > n_forward)
        if not enough_data:
            return

        # draw all three batches first, then train
        back_batch, _ = self.buffer_BACK.sample(n_back)
        stay_batch, _ = self.buffer_STAY.sample(n_stay)
        forward_batch, _ = self.buffer_FORWARD.sample(n_forward)

        self.updateActionNet(back_batch, self.back_q_net,
                             self.back_target_q_net, self.optimizerBack,
                             self.back_values_baseline)
        self.updateActionNet(stay_batch, self.stay_q_net,
                             self.stay_target_q_net, self.optimizerStay,
                             self.stay_values_baseline)
        self.updateActionNet(forward_batch, self.forward_q_net,
                             self.forward_target_q_net,
                             self.optimizerForward,
                             self.forward_values_baseline)

        self.updateNetwork(back_batch + stay_batch + forward_batch)

예제 #4

파일 보기

class DDPG_Agent(Agent):
    """Interacts with and learns from the environment."""
    policy_type = "DDPG"

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the generator first and then
        # keep the seed value itself on the instance.
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Network (w/ Target Network)
        self.actor_local = DDPG_Actor(state_size, action_size,
                                      random_seed).to(device)
        self.actor_target = DDPG_Actor(state_size, action_size,
                                       random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPG_Critic(state_size, action_size,
                                        random_seed).to(device)
        self.critic_target = DDPG_Critic(state_size, action_size,
                                         random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Per-update statistics; keys match the keyword arguments passed to
        # save_stats() in learn().
        self.stats = {
            "actor_loss": [],
            "critic_loss": [],
            "reward_sum": [],
        }

    def step(self, state, action, reward, next_state, done):
        """Store one experience and learn from a sampled batch when possible.

        Learning runs only once the memory holds more than ``BATCH_SIZE``
        experiences.
        """
        self.memory.add(state, action, reward, next_state, done)

        # not enough stored experience for a batch yet
        if len(self.memory) <= BATCH_SIZE:
            return

        self.learn(self.memory.sample(), GAMMA)

    def act(self, state, add_noise=True):
        """Return the current policy's action for ``state``, clipped to [-1, 1].

        The local actor is put in eval mode for the forward call and back in
        train mode afterwards.  With ``add_noise`` a sample of the noise
        process is added to the action before clipping.
        """
        actor = self.actor_local
        state_tensor = torch.from_numpy(state).float().to(device)

        actor.eval()
        action = actor.select_action(state_tensor)
        actor.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Run one DDPG update of critic, actor and both target networks.

        The critic is regressed onto
        ``r + gamma * critic_target(s', actor_target(s')) * (1 - done)``,
        the actor is moved to maximise ``critic_local(s, actor_local(s))``,
        and both target networks are soft-updated with ``TAU``.  Afterwards
        the losses measured on the target networks are appended to
        ``saveDDPG_critic-actor_loss.csv`` and the update's statistics are
        passed to ``save_stats``.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---- critic ----
        # bootstrap from the target actor / target critic
        next_actions = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---- actor ----
        # ascend the critic's estimate of the actor's own actions
        actor_loss = -self.critic_local(states,
                                        self.actor_local(states)).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---- target networks ----
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---- logging (no gradients needed) ----
        with torch.no_grad():
            target_actions = self.actor_target(states)
            actor_loss_target = -self.critic_target(states,
                                                    target_actions).mean()
            critic_loss_target = F.mse_loss(
                self.critic_target(states, actions), Q_targets)

            row = ",".join((str(critic_loss_target.item()),
                            str(actor_loss_target.item()))) + "\n"
            with open("saveDDPG_critic-actor_loss.csv", "a") as log_file:
                log_file.write(row)

            self.save_stats(actor_loss=actor_loss.item(),
                            critic_loss=critic_loss.item(),
                            reward_sum=rewards.sum().item())

    def store_policy(self, env_name, score):
        """Script the target actor and save it under ``data/policies/``.

        The file name is ``DDPGAgent<env_name>#<score>.zip``.
        """
        scripted_actor = torch.jit.script(self.actor_target)
        file_name = "".join(
            ["DDPGAgent", str(env_name), "#", str(score), ".zip"])
        torch.jit.save(scripted_actor, "data/policies/" + file_name)

    def soft_update(self, local_model, target_model, tau):
        """Blend the local model's weights into the target model in place.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        parameter_pairs = zip(target_model.parameters(),
                              local_model.parameters())
        for destination, source in parameter_pairs:
            blended = tau * source.data + (1.0 - tau) * destination.data
            destination.data.copy_(blended)

예제 #5

파일 보기

class BaseAgent:
    """Epsilon-greedy agent skeleton with a policy net, a target net and a replay buffer.

    ``updateNetwork`` is a no-op here; it receives the sampled mini-batch and
    is the hook where a concrete agent performs its learning step.
    """

    # number of transitions sampled from the replay buffer for each update
    BATCH_SIZE = 32

    def __init__(self, features, actions, params):
        """Build the networks, optimizer and replay buffer.

        Args:
            features: passed to ``Network`` as its first argument.
            actions: number of discrete actions (also the upper bound of the
                random action drawn in ``selectAction``).
            params: dict that must contain ``alpha``, ``epsilon``,
                ``target_refresh``, ``buffer_size``, ``h1`` and ``h2``.
        """
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']

        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0
        # visit counts indexed by [state[0][0]][state[0][1]][action];
        # NOTE(review): relies on a module-level `env` — confirm it is defined
        self.actionCounter = np.zeros((env.width, env.height, env.num_actions))

        # initialize the weights of the target network to match the weights of policy network
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        """Return an epsilon-greedy action for input ``x`` as a tensor."""
        # take a random action about epsilon percent of the time
        if np.random.rand() < self.epsilon:
            a = np.random.randint(self.actions)
            return torch.tensor(a, device=device)

        # otherwise take a greedy action
        q_s, _ = self.policy_net(x)
        greedy = q_s.argmax().detach()
        # NOTE(review): debugging output kept to preserve existing behaviour
        print(greedy)

        return greedy

    def updateNetwork(self, samples):
        """Learning-step hook; does nothing in the base class."""
        pass

    def update(self, s, a, r, sp, gamma):
        """Record one transition and, once enough data is buffered, learn from a mini-batch.

        Args:
            s: state tensor; ``s[0][0]`` and ``s[0][1]`` index ``actionCounter``.
            a: action tensor (scalar).
            r: reward.
            sp: next state.
            gamma: discount stored with the transition.
        """
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, r, sp, gamma))
        self.steps += 1

        # Tensor.numpy() raises for tensors that live on an accelerator, and
        # selectAction creates its tensors on `device`; move to host first.
        a = a.detach().cpu().numpy()
        s = s.detach().cpu().numpy()

        self.actionCounter[s[0][0]][s[0][1]][a] += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer) > self.BATCH_SIZE:
            samples, idcs = self.buffer.sample(self.BATCH_SIZE)
            self.updateNetwork(samples)

예제 #6

파일 보기

파일: BaseAgent.py 프로젝트: andnp/nonlinear-control-template

class BaseAgent:
    """Skeleton of a value-based agent: Q-network, optional target network, replay buffer.

    Concrete agents supply ``updateNetwork``, ``forward`` and ``bootstrap``.
    """

    def __init__(self, features: int, actions: int, params: Dict, seed: int, collector: Collector):
        """Read the hyper-parameters from ``params`` and assemble the agent's parts."""
        self.features = features
        self.actions = actions
        self.params = params
        self.collector = collector
        self.seed = seed

        # parameter contract: 'gamma' is mandatory, the others have defaults
        self.gamma = params['gamma']
        self.epsilon = params.get('epsilon', 0)
        self.omega = params.get('omega', 1.0)  # the mellowmax parameter

        # network estimating Q(s, a)
        self.value_net = Network(features, actions, params, seed).to(device)

        # optimizer is described by the 'optimizer' entry of params
        self.optimizer_params = params['optimizer']
        self.optimizer = deserializeOptimizer(self.value_net.parameters(), self.optimizer_params)

        self.steps = 0

        # replay buffer: prioritized when params['buffer'] == 'per', plain otherwise
        self.buffer_size = params['buffer_size']
        self.batch_size = params['batch']
        self.buffer_type = params.get('buffer', 'standard')

        if not self.buffer_type == 'per':
            self.buffer = ReplayBuffer(self.buffer_size)
        else:
            self.buffer = PrioritizedReplayMemory(self.buffer_size, params['prioritization'])

        # target network
        self.target_refresh = params.get('target_refresh', 1)
        self.target_net = copy.deepcopy(self.value_net)
        self.initializeTargetNet()

        def q_values_of(x: torch.Tensor):
            return self.values(x).detach().cpu().squeeze(0).numpy()

        self.policy = createEpsilonGreedy(seed, self.epsilon, q_values_of)

    def values(self, x):
        """Return the first output of the value network for ``x`` (the Q(s, a) values)."""
        outputs = self.value_net(x)
        return outputs[0]

    def selectAction(self, x):
        """Sample an action for ``x`` from the agent's policy."""
        return self.policy.selectAction(x)

    def initializeTargetNet(self):
        """Create the target network, or alias it to the value network when unused."""
        if not self.target_refresh > 1:
            # no target network in use: share the value network and save compute
            self.target_net = self.value_net
            return

        self.target_net = copy.deepcopy(self.value_net)
        cloneNetworkWeights(self.value_net, self.target_net)

    @abstractmethod
    def updateNetwork(self, batch: Batch, predictions: Dict):
        pass

    @abstractmethod
    def forward(self, batch: Batch) -> Dict[str, torch.Tensor]:
        pass

    @abstractmethod
    def bootstrap(self, batch: Batch, next_values: torch.Tensor) -> Dict[str, torch.Tensor]:
        pass

    def combineTargetGrads(self):
        """Add the target network's gradients into the value network.

        Does nothing when the target network is the value network itself.
        """
        if not self.target_net == self.value_net:
            addGradients_(self.value_net, self.target_net)

    def update(self, s, a, sp, r, gamma):
        """Buffer one transition, refresh the target net if due, and learn from a mini-batch."""
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        refresh_every = self.target_refresh
        if self.steps % refresh_every == 0 and refresh_every > 1:
            cloneNetworkWeights(self.value_net, self.target_net)

        # wait until the buffer holds more than batch_size + 1 transitions
        if not len(self.buffer) > self.batch_size + 1:
            return

        samples, idcs = self.buffer.sample(self.batch_size)
        batch = getBatchColumns(samples)
        predictions = self.forward(batch)
        tde = self.updateNetwork(batch, predictions)

        self.buffer.update_priorities(idcs, tde)

예제 #7

파일 보기

파일: DDPG.py 프로젝트: ondrejba/drl_gym

class DDPG:

    CRITIC_NAME = "critic"
    TARGET_CRITIC_NAME = "target_critic"

    ACTOR_NAME = "actor"
    TARGET_ACTOR_NAME = "target_actor"

    def __init__(self,
                 state_dim,
                 action_dim,
                 monitor_directory,
                 actor_learning_rate=1e-5,
                 critic_learning_rate=1e-3,
                 critic_target_update_rate=1e-3,
                 actor_target_update_rate=1e-3,
                 discount=0.99,
                 l2_decay=1e-2,
                 buffer_size=1000000,
                 batch_size=64,
                 detail_summary=False,
                 tanh_action=True,
                 input_batch_norm=True,
                 all_batch_norm=True,
                 log_frequency=10):
        """Store the hyper-parameters, build the TensorFlow graph and start a session.

        Args:
            state_dim: width of the state placeholders; also passed to the
                replay buffer.
            action_dim: width of the action placeholder; also passed to the
                replay buffer.
            monitor_directory: directory under which the "summary"
                sub-directory path is formed.
            actor_learning_rate: learning rate of the actor's Adam optimizer.
            critic_learning_rate: learning rate of the critic's Adam optimizer.
            critic_target_update_rate: rate passed to
                ``architect.create_target_update_ops`` for the critic.
            actor_target_update_rate: same, for the actor.
            discount: factor applied to the target Q-value in the TD target.
            l2_decay: coefficient of the L2 penalty on the critic variables.
            buffer_size: first argument of ``ReplayBuffer``.
            batch_size: number of transitions sampled in ``learn``.
            detail_summary: when true, extra histogram summaries are built
                and written.
            tanh_action: when true, the actor output goes through ``tanh``.
            input_batch_norm: batch-normalize the state input of both nets.
            all_batch_norm: batch-normalize hidden layers as well.
            log_frequency: critic summaries are written whenever the global
                step is a multiple of this value.
        """

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.critic_learning_rate = critic_learning_rate
        self.actor_learning_rate = actor_learning_rate
        self.critic_target_update_rate = critic_target_update_rate
        self.actor_target_update_rate = actor_target_update_rate
        self.discount = discount
        self.batch_size = batch_size
        self.l2_decay = l2_decay
        self.buffer_size = buffer_size
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detail_summary = detail_summary
        self.tanh_action = tanh_action
        self.input_batch_norm = input_batch_norm
        self.all_batch_norm = all_batch_norm
        self.log_frequency = log_frequency

        self.step = 0
        self.solved = False

        self.buffer = ReplayBuffer(buffer_size, self.state_dim,
                                   self.action_dim)

        # create placeholders, both actor/critic pairs, losses and train ops
        self.__build()

        # summary_dir is replaced by whatever utils.new_summary_dir returns
        # (presumably a fresh per-run directory — confirm in utils)
        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        utils.log_params(
            self.summary_dir, {
                "actor learning rate": self.actor_learning_rate,
                "critic learning rate": self.critic_learning_rate,
                "batch size": self.batch_size,
                "actor update rate": self.actor_target_update_rate,
                "critic update rate": self.critic_target_update_rate,
                "buffer size": self.buffer_size,
            })

        # max_to_keep=None: do not delete older checkpoints
        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session = tf.Session()

        self.merged = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(self.summary_dir,
                                                    self.session.graph)

        self.session.run(init_op)

    """
  PUBLIC
  """

    def learn(self):

        batch = self.buffer.sample(self.batch_size)
        self.__train_critic(batch["states"], batch["actions"],
                            batch["rewards"], batch["next_states"],
                            batch["done"])
        self.__train_actor(batch["states"])

        self.session.run([
            self.target_critic_update, self.target_actor_update,
            self.inc_global_step
        ])

    def act(self, state):
        a = self.session.run(self.action,
                             feed_dict={
                                 self.state_input: state,
                                 self.is_training: False
                             })[0]
        return a

    def perceive(self, transition):
        self.buffer.add(transition)

    def log_scalar(self, name, value, index):
        """Write a single scalar summary.

        Args:
            name: tag of the summary value.
            value: the scalar to record.
            index: global step the value is recorded at.
        """
        entry = summary_pb2.Summary.Value(tag=name, simple_value=value)
        record = summary_pb2.Summary(value=[entry])
        self.summary_writer.add_summary(record, global_step=index)

    def save(self):
        self.saver.save(self.session,
                        self.summary_dir,
                        global_step=self.session.run(self.global_step))

    def close(self):
        self.session.close()

    """
  PRIVATE
  """

    def __build_critic(self, name, state_input, action_input):
        """Build a critic network under variable scope ``name``.

        The state goes through a 400-unit ReLU layer; the action is injected
        at the second, 300-unit ReLU layer; a linear layer produces one value.

        Args:
            name: variable scope; ``TARGET_CRITIC_NAME`` forces batch norm
                into inference mode, ``CRITIC_NAME`` also records histograms.
            state_input: tensor fed to the first layer.
            action_input: tensor fed to the second layer.

        Returns:
            Tuple ``(output_layer, weight_decay)`` where ``weight_decay`` is
            the sum of ``l2_decay * l2_loss`` over all variables built here.
        """

        # the target critic never runs batch norm in training mode
        bn_training = self.is_training
        if name == self.TARGET_CRITIC_NAME:
            bn_training = False

        with tf.variable_scope(name):

            # weights and biases
            W1 = self.__get_weights((self.state_dim, 400),
                                    self.state_dim,
                                    name="W1")
            b1 = self.__get_weights((400, ), self.state_dim, name="b1")

            W2 = self.__get_weights((400, 300),
                                    400 + self.action_dim,
                                    name="W2")
            b2 = self.__get_weights((300, ), 400 + self.action_dim, name="b2")

            W2_action = self.__get_weights((self.action_dim, 300),
                                           400 + self.action_dim,
                                           name="W2_action")

            # the output layer uses a fixed, small uniform range
            W3 = tf.Variable(tf.random_uniform((300, 1), -3e-3, 3e-3),
                             name="W3")
            b3 = tf.Variable(tf.random_uniform((1, ), -3e-3, 3e-3), name="b3")

            # layers
            if self.input_batch_norm:
                state_input = tf.layers.batch_normalization(
                    state_input, training=bn_training)

            layer_1 = tf.matmul(state_input, W1) + b1

            if self.all_batch_norm:
                layer_1 = tf.layers.batch_normalization(layer_1,
                                                        training=bn_training)

            layer_1 = tf.nn.relu(layer_1)

            # state features and action are combined additively here
            layer_2 = tf.nn.relu(
                tf.matmul(layer_1, W2) + tf.matmul(action_input, W2_action) +
                b2)

            output_layer = tf.matmul(layer_2, W3) + b3

            # summary
            if name == self.CRITIC_NAME:
                self.critic_summaries = [
                    tf.summary.histogram("W1", W1),
                    tf.summary.histogram("b1", b1),
                    tf.summary.histogram("W2", W2),
                    tf.summary.histogram("b2", b2),
                    tf.summary.histogram("W2_action", W2_action),
                    tf.summary.histogram("W3", W3),
                    tf.summary.histogram("b3", b3),
                    tf.summary.histogram("layer_1", layer_1),
                    tf.summary.histogram("layer_2", layer_2),
                    tf.summary.histogram("output_layer", output_layer)
                ]

            # weight decay (note: biases are included in the penalty)
            weights = [W1, b1, W2, b2, W2_action, W3, b3]
            weight_decay = tf.add_n(
                [self.l2_decay * tf.nn.l2_loss(var) for var in weights])

            return output_layer, weight_decay

    def __build_actor(self, name, state_input):
        """Build an actor network under variable scope ``name``.

        Two ReLU layers (400 and 300 units) are followed by a linear layer of
        width ``action_dim``.

        Args:
            name: variable scope; ``TARGET_ACTOR_NAME`` forces batch norm
                into inference mode, ``ACTOR_NAME`` also records histograms.
            state_input: tensor fed to the first layer.

        Returns:
            The action tensor: ``tanh`` of the output layer when
            ``tanh_action`` is set, otherwise the raw output layer.
        """

        # the target actor never runs batch norm in training mode
        bn_training = self.is_training
        if name == self.TARGET_ACTOR_NAME:
            bn_training = False

        with tf.variable_scope(name):

            # weights and biases
            W1 = self.__get_weights((self.state_dim, 400),
                                    self.state_dim,
                                    name="W1")
            b1 = self.__get_weights((400, ), self.state_dim, name="b1")

            W2 = self.__get_weights((400, 300), 400, name="W2")
            b2 = self.__get_weights((300, ), 400, name="b2")

            # the output layer uses a fixed, small uniform range
            W3 = tf.Variable(tf.random_uniform((300, self.action_dim),
                                               minval=-3e-3,
                                               maxval=3e-3),
                             name="W3")
            b3 = tf.Variable(tf.random_uniform((self.action_dim, ), -3e-3,
                                               3e-3),
                             name="b3")

            # layers
            if self.input_batch_norm:
                state_input = tf.layers.batch_normalization(
                    state_input, training=bn_training)

            layer_1 = tf.matmul(state_input, W1) + b1

            if self.all_batch_norm:
                layer_1 = tf.layers.batch_normalization(layer_1,
                                                        training=bn_training)

            layer_1 = tf.nn.relu(layer_1)

            layer_2 = tf.matmul(layer_1, W2) + b2

            if self.all_batch_norm:
                layer_2 = tf.layers.batch_normalization(layer_2,
                                                        training=bn_training)

            layer_2 = tf.nn.relu(layer_2)

            output_layer = tf.matmul(layer_2, W3) + b3

            # summary
            if name == self.ACTOR_NAME:
                self.actor_summaries = [
                    tf.summary.histogram("W1", W1),
                    tf.summary.histogram("b1", b1),
                    tf.summary.histogram("W2", W2),
                    tf.summary.histogram("b2", b2),
                    tf.summary.histogram("W3", W3),
                    tf.summary.histogram("b3", b3),
                    tf.summary.histogram("layer_1", layer_1),
                    tf.summary.histogram("layer_2", layer_2),
                    tf.summary.histogram("output_layer", output_layer)
                ]

            if self.tanh_action:
                return tf.nn.tanh(output_layer)
            else:
                return output_layer

    def __build(self):
        """Assemble the whole graph: placeholders, networks, losses, train and update ops.

        Creates the input placeholders, the target actor, both critics, the
        critic loss and train op, the online actor with its gradient-based
        train op, the target-network update ops, the global step counter and
        the merged summaries. Everything is stored on ``self``.
        """

        self.state_input = tf.placeholder(tf.float32,
                                          shape=(None, self.state_dim),
                                          name="state_input")
        self.next_state_input = tf.placeholder(tf.float32,
                                               shape=(None, self.state_dim),
                                               name="next_state_input")
        self.action_input = tf.placeholder(tf.float32,
                                           shape=(None, self.action_dim),
                                           name="action_input")
        self.reward_input = tf.placeholder(tf.float32,
                                           shape=(None, ),
                                           name="reward_input")
        self.done_input = tf.placeholder(tf.float32,
                                         shape=(None, ),
                                         name="done_input")
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        # inputs summary
        if self.detail_summary:
            self.input_summaries = [
                tf.summary.histogram("state", self.state_input),
                tf.summary.histogram("next_state", self.next_state_input),
                tf.summary.histogram("action", self.action_input),
                tf.summary.histogram("reward", self.reward_input),
                tf.summary.histogram("done", self.done_input)
            ]

        # the target actor proposes the action for the next state
        self.target_action = self.__build_actor(self.TARGET_ACTOR_NAME,
                                                self.next_state_input)

        self.q_value, weight_decay = self.__build_critic(
            self.CRITIC_NAME, self.state_input, self.action_input)
        # the target critic's weight-decay term is discarded
        self.target_q_value, _ = self.__build_critic(self.TARGET_CRITIC_NAME,
                                                     self.next_state_input,
                                                     self.target_action)

        # NOTE(review): self.tmp is not read anywhere in the visible code
        self.tmp = tf.expand_dims(self.reward_input, 1)

        # TD target: r + discount * (1 - done) * Q_target(s', actor_target(s'));
        # reward and done get a trailing axis so they line up with the Q output
        self.targets = tf.expand_dims(self.reward_input, 1) + self.discount * (
            1 - tf.expand_dims(self.done_input, 1)) * self.target_q_value
        self.diff = self.targets - self.q_value

        # mean squared TD error plus the L2 penalty; no gradient flows into the targets
        self.loss = tf.reduce_mean(
            tf.square(tf.stop_gradient(self.targets) -
                      self.q_value)) + weight_decay
        self.loss_summary = tf.summary.scalar("critic_loss", self.loss)

        self.critic_train_op = tf.train.AdamOptimizer(
            self.critic_learning_rate).minimize(self.loss)

        # add critic batch norm. update
        if self.input_batch_norm or self.all_batch_norm:
            self.critic_bn_update_op = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, scope=self.CRITIC_NAME)
            self.critic_bn_update_op = tf.group(*self.critic_bn_update_op)
            self.critic_train_op = tf.group(self.critic_train_op,
                                            self.critic_bn_update_op)

        self.action = self.__build_actor(self.ACTOR_NAME, self.state_input)
        self.actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                              scope=self.ACTOR_NAME)
        # dQ/da evaluated at the action fed through action_input
        self.action_gradients = tf.gradients(self.q_value,
                                             self.action_input)[0]
        # chain rule: back-propagate -dQ/da through the actor, so applying
        # these gradients with a minimizing optimizer increases Q
        self.actor_params_gradient = tf.gradients(self.action,
                                                  self.actor_params,
                                                  -self.action_gradients)

        # actor gradients summary
        if self.detail_summary:
            self.actor_summaries.append(
                tf.summary.histogram("action_gradient", self.action_gradients))
            for grad in self.actor_params_gradient:
                self.actor_summaries.append(
                    tf.summary.histogram("actor_parameter_gradients", grad))

        self.actor_train_op = tf.train.AdamOptimizer(
            self.actor_learning_rate).apply_gradients(
                zip(self.actor_params_gradient, self.actor_params))

        # add actor batch norm. update
        if self.input_batch_norm or self.all_batch_norm:
            self.actor_bn_update_op = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, scope=self.ACTOR_NAME)
            self.actor_bn_update_op = tf.group(*self.actor_bn_update_op)
            self.actor_train_op = tf.group(self.actor_train_op,
                                           self.actor_bn_update_op)

        # ops that move each target network toward its online counterpart
        self.target_critic_update = architect.create_target_update_ops(
            self.CRITIC_NAME, self.TARGET_CRITIC_NAME,
            self.critic_target_update_rate)
        self.target_actor_update = architect.create_target_update_ops(
            self.ACTOR_NAME, self.TARGET_ACTOR_NAME,
            self.actor_target_update_rate)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step,
                                         tf.add(self.global_step, 1))

        # group summaries
        self.critic_summaries = tf.summary.merge(self.critic_summaries)

        # actor/input summaries stay plain lists unless detail_summary is on
        if self.detail_summary:
            self.actor_summaries = tf.summary.merge(self.actor_summaries)
            self.input_summaries = tf.summary.merge(self.input_summaries)

    @staticmethod
    def __get_weights(shape, input_shape, name="var"):
        """Create a variable drawn uniformly from ±1/sqrt(input_shape).

        Args:
            shape: shape of the variable.
            input_shape: number whose inverse square root bounds the range.
            name: name given to the variable.
        """
        bound = 1 / math.sqrt(input_shape)
        initial_value = tf.random_uniform(shape, -bound, bound)
        return tf.Variable(initial_value, name=name)

    def __train_actor(self, states):

        actions = self.session.run(self.action,
                                   feed_dict={
                                       self.state_input: states,
                                       self.is_training: True
                                   })

        self.session.run(self.actor_train_op,
                         feed_dict={
                             self.state_input: states,
                             self.action_input: actions,
                             self.is_training: True
                         })

    def __train_critic(self, states, actions, rewards, next_states, done):
        feed_dict = {
            self.state_input: states,
            self.action_input: actions,
            self.reward_input: rewards,
            self.next_state_input: next_states,
            self.done_input: done,
            self.is_training: True
        }
        step = self.session.run(self.global_step)

        if step % self.log_frequency == 0:

            ops = [self.critic_train_op, self.loss_summary]

            if self.detail_summary:
                ops.append(self.actor_summaries)
                ops.append(self.input_summaries)

            res = self.session.run(ops, feed_dict=feed_dict)

            self.summary_writer.add_summary(res[1], global_step=step)

            if self.detail_summary:
                self.summary_writer.add_summary(res[2], global_step=step)
                self.summary_writer.add_summary(res[3], global_step=step)
        else:
            self.session.run(self.critic_train_op, feed_dict=feed_dict)

예제 #8

파일 보기

파일: DeepQNetwork.py 프로젝트: ondrejba/drl_gym

class DeepQNetwork:

  ACTION_VALUE_NET_NAME = "q-network"
  TARGET_ACTION_VALUE_NET_NAME = "target-q-network"

  def __init__(self, network, prep, exp_policy, state_dim, action_dim, name, learning_rate=1e-3,
               hard_update_frequency=500, soft_update_rate=None, buffer_size=50000, batch_size=32, num_steps=200000,
               discount=0.99, use_huber_loss=True, detailed_summary=False, max_reward=200, steps_before_learn=1000,
               train_freq=1, save_end=True):
    """Store the configuration, build the graph, start a session and create the replay buffer.

    Args:
      network: object whose ``build(state_dim, action_dim, scope_name)``
        returns an (input, q-values) pair.
      prep: preprocessor whose ``process(state)`` returns ``(state, skip)``.
      exp_policy: policy with ``select_action(q_values)`` used while the
        agent is not flagged as solved.
      state_dim: passed to ``network.build`` and the replay buffer.
      action_dim: number of actions (width of the one-hot action input).
      name: base path; summaries go under ``<name>/summary``.
      learning_rate: learning rate of the Adam optimizer.
      hard_update_frequency: with hard updates, the target net is synced
        whenever the step is a multiple of this value.
      soft_update_rate: when not None, soft target updates with this rate
        are used instead of hard updates.
      buffer_size: first argument of ``ReplayBuffer``.
      batch_size: number of transitions sampled per learning step.
      num_steps: a checkpoint is written when an episode ends exactly at
        this step count.
      discount: factor applied to the max target Q-value.
      use_huber_loss: Huber loss when true, squared error otherwise.
      detailed_summary: when true, summaries of the targets are added.
      max_reward: episode reward at or above which ``solved`` is set.
      steps_before_learn: no learning happens before this step.
      train_freq: learn only on steps that are a multiple of this value.
      save_end: stored on the instance; not read in the visible code.
    """

    self.network = network
    self.prep = prep
    self.exp_policy = exp_policy
    self.greedy_policy = policy.Greedy()
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.discount = discount
    self.summary_dir = os.path.join(name, "summary")
    self.use_huber_loss = use_huber_loss
    self.detailed_summary = detailed_summary

    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.hard_update_frequency = hard_update_frequency
    self.soft_update_rate = soft_update_rate
    self.num_steps = num_steps
    self.step = 0
    self.steps_before_learn = steps_before_learn
    self.train_freq = train_freq
    self.solved = False
    self.max_reward = max_reward
    self.save_end = save_end

    # graph handles; all of them are filled in by build_all() below
    self.actions = None
    self.rewards = None
    self.done = None
    self.action_q_values = None
    self.max_target_q_values = None
    self.targets = None
    self.global_step = None
    self.inc_global_step = None
    self.train_op = None
    self.states = None
    self.q_values = None
    self.next_states = None
    self.target_q_values = None
    self.target_update = None

    self.build_all()
  
    # merge_all must come after build_all so it sees every summary op
    self.merged = tf.summary.merge_all()

    self.session = tf.Session()

    # summary_dir is replaced by whatever utils.new_summary_dir returns
    # (presumably a fresh per-run directory — confirm in utils)
    self.summary_dir = utils.new_summary_dir(self.summary_dir)
    self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.session.graph)

    # max_to_keep=None: do not delete older checkpoints
    self.saver = tf.train.Saver(max_to_keep=None)

    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    self.buffer = ReplayBuffer(buffer_size, self.state_dim, self.action_dim)

  def build_all(self):
    """Build placeholders, both networks, the target-update op, the loss and the train op."""

    self.actions = tf.placeholder(tf.float32, (None, self.action_dim), name="actions")
    self.rewards = tf.placeholder(tf.float32, (None,), name="rewards")
    self.done = tf.placeholder(tf.float32, (None,), name="done")

    self.build_network()
    self.build_target_network()

    # soft updates are used whenever a rate is given, hard updates otherwise
    if self.soft_update_rate is not None:
      self.create_soft_target_update_op()
    else:
      self.create_hard_target_update_op()

    # multiplying by self.actions and summing over axis 1 picks Q(s, a)
    # when actions are one-hot rows
    self.action_q_values = tf.reduce_sum(self.q_values * self.actions, axis=1)
    self.max_target_q_values = tf.reduce_max(self.target_q_values, axis=1)

    # TD target: r + (1 - done) * discount * max_a' Q_target(s', a')
    self.targets = self.rewards + (1 - self.done) * (self.discount * self.max_target_q_values)

    if self.detailed_summary:
      architect.variable_summaries(self.targets, name="targets")

    # no gradient flows into the targets
    td_diff = self.action_q_values - tf.stop_gradient(self.targets)

    if self.use_huber_loss:
      loss = tf.reduce_mean(architect.huber_loss(td_diff))
    else:
      loss = tf.reduce_mean(tf.pow(td_diff, 2))

    tf.summary.scalar("loss", loss)

    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.inc_global_step = tf.assign(self.global_step, tf.add(self.global_step, 1))
    self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)

  def build_network(self):
    self.states, self.q_values = self.network.build(self.state_dim, self.action_dim, self.ACTION_VALUE_NET_NAME)

  def build_target_network(self):
    self.next_states, self.target_q_values = self.network.build(self.state_dim, self.action_dim, self.TARGET_ACTION_VALUE_NET_NAME)

  def create_soft_target_update_op(self):
    """Create ``self.target_update``: one grouped op that nudges every target variable toward its online twin."""
    # inspired by: https://github.com/yukezhu/tensorflow-reinforce/blob/master/rl/neural_q_learner.py
    trainable = tf.GraphKeys.TRAINABLE_VARIABLES
    online_vars = tf.get_collection(trainable, scope=self.ACTION_VALUE_NET_NAME)
    frozen_vars = tf.get_collection(trainable, scope=self.TARGET_ACTION_VALUE_NET_NAME)

    # target -= alpha * (target - source)
    # is the same as target = (1 - alpha) * target + alpha * source
    blend_ops = [
      frozen.assign_sub(self.soft_update_rate * (frozen - online))
      for online, frozen in zip(online_vars, frozen_vars)
    ]

    self.target_update = tf.group(*blend_ops)

  def create_hard_target_update_op(self):
    """Create ``self.target_update``: one grouped op that copies every online variable into its target twin."""
    trainable = tf.GraphKeys.TRAINABLE_VARIABLES
    online_vars = tf.get_collection(trainable, scope=self.ACTION_VALUE_NET_NAME)
    frozen_vars = tf.get_collection(trainable, scope=self.TARGET_ACTION_VALUE_NET_NAME)

    copy_ops = [frozen.assign(online) for online, frozen in zip(online_vars, frozen_vars)]

    self.target_update = tf.group(*copy_ops)

  def learn(self):
    # learn
    batch = self.buffer.sample(self.batch_size)

    merged, _ = self.session.run([self.merged, self.train_op], feed_dict={
      self.states: batch["states"],
      self.actions: batch["actions"],
      self.rewards: batch["rewards"],
      self.next_states: batch["next_states"],
      self.done: batch["done"]
    })

    self.summary_writer.add_summary(merged, global_step=self.step)

    # target update
    if self.soft_update_rate is not None:
      self.session.run(self.target_update)
    elif self.step % self.hard_update_frequency == 0:
      self.session.run(self.target_update)

  def run_episode(self, env):
    """Play one episode in ``env``, storing transitions and learning along the way.

    While the preprocessor flags the current state as skipped, a random
    action is sampled from the environment; otherwise the action comes from
    the greedy policy (when the agent is flagged as solved) or from the
    exploration policy. After the episode the total reward is written as a
    summary and the ``solved`` flag is refreshed.

    Args:
      env: environment with ``reset()``, ``step(action)`` returning
        ``(state, reward, done, info)``, and ``action_space.sample()``.

    Returns:
      Tuple ``(total_reward, self.step)``.
    """

    state = env.reset()
    state, skip = self.prep.process(state)

    total_reward = 0

    while True:
      # play
      if skip:
        action = env.action_space.sample()
      else:
        q_values = self.session.run(self.q_values, feed_dict={self.states: state})[0]

        chooser = self.greedy_policy if self.solved else self.exp_policy
        action = chooser.select_action(q_values)

      action_one_hot = np.zeros(self.action_dim)
      action_one_hot[action] = 1

      prev_state = state
      prev_skip = skip

      state, reward, done, _ = env.step(action)
      state, skip = self.prep.process(state)

      total_reward += reward

      # A transition is usable only when BOTH its state and its next state
      # were accepted by the preprocessor. The original condition tested the
      # previous skip flag twice and never looked at the new one.
      if not prev_skip and not skip:
        self.buffer.add({
            "state": prev_state[0],
            "action": action_one_hot,
            "reward": reward,
            "next_state": state[0],
            "done": int(done)
          })

      if self.step >= self.steps_before_learn and self.step % self.train_freq == 0 and not self.solved:
        # learn
        self.learn()

      _, self.step = self.session.run([self.inc_global_step, self.global_step])

      if done:
        break

    summary_value = summary_pb2.Summary.Value(tag="episode_reward", simple_value=total_reward)
    summary_2 = summary_pb2.Summary(value=[summary_value])
    self.summary_writer.add_summary(summary_2, global_step=self.step)

    # learning is paused for the next episode while the agent counts as solved
    self.solved = bool(total_reward >= self.max_reward)

    # NOTE(review): this fires only if an episode ends exactly on num_steps,
    # and self.save_end is never consulted — confirm the intended behaviour
    if self.step == self.num_steps:
      self.saver.save(self.session, self.summary_dir, global_step=self.step)

    return total_reward, self.step

  def close(self):
    self.session.close()