Example #1
class CartPoleAgent():
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        # discount rate
        self.gamma = 0.95
        # exploration rate
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

        self.memory = Memory(2000)
        self.DQN = DQN(state_size, action_size)

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.DQN.predict(state)
        return np.argmax(act_values[0])  # returns action

    def remember(self, state, action, reward, next_state, done):
        self.memory.add((state, action, reward, next_state, done))

    def replay(self, batch_size):
        minibatch = self.memory.sample(batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(
                    self.DQN.predict(next_state)[0])
            target_f = self.DQN.predict(state)
            target_f[0][action] = target
            self.DQN.train(state, target_f, epochs=1)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
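
Note: every snippet on this page assumes a Memory replay buffer with add and sample methods that is not shown. A minimal sketch of what such a class could look like, assuming a plain ring buffer with uniform sampling (the field names mirror the usage above, e.g. memory.buffer in Example #2, and are otherwise illustrative):

import random
from collections import deque


class Memory:
    """Uniform experience replay: a bounded FIFO buffer of transitions."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, experience):
        # experience is typically (state, action, reward, next_state, done)
        self.buffer.append(experience)

    def sample(self, batch_size):
        # draw a uniform random minibatch without replacement
        return random.sample(self.buffer, batch_size)
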
Example #2
    with tf.Session() as sess:
        model = DQN(sess, state_size, action_size, learning_rate, gamma)
        memory = Memory(memory_size)

        while True:
            state = env.reset()
            for t in range(scene_iteration):
                env.render()

                # Play a new action
                action = model.select_action(state)
                new_state, reward, done, info = env.step(np.argmax(action))
                # if done:
                #     reward = -1.0

                memory.add((state, new_state, reward, action, done))

                # After batch_size experiences train the model
                if len(memory.buffer) > batch_size:
                    state_batch, new_state_batch, reward_batch, action_batch, done_batch = memory.sample(
                        batch_size)
                    model.learn(state_batch, new_state_batch, reward_batch,
                                action_batch, done_batch)
                state = new_state

                # Test phase
                if len(memory.buffer) > test_batch_size:
                    state_batch, new_state_batch, reward_batch, action_batch, done_batch = memory.sample(
                        test_batch_size)
                    current_Q = np.max(sess.run(model.output,
                                                feed_dict={
Example #3
class Agent(object):
    def __init__(self, state_count, action_count):
        self.state_count = state_count
        self.action_count = action_count

        self.brain = Brain(state_count, action_count)
        self.memory = Memory(MEMORY_CAPACITY)

        self.epsilon = MAX_EPSILON
        self.steps = 0

    def act(self, s):
        if random.random() < self.epsilon:
            return random.randint(0, self.action_count - 1)
        else:
            return np.argmax(self.brain.predict_one(s))

    def observe(self, samples):
        self.memory.add(samples)

        if self.steps % UPDATE_TARGET_FREQUENCY == 0:
            self.brain.update_target()

        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(
            -LAMBDA * self.steps)

    def replay(self):
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)

        no_state = np.zeros(self.state_count)

        states = np.array([o[0] for o in batch])
        states_ = np.array([(no_state if o[3] is None else o[3])
                            for o in batch])

        p = self.brain.predict(states)
        #p_ = self.brain.predict(states_, target=True)

        p_ = self.brain.predict(states_, target=False)
        pTarget_ = self.brain.predict(states_, target=True)

        x = np.zeros((batchLen, self.state_count))
        y = np.zeros((batchLen, self.action_count))

        for i in range(batchLen):
            o = batch[i]
            s = o[0]
            a = o[1]
            r = o[2]
            s_ = o[3]

            t = p[i]
            if s_ is None:
                t[a] = r
            else:
                #t[a] = r + GAMMA * np.amax(p_[i])
                t[a] = r + GAMMA * pTarget_[i][np.argmax(p_[i])]  # double DQN

            x[i] = s
            y[i] = t

        self.brain.train(x, y)
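
Example #3 computes the Double DQN target: the online network picks the greedy next action (np.argmax(p_[i])) and the target network evaluates it (pTarget_[i][...]). A hedged sketch of a driver loop for this Agent, assuming an old-style Gym CartPole environment, the BATCH_SIZE constant from the example, and that terminal next states are stored as None to match the "o[3] is None" check in replay():

import gym

env = gym.make('CartPole-v1')
agent = Agent(env.observation_space.shape[0], env.action_space.n)

for episode in range(500):
    s = env.reset()
    done = False
    while not done:
        a = agent.act(s)
        s_, r, done, _ = env.step(a)
        # store the transition; None marks a terminal next state
        agent.observe((s, a, r, None if done else s_))
        if agent.steps >= BATCH_SIZE:  # wait until the memory holds a full batch
            agent.replay()
        s = s_
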
Example #4
        # print("step:", i)
        # If it's the first step
        if i == 0:
            # First we need a state
            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True)
        # Random action
        action = random.choice(possible_actions)
        # Get the rewards
        reward = game.make_action(action)
        done = game.is_episode_finished()
        if done:
            # We finished the episode
            next_state = np.zeros(state.shape)
            # Add experience to memory
            memory.add((state, action, reward, next_state, done))
            # Start a new episode
            game.new_episode()
            # First we need a state
            state = game.get_state().screen_buffer
            # Stack the frames
            state, stacked_frames = stack_frames(stacked_frames, state, True)
        else:
            # Get the next state
            next_state = game.get_state().screen_buffer
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            # Add experience to memory
            memory.add((state, action, reward, next_state, done))
            # Our state is now the next_state
            state = next_state
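
Examples #4 and #8 rely on a frame-stacking helper that is not shown. A hypothetical stack_frames matching the three-argument calls above (preprocessing such as resizing or grayscaling is omitted):

from collections import deque

import numpy as np


def stack_frames(stacked_frames, frame, is_new_episode, stack_size=4):
    if is_new_episode:
        # start a fresh stack filled with copies of the first frame
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # push the newest frame and drop the oldest one
        stacked_frames.append(frame)
    # stack along the channel axis: shape (height, width, stack_size)
    state = np.stack(stacked_frames, axis=2)
    return state, stacked_frames
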
Example #5
class DistributionalRL:
    def __init__(self, actions, gamma=0.1, e_greedy=0.9):
        state_size = 1
        neurons = 24

        self.actions = actions
        self.gamma = gamma
        self.epsilon = e_greedy
        self.lr = 0.1
        self.count = 0
        self.epochs = 5

        self.v_max = 10
        self.v_min = -10
        self.atoms = 51
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = [self.v_min + i * self.delta_z for i in range(self.atoms)]

        self.m = Build_Model(state_size,
                             neurons,
                             len(actions),
                             atoms=self.atoms)
        self.model = self.m.model
        self.dump_model = copy.copy(self.model)

        self.capacity = 300
        self.memory = Memory(self.capacity)

    @timecost
    def choose_action(self, s):
        if np.random.uniform() < self.epsilon:
            # choose the best action
            state_action = []
            for i in self.model.predict([s]):
                state_action.append(
                    np.sum([self.z[j] * i[0][j] for j in range(self.atoms)]))
            action = np.random.choice([
                i for i in range(len(state_action))
                if state_action[i] == max(state_action)
            ])
        else:
            # choose action randomly
            action = np.random.choice(self.actions)
        return action

    @timecost
    def learn(self, s, a, r, s_, done):
        batch_size = 50
        record_size = self.capacity
        loss, q_distribution = self.get_q_value(s, a, r, s_, done)
        self.memory.add(loss, [s, a, r, s_, q_distribution])
        self.count += 1

        # train when do record_size times actions.
        if self.count % record_size == 0:
            batch, idxs, is_weight = self.memory.sample(batch_size)
            X_train = np.zeros((batch_size, 1))
            Y_train = [
                np.zeros((batch_size, self.atoms))
                for i in range(len(self.actions))
            ]

            for i in range(batch_size):
                X_train[i] = batch[i][0]
                for i_ in range(len(self.actions)):
                    Y_train[i_][i][:] = batch[i][4][i_][:]
            print('-----training-----')
            self.model.fit(X_train, Y_train, epochs=3, verbose=0)
            print('training')
            # update prioritized experience
            for i in range(batch_size):
                #_s, _a, _r, _s_ = batch[i][0], batch[i][1], batch[i][2], batch[i][3]
                #loss = self.get_q_value(_s, _a, _r, _s_, done)[0]
                self.memory.update(idxs[i], 0)

    @timecost
    def get_q_value(self, s, a, r, s_, done):
        p = self.model.predict([s])
        old_q = np.sum(np.multiply(np.vstack(p), np.array(self.z)), axis=1)
        # double DQN here as well
        p_next = self.model.predict([s_])
        p_d_next = self.dump_model.predict([s_])
        q = np.sum(np.multiply(np.vstack(p_next), np.array(self.z)), axis=1)
        next_action_idxs = np.argmax(q)
        # initialize the projected distribution m
        m_prob = [np.zeros((1, self.atoms))]
        # update m after the transition
        if done:  # Distribution collapses to a single point
            Tz = min(self.v_max, max(self.v_min, r))
            bj = (Tz - self.v_min) / self.delta_z
            m_l, m_u = math.floor(bj), math.ceil(bj)
            m_prob[0][0][int(m_l)] += (m_u - bj)
            m_prob[0][0][int(m_u)] += (bj - m_l)
        else:
            for j in range(self.atoms):
                Tz = min(self.v_max, max(self.v_min,
                                         r + self.gamma * self.z[j]))
                bj = (Tz - self.v_min) / self.delta_z
                m_l, m_u = math.floor(bj), math.ceil(bj)
                m_prob[0][0][int(
                    m_l)] += p_d_next[next_action_idxs][0][j] * (m_u - bj)
                m_prob[0][0][int(
                    m_u)] += p_d_next[next_action_idxs][0][j] * (bj - m_l)
        # put the updated distribution back into p for training
        p[a][0][:] = m_prob[0][0][:]
        # compute the new Q estimate
        new_q = np.sum(np.multiply(np.vstack(p), np.array(self.z)), axis=1)
        # compute the TD error for prioritized experience replay (PER)
        error = abs(old_q[a] - new_q[a])
        return error, p
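
Examples #5, #7, #9, #11 and #13 call the prioritized-replay variant of Memory: add(priority, data), sample(batch_size) returning (batch, idxs, is_weight), and update(idx, priority). A simplified sketch that reproduces this interface with proportional priorities and importance-sampling weights, but without an actual sum tree (alpha, beta and eps are illustrative hyperparameters):

import numpy as np


class Memory:
    def __init__(self, capacity, alpha=0.6, beta=0.4, eps=1e-6):
        self.capacity = capacity
        self.alpha, self.beta, self.eps = alpha, beta, eps
        self.data = []
        self.priorities = []

    def add(self, priority, sample):
        if len(self.data) >= self.capacity:
            # drop the oldest entry once the buffer is full
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append(sample)
        self.priorities.append((abs(priority) + self.eps) ** self.alpha)

    def sample(self, batch_size):
        probs = np.array(self.priorities) / np.sum(self.priorities)
        idxs = np.random.choice(len(self.data), batch_size, p=probs)
        batch = [self.data[i] for i in idxs]
        # importance-sampling weights correct for the non-uniform sampling
        is_weight = (len(self.data) * probs[idxs]) ** (-self.beta)
        is_weight /= is_weight.max()
        return batch, idxs, is_weight

    def update(self, idx, priority):
        self.priorities[idx] = (abs(priority) + self.eps) ** self.alpha
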
Example #6
for _ in range(N_EPISODES_PRETRAIN):
    env.reset()
    while True:
        states = dict()
        actions = dict()
        rewards = dict()
        next_states = dict()

        step_action = []
        for agent in agent_names:
            states[agent] = env.get_observation(agent)
            action = random.randint(0, 1)
            actions[agent] = action
        rewards, next_states, is_finished = env.set_action(actions)

        memory.add([states, actions, rewards, next_states, is_finished])
        if is_finished:
            break


# function to update the Q-learning model
def update_model():
    minibatch = memory.sample(BATCH_SIZE)
    batch_states = []
    batch_targets = []

    for states, actions, rewards, next_states, done in minibatch:
        np_states = []
        np_actions = []
        np_rewards = []
        np_next_states = []
Example #7
class QLearningTable:
    def __init__(self,
                 actions,
                 learning_rate=0.001,
                 reward_decay=0.9,
                 e_greedy=0.4):
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.batch_size = 25
        self.state_size = 4

        # neural network
        M = Build_Model(4, 4, 4)
        self.model = M.build()
        self.target_model = copy.copy(self.model)
        self.optimizer = tf.optimizers.Adam(lr=self.lr)
        self.epochs = 1

        # memory
        self.capacity = 200
        self.memory = Memory(self.capacity)
        self.store_times = 0

    def choose_action(self, s):
        # action selection
        if np.random.uniform() < self.epsilon:
            # choose best action
            s = np.array(s)
            state_action = self.model.predict([[s]])[0]
            print(state_action)
            # some actions may have the same value; np.argmax simply picks the first of them
            action = np.argmax(state_action)
        else:
            # choose random action
            action = np.random.choice(self.actions)

        return action

    def store_memory(self, s, a, r, s_):
        if r in [1, -1]:
            self.memory.add(100, [s, a, r, s_])
            self.memory.add(100, [s, a, r, s_])
        else:
            self.memory.add(1, [s, a, r, s_])
        self.store_times += 1

    def learn(self):
        self.loss_record = []
        batch, index, is_weight = self.memory.sample(self.batch_size)
        # initial the training data
        X_train = np.zeros((self.batch_size, self.state_size))
        Y_train = [np.zeros(len(self.actions)) for i in range(self.batch_size)]
        for i in range(self.batch_size):
            s, a, r, s_ = batch[i][0], batch[i][1], batch[i][2], batch[i][3]
            q_table = self.model.predict([[s]])[0]
            q_predict = q_table[a]
            if s_ != 'terminal':
                q_next_table = self.target_model.predict([[s_]])[0]
                next_action = np.argmax(self.model.predict([[s_]])[0])  # Double DQN: online model selects the next action
                q_target = r + self.gamma * q_next_table[
                    next_action]  # next state is not terminal
            else:
                q_target = r  # next state is terminal

            loss = abs(q_target - q_predict)  # TD error used as the priority
            q_table[a] += (q_target - q_predict)
            # record the loss for the priority update below
            self.loss_record.append(loss)
            # setup training data
            X_train[i] = s
            for i_ in range(len(self.actions)):
                Y_train[i][i_] = q_table[i_]

        # training
        for epoch in range(self.epochs):
            self.train(self.model, X_train, Y_train)

        # memory update
        for i in range(self.batch_size):
            self.memory.update(index[i], self.loss_record[i])

    def _loss(self, model, x, y):
        x = np.array(x)

        y_ = model([[x]])

        loss = huber_loss(y, y_)
        return loss

    def _grad(self, model, inputs, targets):
        with tf.GradientTape() as tape:
            loss_value = self._loss(model, inputs, targets)
            #self.epoch_loss_avg(loss_value)
            return tape.gradient(loss_value, self.model.trainable_variables)

    def train(self, model, s, q):
        grads = self._grad(model, s, q)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables),
            get_or_create_global_step())
Example #8
class SmartAgent:
    def __init__(self, stack_size, max_memory_size, model_save_path):
        self.env = gym.make('BreakoutDeterministic-v4')
        self.memory = Memory(max_memory_size)
        self.frame_stack_size = stack_size
        self.explore_prob = .08
        self.explore_prob_final = 0.01
        self.explore_decay = .995
        self.DQN = DQNModel(self.env.action_space.n, stack_size)
        self.num_exps = 0
        self.model_save_path = model_save_path
        self.discount = .99

    def setup_DQN(self, load_existing=None, model_path=None):
        if load_existing:
            self.DQN.restore_model(model_path)
        else:
            self.DQN.build_target_model()
            self.DQN.build_prediction_model()
            self.DQN.save_model_params(self.model_save_path)

    def train(self, batch_size, num_epochs, update_target):
        mem_block = self.memory.sample(batch_size)
        self.DQN.train(mem_block, self.discount, num_epochs)
        if self.num_exps % 1000 == 0:
            self.DQN.save_model_params(self.model_save_path)

        self.DQN.update_target_model()
        if self.explore_prob > self.explore_prob_final:
            self.explore_prob *= 0.9999

    def test_policy(self):
        is_new_episode = True
        frame_stack = Util.new_frame_stack(self.frame_stack_size)
        first_frame = self.env.reset()
        # self.env.render()
        state, frame_stack = Util.stack_frames(frame_stack,
                                               self.frame_stack_size,
                                               first_frame, is_new_episode)
        episode_is_done = False
        while not episode_is_done:
            print(self.num_exps)
            is_new_episode = False

            # Take an action
            num_acts = 0
            action = self.DQN.get_next_action(state, self.explore_prob)

            frame, reward, episode_is_done, _ = self.env.step(action)
            self.env.render()

            next_state, frame_stack = Util.stack_frames(
                frame_stack, self.frame_stack_size, frame, is_new_episode)

            state = next_state

    def gather_experience(self, num_training):
        for i in range(num_training):
            pts = 0
            episode_is_done = False
            is_new_episode = True
            # Start the environment and put the first frame into a stack
            frame_stack = Util.new_frame_stack(self.frame_stack_size)
            first_frame = self.env.reset()
            #self.env.render()
            state, frame_stack = Util.stack_frames(frame_stack,
                                                   self.frame_stack_size,
                                                   first_frame, is_new_episode)
            # for i in range(4):
            # plt.imshow(state[:,:,0])
            #plt.show()
            #print(state.shape)
            num_consec_actions = 4
            num_acts = 0
            while not episode_is_done:
                print(self.num_exps)
                is_new_episode = False

                # Take an action

                if num_acts % num_consec_actions == 0:
                    num_acts = 0
                    action = self.DQN.get_next_action(state, self.explore_prob)

                frame, reward, episode_is_done, _ = self.env.step(action)
                pts += reward
                num_acts += 1
                #print(reward)
                #plt.imshow(frame)
                #plt.show()
                self.env.render()
                next_state, frame_stack = Util.stack_frames(
                    frame_stack, self.frame_stack_size, frame, is_new_episode)
                # for i in range(4):
                #     plt.imshow(next_state[:, :, i])
                #     plt.show()
                experience = (state, action, np.sign(reward), next_state,
                              episode_is_done)
                state = next_state
                self.memory.add(experience)
                self.num_exps += 1

                if self.num_exps >= 100:
                    if (self.num_exps - 200) % 4 == 0:
                        self.train(32, 1, True)

                #print('Explore Prob:', self.explore_prob)

            print('Total points:', pts)
Example #9
class Agent:
    def __init__(self, actions, gamma=0.1, e_greedy=0.9):
        self.state_size = 4
        neurons = 24

        self.actions = actions
        self.gamma = gamma
        self.epsilon = e_greedy
        self.lr = 0.1
        self.count = 0
        self.epochs = 50

        self.v_max = 10
        self.v_min = -10
        self.atoms = 51
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = [self.v_min + i * self.delta_z for i in range(self.atoms)]

        self.m = Build_Model(self.state_size,
                             neurons,
                             len(actions),
                             atoms=self.atoms)
        self.m.build()
        self.model = self.m.model
        self.dump_model = copy.copy(self.model)
        self.optimizer = tf.optimizers.Adam(lr=self.lr)
        self.batch_size = 100

        self.capacity = 300
        self.memory = Memory(self.capacity)

        self.record_size = self.capacity

    @timecost
    def choose_action(self, s):
        if np.random.uniform() < self.epsilon:
            # choose the best action
            state_action = []
            for i in self.model.predict([[s]]):
                state_action.append(
                    np.sum([self.z[j] * i[0][j] for j in range(self.atoms)]))
            action = np.random.choice([
                i for i in range(len(state_action))
                if state_action[i] == max(state_action)
            ])
        else:
            # choose action randomly
            action = np.random.choice(self.actions)
        return action

    #@timecost
    def learn(self, s, a, r, s_, done):
        loss, q_distribution = self.get_q_value(s, a, r, s_, done)
        self.memory.add(loss, [s, a, r, s_, q_distribution])
        self.count += 1

        # train when do record_size times actions.
        if self.count % self.record_size == 0:
            batch, idxs, is_weights = self.memory.sample(self.batch_size)
            X_train = np.zeros((self.batch_size, self.state_size))
            Y_train = [
                np.zeros((self.batch_size, self.atoms))
                for i in range(len(self.actions))
            ]

            for i in range(self.batch_size):
                X_train[i] = batch[i][0]
                for i_ in range(len(self.actions)):
                    Y_train[i_][i][:] = batch[i][4][i_][:]

            print('-----training-----')
            for i in range(self.epochs):
                self.train(X_train, Y_train)

            # update prioritized experience
            for i in range(self.batch_size):
                _s, _a, _r, _s_, is_weight = batch[i][0], batch[i][1], batch[
                    i][2], batch[i][3], is_weights[i]
                loss = self.get_q_value(_s, _a, _r, _s_, done)[0]
                self.memory.update(idxs[i], is_weight * loss)

    #@timecost
    def get_q_value(self, s, a, r, s_, done):
        p = self.model.predict([[s]])
        old_q = np.sum(np.multiply(np.vstack(p), np.array(self.z)), axis=1)
        # double DQN here as well
        p_next = self.model.predict([[s_]])
        q = np.sum(np.multiply(np.vstack(p_next), np.array(self.z)), axis=1)

        p_d_next = self.dump_model.predict([[s_]])
        next_action_idxs = np.argmax(q)
        # initialize the projected distribution m
        m_prob = [np.zeros((1, self.atoms))]
        # update m after the transition
        if done:  # Distribution collapses to a single point
            Tz = min(self.v_max, max(self.v_min, r))
            bj = (Tz - self.v_min) / self.delta_z
            m_l, m_u = math.floor(bj), math.ceil(bj)
            m_prob[0][0][int(m_l)] += (m_u - bj)
            m_prob[0][0][int(m_u)] += (bj - m_l)
        else:
            for j in range(self.atoms):
                Tz = min(self.v_max, max(self.v_min,
                                         r + self.gamma * self.z[j]))
                bj = (Tz - self.v_min) / self.delta_z
                m_l, m_u = math.floor(bj), math.ceil(bj)
                m_prob[0][0][int(
                    m_l)] += p_d_next[next_action_idxs][0][j] * (m_u - bj)
                m_prob[0][0][int(
                    m_u)] += p_d_next[next_action_idxs][0][j] * (bj - m_l)
        # put the updated distribution back into p for training
        p[a][0][:] = m_prob[0][0][:]
        # compute the new Q estimate
        new_q = np.sum(np.multiply(np.vstack(p), np.array(self.z)), axis=1)
        # compute the TD error for prioritized experience replay (PER)
        error = abs(old_q[a] - new_q[a])
        return error, p

    def _loss(self, model, x, y):
        y_ = self.model(x)
        #loss = sum(sum(tf.nn.softmax_cross_entropy_with_logits(y, y_)))
        loss = tf.nn.softmax_cross_entropy_with_logits(y, y_)

        return loss

    def _grad(self, model, inputs, targets):
        with tf.GradientTape() as tape:
            loss_value = self._loss(self.model, inputs, targets)
        return loss_value, tape.gradient(loss_value,
                                         self.model.trainable_variables)

    def train(self, s, q):
        loss_value, grads = self._grad(self.model, s, q)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables),
            get_or_create_global_step())
Example #10
def train(sess, env, args, actor, critic, actor_noise, desired_goal_dim, achieved_goal_dim, observation_dim):
    # Set path to save results
    tensorboard_dir = './' + args['env'] + '_' + args['name'] + '/train_' + datetime.now().strftime('%Y-%m-%d-%H')
    model_dir = './' + args['env'] + '_' + args['name'] + '/model'

    # add summary to tensorboard
    summary_ops, summary_vars = build_summaries()

    # initialize variables, create writer and saver
    sess.run(tf.compat.v1.global_variables_initializer())
    saver = tf.compat.v1.train.Saver()
    writer = tf.compat.v1.summary.FileWriter(tensorboard_dir, sess.graph)

    # restore session if exists
    try:
        saver.restore(sess, os.path.join(model_dir, args['env'] + '_' + args['name'] + '.ckpt'))
        print('------------------------Continue--------------------------')
    except:
        print('----------------------New Training------------------------')

    # initialize target network weights and replay memory
    actor.update()
    critic.update()
    replay_memory = Memory(int(args['memory_size']), int(args['seed']))

    # train in loop
    for i in range(int(args['episodes'])):

        # reset gym, get achieved_goal, desired_goal, state
        achieved_goal, desired_goal, s, s_prime = unpack(env.reset())
        episode_reward = 0
        episode_maximum_q = 0

        for j in range(int(args['episode_length'])):

            # render episode
            if args['render']:
                env.render()

            # predict action and add noise
            a = actor.predict(np.reshape(s, (1, actor.state_dim)))
            a = a + actor_noise.get_noise()

            # play
            obs_next, reward, done, info = env.step(a[0])
            achieved_goal, desired_goal, state_next, state_prime_next = unpack(obs_next)

            # add normal experience to memory
            replay_memory.add(np.reshape(s, (actor.state_dim,)),
                              np.reshape(a, (actor.action_dim,)),
                              reward,
                              done,
                              np.reshape(state_next, (actor.state_dim,)))

            # add hindsight experience to memory
            substitute_goal = achieved_goal.copy()
            substitute_reward = env.compute_reward(achieved_goal, substitute_goal, info)
            replay_memory.add(np.reshape(s_prime, (actor.state_dim,)),
                              np.reshape(a, (actor.action_dim,)),
                              substitute_reward,
                              True,
                              np.reshape(state_prime_next, (actor.state_dim,)))

            # start to train when there's enough experience
            if replay_memory.size() > int(args['batch_size']):
                s_batch, a_batch, r_batch, d_batch, s2_batch = replay_memory.sample_batch(int(args['batch_size']))

                # find TD -- temporal difference
                # actor find target action
                a2_batch = actor.predict_target(s2_batch)

                # critic find target q
                q2_batch = critic.predict_target(s2_batch, a2_batch)

                # add the discounted target Q to the reward if not done
                r_batch_discounted = []
                for k in range(int(args['batch_size'])):
                    if d_batch[k]:
                        r_batch_discounted.append(r_batch[k])
                    else:
                        r_batch_discounted.append(r_batch[k] + critic.gamma * q2_batch[k])

                # train critic with state, action, and reward
                pred_q, _ = critic.train(s_batch,
                                         a_batch,
                                         np.reshape(r_batch_discounted, (int(args['batch_size']), 1)))

                # record maximum q
                episode_maximum_q += np.amax(pred_q)

                # actor find action
                a_outs = actor.predict(s_batch)

                # get comment from critic
                comment_gradients = critic.get_comment_gradients(s_batch, a_outs)

                # train actor with state and the comment gradients
                actor.train(s_batch, comment_gradients[0])

                # Update target networks
                actor.update()
                critic.update()

            # record reward and move to next state
            episode_reward += reward
            s = state_next

            # if episode ends
            if done:
                # write summary to tensorboard
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: episode_reward,
                    summary_vars[1]: episode_maximum_q / float(j)
                })
                writer.add_summary(summary_str, i)
                writer.flush()

                # print out results
                print('| Episode: {:d} | Reward: {:d} '.format(i, int(episode_reward)))
                # save model
                saver.save(sess, os.path.join(model_dir, args['env'] + '_' + args['name'] + '.ckpt'))

                break
    return
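
Example #10 builds each hindsight transition by substituting the achieved goal for the desired goal, so compute_reward scores the step against what actually happened. The unpack helper it calls is not shown; a hypothetical version, assuming a goal-based Gym observation dict with 'observation', 'achieved_goal' and 'desired_goal' keys:

import numpy as np


def unpack(obs):
    achieved_goal = obs['achieved_goal']
    desired_goal = obs['desired_goal']
    # state for the normal transition: observation concatenated with the desired goal
    s = np.concatenate([obs['observation'], desired_goal])
    # state for the hindsight transition: observation concatenated with the achieved goal
    s_prime = np.concatenate([obs['observation'], achieved_goal])
    return achieved_goal, desired_goal, s, s_prime
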
Example #11
class Agent:
    def __init__(self, actions, gamma=0.7, e_greedy = 0.7):
        self.actions = actions  # a list
        self.gamma = gamma
        self.epsilon = e_greedy
        self.lr = 0.01
        self.count = 0
        self.epochs = 50
        self.bar = Progbar(self.epochs)
        self.epoch_loss_avg = tf.keras.metrics.Mean()

        self.batch_size = 100
        self.state_size = 2
        self.record_size = 200

        # initialize the models: the online (hard-working) one and the frozen (dump) copy
        M = Build_Model(self.state_size, 16, len(actions))
        self.model = M.build()
        self.dump_model = copy.copy(self.model)
        self.optimizer = tf.optimizers.Adam(lr = self.lr)
        # initialize the memory backed by a sum tree
        self.capacity = 200
        self.memory = Memory(self.capacity)

    def choose_action(self, s):
        # action selection
        if np.random.uniform() < self.epsilon:
            # choose best action
            state_action = self.model.predict([[s]])[0]
            action = np.argmax(state_action)
            #action = np.random.choice([i for i in range(len(state_action)) if state_action[i] == max(state_action)])
        else:
            # choose random action
            action = np.random.choice(self.actions)
        return action

    def store(self, s, a, r, s_, done):
        self.memory.add(1, [s, a, r, s_, done])
        self.count += 1

    def learn(self):
        loss_record = []
        batch, idxs, is_weight = self.memory.sample(self.batch_size)
        X_train = np.zeros((self.batch_size, self.state_size))
        Y_train = [np.zeros(len(self.actions)) for i in range(self.batch_size)]
        for i in range(self.batch_size):
            _s, _a, _r, _s_, done_ = batch[i][0], batch[i][1], batch[i][2], batch[i][3], batch[i][4]
            q_list, loss = self.get_loss(_s, _a, _r, _s_, done_)
            loss_record.append(loss)
            X_train[i] = _s
            for i_ in range(len(self.actions)):
                Y_train[i][i_] = q_list[i_]


        # Train!
        print('-----------Training-----------')
        for i in range(self.epochs):
            self.train(self.model, X_train, Y_train)
            self.bar.update(i, values=[('loss', self.epoch_loss_avg.result().numpy())])

        # update prioritized experience
        for i in range(self.batch_size):
            loss = loss_record[i] * is_weight[i]
            self.memory.update(idxs[i], loss)

    
    def get_loss(self, s, a, r, s_, done):
        # calculate the Q target and the TD error used as the priority
        q_list = self.model.predict([[s]])[0]
        q_predict = q_list[a]
        qvalue = self.dump_model.predict([[s_]])[0][np.argmax(q_list)]
        if done:
            q_target = r
        else:
            q_target = r + self.gamma * qvalue
        loss = q_target - q_predict
        q_list[a] = q_target

        return q_list, loss

    def _loss(self, model, x, y):
        x = np.array(x)
        y_ = model(x)
        loss = huber_loss(y, y_)
        return loss
	
    def _grad(self, model, inputs, targets):
        with tf.GradientTape() as tape:
            loss_value = self._loss(model, inputs, targets)
            self.epoch_loss_avg(loss_value)
            return tape.gradient(loss_value, self.model.trainable_variables)

    def train(self, model, s, q):
        grads = self._grad(model, s, q)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables),
            get_or_create_global_step())
Example #12
class Agent:
    def __init__(self, config):
        self.config = config
        self.epsilon = self.config.explore_start
        print("start of epsilon is ", self.epsilon)
        self.brain = MModel(self.config.action_size, self.config.state_size[0],
                            self.config.state_size[1],
                            self.config.state_size[2])
        self.memory = Memory(self.config.memory_size)
        self.num_actions = self.config.action_size
        self.decayStep = 0
        #self.stacked_frames is 4 for now

    def act(self, StackOfImage):
        # input is the stack of images with shape (84, 84, 4); the model
        # expects a batch of shape (batch_size, height, width, depth)
        # choose which action to take with epsilon-greedy exploration

        #actionsToTake = np.zeros([self.num_actions]) # action at t a_t[0,0]
        if random.random() <= self.epsilon:  #randomly explore an action
            #print("----------Random Action----------")
            #print(self.epsilon)
            action_index = random.randrange(self.num_actions)  # it will be 0,1
            #actionsToTake[action_index]=1
            #print(action_index)
        else:
            q = self.brain.predict(
                StackOfImage.reshape((1, *StackOfImage.shape)))
            #print(q)
            #self.display(q[0][0],q[0][1])
            action_index = np.argmax(q)
            #actionsToTake[action_index]=1
            #print(action_index)
        return action_index

    def exploreLess(self):
        self.decayStep += 1
        # the decay below (commented out) drops too fast and does not slow down over time
        # self.epsilon = max(self.config.explore_stop, self.epsilon * np.exp(-self.config.decay_rate*self.decayStep))
        self.epsilon = self.config.explore_stop + (
            self.config.explore_start - self.config.explore_stop) * np.exp(
                -self.config.decay_rate * self.decayStep)

    def remember(self, experience):
        self.memory.add(experience)
        #print("inside remember function",experience[0].shape)

    def replay(self):
        # replay means learn; it also decays epsilon
        self.exploreLess()

        targets = np.zeros((self.config.batch_size, self.config.action_size))
        batch = self.memory.sample(self.config.batch_size)
        #print("batch size is ",batch[0][0].shape)
        states_mb = np.array([each[0] for each in batch], ndmin=3)
        # ndmin stacks to the last dimension: from (64, 4, 84, 84) to (64, 84, 84, 4)
        actions_mb = np.array([each[1] for each in batch])
        rewards_mb = np.array([each[2] for each in batch])
        next_states_mb = np.array([each[3] for each in batch], ndmin=3)
        dones_mb = np.array([each[4] for each in batch])
        Qs_targets = self.getQvalue(states_mb)
        #print(next_states_mb.shape)
        Qs_nextState = self.getQvalue(next_states_mb)  # 64*4

        for i in range(0, len(batch)):
            terminal = dones_mb[i]
            if terminal:
                actionNumber = np.argmax(Qs_targets[i])
                Qs_targets[i][actionNumber] = rewards_mb[i]
            else:
                actionNumber = np.argmax(Qs_targets[i])
                Qs_targets[i][
                    actionNumber] = rewards_mb[i] + self.config.gamma * np.max(
                        Qs_nextState[i])  # the bootstraping way to get Q value.
        #targets_np=np.array(target_Qs_batch)# change to numpy array
        loss = self.brain.train(states_mb, Qs_targets)

    def getQvalue(self, stackOfImage):
        return self.brain.predict(stackOfImage)

    def saveModel(self, name):
        self.brain.save(self.config.checkpoints + '/' + name + '.ckpt')

    def getEpsilon(self):
        return self.epsilon
Example #13
class NoisyQ:
    def __init__(self, actions, gamma=0.1, e_greedy=0.9):
        self.actions = actions  # a list
        self.gamma = gamma
        self.epsilon = e_greedy
        self.lr = 0.1
        self.count = 0
        self.epochs = 5
        # initialize the models: the online (hard-working) one and the frozen (dump) copy
        self.m = Build_Model(1, 10, len(actions))
        self.model = self.m.model
        self.dump_model = copy.copy(self.model)
        # initialize the memory backed by a sum tree
        self.capacity = 200
        self.memory = Memory(self.capacity)

    def choose_action(self, s):
        # action selection
        if np.random.uniform() < self.epsilon:
            # choose best action
            state_action = self.model.predict([s])[0]
            # break ties randomly among the best actions
            action = np.random.choice([
                i for i in range(len(state_action))
                if state_action[i] == max(state_action)
            ])
        else:
            # choose random action
            action = np.random.choice(self.actions)
        return action

    def learn(self, s, a, r, s_):
        batch_size = 100
        record_size = 300
        s, a, r, s_, q_list, loss = self.q_value_cal(s, a, r, s_)
        self.memory.add(loss, [s, a, r, s_, q_list])
        self.count += 1

        # train when do record_size times actions.
        if self.count % record_size == 0:
            batch, idxs, is_weight = self.memory.sample(batch_size)
            Train = copy.copy(batch)
            X_train = np.array(Train)[:, 0]
            Y_train = np.array([i for i in np.array(Train)[:, 4]])
            print(X_train.shape)
            print(Y_train.shape)
            self.model.fit(X_train, Y_train, epochs=self.epochs)

            # update prioritized experience
            for i in range(batch_size):
                _s, _a, _r, _s_ = batch[i][0], batch[i][1], batch[i][2], batch[
                    i][3]
                loss = self.q_value_cal(_s, _a, _r, _s_)[5]
                self.memory.update(idxs[i], loss)

    def q_value_cal(self, s, a, r, s_):
        # calculate q value
        q_list = self.model.predict([s])[0]
        q_predict = q_list[a]
        qvalue = self.dump_model.predict([s_])[0][np.argmax(q_list)]
        #q_target = r + self.gamma * qvalue
        loss = r + self.gamma * qvalue - q_predict  # TD error used as the priority
        q_list[a] += r + self.gamma * qvalue - q_predict
        return s, a, r, s_, q_list, loss
class DoubleDQN:
    def __init__(self, state_shape, action_size, discount_rate=0.97, epsilon=1.0,
                 epsilon_decay=0.98, mem_size=20001, batch_size=32, tau=0.05):
        self.state_shape = state_shape
        self.action_size = action_size
        self.epsilon = epsilon
        self.epsilon_min = 0.05
        self.epsilon_decay = epsilon_decay
        # self.epsilon_speed = episodes * 2
        self.discount_rate = discount_rate
        self.tau = tau
        self.memory = Memory(mem_size)
        self.batch_size = batch_size
        self.optimizer = Adam()
        self.model = self._build_model()
        self.target_model = self._build_model()

    def _build_model(self):
        inp = Input(self.state_shape)
        out = Dense(48, activation='relu')(inp)
        out = Dense(32, activation='relu')(out)
        out = Dense(self.action_size, activation='linear')(out)
        model = Model(inp, out)
        model.compile(loss="mse", optimizer=self.optimizer)

        return model

    def choose_action(self, state):
        prob = np.random.uniform(0, 1)
        if prob >= self.epsilon:
            return np.argmax(self.model.predict(np.array([state]))[0])
        else:
            return np.random.randint(self.action_size)

    def update_epsilon(self, loop_counter):
        if self.epsilon > self.epsilon_min:
            # self.epsilon *= 1/(1+loop_counter/self.epsilon_speed)
            self.epsilon *= self.epsilon_decay

    def train(self):
        """
        samples is a set of (S, A, R, S', done)
        """
        samples = self.memory.sample(self.batch_size)
        states = tf.convert_to_tensor(np.array([sample[0] for sample in samples]), dtype='float32')
        actions = np.array([sample[1] for sample in samples])
        rewards = np.array([sample[2] for sample in samples])
        next_states = np.array([sample[3] for sample in samples])
        done = np.array([sample[4] for sample in samples])
        maxQ_next_actions_index = np.argmax(self.model.predict_on_batch(next_states), axis=1)
        target_next_Qvals = np.array([self.target_model.predict_on_batch(next_states)[i, maxQ_next_actions_index[i]]
                                      for i in range(len(samples))])
        # zero out the bootstrap term for terminal transitions
        target_Qvals = rewards + self.discount_rate * (1 - done) * target_next_Qvals

        out = self.model(states).numpy()
        for i in range(len(samples)):
            out[i][actions[i]] = target_Qvals[i]
        self.model.train_on_batch(states, out)

        # def train(self):
        #     """
        #     samples is a set of (S, A, R, S', done)
        #     """
        # samples = self.memory.sample(self.batch_size)
        #
        # for i in range(len(samples)):
        #     state = tf.convert_to_tensor(samples[i][0], dtype='float32')
        #     action = samples[i][1]
        #     reward = samples[i][2]
        #     next_state = samples[i][3]
        #     done = samples[i][4]
        #     target_next_Qval = self.target_model.predict(next_state)[0, np.argmax(self.model.predict(next_state)[0])]
        #     target_Qval = reward + self.discount_rate * done * target_next_Qval
        #     with tf.GradientTape() as tape:
        #         out = self.model(state)
        #         y_train = out.numpy()
        #         y_train[0][action] = target_Qval
        #         loss = (out - y_train) ** 2
        #     grad = tape.gradient(loss, self.model.trainable_variables)
        #     self.optimizer.apply_gradients(zip(grad, self.model.trainable_variables))

    def remember(self, state, action, reward, next_state, done):
        self.memory.add([state, action, reward, next_state, done])

    def update_target(self):
        weights = self.model.trainable_variables
        target_weights = self.target_model.trainable_variables
        for i in range(len(weights)):
            target_weights[i].assign(self.tau * weights[i] + (1 - self.tau) * target_weights[i])

    def save(self, path):
        self.model.save(path + "/model.h5")
        self.target_model.save(path + "/target_model.h5")

    def load(self, path):
        self.model = load_model(path + "/model.h5")
        self.target_model = load_model(path + "/target_model.h5")
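
An illustrative driver for the DoubleDQN class above, assuming the classic four-value Gym step API used elsewhere on this page and a Memory that exposes a buffer attribute (every name other than DoubleDQN and its methods is an assumption):

import gym

env = gym.make('CartPole-v1')
agent = DoubleDQN(env.observation_space.shape, env.action_space.n)

for episode in range(300):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        # train and softly update the target network once enough experience is stored
        if len(agent.memory.buffer) > agent.batch_size:
            agent.train()
            agent.update_target()
        state = next_state
    agent.update_epsilon(episode)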