Example #1
import numpy as np

# Assumes SS (state-space dimension), act_set (number of actions), episodes,
# Car (a MountainCar instance), weight() (linear Q-value computation) and
# Action_select() (epsilon-greedy policy) are defined elsewhere.
def Q_train(alpha, gamma, epsilon, max_iterations):
    w = np.zeros((SS, act_set))     # One column of weights per action
    b = 0                           # Bias term
    Rewards = []                    # Total reward collected in each episode

    for noe in range(episodes):
        state = Car.reset()
        r = 0                       # Cumulative reward for this episode
        done = False

        for m in range(max_iterations):
            if done:
                break

            # Current Q-values and epsilon-greedy action
            q_vals = weight(state, w, b)
            a = Action_select(q_vals, epsilon)
            Q = q_vals[a]
            Sprime, reward, done = Car.step(a)

            # Q-value of the best action in the next state
            Qprime = weight(Sprime, w, b)
            Q_next = max(Qprime)

            # Gradient update: scaled TD error times the state features
            grad = alpha * (Q - (reward + gamma * Q_next))
            for j in state.keys():          # state is a dict: feature index -> value
                w[j][a] = w[j][a] - grad * state[j]

            b = b - grad
            state = Sprime
            r += reward

            # Render only every 1000 episodes; rendering each episode slows training
            if noe % 1000 == 0:
                Car.render()

        Rewards.append(r)

    Car.close()
    return w, b, Rewards
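
Example #1 relies on two helpers that are not shown here: weight(), which computes the vector of Q-values for a state under the linear approximation, and Action_select(), which implements the epsilon-greedy policy. A minimal sketch of what they might look like, assuming the dictionary state representation used above (names and signatures are inferred from how Example #1 calls them, not taken from the original):

import numpy as np

def weight(state, w, b):
    # Hypothetical helper: q(s, a) = sum_j w[j, a] * s[j] + b for a state given
    # as a dict {feature index: value}; returns one Q-value per action.
    q_vals = np.full(w.shape[1], b, dtype=float)
    for j, v in state.items():
        q_vals += w[j] * v
    return q_vals

def Action_select(q_vals, epsilon):
    # Hypothetical helper: epsilon-greedy selection over the Q-value vector.
    if np.random.rand() < epsilon:
        return np.random.randint(len(q_vals))   # explore: uniform random action
    return int(np.argmax(q_vals))               # exploit: greedy action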
Example #2
import sys
import numpy as np

# Assumes MountainCar and the helpers getAction, getBestAction and QValue are
# imported/defined elsewhere in the original file.
def main():
    (program, mode, weight_out, returns_out, episodes, max_iterations, epsilon,
     gamma, alpha) = sys.argv
    episodes, max_iterations = int(episodes), int(max_iterations)
    epsilon, gamma, alpha = float(epsilon), float(gamma), float(alpha)
    # Output files
    w_out = open(weight_out, 'w')
    r_out = open(returns_out, 'w')
    # Initialize Mountain Car
    car = MountainCar(mode=mode)
    actions, num_actions = (0, 1, 2), 3
    # Weights: <dim(S)> by <num_actions> matrix
    w = np.zeros((car.state_space, num_actions))
    bias = 0

    # Represent the state as a numpy array
    def state_rep(state_dict, mode):
        if mode == "raw":
            state = np.asarray(list(state_dict.values()))
        elif mode == "tile":
            state = np.zeros(2048)      # 2048 = state-space size in tile mode
            for key in state_dict:
                state[key] = 1          # one-hot encode the active tile indices
        return state

    # Training loop over episodes
    for i in range(episodes):
        # Per-episode counters
        num_iters = 0
        total_rewards = 0
        # Reset the environment; the state comes back as a dictionary
        state_dict = car.reset()
        # Convert to numpy array
        state = state_rep(state_dict, mode)

        while num_iters < max_iterations:
            num_iters += 1

            # Epsilon-greedy action selection
            action = getAction(state, actions, epsilon, w, bias)

            # Take the action and observe the transition
            (next_state_dict, reward, done) = car.step(action)

            # Add current reward
            total_rewards += reward

            # Next state, get best action for next state
            next_state = state_rep(next_state_dict, mode)
            next_best_action = getBestAction(next_state, actions, w, bias)
            next_state_best_Q = QValue(next_state, next_best_action, w, bias)

            # TD target and TD error
            sample = reward + (gamma * next_state_best_Q)
            diff = QValue(state, action, w, bias) - sample

            # Gradient update of the chosen action's weights and the bias
            w[:, action] = w[:, action] - alpha * diff * state
            bias = bias - alpha * diff      # gradient of q with respect to the bias is 1

            # Move to the next state, or stop if the episode has ended
            if not done:
                state = next_state
            else:
                break

        # Write this episode's total reward
        r_out.write(str(total_rewards) + "\n")

    # Write the bias, then the weights in row-major order
    w_out.write(str(bias) + '\n')
    for row in w:
        for elem in row:
            w_out.write(str(elem) + '\n')

    # Close
    car.close()
    w_out.close()
    r_out.close()
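
The snippet does not show how main() is invoked; it unpacks its parameters directly from sys.argv, so the script expects exactly eight arguments after the program name. A minimal entry point under that assumption (the script name and the concrete values below are placeholders, not from the original):

if __name__ == "__main__":
    # Expected argument order, taken from the sys.argv unpacking in main():
    #   <mode> <weight_out> <returns_out> <episodes> <max_iterations> <epsilon> <gamma> <alpha>
    # Hypothetical example:
    #   python q_learning.py tile weight.out returns.out 400 200 0.05 0.99 0.00005
    main()
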
# Assumes numpy (np), the MountainCar environment and a module-level Debug flag
# are imported/defined elsewhere in the original file.
class qlearning(object):
    def __init__(self, mode, epsilon, gamma, learning_rate):
        self.epsilon = epsilon
        self.gamma = gamma
        self.lr = learning_rate
        self.mode = mode
        self.env = MountainCar(mode)
        self.state_space = self.env.state_space
        self.action_space = 3

        # Linear function approximation: one weight column per action, plus a bias
        self.W = np.zeros((self.state_space, self.action_space))
        self.b = 0

    # Given the current state, approximate the action values q(s, a) for all actions
    def linear_approx(self, state):
        return state.dot(self.W) + self.b

    # choose an action based on epsilon-greedy method
    def select_action(self, state):
        if np.random.rand() < self.epsilon:
            # selects uniformly at random from one of the 3 actions (0, 1, 2) with probability ε
            return np.random.randint(0, self.action_space)
        else:
            # selects the optimal action with probability 1 − ε
            # In case of multiple maximum values, return the first one
            return np.argmax(self.linear_approx(state))

    # Convert the state dictionary returned by the environment into a numpy vector
    def transfer_state(self, state):
        if self.mode == "raw":
            return np.fromiter(state.values(), dtype=float)
        elif self.mode == "tile":
            idx = sorted(state.keys())
            trans_state = np.zeros(self.state_space)
            trans_state[idx] = 1        # one-hot encode the active tile indices
            return trans_state
        else:
            raise ValueError("Unsupported mode: " + str(self.mode))

    def run(self, weight_out, returns_out, episodes, max_iterations):
        with open(returns_out, 'w') as f_returns:
            # perform training
            for episode in range(episodes):
                rewards = 0
                state = self.transfer_state(self.env.reset())
                if Debug:
                    print("episode " + str(episode) + " init state: ", end="")
                    print(state)
                for i in range(max_iterations):
                    # epsilon-greedy action, then one environment step
                    action = self.select_action(state)
                    next_state, reward, done = self.env.step(action)
                    next_state = self.transfer_state(next_state)

                    if Debug and i % 100 == 0:
                        print("episode " + str(episode) + " iter " + str(i) +
                              ", action: " + str(action) + " next state: ",
                              end="")
                        print(next_state)

                    # Q-learning update with linear function approximation
                    cur_q = self.linear_approx(state)
                    next_q = self.linear_approx(next_state)
                    td_error = cur_q[action] - (reward + self.gamma * np.max(next_q))
                    # update the weights of the chosen action (gradient is the state vector)
                    self.W[:, action] = self.W[:, action] - self.lr * td_error * state
                    # update the bias (gradient is 1)
                    self.b = self.b - self.lr * td_error

                    state = next_state
                    rewards += reward
                    if done:
                        break

                f_returns.write(str(rewards) + "\n")
                if Debug:
                    print("[episode ", episode + 1, "] total rewards: ",
                          rewards)

        with open(weight_out, 'w') as f_weight:
            f_weight.write(str(self.b) + "\n")
            # write the values of weights in row major order
            for i in range(self.W.shape[0]):
                for j in range(self.W.shape[1]):
                    f_weight.write(str(self.W[i][j]) + "\n")

        # visualization
        # self.env.render()

    def close(self):
        self.env.close()
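
The qlearning class encapsulates the same training loop as the main() function earlier, but the snippet does not show how it is driven. A minimal driver sketch, assuming the same command-line arguments as main() and a module-level Debug flag (both assumptions, not part of the original):

import sys

Debug = False   # run() checks this module-level flag before printing traces

if __name__ == "__main__":
    (program, mode, weight_out, returns_out, episodes, max_iterations,
     epsilon, gamma, learning_rate) = sys.argv
    agent = qlearning(mode, float(epsilon), float(gamma), float(learning_rate))
    agent.run(weight_out, returns_out, int(episodes), int(max_iterations))
    agent.close()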