def train(env, nb_episodes, alpha=0.000045, gamma=0.98, show_result=False):
    """ score """

    weight = np.random.rand(4, 2)
    episode_rewards = []

    for episode in range(nb_episodes):
        state = env.reset()[None, :]
        gradients = []
        rewards = []
        score = 0

        while True:
            if show_result and (episode % 1000 == 0):
                env.render()
            action, grad = policy_gradient(state, weight)
            next_state, reward, done, _ = env.step(action)
            next_state = next_state[None, :]
            gradients.append(grad)
            rewards.append(reward)
            score += reward
            state = next_state

            if done:
                break

        for i in range(len(gradients)):
            weight += alpha * gradients[i] *\
                sum([r * gamma ** t for t, r in enumerate(rewards[i:])])

        episode_rewards.append(score)

        print("{}: {}".format(episode, score), end="\r", flush=False)

    return episode_rewards
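
Every example in this collection assumes a policy_gradient(state, weight) helper that samples an action from a softmax policy and returns the gradient of log pi(action | state) with respect to the weights. Its implementation is not shown here; a minimal sketch consistent with the shapes used in these examples (state of shape (1, 4), weight of shape (4, 2)) might look like:

import numpy as np


def policy(state, weight):
    """Softmax action probabilities for a linear policy."""
    z = state.dot(weight)                 # shape (1, n_actions)
    exp = np.exp(z - np.max(z))           # numerically stable softmax
    return exp / exp.sum()


def policy_gradient(state, weight):
    """Sample an action and return (action, d log pi(a|s) / d weight)."""
    probs = policy(state, weight)                          # (1, n_actions)
    action = np.random.choice(probs.shape[1], p=probs[0])
    dlog = -probs                                          # d log softmax / dz
    dlog[0, action] += 1
    grad = state.T.dot(dlog)                               # (n_obs, n_actions)
    return action, grad
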
def train(env, nb_episodes, alpha=0.000045, gamma=0.98, show_result=False):
    """ write a function that implements a full training"""
    env = gym.make('CartPole-v0')
    np.random.seed(1)
    weight = np.random.rand(4, 2)
    episode_rewards = []
    for e in range(nb_episodes):
        state = env.reset()[None, :]
        grads = []
        rewards = []
        score = 0
        while True:
            if show_result and (e % 1000 == 0):
                env.render()
            action, grad = policy_gradient(state, weight)
            next_state, reward, done, _ = env.step(action)
            next_state = next_state[None, :]
            grads.append(grad)
            rewards.append(reward)
            score += reward
            state = next_state
            if done:
                break
        for i in range(len(grads)):
            weight += alpha * grads[i] *\
                sum([r * (gamma ** t) for t, r in enumerate(rewards[i:])])
        episode_rewards.append(score)
    return episode_rewards
Example #3
def train(env, nb_episodes, alpha=0.000045, gamma=0.98, show_result=False):
    '''implements a full training.
    Args:
        env: initial environment
        nb_episodes: number of episodes used for training
        alpha: the learning rate
        gamma: the discount factor
    Return: all values of the score (sum of all rewards during one episode loop)
    '''
    W = np.random.rand(4, 2)
    episode_rewards = []
    for e in range(nb_episodes):
        state = env.reset()[None, :]
        grads = []
        rewards = []
        score = 0
        while True:
            if show_result and (e % 1000 == 0):
                env.render()
            action, grad = policy_gradient(state, W)
            next_state, reward, done, info = env.step(action)
            next_state = next_state[None, :]
            grads.append(grad)
            rewards.append(reward)
            score += reward
            state = next_state
            if done:
                break
        for i in range(len(grads)):
            W += alpha * grads[i] *\
                 sum([r * gamma**t for t, r in enumerate(rewards[i:])])

        episode_rewards.append(score)
        print("{}: {}".format(e, score), end="\r", flush=False)
    return episode_rewards
Example #4
def eval_round(i, docs):
    print('Training %d' % i)
    theta = policy_gradient.policy_gradient(docs)
    print('Theta:', list(theta))
    print('Evaluating %d' % i)
    doc_pct, cmd_pct = evaluate("data/sendacard_mturk_corpus.tsv", theta, "http://localhost:8000")
    print(i, "Doc Pct:", doc_pct, "Cmd Pct:", cmd_pct)

    return doc_pct, cmd_pct
def train(env, nb_episodes, alpha=0.000045, gamma=0.98, show_result=False):
    """
    Function to train an agent applying monte carlo policy gradient.

    - env is the initial environment
    - nb_episodes is the number of episodes used for training
    - alpha is the learning rate
    - gamma is the discount factor

    Returns: all values of the score (sum of all rewards during
    one episode loop).
    """
    n_obs = env.observation_space.shape[0]
    n_actions = env.action_space.n
    policy_weights = np.random.rand(n_obs, n_actions)

    scores = []
    for episode in range(nb_episodes):
        state = env.reset()[None, :]
        gradients = []
        rewards = []
        score = 0

        # Run an episode
        done = False
        while not done:
            if show_result and episode % 1000 == 0:
                env.render()

            action, gradient = policy_gradient(state, policy_weights)
            state, reward, done, _ = env.step(action)
            state = state[None, :]

            gradients.append(gradient)
            rewards.append(reward)
            score += reward

        scores.append(score)

        # Policy update
        num_steps = len(gradients)
        discount_factor = gamma**np.arange(num_steps)
        for i in range(num_steps):
            rews_after_step = rewards[i:]
            discount_factors = discount_factor[:len(rews_after_step)]
            disc_reward = np.sum(rews_after_step * discount_factors)

            policy_weights += alpha * gradients[i] * disc_reward
        print("{}: {}".format(episode, score), end="\r", flush=False)

    return scores
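
For reference, one possible way to drive a train function like the one above (assuming the old gym API, in which env.reset() returns only the observation and env.step() returns four values, as all the examples here expect; the episode count is illustrative):

import gym
import numpy as np

env = gym.make('CartPole-v1')
np.random.seed(1)

scores = train(env, nb_episodes=10000, alpha=0.000045, gamma=0.98, show_result=False)
env.close()
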
Example #6
def play_episode(env, weight, i, show_result):
    """Plays a single i"""
    state = env.reset()[None, :]
    state_action_reward_grad = []
    while True:
        if show_result and (i % 1000 == 0):
            env.render()
        action, grad = policy_gradient(state, weight)
        state, reward, done, _ = env.step(action)
        state = state[None, :]
        state_action_reward_grad.append((state, action, reward, grad))
        if done:
            break
    env.close()
    return state_action_reward_grad
def single_episode(env, weight, episode, show_result):
    """play one episode"""
    state = env.reset()[None, :]
    return_grad = []

    while True:
        if show_result and (episode % 1000 == 0):
            env.render()
        action, grad = policy_gradient(state, weight)
        state, reward, done, _ = env.step(action)
        state = state[None, :]
        return_grad.append((state, action, reward, grad))
        if done:
            break
    env.close()
    return return_grad
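
These two helpers only collect the trajectory and leave the policy update to the caller. A hypothetical companion update (the update_weights name and signature are illustrative; it applies the same REINFORCE rule as the full train functions in this collection) could consume the (state, action, reward, grad) tuples like this:

def update_weights(weight, trajectory, alpha=0.000045, gamma=0.98):
    """Apply the REINFORCE update from a list of (state, action, reward, grad) tuples."""
    rewards = [r for _, _, r, _ in trajectory]
    for i, (_, _, _, grad) in enumerate(trajectory):
        # discounted return from step i to the end of the episode
        G = sum(r * gamma ** t for t, r in enumerate(rewards[i:]))
        weight += alpha * grad * G
    return weight
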
def train(env, nb_episodes, alpha=0.000045, gamma=0.98, show_result=False):
    """Train agent cartpole game

    Args:
    -> env: initial environment
    -> nb_episodes: number of episodes used for training
    -> alpha: the learning rate
    -> gamma: the discount factor
    -> show_result: render the environment every 1000 episodes computed

    Return:
    all values of the score (sum of all rewards during one episode loop)
    """
    w = np.random.rand(4, 2)
    all_scores = []
    for ep in range(nb_episodes):
        state = env.reset()[None, :]

        rewards = []
        score = 0
        grads = []

        while True:
            if show_result and (ep % 1000 == 0):
                env.render()

            action, grad = policy_gradient(state, w)
            new_st, reward, done, _ = env.step(action)
            new_st = new_st[None, :]

            grads.append(grad)
            rewards.append(reward)
            score += reward

            state = new_st

            if done:
                break

        for i in range(len(grads)):
            discounts = sum([r * gamma**t for t, r in enumerate(rewards[i:])])
            w += alpha * grads[i] * discounts

        all_scores.append(score)
        print("Ep: {}, Score: {}".format(ep, score), end='\r', flush=False)

    return all_scores
def train(env, nb_episodes, alpha=0.000045, gamma=0.98, show_result=False):
    """ train the policy gradients
        env: the initia environment (from openai gym)
        nb_episodes: number of episodes for training
        alpha: learning rate
        gamma: discount factor
        Returns: all vlaues of the score (sum of rewards during ea. episode)
    """
    # initializ future return
    scores = []
    # initialize rnadom starting weights
    weights = np.random.rand(env.observation_space.shape[0],
                             env.action_space.n)
    # loop through episodes performing steps
    for ep in range(nb_episodes):
        state = env.reset()[None, :]
        # initialize variables for the ep
        grads = []
        rewards = []
        actions = []
        done = False
        counter = 0
        # run episode
        while not done:
            if show_result and ep % 1000 == 0:
                env.render()
            # if using colab, beware of import changes
            action, grad = pg.policy_gradient(state, weights)
            state, reward, done, info = env.step(action)
            state = state[None, :]
            grads.append(grad)
            rewards.append(reward)
            actions.append(action)
            counter += 1
        # when episodes ended calculate rewards/new weights
        for i in range(len(grads)):
            # Loop through everything that happened in the episode
            rew = sum([r * (gamma**t) for t, r in enumerate(rewards[i:])])
            weights += alpha * grads[i] * rew
        # end_reward = 0
        # for i in range(counter)
        #     end_reward = reward[counter - i] + end_reward * gamma
        #     weights[:, action] += alpha * grad[:, action] *
        scores.append(sum(rewards))
        print(ep, scores[ep], end="\r", flush=False)
    return scores
def train(env, nb_episodes, alpha=0.00045, gamma=0.98, show_result=False):
    """Train a policy based Monte Carlo/REINFORCE algorithm"""
    scores = []
    weights = np.random.rand(env.observation_space.shape[0],
                             env.action_space.n)
    for episode in range(nb_episodes):
        state = env.reset()[None, :]

        grads = []
        rewards = []
        actions = []
        done = 0
        while not done:
            if show_result and not episode % 1000:
                env.render()
            action, grad = policy_gradient(state, weights)
            # print("step", action, grad)
            state, reward, done, info = env.step(action)
            state = state[None, :]
            grads.append(grad)
            rewards.append(reward)
            actions.append(action)
        total_reward = 0
        for grad, reward, action in zip(grads[::-1], rewards[::-1],
                                        actions[::-1]):
            total_reward = reward + total_reward * gamma
            weights[:, action] += alpha * grad[:, action] * total_reward
        scores.append(sum(rewards))
        print(episode, sum(rewards))
    return scores
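
The backward recursion used in this example (total_reward = reward + gamma * total_reward) computes the same discounted returns as the forward sums sum(r * gamma ** t ...) used in the earlier examples, just in O(T) instead of O(T^2) time. A quick check of that equivalence:

import numpy as np

rewards = [1.0] * 5          # CartPole yields a reward of 1.0 per step
gamma = 0.98

# forward form: G_i = sum_t gamma**t * rewards[i + t]
forward = [sum(r * gamma ** t for t, r in enumerate(rewards[i:]))
           for i in range(len(rewards))]

# backward recursion: G_i = rewards[i] + gamma * G_{i+1}
backward = []
total = 0.0
for r in reversed(rewards):
    total = r + gamma * total
    backward.append(total)
backward.reverse()

assert np.allclose(forward, backward)
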
def train(env, nb_episodes, alpha=0.00045, gamma=0.98, show_result=False):
    """Train a policy based Monte Carlo"""
    scores = []
    weights = np.random.rand(env.observation_space.shape[0],
                             env.action_space.n)
    for episode in range(nb_episodes):
        state = env.reset()[None, :]
        grads = []
        rewards = []
        actions = []
        done = 0
        while not done:
            if show_result and not episode % 1000:
                env.render()
            action, grad = policy_gradient(state, weights)
            state, reward, done, info = env.step(action)
            state = state[None, :]
            grads.append(grad)
            rewards.append(reward)
            actions.append(action)
        total_reward = 0
        for grad, reward, action in zip(grads[::-1], rewards[::-1],
                                        actions[::-1]):
            total_reward = reward + total_reward * gamma
            weights[:, action] += alpha * grad[:, action] * total_reward
        scores.append(sum(rewards))
    return scores
def train(env, nb_episodes, alpha=0.00045, gamma=0.98, show_result=False):
    """
    env: initial environment
    nb_episodes: number of episodes used for training
    alpha: the learning rate
    gamma: the discount factor
    show_result: if True, render the environment every 1000 episodes computed.
    """
    weights = np.random.rand(4, 2)
    episode = []

    for i in range(nb_episodes):
        state = env.reset()[None, :]
        grads = []
        rewards = []
        score = 0

        while True:
            if show_result and (i % 1000 == 0):
                env.render()
            action, grad = policy_gradient(state, weights)
            next_state, reward, done, _ = env.step(action)
            next_state = next_state[None, :]
            grads.append(grad)
            rewards.append(reward)
            score += reward
            state = next_state
            if done:
                break

        for j in range(len(grads)):
            weights += alpha * grads[j] *\
                sum([r * gamma ** t for t, r in enumerate(rewards[j:])])
        episode.append(score)
        print("{}: {}".format(i, score), end="\r", flush=False)

    return (episode)
def train(env, nb_episodes, alpha=0.00045, gamma=0.98, show_result=False):
    """
    *********************************************
    ******Implementation of a full training******
    *********************************************
    @env: initial environment
    @nb_episodes: number of episodes used for training
    @alpha: the learning rate
    @gamma: the discount factor
    Return:
        all values of the score (sum of all rewards
        during one episode loop)
    """
    # Initiate scores list
    scores = []
    # Initiate θ to random
    # np.random.seed(0)
    # env.seed(0)
    W = np.random.rand(env.observation_space.shape[0],
                       env.action_space.n)
    for ep in range(nb_episodes):
        # **** Generating episode *****************************
        # Resetting the environment each time as per requirement
        state = env.reset()[None, :]
        # initiate needed variables
        done = False
        t = 0
        R = []
        Grads = []
        Actions = []
        while not done:
            # Rendering the environment every 1000 episodes
            if show_result and not ep % 1000:
                env.render()
            # Taking action and gradient
            action, grad = policy_gradient(state, W)
            # Getting the reward and outcome state
            new_state, Returns, done, info = env.step(action)
            # Appending needed Values
            Actions.append(action)
            R.append(Returns)
            Grads.append(grad)
            # Incrementing state
            state = new_state[None, :]
            t += 1
        # Appending summed score
        scores.append(sum(R))
        print("Episode N°: " + str(ep) + " Score: " + str(sum(R)),
              end="\r", flush=False)

        # **** Updating θ ***************************************************
        # initiate needed variables
        G = 0  # empirical return
        T = t
        for t in range(T):
            Returns = R[t]
            action = Actions[t]
            # Gt = ∑k=0 to ∞ (γ^(k) * R(t+k+1))
            G = sum(gamma**(k) * R[t + k] for k in range(T - t))
            # θ ← θ + α * γ^(t) * Gt * ∇θlnπθ(At|St) ; from Barto Satton book
            # W[:, action] += alpha * Grads[t][:, action] * gamma**(t) * G
            # θ ← θ + α * ∇θlogπθ(st, at) * vt ; from David Silver course
            W[:, action] += alpha * Grads[t][:, action] * G

    return scores
Example #14
#!/usr/bin/env python3
"""
Main file
"""
import gym
import numpy as np
from policy_gradient import policy_gradient

env = gym.make('CartPole-v1')
np.random.seed(1)

weight = np.random.rand(4, 2)
state = env.reset()[None,:]
# state = [[0.04228739, -0.04522399,  0.01190918, -0.03496226]]
# state = np.array(state)

print(weight)
print(state)

action, grad = policy_gradient(state, weight)
print(action)
print(grad)

env.close()
Example #15
File: train.py  Project: sbirch/webtalk
import argparse
from data import gen_docs
from policy_gradient import policy_gradient
import numpy as np
import sys

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train webtalk to generate a parameter vector")

    parser.add_argument("corpus_file", type=file, help="A corpus file to train off of", default=sys.stdin, nargs='?')
    parser.add_argument("--url", type=str, help="An initial URL on which to start each training document", default="http://localhost:8000")
    parser.add_argument("--iters", type=int, help="Number of iterations to train on all the docs", default=50)

    args = parser.parse_args()

    docs = gen_docs.parse_docs_file(args.corpus_file)
    theta = policy_gradient(docs, args.url, ITERATIONS=args.iters)

    np.savetxt(sys.stdout, theta)