Example #1
def __init__(self, module, env=None):
    # __init__ of an EpisodicEvaluator subclass: evaluate the given module on a cart-pole BalanceTask
    EpisodicEvaluator.__init__(self, module, BalanceTask(env=env))
    # the desired fitness value corresponds to balancing for the full episode
    self.desiredValue = self.task.N - 1
    # a simpler fitness: total number of balanced steps
    self.task.getTotalReward = lambda: self.task.t
Example #2
from pybrain.tools.shortcuts import buildNetwork
from pybrain.tools.example_tools import ExTools  # printing/plotting helper shipped with the PyBrain examples
from pybrain.rl.environments.cartpole import CartPoleEnvironment, BalanceTask
from pybrain.rl.agents import OptimizationAgent
from pybrain.optimization import ExactNES
from pybrain.rl.experiments import EpisodicExperiment

batch = 2    # number of samples per learning step
prnts = 100  # number of learning steps between printed results
epis = 4000 / batch / prnts  # number of rollouts
numbExp = 10  # number of experiments
et = ExTools(batch, prnts)  # tool for printing and plotting

for runs in range(numbExp):
    # create environment
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200, desiredValue=None)
    # create controller network
    net = buildNetwork(4, 1, bias=False)
    # create agent with controller and learner (and its options)
    agent = OptimizationAgent(net, ExactNES(storeAllEvaluations=True))
    et.agent = agent
    # create the experiment
    experiment = EpisodicExperiment(task, agent)

    #Do the experiment
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch)
        print "Epsilon   : ", agent.learner.sigma
        et.printResults((agent.learner._allEvaluations)[-50:-1], runs, updates)
    et.addExps()
Example #3
import sys

from pybrain.tools.shortcuts import buildNetwork
# note: CartPoleRenderer needs pylab; depending on the PyBrain version it may
# have to be imported from pybrain.rl.environments.cartpole.renderer instead
from pybrain.rl.environments.cartpole import CartPoleEnvironment, BalanceTask, CartPoleRenderer
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment

episodes = 1
epilen = 200

if len(sys.argv) < 5:
    sys.exit('Please give 4 parameters. Run: "python play_catpole.py <p1> <p2> <p3> <p4>"\n')

# create environment
env = CartPoleEnvironment()
env.setRenderer(CartPoleRenderer())
env.getRenderer().start()
env.delay = (episodes == 1)

# create task
task = BalanceTask(env, epilen)

# create controller network
net = buildNetwork(4, 1, bias=False)

# create agent and set parameters from command line
agent = LearningAgent(net, None)
agent.module._setParameters([float(sys.argv[1]), float(sys.argv[2]), float(sys.argv[3]), float(sys.argv[4])])

# create experiment
experiment = EpisodicExperiment(task, agent)
experiment.doEpisodes(episodes)

# run environment
ret = []
for n in range(agent.history.getNumSequences()):
Example #4
# If plotting is not needed, comment out the 9 lines total marked as "for plotting".
# Author: Frank Sehnke, [email protected]
#########################################################################

from pybrain.tools.shortcuts import buildNetwork
from pybrain.rl.environments.cartpole import CartPoleEnvironment, BalanceTask
from pybrain.rl.agents.finitedifference import FiniteDifferenceAgent
from pybrain.rl.learners import SPLA
from pybrain.rl.experiments import EpisodicExperiment
from scipy import random

numbExp = 12
for runs in range(numbExp):
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200)
    # create controller network
    net = buildNetwork(4, 1, bias=False)
    # create agent with controller and learner
    agent = FiniteDifferenceAgent(net, SPLA())
    # learning options
    agent.learner.gd.alpha = 0.05
    agent.learner.gdSig.alpha = 0.1
    agent.learner.gd.momentum = 0.9
    agent.learner.epsilon = 6.0
    agent.learner.initSigmas()
    # agent.learner.rprop = True
    experiment = EpisodicExperiment(task, agent)
    batch = 16
    prnts = 10
    epis = 50000 / batch / prnts
Example #5
"""
Environments in PyBrain are located under pybrain/rl/environments. One of these
environments is cart-pole balancing, which we will use for this tutorial.
Its state consists of the cart position, cart velocity, pole angle, and
pole angular velocity. The environment receives one scalar value, the force
with which the cart is pushed, either to the left (negative value) or to the
right (positive value). Let's create the instance.
"""
environment = CartPoleEnvironment()
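"""
A quick, hedged aside (not part of the original tutorial text): the environment
can also be poked directly through PyBrain's generic Environment interface,
getSensors() and performAction(). Normally the task and the experiment drive
these calls; the lines below only illustrate that the state is a vector of four
values and the action a single scalar force.
"""
print environment.getSensors()     # the four state values (ordering may vary by version)
environment.performAction([10.0])  # push the cart with a positive (rightward) force
print environment.getSensors()     # the state after one integration step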
"""
Next, we need an agent. The agent is where the learning happens. It can
interact with the environment with its .getAction() and .integrateObservation()
methods. For continuous problems like the cart-pole, we need a policy gradient
agent. Each agent needs a controller that maps the current state to an action.
We will use a linear controller, which can be created in PyBrain with the
buildNetwork() shortcut function; we need 4 inputs and 1 output.
Each agent also has a learner component. There are several learners
for policy gradient agents, which we won't cover in this tutorial. Let's just
use the ENAC learning algorithm and create the agent.
"""
controller = buildNetwork(4, 1, bias=False)
agent = PolicyGradientAgent(controller, ENAC())
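"""
Another hedged aside: the experiment normally mediates every exchange, but one
observation-action-reward cycle through the generic agent interface looks
roughly like this. The reward of 1.0 is a placeholder; in real use it is
computed by the task.
"""
agent.integrateObservation(environment.getSensors())  # hand the four state values to the agent
action = agent.getAction()                             # the linear controller maps the state to a force
environment.performAction(action)
agent.giveReward(1.0)                                  # placeholder reward, normally supplied by the task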
"""
So far, there is no connection between the agent and the environment. In fact,
in PyBrain, one component connects environment and agent: the task. A task also
specifies what the goal is in an environment and how the agent is rewarded for
its actions. For episodic experiments, the task also decides when an episode is
over. Environments usually bring along their own tasks. The CartPoleEnvironment,
for example, has a BalanceTask, which we will use.
"""
task = BalanceTask()
"""... to be continued """
Example #6
    # Training the network
    costs = np.zeros(N_ITERATIONS)

    # Initialize serial communication class
    serial = SocketServer()
    ring_buffer = RingBuffer(size=N_TIME_STEPS + 1)  # need reward of next step for training

    # Form forget vector
    forget_vector = array([FORGET_RATE**i for i in xrange(N_TIME_STEPS)])

    # create environment
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200, desiredValue=None)

    # Cost = mean squared error, starting from delay point
    cost = T.mean((l_action_formed.get_output(input)[:, :, :] -
                   target_output[:, :, :])**2)

    unfolding_time = 10
    for n in range(N_ITERATIONS):

        rewards = []
        for step in xrange(unfolding_time):
            train_inputs = theano_form(
                task.getObservation(),
                shape=[N_BATCH, N_TIME_STEPS, N_INPUT_FEATURES])
            model_reward_result = action_prediction(train_inputs)
            task.performAction(model_reward_result)
Example #7
def main():
    # create environment
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200, desiredValue=None)

    sim_task = SimBalanceTask(prediction=reward_prediction, maxsteps=200)

    all_params = lasagne.layers.get_all_params(l_action_formed)

    records = []
    real_world_sample_counts = []
    for time in xrange(50):
        records.append([])
        _all_params = lasagne.layers.get_all_params(l_action_formed)
        _all_params[0].set_value(theano_form(uniform(-0.1, 0.1, 4), shape=(4,1)))


        baseline = None
        num_parameters = 4  # four controller parameters
        init_sigma = 3      # initial sigma for each parameter
        sigmas = ones(num_parameters) * init_sigma
        best_reward = -1000
        current = all_params[0].get_value()[:, 0]
        arg_reward = []


        previous_cost = 10000
        real_world_sample_count = 0
        thinking_count = 0

        cost_confidence = 2

        for n in xrange(1500):

            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
            if previous_cost <= cost_confidence:
                # the critic model is accurate enough: evaluate the perturbed
                # parameters in the learned simulation instead of the real environment
                rewards1, actions1, observations1, last_obs1, reward1 = one_sim_iteration(sim_task, all_params=current + epsilon)
                rewards2, actions2, observations2, last_obs2, reward2 = one_sim_iteration(sim_task, all_params=current - epsilon)
                thinking_count += 1
                if thinking_count == 2:
                    # after two simulated rollouts, force a return to the real environment
                    previous_cost = 10000
                    thinking_count = 0
            else:
                # Perform actions in real environment

                rewards1, actions1, observations1, last_obs1, reward1 = one_iteration(task=task, all_params=current + epsilon)
                real_world_sample_count += 1
                if reward1 > best_reward:
                    best_reward = reward1
                rewards2, actions2, observations2, last_obs2, reward2 = one_iteration(task= task, all_params=current - epsilon)
                real_world_sample_count += 1
                if reward2 > best_reward:
                    best_reward = reward2


                # Prepare for data for first process
                actions1 = theano_form(actions1, shape=(len(actions1), 1))
                observations1 = theano_form(observations1, shape=(len(observations1), 4))
                predicted_obs1 = concatenate([observations1[1::], [last_obs1]])
                input_data1 = concatenate([actions1, observations1], axis=1)
                output_data1 = concatenate([theano_form(rewards1, shape=(len(rewards1), 1)), predicted_obs1], axis=1)

                # Training with data gathered from first process
                critic_train_inputs1 = list(chunks(input_data1, N_CTIME_STEPS))
                critic_train_outputs1 = list(chunks(output_data1, N_CTIME_STEPS))


                # Prepare for data for second process
                actions2 = theano_form(actions2, shape=(len(actions2), 1))
                observations2 = theano_form(observations2, shape=(len(observations2), 4))
                predicted_obs2 = concatenate([observations2[1::], [last_obs2]])
                input_data2 = concatenate([actions2, observations2], axis=1)
                output_data2 = concatenate([theano_form(rewards2, shape=(len(rewards2), 1)), predicted_obs2], axis=1)

                # Training with data gathered from second process
                critic_train_inputs2 = list(chunks(input_data2, N_CTIME_STEPS))
                critic_train_outputs2 = list(chunks(output_data2, N_CTIME_STEPS))



                # anneal the critic's training-cost target from 350 down to cost_confidence
                train_base_line = max((700 - n*6)/2, cost_confidence)

                count1 = 0
                while True:
                    count1 += 1
                    costs1 = []
                    for input, output in zip(critic_train_inputs1, critic_train_outputs1):
                        critic_train_input = theano_form(input, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(output, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs1.append(train(critic_train_input, critic_train_output))
                    if mean(costs1) < train_base_line:
                        break
                    else:
                        if not count1%50:
                            print mean(costs1)
                        #print "mean cost 1: ", mean(costs1), "baseline :", train_base_line
                    if count1 > 1:
                        break


                count2 = 0
                while True:
                    count2 += 1
                    costs2 = []
                    for input, output in zip(critic_train_inputs2, critic_train_outputs2):
                        critic_train_input = theano_form(input, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(output, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs2.append(train(critic_train_input, critic_train_output))

                    if mean(costs2) < train_base_line:
                        break
                    else:
                        if not count2%50:
                            print mean(costs2)

                        #print "mean cost2: ", mean(costs2), "baseline :", train_base_line

                    if count2 > 1:
                        break

                previous_cost = sum(costs1) + sum(costs2)


            mreward = (reward1 + reward2) / 2.

            if baseline is None:
                # first learning step
                baseline = mreward
                fakt = 0.
                fakt2 = 0.
            else:
                # calculate the gradients
                if reward1 != reward2:
                    # gradient estimate a la SPSA, but with a likelihood gradient and normalization
                    fakt = (reward1 - reward2) / (2. * best_reward - reward1 - reward2)
                else:
                    fakt = 0.
                # normalized sigma gradient with a moving-average baseline
                norm = (best_reward - baseline)
                if norm != 0.0:
                    fakt2 = (mreward - baseline) / norm
                else:
                    fakt2 = 0.0
            # update the moving-average baseline
            baseline = 0.9 * baseline + 0.1 * mreward
            # update parameters and sigmas
            current = current + LEARNING_RATE * fakt * epsilon

            if fakt2 > 0:  # the sigma-adaptation algorithm follows only positive gradients
                # apply the sigma update locally
                sigmas = sigmas + LEARNING_RATE * fakt2 * (epsilon * epsilon - sigmas * sigmas) / sigmas


            # Test set
            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
            _, _, _, _, test_reward1 = one_iteration(task=task, all_params=current + epsilon)
            _, _, _, _, test_reward2 = one_iteration(task=task, all_params=current - epsilon)
            test_mreward = (test_reward1 + test_reward2)/ 2.0
            arg_reward.append(test_mreward)

            print n


            if not n%10:
                print "test_reward 1:", test_reward1
                _, _, _, _, sim_test_reward1 = one_sim_iteration(task=sim_task, all_params=current + epsilon)
                print "simulated reward 1:", sim_test_reward1
                print "test_reward 2:", test_reward2
                _, _, _, _, sim_test_reward2 = one_sim_iteration(task=sim_task, all_params=current - epsilon)
                print "simulated reward 2:", sim_test_reward2


                print "previous_cost :", previous_cost
                print "real_word_example :", real_world_sample_count
                temp_arg = sum(arg_reward)/len(arg_reward)
                records[time].append([real_world_sample_count, temp_arg])
                print "best reward:", best_reward, "average reward:", temp_arg
                print
                arg_reward = []
        real_world_sample_counts.append(real_world_sample_count)
    #print records
    pickle.dump(records, open("records_lambda_mu.p", "wb"))
    pickle.dump(real_world_sample_counts, open("real_world_sample_counts_mu.p", "wb"))