def __init__(self, module, env=None):
    EpisodicEvaluator.__init__(self, module, BalanceTask(env=env))
    self.desiredValue = self.task.N - 1
    # a simpler fitness: total number of balanced steps
    self.task.getTotalReward = lambda: self.task.t
from pybrain.tools.shortcuts import buildNetwork
from pybrain.tools.example_tools import ExTools  # printing/plotting helper from the PyBrain examples
from pybrain.rl.environments.cartpole import CartPoleEnvironment, BalanceTask
from pybrain.rl.agents import OptimizationAgent
from pybrain.optimization import ExactNES
from pybrain.rl.experiments import EpisodicExperiment

batch = 2                      # number of samples per learning step
prnts = 100                    # number of learning steps after which results are printed
epis = 4000 / batch / prnts    # number of rollouts
numbExp = 10                   # number of experiments
et = ExTools(batch, prnts)     # tool for printing and plotting

for runs in range(numbExp):
    # create environment
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200, desiredValue=None)
    # create controller network
    net = buildNetwork(4, 1, bias=False)
    # create agent with controller and learner (and its options)
    agent = OptimizationAgent(net, ExactNES(storeAllEvaluations=True))
    et.agent = agent
    # create the experiment
    experiment = EpisodicExperiment(task, agent)

    # do the experiment
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch)
        print "Epsilon : ", agent.learner.sigma
        et.printResults((agent.learner._allEvaluations)[-50:-1], runs, updates)
    et.addExps()
import sys

from pybrain.tools.shortcuts import buildNetwork
from pybrain.rl.environments.cartpole import CartPoleEnvironment, CartPoleRenderer, BalanceTask
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment

episodes = 1
epilen = 200

if len(sys.argv) < 5:
    sys.exit('please give 4 parameters. run: "python play_catpole.py <p1> <p2> <p3> <p4>"\n')

# create environment
env = CartPoleEnvironment()
env.setRenderer(CartPoleRenderer())
env.getRenderer().start()
env.delay = (episodes == 1)

# create task
task = BalanceTask(env, epilen)
# create controller network
net = buildNetwork(4, 1, bias=False)
# create agent and set parameters from command line
agent = LearningAgent(net, None)
agent.module._setParameters([float(sys.argv[1]), float(sys.argv[2]),
                             float(sys.argv[3]), float(sys.argv[4])])
# create experiment
experiment = EpisodicExperiment(task, agent)
experiment.doEpisodes(episodes)

# run environment
ret = []
for n in range(agent.history.getNumSequences()):
# 9 lines total marked as "for plotting"
# Author: Frank Sehnke, [email protected]
#########################################################################
from pybrain.tools.shortcuts import buildNetwork
from pybrain.rl.environments.cartpole import CartPoleEnvironment, BalanceTask
from pybrain.rl.agents.finitedifference import FiniteDifferenceAgent
from pybrain.rl.learners import SPLA
from pybrain.rl.experiments import EpisodicExperiment
from scipy import random

numbExp = 12
for runs in range(numbExp):
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200)
    # create controller network
    net = buildNetwork(4, 1, bias=False)
    # create agent with controller and learner
    agent = FiniteDifferenceAgent(net, SPLA())
    # learning options
    agent.learner.gd.alpha = 0.05
    agent.learner.gdSig.alpha = 0.1
    agent.learner.gd.momentum = 0.9
    agent.learner.epsilon = 6.0
    agent.learner.initSigmas()
    # agent.learner.rprop = True
    experiment = EpisodicExperiment(task, agent)
    batch = 16
    prnts = 10
    epis = 50000 / batch / prnts
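    # --- Sketch only: the snippet above breaks off right after the batch
    # settings. The driving loop below follows the same pattern as the
    # ExactNES example earlier in this section; the explicit agent.learn()
    # and agent.reset() calls are an assumption about the
    # FiniteDifferenceAgent API, not taken from the original file.
    for updates in range(epis):
        for i in range(prnts):
            experiment.doEpisodes(batch)   # collect a batch of rollouts
            agent.learn()                  # assumed: update parameters from the batch
            agent.reset()                  # assumed: clear history before the next batch
        print "Run:", runs, "update:", updates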
environments in PyBrain are located under rl/environments. One of these
environments is the cart-pole balancing task, which we will use for this
tutorial. Its state consists of cart position, cart velocity, pole angle, and
pole angular velocity. It receives one scalar value as input: the force with
which the cart is pushed, either to the left (negative value) or to the right
(positive value). Let's create the instance.
"""
environment = CartPoleEnvironment()

"""
Next, we need an agent. The agent is where the learning happens. It interacts
with the environment through its .getAction() and .integrateObservation()
methods. For continuous problems like the cart-pole, we need a policy gradient
agent. Each agent needs a controller that maps the current state to an action.
We will use a linear controller, which can be created in PyBrain with the
buildNetwork() shortcut function; it needs 4 inputs and 1 output. Each agent
also has a learner component. There are several learners for policy gradient
agents, which we won't cover in this tutorial. For now, let's just use the
ENAC learning algorithm and create the agent.
"""
controller = buildNetwork(4, 1, bias=False)
agent = PolicyGradientAgent(controller, ENAC())

"""
So far, there is no connection between the agent and the environment. In
PyBrain, one component connects the two: the task. A task also specifies what
the goal is in an environment and how the agent is rewarded for its actions.
For episodic experiments, the task also decides when an episode is over.
Environments usually bring along their own tasks; the CartPoleEnvironment, for
example, has a BalanceTask, which we will use.
"""
task = BalanceTask()

"""... to be continued """
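"""
(Sketch, not part of the original tutorial text.) The missing last step would
wire task and agent into an EpisodicExperiment and let it run, just as the
example scripts elsewhere in this section do. The episode count and the
per-episode learn()/reset() calls below are illustrative choices, not
something the tutorial prescribes.
"""
from pybrain.rl.experiments import EpisodicExperiment

experiment = EpisodicExperiment(task, agent)
for episode in range(100):    # arbitrary number of training episodes
    experiment.doEpisodes(1)  # roll out one episode
    agent.learn()             # update the controller from the collected rollout
    agent.reset()             # clear the agent's history before the next episode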
# Training the network
costs = np.zeros(N_ITERATIONS)

# Initialize serial communication class
serial = SocketServer()
ring_buffer = RingBuffer(size=N_TIME_STEPS + 1)  # need reward of next step for training

# Form forget vector
forget_vector = array([FORGET_RATE**i for i in xrange(N_TIME_STEPS)])

# create environment
env = CartPoleEnvironment()
# create task
task = BalanceTask(env, 200, desiredValue=None)

# Cost = mean squared error, starting from delay point
cost = T.mean((l_action_formed.get_output(input)[:, :, :] - target_output[:, :, :])**2)

unfolding_time = 10
for n in range(N_ITERATIONS):
    rewards = []
    # use a distinct name for the inner counter so it does not shadow n
    for step in xrange(unfolding_time):
        train_inputs = theano_form(task.getObservation(),
                                   shape=[N_BATCH, N_TIME_STEPS, N_INPUT_FEATURES])
        model_reward_result = action_prediction(train_inputs)
        task.performAction(model_reward_result)
def main():
    # create environment
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200, desiredValue=None)
    sim_task = SimBalanceTask(prediction=reward_prediction, maxsteps=200)

    all_params = lasagne.layers.get_all_params(l_action_formed)
    records = []
    real_world_sample_counts = []

    for time in xrange(50):
        records.append([])
        _all_params = lasagne.layers.get_all_params(l_action_formed)
        _all_params[0].set_value(theano_form(uniform(-0.1, 0.1, 4), shape=(4, 1)))

        baseline = None
        num_parameters = 4          # four controller parameters
        init_sigma = 3              # initial exploration sigma
        sigmas = ones(num_parameters) * init_sigma
        best_reward = -1000
        current = all_params[0].get_value()[:, 0]
        arg_reward = []
        previous_cost = 10000
        real_world_sample_count = 0
        thinking_count = 0
        cost_confidence = 2

        for n in xrange(1500):
            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)

            if previous_cost <= cost_confidence:
                # critic is accurate enough: evaluate both perturbations in the learned model
                rewards1, actions1, observations1, last_obs1, reward1 = \
                    one_sim_iteration(sim_task, all_params=current + epsilon)
                rewards2, actions2, observations2, last_obs2, reward2 = \
                    one_sim_iteration(sim_task, all_params=current - epsilon)
                thinking_count += 1
                if thinking_count == 2:
                    previous_cost = 10000
                    thinking_count = 0
            else:
                # Perform actions in real environment
                rewards1, actions1, observations1, last_obs1, reward1 = \
                    one_iteration(task=task, all_params=current + epsilon)
                real_world_sample_count += 1
                if reward1 > best_reward:
                    best_reward = reward1
                rewards2, actions2, observations2, last_obs2, reward2 = \
                    one_iteration(task=task, all_params=current - epsilon)
                real_world_sample_count += 1
                if reward2 > best_reward:
                    best_reward = reward2

                # Prepare data from the first rollout
                actions1 = theano_form(actions1, shape=(len(actions1), 1))
                observations1 = theano_form(observations1, shape=(len(observations1), 4))
                predicted_obs1 = concatenate([observations1[1::], [last_obs1]])
                input_data1 = concatenate([actions1, observations1], axis=1)
                output_data1 = concatenate([theano_form(rewards1, shape=(len(rewards1), 1)),
                                            predicted_obs1], axis=1)

                # Training data gathered from the first rollout
                critic_train_inputs1 = list(chunks(input_data1, N_CTIME_STEPS))
                critic_train_outputs1 = list(chunks(output_data1, N_CTIME_STEPS))

                # Prepare data from the second rollout
                actions2 = theano_form(actions2, shape=(len(actions2), 1))
                observations2 = theano_form(observations2, shape=(len(observations2), 4))
                predicted_obs2 = concatenate([observations2[1::], [last_obs2]])
                input_data2 = concatenate([actions2, observations2], axis=1)
                output_data2 = concatenate([theano_form(rewards2, shape=(len(rewards2), 1)),
                                            predicted_obs2], axis=1)

                # Training data gathered from the second rollout
                critic_train_inputs2 = list(chunks(input_data2, N_CTIME_STEPS))
                critic_train_outputs2 = list(chunks(output_data2, N_CTIME_STEPS))

                train_base_line = (700 - n * 6) / 2 if (700 - n * 6) / 2 > cost_confidence else cost_confidence

                count1 = 0
                while True:
                    count1 += 1
                    costs1 = []
                    for input, output in zip(critic_train_inputs1, critic_train_outputs1):
                        critic_train_input = theano_form(input, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(output, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs1.append(train(critic_train_input, critic_train_output))
                    if mean(costs1) < train_base_line:
                        break
                    else:
                        if not count1 % 50:
                            print mean(costs1)
                            # print "mean cost 1: ", mean(costs1), "baseline :", train_base_line
                    if count1 > 1:
                        break

                count2 = 0
                while True:
                    count2 += 1
                    costs2 = []
                    for input, output in zip(critic_train_inputs2, critic_train_outputs2):
                        critic_train_input = theano_form(input, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(output, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs2.append(train(critic_train_input, critic_train_output))
                    if mean(costs2) < train_base_line:
                        break
                    else:
                        if not count2 % 50:
                            print mean(costs2)
                            # print "mean cost 2: ", mean(costs2), "baseline :", train_base_line
                    if count2 > 1:
                        break

                previous_cost = sum(costs1) + sum(costs2)

            mreward = (reward1 + reward2) / 2.

            if baseline is None:
                # first learning step
                baseline = mreward
                fakt = 0.
                fakt2 = 0.
            else:
                # calculate the gradients
                if reward1 != reward2:
                    # gradient estimate a la SPSA, but with likelihood gradient and normalization
                    fakt = (reward1 - reward2) / (2. * best_reward - reward1 - reward2)
                else:
                    fakt = 0.
                # normalized sigma gradient with moving-average baseline
                norm = (best_reward - baseline)
                if norm != 0.0:
                    fakt2 = (mreward - baseline) / (best_reward - baseline)
                else:
                    fakt2 = 0.0

            # update baseline
            baseline = 0.9 * baseline + 0.1 * mreward

            # update parameters and sigmas
            current = current + LEARNING_RATE * fakt * epsilon
            if fakt2 > 0:
                # sigma adaptation follows only positive gradients; apply sigma update locally
                sigmas = sigmas + LEARNING_RATE * fakt2 * (epsilon * epsilon - sigmas * sigmas) / sigmas

            # Test set
            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
            _, _, _, _, test_reward1 = one_iteration(task=task, all_params=current + epsilon)
            _, _, _, _, test_reward2 = one_iteration(task=task, all_params=current - epsilon)
            test_mreward = (test_reward1 + test_reward2) / 2.0
            arg_reward.append(test_mreward)
            print n

            if not n % 10:
                print "test_reward 1:", test_reward1
                _, _, _, _, sim_test_reward1 = one_sim_iteration(task=sim_task, all_params=current + epsilon)
                print "simulated reward 1:", sim_test_reward1
                print "test_reward 2:", test_reward2
                _, _, _, _, sim_test_reward2 = one_sim_iteration(task=sim_task, all_params=current - epsilon)
                print "simulated reward 2:", sim_test_reward2
                print "previous_cost :", previous_cost
                print "real_world_samples :", real_world_sample_count
                temp_arg = sum(arg_reward) / len(arg_reward)
                records[time].append([real_world_sample_count, temp_arg])
                print "best reward:", best_reward, "average reward:", temp_arg
                print
                arg_reward = []

        real_world_sample_counts.append(real_world_sample_count)

    # print records
    pickle.dump(records, open("records_lambda_mu.p", "wb"))
    pickle.dump(real_world_sample_counts, open("real_world_sample_counts_mu.p", "wb"))
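# Assumed entry point: the flattened snippet does not show how main() is
# invoked, so this conventional guard is an assumption rather than original code.
if __name__ == '__main__':
    main()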