def __init__(self, learner, memory, simulator, settings, dqn_policy=None, rollout_policy=None):
    """
    The learning agent is responsible for communicating and moving
    data between the three modules: Learner, Simulator, Memory
    Inputs:
    - learner: contains the neural network and the optimizer to train it
    - memory: experience replay memory that can be minibatch sampled
    - simulator: simulates the environment
    - settings: hyperparameters for training
    - dqn_policy: policy that picks actions from the learner's Q-values, DQNPolicy by default
    - rollout_policy: rollout policy, random by default
    """
    self.learner = learner
    self.memory = memory

    self.simulator = simulator            # for populating the experience replay
    self.evaluator = deepcopy(simulator)  # for evaluation

    self.dqn_policy = dqn_policy
    if dqn_policy is None:
        self.dqn_policy = DQNPolicy(learner)

    self.rollout_policy = rollout_policy
    if rollout_policy is None:
        self.rollout_policy = RandomPolicy(simulator.n_actions)

    self.set_params(settings)

    # number of passes over the replay memory implied by the iteration budget
    self.n_epochs = self.iterations / float(memory.memory_size)

    # training statistics
    self.iteration = []
    self.loss = []
    self.q_ave = []

    # evaluation statistics
    self.eval_iteration = []
    self.r_eval = []
    self.r_per_episode_eval = []
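For orientation, the sketch below shows one plausible way to assemble these modules and hand them to the agent. The class name `DQNAgent`, the `chimp.*` import paths, the `ReplayMemoryHDF5` memory, and the `train()` entry point are assumptions drawn from the surrounding repo layout, not guaranteed API; `settings` is the hyperparameter dict described in the docstring and `simulator` is any simulator instance, such as the mountain car one further below.

# sketch: wiring learner, memory, and simulator into the agent
# (class name, import paths, and train() are assumptions)
from chimp.memories import ReplayMemoryHDF5
from chimp.learners.dqn_learner import DQNLearner
from chimp.learners.chainer_backend import ChainerBackend
from chimp.agents import DQNAgent

memory = ReplayMemoryHDF5(settings)      # minibatch-sampleable experience replay
backend = ChainerBackend(settings)       # holds the network and its optimizer
learner = DQNLearner(settings, backend)

agent = DQNAgent(learner, memory, simulator, settings)  # default DQN and random rollout policies
agent.train()  # assumed training entry point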
# imports needed by this snippet (the chimp module paths below are assumed
# from the repo layout and may need adjusting)
import pickle
import numpy as np
import matplotlib.pyplot as p

from chimp.simulators.mdp.mountain_car import MountainCar    # assumed path
from chimp.simulators.mdp.mdp_simulator import MDPSimulator  # assumed path
from chimp.learners.dqn_learner import DQNLearner            # assumed path
from chimp.learners.chainer_backend import ChainerBackend    # assumed path
from chimp.utils.policies import DQNPolicy                   # assumed path

def car_sim(nsteps, simulator, policy, verbose=False):
    # accumulate the total reward and record position/velocity traces;
    # verbose is kept for the call below, per-step printing is elided in this excerpt
    rtot = 0.0
    xpos = np.zeros(nsteps, dtype=np.float32)
    vel = np.zeros(nsteps, dtype=np.float32)

    # run the simulation
    input_state = np.zeros((1, 2), dtype=np.float32)
    for i in xrange(nsteps):
        state = simulator.get_screenshot()
        input_state[0] = state
        a = policy.action((input_state, None))
        simulator.act(a)
        r = simulator.reward()
        rtot += r
        xpos[i], vel[i] = state
        if simulator.episode_over():
            break
    return rtot, xpos, vel

mdp = MountainCar()
simulator = MDPSimulator(mdp)

# load a pre-trained network and wrap it in a learner/policy;
# settings is the same hyperparameter dict used during training
net = pickle.load(open("../chimp/pre_trained_nets/mountain_car.net", "rb"))

backend = ChainerBackend(settings)
backend.set_net(net)
learner = DQNLearner(settings, backend)
policy = DQNPolicy(learner)

r, xtrace, vtrace = car_sim(300, simulator, policy, verbose=True)

p.plot(xtrace)
p.plot(10.0 * vtrace)  # velocity scaled by 10 so both traces share one axis
p.show()
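A single rollout can be a noisy estimate of policy quality. The sketch below averages the return of `car_sim` over several rollouts; it uses only names defined above, builds a fresh simulator per rollout so each episode starts from the initial state, and the rollout count of 10 is an arbitrary choice.

# average the return over several fresh rollouts for a steadier estimate
n_eval = 10  # arbitrary rollout count
returns = []
for _ in xrange(n_eval):
    fresh_sim = MDPSimulator(MountainCar())  # fresh simulator per rollout
    rtot, _, _ = car_sim(300, fresh_sim, policy)
    returns.append(rtot)
print("mean return over %d rollouts: %.2f" % (n_eval, np.mean(returns)))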