Example #1
def __init__(self, prediction, maxsteps):
    super(SimBalanceTask, self).__init__(None)
    self.prediction = prediction
    self.sensors_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0] * 4)
    self.actions_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0])
    self.sensors = self.sensors_sequence.data[-1]
    self.t = 0
    self.N = maxsteps
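
This and the later examples all depend on a RingBuffer class that is never defined in the snippets. A minimal sketch of the interface the call sites imply (a fixed length, an optional ivalue pre-fill, a public data list, and append/get methods); the names and behaviour here are inferred from usage, not taken from the original code:

class RingBuffer(object):
    """Fixed-length FIFO buffer (sketch; inferred from the call sites)."""

    def __init__(self, size, ivalue=None):
        self.size = size
        # Pre-fill every slot with `ivalue` (None when omitted, as in
        # examples #4-#6 where `None in ring_buffer.get()` is tested).
        self.data = [ivalue] * size

    def append(self, item):
        # Evict the oldest entry so the buffer keeps its fixed length.
        self.data = self.data[1:] + [item]

    def get(self):
        return self.data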
Example #2
def reset(self):
    if self.randomInitialization:
        angle = random.uniform(-0.2, 0.2)
        pos = random.uniform(-0.5, 0.5)
    else:
        angle = -0.2
        pos = 0.2
    self.t = 0
    self.sensors_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0] * 4)
    self.actions_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0])
    self.sensors = (angle, 0.0, pos, 0.0)
    self.sensors_sequence.append(self.sensors)
Example #3
class SimBalanceTask(EpisodicTask):

    randomInitialization = True
    def __init__(self, prediction, maxsteps):
        super(SimBalanceTask, self).__init__(None)
        self.prediction = prediction
        self.sensors_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0] * 4)
        self.actions_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0])
        self.sensors = self.sensors_sequence.data[-1]
        self.t = 0
        self.N = maxsteps

    def performAction(self, action):
        self.t += 1
        self.actions_sequence.append(action[0][0])
        # Stack the action and sensor histories into one
        # (batch, time, features) input for the predictor.
        predict_input = concatenate([theano_form(self.actions_sequence.data, shape=(N_CBATCH, N_CTIME_STEPS, 1)),
                                     theano_form(self.sensors_sequence.data, shape=(N_CBATCH, N_CTIME_STEPS, 4))], axis=2)
        prediction = self.prediction(predict_input)
        # Last predicted step: index 0 is the reward, the rest are the sensors.
        self.sensors = prediction[0][-1][1:]
        # Debug pause; uncomment to step through the predicted sensors.
        # print "sensors", self.sensors
        # raw_input()
        self.sensors_sequence.append(self.sensors)
        self.reward = prediction[0][-1][0]

    def getObservation(self):
        return array(self.sensors)

    def getPoleAngles(self):
        return self.sensors[0]

    def getCartPosition(self):
        return self.sensors[2]

    def isFinished(self):
        if abs(self.getPoleAngles()) > 0.7:
            # pole has fallen
            return True
        elif abs(self.getCartPosition()) > 2.4:
            # cart has left its allowed track
            return True
        elif self.t >= self.N:
            # maximum number of timesteps reached
            return True
        return False

    def reset(self):
        if self.randomInitialization:
            angle = random.uniform(-0.2, 0.2)
            pos = random.uniform(-0.5, 0.5)
        else:
            angle = -0.2
            pos = 0.2
        self.t = 0
        self.sensors_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0] * 4)
        self.actions_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0])
        self.sensors = [angle, 0.0, pos, 0.0]
        self.sensors_sequence.append(self.sensors)

    def getReward(self):
        return self.reward
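
A sketch of how SimBalanceTask might be driven, assuming prediction is a compiled Theano function such as critic_prediction in example #4; make_prediction_fn and choose_action are placeholders for illustration, not part of the original code:

# Hypothetical driver loop for SimBalanceTask.
task = SimBalanceTask(prediction=make_prediction_fn(), maxsteps=200)
task.reset()
while not task.isFinished():
    obs = task.getObservation()   # array([angle, angle_vel, pos, pos_vel])
    action = choose_action(obs)   # must be indexable as action[0][0]
    task.performAction(action)    # rolls the learned model one step forward
    reward = task.getReward()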
Example #4
    # Theano functions for the Critic Network.
    critic_train = theano.function([critic_input, critic_output],
                                   critic_cost,
                                   updates=critic_updates)
    # Predict the reward
    critic_prediction = theano.function(
        [critic_input], l_reward_formed.get_output(critic_input))
    # Compute the cost (renamed so the compiled function does not shadow
    # the symbolic `critic_cost` used to build `critic_train` above)
    critic_compute_cost = theano.function([critic_input, critic_output], critic_cost)
    # Record all costs of the Critic Network.
    critic_costs = np.zeros(N_ITERATIONS)

    # Initialize serial communication class
    serial = SocketServer()
    ring_buffer = RingBuffer(size=N_TIME_STEPS + 1)  # need reward of next step for training
    actions_set = RingBuffer(size=N_TIME_STEPS)
    actions_set.data = binomial(1, 0.5, N_TIME_STEPS).astype(
        theano.config.floatX).tolist()
    iter_init_actions = iter(actions_set.data)
    costs = [0] * N_ITERATIONS

    # Send the number of time steps to the client
    serial.send("%d\0" % N_TIME_STEPS)

    # Form forget vector
    forget_vector = array([FORGET_RATE**i for i in xrange(N_TIME_STEPS)])

    for n in range(N_ITERATIONS):
        if None in ring_buffer.get():
            signal = serial.receive()
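
The forget_vector built above is a geometric weighting of the time steps, FORGET_RATE**i at index i. A quick numeric check of its shape, with illustrative values for FORGET_RATE and N_TIME_STEPS:

import numpy as np

FORGET_RATE, N_TIME_STEPS = 0.9, 4   # illustrative values only
forget_vector = FORGET_RATE ** np.arange(N_TIME_STEPS)
# array([1.0, 0.9, 0.81, 0.729]): the weight at index i is FORGET_RATE**i,
# so steps further along the vector decay geometrically.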
Example #5
    # Predict the reward
    reward_prediction = theano.function([input], l_reward_formed.get_output(input))

    # Predict Action
    action_prediction = theano.function([input], l_action_formed.get_output(input))

    # Compute the cost
    compute_cost = theano.function([input, target_output], cost)

    # Training the network
    costs = np.zeros(N_ITERATIONS)

    # Initialize serial communication class
    serial = SocketServer()
    ring_buffer = RingBuffer(size=N_TIME_STEPS + 1) # need reward of next step for training

    # Send the number of time steps to the client
    serial.send("%i\0" % N_TIME_STEPS)

    # Form forget vector
    forget_vector = array([FORGET_RATE**i for i in xrange(N_TIME_STEPS)])

    for n in range(N_ITERATIONS):
        signal = serial.receive()
        epoch_data = signal.split(',')  # 'rm1' is the reward of the previous time step
        ring_buffer.append(epoch_data)
        buffered_data = ring_buffer.get()
        if None not in buffered_data:
            all_data = theano_form(list=buffered_data, shape=[N_BATCH, N_TIME_STEPS+1, N_TRANS])
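
theano_form is not defined in any of these examples; from its call sites it appears to cast nested list data to Theano's float dtype and reshape it to the given batch/time/feature layout. A minimal sketch under that assumption:

import numpy as np
import theano

def theano_form(list, shape):
    # Assumed behaviour only: cast to theano.config.floatX and reshape.
    # The parameter is named `list` (shadowing the builtin) to match the
    # keyword call `theano_form(list=buffered_data, ...)` above.
    return np.asarray(list, dtype=theano.config.floatX).reshape(shape)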
Example #6
    # Predict the reward
    reward_prediction = theano.function([input],
                                        l_reward_formed.get_output(input))

    # Predict Action
    action_prediction = theano.function([input],
                                        l_action_formed.get_output(input))

    # Compute the cost
    compute_cost = theano.function([input, target_output], cost)

    # Training the network
    costs = np.zeros(N_ITERATIONS)

    # Initialize serial communication class
    serial = SocketServer()
    ring_buffer = RingBuffer(size=N_TIME_STEPS + 1)  # need reward of next step for training

    # Form forget vector
    forget_vector = array([FORGET_RATE**i for i in xrange(N_TIME_STEPS)])

    # create environment
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200, desiredValue=None)

    # Cost = mean squared error; the explicit [:, :, :] slices leave room
    # for a delay offset but currently cover every time step.
    cost = T.mean((l_action_formed.get_output(input)[:, :, :] -
                   target_output[:, :, :])**2)

    unfolding_time = 10
    for n in range(N_ITERATIONS):
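
CartPoleEnvironment and BalanceTask in this example come from PyBrain (pybrain.rl.environments.cartpole). A sketch of the episode loop that Task API supports, with a random policy standing in for the Theano controller shown above:

from numpy.random import uniform
from pybrain.rl.environments.cartpole import CartPoleEnvironment, BalanceTask

env = CartPoleEnvironment()
task = BalanceTask(env, 200, desiredValue=None)
task.reset()
while not task.isFinished():
    obs = task.getObservation()               # pole angle/velocity, cart pos/velocity
    task.performAction(uniform(-50, 50, 1))   # random force; placeholder policy
    r = task.getReward()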