def performAction(self, action):
    self.t += 1
    self.actions_sequence.append(action[0][0])
    # Stack the buffered actions and sensor readings into the critic's
    # (batch, time, feature) input layout.
    predict_input = concatenate(
        [theano_form(self.actions_sequence.data, shape=(N_CBATCH, N_CTIME_STEPS, 1)),
         theano_form(self.sensors_sequence.data, shape=(N_CBATCH, N_CTIME_STEPS, 4))],
        axis=2)
    prediction = self.prediction(predict_input)
    # The last time step of the prediction holds [reward, sensor_1, ..., sensor_4].
    self.sensors = prediction[0][-1][1:]
    print "sensors", self.sensors
    raw_input()  # pause for inspection
    self.sensors_sequence.append(self.sensors)
    self.reward = prediction[0][-1][0]
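# The actions_sequence / sensors_sequence buffers above are only touched via
# .append() and .data, which theano_form reshapes into N_CTIME_STEPS steps.
# A minimal sketch of a fixed-length buffer with that interface follows; this
# is a hypothetical stand-in, not the project's actual class, and it assumes
# the buffer always holds exactly `length` entries, oldest first.
class SequenceBuffer(object):
    def __init__(self, length, n_features=1):
        self.length = length
        # Pre-fill with zeros so .data always has `length` rows.
        self.data = [[0.0] * n_features for _ in range(length)]

    def append(self, item):
        # Drop the oldest entry and keep the newest at the end.
        self.data.pop(0)
        self.data.append(item)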
def train(self):
    print "sending"
    # First send n_time_steps information to the client.
    self.setting.serial.send_int(self.setting.n_time_steps)
    print "sent"
    self.cost = [0] * self.setting.n_iterations
    for n in xrange(self.setting.n_iterations):
        signal = self.setting.serial.receive()
        epoch_data = signal.split(',')  # rm1 is the reward of the last time step
        self.ring_buffer.append(epoch_data)
        buffered_data = self.ring_buffer.get()
        if None not in buffered_data:
            all_data = D.theano_form(list=buffered_data,
                                     shape=[self.setting.n_batches,
                                            self.setting.n_time_steps + 1,
                                            self.setting.n_trans])
            actor_train_inputs = all_data[:, 0:self.setting.n_time_steps, 1:]
            # Predict the action of the actor model.
            action_predict = self.mba.predict(actor_train_inputs)
            critic_train_inputs = np.dstack((action_predict, actor_train_inputs))
            # Rewards from time step 1 to N_TIME_STEPS; the reward takes the
            # first position of each transition.
            critic_train_outputs = all_data[:, 1:, 0].reshape(
                [self.setting.n_batches,
                 self.setting.n_time_steps,
                 self.setting.n_output_features])
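# theano_form / D.theano_form is used throughout to turn raw Python lists into
# float arrays laid out as (batch, time, feature) for the Theano graph. The
# original helper lives in another module (imported as D above); the version
# below is only a minimal sketch of the assumed behaviour.
import numpy as np
import theano


def theano_form(list, shape):
    # Cast to the Theano float type and reshape to the requested layout;
    # the parameter is named `list` to match the keyword call in train().
    return np.array(list, dtype=theano.config.floatX).reshape(shape)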
def one_iteration(task, all_params):
    """
    Given the current value of the policy weights, run one episode on the
    real task and collect its rewards.
    :return: rewards, actions, observations, last observation, total reward
    """
    rewards = []
    observations = []
    actions = []
    _all_params = lasagne.layers.get_all_params(l_action_formed)
    _all_params[0].set_value(theano_form(all_params, shape=(4, 1)))
    task.reset()
    while not task.isFinished():
        obs = task.getObservation()
        observations.append(obs)
        # One time step per call.
        states = theano_form(obs, shape=[N_BATCH, 1, N_INPUT_FEATURES - 1])
        model_action_result = action_prediction(states)
        actions.append(model_action_result.reshape(1))
        task.performAction(model_action_result)
        rewards.append(task.getReward())
    last_obs = task.getObservation()
    return rewards, actions, observations, last_obs, sum(rewards)
def one_sim_iteration(task, all_params):
    """
    Run one episode on the simulated task, whose dynamics and reward are
    estimated by the RNN (an LSTM in our case).
    :return: rewards, actions, observations, last observation, total reward
    """
    rewards = []
    observations = []
    actions = []
    _all_params = lasagne.layers.get_all_params(l_action_formed)
    _all_params[0].set_value(theano_form(all_params, shape=(4, 1)))
    while not task.isFinished():
        obs = task.getObservation()
        observations.append(obs)
        # One time step per call.
        states = theano_form(obs, shape=[N_BATCH, 1, N_INPUT_FEATURES - 1])
        model_action_result = action_prediction(states)
        actions.append(model_action_result.reshape(1))
        task.performAction(model_action_result)
        rewards.append(task.getReward())
    last_obs = task.getObservation()
    return rewards, actions, observations, last_obs, sum(rewards)
def train(self):
    self.build_functions()
    print "sending"
    # First send n_time_steps information to the client.
    self.setting.serial.send_int(self.setting.n_time_steps)
    print "sent"
    self.costs = [0] * self.setting.n_iterations
    for n in xrange(self.setting.n_iterations):
        signal = self.setting.serial.receive()
        epoch_data = signal.split(',')  # rm1 is the reward of the last time step
        self.ring_buffer.append(epoch_data)
        buffered_data = self.ring_buffer.get()
        if None not in buffered_data:
            all_data = D.theano_form(list=buffered_data,
                                     shape=[self.setting.n_batches,
                                            self.setting.n_time_steps + 1,
                                            self.setting.n_trans])
            train_inputs = all_data[:, 0:self.setting.n_time_steps, 1:]
            # Desired output: the rewards from time step 1 to N_TIME_STEPS
            # (the reward is the first element of each transition).
            train_outputs = all_data[:, 1:, 0].reshape(
                [self.setting.n_batches,
                 self.setting.n_time_steps,
                 self.setting.n_output_features])
            self.costs[n] = self._train(train_inputs, train_outputs)
            # Extract the most recent action from the prediction and map it
            # from {0, 1} to {-1, 1}.
            action = self.get_binomial_action(
                self.pred_action(train_inputs)[:, -1]) * 2 - 1
            self.setting.serial.send_int(action)
            if not n % 10:
                cost_val = self.compute_cost(train_inputs, train_outputs)
                model_reward_result = self.predict(train_inputs)
                print "Iteration {} validation cost = {}".format(n, cost_val)
                print "reward predict:"
                print model_reward_result
                print "train results:"
                print train_outputs
                print "predicted action"
                print self.pred_action(train_inputs)[:, -1]
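# self.ring_buffer above is only used via .append() and .get(); a .get() that
# still contains None means the buffer has not yet been filled once. A minimal
# sketch of a buffer with that behaviour follows; it is a hypothetical
# stand-in, not necessarily the project's actual class.
class RingBuffer(object):
    def __init__(self, size):
        self.size = size
        self.buffer = [None] * size
        self.index = 0

    def append(self, item):
        self.buffer[self.index % self.size] = item
        self.index += 1

    def get(self):
        # Oldest-to-newest view; contains None until `size` items were appended.
        i = self.index % self.size
        return self.buffer[i:] + self.buffer[:i]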
# create environment
env = CartPoleEnvironment()
# create task
task = BalanceTask(env, 200, desiredValue=None)
sim_task = SimBalanceTask(prediction=reward_prediction, maxsteps=200)
all_params = lasagne.layers.get_all_params(l_action_formed)
records = []
for time in xrange(50):
    records.append([])
    _all_params = lasagne.layers.get_all_params(l_action_formed)
    _all_params[0].set_value(theano_form(uniform(-0.1, 0.1, 4), shape=(4, 1)))
    baseline = None
    num_parameters = 4  # four policy parameters
    init_sigma = 3      # initial exploration sigma
    sigmas = ones(num_parameters) * init_sigma
    best_reward = -1000
    current = all_params[0].get_value()[:, 0]
    arg_reward = []
    previous_cost = 10000
    real_world_sample_counts = 0
    thinking_count = 0
    for n in xrange(1500):
        epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
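# sample_parameter() above draws the symmetric perturbation used by the
# SPSA/PGPE-style update. The original helper is not shown in this listing;
# the sketch below assumes Gaussian exploration with per-parameter standard
# deviations, and the epsilon_star return value is kept only to match the
# original interface.
from numpy.random import normal


def sample_parameter(sigmas):
    # One perturbation is added to / subtracted from the current parameters.
    epsilon = normal(0.0, sigmas)
    epsilon_star = normal(0.0, sigmas)
    return epsilon, epsilon_star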
def main():
    # create environment
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200, desiredValue=None)
    sim_task = SimBalanceTask(prediction=reward_prediction, maxsteps=200)
    all_params = lasagne.layers.get_all_params(l_action_formed)
    records = []
    real_world_sample_counts = []
    for time in xrange(50):
        records.append([])
        _all_params = lasagne.layers.get_all_params(l_action_formed)
        _all_params[0].set_value(theano_form(uniform(-0.1, 0.1, 4), shape=(4, 1)))
        baseline = None
        num_parameters = 4  # four policy parameters
        init_sigma = 3      # initial exploration sigma
        sigmas = ones(num_parameters) * init_sigma
        best_reward = -1000
        current = all_params[0].get_value()[:, 0]
        arg_reward = []
        previous_cost = 10000
        real_world_sample_count = 0
        thinking_count = 0
        cost_confidence = 2
        for n in xrange(1500):
            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
            if previous_cost <= cost_confidence:
                # The critic is trusted: roll out both perturbations in the simulated task.
                rewards1, actions1, observations1, last_obs1, reward1 = one_sim_iteration(
                    sim_task, all_params=current + epsilon)
                rewards2, actions2, observations2, last_obs2, reward2 = one_sim_iteration(
                    sim_task, all_params=current - epsilon)
                thinking_count += 1
                if thinking_count == 2:
                    # After two consecutive simulated iterations, force a real-world rollout.
                    previous_cost = 10000
                    thinking_count = 0
            else:
                # Perform actions in the real environment.
                rewards1, actions1, observations1, last_obs1, reward1 = one_iteration(
                    task=task, all_params=current + epsilon)
                real_world_sample_count += 1
                if reward1 > best_reward:
                    best_reward = reward1
                rewards2, actions2, observations2, last_obs2, reward2 = one_iteration(
                    task=task, all_params=current - epsilon)
                real_world_sample_count += 1
                if reward2 > best_reward:
                    best_reward = reward2

                # Prepare the data gathered from the first rollout.
                actions1 = theano_form(actions1, shape=(len(actions1), 1))
                observations1 = theano_form(observations1, shape=(len(observations1), 4))
                predicted_obs1 = concatenate([observations1[1:], [last_obs1]])
                input_data1 = concatenate([actions1, observations1], axis=1)
                output_data1 = concatenate(
                    [theano_form(rewards1, shape=(len(rewards1), 1)), predicted_obs1],
                    axis=1)
                # Chunk the first rollout into critic training sequences.
                critic_train_inputs1 = list(chunks(input_data1, N_CTIME_STEPS))
                critic_train_outputs1 = list(chunks(output_data1, N_CTIME_STEPS))

                # Prepare the data gathered from the second rollout.
                actions2 = theano_form(actions2, shape=(len(actions2), 1))
                observations2 = theano_form(observations2, shape=(len(observations2), 4))
                predicted_obs2 = concatenate([observations2[1:], [last_obs2]])
                input_data2 = concatenate([actions2, observations2], axis=1)
                output_data2 = concatenate(
                    [theano_form(rewards2, shape=(len(rewards2), 1)), predicted_obs2],
                    axis=1)
                # Chunk the second rollout into critic training sequences.
                critic_train_inputs2 = list(chunks(input_data2, N_CTIME_STEPS))
                critic_train_outputs2 = list(chunks(output_data2, N_CTIME_STEPS))

                # Anneal the critic cost target down to cost_confidence.
                train_base_line = (700 - n * 6) / 2 if (700 - n * 6) / 2 > cost_confidence else cost_confidence

                count1 = 0
                while True:
                    count1 += 1
                    costs1 = []
                    for input, output in zip(critic_train_inputs1, critic_train_outputs1):
                        critic_train_input = theano_form(
                            input, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(
                            output, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs1.append(train(critic_train_input, critic_train_output))
                    if mean(costs1) < train_base_line:
                        break
                    else:
                        if not count1 % 50:
                            print mean(costs1)
                            # print "mean cost 1: ", mean(costs1), "baseline :", train_base_line
                    if count1 > 1:
                        break

                count2 = 0
                while True:
                    count2 += 1
                    costs2 = []
                    for input, output in zip(critic_train_inputs2, critic_train_outputs2):
                        critic_train_input = theano_form(
                            input, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(
                            output, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs2.append(train(critic_train_input, critic_train_output))
                    if mean(costs2) < train_base_line:
                        break
                    else:
                        if not count2 % 50:
                            print mean(costs2)
                            # print "mean cost 2: ", mean(costs2), "baseline :", train_base_line
                    if count2 > 1:
                        break

                previous_cost = sum(costs1) + sum(costs2)

            mreward = (reward1 + reward2) / 2.

            if baseline is None:
                # first learning step
                baseline = mreward
                fakt = 0.
                fakt2 = 0.
            else:
                # calc the gradients
                if reward1 != reward2:
                    # gradient estimate a la SPSA, but with likelihood gradient and normalization
                    fakt = (reward1 - reward2) / (2. * best_reward - reward1 - reward2)
                else:
                    fakt = 0.
                # normalized sigma gradient with moving-average baseline
                norm = (best_reward - baseline)
                if norm != 0.0:
                    fakt2 = (mreward - baseline) / (best_reward - baseline)
                else:
                    fakt2 = 0.0

            # update baseline
            baseline = 0.9 * baseline + 0.1 * mreward
            # update parameters and sigmas
            current = current + LEARNING_RATE * fakt * epsilon
            if fakt2 > 0:
                # for sigma adaptation the alg. follows only positive gradients;
                # apply the sigma update locally
                sigmas = sigmas + LEARNING_RATE * fakt2 * (epsilon * epsilon - sigmas * sigmas) / sigmas

            # Test set
            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
            _, _, _, _, test_reward1 = one_iteration(task=task, all_params=current + epsilon)
            _, _, _, _, test_reward2 = one_iteration(task=task, all_params=current - epsilon)
            test_mreward = (test_reward1 + test_reward2) / 2.0
            arg_reward.append(test_mreward)

            print n
            if not n % 10:
                print "test_reward 1:", test_reward1
                _, _, _, _, sim_test_reward1 = one_sim_iteration(task=sim_task, all_params=current + epsilon)
                print "simulated reward 1:", sim_test_reward1
                print "test_reward 2:", test_reward2
                _, _, _, _, sim_test_reward2 = one_sim_iteration(task=sim_task, all_params=current - epsilon)
                print "simulated reward 2:", sim_test_reward2
                print "previous_cost :", previous_cost
                print "real_world_samples :", real_world_sample_count
                temp_arg = sum(arg_reward) / len(arg_reward)
                records[time].append([real_world_sample_count, temp_arg])
                print "best reward:", best_reward, "average reward:", temp_arg
                print
                arg_reward = []

        real_world_sample_counts.append(real_world_sample_count)

    # print records
    pickle.dump(records, open("records_lambda_mu.p", "wb"))
    pickle.dump(real_world_sample_counts, open("real_world_sample_counts_mu.p", "wb"))
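# chunks() above splits a rollout into consecutive N_CTIME_STEPS-long training
# sequences for the critic. The original helper is not shown in this listing;
# the generator below is a minimal sketch of the assumed behaviour.
def chunks(data, n):
    # Yield successive n-sized pieces of `data`; a trailing shorter piece would
    # need padding or dropping before being reshaped for the critic.
    for i in xrange(0, len(data), n):
        yield data[i:i + n]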