def get_reward_distribution():
    global mean_list
    mean_list = np.zeros(10)  # get the 10 q*(a)
    for i in range(10):
        mean_list[i] = rand_norm(0.0, 1.0)
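# All of these snippets call rand_norm, whose definition is not shown. A
# minimal stand-in, assuming the second argument is the standard deviation
# (with sigma = 1.0 this reads the same as the "variance = 1" comments below):
import numpy as np

def rand_norm(mean, sigma):
    # one sample from a Gaussian with the given mean and standard deviation
    return np.random.normal(mean, sigma)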
def env_step(this_action):
    # returns (floating point, NumPy array, Boolean), this_action: NumPy array
    global this_reward_observation, rewards
    avg_reward = rewards[int(this_action[0])]
    the_reward = rand_norm(avg_reward, 1.0)  # reward drawn from a (q*(a), 1) Gaussian
    this_reward_observation = (the_reward, this_reward_observation[1], False)
    return this_reward_observation
def env_step(this_action):
    # returns (floating point, NumPy array, Boolean), this_action: NumPy array
    global this_reward_observation
    the_reward = rand_norm(reward_distribution[int(this_action)], 1.0)
    this_reward_observation = (the_reward, this_reward_observation[1], False)
    return this_reward_observation
def env_step(this_action):
    # returns (floating point, NumPy array, Boolean), this_action: NumPy array
    global this_reward_observation
    # reward drawn from a Gaussian with the chosen arm's mean and unit variance
    the_reward = rand_norm(distributions[int(this_action)], 1.0)
    this_reward_observation = (the_reward, this_reward_observation[1], False)
    return this_reward_observation
def env_init():
    global this_reward_observation, q
    local_observation = np.zeros(0)  # An empty NumPy array
    this_reward_observation = (0.0, local_observation, False)
    # set up q*(a); allocate q before filling it
    q = np.zeros(10)
    for i in range(10):
        q[i] = rand_norm(0.0, 1.0)
def env_init():
    global this_reward_observation, reward_distribution
    local_observation = np.zeros(0)  # An empty NumPy array
    reward_distribution = np.zeros(10)
    for i in range(10):
        reward_distribution[i] = rand_norm(0.0, 1.0)
    this_reward_observation = (0.0, local_observation, False)
def __init__(self, number_of_arms):
    super(KArmedTestbed, self).__init__()
    self.number_of_arms = number_of_arms
    # true value q*(a) of each arm, drawn from N(0, 1)
    self.action_value = np.array(
        [rand_norm(0, 1.0) for i in np.arange(number_of_arms)])
    self.current_state = np.zeros(0)
    # actions are numbered 1..number_of_arms
    self.available_actions = np.array(
        [i for i in np.arange(1, number_of_arms + 1, 1)])
def createDistribution():
    # fill distributions with the mean of each arm's reward distribution
    global distributions
    distributions = []
    for i in range(10):
        distributions.append(rand_norm(0.0, 1.0))
    return
def env_init():
    global this_reward_observation, bandits, opt_act
    local_observation = np.zeros(0)  # An empty NumPy array
    bandits = np.zeros(10)
    opt_act = 0
    for i in range(10):  # xrange is Python 2 only
        bandits[i] = rand_norm(0, 1)
        if bandits[opt_act] < bandits[i]:
            opt_act = i
    this_reward_observation = (0.0, local_observation, False)
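# The opt_act bookkeeping above supports the testbed's standard "% optimal
# action" metric. A minimal sketch of that use, assuming a 1-D array of the
# actions chosen during one run (percent_optimal is a hypothetical helper,
# not part of the source):
def percent_optimal(actions_taken, opt_act):
    # fraction of steps on which the agent picked the optimal arm
    return np.mean(np.asarray(actions_taken) == opt_act)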
def env_step(this_action):
    # returns (floating point, NumPy array, Boolean), this_action: NumPy array
    global mean_q_actions_array
    global this_reward_observation
    action = int(this_action)
    # obtain a reward for the action taken; the value is drawn from a
    # distribution with mean = the mean value for that action and variance = 1
    the_reward = rand_norm(mean_q_actions_array[action], 1)
    this_reward_observation = (the_reward, this_reward_observation[1], False)
    # returns (reward, state, and whether a terminal state was hit)
    return this_reward_observation
def env_step(this_action):
    # returns (floating point, NumPy array, Boolean), this_action: NumPy array
    global this_reward_observation, real_reward, q
    # rewards drawn from (q(At), 1) Gaussian; real_reward and q are set up in env_init
    for i in range(10):
        real_reward[i] = rand_norm(q[i], 1)
    the_reward = real_reward[int(this_action)]
    # print("the reward for action %d is %f, optimal is %d, optimal value is %f"
    #       % (int(this_action), the_reward,
    #          max(enumerate(real_reward), key=lambda x: x[1])[0],
    #          max(real_reward)))
    this_reward_observation = (the_reward, this_reward_observation[1], False)
    return this_reward_observation
def env_init():
    global this_reward_observation
    global mean_q_actions_array
    global num_actions
    # initialize the environment's variables, depending on the number of actions available
    num_actions = 10
    mean_q_actions_array = np.zeros(num_actions)
    # initialize the mean value for each action using a Gaussian distribution (mean = 0, variance = 1)
    for i in range(num_actions):
        mean_q_actions_array[i] = rand_norm(0.0, 1.0)
    local_observation = np.zeros(0)  # An empty NumPy array
    this_reward_observation = (0.0, local_observation, False)
def env_get_reward(this_action):
    global mean_list
    the_reward = rand_norm(mean_list[int(this_action[0])], 1.0)
    return the_reward
def _arms(self, k):
    tmp = np.zeros(k)
    for cnt in range(k):
        tmp[cnt] = rand_norm(0, 1)
    # print('q* is {}'.format(tmp))
    return tmp
def q():
    global qa
    qa = []
    for i in range(num_actions):
        qa.append(rand_norm(0.0, 1.0))
    return
def do_action(self, action):
    # nonstationary variant: every arm's true value takes an independent
    # random-walk step before the reward is sampled
    for a in self.available_actions:
        self.action_value[a - 1] += rand_norm(0.0, 0.01)
    return rand_norm(self.action_value[action - 1], 1.0)  # actions are 1-based
def do_action(self, action):
    # state update goes here (none needed in the stationary case)
    return rand_norm(self.action_value[action - 1], 1.0)  # actions are 1-based
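# A sketch of how the class-based fragments might be exercised, assuming the
# __init__ and do_action methods above are assembled into one KArmedTestbed
# class (the random action choice is a hypothetical placeholder, not part of
# the source). available_actions is 1-based, hence the action - 1 indexing in
# do_action:
testbed = KArmedTestbed(10)
action = int(np.random.choice(testbed.available_actions))
reward = testbed.do_action(action)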
def env_init():
    global this_reward_observation
    local_observation = np.zeros(0)  # An empty NumPy array
    # actions and qtrue are assumed to be module-level globals defined elsewhere
    for i in range(actions):
        qtrue[i] = rand_norm(0.0, 1.0)
    this_reward_observation = (0.0, local_observation, False)
def env_init():
    global arms_centres
    arms_centres = np.zeros(arm_count)  # allocate before filling
    for i in range(arm_count):
        arms_centres[i] = rand_norm(0, 1)
def reward(self, action):
    # print('Action is ' + str(action))
    index = int(action)
    return rand_norm(self._q_array[index], 1)
def env_init():
    global this_reward_observation, mode, rewards
    local_observation = np.zeros(0)  # An empty NumPy array
    rewards = np.zeros(10)  # allocate before filling
    for i in range(10):
        rewards[i] = rand_norm(0.0, 1.0)
    this_reward_observation = (0.0, local_observation, False)
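# A minimal driver tying the env_init/env_step variants together, assuming
# the RL-Glue convention that this_action is a one-element NumPy array (the
# uniformly random agent is a hypothetical placeholder, not part of the
# source):
env_init()
for step in range(1000):
    this_action = np.array([np.random.randint(10)])
    the_reward, observation, terminal = env_step(this_action)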