def test_step():
    # create an empty 4x4 state space
    state_space = StateSpace(
        4,
        4,
        goal_1_position=(1, 3),
        goal_2_position=(3, 3),
        forbidden_position=(2, 3),
        wall_position=(2, 2),
    )

    # start state
    start_state = state_space.get_state(0, 1)

    # create a new agent with this state space
    agent = Agent(
        state_space=state_space,
        start_state=start_state,
        learning_rate=0.1,
        discount_rate=0.2,
        exploit_prob=0.1,
        living_reward=-0.1,
    )

    # take a step and confirm the agent returns a result
    assert agent.step() is not None
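# The test above can be run with pytest, assuming StateSpace and Agent are
# importable from the project's package (their module paths are not shown
# in this snippet):
#
#   pytest -k test_step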
import numpy as np

# NOTE: StateSpace is assumed to be importable from this project's own modules.


class Markov():
    def __init__(self, metric_list, train_data):
        self.train_data = train_data
        self.metric_list = metric_list
        self.n_mets = len(self.metric_list)

        # state space
        self.X = StateSpace(self.metric_list, self.train_data)
        self.state_map = self.X.state_map
        self.inv_map = self.X.inverse_map
        self.h_map = self.X.h_state_map
        self.state_poss = self.X.state_poss_seq
        self.n_states = len(self.state_map)
        self.trans_probs = np.zeros((len(self.h_map), len(self.state_poss)))

        # seed the generated sequence with a uniformly random starting state
        self.markov_data = []
        self.markov_data.append(int(self.n_states * np.random.random()))

    def fit(self):
        self.state_seq = []
        print('Generating transition probabilities...')
        for i in range(self.X.pts_required, len(self.train_data) - self.X.pts_required):
            self.state_seq.append(
                self.X.get_state(self.train_data.iloc[i - self.X.pts_required:i]))
            state_tuple = self.inv_map[self.state_seq[-1]]
            h_seq = []
            s_seq = []
            ctr = 0
            for metric in self.X.metric_list:
                # find the history sub-sequence this metric's state maps to,
                # and the value that follows it
                for j in range(len(metric.poss_h_seq)):
                    if metric.poss_seq[state_tuple[ctr]][0:metric.markov_mem] == metric.poss_h_seq[j]:
                        h_seq.append(j)
                        s_seq.append(metric.poss_seq[state_tuple[ctr]][metric.markov_mem])
                ctr += 1
                if ctr == self.n_mets:
                    # all metrics processed: count this (history -> next state) transition
                    for k in range(len(self.state_poss)):
                        if tuple(s_seq) == self.state_poss[k]:
                            s_idx = k
                    row = self.h_map[tuple(h_seq)]
                    self.trans_probs[row, s_idx] += 1

        # normalize each row of counts into transition probabilities
        row_cnt = []
        for i in range(len(self.trans_probs)):
            row_cnt.append(np.sum(self.trans_probs[i, :]))
        for i in range(len(self.trans_probs)):
            if row_cnt[i] != 0:
                self.trans_probs[i, :] = self.trans_probs[i, :] / row_cnt[i]
        print('done!')

    def generate(self, data_len: int):
        """data_len = 93600 is one quarter (1Q) worth of data."""
        print('generating markov data...')
        for i in range(data_len):
            state_tuple = self.inv_map[self.markov_data[-1]]
            seq = []
            h_seq = []
            ctr = 0
            for metric in self.X.metric_list:
                # recover each metric's current history sub-sequence
                for j in range(len(metric.poss_h_seq)):
                    if metric.poss_seq[state_tuple[ctr]][-metric.markov_mem:] == metric.poss_h_seq[j]:
                        h_seq.append(j)
                        seq.append(metric.poss_h_seq[j])
                ctr += 1
            row = self.h_map[tuple(h_seq)]

            # sample the next joint state via the row's cumulative distribution;
            # the cumulative sums must be inclusive (:k + 1) -- the original
            # exclusive slice (:k) introduced an off-by-one in the sampled index
            rand = np.random.random()
            probs = []
            for k in range(len(self.trans_probs[row, :])):
                probs.append(np.sum(self.trans_probs[row, :k + 1]))
            s_idx = 0
            while rand >= probs[s_idx] and s_idx < len(probs) - 1:
                s_idx += 1

            # append the sampled value to each metric's history and map the
            # extended sequences back to a joint state index
            for l in range(self.n_mets):
                seq[l] += tuple([self.state_poss[s_idx][l]])
            state = []
            ctr = 0
            for metric in self.X.metric_list:
                state.append(metric.markov_dict[seq[ctr]])
                ctr += 1
            s = self.state_map[tuple(state)]
            self.markov_data.append(s)
        print('done!')
        return self.markov_data
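# Usage sketch (an assumption, not from the source): `metrics` and `prices`
# stand in for a configured metric list and the price DataFrame this repo
# trains on. fit() estimates the transition matrix from history; generate()
# then samples a synthetic state sequence roughly one quarter long.
#
#   mkv = Markov(metric_list=metrics, train_data=prices)
#   mkv.fit()
#   synthetic_states = mkv.generate(data_len=93600)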
import numpy as np
import pandas as pd

# NOTE: ActionSpace and StateSpace are assumed to be importable from this
# project's own modules.


class Qagent():
    def __init__(self, num_ccy: int, precision: int, gamma: float,
                 metric_list: list, train_data: pd.DataFrame):
        self.num_ccy = num_ccy
        self.precision = precision
        self.gamma = gamma
        self.metric_list = metric_list
        self.train_data = train_data

        self.A = ActionSpace(self.num_ccy, self.precision)
        self.a_space = self.A.actions
        self.X = StateSpace(self.metric_list, self.train_data)
        self.state_map = self.X.state_map
        self.n_states = len(self.state_map)
        self.n_actions = len(self.a_space)
        self.q_table = np.zeros((self.n_states, self.n_actions))
        self.lr_table = np.zeros((self.n_states, self.n_actions))

    def train(self, data=None, markov=False):
        if markov:
            train_start = 0
            train_end = len(data) - 1
        else:
            train_start = self.X.pts_required
            train_end = len(self.train_data) - 1

        print('Training q_table...')
        for i in range(train_start, train_end):
            # initialize state
            if markov:
                s_idx = data[i]
            else:
                s_idx = self.X.get_state(
                    self.train_data.iloc[i - self.X.pts_required:i])

            # if all actions for a state have zero value, select one at random
            if np.sum(self.q_table[s_idx, :]) == 0:
                # randint selects ints from 0 up to but not including n_actions
                a_idx = np.random.randint(0, self.n_actions)
            # otherwise select the action with the largest value in state s
            else:
                a_idx = np.argmax(self.q_table[s_idx, :])

            # get action (an allocation) from the action space
            a = self.a_space[a_idx]

            # price-relative vector: cash returns 1, the asset returns next/current
            price = self.train_data.iloc[i - 1].values[0]
            next_price = self.train_data.iloc[i].values[0]
            b = [1, next_price / price]

            # reward is the log return of the chosen allocation
            r = np.log(np.dot(a, b))

            # harmonic learning-rate schedule: alpha = 1 / visit count
            if self.lr_table[s_idx, a_idx] == 0:
                alpha = 1
            else:
                alpha = 1 / self.lr_table[s_idx, a_idx]

            # get the next state from the state space (states treated as iid)
            if markov:
                next_s_idx = data[i + 1]
            else:
                next_s_idx = self.X.get_state(
                    self.train_data.iloc[i + 1 - self.X.pts_required:i + 1])

            # standard Q-learning update: Q += alpha * (r + gamma * max Q' - Q);
            # the original placed r outside alpha, letting rewards accumulate unscaled
            self.q_table[s_idx, a_idx] += alpha * (
                r + self.gamma * np.max(self.q_table[next_s_idx, :])
                - self.q_table[s_idx, a_idx])

            # update visit count for the learning-rate schedule
            self.lr_table[s_idx, a_idx] += 1
        print('done!')
        return self.q_table, self.lr_table
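# Usage sketch (an assumption, not from the source): the hyperparameter values
# below are illustrative, and `metrics` / `prices` / `synthetic_states` are the
# same stand-ins as in the Markov sketch above. The Q-table can be trained
# either directly on historical prices or on a Markov-generated state sequence.
#
#   agent = Qagent(num_ccy=2, precision=10, gamma=0.95,
#                  metric_list=metrics, train_data=prices)
#   q_hist, lr_hist = agent.train()                                  # historical
#   q_mkv, lr_mkv = agent.train(data=synthetic_states, markov=True)  # synthetic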