Example #1
def test_step():
    # create an empty 4x4 state space with two goals, a forbidden cell, and a wall
    state_space = StateSpace(4,
                             4,
                             goal_1_position=(1, 3),
                             goal_2_position=(3, 3),
                             forbidden_position=(2, 3),
                             wall_position=(2, 2))

    # start state
    start_state = state_space.get_state(0, 1)

    # create a new agent with this state space
    agent = Agent(
        state_space=state_space,
        start_state=start_state,
        learning_rate=0.1,
        discount_rate=0.2,
        exploit_prob=0.1,
        living_reward=-0.1,
    )
    # step
    assert agent.step() is not None
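A quick way to see what this test exercises is to build the same 4x4 grid outside of pytest and step the agent a few times. The snippet below is only a sketch: it reuses the constructor arguments from the test above and assumes that agent.step() returns the state reached, which the test itself does not confirm (it only asserts the return value is not None).

# minimal sketch reusing the fixtures from the test above; the return value of
# agent.step() is assumed to be the state reached (assumption, not confirmed)
state_space = StateSpace(4,
                         4,
                         goal_1_position=(1, 3),
                         goal_2_position=(3, 3),
                         forbidden_position=(2, 3),
                         wall_position=(2, 2))
agent = Agent(
    state_space=state_space,
    start_state=state_space.get_state(0, 1),
    learning_rate=0.1,
    discount_rate=0.2,
    exploit_prob=0.1,
    living_reward=-0.1,
)

for _ in range(10):
    print(agent.step())  # inspect where the agent ends up after each step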
Example #2
import numpy as np


class Markov:
    def __init__(self, metric_list, train_data):

        self.train_data = train_data
        self.metric_list = metric_list
        self.n_mets = len(self.metric_list)

        # state space
        self.X = StateSpace(self.metric_list, self.train_data)
        self.state_map = self.X.state_map
        self.inv_map = self.X.inverse_map
        self.h_map = self.X.h_state_map
        self.state_poss = self.X.state_poss_seq
        self.n_states = len(self.state_map)

        self.trans_probs = np.zeros((len(self.h_map), len(self.state_poss)))

        # seed the generated sequence with a random starting state index
        self.markov_data = []
        self.markov_data.append(int(self.n_states * np.random.random()))

    def fit(self):
        """Estimate the transition probability matrix by counting observed transitions."""
        self.state_seq = []
        print('Generating transition probabilities...')
        for i in range(self.X.pts_required,
                       len(self.train_data) - self.X.pts_required):

            self.state_seq.append(
                self.X.get_state(self.train_data.iloc[i -
                                                      self.X.pts_required:i]))
            state_tuple = self.inv_map[self.state_seq[-1]]

            # split the state into a history index per metric (h_seq) and the
            # value that follows that history (s_seq)
            h_seq = []
            s_seq = []
            ctr = 0
            for metric in self.X.metric_list:
                # find which of the metric's possible histories matches the
                # first markov_mem elements of this state's sequence
                for j in range(len(metric.poss_h_seq)):
                    if metric.poss_seq[state_tuple[ctr]][
                            0:metric.markov_mem] == metric.poss_h_seq[j]:
                        h_seq.append(j)

                # the value observed immediately after that history
                s_seq.append(
                    metric.poss_seq[state_tuple[ctr]][metric.markov_mem])
                ctr += 1

                # after the last metric, locate the column index of the
                # observed successor combination
                if ctr == self.n_mets:
                    for k in range(len(self.state_poss)):
                        if tuple(s_seq) == self.state_poss[k]:
                            s_idx = k

            # count the transition: history row -> successor column
            row = self.h_map[tuple(h_seq)]
            self.trans_probs[row, s_idx] += 1

        # normalize every row that was observed at least once so it sums to 1
        row_cnt = []
        for i in range(len(self.trans_probs)):
            row_cnt.append(np.sum(self.trans_probs[i, :]))

        for i in range(len(self.trans_probs)):
            if row_cnt[i] != 0:
                self.trans_probs[i, :] = self.trans_probs[i, :] / row_cnt[i]

        print('done!')
        #return self.trans_probs

    def generate(self, data_len: int):
        """data_len = 93600 is 1Q worth of data"""

        print('generating markov data...')
        for i in range(data_len):

            state_tuple = self.inv_map[self.markov_data[-1]]

            seq = []
            h_seq = []
            s_seq = []
            ctr = 0
            for metric in self.X.metric_list:
                for j in range(len(metric.poss_h_seq)):
                    if metric.poss_seq[state_tuple[ctr]][
                            -metric.markov_mem:] == metric.poss_h_seq[j]:
                        h_seq.append(j)
                        seq.append(metric.poss_h_seq[j])

                s_seq.append(
                    metric.poss_seq[state_tuple[ctr]][metric.markov_mem])
                ctr += 1

            row = self.h_map[tuple(h_seq)]
            rand = np.random.random()

            # inverse-CDF sampling: build the cumulative probabilities for this
            # history row, then pick the first column whose cumulative value
            # exceeds the random draw (the cumulative sum must include column k,
            # otherwise column 0 can never be sampled)
            probs = []
            for k in range(len(self.trans_probs[row, :])):
                probs.append(np.sum(self.trans_probs[row, :k + 1]))

            s_idx = 0
            while rand >= probs[s_idx] and s_idx < len(probs) - 1:
                s_idx += 1

            # append the sampled value for each metric to its history sequence
            for m in range(self.n_mets):
                seq[m] += (self.state_poss[s_idx][m],)

            # map the extended sequences back to a state index
            state = []
            ctr = 0

            for metric in self.X.metric_list:
                state.append(metric.markov_dict[seq[ctr]])
                ctr += 1

            s = self.state_map[tuple(state)]

            self.markov_data.append(s)

        print('done!')
        return self.markov_data
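StateSpace and the metric objects are external to this snippet, so the following usage is only a sketch: it assumes `metrics` is a list of metric objects exposing poss_seq, poss_h_seq, markov_mem and markov_dict, and that `prices` is a pandas DataFrame of training data. Both names are placeholders, not identifiers from the source.

# minimal usage sketch with placeholder inputs (`metrics`, `prices`)
model = Markov(metric_list=metrics, train_data=prices)
model.fit()                        # count and normalize observed transitions
synthetic = model.generate(1000)   # sample 1000 state indices from the chain
print(synthetic[:10])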
Example #3
import numpy as np
import pandas as pd


class Qagent:
    def __init__(self, num_ccy: int, precision: int, gamma: float,
                 metric_list: list, train_data: pd.DataFrame):

        self.num_ccy = num_ccy
        self.precision = precision
        self.gamma = gamma
        self.metric_list = metric_list
        self.train_data = train_data

        self.A = ActionSpace(self.num_ccy, self.precision)
        self.a_space = self.A.actions
        self.X = StateSpace(self.metric_list, self.train_data)
        self.state_map = self.X.state_map

        self.n_states = len(self.state_map)
        self.n_actions = len(self.a_space)

        self.q_table = np.zeros((self.n_states, self.n_actions))
        self.lr_table = np.zeros((self.n_states, self.n_actions))

    def train(self, data=None, markov=False):

        # when markov=True, `data` is a pre-generated sequence of state indices
        if markov:
            train_start = 0
            train_end = len(data) - 1
        else:
            train_start = self.X.pts_required
            train_end = len(self.train_data) - 1

        print('Training q_table...')
        for i in range(train_start, train_end):

            # initialize state
            if markov:
                s_idx = data[i]
            else:
                s_idx = self.X.get_state(
                    self.train_data.iloc[i - self.X.pts_required:i])

            # if all actions for this state have zero value, randomly select an action
            if np.sum(self.q_table[s_idx, :]) == 0:
                # randint randomly selects ints from 0 up to but not including n_actions
                a_idx = np.random.randint(0, self.n_actions)

            # select the action with largest reward value in state s
            else:
                a_idx = np.argmax(self.q_table[s_idx, :])

            # get action from action space
            a = self.a_space[a_idx]

            # price relative vector over one step: [cash, next_price / price]
            price = self.train_data.iloc[i - 1].values[0]
            next_price = self.train_data.iloc[i].values[0]
            b = [1, next_price / price]
            z = -1
            # reward: log growth of the portfolio under action a
            r = np.log(np.dot(a, b))
            #print(r)

            # choose learning rate
            if self.lr_table[s_idx, a_idx] == 0:
                alpha = 1
            else:
                alpha = 1 / (self.lr_table[s_idx, a_idx])

            # get the next state from the state space (states assumed i.i.d.)
            if markov:
                next_s_idx = data[i + 1]
            else:
                next_s_idx = self.X.get_state(
                    self.train_data.iloc[i + 1 - self.X.pts_required:i + 1])

            #print(next_s_idx)
            # update q table for action a taken in state s
            self.q_table[s_idx, a_idx] += r + alpha * (self.gamma * np.max(
                self.q_table[next_s_idx, :]) - self.q_table[s_idx, a_idx])

            # increment the visit count that drives the decaying learning rate
            self.lr_table[s_idx, a_idx] += 1

        print('done!')
        return self.q_table, self.lr_table
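As above, ActionSpace, StateSpace and the metric objects are not shown here, so the following is only a sketch of how train() appears intended to be called. `metrics` and `prices` are the same placeholders as in the Markov example, and the constructor values are illustrative, not taken from the source.

# minimal usage sketch with placeholder inputs and illustrative parameters
agent = Qagent(num_ccy=2, precision=10, gamma=0.9,
               metric_list=metrics, train_data=prices)

# train directly on the historical data
q_table, lr_table = agent.train()

# or train on a synthetic state sequence produced by the Markov example above
model = Markov(metrics, prices)
model.fit()
q_table, lr_table = agent.train(data=model.generate(10000), markov=True)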