예제 #1
0
    q_start_hist.append(np.max(qlearn.q[obs]))
    while not done:
        # obs = env.reset()
        # action = 1
        # new_obs, reward, done, _ = env.step(action)
        # print(obs, action, reward, new_obs)
        # time.sleep(4)
        # print(obs)
        # print(qlearn.q[obs])
        action = qlearn.chooseAction(obs)
        new_obs, reward, done, _ = env.step(action)
        total_reward += gamma_pow * reward
        gamma_pow *= gamma

        qlearn.learn(obs, action, reward, new_obs, done)
        # if (action == 1):
        # print(obs, action, reward, new_obs)
        # env.render()
        # time.sleep(0.01)
        obs = new_obs

    cum_total_reward += total_reward
    total_reward_hist.append(cum_total_reward)

# Read the learned Q-table, then derive per-state greedy action (argmax over
# axis 1) and best Q-value (max over axis 1), each reshaped to maze_shape.
q = qlearn.q
policy = np.argmax(q, axis=1).reshape(maze_shape)
value = np.max(q, axis=1).reshape(maze_shape)
예제 #2
0
        )  # local step method, applies the action as an offset
        # to the state
        print("reward: ", reward)
        # print("next state observation: ",observation[:3])###

        # Digitize the observation to get a state
        joint1_position, joint2_position, joint3_position = observation[:3]
        nextState = build_state([
            to_bin(joint1_position, joint1_bins),
            to_bin(joint2_position, joint2_bins),
            to_bin(joint3_position, joint3_bins)
        ])

        # print("nextState", nextState)
        if done:
            last_time_steps = numpy.append(last_time_steps, [int(t + 1)])
            break
        else:
            # Q-learn stuff
            #qlearn.learn(state, action, reward, nextState)
            qlearn.learn(state, action, reward, nextState,
                         save_model_with_prefix, it)
            state = nextState

            it += 1  #####

# l = last_time_steps.tolist()
# l.sort()
# print("Overall score: {:0.2f}".format(last_time_steps.mean()))
# print("Best 100 score: {:0.2f}".format(reduce(lambda x, y: x + y, l[-100:]) / len(l[-100:])))
예제 #3
0
class AgentQlearn:
    """Q-learning agent that trains on, and backtests against, an environment.

    The environment object is expected to expose ``T``, ``I`` and
    ``orderbook`` attributes as well as ``createAction``,
    ``determineNextInventory``, ``determineNextTime`` and
    ``determineRuntime`` methods, since those are what this class uses.
    """

    def __init__(self, env):
        """Store the environment and create the underlying ``QLearn`` table.

        Args:
            env: Environment the agent interacts with.
        """
        self.env = env
        # NOTE(review): ``levels`` is not a parameter; it is looked up as a
        # module-level name when the constructor runs -- confirm it is defined.
        self.levels = levels
        self.ai = QLearn(self.levels)

    def update(self, t, i, force_execution=False):
        """Choose an action for state ``(t, i)``, run it and learn from it.

        Args:
            t: Time component of the current state.
            i: Inventory component of the current state.
            force_execution: Forwarded to ``env.createAction``.

        Returns:
            Tuple ``(t_next, i_next)`` as reported by the environment.
        """
        aiState = ActionState(t, i)
        a = self.ai.chooseAction(aiState)
        action = self.env.createAction(level=a,
                                       state=aiState,
                                       force_execution=force_execution)
        action.run(self.env.orderbook)
        i_next = self.env.determineNextInventory(action)
        t_next = self.env.determineNextTime(t)
        reward = action.getReward()
        # Build the successor state from the state held by the action (which
        # also carries the market part), then overwrite time and inventory.
        state_next = ActionState(action.getState().getT(),
                                 action.getState().getI(),
                                 action.getState().getMarket())
        state_next.setT(t_next)
        state_next.setI(i_next)
        self.ai.learn(state1=action.getState(),
                      action1=action.getA(),
                      reward=reward,
                      state2=state_next)
        return (t_next, i_next)

    def train(self, episodes=1, force_execution=False):
        """Run ``update`` from every ``(t, i)`` start state of the environment.

        For each start state, updates are repeated from the returned state
        until the environment reports a next inventory of ``0``.

        Args:
            episodes: Number of full sweeps over ``env.T`` x ``env.I``.
            force_execution: Forwarded to ``update``.

        Raises:
            Exception: If ``force_execution`` is set and the first update
                still leaves a non-zero next inventory.
        """
        for _ in range(int(episodes)):
            for t in self.env.T:
                logging.info("\n" + "t==" + str(t))
                for i in self.env.I:
                    logging.info("     i==" + str(i))
                    logging.info("Action run " + str((t, i)))
                    # Track the state each update starts from so the
                    # transition log shows the real source state.
                    t_cur, i_cur = t, i
                    t_next, i_next = self.update(t_cur, i_cur,
                                                 force_execution)
                    while i_next != 0:
                        if force_execution:
                            raise Exception("Enforced execution left " +
                                            str(i_next) + " unexecuted.")
                        logging.info("Action transition " +
                                     str((t_cur, i_cur)) +
                                     " -> " + str((t_next, i_next)))
                        t_cur, i_cur = t_next, i_next
                        t_next, i_next = self.update(t_cur, i_cur,
                                                     force_execution)

    def backtest(self, q=None, episodes=10, average=False, fixed_a=None):
        """Replay the learned (or a fixed) policy from the last ``(T, I)`` state.

        Args:
            q: Optional Q-table; when given it replaces ``self.ai.q``.
            episodes: Currently unused by this method; kept for interface
                compatibility.
            average: When true, results are passed through
                ``averageBacktest`` before being returned.
            fixed_a: When not ``None``, this action is used at every step
                instead of querying the Q-table.

        Returns:
            List of rows ``[state, midPrice, actions, price, profit]`` (or
            their averaged form). The list is empty when the start state
            could not be looked up in the Q-table.

        Raises:
            Exception: If the Q-table is empty.
        """
        if q is None:
            q = self.ai.q
        else:
            self.ai.q = q

        if not q:
            raise Exception('Q-Table is empty, please train first.')

        Ms = []
        for t in [self.env.T[-1]]:
            logging.info("\n" + "t==" + str(t))
            for i in [self.env.I[-1]]:
                logging.info("     i==" + str(i))
                actions = []
                state = ActionState(t, i, {})
                if fixed_a is not None:
                    a = fixed_a
                else:
                    try:
                        a = self.ai.getQAction(state, 0)
                    except Exception:
                        # State might not be in the Q-table yet; more
                        # training is required. (Not a bare ``except`` so
                        # KeyboardInterrupt/SystemExit still propagate.)
                        logging.info("State " + str(state) +
                                     " not in Q-Table.")
                        break
                    print("t: " + str(t))
                    print("i: " + str(i))
                    print("Action: " + str(a))
                actions.append(a)
                action = self.env.createAction(level=a,
                                               state=state,
                                               force_execution=False)
                # Reference price is read before the action runs.
                midPrice = action.getReferencePrice()

                action.run(self.env.orderbook)
                i_next = self.env.determineNextInventory(action)
                t_next = self.env.determineNextTime(t)
                while i_next != 0:
                    state_next = ActionState(t_next, i_next, {})
                    if fixed_a is not None:
                        a_next = fixed_a
                    else:
                        try:
                            a_next = self.ai.getQAction(state_next, 0)
                        except Exception:
                            # Unknown follow-up state: stop stepping and
                            # report what has been executed so far.
                            break
                        print("t: " + str(t_next))
                        print("i: " + str(i_next))
                        print("Action: " + str(a_next))
                    actions.append(a_next)

                    runtime_next = self.env.determineRuntime(t_next)
                    action.setState(state_next)
                    action.update(a_next, runtime_next)
                    action.run(self.env.orderbook)
                    i_next = self.env.determineNextInventory(action)
                    t_next = self.env.determineNextTime(t_next)

                price = action.getAvgPrice()
                # Profit is measured against the reference price; the sign
                # depends on the order side.
                if action.getOrder().getSide() == OrderSide.BUY:
                    profit = midPrice - price
                else:
                    profit = price - midPrice
                Ms.append([state, midPrice, actions, price, profit])
        if not average:
            return Ms
        return self.averageBacktest(Ms)

    def averageBacktest(self, M):
        """Collapse backtest rows that share the same state into one row each.

        Args:
            M: Rows of the form ``[state, midPrice, actions, price, profit]``.

        Returns:
            One row per distinct state, in order of first appearance:
            ``[state, midPrice, actions, mean price, mean profit]`` where
            ``midPrice`` and ``actions`` come from the first row of that state.
        """
        N = []
        observed = []
        for x in M:
            state = x[0]
            # Equality-based membership: states need not be hashable.
            if state in observed:
                continue
            observed.append(state)
            matching = [y for y in M if y[0] == state]
            paid = [y[3] for y in matching]
            reward = [y[4] for y in matching]
            N.append([state, x[1], x[2], np.average(paid), np.average(reward)])
        return N

    def run(self, epochs_train=1, epochs_test=10):
        """Optionally train, then backtest and return the mean profit.

        Args:
            epochs_train: Episodes passed to ``train``; training is skipped
                when this is not positive.
            epochs_test: Passed to ``backtest`` as ``episodes``.

        Returns:
            Mean of the profit column of the backtest rows, or NaN when the
            backtest produced no rows.
        """
        if epochs_train > 0:
            self.train(episodes=epochs_train)
        M = self.backtest(episodes=epochs_test, average=False)
        if not M:
            return float("nan")
        # Average the profit column directly: the rows are ragged (variable
        # length action lists), so they cannot be packed into a 2-D array.
        return np.mean([row[4] for row in M])

    def simulate(self, epochs_train=1, epochs_test=10, interval=100):
        """Repeatedly call ``run`` through ``UI.animate``.

        Args:
            epochs_train: Forwarded to ``run``.
            epochs_test: Forwarded to ``run``.
            interval: Forwarded to ``UI.animate``.
        """
        from agent_utils.ui import UI
        UI.animate(lambda: self.run(epochs_train, epochs_test),
                   interval=interval)
예제 #4
0
 def testStateEquality(self):
     """Learn via one ActionState and query via a separately built equal one."""
     learner = QLearn([-1, 0, 1])
     # Two distinct instances constructed from identical arguments.
     first_state = ActionState(1.0, 1.0, {'vol60': 1})
     second_state = ActionState(1.0, 1.0, {'vol60': 1})
     learner.learn(first_state, 1, 1.0, second_state)
     chosen = learner.getQAction(second_state)
     self.assertEqual(chosen, 1)
예제 #5
0
        # Pick an action based on the current state
        action = qlearn.chooseAction(state)
        print("action: ", action)
        # Execute the action and get feedback
        observation, reward, done, info = env.step(action)
        print("reward: ", reward)
        # print("observation: ",observation)
        print("q: ", qlearn.q)

        # Digitize the observation to get a state
        joint1_position, joint2_position, joint3_position = observation[:3]
        nextState = build_state([
            to_bin(joint1_position, joint1_bins),
            to_bin(joint2_position, joint2_bins),
            to_bin(joint3_position, joint3_bins)
        ])

        if done:
            last_time_steps = numpy.append(last_time_steps, [int(t + 1)])
            break
        else:
            # Q-learn stuff
            qlearn.learn(state, action, reward, nextState)
            state = nextState

# Report the mean of all recorded step counts and the mean of the (up to)
# 100 largest ones.
l = sorted(last_time_steps.tolist())
best_100 = l[-100:]
print("Overall score: {:0.2f}".format(last_time_steps.mean()))
# sum() replaces reduce(), which is not a builtin on Python 3.
print("Best 100 score: {:0.2f}".format(sum(best_100) / len(best_100)))