def __init__(self, q, alpha, reward, discount, initial_state, actions):
    self.q = {}
    self.alpha = alpha
    self.reward = reward
    self.discount = discount
    self.states = initial_state
    QLearn.__init__(self, actions, len(initial_state), alpha)
def load_model(actions, input_dir, circuit, experiment, number):
    path = os.path.join(input_dir, circuit, experiment, number)
    q_table_path_file = os.path.join(path, sorted(os.listdir(path))[0])
    # open in binary mode so pickle.load works under Python 3
    qlearn_file = open(q_table_path_file, 'rb')
    model = pickle.load(qlearn_file)
    qlearn = QLearn(actions=actions, alpha=0.2, gamma=0.9, epsilon=0.05)
    qlearn.q = model
    print(
        "\n\n---------------- MODEL LOADED ----------------\n {}\n-----------------------\n\n"
        .format(qlearn_file))
    return qlearn
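# Sketch (not from the source): the counterpart to load_model above would
# persist qlearn.q with pickle. The function name, directory layout, and
# arguments here are assumptions for illustration only.
import os
import pickle


def save_model(qlearn, output_dir, file_name):
    os.makedirs(output_dir, exist_ok=True)
    q_table_path = os.path.join(output_dir, file_name)
    # the snippets in this section store only the Q-table dict, not the agent
    with open(q_table_path, 'wb') as q_file:
        pickle.dump(qlearn.q, q_file)
    return q_table_path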
def __init__(self, orderbook, side, T, I, ai=None, levels=None):
    self.orderbook = orderbook
    self.side = side
    self.levels = levels
    if not ai:
        ai = QLearn(self.levels)  # levels are our qlearn actions
    self.ai = ai
    self.T = T
    self.I = I
def __init__(self, policy="greedy", lvfa=False, dim=7, nA=7, nS=78125,
             epsilon=0.05, alpha=0.01, gamma=0.9, ellgibility_trace=True,
             Q=None):
    self.policy_class = Policy(nA=nA, epsilon=epsilon)
    if policy == "greedy":
        self.policy = self.policy_class.greedy_policy
    if policy == "eps_greedy":
        self.policy = self.policy_class.eps_policy
    if policy == "softmax":
        self.policy = self.policy_class.softmax_policy
    self.reset_ellgibility_trace = self.do_nothing
    if lvfa:
        self.agent = LVFA(dim=dim, nA=nA, nS=nS, alpha=alpha, gamma=gamma,
                          policy=self.policy)
        self.learn = self.agent.learn
        self.chooseAction = self.agent.chooseAction
        self.save_model = self.agent.save_model
        self.load_model = self.agent.load_model
    else:
        self.agent = QLearn(nA=nA, nS=nS, epsilon=epsilon, alpha=alpha,
                            gamma=gamma, Q=Q, policy=self.policy)
        self.chooseAction = self.agent.chooseAction
        self.save_model = self.agent.save_model
        self.load_model = self.agent.load_model
        if ellgibility_trace:
            self.learn = self.agent.learn_ellgibility_trace
            self.reset_ellgibility_trace = self.agent.reset_ellgibility_trace
        else:
            self.learn = self.agent.learn
def testStateEquality(self):
    ai = QLearn([-1, 0, 1])
    a1 = ActionState(1.0, 1.0, {'vol60': 1})
    a2 = ActionState(1.0, 1.0, {'vol60': 1})
    ai.learn(a1, 1, 1.0, a2)
    self.assertEqual(ai.getQAction(a2), 1)
import unittest
from qlearn import QLearn
from action_state import ActionState
import numpy as np


class QlearnTest(unittest.TestCase):
    def testStateEquality(self):
        ai = QLearn([-1, 0, 1])
        a1 = ActionState(1.0, 1.0, {'vol60': 1})
        a2 = ActionState(1.0, 1.0, {'vol60': 1})
        ai.learn(a1, 1, 1.0, a2)
        self.assertEqual(ai.getQAction(a2), 1)

    # def testQTableLookup(self):
    #     actions = [5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -7, -10, -15, -20]
    #     ai = QLearn(actions)
    #     ai.q = np.load('test_q.npy').item()
    #     state = ActionState(30, 0.9, {})
    #     ai.q.get((state, -10))
    #     print(ai.getQAction(state))
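# Sketch (assumption, not the actual qlearn module): a minimal tabular
# Q-learner with the interface the snippets above rely on -- a .q dict keyed
# by (state, action), learn(state1, action1, reward, state2), chooseAction,
# and getQAction returning the greedy action for a state.
import random


class MinimalQLearn:
    def __init__(self, actions, epsilon=0.1, alpha=0.2, gamma=0.9):
        self.q = {}
        self.actions = actions
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma

    def getQ(self, state, action):
        return self.q.get((state, action), 0.0)

    def getQAction(self, state):
        # greedy action with respect to the current Q-table
        return max(self.actions, key=lambda a: self.getQ(state, a))

    def chooseAction(self, state):
        # epsilon-greedy exploration
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        return self.getQAction(state)

    def learn(self, state1, action1, reward, state2):
        # standard Q-learning backup:
        # Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        best_next = max(self.getQ(state2, a) for a in self.actions)
        old = self.getQ(state1, action1)
        self.q[(state1, action1)] = old + self.alpha * (
            reward + self.gamma * best_next - old)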
states_reward = {}
env = gym.wrappers.Monitor(env, outdir, force=True)
plotter = liveplot.LivePlot(outdir)
last_time_steps = np.ndarray(0)
actions = range(env.action_space.n)
counter = 0
estimate_step_per_lap = environment["estimated_steps"]
lap_completed = False
total_episodes = 20000
epsilon_discount = 0.9986  # Default 0.9986

qlearn = QLearn(actions=actions, alpha=0.8, gamma=0.9, epsilon=0.99)

if settings.load_model:
    # file_name = 'qlearn_camera_solved/montreal/2/1_20200928_2303_act_set_simple_epsilon_0.87_QTABLE.pkl'
    file_name = 'qlearn_camera_solved/points_1_actions_simple__simple_circuit/4/1_20200921_2024_act_set_simple_epsilon_0.83_QTABLE.pkl'
    load_model(qlearn, file_name)
    highest_reward = max(qlearn.q.values(), key=stats.get)
else:
    highest_reward = 0
initial_epsilon = qlearn.epsilon

telemetry_start_time = time.time()
start_time = datetime.datetime.now()
start_time_format = start_time.strftime("%Y%m%d_%H%M")
print(settings.lets_go)
# the following actions represent an offset over the existing state,
# modifications over the traditional "step" method of the environment
# are implemented in the "step" function
actions = [(difference_bins, 0.0, 0.0), (-difference_bins, 0.0, 0.0),
           (0.0, difference_bins, 0.0), (0.0, -difference_bins, 0.0),
           (0.0, 0.0, difference_bins), (0.0, 0.0, -difference_bins),
           (0.0, 0.0, 0.0)]

############
# The Q-learn algorithm
# qlearn = QLearn(actions=actions,
#                 alpha=0.2, gamma=0.90, epsilon=0.5, epsilon_decay_rate=0.99)
qlearn = QLearn(actions=actions, alpha=0.2, gamma=0.90, epsilon=0.1,
                epsilon_decay_rate=0.98)

for i_episode in range(30):  # episodes
    print("I_EPISODE", i_episode)

    #####
    observation = env.reset()
    joint1_position, joint2_position, joint3_position = observation[:3]
    state = build_state([
        to_bin(joint1_position, joint1_bins),
        to_bin(joint2_position, joint2_bins),
        to_bin(joint3_position, joint3_bins)
    ])
    else:
        y = [np.mean(np.array(x)) for x in ys]
        y2 = [np.mean(np.array(x)) for x in ys2]
    plt.plot(x, y, 'r-')
    if enable_after_exec_return:
        plt.plot(x, y2, 'g-')
    plt.grid(linestyle='-', linewidth=2)
    plt.show()


# logging.basicConfig(level=logging.DEBUG)
side = OrderSide.BUY
levels = [5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -10, -12, -15]
ai = QLearn(actions=levels, epsilon=0.4, alpha=0.3, gamma=0.8)

# trainBook = 'query_result_train_15m.tsv'
# testBook = 'query_result_train_15m.tsv'
# orderbook = Orderbook(extraFeatures=False)
# orderbook.loadFromBitfinexFile('orderbook_bitfinex_btcusd_view.tsv')
# orderbook_test = Orderbook(extraFeatures=False)
# orderbook_test.loadFromBitfinexFile('orderbook_bitfinex_btcusd_view.tsv')

# Load orderbook
cols = ["ts", "seq", "size", "price", "is_bid", "is_trade", "ttype"]
import pandas as pd
events = pd.read_table('ob-1-small.tsv', sep='\t', names=cols, index_col="seq")
d = Orderbook.generateDictFromEvents(events)
orderbook = Orderbook()
def algo_args(parser):
    parser.add_argument('--algo', choices=['qlearn', 'sarsa', 'esarsa'],
                        default='qlearn')


parser = ArgumentParser()
algo_args(parser)
args = parser.parse_args()

for eps in [0.05, 0.2]:
    fname = '{}-{}.csv'.format(args.algo, eps)
    fpath = os.path.join("exps", fname)
    with open(fpath, "w+") as fp:
        total_rewards = 0
        options = {
            "qlearn": lambda: QLearn(eps=eps),
            "sarsa": lambda: SARSA(eps=eps),
            "esarsa": lambda: ExpectedSARSA(eps=eps)
        }
        algo = options.get(args.algo)()
        for episode in range(10000):
            grid = GridWorld()
            agent = Agent()
            s = agent.position()
            actions = [(0, 1), (1, 0), (-1, 0), (0, -1)]
            a = random.choice(actions)
            episode_reward = 0

            def step(s, a):
                s_ = grid.move(s, a)
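# Sketch (assumption, not the repo's QLearn/SARSA/ExpectedSARSA classes):
# the three choices in the options dict above differ only in the backup
# target used for the tabular update Q[(s, a)] += alpha * (target - Q[(s, a)]).
def backup_target(algo, Q, s_next, a_next, reward, gamma, eps, actions):
    q_next = [Q.get((s_next, a), 0.0) for a in actions]
    if algo == "qlearn":
        # off-policy: bootstrap from the greedy action in s_next
        return reward + gamma * max(q_next)
    if algo == "sarsa":
        # on-policy: bootstrap from the action actually taken in s_next
        return reward + gamma * Q.get((s_next, a_next), 0.0)
    if algo == "esarsa":
        # expected SARSA: expectation of Q under the epsilon-greedy policy
        expectation = (1 - eps) * max(q_next) + eps * sum(q_next) / len(q_next)
        return reward + gamma * expectation
    raise ValueError(algo)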
class AgentQlearn:
    def __init__(self, env):
        self.env = env
        self.levels = levels
        self.ai = QLearn(self.levels)

    def update(self, t, i, force_execution=False):
        aiState = ActionState(t, i)
        a = self.ai.chooseAction(aiState)
        # print('Random action: ' + str(level) + ' for state: ' + str(aiState))
        action = self.env.createAction(level=a, state=aiState,
                                       force_execution=force_execution)
        action.run(self.env.orderbook)
        i_next = self.env.determineNextInventory(action)
        t_next = self.env.determineNextTime(t)
        reward = action.getReward()
        state_next = ActionState(action.getState().getT(),
                                 action.getState().getI(),
                                 action.getState().getMarket())
        state_next.setT(t_next)
        state_next.setI(i_next)
        # print("Reward " + str(reward) + ": " + str(action.getState()) + " with " + str(action.getA()) + " -> " + str(state_next))
        self.ai.learn(state1=action.getState(), action1=action.getA(),
                      reward=reward, state2=state_next)
        return (t_next, i_next)

    def train(self, episodes=1, force_execution=False):
        for episode in range(int(episodes)):
            for t in self.env.T:
                logging.info("\n" + "t==" + str(t))
                for i in self.env.I:
                    logging.info(" i==" + str(i))
                    logging.info("Action run " + str((t, i)))
                    (t_next, i_next) = self.update(t, i, force_execution)
                    while i_next != 0:
                        if force_execution:
                            raise Exception("Enforced execution left " +
                                            str(i_next) + " unexecuted.")
                        logging.info("Action transition " + str((t, i)) +
                                     " -> " + str((t_next, i_next)))
                        (t_next, i_next) = self.update(t_next, i_next,
                                                       force_execution)

    def backtest(self, q=None, episodes=10, average=False, fixed_a=None):
        if q is None:
            q = self.ai.q
        else:
            self.ai.q = q
        if not q:
            raise Exception('Q-Table is empty, please train first.')
        Ms = []
        # T = self.T[1:len(self.T)]
        for t in [self.env.T[-1]]:
            logging.info("\n" + "t==" + str(t))
            for i in [self.env.I[-1]]:
                logging.info(" i==" + str(i))
                actions = []
                state = ActionState(t, i, {})
                # print(state)
                if fixed_a is not None:
                    a = fixed_a
                else:
                    try:
                        a = self.ai.getQAction(state, 0)
                        print("t: " + str(t))
                        print("i: " + str(i))
                        print("Action: " + str(a))
                        # print("Q action for state " + str(state) + ": " + str(a))
                    except:
                        # State might not be in Q-Table yet, more training required.
                        logging.info("State " + str(state) + " not in Q-Table.")
                        break
                actions.append(a)
                action = self.env.createAction(level=a, state=state,
                                               force_execution=False)
                midPrice = action.getReferencePrice()
                # print("before...")
                # print(action)
                action.run(self.env.orderbook)
                # print("after...")
                # print(action)
                i_next = self.env.determineNextInventory(action)
                t_next = self.env.determineNextTime(t)
                # print("i_next: " + str(i_next))
                while i_next != 0:
                    state_next = ActionState(t_next, i_next, {})
                    if fixed_a is not None:
                        a_next = fixed_a
                    else:
                        try:
                            a_next = self.ai.getQAction(state_next, 0)
                            print("t: " + str(t_next))
                            print("i: " + str(i_next))
                            print("Action: " + str(a_next))
                            # print("Q action for next state " + str(state_next) + ": " + str(a_next))
                        except:
                            # State might not be in Q-Table yet, more training required.
                            # print("State " + str(state_next) + " not in Q-Table.")
                            break
                    actions.append(a_next)
                    # print("Action transition " + str((t, i)) + " -> " + str(aiState_next) + " with " + str(runtime_next) + "s runtime.")
                    runtime_next = self.env.determineRuntime(t_next)
                    action.setState(state_next)
                    action.update(a_next, runtime_next)
                    action.run(self.env.orderbook)
                    # print(action)
                    i_next = self.env.determineNextInventory(action)
                    t_next = self.env.determineNextTime(t_next)
                price = action.getAvgPrice()
                # TODO: last column is for the BUY scenario only
                if action.getOrder().getSide() == OrderSide.BUY:
                    profit = midPrice - price
                else:
                    profit = price - midPrice
                Ms.append([state, midPrice, actions, price, profit])
        if not average:
            return Ms
        return self.averageBacktest(Ms)

    def averageBacktest(self, M):
        # Average states within M
        N = []
        observed = []
        for x in M:
            state = x[0]
            if state in observed:
                continue
            observed.append(state)
            paid = []
            reward = []
            for y in M:
                if y[0] == state:
                    paid.append(y[3])
                    reward.append(y[4])
            N.append([state, x[1], x[2], np.average(paid), np.average(reward)])
        return N

    def run(self, epochs_train=1, epochs_test=10):
        if epochs_train > 0:
            agent.train(episodes=epochs_train)
        M = agent.backtest(episodes=epochs_test, average=False)
        M = np.array(M)
        return np.mean(M[0:, 4])

    def simulate(self, epochs_train=1, epochs_test=10, interval=100):
        from agent_utils.ui import UI
        UI.animate(lambda: self.run(epochs_train, epochs_test),
                   interval=interval)
def __init__(self, env):
    self.env = env
    self.levels = levels
    self.ai = QLearn(self.levels)
                          bins=n_bins, retbins=True)[1][1:-1]
joint3_bins = pandas.cut([-numpy.pi / 2, numpy.pi / 2],
                         bins=n_bins, retbins=True)[1][1:-1]
# print("joint1_bins: ", joint1_bins)

# Generate possible actions
# TODO program this
# actions = [item for innerlist in outerlist]
actions = [(0.0, 0.0, 0.0),
           (numpy.pi / 2, numpy.pi / 2, numpy.pi / 2),
           (0, 0, numpy.pi / 2)]

# The Q-learn algorithm
qlearn = QLearn(actions=actions, alpha=0.5, gamma=0.90, epsilon=0.1)

for i_episode in range(30):  # episodes
    observation = env.reset()
    joint1_position, joint2_position, joint3_position = observation[:3]
    state = build_state([
        to_bin(joint1_position, joint1_bins),
        to_bin(joint2_position, joint2_bins),
        to_bin(joint3_position, joint3_bins)
    ])

    for t in range(max_number_of_steps):
        env.render()

        # Pick an action based on the current state
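# Sketch (assumption): build_state and to_bin are not shown in these
# snippets; a common implementation for this discretization pattern digitizes
# each joint reading into its bin index and concatenates the indices into a
# single integer state id.
import numpy


def to_bin(value, bins):
    # index of the bin that `value` falls into
    return numpy.digitize(x=[value], bins=bins)[0]


def build_state(features):
    # e.g. bin indices [3, 0, 7] -> state 307
    return int("".join(map(lambda feature: str(int(feature)), features)))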
from maze_generator import MazeEnv, read_maze
from value_iteration import value_iteration
from policy_iteration import policy_improvement
import numpy as np
from mdp_graph import graph_value_policy
import matplotlib.pyplot as plt
from time import time
from qlearn import QLearn
import time

maze_shape = (32, 32)
maze_file = 'maze/mazeLarge.png'
# p = 0.1

qlearn = QLearn(num_states=32 * 32, num_actions=4, alpha=0.2, gamma=0.99,
                epsilon=0.1, softmax=True)
env = MazeEnv(maze_file=maze_file)

total_reward_hist = []
cum_total_reward = 0
q_start_hist = []
episodes = 10000

for e in range(episodes):
    done = False
    obs = env.reset()
    if (e % 100 == 0):
        print("Episode {}".format(e))
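# Sketch (assumption, not the repo's QLearn): with softmax=True, action
# selection typically samples from a Boltzmann distribution over the Q values
# of the current state instead of acting epsilon-greedily.
import numpy as np


def softmax_action(q_row, temperature=1.0):
    # q_row: 1-D array of Q values for one state, shape (num_actions,)
    prefs = np.asarray(q_row, dtype=float) / temperature
    prefs -= prefs.max()  # subtract the max for numerical stability
    probs = np.exp(prefs) / np.exp(prefs).sum()
    return np.random.choice(len(q_row), p=probs)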
###########################################
# Debug Q Values from QLearningMouse
#
# Curtis Long 20190221
###########################################
f = open("resources/world.txt", 'r')
lines = f.readlines()
f.close()
height = len(lines)
width = max([len(x) for x in lines])

ai = QLearn(actions=range(cfg.directions), alpha=cfg.alpha, gamma=cfg.gamma,
            epsilon=cfg.epsilon)
if (os.path.isfile('mouse.pickle')):
    with open('mouse.pickle', 'rb') as p:
        ai.q = pickle.load(p)
pprint(ai.q)
print('Items: ' + str(len(ai.q)))
# exit()

dirs = [(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)]
actions = range(cfg.directions)
i = 0
j = 0
for line in lines:
    print("\n", end='')
notrl_tot_steps = 0
notrl_returns = []
notrl_steps = []

# create grid-world instance
grid = GridWorld(9)
grid.make_maps()
possible_actions = grid.possible_actions
world = grid.world
grid.list_of_maps.reverse()

# Direct learning on final grid
print("Direct learning on final grid")
qlearn = QLearn(grid.final_grid, possible_actions, world)
Q, returns, episodes, steps = do_task(qlearn, grid, len(grid.list_of_maps) - 1)
notrl_returns.append(returns)
notrl_steps.append(steps)
notrl_tot_steps += steps[-1]
print("-" * 80)

# Incremental transfer learning
print("Incremental transfer learning")
Q = None
for task, current_map in enumerate(grid.list_of_maps):
    print("-" * 50)
    # creates QLearn instance
    exploit = False if task == 0 else True
    qlearn = QLearn(current_map, possible_actions, world, Q)
from maze_generator import WindyMazeEnv, read_maze
from value_iteration import value_iteration
from policy_iteration import policy_improvement
import numpy as np
from mdp_graph import graph_value_policy
import matplotlib.pyplot as plt
from time import time
from qlearn import QLearn
import time

maze_shape = (16, 16)
maze_file = 'maze/maze.png'
p = 0.1

qlearn = QLearn(num_states=16 * 16, num_actions=4, alpha=0.1, gamma=0.9,
                epsilon=0.1)
env = WindyMazeEnv(maze_file=maze_file, wind_prob=p)

total_reward_hist = []
cum_total_reward = 0
q_start_hist = []
episodes = 35000

for e in range(episodes):
    done = False
    obs = env.reset()
    if (e % 1000 == 0):
        print("Episode {}".format(e))
    gamma = 0.9
    gamma_pow = 1
    total_reward = 0
def main(iteration):
    world = 4
    # saving directories
    window = 5  # moving mean window
    main_dir = 'qlearn_plots'
    sub_dir = ['4by4can', '4by4nocan', '9by9']
    sub_sub_dir = ['steps', 'episodes']
    for sub_d in sub_dir:
        for ss_d in sub_sub_dir:
            dir_name = '/'.join([main_dir, sub_d, 'win' + str(window), ss_d])
            if not os.path.exists(dir_name):
                os.makedirs(dir_name)
    # print("-" * 100)

    # Evaluation
    tot_steps = 0
    all_returns = []
    all_steps = []
    all_episodes = []
    notrl_tot_steps = 0
    notrl_returns = []
    notrl_steps = []
    notrl_episodes = []

    # create grid-world instance
    if world == 4:
        canyon = False
        grid = GridWorld(world, canyon)
        if canyon:
            canyon_str = "(CANYON)"
        else:
            canyon_str = "(NO CANYON)"
    elif world == 9:
        canyon_str = ''
        grid = GridWorld(9)
    grid.make_maps()
    possible_actions = grid.possible_actions
    grid.list_of_maps.reverse()

    # Direct learning on final grid
    # print("Direct learning on final grid")
    qlearn = QLearn(grid.final_grid, possible_actions, world)
    Q, returns, episodes, steps = do_task(qlearn, grid,
                                          len(grid.list_of_maps) - 1)
    notrl_returns.append(returns)
    notrl_steps.append(steps)
    notrl_episodes.append(episodes)
    notrl_tot_steps += steps[-1]
    # print("-" * 80)

    # Incremental transfer learning
    # print("Incremental transfer learning", canyon_str)
    Q = None
    for task, current_map in enumerate(grid.list_of_maps, 0):
        # print("-" * 50)
        # creates qlearn instance
        exploit = False if task == 0 else False
        qlearn = QLearn(current_map, possible_actions, world, Q)
        Q, returns, episodes, steps = do_task(qlearn, grid, task, exploit)
        all_returns.append(returns)
        tot_counter = 0
        epi_counter = 0
        if task != 0:
            tot_counter += all_steps[task - 1][-1]
            epi_counter += all_episodes[task - 1][-1]
            all_steps.append([i + tot_counter for i in steps])
            all_episodes.append([i + epi_counter for i in episodes])
        else:
            all_steps.append([i for i in steps])
            all_episodes.append([i for i in episodes])
    # print("-" * 100)
    # print("Incremental Transfer Cumulative total of steps",
    #       all_steps[-1][-1] - all_steps[0][-1])
    # print("Direct Cumulative total of steps", notrl_steps[-1][-1])

    flat_episodes = [item for sublist in all_episodes for item in sublist]
    flat_returns = [item for sublist in all_returns for item in sublist]
    flat_steps = [item for sublist in all_steps for item in sublist]

    tmp_array = np.array(flat_returns)
    notrl_avg_returns = []
    avg_returns = []
    for t in range(len(flat_returns)):
        avg_returns.append(tmp_array[max(0, t - window):(t + 1)].mean())
    notrl_flat_returns = [
        item for sublist in notrl_returns for item in sublist
    ]
    tmp_array_1 = np.array(notrl_flat_returns)
    for t in range(len(notrl_flat_returns)):
        notrl_avg_returns.append(tmp_array_1[max(0, t - window):(t + 1)].mean())

    fig = plt.figure()
    a0 = fig.add_subplot(1, 1, 1)
    val = 0
    for j, i in enumerate(all_steps):
        if j == len(all_steps) - 1:
            a0.axvline(x=i[-1], linestyle='--', color='#ccc5c6',
                       label='Task Switch')
        else:
            a0.axvline(x=i[-1], linestyle='--', color='#ccc5c6')
    a0.plot(flat_steps, avg_returns, label="Task Interpolation",
            color='#d73236', linewidth=1, linestyle='-')
    x_steps = [
        i + all_steps[0][-1] - notrl_steps[0][0] for i in notrl_steps[0]
    ]
    a0.plot(x_steps, notrl_avg_returns, label="Tabula Rasa", color='#80bbe5',
            linestyle='-', linewidth=1)
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
    plt.xlabel("Steps")
    plt.ylabel("Accumulated Reward")
    plt.legend(loc="lower right")
    plt.axis([None, None, -20, 1])
    if world == 4:
        if canyon:
            step_save = ('qlearn_plots/4by4can/' + 'win' + str(window) +
                         '/steps/4by4_canyon_steps')
            plt_title = '4x4 Maze Canyon'
        else:
            step_save = ('qlearn_plots/4by4nocan/' + 'win' + str(window) +
                         '/steps/4by4_nocanyon_steps')
            plt_title = '4x4 Maze Non-Canyon'
    elif world == 9:
        step_save = ('qlearn_plots/9by9/' + 'win' + str(window) +
                     '/steps/9by9_steps')
        plt_title = '9x9 Maze'
    plt.title(plt_title)
    plt.savefig(step_save + iteration + '.eps', format='eps', dpi=1000)
    # fig.show()

    fig1 = plt.figure()
    a1 = fig1.add_subplot(1, 1, 1)
    val = 0
    for j, i in enumerate(all_episodes):
        if j == len(all_episodes) - 1:
            a1.axvline(x=i[-1], linestyle='--', color='#ccc5c6',
                       label='Task Switch')
        else:
            a1.axvline(x=i[-1], linestyle='--', color='#ccc5c6')
    a1.plot(flat_episodes, avg_returns, label="Task Interpolation",
            color='#d73236', linewidth=1, linestyle='-')
    x_episodes = [
        i + all_episodes[0][-1] - notrl_episodes[0][0]
        for i in notrl_episodes[0]
    ]
    a1.plot(x_episodes, notrl_avg_returns, label="Tabula Rasa",
            color='#80bbe5', linestyle='-', linewidth=1)
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
    plt.xlabel("Episodes")
    plt.ylabel("Accumulated Reward")
    plt.legend(loc="lower right")
    plt.axis([None, None, -20, 1])
    plt.title(plt_title)
    if world == 4:
        if canyon:
            epi_save = ('qlearn_plots/4by4can/' + 'win' + str(window) +
                        '/episodes/4by4_canyon_episodes')
        else:
            epi_save = ('qlearn_plots/4by4nocan/' + 'win' + str(window) +
                        '/episodes/4by4_nocanyon_episodes')
    elif world == 9:
        epi_save = ('qlearn_plots/9by9/' + 'win' + str(window) +
                    '/episodes/9by9_episodes')
    plt.savefig(epi_save + iteration + '.eps', format='eps', dpi=1000)