def iterate(self, XA, X1, u, A_low, A_high, ITER=50, Ascaled=False,
            plot=True, xargs=[], output=True, a=0, b=0, pc_samp=1,
            maxT=60000, eta=0.8, tilesg=False, sg_prop=0.96, sg_samp=1,
            sg_points=100, sgmem_max=0.4, plotiter=False, test=False,
            NS=False):
    """Run ``ITER`` passes of fitted Q-iteration, then refit the policy.

    Steps, in order:

    1. Build a state grid from ``X1`` (``buildgrid`` or, when ``tilesg``
       is true, ``TilecodeSamplegrid``).
    2. On the first call only (``self.first``), fit ``self.W_f`` and
       ``self.V_f`` to zeros on that grid.
    3. Evaluate the action bounds ``A_low`` / ``A_high`` at every grid row.
    4. Form ``Q = u + self.beta * V(X1)``, fit ``self.Q_f`` on ``XA`` and
       call ``self.maximise``; repeat for passes ``1 .. ITER-1`` using
       ``self.V_f.fast_values()`` and ``self.Q_f.partial_fit``.
    5. Refit ``self.W_f`` on the ``(state, W_opt)`` pair returned by the
       last ``maximise`` call and store the relative change of its
       predictions in ``self.pe``.
    6. Optionally plot ``self.W_f`` and ``self.V_f``.

    Parameters
    ----------
    XA, X1 : arrays; ``XA.shape[0]`` and ``X1.shape`` are read. ``XA`` is
        the input to ``Q_f.fit`` and ``X1`` the input to ``V_f.predict``.
    u : added to the discounted value to form the Q target.
    A_low, A_high : callables giving the lower / upper action bound for a
        grid row; they also receive ``W_f``'s prediction when ``Ascaled``.
    ITER : number of passes. Must be >= 1 because ``value_error[0]`` is
        always written.
    Ascaled : if true, bounds are called as ``A(row, Ws)`` and the policy
        range given to ``Q_f.fit`` is fixed to ``(0, 1)``.
    plot, xargs : when ``plot`` is true, ``xargs`` is passed to the
        ``plot`` methods of ``W_f`` and ``V_f``.
    output, plotiter, NS : forwarded to ``self.maximise`` (``NS`` also to
        the ``W_f`` / ``V_f`` fits).
    a, b, pc_samp, eta : forwarded to ``self.Q_f.fit``.
    maxT : cap used in the ``scale`` argument ``1 / min(T, maxT)``.
    tilesg, sg_prop, sg_samp, sg_points, sgmem_max : state-grid options.
    test : accepted but not used by this method.

    Side effects: sets ``self.v_e``, ``self.p_e``, ``self.value_error``,
    ``self.pe`` and possibly ``self.first``; prints timing information.
    """
    solve_start = time()

    self.v_e = 0  # Value function error
    self.p_e = 0  # Policy function error

    T = XA.shape[0]
    self.value_error = np.zeros(ITER)

    # ------------------
    #   State grid
    # ------------------
    if not tilesg:
        grid, _ = buildgrid(X1, sg_points, self.radius, scale=True,
                            stopnum=X1.shape[0])
    else:
        nn = int(X1.shape[0] * sg_samp)
        # Dedicated timer: reusing the overall timer here would make the
        # final "Solve time" start from this point instead of the call.
        grid_start = time()
        tile = TilecodeSamplegrid(X1.shape[1], 25, mem_max=sgmem_max,
                                  cores=self.CORES)
        grid = tile.fit(X1[0:nn], self.radius, prop=sg_prop)
        grid_end = time()
        print('State grid points: ' + str(grid.shape[0]) +
              ', of maximum: ' + str(tile.max_points) +
              ', Time taken: ' + str(grid_end - grid_start))
        del tile

    points = grid.shape[0]

    ticfit = time()

    if self.first:
        self.W_f.fit(grid, np.zeros(points), NS=NS)
        self.V_f.fit(grid, np.zeros(points), NS=NS)
        self.first = False

    # ------------------
    #   Action bounds on the grid
    # ------------------
    Al = np.zeros(points)
    Ah = np.zeros(points)
    if Ascaled:
        for i in range(points):
            Ws = self.W_f.predict(grid[i, :])
            Al[i] = A_low(grid[i, :], Ws)
            Ah[i] = A_high(grid[i, :], Ws)
        minpol = 0
        maxpol = 1
    else:
        for i in range(points):
            Al[i] = A_low(grid[i, :])
            Ah[i] = A_high(grid[i, :])
        minpol = np.min(Al)
        maxpol = np.max(Ah)

    tocfit = time()
    print('Constraint time: ' + str(tocfit - ticfit))

    # Stored inputs are only worth keeping when later passes will reuse them.
    precompute = False if ITER == 1 else True

    # ------------------
    #   Q-learning
    # ------------------

    # First iteration

    # Q values
    ticfit = time()
    Q = u + self.beta * self.V_f.predict(X1, store_XS=precompute)
    tocfit = time()
    print('V prediction time: ' + str(tocfit - ticfit))

    # Fit Q function
    ticfit = time()
    self.Q_f.fit(XA, Q, pa=minpol, pb=maxpol, copy=True,
                 unsupervised=precompute, sgd=self.asgd, asgd=self.asgd,
                 eta=eta, n_iters=1,
                 # 1.0 forces true division so the scale is not floored
                 # to 0 under Python 2 integer division.
                 scale=1.0 / min(T, maxT),
                 storeindex=(self.asgd and precompute),
                 a=a, b=b, pc_samp=pc_samp)
    tocfit = time()
    print('Q Fitting time: ' + str(tocfit - ticfit))

    # Optimise Q function
    self.value_error[0], W_opt, state = self.maximise(
        grid, Al, Ah, Ascaled, output=output, plotiter=plotiter,
        xargs=xargs, NS=NS)

    for j in range(1, ITER):
        # Q values
        Q = u + self.beta * self.V_f.fast_values()

        # Fit Q function
        self.Q_f.partial_fit(Q, 0)

        # Optimise Q function
        self.value_error[j], W_opt, state = self.maximise(
            grid, Al, Ah, Ascaled, output=output, plotiter=plotiter,
            xargs=xargs, NS=NS)

    # ------------------
    #   Policy refit
    # ------------------
    ticfit = time()
    NN = min(X1.shape[0], 20000)
    W_opt_old = self.W_f.predict(X1[0:NN, :])
    self.W_f.fit(state, W_opt, sgd=0, eta=0.1, n_iters=5, scale=0, NS=NS)
    W_opt_new = self.W_f.predict(X1[0:NN, :])
    self.pe = np.mean((W_opt_old - W_opt_new)) / np.mean(W_opt_old)
    toc = time()
    tocfit = time()
    print('Policy time: ' + str(tocfit - ticfit))

    print('Solve time: ' + str(toc - solve_start) +
          ', Policy change: ' + str(self.pe))

    if plot:
        # Give each plot call its own copy of xargs; a plain alias would
        # let one call's in-place changes reach the next call (and the
        # shared default list).
        self.W_f.plot(list(xargs), showdata=True)
        pylab.show()
        self.V_f.plot(list(xargs), showdata=True)
        pylab.show()
def iterate(self, XA, X1, u, A_low, A_high, ITER=50, Ascaled=False,
            plot=True, xargs=[], output=True, gridsamp=1):
    """Run fitted Q-iteration on a sampled state grid.

    A state grid is built with ``buildgrid`` from the first
    ``int(gridsamp * X1.shape[0])`` rows of ``X1``. On the first call
    (``self.first``) ``self.W_f`` and ``self.V_f`` are fitted to zeros on
    that grid. The action bounds are evaluated at every grid row, then
    one initial pass plus ``ITER`` further passes are run. Each pass
    forms ``Q = u + self.beta * V``, fits ``self.Q_f`` on ``(XA, Q)`` and
    calls ``self.maximise``.

    Parameters
    ----------
    XA : input passed to ``self.Q_f.fit``.
    X1 : array passed to ``self.V_f.predict``; also the source of the grid.
    u : added to the discounted value to form the Q target.
    A_low, A_high : callables giving the lower / upper action bound for a
        grid row; when ``Ascaled`` is true they also receive
        ``self.W_f_old.predict(row)``.
    ITER : number of passes after the initial one.
    Ascaled : selects the two-argument form of the bound callables and is
        forwarded to ``self.maximise``.
    plot, xargs : when ``plot`` is true, ``self.W_f.plot(xargs, ...)`` is
        shown at the end.
    output : forwarded to ``self.maximise``.
    gridsamp : fraction of the rows of ``X1`` used to build the grid.

    Side effects: sets ``self.v_e``, ``self.p_e`` and possibly
    ``self.first``; prints timing information.
    """
    # Overall timer. It has its own name so the grid and per-pass timers
    # below cannot overwrite it before "Solve time" is reported.
    solve_start = time()

    self.v_e = 0  # Value function error
    self.p_e = 0  # Policy function error

    # ------------------
    #   State grid
    # ------------------
    grid_start = time()
    N = int(gridsamp * X1.shape[0])
    grid, m = buildgrid(X1[0:N, :], self.maxgrid, self.radius, scale=True)
    points = grid.shape[0]
    grid_end = time()
    print('State grid points: ' + str(points) + ', of maximum: ' + str(m) +
          ', Time taken: ' + str(grid_end - grid_start))

    if self.first:
        self.W_f.fit(grid, np.zeros(points))
        self.V_f.fit(grid, np.zeros(points))
        self.first = False

    # ------------------
    #   Action bounds on the grid
    # ------------------
    Al = np.zeros(points)
    Ah = np.zeros(points)
    if Ascaled:
        for i in range(points):
            Ws = self.W_f_old.predict(grid[i, :])
            Al[i] = A_low(grid[i, :], Ws)
            Ah[i] = A_high(grid[i, :], Ws)
    else:
        for i in range(points):
            Al[i] = A_low(grid[i, :])
            Ah[i] = A_high(grid[i, :])

    # ------------------
    #   Q-learning
    # ------------------

    # First iteration

    # Q values
    Q = u + self.beta * self.V_f.predict(X1, store_XS=True)

    # Fit Q function
    self.Q_f.fit(XA, Q)

    # Optimise Q function
    self.maximise(grid, Al, Ah, Ascaled, output=output)

    for _ in range(ITER):
        # Q values
        Q = u + self.beta * self.V_f.fast_values()

        # Fit Q function
        fit_start = time()
        self.Q_f.fit(XA, Q)
        fit_end = time()
        print('Fit time: ' + str(fit_end - fit_start))

        # Optimise Q function
        self.maximise(grid, Al, Ah, Ascaled, output=output)

    solve_end = time()
    print('Solve time: ' + str(solve_end - solve_start))

    if plot:
        self.W_f.plot(xargs, showdata=True)
        pylab.show()
def iterate(self, XA, X1, u, A_low, A_high, ITER=50, plot=True, xargs=[],
            output=True, a=0, b=0, pc_samp=1, maxT=60000, eta=0.8,
            tilesg=False, sg_prop=0.96, sg_samp=1, sg_points=100,
            sgmem_max=0.4, plotiter=False, test=False, NS=False):
    """Run fitted Q-iteration for two classes, then refit both policies.

    ``XA``, ``X1`` and ``u`` are indexed by class (``XA[m]``, ``X1[m]``,
    ``u[m]``), as are ``self.W_f``, ``self.V_f`` and ``self.Q_f``. The Q
    target of class ``m`` is built from the value function of the *other*
    class: ``V_f[1]`` for ``m == 0`` and ``V_f[0]`` for ``m == 1``.

    Steps, in order:

    1. Build one state grid per class (``buildgrid`` or, when ``tilesg``
       is true, ``TilecodeSamplegrid``).
    2. On the first call only (``self.first``), fit every ``W_f[m]`` and
       ``V_f[m]`` to zeros on its grid.
    3. Evaluate ``A_low`` / ``A_high`` at every grid row of each class.
    4. First pass: predict values, fit ``Q_f[m]`` and call
       ``self.maximise(m, ...)``. Passes ``1 .. ITER-1`` use
       ``fast_values()`` and ``partial_fit``.
    5. Refit each ``W_f[m]`` on the last ``(state[m], W_opt[m])`` and
       store the mean relative prediction change in ``self.pe[m]``.
    6. Optionally plot ``W_f[m]`` and ``V_f[m]`` for both classes.

    Parameters
    ----------
    A_low, A_high : callables giving the lower / upper action bound for a
        grid row.
    ITER : number of passes, including the first.
    plot, xargs : when ``plot`` is true, ``xargs`` is passed to the
        ``plot`` methods.
    output, plotiter, NS : forwarded to ``self.maximise`` (``NS`` also to
        the ``W_f`` / ``V_f`` fits).
    a, b, pc_samp, eta : forwarded to ``Q_f[m].fit``.
    maxT : cap used in the ``scale`` argument ``1 / min(T[m], maxT)``.
    tilesg, sg_prop, sg_samp, sg_points, sgmem_max : state-grid options.
    test : if true, drop into ``pdb`` after each class is processed.

    Side effects: sets ``self.v_e``, ``self.p_e``, ``self.value_error``,
    ``self.pe`` and possibly ``self.first``; prints timing information.
    """
    solve_start = time()

    M = self.M

    self.v_e = 0  # Value function error
    self.p_e = 0  # Policy function error

    T = [XA[m].shape[0] for m in M]
    # NOTE(review): this array is allocated but never written in this
    # method (the value returned by maximise is discarded) - confirm
    # whether the per-pass error should be stored here.
    self.value_error = np.zeros(ITER)

    # ------------------
    #   State grids
    # ------------------
    grid = [0, 0]
    if not tilesg:
        grid[0], _ = buildgrid(X1[0], sg_points, self.radius, scale=True,
                               stopnum=X1[0].shape[0])
        grid[1], _ = buildgrid(X1[1], sg_points, self.radius, scale=True,
                               stopnum=X1[1].shape[0])
    else:
        for m in range(2):
            nn = int(X1[m].shape[0] * sg_samp)
            # Dedicated timer so the overall solve timer is not reset.
            grid_start = time()
            tile = TilecodeSamplegrid(X1[m].shape[1], 25,
                                      mem_max=sgmem_max, cores=self.CORES)
            grid[m] = tile.fit(X1[m][0:nn], self.radius, prop=sg_prop)
            grid_end = time()
            print('State grid points: ' + str(grid[m].shape[0]) +
                  ', of maximum: ' + str(tile.max_points) +
                  ', Time taken: ' + str(grid_end - grid_start))
            del tile

    points = [grid[m].shape[0] for m in M]

    ticfit = time()

    if self.first:
        for m in M:
            self.W_f[m].fit(grid[m], np.zeros(points[m]), NS=NS)
        for m in M:
            self.V_f[m].fit(grid[m], np.zeros(points[m]), NS=NS)
        self.first = False

    # ------------------
    #   Action bounds on the grids
    # ------------------
    Al = [np.zeros(points[m]) for m in M]
    Ah = [np.zeros(points[m]) for m in M]
    for m in range(2):
        for i in range(points[m]):
            Al[m][i] = A_low(grid[m][i, :])
            Ah[m][i] = A_high(grid[m][i, :])

    minpol = [np.min(Al[m]) for m in M]
    maxpol = [np.max(Ah[m]) for m in M]

    tocfit = time()
    print('Constraint time: ' + str(tocfit - ticfit))

    # Stored inputs are only worth keeping when later passes will reuse them.
    precompute = False if ITER == 1 else True

    W_opt = [0, 0]
    state = [0, 0]
    Q = [0, 0]

    # ------------------
    #   Q-learning
    # ------------------

    # First iteration
    for m in range(2):
        m1 = 1 - m  # index of the other class

        # Q values (timer started per class so the second class's figure
        # does not include the first class's fit and maximise time)
        ticfit = time()
        Q[m] = u[m] + self.beta * self.V_f[m1].predict(
            X1[m], store_XS=precompute)
        tocfit = time()
        print('V prediction time: ' + str(tocfit - ticfit))

        # Fit Q function
        ticfit = time()
        self.Q_f[m].fit(XA[m], Q[m], pa=minpol[m], pb=maxpol[m], copy=True,
                        unsupervised=precompute, sgd=self.asgd,
                        asgd=self.asgd, eta=eta, n_iters=1,
                        # 1.0 forces true division so the scale is not
                        # floored to 0 under Python 2 integer division.
                        scale=1.0 / min(T[m], maxT),
                        storeindex=(self.asgd and precompute),
                        a=a, b=b, pc_samp=pc_samp)
        tocfit = time()
        print('Q Fitting time: ' + str(tocfit - ticfit))

        # Optimise Q function
        value_error, W_opt[m], state[m] = self.maximise(
            m, grid[m], Al[m], Ah[m], output=output, plotiter=plotiter,
            xargs=xargs, NS=NS)

        if test:
            import pdb
            pdb.set_trace()

    for j in range(1, ITER):
        for m in range(2):
            m1 = 1 - m  # index of the other class

            # Q values
            Q[m] = u[m] + self.beta * self.V_f[m1].fast_values()

            # Fit Q function
            self.Q_f[m].partial_fit(Q[m], 0)

            # Optimise Q function
            value_error, W_opt[m], state[m] = self.maximise(
                m, grid[m], Al[m], Ah[m], output=output,
                plotiter=plotiter, xargs=xargs, NS=NS)

            if test:
                import pdb
                pdb.set_trace()

    # ------------------
    #   Policy refit
    # ------------------
    self.pe = [0, 0]
    # Started before the loop so "Policy time" covers both classes.
    ticfit = time()
    for m in range(2):
        NN = min(X1[m].shape[0], 20000)
        W_opt_old = self.W_f[m].predict(X1[m][0:NN, :])
        self.W_f[m].fit(state[m], W_opt[m], sgd=0, eta=0.1, n_iters=5,
                        scale=0, NS=NS)
        W_opt_new = self.W_f[m].predict(X1[m][0:NN, :])
        self.pe[m] = np.mean((W_opt_old - W_opt_new) / W_opt_old)
    toc = time()
    tocfit = time()
    print('Policy time: ' + str(tocfit - ticfit))

    print('Solve time: ' + str(toc - solve_start) +
          ', Policy change: ' + str(self.pe))

    if plot:
        for m in range(2):
            # Give each plot call its own copy of xargs; a plain alias
            # would let one call's in-place changes reach the next call
            # (and the shared default list).
            self.W_f[m].plot(list(xargs), showdata=True)
            pylab.show()
            self.V_f[m].plot(list(xargs), showdata=True)
            pylab.show()
def iterate(self, XA, X1, u, A_low, A_high, ITER=50, Ascaled=False,
            plot=True, xargs=[], output=True, a=0, b=0, pc_samp=1,
            maxT=60000, eta=0.8, tilesg=False, sg_prop=0.96, sg_samp=1,
            sg_points=100, sgmem_max=0.4, plotiter=False, test=False,
            NS=False):
    """Run ``ITER`` passes of fitted Q-iteration, then refit the policy.

    Steps, in order:

    1. Build a state grid from ``X1`` (``buildgrid`` or, when ``tilesg``
       is true, ``TilecodeSamplegrid``).
    2. On the first call only (``self.first``), fit ``self.W_f`` and
       ``self.V_f`` to zeros on that grid.
    3. Evaluate the action bounds ``A_low`` / ``A_high`` at every grid row.
    4. Form ``Q = u + self.beta * V(X1)``, fit ``self.Q_f`` on ``XA`` and
       call ``self.maximise``; repeat for passes ``1 .. ITER-1`` using
       ``self.V_f.fast_values()`` and ``self.Q_f.partial_fit``.
    5. Refit ``self.W_f`` on the ``(state, W_opt)`` pair returned by the
       last ``maximise`` call and store the relative change of its
       predictions in ``self.pe``.
    6. Optionally plot ``self.W_f`` and ``self.V_f``.

    Parameters
    ----------
    XA, X1 : arrays; ``XA.shape[0]`` and ``X1.shape`` are read. ``XA`` is
        the input to ``Q_f.fit`` and ``X1`` the input to ``V_f.predict``.
    u : added to the discounted value to form the Q target.
    A_low, A_high : callables giving the lower / upper action bound for a
        grid row; they also receive ``W_f``'s prediction when ``Ascaled``.
    ITER : number of passes. Must be >= 1 because ``value_error[0]`` is
        always written.
    Ascaled : if true, bounds are called as ``A(row, Ws)`` and the policy
        range given to ``Q_f.fit`` is fixed to ``(0, 1)``.
    plot, xargs : when ``plot`` is true, ``xargs`` is passed to the
        ``plot`` methods of ``W_f`` and ``V_f``.
    output, plotiter, NS : forwarded to ``self.maximise`` (``NS`` also to
        the ``W_f`` / ``V_f`` fits).
    a, b, pc_samp, eta : forwarded to ``self.Q_f.fit``.
    maxT : cap used in the ``scale`` argument ``1 / min(T, maxT)``.
    tilesg, sg_prop, sg_samp, sg_points, sgmem_max : state-grid options.
    test : accepted but not used by this method.

    Side effects: sets ``self.v_e``, ``self.p_e``, ``self.value_error``,
    ``self.pe`` and possibly ``self.first``; prints timing information.
    """
    solve_start = time()

    self.v_e = 0  # Value function error
    self.p_e = 0  # Policy function error

    T = XA.shape[0]
    self.value_error = np.zeros(ITER)

    # ------------------
    #   State grid
    # ------------------
    if not tilesg:
        grid, _ = buildgrid(X1, sg_points, self.radius, scale=True,
                            stopnum=X1.shape[0])
    else:
        nn = int(X1.shape[0] * sg_samp)
        # Dedicated timer: reusing the overall timer here would make the
        # final "Solve time" start from this point instead of the call.
        grid_start = time()
        tile = TilecodeSamplegrid(X1.shape[1], 25, mem_max=sgmem_max,
                                  cores=self.CORES)
        grid = tile.fit(X1[0:nn], self.radius, prop=sg_prop)
        grid_end = time()
        print('State grid points: ' + str(grid.shape[0]) +
              ', of maximum: ' + str(tile.max_points) +
              ', Time taken: ' + str(grid_end - grid_start))
        del tile

    points = grid.shape[0]

    ticfit = time()

    if self.first:
        self.W_f.fit(grid, np.zeros(points), NS=NS)
        self.V_f.fit(grid, np.zeros(points), NS=NS)
        self.first = False

    # ------------------
    #   Action bounds on the grid
    # ------------------
    Al = np.zeros(points)
    Ah = np.zeros(points)
    if Ascaled:
        for i in range(points):
            Ws = self.W_f.predict(grid[i, :])
            Al[i] = A_low(grid[i, :], Ws)
            Ah[i] = A_high(grid[i, :], Ws)
        minpol = 0
        maxpol = 1
    else:
        for i in range(points):
            Al[i] = A_low(grid[i, :])
            Ah[i] = A_high(grid[i, :])
        minpol = np.min(Al)
        maxpol = np.max(Ah)

    tocfit = time()
    print('Constraint time: ' + str(tocfit - ticfit))

    # Stored inputs are only worth keeping when later passes will reuse them.
    precompute = False if ITER == 1 else True

    # ------------------
    #   Q-learning
    # ------------------

    # First iteration

    # Q values
    ticfit = time()
    Q = u + self.beta * self.V_f.predict(X1, store_XS=precompute)
    tocfit = time()
    print('V prediction time: ' + str(tocfit - ticfit))

    # Fit Q function
    ticfit = time()
    self.Q_f.fit(XA, Q, pa=minpol, pb=maxpol, copy=True,
                 unsupervised=precompute, sgd=self.asgd, asgd=self.asgd,
                 eta=eta, n_iters=1,
                 # 1.0 forces true division so the scale is not floored
                 # to 0 under Python 2 integer division.
                 scale=1.0 / min(T, maxT),
                 storeindex=(self.asgd and precompute),
                 a=a, b=b, pc_samp=pc_samp)
    tocfit = time()
    print('Q Fitting time: ' + str(tocfit - ticfit))

    # Optimise Q function
    self.value_error[0], W_opt, state = self.maximise(
        grid, Al, Ah, Ascaled, output=output, plotiter=plotiter,
        xargs=xargs, NS=NS)

    for j in range(1, ITER):
        # Q values
        Q = u + self.beta * self.V_f.fast_values()

        # Fit Q function
        self.Q_f.partial_fit(Q, 0)

        # Optimise Q function
        self.value_error[j], W_opt, state = self.maximise(
            grid, Al, Ah, Ascaled, output=output, plotiter=plotiter,
            xargs=xargs, NS=NS)

    # ------------------
    #   Policy refit
    # ------------------
    ticfit = time()
    NN = min(X1.shape[0], 20000)
    W_opt_old = self.W_f.predict(X1[0:NN, :])
    self.W_f.fit(state, W_opt, sgd=0, eta=0.1, n_iters=5, scale=0, NS=NS)
    W_opt_new = self.W_f.predict(X1[0:NN, :])
    self.pe = np.mean((W_opt_old - W_opt_new)) / np.mean(W_opt_old)
    toc = time()
    tocfit = time()
    print('Policy time: ' + str(tocfit - ticfit))

    print('Solve time: ' + str(toc - solve_start) +
          ', Policy change: ' + str(self.pe))

    if plot:
        # Give each plot call its own copy of xargs; a plain alias would
        # let one call's in-place changes reach the next call (and the
        # shared default list).
        self.W_f.plot(list(xargs), showdata=True)
        pylab.show()
        self.V_f.plot(list(xargs), showdata=True)
        pylab.show()
def iterate(self, XA, X1, u, A_low, A_high, ITER=50, Ascaled=False,
            plot=True, xargs=[], output=True, gridsamp=1):
    """Run fitted Q-iteration on a sampled state grid.

    A state grid is built with ``buildgrid`` from the first
    ``int(gridsamp * X1.shape[0])`` rows of ``X1``. On the first call
    (``self.first``) ``self.W_f`` and ``self.V_f`` are fitted to zeros on
    that grid. The action bounds are evaluated at every grid row, then
    one initial pass plus ``ITER`` further passes are run. Each pass
    forms ``Q = u + self.beta * V``, fits ``self.Q_f`` on ``(XA, Q)`` and
    calls ``self.maximise``.

    Parameters
    ----------
    XA : input passed to ``self.Q_f.fit``.
    X1 : array passed to ``self.V_f.predict``; also the source of the grid.
    u : added to the discounted value to form the Q target.
    A_low, A_high : callables giving the lower / upper action bound for a
        grid row; when ``Ascaled`` is true they also receive
        ``self.W_f_old.predict(row)``.
    ITER : number of passes after the initial one.
    Ascaled : selects the two-argument form of the bound callables and is
        forwarded to ``self.maximise``.
    plot, xargs : when ``plot`` is true, ``self.W_f.plot(xargs, ...)`` is
        shown at the end.
    output : forwarded to ``self.maximise``.
    gridsamp : fraction of the rows of ``X1`` used to build the grid.

    Side effects: sets ``self.v_e``, ``self.p_e`` and possibly
    ``self.first``; prints timing information.
    """
    # Overall timer. It has its own name so the grid and per-pass timers
    # below cannot overwrite it before "Solve time" is reported.
    solve_start = time()

    self.v_e = 0  # Value function error
    self.p_e = 0  # Policy function error

    # ------------------
    #   State grid
    # ------------------
    grid_start = time()
    N = int(gridsamp * X1.shape[0])
    grid, m = buildgrid(X1[0:N, :], self.maxgrid, self.radius, scale=True)
    points = grid.shape[0]
    grid_end = time()
    print('State grid points: ' + str(points) + ', of maximum: ' + str(m) +
          ', Time taken: ' + str(grid_end - grid_start))

    if self.first:
        self.W_f.fit(grid, np.zeros(points))
        self.V_f.fit(grid, np.zeros(points))
        self.first = False

    # ------------------
    #   Action bounds on the grid
    # ------------------
    Al = np.zeros(points)
    Ah = np.zeros(points)
    if Ascaled:
        for i in range(points):
            Ws = self.W_f_old.predict(grid[i, :])
            Al[i] = A_low(grid[i, :], Ws)
            Ah[i] = A_high(grid[i, :], Ws)
    else:
        for i in range(points):
            Al[i] = A_low(grid[i, :])
            Ah[i] = A_high(grid[i, :])

    # ------------------
    #   Q-learning
    # ------------------

    # First iteration

    # Q values
    Q = u + self.beta * self.V_f.predict(X1, store_XS=True)

    # Fit Q function
    self.Q_f.fit(XA, Q)

    # Optimise Q function
    self.maximise(grid, Al, Ah, Ascaled, output=output)

    for _ in range(ITER):
        # Q values
        Q = u + self.beta * self.V_f.fast_values()

        # Fit Q function
        fit_start = time()
        self.Q_f.fit(XA, Q)
        fit_end = time()
        print('Fit time: ' + str(fit_end - fit_start))

        # Optimise Q function
        self.maximise(grid, Al, Ah, Ascaled, output=output)

    solve_end = time()
    print('Solve time: ' + str(solve_end - solve_start))

    if plot:
        self.W_f.plot(xargs, showdata=True)
        pylab.show()
def iterate(self, XA, X1, u, A_low, A_high, ITER=50, plot=True, xargs=[],
            output=True, a=0, b=0, pc_samp=1, maxT=60000, eta=0.8,
            tilesg=False, sg_prop=0.96, sg_samp=1, sg_points=100,
            sgmem_max=0.4, plotiter=False, test=False, NS=False):
    """Run fitted Q-iteration for two classes, then refit both policies.

    ``XA``, ``X1`` and ``u`` are indexed by class (``XA[m]``, ``X1[m]``,
    ``u[m]``), as are ``self.W_f``, ``self.V_f`` and ``self.Q_f``. The Q
    target of class ``m`` is built from the value function of the *other*
    class: ``V_f[1]`` for ``m == 0`` and ``V_f[0]`` for ``m == 1``.

    Steps, in order:

    1. Build one state grid per class (``buildgrid`` or, when ``tilesg``
       is true, ``TilecodeSamplegrid``).
    2. On the first call only (``self.first``), fit every ``W_f[m]`` and
       ``V_f[m]`` to zeros on its grid.
    3. Evaluate ``A_low`` / ``A_high`` at every grid row of each class.
    4. First pass: predict values, fit ``Q_f[m]`` and call
       ``self.maximise(m, ...)``. Passes ``1 .. ITER-1`` use
       ``fast_values()`` and ``partial_fit``.
    5. Refit each ``W_f[m]`` on the last ``(state[m], W_opt[m])`` and
       store the mean relative prediction change in ``self.pe[m]``.
    6. Optionally plot ``W_f[m]`` and ``V_f[m]`` for both classes.

    Parameters
    ----------
    A_low, A_high : callables giving the lower / upper action bound for a
        grid row.
    ITER : number of passes, including the first.
    plot, xargs : when ``plot`` is true, ``xargs`` is passed to the
        ``plot`` methods.
    output, plotiter, NS : forwarded to ``self.maximise`` (``NS`` also to
        the ``W_f`` / ``V_f`` fits).
    a, b, pc_samp, eta : forwarded to ``Q_f[m].fit``.
    maxT : cap used in the ``scale`` argument ``1 / min(T[m], maxT)``.
    tilesg, sg_prop, sg_samp, sg_points, sgmem_max : state-grid options.
    test : if true, drop into ``pdb`` after each class is processed.

    Side effects: sets ``self.v_e``, ``self.p_e``, ``self.value_error``,
    ``self.pe`` and possibly ``self.first``; prints timing information.
    """
    solve_start = time()

    M = self.M

    self.v_e = 0  # Value function error
    self.p_e = 0  # Policy function error

    T = [XA[m].shape[0] for m in M]
    # NOTE(review): this array is allocated but never written in this
    # method (the value returned by maximise is discarded) - confirm
    # whether the per-pass error should be stored here.
    self.value_error = np.zeros(ITER)

    # ------------------
    #   State grids
    # ------------------
    grid = [0, 0]
    if not tilesg:
        grid[0], _ = buildgrid(X1[0], sg_points, self.radius, scale=True,
                               stopnum=X1[0].shape[0])
        grid[1], _ = buildgrid(X1[1], sg_points, self.radius, scale=True,
                               stopnum=X1[1].shape[0])
    else:
        for m in range(2):
            nn = int(X1[m].shape[0] * sg_samp)
            # Dedicated timer so the overall solve timer is not reset.
            grid_start = time()
            tile = TilecodeSamplegrid(X1[m].shape[1], 25,
                                      mem_max=sgmem_max, cores=self.CORES)
            grid[m] = tile.fit(X1[m][0:nn], self.radius, prop=sg_prop)
            grid_end = time()
            print('State grid points: ' + str(grid[m].shape[0]) +
                  ', of maximum: ' + str(tile.max_points) +
                  ', Time taken: ' + str(grid_end - grid_start))
            del tile

    points = [grid[m].shape[0] for m in M]

    ticfit = time()

    if self.first:
        for m in M:
            self.W_f[m].fit(grid[m], np.zeros(points[m]), NS=NS)
        for m in M:
            self.V_f[m].fit(grid[m], np.zeros(points[m]), NS=NS)
        self.first = False

    # ------------------
    #   Action bounds on the grids
    # ------------------
    Al = [np.zeros(points[m]) for m in M]
    Ah = [np.zeros(points[m]) for m in M]
    for m in range(2):
        for i in range(points[m]):
            Al[m][i] = A_low(grid[m][i, :])
            Ah[m][i] = A_high(grid[m][i, :])

    minpol = [np.min(Al[m]) for m in M]
    maxpol = [np.max(Ah[m]) for m in M]

    tocfit = time()
    print('Constraint time: ' + str(tocfit - ticfit))

    # Stored inputs are only worth keeping when later passes will reuse them.
    precompute = False if ITER == 1 else True

    W_opt = [0, 0]
    state = [0, 0]
    Q = [0, 0]

    # ------------------
    #   Q-learning
    # ------------------

    # First iteration
    for m in range(2):
        m1 = 1 - m  # index of the other class

        # Q values (timer started per class so the second class's figure
        # does not include the first class's fit and maximise time)
        ticfit = time()
        Q[m] = u[m] + self.beta * self.V_f[m1].predict(
            X1[m], store_XS=precompute)
        tocfit = time()
        print('V prediction time: ' + str(tocfit - ticfit))

        # Fit Q function
        ticfit = time()
        self.Q_f[m].fit(XA[m], Q[m], pa=minpol[m], pb=maxpol[m], copy=True,
                        unsupervised=precompute, sgd=self.asgd,
                        asgd=self.asgd, eta=eta, n_iters=1,
                        # 1.0 forces true division so the scale is not
                        # floored to 0 under Python 2 integer division.
                        scale=1.0 / min(T[m], maxT),
                        storeindex=(self.asgd and precompute),
                        a=a, b=b, pc_samp=pc_samp)
        tocfit = time()
        print('Q Fitting time: ' + str(tocfit - ticfit))

        # Optimise Q function
        value_error, W_opt[m], state[m] = self.maximise(
            m, grid[m], Al[m], Ah[m], output=output, plotiter=plotiter,
            xargs=xargs, NS=NS)

        if test:
            import pdb
            pdb.set_trace()

    for j in range(1, ITER):
        for m in range(2):
            m1 = 1 - m  # index of the other class

            # Q values
            Q[m] = u[m] + self.beta * self.V_f[m1].fast_values()

            # Fit Q function
            self.Q_f[m].partial_fit(Q[m], 0)

            # Optimise Q function
            value_error, W_opt[m], state[m] = self.maximise(
                m, grid[m], Al[m], Ah[m], output=output,
                plotiter=plotiter, xargs=xargs, NS=NS)

            if test:
                import pdb
                pdb.set_trace()

    # ------------------
    #   Policy refit
    # ------------------
    self.pe = [0, 0]
    # Started before the loop so "Policy time" covers both classes.
    ticfit = time()
    for m in range(2):
        NN = min(X1[m].shape[0], 20000)
        W_opt_old = self.W_f[m].predict(X1[m][0:NN, :])
        self.W_f[m].fit(state[m], W_opt[m], sgd=0, eta=0.1, n_iters=5,
                        scale=0, NS=NS)
        W_opt_new = self.W_f[m].predict(X1[m][0:NN, :])
        self.pe[m] = np.mean((W_opt_old - W_opt_new) / W_opt_old)
    toc = time()
    tocfit = time()
    print('Policy time: ' + str(tocfit - ticfit))

    print('Solve time: ' + str(toc - solve_start) +
          ', Policy change: ' + str(self.pe))

    if plot:
        for m in range(2):
            # Give each plot call its own copy of xargs; a plain alias
            # would let one call's in-place changes reach the next call
            # (and the shared default list).
            self.W_f[m].plot(list(xargs), showdata=True)
            pylab.show()
            self.V_f[m].plot(list(xargs), showdata=True)
            pylab.show()