def run_dp_train(self, num_iters=0):
    """Fit the current network to a precomputed state-action cost table.

    Loads a pickled cost table from ``../Pendulum/dp_sa_cost.p``, expands it
    into ``[s1, s2, a, cost]`` sample rows, optionally runs ``num_iters``
    mini-batch iterations against the negated costs, and finally draws the
    network's Q heat map.

    The flattened table is decoded with the action index varying slowest,
    then the ``s2`` index, then the ``s1`` index (i.e. a C-ordered
    ``(len(a_grid), len(s2_grid), len(s1_grid))`` layout).
    NOTE(review): s1/s2/a look like pendulum angle, angular velocity and
    torque, and "dp" like dynamic programming -- confirm against the code
    that writes the pickle.

    Args:
        num_iters: Number of training-loop iterations. Defaults to 0, which
            skips training entirely (the previously hard-coded behaviour)
            and only renders the heat map.
    """
    # NOTE: pickle can execute arbitrary code on load; only use this with a
    # trusted, locally generated cost file.
    with open('../Pendulum/dp_sa_cost.p', 'rb') as f:
        sa_costs = pickle.load(f).flatten()

    a_grid = np.linspace(-2, 2, 9)
    s1_grid = np.linspace(0, 2*np.pi, 51)
    s2_grid = np.linspace(-10, 10, 51)

    # Decode each flat index into (a, s2, s1) grid indices. Floor division
    # (//) keeps the indices integral regardless of Python version or
    # `from __future__ import division`.
    states_per_action = len(s2_grid) * len(s1_grid)
    flat_idx = np.arange(len(sa_costs))
    a_idx = flat_idx // states_per_action
    state_idx = flat_idx % states_per_action
    s2_idx = state_idx // len(s1_grid)
    s1_idx = state_idx % len(s1_grid)

    # One row per table entry: [s1, s2, a, cost]. Indexing a grid with an
    # out-of-range index still raises IndexError if the table is larger
    # than the grids.
    sa_samples = np.column_stack(
        (s1_grid[s1_idx], s2_grid[s2_idx], a_grid[a_idx], sa_costs))
    n = len(sa_samples)

    eval_every = 100  # every eval_every-th iteration evaluates instead of training
    batch_size = 50   # mini-batch size for training iterations

    for i in range(num_iters):
        is_eval_iter = (i % eval_every == 0)
        # Evaluation iterations draw all n samples (a full permutation);
        # training iterations draw a mini-batch without replacement.
        sample_size = n if is_eval_iter else batch_size
        inds = np.random.choice(n, sample_size, replace=False)
        batch = sa_samples[inds]

        inputs = batch[:, 0:3]
        # Targets are the negated costs as a column vector.
        targets = -batch[:, 3:4]

        if is_eval_iter:
            print('iter %s' % (i,))
            mse = self.current_net.mse_q(inputs, targets)
            learn_rate = self.current_net.get_learn_rate()
            print('mse %s learn_rate %s' % (mse, learn_rate))
            self.mse_hist.append(mse)
            self.learn_rate_hist.append(learn_rate)
        else:
            self.current_net.train(inputs, targets)

    vis = NetVisualizer(self.current_net)
    vis.q_heat_map()