import random

import numpy as np

# Env and STOCK come from the project's environment module (import not shown here).


def run_stock():
    n_epoch = 100
    mean = []
    for x in range(n_epoch):
        env = Env(STOCK.Baidu)
        env.set_count(999)  # from 1000 to 2000
        for i in range(1000):
            action = random.random() * 2 - 1  # uniform random action in [-1, 1]
            env.step(action)
        mean.append(env.asset - 10000)  # profit relative to the initial 10000 cash
        # while True:
        #     action = random.random() * 2 - 1  # [-1, 1]
        #     observation_, reward, done = env.step(action)
        #     if done:
        #         mean.append(env.asset - 10000)
        #         break
    print(np.mean(mean), np.var(mean))

    # end of game
    print('game over')
from stock_env import StockEnv

env = StockEnv()

if __name__ == '__main__':
    env.render()
    # print(env.step(1))
    s, r, done = env.step(1)
    print(s)
    print(s.shape)
    print(r)
    print("=====================")
    s, r, done = env.step(0)
    print(s)
    print(r)
    print("====================")
    s, r, done = env.step(2)
    print(s)
    print(r)
import numpy as np

# StockEnv, ACNet and the shared training globals (SESS, COORD, GAMMA, MAX_GLOBAL_EP,
# MAX_EP_STEP, UPDATE_GLOBAL_ITER, GLOBAL_RUNNING_R, GLOBAL_EP) are defined elsewhere
# in the project.


class Worker(object):
    def __init__(self, name, globalAC):
        self.env = StockEnv()
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if ep_t == MAX_EP_STEP - 1:
                    done = True
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update the global net and assign the new weights to the local net
                    if done:
                        v_s_ = 0  # terminal state has zero value
                    else:
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = (
                        np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target))
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    test = self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        '| Var:', test,
                    )
                    GLOBAL_EP += 1
                    break
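# For context: A3C workers like the class above are normally run in parallel threads
# under a tf.train.Coordinator. The launcher below is a minimal sketch, not taken from
# the project: the ACNet('Global_Net', None) call is a placeholder for however the
# project actually constructs its shared global network, and N_WORKERS is simply the
# CPU count here.
import multiprocessing
import threading

import tensorflow as tf

if __name__ == '__main__':
    SESS = tf.Session()
    N_WORKERS = multiprocessing.cpu_count()

    with tf.device("/cpu:0"):
        GLOBAL_AC = ACNet('Global_Net', None)  # placeholder: shared global network
        workers = [Worker('W_%i' % i, GLOBAL_AC) for i in range(N_WORKERS)]

    COORD = tf.train.Coordinator()
    SESS.run(tf.global_variables_initializer())

    worker_threads = []
    for worker in workers:
        t = threading.Thread(target=worker.work)
        t.start()
        worker_threads.append(t)
    COORD.join(worker_threads)  # block until every worker thread has stopped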
import matplotlib.pyplot as plt

# env, q_agent, NUM_EPISODES, MAX_T and TERMINAL_VALUE are defined elsewhere in the project.

agent = q_agent(len(env.sectors))
fig = plt.figure()
ax = fig.add_subplot(111)
fig.suptitle('Hard Coded Agent')

for episode in range(NUM_EPISODES):
    # reset the environment and initialize the portfolio value
    agent.reset()
    p0 = env.reset()
    agent.update_value(p0)

    for t in range(MAX_T):
        # select the next action
        action = agent.select_action(p0)
        # execute the next action and get next state and reward
        p = env.step()
        for i, a in enumerate(action):
            agent.act(i, a, p[i])
        agent.update_value(p)
        # render the portfolio value graph
        env.render(ax, agent.value)
        # prepare for next iteration
        p0 = p
        if agent.value[-1] >= TERMINAL_VALUE:
            print(
                "Episode %d finished after %f time steps with total value = %f"
                % (episode, t, agent.value[-1]))
            break
# Body of the per-episode DQN training loop. env, predDQN, replay_buffer, BUFFER_SIZE,
# the exploration rate e, num_episods, reward_history and duration_history are set up
# outside this fragment.

# e = 0.2 / (episode / 5000 + 1) * 0.5 * (1 + np.cos(2 * np.pi * episode / 5000))
if episode > 0.9 * num_episods:
    e = 0.0  # switch off exploration for the final 10% of episodes

state = env.reset()
reward_sum = 0.0
for step in range(5000):
    # if episode % 100 == 0:
    #     env.render()

    # epsilon-greedy action selection
    if np.random.rand(1) < e:
        a = env.random_action()
    else:
        Qs = predDQN(state)
        _, i = torch.max(Qs.data, 0)
        a = i.item()  # greedy action index

    new_state, reward, done, info = env.step(a)
    replay_buffer.append(Replay(state, a, new_state, reward, done))
    if len(replay_buffer) > BUFFER_SIZE:
        replay_buffer.popleft()

    state = new_state
    reward_sum += reward
    if done:
        # print(f"Episode: {episode}, Step: {step}, Reward: {reward_sum}")
        reward_history.append(reward_sum)
        duration_history.append(info.duration)
        break

if episode % 20 == 0:
    print(f"Episode: {episode}, Return: {reward_history[-1]:.5}, "
          f"Duration: {duration_history[-1]}, e: {e}")
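# The loop above only fills the replay buffer; the gradient step on predDQN is not
# shown in this fragment. Below is a minimal sketch of a standard DQN minibatch update
# from such a buffer, assuming predDQN is a torch.nn.Module that maps a batch of
# states to per-action Q-values and that Replay entries unpack as
# (state, action, new_state, reward, done). The optimizer, batch_size and gamma names
# are placeholders, not taken from the source.
import random

import torch
import torch.nn.functional as F


def dqn_update(predDQN, optimizer, replay_buffer, batch_size=64, gamma=0.99):
    if len(replay_buffer) < batch_size:
        return
    batch = random.sample(list(replay_buffer), batch_size)
    states, actions, next_states, rewards, dones = zip(*batch)

    states = torch.stack([torch.as_tensor(s, dtype=torch.float32) for s in states])
    next_states = torch.stack([torch.as_tensor(s, dtype=torch.float32) for s in next_states])
    actions = torch.tensor(actions, dtype=torch.int64)
    rewards = torch.tensor(rewards, dtype=torch.float32)
    dones = torch.tensor([float(d) for d in dones])

    # Q(s, a) for the actions that were actually taken
    q_sa = predDQN(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    # one-step TD target, bootstrapping from the greedy next-state value
    with torch.no_grad():
        q_next = predDQN(next_states).max(dim=1).values
        target = rewards + gamma * (1.0 - dones) * q_next

    loss = F.mse_loss(q_sa, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()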
import torch

# df, env and ac (the trained actor-critic container) are set up earlier in the script.

stock_df = df[df.Name == env.test_stock_name]

policy_mlp = ac.policy_mlp
policy_mlp.load_state_dict(torch.load("policy_mlp.pth"))
value_mlp = ac.value_mlp
value_mlp.load_state_dict(torch.load("value_mlp.pth"))

# roll out the trained policy
obss = []
actions = []
rewards = []
obs = env.reset()
while True:
    obss.append(obs)
    action, _ = policy_mlp(torch.as_tensor(obs, dtype=torch.float32))
    obs, reward, done, _ = env.step(action.detach().numpy())
    actions.append(action)
    rewards.append(reward)
    if done:
        break

# roll out a passive baseline that takes action 1 at every step
obss_passive = []
actions_passive = []
rewards_passive = []
obs = env.reset()
while True:
    obss_passive.append(obs)
    obs, reward, done, _ = env.step(1)
    actions_passive.append(1)
    rewards_passive.append(reward)
    if done:
        break
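# The two rollouts above can be compared by plotting the cumulative reward of the
# trained policy against the passive baseline. The sketch below is one way to do that
# with matplotlib and the lists built above; it is an illustration, not part of the
# original evaluation script.
import matplotlib.pyplot as plt
import numpy as np

cum_policy = np.cumsum(rewards)
cum_passive = np.cumsum(rewards_passive)

plt.figure(figsize=(8, 4))
plt.plot(cum_policy, label="trained policy")
plt.plot(cum_passive, label="passive baseline (action 1)")
plt.xlabel("step")
plt.ylabel("cumulative reward")
plt.legend()
plt.tight_layout()
plt.show()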
import time

import numpy as np

# StockEnv, ACNet, logger and the training globals (SESS, COORD, GAMMA, MAX_GLOBAL_EP,
# UPDATE_GLOBAL_ITER, GLOBAL_RUNNING_R, GLOBAL_EP) are defined elsewhere in the project.


class Worker(object):
    def __init__(self, name, globalAC):
        self.env = StockEnv()
        self.name = name
        self.AC = ACNet(name, self.env.get_state().shape[0], 4, globalAC)

    def _update_global_reward(self, ep_r):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
            GLOBAL_RUNNING_R.append(ep_r)
        else:
            GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
        logger.debug(
            [self.name, "Ep:", GLOBAL_EP, "| Ep_r: %i" % GLOBAL_RUNNING_R[-1]]
        )
        GLOBAL_EP += 1

    def _update_global_acnet(self, done, s_, buffer_s, buffer_a, buffer_r):
        if done:
            v_s_ = 0  # terminal state has zero value
        else:
            v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
        buffer_v_target = []
        for r in buffer_r[::-1]:  # reverse buffer r
            v_s_ = r + GAMMA * v_s_
            buffer_v_target.append(v_s_)
        buffer_v_target.reverse()

        buffer_s, buffer_a, buffer_v_target = (
            np.vstack(buffer_s), np.array(buffer_a), np.vstack(buffer_v_target))
        feed_dict = {
            self.AC.s: buffer_s,
            self.AC.a_his: buffer_a,
            self.AC.v_target: buffer_v_target,
        }
        self.AC.update_global(feed_dict)

    def work(self):
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        self.env.reset()
        if self.name == 'W_0':
            self.env.render()
        while not COORD.should_stop():
            ep_r = 0
            while True:
                s = self.env._get_state()
                a, p = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if done:
                    r = -0.5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    self._update_global_acnet(done, s_, buffer_s, buffer_a, buffer_r)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                # s = s_
                total_step += 1
                if done:
                    self._update_global_reward(ep_r)
                    break
                if self.name == 'W_0':
                    logger.debug(["s", s, " a:", a, " p:", p, " r:", r,
                                  " total_step:", total_step, 'total', self.env.total])
                    time.sleep(0.5)

    def train(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            while True:
                # if self.name == 'W_0':
                #     self.env.render()
                a, p = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if done:
                    r = -0.5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update the global net and assign the new weights to the local net
                    self._update_global_acnet(done, s_, buffer_s, buffer_a, buffer_r)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                if done:
                    self._update_global_reward(ep_r)
                    logger.debug(["s", s, " a:", a, " p:", p, " r:", r,
                                  " total_step:", total_step, 'total', self.env.total])
                    break

                s = s_
                total_step += 1
import warnings

import joblib

# StockHMM, StockEnv, StockEnvD, STOCK and STOCKD come from the project's own modules.

warnings.filterwarnings("ignore")  # ignore warnings

stockHMM = StockHMM(STOCK.Baidu)
# load the pre-trained model
stockHMM.model = joblib.load("BaiDuHMM.pkl")

print('Continuous: ')
env = StockEnv(STOCK.Baidu)  # [1000, 2000)
env.set_count(999)
for x in range(1000):
    p_states = stockHMM.predict(x + 1000)
    # hidden-state order: 2 3 4 1 0
    my_action = (p_states[2] + p_states[3] * 0.5
                 - p_states[1] * 0.5 - p_states[0])
    env.step(my_action)
print(env.asset - 10000)

print('Discrete: ')
env = StockEnvD(STOCKD.Baidu)  # [1000, 2000)
env.set_count(999)
for x in range(1000):
    p_states = stockHMM.predict(x + 1000)
    # hidden-state order: 2 3 4 1 0
    my_action = round(5 * (p_states[2] + p_states[3] * 0.5
                           - p_states[1] * 0.5 - p_states[0]) + 5)
    env.step(my_action)
print(env.asset - 10000)
import time

# StockEnv, A3CNet, logger, COORD and the constants MAX_GLOBAL_EP, MAX_TOTAL_STEP
# and UPDATE_GLOBAL_ITER are defined elsewhere in the project.


class Worker(object):
    GAMMA = 0.9
    GLOBAL_RUNNING_R = []
    GLOBAL_EP = 0

    def __init__(self, sess, name, N_S, N_A, globalAC):
        self.SESS = sess
        self.N_S = N_S
        self.N_A = N_A
        self.env = StockEnv()
        self.name = name
        self.AC = A3CNet(self.SESS, self.name, self.N_S, self.N_A, globalAC)
        # self.saver = tf.train.Saver()

    def _record_global_reward_and_print(self, global_running_rs, ep_r, global_ep, total_step):
        global_running_rs.append(ep_r)
        try:
            print(self.name, "Ep:", global_ep,
                  "| Ep_r: %i" % global_running_rs[-1],
                  "| total step:", total_step)
        except Exception as e:
            print(e)

    def train(self):
        buffer_s, buffer_a, buffer_r = [], [], []
        s = self.env.reset()
        ep_r = 0
        total_step = 1

        def reset():
            nonlocal ep_r, total_step
            self.env.reset()
            ep_r = 0
            total_step = 1

        while not COORD.should_stop() and self.GLOBAL_EP < MAX_GLOBAL_EP:
            # s = self.env.reset()
            # ep_r = 0
            # total_step = 1
            reset()
            while total_step < MAX_TOTAL_STEP:
                try:
                    s = self.env.get_state()
                    a, p = self.AC.choose_action(s)
                    s_, r, done = self.env.step(a)
                    if done:
                        r = -2
                    ep_r += r
                    buffer_s.append(s)
                    buffer_a.append(a)
                    buffer_r.append(r)

                    if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                        # update the global net and assign the new weights to the local net
                        self.AC.update(done, s_, buffer_r, buffer_s, buffer_a)
                        buffer_s, buffer_a, buffer_r = [], [], []

                    if done:
                        self._record_global_reward_and_print(
                            self.GLOBAL_RUNNING_R, ep_r, self.GLOBAL_EP, total_step)
                        self.GLOBAL_EP += 1
                        reset()

                    # s = s_
                    total_step += 1

                    if self.name == 'W_0':
                        self.env.render()
                        time.sleep(0.05)
                        logger.debug([
                            "s ", s, " v ", self.AC.get_v(s), " a ", a, " p ", p,
                            " ep_r ", ep_r, " total ", self.env.total,
                            " acct ", self.env.acct
                        ])
                except Exception as e:
                    print(e)
            try:
                print(self.name, " not done, maybe donkey!",
                      " total_step:", total_step)
            except Exception as e:
                print(e)