Example #1
import random

import numpy as np


# Env and STOCK come from the surrounding project (not shown in this snippet).
def run_stock():
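    # 100 episodes of 1000 random trades each; record each episode's profit relative to the 10000 starting asset.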
    n_epoch = 100
    mean = []
    for x in range(n_epoch):
        env = Env(STOCK.Baidu)
        env.set_count(999)  # from 1000 to 2000
        for i in range(1000):
            action = random.random() * 2 - 1  # [-1, 1]
            env.step(action)

        mean.append(env.asset - 10000)

        # while True:
        #     action = random.random() * 2 - 1  # [-1, 1]
        #     observation_, reward, done = env.step(action)
        #
        #     if done:
        #         mean.append(env.asset - 10000)
        #         break

    print(np.mean(mean), np.var(mean))
    # end of game
    print('game over')
Example #2
from stock_env import StockEnv


env = StockEnv()

if __name__ == '__main__':
    env.render()
    # print(env.step(1))
    s, r, done = env.step(1)  # inspect the transition returned for action 1
    print(s)
    print(s.shape)
    print(r)

    print("=====================")
    s, r, done = env.step(0)
    print(s)
    print(r)

    print("====================")
    s, r, done = env.step(2)
    print(s)
    print(r)

Example #3
File: A3C.py  Project: linbirg/RL
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = StockEnv()
        self.name = name
        self.AC = ACNet(name, globalAC)

    def work(self):
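        # Roll out episodes in the local env, pushing n-step targets to the global net every
        # UPDATE_GLOBAL_ITER steps (or at episode end) and pulling the updated weights back.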
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.render()
                a = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if ep_t == MAX_EP_STEP - 1: done = True
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    if done:
                        v_s_ = 0  # terminal
                    else:
                        v_s_ = SESS.run(self.AC.v,
                                        {self.AC.s: s_[np.newaxis, :]})[0, 0]
                    buffer_v_target = []
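                    # accumulate discounted n-step targets backwards from the bootstrap value v(s_)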
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(
                        buffer_s), np.vstack(buffer_a), np.vstack(
                            buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                    }
                    test = self.AC.update_global(feed_dict)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    if len(GLOBAL_RUNNING_R
                           ) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] +
                                                0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:",
                        GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                        '| Var:',
                        test,
                    )
                    GLOBAL_EP += 1
                    break
Example #4
agent = q_agent(len(env.sectors))
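# the agent manages one position per sector exposed by the environment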
fig = plt.figure()
ax = fig.add_subplot(111)
fig.suptitle('Hard Coded Agent')

for episode in range(NUM_EPISODES):
    # reset the environment and initialize the portfolio value
    agent.reset()
    p0 = env.reset()
    agent.update_value(p0)

    for t in range(MAX_T):
        # select the next action
        action = agent.select_action(p0)
        # advance the environment one step; this variant of step() takes no action and returns the new per-sector values
        p = env.step()

        for i, a in enumerate(action):
            agent.act(i, a, p[i])

        agent.update_value(p)

        # render the portfolio value graph
        env.render(ax, agent.value)

        # prepare for next iteration
        p0 = p

        if agent.value[-1] >= TERMINAL_VALUE:
            print(
                "Episode %d finished after %f time steps with total value = %f"
                % (episode, t, agent.value[-1]))  # assumed arguments for the truncated statement
Example #5
    # e = 0.2 / (episode / 5000 + 1) * 0.5 * (1 + np.cos(2 * np.pi * episode/5000))
    if episode > 0.9 * num_episods:
        e = 0.0
    state = env.reset()

    reward_sum = 0.0
    for step in range(5000):
        # if episode % 100 == 0:
        #     env.render()
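        # epsilon-greedy: explore with probability e, otherwise act greedily on the prediction network's Q-values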
        if np.random.rand(1) < e:
            a = env.random_action()
        else:
            Qs = predDQN(state)           # Q-value estimates for the current state
            _, i = torch.max(Qs.data, 0)  # greedy action index
            a = i.item()
        new_state, reward, done, info = env.step(a)

        replay_buffer.append(Replay(state, a, new_state, reward, done))
        if len(replay_buffer) > BUFFER_SIZE:
            replay_buffer.popleft()

        state = new_state
        reward_sum += reward
        if done:
            # print(f"Episode: {episode}, Step: {step}, Reward: {reward_sum}")
            reward_history.append(reward_sum)
            duration_history.append(info.duration)
            break

    if episode % 20 == 0:
        print(f"Episode: {episode}, Return: {reward_history[-1]:.5}, Duration: {duration_history[-1]}, e: {e}")
Example #6
    stock_df = df[df.Name == env.test_stock_name]

    policy_mlp = ac.policy_mlp
    policy_mlp.load_state_dict(torch.load("policy_mlp.pth"))
    value_mlp = ac.value_mlp
    value_mlp.load_state_dict(torch.load("value_mlp.pth"))
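    # Evaluate the loaded policy on the test stock: roll out one episode, recording observations, actions and rewards.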

    obss = []
    actions = []
    rewards = []

    obs = env.reset()
    while True:
        obss.append(obs)
        action, _ = policy_mlp(torch.as_tensor(obs, dtype=torch.float32))
        obs, reward, done, _ = env.step(action.detach().numpy())
        actions.append(action)
        rewards.append(reward)

        if done:
            break

    obss_passive = []
    actions_passive = []
    rewards_passive = []
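    # Passive baseline: run another episode always taking action 1, for comparison with the learned policy.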

    obs = env.reset()
    while True:
        obss_passive.append(obs)
        obs, reward, done, _ = env.step(1)
        actions_passive.append(1)  # the passive policy always takes action 1
        rewards_passive.append(reward)

        if done:
            break
Example #7
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = StockEnv()
        self.name = name
        self.AC = ACNet(name,self.env.get_state().shape[0], 4, globalAC)
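        # local actor-critic net: input size taken from the environment state vector, 4 discrete actions, synced with the shared global net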

    def _update_global_reward(self, ep_r):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
            GLOBAL_RUNNING_R.append(ep_r)
        else:
            GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
            logger.debug(
                [self.name,
                "Ep:",
                GLOBAL_EP,
                "| Ep_r: %i" % GLOBAL_RUNNING_R[-1]]
            )
            GLOBAL_EP += 1

    def _update_globa_acnet(self, done, s_, buffer_s, buffer_a, buffer_r):
        if done:
            v_s_ = 0  # terminal state: bootstrap value is zero
        else:
            v_s_ = SESS.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0]
        # compute the n-step value targets and push them to the global net
        buffer_v_target = []
        for r in buffer_r[::-1]:  # reverse buffer r
            v_s_ = r + GAMMA * v_s_
            buffer_v_target.append(v_s_)
        buffer_v_target.reverse()

        buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(
            buffer_a), np.vstack(buffer_v_target)
        feed_dict = {
            self.AC.s: buffer_s,
            self.AC.a_his: buffer_a,
            self.AC.v_target: buffer_v_target,
        }
        self.AC.update_global(feed_dict)

    def work(self):
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        self.env.reset()
        if self.name == 'W_0':
            self.env.render()
        while not COORD.should_stop():
            ep_r = 0
            while True:
                s = self.env._get_state()
                a, p = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if done: 
                    r = -0.5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    self._update_globa_acnet(done, s_, buffer_s, buffer_a,
                                             buffer_r)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                # s = s_
                total_step += 1
                if done:
                    self._update_global_reward(ep_r)
                    break
                
                if self.name == 'W_0':
                    logger.debug(["s", s, " a:", a, " p:", p, " r:", r, " total_step:", total_step, 'total', self.env.total])
                    time.sleep(0.5)

    def train(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = self.env.reset()
            ep_r = 0
            while True:
                # if self.name == 'W_0':
                    # self.env.render()
                a, p = self.AC.choose_action(s)
                s_, r, done = self.env.step(a)
                if done: r = -0.5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    self._update_globa_acnet(done, s_, buffer_s, buffer_a,
                                             buffer_r)
                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()

                if done:
                    self._update_global_reward(ep_r)
                    logger.debug(["s", s, " a:", a, " p:", p, " r:", r, " total_step:", total_step, 'total', self.env.total])
                    break

                s = s_
                total_step += 1
Example #8
    warnings.filterwarnings("ignore")  # ignore warnings

    stockHMM = StockHMM(STOCK.Baidu)
    # load model
    stockHMM.model = joblib.load("BaiDuHMM.pkl")

    print('Continuous: ')
    env = StockEnv(STOCK.Baidu)
    # [1000, 2000)
    env.set_count(999)
    for x in range(1000):
        p_states = stockHMM.predict(x + 1000)
        # order: 2 3 4 1 0
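        # weight the predicted hidden-state probabilities into a continuous position signal in [-1, 1]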
        my_action = p_states[
            2] + p_states[3] * 0.5 - p_states[1] * 0.5 - p_states[0]
        env.step(my_action)

    print(env.asset - 10000)

    print('Discrete: ')
    env = StockEnvD(STOCKD.Baidu)
    # [1000, 2000)
    env.set_count(999)
    for x in range(1000):
        p_states = stockHMM.predict(x + 1000)
        # order: 2 3 4 1 0
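        # map the same probability-weighted score onto the discrete action range [0, 10]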
        my_action = round(5 * (p_states[2] + p_states[3] * 0.5 -
                               p_states[1] * 0.5 - p_states[0]) + 5)
        env.step(my_action)

    print(env.asset - 10000)
Example #9
class Worker(object):
    GAMMA = 0.9
    GLOBAL_RUNNING_R = []
    GLOBAL_EP = 0

    def __init__(self, sess, name, N_S, N_A, globalAC):
        self.SESS = sess
        self.N_S = N_S
        self.N_A = N_A
        self.env = StockEnv()
        self.name = name
        self.AC = A3CNet(self.SESS, self.name, self.N_S, self.N_A, globalAC)
        # self.saver = tf.train.Saver()

    def _record_global_reward_and_print(self, global_runing_rs, ep_r,
                                        global_ep, total_step):
        global_runing_rs.append(ep_r)
        try:
            print(self.name, "Ep:", global_ep,
                  "| Ep_r: %i" % global_runing_rs[-1], "| total step:",
                  total_step)
        except Exception as e:
            print(e)

    def train(self):
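        # Run episodes until the coordinator stops or MAX_GLOBAL_EP episodes complete; each episode is capped at MAX_TOTAL_STEP steps.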
        buffer_s, buffer_a, buffer_r = [], [], []
        s = self.env.reset()
        ep_r = 0
        total_step = 1

        def reset():
            nonlocal ep_r, total_step
            self.env.reset()
            ep_r = 0
            total_step = 1

        while not COORD.should_stop() and self.GLOBAL_EP < MAX_GLOBAL_EP:
            # s = self.env.reset()
            # ep_r = 0
            # total_step = 1
            reset()
            while total_step < MAX_TOTAL_STEP:
                try:
                    s = self.env.get_state()
                    a, p = self.AC.choose_action(s)
                    s_, r, done = self.env.step(a)
                    if done:
                        r = -2

                    ep_r += r
                    buffer_s.append(s)
                    buffer_a.append(a)
                    buffer_r.append(r)

                    if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                        self.AC.update(done, s_, buffer_r, buffer_s, buffer_a)
                        buffer_s, buffer_a, buffer_r = [], [], []

                    if done:
                        self._record_global_reward_and_print(
                            self.GLOBAL_RUNNING_R, ep_r, self.GLOBAL_EP,
                            total_step)
                        self.GLOBAL_EP += 1
                        reset()

                    # s = s_
                    total_step += 1
                    if self.name == 'W_0':
                        self.env.render()
                        time.sleep(0.05)
                        logger.debug([
                            "s ", s, " v ",
                            self.AC.get_v(s), " a ", a, " p ", p, " ep_r ",
                            ep_r, " total ", self.env.total, " acct ",
                            self.env.acct
                        ])
                except Exception as e:
                    print(e)

            try:
                print(self.name, " not done,may be donkey!", " total_step:",
                      total_step)
            except Exception as e:
                print(e)