Example #1
class DDQN:
    """ Deep Q-Learning Main Algorithm
    """

    def __init__(self, action_dim, state_dim, args):
        """ Initialization
        """
        # Environment and DDQN parameters
        self.with_per = args.with_per
        self.action_dim = action_dim
        self.state_dim = (args.consecutive_frames,) + state_dim
        #
        self.lr = 2.5e-4
        self.gamma = 0.95
        self.epsilon = 0.8
        self.epsilon_decay = 0.99
        self.buffer_size = 20000
        #
        if(len(state_dim) < 3):
            self.tau = 1e-2
        else:
            self.tau = 1.0
        # Create actor and critic networks
        self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau, args.dueling)
        # Memory Buffer for Experience Replay
        self.buffer = MemoryBuffer(self.buffer_size, args.with_per)

    def policy_action(self, s):
        """ Apply an espilon-greedy policy to pick next action
        """
        if random() <= self.epsilon:
            return randrange(self.action_dim)
        else:
            return np.argmax(self.agent.predict(s)[0])

    def train_agent(self, batch_size):
        """ Train Q-network on batch sampled from the buffer
        """
        # Sample experience from memory buffer (optionally with PER)
        s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size)

        # Apply Bellman Equation on batch samples to train our DDQN
        q = self.agent.predict(s)
        next_q = self.agent.predict(new_s)
        q_targ = self.agent.target_predict(new_s)

        for i in range(s.shape[0]):
            old_q = q[i, a[i]]
            if d[i]:
                q[i, a[i]] = r[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action]
            if(self.with_per):
                # Update PER Sum Tree
                self.buffer.update(idx[i], abs(old_q - q[i, a[i]]))
        # Train on batch
        self.agent.fit(s, q)
        # Decay epsilon
        self.epsilon *= self.epsilon_decay


    def train(self, env, args, summary_writer):
        """ Main DDQN Training Algorithm
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")

        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()

            while not done:
                if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if(self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()

            # Gather stats every episode for plotting
            if(args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """

        if(self.with_per):
            q_val = self.agent.predict(state)
            q_val_t = self.agent.target_predict(new_state)
            next_best_action = np.argmax(self.agent.predict(new_state))
            new_val = reward + self.gamma * q_val_t[0, next_best_action]
            td_error = abs(new_val - q_val)[0]
        else:
            td_error = 0
        self.buffer.memorize(state, action, reward, done, new_state, td_error)
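
A note on the shared buffer interface: every example on this page calls MemoryBuffer.memorize(state, action, reward, done, new_state, td_error), sample_batch(batch_size), update(idx, td_error) and size(). The original repositories back this with a sum tree; below is a minimal sketch of the same interface using plain proportional prioritization over a deque. The class name and hyper-parameters (epsilon, alpha) are assumptions for illustration, not the original implementation.

import numpy as np
from collections import deque


class SimplePERBuffer:
    """Minimal replay buffer with proportional prioritization (a sketch of the
    MemoryBuffer interface used in the examples, not the original sum-tree code)."""

    def __init__(self, buffer_size, with_per=True, epsilon=1e-6, alpha=0.6):
        self.with_per = with_per
        self.epsilon = epsilon      # keeps every priority strictly positive
        self.alpha = alpha          # how strongly the TD error shapes sampling
        self.data = deque(maxlen=buffer_size)
        self.priorities = deque(maxlen=buffer_size)

    def memorize(self, state, action, reward, done, new_state, td_error=0.0):
        # store one transition together with its initial priority
        self.data.append((state, action, reward, done, new_state))
        self.priorities.append((abs(td_error) + self.epsilon) ** self.alpha)

    def size(self):
        return len(self.data)

    def sample_batch(self, batch_size):
        # sample indices proportionally to priority (uniformly when PER is off)
        if self.with_per:
            p = np.asarray(self.priorities, dtype=np.float64)
            idx = np.random.choice(len(self.data), batch_size, p=p / p.sum())
        else:
            idx = np.random.choice(len(self.data), batch_size)
        s, a, r, d, s2 = zip(*(self.data[i] for i in idx))
        return (np.array(s), np.array(a), np.array(r),
                np.array(d), np.array(s2), idx)

    def update(self, idx, td_error):
        # refresh the priority of a single transition after training on it
        self.priorities[idx] = (abs(td_error) + self.epsilon) ** self.alpha
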
Example #2
class DDQN:
    """ Deep Q-Learning Main Algorithm          深度Q学习主要算法
    """
    def __init__(self, action_dim, state_dim, args):
        """ Initialization      初始化
        """
        # Environment and DDQN parameters       环境和DDQN参数
        self.with_per = args.with_per
        self.action_dim = action_dim
        self.state_dim = (args.consecutive_frames, ) + state_dim
        #
        self.lr = 2.5e-4
        self.gamma = 0.95
        self.epsilon = 0.8
        self.epsilon_decay = 0.99
        self.buffer_size = 20000
        #
        if (len(state_dim) < 3):
            self.tau = 1e-2
        else:
            self.tau = 1.0
        # Create actor and critic networks
        self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau,
                           args.dueling)
        # Memory Buffer for Experience Replay
        self.buffer = MemoryBuffer(self.buffer_size, args.with_per)

    def policy_action(self, s):
        """ Apply an espilon-greedy policy to pick next action          应用epsilon-greedy策略选择下一步操作
        """
        if random() <= self.epsilon:
            return randrange(self.action_dim)
        else:
            return np.argmax(self.agent.predict(s)[0])

    def train_agent(self, batch_size):
        """ Train Q-network on batch sampled from the buffer            从缓冲区采样的批次训练Q网络
        """
        # Sample experience from memory buffer (optionally with PER)    来自内存缓冲区的示例体验(可选配PER)
        s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size)

        # Apply Bellman Equation on batch samples to train our DDQN     在批次样本里应用Bellman方程来训练我们的DDQN
        q = self.agent.predict(s)
        next_q = self.agent.predict(new_s)
        q_targ = self.agent.target_predict(new_s)

        for i in range(s.shape[0]):
            old_q = q[i, a[i]]
            if d[i]:
                q[i, a[i]] = r[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action]
            if (self.with_per):
                # Update PER Sum Tree
                self.buffer.update(idx[i], abs(old_q - q[i, a[i]]))
        # Train on batch
        self.agent.fit(s, q)
        # Decay epsilon
        self.epsilon *= self.epsilon_decay

    def train(self, env, args, summary_writer):
        """ Main DDQN Training Algorithm                DDQN主要训练算法
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")

        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()

            while not done:
                if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if (self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()

            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer                           将经验存储在内存缓冲区中
        """

        if (self.with_per):
            q_val = self.agent.predict(state)
            q_val_t = self.agent.target_predict(new_state)
            next_best_action = np.argmax(self.agent.predict(new_state))
            new_val = reward + self.gamma * q_val_t[0, next_best_action]
            td_error = abs(new_val - q_val)[0]
        else:
            td_error = 0
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def save_weights(self, path):
        path += '_LR_{}'.format(self.lr)
        if (self.with_per):
            path += '_PER'
        self.agent.save(path)

    def load_weights(self, path):
        self.agent.load_weights(path)
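
A hedged usage sketch for the class above: the gym environment, the args fields, and the TF1-style FileWriter are assumptions for illustration, and Agent, MemoryBuffer, tfSummary and gather_stats must come from the original repository.

# Illustrative wiring only; the original project builds `args` from its own CLI parser.
from argparse import Namespace
import gym
import tensorflow as tf   # TF 1.x, matching the add_summary()/tfSummary() calls above

args = Namespace(with_per=True, consecutive_frames=4, dueling=True,
                 nb_episodes=500, batch_size=32, render=False, gather_stats=False)

env = gym.make("CartPole-v1")
algo = DDQN(env.action_space.n, env.observation_space.shape, args)
writer = tf.summary.FileWriter("./logs")
stats = algo.train(env, args, writer)
algo.save_weights("./models/ddqn")
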
Example #3
class ddpgAgent():
    """Deep Deterministic Policy Gradient(DDPG) Agent
	"""
    def __init__(self, env_, is_discrete=False, batch_size=100, w_per=True):
        # gym environments
        self.env = env_
        self.discrete = is_discrete
        self.obs_dim = env_.observation_space.shape[0]
        self.act_dim = env_.action_space.n if is_discrete else env_.action_space.shape[
            0]

        self.action_bound = (env_.action_space.high - env_.action_space.low
                             ) / 2 if not is_discrete else 1.
        self.action_shift = (env_.action_space.high + env_.action_space.low
                             ) / 2 if not is_discrete else 0.

        # initialize actor & critic and its targets
        self.discount_factor = 0.99
        self.actor = ActorNet(self.obs_dim,
                              self.act_dim,
                              self.action_bound,
                              lr_=1e-4,
                              tau_=1e-3)
        self.critic = CriticNet(self.obs_dim,
                                self.act_dim,
                                lr_=1e-3,
                                tau_=1e-3,
                                discount_factor=self.discount_factor)

        # Experience Buffer
        self.buffer = MemoryBuffer(BUFFER_SIZE, with_per=w_per)
        self.with_per = w_per
        self.batch_size = batch_size
        # OU-Noise-Process
        self.noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

    ###################################################
    # Network Related
    ###################################################
    def make_action(self, obs, t, noise=True):
        """ predict next action from Actor's Policy
		"""
        action_ = self.actor.predict(obs)[0]
        a = np.clip(action_ + (self.noise.generate(t) if noise else 0),
                    -self.action_bound, self.action_bound)
        return a

    def update_networks(self, obs, acts, critic_target):
        """ Train actor & critic from sampled experience
		"""
        # update critic
        self.critic.train(obs, acts, critic_target)

        # get next action and Q-value Gradient
        n_actions = self.actor.network.predict(obs)
        q_grads = self.critic.Qgradient(obs, n_actions)

        # update actor
        self.actor.train(obs, self.critic.network, q_grads)

        # update target networks
        self.actor.target_update()
        self.critic.target_update()

    def replay(self, replay_num_):
        if self.with_per and (self.buffer.size() <= self.batch_size): return

        for _ in range(replay_num_):
            # sample from buffer
            states, actions, rewards, dones, new_states, idx = self.sample_batch(
                self.batch_size)

            # get target q-value using target network
            q_vals = self.critic.target_predict(
                [new_states, self.actor.target_predict(new_states)])

            # bellman iteration for target critic value
            critic_target = np.asarray(q_vals)
            for i in range(q_vals.shape[0]):
                if dones[i]:
                    critic_target[i] = rewards[i]
                else:
                    critic_target[
                        i] = self.discount_factor * q_vals[i] + rewards[i]

                if self.with_per:
                    self.buffer.update(idx[i],
                                       abs(q_vals[i] - critic_target[i]))

            # train(or update) the actor & critic and target networks
            self.update_networks(states, actions, critic_target)

    ####################################################
    # Buffer Related
    ####################################################

    def memorize(self, obs, act, reward, done, new_obs):
        """store experience in the buffer
		"""
        if self.with_per:
            q_val = self.critic.network(
                [np.expand_dims(obs, axis=0),
                 self.actor.predict(obs)])
            next_action = self.actor.target_network.predict(
                np.expand_dims(new_obs, axis=0))
            q_val_t = self.critic.target_predict(
                [np.expand_dims(new_obs, axis=0), next_action])
            new_val = reward + self.discount_factor * q_val_t
            td_error = abs(new_val - q_val)[0]
        else:
            td_error = 0

        self.buffer.memorize(obs, act, reward, done, new_obs, td_error)

    def sample_batch(self, batch_size):
        """ Sampling from the batch
		"""
        return self.buffer.sample_batch(batch_size)

    ###################################################
    # Save & Load Networks
    ###################################################
    def save_weights(self, path):
        """ Agent's Weights Saver
		"""
        self.actor.save_network(path)
        self.critic.save_network(path)

    def load_weights(self, pretrained):
        """ Agent's Weights Loader
		"""
        self.actor.load_network(pretrained)
        self.critic.load_network(pretrained)
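
The tau_ arguments and target_update() calls above point to Polyak (soft) target updates. Below is a minimal sketch of that rule, assuming the actor/critic wrappers expose Keras-style get_weights()/set_weights(); the helper name is hypothetical, not the original API.

def soft_update(network, target_network, tau=1e-3):
    """Blend the target toward the online network: theta_target <- tau*theta + (1-tau)*theta_target."""
    weights = network.get_weights()
    target_weights = target_network.get_weights()
    target_network.set_weights(
        [tau * w + (1.0 - tau) * tw for w, tw in zip(weights, target_weights)])
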
Example #4
class DDQN:
    """ Deep Q-Learning Main Algorithm
    """
    def __init__(self, action_dim, state_dim, args):
        """ Initialization
        """
        # Environment and DDQN parameters
        self.with_per = args.with_per
        self.action_dim = action_dim
        self.state_dim = state_dim
        #
        self.lr = 2.5e-4
        self.gamma = 0.95
        self.epsilon = 0.8
        self.epsilon_decay = 0.99
        self.buffer_size = 20000
        #
        self.tau = 1e-2

        # Create actor and critic networks
        self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau,
                           args.dueling, args.hidden_dim)
        # Memory Buffer for Experience Replay
        self.buffer = MemoryBuffer(self.buffer_size, args.with_per)

    def policy_action(self, s):
        """ Apply an espilon-greedy policy to pick next action
        """
        if random() <= self.epsilon:
            return randrange(self.action_dim)
        else:
            return np.argmax(self.agent.predict(s)[0])

    def train_agent(self, batch_size):
        """ Train Q-network on batch sampled from the buffer
        """
        # Sample experience from memory buffer (optionally with PER)
        s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size)

        # Apply Bellman Equation on batch samples to train our DDQN
        q = self.agent.predict(s)
        next_q = self.agent.predict(new_s)
        q_targ = self.agent.target_predict(new_s)

        for i in range(s.shape[0]):
            old_q = q[i, a[i]]
            if d[i]:
                q[i, a[i]] = r[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action]
            if (self.with_per):
                # Update PER Sum Tree
                self.buffer.update(idx[i], abs(old_q - q[i, a[i]]))
        # Train on batch
        self.agent.fit(s, q)
        # Decay epsilon
        self.epsilon *= self.epsilon_decay

    def train(self, env, args, summary_writer, envtest=None):
        """ Main DDQN Training Algorithm
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")
        epoch = 0
        gross_profit = 0
        WritetoCsvFile("logFile_1.csv", [
            "stage", "file", "history_win", "stop", "usevol", "dueling",
            "traineval", "allprices", "allprices2", "allprices3", "ma1", "ma2",
            "madifference", "hidema", "candlenum", "hidden_dim", "maxProfit",
            "maxLOSS", "avgProfit", "avgLOSS", "countprofit", "countloss",
            "maxdrop", "Total profit", "total_reward", "TRADES", "epoch"
        ])
        WritetoCsvFile("logFileDetail.csv", [
            "stage", "file", "history_win", "stop", "usevol", "dueling",
            "traineval", "allprices", "allprices2", "allprices3", "ma1", "ma2",
            "madifference", "hidema", "candlenum", "hidden_dim", 'maxProfit',
            'maxLOSS', 'avgProfit', 'avgLOSS', 'maxdrop', 'Total profit',
            'gross profit', "total_reward", 'TRADES', 'epoch'
        ])

        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            ##########################################
            total_reward = 0
            total_profit = 0
            total_loss = 0
            total_profitMax = 0
            total_profitMin = 0
            max_drop = 0
            profitLst = []
            lossLst = []
            trades = 0
            step = 0
            ##########################################

            while not done:
                #if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # Retrieve new state, reward, and whether the state is terminal
                #new_state, r, done, _ = env.step(a)

                #######################################################
                new_state, r, done, buy, sell, profit = env.step(a)

                total_reward += r
                if profit != 0:
                    trades += 1
                    total_profit += profit
                    if total_profit > total_profitMax:
                        total_profitMax = total_profit
                        total_profitMin = total_profit
                    if total_profit < total_profitMin:
                        total_profitMin = total_profit
                        try:
                            if total_profitMax != 0 and max_drop < (
                                    total_profitMax -
                                    total_profitMin) / total_profitMax:
                                max_drop = (total_profitMax -
                                            total_profitMin) / total_profitMax
                        except:
                            max_drop = 0

                if profit > 0:
                    profitLst.append(profit)
                elif profit < 0:
                    lossLst.append(profit)

                step += 1
                if step % 1500 == 0:
                    print(
                        'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {}/{} TRADES: {}  '
                        .format(np.max(profitLst + [0]),
                                -np.min(lossLst + [0]),
                                np.mean(profitLst + [0]),
                                -np.mean(lossLst + [0]), max_drop,
                                total_profit, gross_profit, trades))

                    WritetoCsvFile("logFileDetail.csv", [
                        "train", args.trainf, args.history_win, args.stop,
                        args.usevol, args.dueling, args.traineval,
                        args.allprices, args.allprices2, args.allprices3,
                        args.ma1, args.ma2, args.madifference, args.hidema,
                        args.candlenum, args.hidden_dim,
                        np.max(profitLst + [0]), -np.min(lossLst + [0]),
                        np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                        max_drop, total_profit, gross_profit, total_reward,
                        trades, epoch
                    ])
                #done = True if step == len(env.data) - 3 else False
                ######################################################
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if (self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()

            gross_profit += total_profit
            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            l_profit = tfSummary('profit', total_profit)
            l_aprofit = tfSummary('average profit', np.mean(profitLst))
            l_aloss = tfSummary('l_aloss', -np.mean(lossLst))
            l_trades = tfSummary('l_trades', trades)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.add_summary(l_profit, global_step=e)
            summary_writer.add_summary(l_aprofit, global_step=e)
            summary_writer.add_summary(l_aloss, global_step=e)
            summary_writer.add_summary(l_trades, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()
            self.agent.saveModel("./models/model_ep", "")
            results = [
                np.max(profitLst + [0]), -np.min(lossLst + [0]),
                np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                len(profitLst),
                len(lossLst), max_drop, total_profit, total_reward, trades
            ]

            WritetoCsvFile("logFile_1.csv", [
                "train", args.trainf, args.history_win, args.stop, args.usevol,
                args.dueling, args.traineval, args.allprices, args.allprices2,
                args.allprices3, args.ma1, args.ma2, args.madifference,
                args.hidema, args.candlenum, args.hidden_dim
            ] + results + [epoch])
            if envtest:  # If a test environment is given, evaluate after every epoch
                newargs = args
                newargs.traineval = False
                self.evaluate(envtest,
                              newargs,
                              summary_writer,
                              model=None,
                              epoch=epoch)

            epoch += 1
        return results

    def memorize(self, state, action, reward, done, new_state):
        """ Store experience in memory buffer
        """

        if (self.with_per):
            q_val = self.agent.predict(state)
            q_val_t = self.agent.target_predict(new_state)
            next_best_action = np.argmax(q_val)
            new_val = reward + self.gamma * q_val_t[0, next_best_action]
            td_error = abs(new_val - q_val)[0]
        else:
            td_error = 0
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def evaluate(self, env, args, summary_writer, model, epoch=0):
        """ Evaluate            """
        results = []
        if model:
            self.agent.loadModel_versoin(model, "")
        done = False
        old_state = env.reset()
        ##########################################
        total_reward = 0
        total_profit = 0
        total_loss = 0
        total_profitMax = 0
        total_profitMin = 0
        max_drop = 0
        profitLst = []
        lossLst = []
        step = 0
        trades = 0
        ##########################################
        while not done:
            # if args.render: env.render()
            # Actor picks an action (following the policy)
            a = self.policy_action(old_state)
            # Retrieve new state, reward, and whether the state is terminal
            new_state, r, done, buy, sell, profit = env.step(a)

            #######################################################
            total_reward += r
            if profit != 0:
                trades += 1
                total_profit += profit
                if total_profit > total_profitMax:
                    total_profitMax = total_profit
                    total_profitMin = total_profit
                if total_profit < total_profitMin:
                    total_profitMin = total_profit
                    try:
                        if total_profitMax != 0 and max_drop < (
                                total_profitMax -
                                total_profitMin) / total_profitMax:
                            max_drop = (total_profitMax -
                                        total_profitMin) / total_profitMax
                    except:
                        max_drop = 0
            if profit > 0:
                profitLst.append(profit)

            elif profit < 0:
                lossLst.append(profit)
            step += 1
            if step % 1500 == 0:
                print(
                    'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {}  Total reward: {}  TRADES: {}  '
                    .format(np.max(profitLst + [0]), -np.min(lossLst + [0]),
                            np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                            max_drop, total_profit, total_reward, trades))
                WritetoCsvFile("logFileDetail.csv", [
                    "eval", args.trainf, args.history_win, args.stop,
                    args.usevol, args.dueling, args.traineval, args.allprices,
                    args.allprices2, args.allprices3, args.ma1, args.ma2,
                    args.madifference, args.hidema, args.candlenum,
                    args.hidden_dim,
                    np.max(profitLst + [0]), -np.min(lossLst + [0]),
                    np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                    max_drop, total_profit, total_profit, total_reward, trades,
                    epoch
                ])
            #done = True if step == len(env.data) - 2 else False
            ######################################################
            # Memorize for experience replay
            if args.traineval:
                self.memorize(old_state, a, r, done, new_state)
                # Train DDQN and transfer weights to target network
                if (self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()
            # Update current state
            old_state = new_state
        print(
            'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {} Total reward: {} TRADES: {}  '
            .format(np.max(profitLst + [0]), -np.min(lossLst + [0]),
                    np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                    max_drop, total_profit, total_reward, trades))
        results = [
            np.max(profitLst + [0]), -np.min(lossLst + [0]),
            np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
            len(profitLst),
            len(lossLst), max_drop, total_profit, total_reward, trades
        ]
        WritetoCsvFile("logFile_1.csv", [
            "eval", args.trainf, args.history_win, args.stop, args.usevol,
            args.dueling, args.traineval, args.allprices, args.allprices2,
            args.allprices3, args.ma1, args.ma2, args.madifference,
            args.hidema, args.candlenum, args.hidden_dim
        ] + results + [epoch])
        return results
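
The max_drop bookkeeping in train() and evaluate() tracks the largest relative drawdown of the running profit. The same calculation as a standalone sketch (a hypothetical helper, not part of the original code):

def max_relative_drawdown(profits):
    """Largest peak-to-trough drop of the cumulative profit curve, as a fraction of the peak."""
    cumulative, peak, max_drop = 0.0, 0.0, 0.0
    for p in profits:
        cumulative += p
        peak = max(peak, cumulative)
        if peak > 0:
            max_drop = max(max_drop, (peak - cumulative) / peak)
    return max_drop

print(max_relative_drawdown([5, -2, 3, -4, 4]))  # 0.666...: the curve peaks at 6, then falls to 2
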
Example #5
class Agent:
    """ Stock Trading Bot """
    def __init__(self,
                 buffer_size,
                 state_size,
                 action_size=3,
                 learning_rate=0.001):

        # agent config
        self.buffer = MemoryBuffer(buffer_size, True)
        self.state_size = state_size
        self.action_size = action_size
        self.inventory = []
        self.n_iter = 0  # update counter used by target_model_update('reset')

        # model config
        self.gamma = 0.95  # affinity for long term reward
        self.loss = huber_loss
        self.optimizer = Adam(lr=learning_rate)

        # target network
        self.model = self._model()
        self.target_model = clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

    def _model(self):

        inputs = Input(shape=self.state_size)
        x = Dense(64, activation='relu')(inputs)
        x = Dense(128, activation='relu')(x)

        value = Dense(self.action_size, activation='linear')(x)
        a = Dense(self.action_size, activation='linear')(x)
        mean = Lambda(lambda x: K.mean(x, axis=1, keepdims=True))(a)
        advantage = Subtract()([a, mean])
        q = Add()([value, advantage])

        model = Model(inputs=inputs, outputs=q)
        model.compile(loss=self.loss, optimizer=self.optimizer)
        return model

    def remember(self, state, action, reward, next_state, done):

        # self.memory is never defined on this class; store in the PER buffer with a neutral priority
        self.buffer.memorize(state, action, reward, done, next_state, 0)

    def act(self, state, epsilon, is_eval=False):

        # take random action in order to diversify experience at the beginning
        if not is_eval and random.random() <= epsilon:
            return random.randrange(self.action_size)

        state = state.reshape((-1, ) + self.state_size)
        action_probs = self.model.predict(state)
        return np.argmax(action_probs[0])

    def epsilon_decay(self, epsilon, epsilon_min, epsilon_decay):
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay
        return epsilon

    def remember_sumtree(
        self,
        state,
        action,
        reward,
        new_state,
        done,
    ):

        state = state.reshape((-1, ) + self.state_size)
        new_state = new_state.reshape((-1, ) + self.state_size)

        q_val = self.model.predict(state)
        q_val_t = self.target_model.predict(new_state)
        next_best_action = np.argmax(q_val)
        new_val = reward + self.gamma * q_val_t[0, next_best_action]
        td_error = abs(new_val - q_val + 1e-8)[0]

        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def target_model_update(self,
                            done,
                            tau=0.1,
                            type='reset',
                            reset_every=5000):

        self.n_iter += 1  # advance the counter driving the 'reset' schedule
        if type == 'reset':
            if self.n_iter % reset_every == 0:
                print('update target model')
                # reset target model weights
                self.target_model.set_weights(self.model.get_weights())

        if type == 'transfer':
            if done:
                W = self.model.get_weights()
                tgt_W = self.target_model.get_weights()
                for i in range(len(W)):
                    tgt_W[i] = tau * W[i] + (1 - tau) * tgt_W[i]
                self.target_model.set_weights(tgt_W)

    def train_experience_replay_sumtree(
        self,
        batch_size,
    ):

        state, action, reward, done, new_state, idx = self.buffer.sample_batch(
            batch_size)

        state = state.reshape((-1, ) + self.state_size)
        new_state = new_state.reshape((-1, ) + self.state_size)

        q = self.model.predict(state)

        next_q = self.model.predict(new_state)
        q_targ = self.target_model.predict(new_state)

        for i in range(state.shape[0]):
            old_q = q[i, action[i]]
            if done[i]:
                q[i, action[i]] = reward[i]
            else:
                next_best_action = np.argmax(next_q[i, :])
                q[i, action[i]] = reward[i] + self.gamma * q_targ[
                    i, next_best_action]
            self.buffer.update(idx[i], abs(old_q - q[i, action[i]]))

        loss = self.model.fit((state), q, epochs=1,
                              verbose=0).history["loss"][0]

        return loss

    def save(self, name):
        if not os.path.exists('save/' + name):
            os.makedirs('save/' + name)
            np.save('save/' + name + '/data.npy', self.buffer.buffer.data)
            np.save('save/' + name + '/tree.npy', self.buffer.buffer.tree)
            self.model.save('save/' + name + '/model.h5')
            self.target_model.save('save/' + name + '/target_model.h5')
        else:
            print('already exist, please check.')

    def load(self, name):
        if not os.path.exists('save/' + name):
            print('not exist, please check.')
        else:
            self.buffer.buffer.data = np.load('save/' + name + '/data.npy',
                                              allow_pickle=True)
            self.buffer.buffer.tree = np.load('save/' + name + '/tree.npy',
                                              allow_pickle=True)
            self.model = load_model('save/' + name + '/model.h5')
            self.target_model = load_model('save/' + name + '/target_model.h5')
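
train_experience_replay_sumtree() above applies the Double-DQN target: the online model selects the next action, the target model evaluates it. A tiny numpy illustration of that rule with made-up Q-values:

import numpy as np

gamma, reward, done = 0.95, 1.0, False
next_q_online = np.array([[0.2, 0.8, 0.5]])   # online model on the next state
next_q_target = np.array([[0.3, 0.6, 0.9]])   # target model on the next state

best_next = np.argmax(next_q_online[0])                                    # online net picks action 1
target = reward if done else reward + gamma * next_q_target[0, best_next]
print(best_next, target)                                                   # 1  1.57
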
Example #6
class td3Agent():
    """Twin Delayed Deep Deterministic Policy Gradient(TD3) Agent
	"""
    def __init__(self,
                 env_,
                 is_discrete=False,
                 batch_size=100,
                 w_per=True,
                 update_delay=2):
        # gym environments
        self.env = env_
        self.discrete = is_discrete
        self.obs_dim = env_.observation_space.shape[0]
        self.act_dim = env_.action_space.n if is_discrete else env_.action_space.shape[
            0]

        self.action_bound = (env_.action_space.high - env_.action_space.low
                             ) / 2 if not is_discrete else 1.
        self.action_shift = (env_.action_space.high + env_.action_space.low
                             ) / 2 if not is_discrete else 0.

        # initialize actor & critic and its targets
        self.discount_factor = 0.99
        self.actor = ActorNet(self.obs_dim,
                              self.act_dim,
                              self.action_bound,
                              lr_=3e-4,
                              tau_=5e-3)
        self.critic = CriticNet(self.obs_dim,
                                self.act_dim,
                                lr_=3e-4,
                                tau_=5e-3,
                                discount_factor=self.discount_factor)

        # Experience Buffer
        self.buffer = MemoryBuffer(BUFFER_SIZE, with_per=w_per)
        self.with_per = w_per
        self.batch_size = batch_size
        # OU-Noise-Process
        self.noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

        # for Delayed Policy Update
        self._update_step = 0
        self._target_update_interval = update_delay

    ###################################################
    # Network Related
    ###################################################
    def make_action(self, obs, t, noise=True):
        """ predict next action from Actor's Policy
		"""
        action_ = self.actor.predict(obs)[0]
        sigma = 0.1  # std of gaussian
        a = np.clip(
            action_ +
            (np.random.normal(0, self.action_bound * sigma) if noise else 0),
            -self.action_bound, self.action_bound)
        #a = np.clip(action_ + self.noise.generate(t) if noise else 0, -self.action_bound, self.action_bound)
        return a

    def make_target_action(self, obs, noise=True):
        """ predict next action from Actor's Target Policy
		"""
        action_ = self.actor.target_predict(obs)
        sigma = 0.2
        #return action_
        clipped_noise = np.clip(np.random.normal(0, self.action_bound * sigma),
                                -self.action_bound * 0.5,
                                self.action_bound * 0.5)
        a = np.clip(action_ + (clipped_noise if noise else 0),
                    -self.action_bound, self.action_bound)
        return a

    def update_networks(self, obs, acts, critic_target):
        """ Train actor & critic from sampled experience
		"""
        # update critic
        self.critic.train(obs, acts, critic_target)
        if self._update_step % self._target_update_interval == 0:
            # update actor
            self.actor.train(obs, self.critic.network_1)

            # update target networks
            self.actor.target_update()
            self.critic.target_update()
        self._update_step = self._update_step + 1

    def train(self):
        if self.with_per and (self.buffer.size() <= self.batch_size): return

        # sample from buffer
        states, actions, rewards, dones, new_states, idx = self.sample_batch(
            self.batch_size)

        # get target q-value using target network
        new_actions = self.make_target_action(new_states)
        q1_vals = self.critic.target_network_1.predict(
            [new_states, new_actions])
        q2_vals = self.critic.target_network_2.predict(
            [new_states, new_actions])

        # bellman iteration for target critic value
        q_vals = np.min(np.vstack([q1_vals.transpose(),
                                   q2_vals.transpose()]),
                        axis=0)
        critic_target = np.asarray(q_vals)
        # print(np.vstack([q1_vals.transpose(),q2_vals.transpose()]))
        # print(q_vals)
        for i in range(q1_vals.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[
                    i] = self.discount_factor * q_vals[i] + rewards[i]

            if self.with_per:
                self.buffer.update(idx[i], abs(q_vals[i] - critic_target[i]))

        # train(or update) the actor & critic and target networks
        self.update_networks(states, actions, critic_target)

    ####################################################
    # Buffer Related
    ####################################################

    def memorize(self, obs, act, reward, done, new_obs):
        """store experience in the buffer
		"""
        if self.with_per:
            # not implemented for td3, yet.
            q_val = self.critic.network(
                [np.expand_dims(obs, axis=0),
                 self.actor.predict(obs)])
            next_action = self.actor.target_network.predict(
                np.expand_dims(new_obs, axis=0))
            q_val_t = self.critic.target_network.predict(
                [np.expand_dims(new_obs, axis=0), next_action])
            new_val = reward + self.discount_factor * q_val_t
            td_error = abs(new_val - q_val)[0]
        else:
            td_error = 0

        self.buffer.memorize(obs, act, reward, done, new_obs, td_error)

    def sample_batch(self, batch_size):
        """ Sampling from the batch
		"""
        return self.buffer.sample_batch(batch_size)

    ###################################################
    # Save & Load Networks
    ###################################################
    def save_weights(self, path):
        """ Agent's Weights Saver
		"""
        self.actor.save_network(path)
        self.critic.save_network(path)

    def load_weights(self, pretrained):
        """ Agent's Weights Loader
		"""
        self.actor.load_network(pretrained)
        self.critic.load_network(pretrained)
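
A hedged sketch of how td3Agent might be driven against a Gym environment. The environment name, episode count, and the classic four-value step API are assumptions; BUFFER_SIZE, ActorNet and CriticNet still have to come from the original repository, and PER is left off because the memorize() path above notes it is not implemented for TD3 yet.

import gym

env = gym.make("Pendulum-v1")          # illustrative continuous-control task
agent = td3Agent(env, is_discrete=False, batch_size=100, w_per=False)

for episode in range(200):
    obs, done, ep_reward, t = env.reset(), False, 0.0, 0
    while not done:
        act = agent.make_action(obs, t)                     # noisy action from the actor
        new_obs, reward, done, _ = env.step(act)            # classic Gym API assumed
        agent.memorize(obs, act, reward, done, new_obs)
        if agent.buffer.size() > agent.batch_size:          # wait until a full batch exists
            agent.train()
        obs, ep_reward, t = new_obs, ep_reward + reward, t + 1
    print(episode, ep_reward)

agent.save_weights("./models/td3")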