Example #1
    def test_config_init(self, train_config):
        c = train_config
        config = DDPGPer.generate_config({})
        config["frame_config"]["models"] = [
            "Actor", "Actor", "Critic", "Critic"
        ]
        config["frame_config"][
            "model_kwargs"] = [{
                "state_dim": c.observe_dim,
                "action_dim": c.action_dim,
                "action_range": c.action_range,
            }] * 2 + [{
                "state_dim": c.observe_dim,
                "action_dim": c.action_dim
            }] * 2
        ddpg_per = DDPGPer.init_from_config(config)

        old_state = state = t.zeros([1, c.observe_dim], dtype=t.float32)
        action = t.zeros([1, c.action_dim], dtype=t.float32)
        ddpg_per.store_transition({
            "state": {"state": old_state},
            "action": {"action": action},
            "next_state": {"state": state},
            "reward": 0,
            "terminal": False,
        })
        ddpg_per.update()
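
The test above exercises the config-based constructor with a single stored transition and one update. Below is a minimal interaction-loop sketch built from the same calls (act_with_noise, store_transition, update); the dimensions, the random "environment" step, and the update schedule are illustrative assumptions, and ddpg_per is assumed to be constructed as in the examples on this page.

import torch as t

observe_dim, action_dim = 4, 2  # illustrative sizes
state = t.zeros([1, observe_dim], dtype=t.float32)

for episode in range(10):
    for step in range(100):
        old_state = state
        # exploration: noisy action (see also act_with_noise in Example #8)
        action = ddpg_per.act_with_noise(
            {"state": old_state}, noise_param=(0, 0.1), mode="normal"
        )
        # dummy transition; a real environment step would go here
        state = t.randn([1, observe_dim], dtype=t.float32)
        reward, terminal = 0.0, step == 99
        ddpg_per.store_transition({
            "state": {"state": old_state},
            "action": {"action": action},
            "next_state": {"state": state},
            "reward": reward,
            "terminal": terminal,
        })
    # one gradient update per episode; the schedule is arbitrary here
    ddpg_per.update()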
Example #2
 def ddpg_per(self, train_config, device, dtype):
     c = train_config
     actor = smw(
         Actor(c.observe_dim, c.action_dim,
               c.action_range).type(dtype).to(device),
         device,
         device,
     )
     actor_t = smw(
         Actor(c.observe_dim, c.action_dim,
               c.action_range).type(dtype).to(device),
         device,
         device,
     )
     critic = smw(
         Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device,
         device)
     critic_t = smw(
         Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device,
         device)
     ddpg_per = DDPGPer(
         actor,
         actor_t,
         critic,
         critic_t,
         t.optim.Adam,
         nn.MSELoss(reduction="sum"),
         replay_device="cpu",
         replay_size=c.replay_size,
     )
     return ddpg_per
Example #3
 def ddpg_per_vis(self, train_config, device, dtype, tmpdir):
     # not used for training, only used for testing apis
     c = train_config
     tmp_dir = tmpdir.make_numbered_dir()
     actor = smw(
         Actor(c.observe_dim, c.action_dim,
               c.action_range).type(dtype).to(device),
         device,
         device,
     )
     actor_t = smw(
         Actor(c.observe_dim, c.action_dim,
               c.action_range).type(dtype).to(device),
         device,
         device,
     )
     critic = smw(
         Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device,
         device)
     critic_t = smw(
         Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device,
         device)
     ddpg_per = DDPGPer(
         actor,
         actor_t,
         critic,
         critic_t,
         t.optim.Adam,
         nn.MSELoss(reduction="sum"),
         replay_device="cpu",
         replay_size=c.replay_size,
         visualize=True,
         visualize_dir=str(tmp_dir),
     )
     return ddpg_per
Example #4
    def test_criterion(self, train_config, device, dtype):
        c = train_config
        actor = smw(
            Actor(c.observe_dim, c.action_dim, c.action_range).type(dtype).to(device),
            device,
            device,
        )
        actor_t = smw(
            Actor(c.observe_dim, c.action_dim, c.action_range).type(dtype).to(device),
            device,
            device,
        )
        critic = smw(
            Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
        )
        critic_t = smw(
            Critic(c.observe_dim, c.action_dim).type(dtype).to(device), device, device
        )
        with pytest.raises(
            RuntimeError, match="Criterion does not have the 'reduction' property"
        ):

            def criterion(a, b):
                return a - b

            _ = DDPGPer(
                actor,
                actor_t,
                critic,
                critic_t,
                t.optim.Adam,
                criterion,
                replay_device="cpu",
                replay_size=c.replay_size,
            )
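
The check above fails because a plain function has no 'reduction' attribute. As the error message suggests, DDPGPer appears to require a criterion exposing 'reduction' (as torch's built-in losses do), presumably so it can compute per-sample losses for importance weighting. A minimal sketch of a custom criterion that would satisfy the check follows; the class is illustrative and not part of machin.

import torch.nn as nn

class PerSampleMSELoss(nn.Module):
    # exposes a 'reduction' attribute like nn.MSELoss, so the
    # DDPGPer constructor check above passes
    def __init__(self, reduction="sum"):
        super().__init__()
        self.reduction = reduction

    def forward(self, value, target):
        loss = (value - target) ** 2
        if self.reduction == "sum":
            return loss.sum()
        if self.reduction == "mean":
            return loss.mean()
        return loss  # "none": per-sample losses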
Example #5
    def _build_model(self):

        actor = self._build_actor()
        actor_target = self._build_actor()
        critic = self._build_critic()
        critic_target = self._build_critic()

        optimizer = lambda params, lr: torch.optim.Adam(
            params, lr=lr, weight_decay=self.l2_reg)
        criterion = nn.MSELoss(reduction='sum')

        # DDPG with prioritized replay
        self.ddpg_per = DDPGPer(actor,
                                actor_target,
                                critic,
                                critic_target,
                                optimizer=optimizer,
                                criterion=criterion,
                                batch_size=self.batch_size,
                                actor_learning_rate=self.actor_learning_rate,
                                critic_learning_rate=self.critic_learning_rate,
                                discount=self.gamma,
                                replay_size=self.replay_capacity)
Example #6
 def ddpg_per_train(self, train_config):
     c = train_config
     # cpu is faster for testing full training.
     actor = smw(Actor(c.observe_dim, c.action_dim, c.action_range), "cpu", "cpu")
     actor_t = smw(Actor(c.observe_dim, c.action_dim, c.action_range), "cpu", "cpu")
     critic = smw(Critic(c.observe_dim, c.action_dim), "cpu", "cpu")
     critic_t = smw(Critic(c.observe_dim, c.action_dim), "cpu", "cpu")
     ddpg_per = DDPGPer(
         actor,
         actor_t,
         critic,
         critic_t,
         t.optim.Adam,
         nn.MSELoss(reduction="sum"),
         replay_device="cpu",
         replay_size=c.replay_size,
     )
     return ddpg_per
Example #7
 def ddpg_per(self, train_config):
     c = train_config
     actor = smw(
         Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
         c.device, c.device)
     actor_t = smw(
         Actor(c.observe_dim, c.action_dim, c.action_range).to(c.device),
         c.device, c.device)
     critic = smw(
         Critic(c.observe_dim, c.action_dim).to(c.device), c.device,
         c.device)
     critic_t = smw(
         Critic(c.observe_dim, c.action_dim).to(c.device), c.device,
         c.device)
     ddpg_per = DDPGPer(actor,
                        actor_t,
                        critic,
                        critic_t,
                        t.optim.Adam,
                        nn.MSELoss(reduction='sum'),
                        replay_device=c.device,
                        replay_size=c.replay_size)
     return ddpg_per
Example #8
class RLAgent:
    """
	Base off-policy DDPG agent with prioritized replay

	"""
    def __init__(
        self,
        state_dim,  # n_feat
        action_dim,  # 1+n_assets
        k,  # time series length
        network_params,
        actor_learning_rate,
        critic_learning_rate,
        last_action_as_state=True,  # whether to add last action to state
        ipm_params=None,  # IPM params
        bcm_params=None,  # BCM params
        gamma=0.99,  # discount factor
        batch_size=128,  # mini batch size
        replay_capacity=int(1e6),  # size of experience history
        l2_reg=1e-6,  # L2 regularization weight
        noise_mode='normal',  # distribution of action noise
        noise_param=(0, 0.01)  # params of action noise
    ):

        # base params
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.k = k
        self.network_params = network_params
        self.last_action_as_state = last_action_as_state
        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate
        self.gamma = gamma
        self.batch_size = batch_size
        self.replay_capacity = replay_capacity
        self.l2_reg = l2_reg
        self.noise_mode = noise_mode
        self.noise_param = noise_param

        self.ipm_active = False
        self.bcm_active = False

        # build IPM
        self.ipm_params = ipm_params
        if self.ipm_params:
            self.ipm_active = True
            self._build_ipm()
        else:
            self.ipm_dim = 0

        # build BCM
        self.bcm_params = bcm_params
        if self.bcm_params:
            self.bcm_active = True
            self._build_bcm()

        # build base learning agent
        self._build_model()

    def _build_model(self):

        actor = self._build_actor()
        actor_target = self._build_actor()
        critic = self._build_critic()
        critic_target = self._build_critic()

        optimizer = lambda params, lr: torch.optim.Adam(
            params, lr=lr, weight_decay=self.l2_reg)
        criterion = nn.MSELoss(reduction='sum')

        # DDPG with prioritized replay
        self.ddpg_per = DDPGPer(actor,
                                actor_target,
                                critic,
                                critic_target,
                                optimizer=optimizer,
                                criterion=criterion,
                                batch_size=self.batch_size,
                                actor_learning_rate=self.actor_learning_rate,
                                critic_learning_rate=self.critic_learning_rate,
                                discount=self.gamma,
                                replay_size=self.replay_capacity)

    def _build_actor(self):
        return Actor(self.state_dim, self.action_dim, self.ipm_dim, self.k,
                     self.network_params['actor'], self.last_action_as_state)

    def _build_critic(self):
        return Critic(self.state_dim, self.action_dim, self.ipm_dim, self.k,
                      self.network_params['critic'], self.last_action_as_state)

    def _build_ipm(self):
        """
		Build IPM module based on NDybM

		"""

        params = self.ipm_params

        self.ipm_dim = params['input_dim']
        self.ipm_learning_rate = params['learning_rate']

        self.ipm = RNNGaussianDyBM(self.ipm_dim,
                                   self.ipm_dim,
                                   params['rnn_dim'],
                                   spectral_radius=params['spectral_radius'],
                                   sparsity=params['sparsity'],
                                   delay=params['delay'],
                                   decay_rates=params['decay_rates'],
                                   SGD=RMSProp())
        self.ipm.set_learning_rate(self.ipm_learning_rate)

        self.ipm_base_input_noise = lambda x: np.random.normal(
            params['noise_mean'], params['noise_std'], size=x)
        self.ipm_savgol_filter = lambda x: savgol_filter(
            x,
            window_length=params['filter_window_length'],
            polyorder=params['filter_polyorder'])

        self.ipm_loss = []

    def _build_bcm(self):
        """
		Init bounds and constraints of BCM optimization problem
		"""

        n = self.action_dim
        params = self.bcm_params

        self.last_action = np.append([1],
                                     np.zeros(n - 1))  # init weight (all cash)
        self.cost_bps = params['cost_bps']
        self.bcm_update_rate = params['update_rate']

        self.bcm_bounds = Bounds(np.zeros(n * 2), np.ones(n * 2))

        self.bcm_constraints = np.block([[np.eye(n), -1 * np.eye(n)],
                                         [np.eye(n), np.eye(n)],
                                         [np.ones(n), np.zeros(n)]])

    # ======================================================================
    # Base methods

    def get_action(self,
                   state,
                   last_action=None,
                   ipm_predict=None,
                   with_noise=True):
        """
		Get action 

		Args:
			state (tensor): current state
			last_action (tensor): last action, concatenated to state if not None
			ipm_predict (tensor): IPM prediction, concatenated to state if not None
			with_noise (boolean): whether to add noise to action, True for exploration
		"""

        inputs = {'state': state}
        if self.last_action_as_state:
            inputs['last_action'] = last_action

        if self.ipm_active:
            inputs['ipm'] = ipm_predict

        if with_noise:
            # action with noise
            action = self.ddpg_per.act_with_noise(inputs,
                                                  noise_param=self.noise_param,
                                                  mode=self.noise_mode)
            return action / action.sum()

        else:
            # action without noise
            return self.ddpg_per.act(inputs)

    def store_transition(self, experience):
        """
		Store transition to replay buffer

		Args:
			experience (dict): a transition sample with 'state', 'action', 'next_state', 'reward' and 'terminal'
		"""

        self.ddpg_per.store_transition(experience)

    def update(self, return_loss=False):
        """
		Update actor and critic networks
		"""

        if self.bcm_active:
            act_loss, value_loss = self._update_with_bcm()
        else:
            act_loss, value_loss = self.ddpg_per.update()

        if return_loss:
            return act_loss, value_loss

    def load(self, model_dir):
        """
		Load model from given dir (TO BE DEVELOPED)

		"""

        self.ddpg_per.load(model_dir)

    def save(self, model_dir):
        """
		Save model to given dir (TO BE DEVELOPED)

		"""

        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        self.ddpg_per.save(model_dir)

    # ======================================================================
    # IPM methods

    def ipm_init(self):
        """
		Restart the IPM prediction state;
		call at the beginning of each episode.
		"""

        self.ipm.init_state()
        self.ipm_loss = []

    def ipm_predict_and_learn(self, in_step, out_step=None):
        """
		Generate the IPM prediction and update the model when the observed step is provided
		"""
        prediction = self.ipm.predict_next()
        if out_step is not None:
            self.ipm.learn_one_step(out_step)
            self.ipm_loss.append(np.sum(np.square(prediction - out_step)))

        in_step += self._generate_ipm_input_noise(in_step.shape)
        self.ipm._update_state(in_step)

        return prediction

    def get_ipm_loss(self):
        """
		Return IPM loss as RMSE
		"""

        return np.sqrt(np.mean(self.ipm_loss))

    def _generate_ipm_input_noise(self, size):
        base = self.ipm_base_input_noise(size)
        smth = self.ipm_savgol_filter(base)

        return smth

    # ======================================================================
    # BCM methods

    def get_bcm_action(self, prices, next_prices):
        """
		Get BCM one-step greedy action
		by solving the optimization problem

		max  u_{t+1}^T w_t - c * sum_i |w'_{i,t} - w_{i,t}|
		s.t. sum_i w_{i,t} = 1
			 0 <= w_{i,t} <= 1

		which is equivalent to

		max  u_{t+1}^T w_t - c * sum_i z_i
		s.t. w'_{i,t} - w_{i,t} <= z_i
			 w'_{i,t} - w_{i,t} >= -z_i
			 sum_i w_{i,t} = 1
			 0 <= w_{i,t} <= 1
			 0 <= z_i <= 1
		"""
        # objective
        u = next_prices / prices
        obj = lambda x: self._bcm_objective(x, u)

        # linear constraint
        temp = np.dot(u, self.last_action)
        w_end = self.last_action * u / temp  # weight at the end of current period
        left_bnd = np.concatenate([-1 * np.ones(self.action_dim), w_end, [1]])
        right_bnd = np.concatenate([w_end, 2 * np.ones(self.action_dim), [1]])

        lin_constr = LinearConstraint(self.bcm_constraints, left_bnd,
                                      right_bnd)

        # solve optimization problem
        z0 = np.abs(w_end - self.last_action)
        x0 = np.append(self.last_action,
                       z0)  # use last action as init solution
        res = minimize(obj,
                       x0,
                       method='trust-constr',
                       constraints=[lin_constr],
                       bounds=self.bcm_bounds)

        self.last_action = res.x[:self.action_dim]

        return self.last_action

    def _bcm_objective(self, x, u):
        return -np.dot(u, x[:self.action_dim]) + self.cost_bps * np.sum(
            x[self.action_dim + 1:])

    def _update_with_bcm(self):
        """
		Update with BCM

		mostly copied from machin.frame.algorithms.ddpg_per
		"""

        mod = self.ddpg_per
        concatenate_samples = True

        mod.actor.train()
        mod.critic.train()

        # sample batch via prioritized replay
        (
            batch_size,
            (state, action, reward, next_state, bcm_action, terminal, others),
            index,
            is_weight,
        ) = mod.replay_buffer.sample_batch(
            mod.batch_size,
            concatenate_samples,
            sample_attrs=['state', 'action', 'reward', 'next_state',
                          'bcm_action', 'terminal', '*'],
        )

        # update critic network
        # - generate y_i using target actor and target critic
        with torch.no_grad():
            next_action = mod.action_transform_function(
                mod._act(next_state, True), next_state, others)
            next_value = mod._criticize(next_state, next_action, True)
            next_value = next_value.view(batch_size, -1)
            y_i = mod.reward_function(reward, mod.discount, next_value,
                                      terminal, others)

        # - critic loss
        cur_value = mod._criticize(state, action)
        value_loss = mod.criterion(cur_value, y_i.to(cur_value.device))
        value_loss = value_loss * torch.from_numpy(is_weight).view(
            [batch_size, 1]).to(value_loss.device)
        value_loss = value_loss.mean()

        # - update critic
        mod.critic.zero_grad()
        value_loss.backward()
        nn.utils.clip_grad_norm_(mod.critic.parameters(), mod.grad_max)
        mod.critic_optim.step()

        # update actor network
        # - actor loss
        cur_action = mod.action_transform_function(mod._act(state), state,
                                                   others)
        act_value = mod._criticize(state, cur_action)

        act_policy_loss = -act_value.mean()

        # - add BCM loss
        bcm_loss = self._bcm_loss(cur_action['action'], bcm_action)
        act_policy_loss += self.bcm_update_rate * bcm_loss

        # - update actor
        mod.actor.zero_grad()
        act_policy_loss.backward()
        nn.utils.clip_grad_norm_(mod.actor.parameters(), mod.grad_max)
        mod.actor_optim.step()

        # update target networks
        soft_update(mod.actor_target, mod.actor, mod.update_rate)
        soft_update(mod.critic_target, mod.critic, mod.update_rate)

        mod.actor.eval()
        mod.critic.eval()

        self.ddpg_per = mod

        return (act_value.mean().item(), -bcm_loss), value_loss.item()

    def _bcm_loss(self, action, bcm_action):
        """
		Compute log loss between the BCM action and the actor's action

		action: tensor(batch_size, action_dim)
		bcm_action: List(batch_size)

		"""

        bcm_action = torch.stack(bcm_action).view(-1, self.action_dim)
        loss = bcm_action * torch.log(action + eps) + (
            1 - bcm_action) * torch.log(1 - action + eps)

        return -loss.mean()
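
For reference, a self-contained sketch of the one-step BCM problem described in the get_bcm_action docstring, using scipy directly. The prices, cost level, and dimension are toy values, and the objective charges cost on every z_i as written in the docstring, so this is an illustration rather than a drop-in replacement for the class method.

import numpy as np
from scipy.optimize import Bounds, LinearConstraint, minimize

n = 3                                      # cash + 2 assets (toy size)
cost_bps = 0.001                           # transaction cost coefficient c
last_action = np.array([1.0, 0.0, 0.0])    # previous weights (all cash)
prices = np.array([1.0, 10.0, 20.0])
next_prices = np.array([1.0, 10.5, 19.0])

u = next_prices / prices                   # gross returns u_{t+1}
w_end = last_action * u / np.dot(u, last_action)  # drifted end-of-period weights

# maximize u^T w - c * sum_i z_i  ==  minimize the negative, with x = [w, z]
obj = lambda x: -np.dot(u, x[:n]) + cost_bps * np.sum(x[n:])

# same constraint matrix as _build_bcm:
#   rows 1..n:     w - z in [-1, w_end]  =>  w_i - w_end_i <= z_i
#   rows n+1..2n:  w + z in [w_end, 2]   =>  w_i - w_end_i >= -z_i
#   last row:      sum_i w_i = 1
A = np.block([
    [np.eye(n), -np.eye(n)],
    [np.eye(n), np.eye(n)],
    [np.ones(n), np.zeros(n)],
])
lb = np.concatenate([-np.ones(n), w_end, [1.0]])
ub = np.concatenate([w_end, 2 * np.ones(n), [1.0]])

res = minimize(
    obj,
    np.concatenate([last_action, np.abs(w_end - last_action)]),  # start at last action
    method='trust-constr',
    constraints=[LinearConstraint(A, lb, ub)],
    bounds=Bounds(np.zeros(2 * n), np.ones(2 * n)),
)
print('greedy weights:', res.x[:n])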