def __init__(self, observation_space, action_space, config, dqn_config):
        BaselinePolicy.__init__(self, observation_space, action_space, config)

        self.dqn_config = dqn_config
        self.epsilon = 1
        mixed_dqn_config = mixed_dqn_net_config_example.copy()
        mixed_dqn_config.update({
            "controllable_state_num" : int(np.product(observation_space.shape)),
            "action_num": int(action_space.n),
            "uncontrollable_state_num": 31,
            "uncontrollable_pred_num": 3,
        })
        self.num_states = int(np.product(observation_space.shape))
        self.num_actions = int(action_space.n)
        print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')
        self.use_unc_part = dqn_config['use_unc_part']

        self.eval_net = mixed_dqn_net(mixed_dqn_config)
        self.target_net = mixed_dqn_net(mixed_dqn_config)

        self.target_net.load_state_dict(self.eval_net.state_dict())

        self.learn_step_counter = 0
        self.memory = replay_memory(dqn_config['replay_capacity'])

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=dqn_config['lr'])
        self.loss_func = nn.SmoothL1Loss()

        self.rand_action = 0
        self.greedy_action = 0
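
Every constructor in this listing wraps a replay_memory(capacity) buffer whose definition is not shown. A minimal sketch of what such a buffer typically looks like, with push/sample/__len__ method names assumed for illustration rather than taken from the source:

import random
from collections import deque


class replay_memory:
    """Fixed-capacity FIFO transition buffer (sketch only, not the original class)."""

    def __init__(self, capacity):
        # deque drops the oldest transition automatically once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # uniform sampling without replacement, as in vanilla DQN
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

Example #2
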
    def __init__(self, observation_space, action_space, config, dqn_config):
        BaselinePolicy.__init__(self, observation_space, action_space, config)

        self.dqn_config = dqn_config
        self.epsilon = 1

        self.num_states = int(np.product(observation_space.shape))
        self.num_actions = int(action_space.n)
        print(
            f'dqn state space:{self.num_states}, action space:{self.num_actions}'
        )
        self.pred_head = dqn_config['pred']

        self.eval_net = dqn_net(self.num_states, self.num_actions,
                                self.pred_head)
        self.target_net = dqn_net(self.num_states, self.num_actions,
                                  self.pred_head)

        self.target_net.load_state_dict(self.eval_net.state_dict())

        self.learn_step_counter = 0
        self.memory = replay_memory(dqn_config['replay_capacity'])

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                          lr=dqn_config['lr'])
        self.loss_func = nn.SmoothL1Loss()

        self.rand_action = 0
        self.greedy_action = 0
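
The constructor above only initializes the epsilon-greedy bookkeeping (self.epsilon, self.rand_action, self.greedy_action); the action-selection method itself is not part of this listing. A hedged sketch of what such a method could look like for this class, where the name choose_action and the assumption that dqn_net returns plain Q-values are both illustrative:

import numpy as np
import torch


def choose_action(self, state):
    # state: 1-D array of length self.num_states
    state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
    if np.random.rand() < self.epsilon:
        self.rand_action += 1  # exploration branch
        return np.random.randint(self.num_actions)
    with torch.no_grad():
        q_values = self.eval_net(state_t)  # assumed shape: (1, num_actions)
    self.greedy_action += 1  # exploitation branch
    return int(q_values.argmax(dim=1).item())
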
Example #3
    def __init__(self, observation_space, action_space, config, dqn_config):
        BaselinePolicy.__init__(self, observation_space, action_space, config)

        self.dqn_config = dqn_config
        self.epsilon = 1
        self.Vmin = -1
        self.Vmax = 1
        self.atoms = 51
        self.device = torch.device('cpu')

        self.num_states = int(np.product(observation_space.shape))
        self.num_actions = int(action_space.n)
        print(
            f'dqn state space:{self.num_states}, action space:{self.num_actions}'
        )

        self.eval_net = c51_net(self.num_states, self.num_actions, self.atoms)
        self.target_net = c51_net(self.num_states, self.num_actions,
                                  self.atoms)

        self.target_net.load_state_dict(self.eval_net.state_dict())

        self.learn_step_counter = 0
        self.memory = replay_memory(dqn_config['replay_capacity'])

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                          lr=dqn_config['lr'])
        self.support = torch.linspace(self.Vmin, self.Vmax, self.atoms)
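
In the C51 variant above, the support vector torch.linspace(Vmin, Vmax, atoms) turns the per-action atom probabilities into expected Q-values. A rough sketch of that reduction, assuming c51_net outputs probabilities of shape (batch, num_actions, atoms):

import torch


def greedy_action_from_distribution(self, state_t):
    # state_t: tensor of shape (1, num_states)
    with torch.no_grad():
        probs = self.eval_net(state_t)  # assumed shape: (1, num_actions, atoms)
        # expectation over the 51 atoms collapses each distribution to a scalar Q-value
        q_values = (probs * self.support).sum(dim=2)  # shape: (1, num_actions)
    return int(q_values.argmax(dim=1).item())
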
Example #4
    def __init__(self, observation_space, action_space, config, dqn_config):
        BaselinePolicy.__init__(self, observation_space, action_space, config)

        self.dqn_config = dqn_config
        self.epsilon = 1
        mixed_dqn_config = mixed_dqn_net_config_example.copy()
        mixed_dqn_config.update({
            "controllable_state_num": int(np.product(observation_space.shape)),
            "action_num": int(action_space.n),
            "uncontrollable_state_num": 21 * 7,  # 31
            "uncontrollable_pred_num": 6,
            'fixed_uncontrollable_param': dqn_config['fixed_uncontrollable_param'],
            'uncontrollable_use_cnn': dqn_config['use_cnn_state'],
            'embeddingmerge': dqn_config['embeddingmerge'],
            'activation_func': dqn_config['activation_func'],
            'use_bn': dqn_config['use_bn'],
        })
        self.num_states = int(np.product(observation_space.shape))
        self.num_actions = int(action_space.n)
        print(
            f'dqn state space:{self.num_states}, action space:{self.num_actions}'
        )

        self.use_unc_part = dqn_config['use_unc_part']
        #self.pre_train = dqn_config['pretrain']
        self.fixed_uncontrollable_param = dqn_config['pretrain']

        if dqn_config['use_cnn_state']:
            dqn_net = mixed_dqn_unc_cnn_net
        else:
            dqn_net = mixed_dqn_net
        self.eval_net = dqn_net(mixed_dqn_config)
        self.target_net = dqn_net(mixed_dqn_config)

        self.target_net.load_state_dict(self.eval_net.state_dict())

        self.learn_step_counter = 0
        self.memory = replay_memory(dqn_config['replay_capacity'])
        self.eval_memory = replay_memory(dqn_config['replay_capacity'])

        self.optimizer = torch.optim.Adam(
            self.eval_net.parameters(),
            lr=dqn_config['lr'],
            weight_decay=dqn_config['weight_decay'])
        self.loss_func = nn.SmoothL1Loss()

        self.rand_action = 0
        self.greedy_action = 0
        self.evaluation = False  # only for epsilon-greedy

        # augmentation setting
        self.train_augmentation = dqn_config['train_augmentation']
        self.demand_augmentation = demand_augmentation(
            noise_type=dqn_config['train_augmentation'],
            noise_scale=dqn_config['noise_scale'],
            sparse_scale=dqn_config['sparse_scale'])
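
The fields set up above (two networks, replay buffers, learn_step_counter, SmoothL1Loss, Adam) are the usual ingredients of a DQN update, but the update itself is not part of this constructor. A minimal sketch of such a step, assuming memory.sample returns batched tensors and using a hypothetical target_update_freq entry in dqn_config; the mixed networks above may additionally return uncontrollable-state predictions, which this sketch ignores:

import torch


def learn(self, batch_size, gamma=0.99):
    # periodically sync the target network with the online network
    if self.learn_step_counter % self.dqn_config.get('target_update_freq', 100) == 0:
        self.target_net.load_state_dict(self.eval_net.state_dict())
    self.learn_step_counter += 1

    # assumed layout: 1-D tensors of shape (batch,), plus state batches
    states, actions, rewards, next_states, dones = self.memory.sample(batch_size)

    q_eval = self.eval_net(states).gather(1, actions.long().unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        q_next = self.target_net(next_states).max(dim=1).values
        q_target = rewards + gamma * (1.0 - dones) * q_next

    loss = self.loss_func(q_eval, q_target)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()
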
Example #5
    def __init__(self, observation_space, action_space, config):
        BaselinePolicy.__init__(self, observation_space, action_space, config)