Example #1
    def __init__(self, env):
        super(Agent, self).__init__(env)

        self.model = DQN(self.obs_dim, self.action_dim)
        # NOTE: minibatch_size is not defined in this snippet; it is
        # presumably a module-level constant in the source repository
        # (the same applies to the later examples that reference it).
        self.replay_buffer = ReplayBuffer(minibatch_size=minibatch_size)

        self.set_gui_flag(False, False)
Example #2
    def __init__(self,
                 env,
                 network,
                 n_quantiles=20,
                 mean_prior=0,
                 std_prior=0.01,
                 noise_scale=0.01,
                 logging=True,
                 train_freq=10,
                 updates_per_train=100,
                 batch_size=32,
                 start_train_step=10,
                 log_folder_details=None,
                 learning_rate=1e-3,
                 verbose=False):

        self.env = env
        self.network1 = network(env.n_features, n_quantiles, mean_prior,
                                std_prior)
        self.network2 = network(env.n_features, n_quantiles, mean_prior,
                                std_prior)
        self.optimizer = optim.Adam(list(self.network1.parameters()) +
                                    list(self.network2.parameters()),
                                    lr=learning_rate,
                                    eps=1e-8)

        self.logging = logging
        self.replay_buffer = ReplayBuffer()
        self.batch_size = batch_size
        self.log_folder_details = log_folder_details
        self.n_quantiles = n_quantiles
        self.train_freq = train_freq
        self.start_train_step = start_train_step
        self.updates_per_train = updates_per_train
        self.n_samples = 0
        self.noise_scale = noise_scale
        self.std_prior = std_prior
        self.verbose = verbose

        self.prior1 = [
            p.data.clone() for p in list(self.network1.features.parameters())
        ]
        self.prior2 = [
            p.data.clone() for p in list(self.network2.features.parameters())
        ]

        self.train_parameters = {
            'n_quantiles': n_quantiles,
            'mean_prior': mean_prior,
            'std_prior': std_prior,
            'train_freq': train_freq,
            'updates_per_train': updates_per_train,
            'batch_size': batch_size,
            'start_train_step': start_train_step,
            'learning_rate': learning_rate,
            'noise_scale': noise_scale
        }
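The prior1/prior2 snapshots above are the standard ingredient of a randomized-prior / anchored regularizer. A self-contained sketch of that pattern, under the assumption that the snapshots are later used as an L2 anchor (the exact weighting in the source may differ):

import torch.nn as nn

# Toy stand-in for network1.features: snapshot its freshly initialized weights.
features = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32, 1))
prior = [p.data.clone() for p in features.parameters()]

def prior_penalty(network, prior_params, strength=1.0):
    # L2 pull back toward the snapshot, added to the training loss.
    return strength * sum(((p - p0) ** 2).sum()
                          for p, p0 in zip(network.parameters(), prior_params))

loss_reg = prior_penalty(features, prior)  # zero until training moves the weights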
Example #3
    def __init__(self, env):
        super(Agent, self).__init__(env)
        print("Q-network Agent is created")

        self.action_dim = env.action_space.n
        self.obs_dim = np.power(int(env.observation_space.high[0] + 1), 2)

        # NOTE: train_step is likewise presumably a module-level global.
        self.model = Q_Network(self.obs_dim, self.action_dim, train_step)

        self.replay_buffer = ReplayBuffer(minibatch_size=minibatch_size)
Example #4
    def __init__(self, env):
        super(Agent, self).__init__(env)
        print("DQN Agent")

        self.action_dim = env.action_space.n
        self.obs_dim = observation_dim(env.observation_space)

        self.model = DQN(self.obs_dim, self.action_dim)

        self.replay_buffer = ReplayBuffer(minibatch_size=minibatch_size)
Example #5
    def __init__(self, env):
        super(Agent, self).__init__(env)
        print("DDPG Agent")

        self.action_dim = action_dim(
            env.action_space)  ### KH: for continuous action task
        self.obs_dim = observation_dim(env.observation_space)
        self.action_max = env.action_space.high  ### KH: DDPG action bound
        self.action_min = env.action_space.low  ### KH: DDPG action bound
        self.model = self.set_model()
        self.replay_buffer = ReplayBuffer(minibatch_size=minibatch_size)
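The action_max/action_min bounds stored here are conventionally used to rescale a tanh-squashed actor output into the environment's action range; a hedged sketch of that convention (not necessarily this repository's exact code):

import numpy as np

def scale_action(tanh_output, action_min, action_max):
    # Map an actor output in [-1, 1] onto [action_min, action_max].
    return action_min + (tanh_output + 1.0) * 0.5 * (action_max - action_min)

print(scale_action(np.array([0.0]), np.array([-2.0]), np.array([2.0])))  # -> [0.]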
Example #6
    def __init__(self,
                 env,
                 network,
                 epsilon=0.05,
                 n_quantiles=20,
                 mean_prior=0,
                 std_prior=0.01,
                 logging=True,
                 train_freq=10,
                 updates_per_train=100,
                 batch_size=32,
                 start_train_step=10,
                 log_folder_details=None,
                 learning_rate=1e-3,
                 verbose=False):

        self.env = env
        self.network = network(env.n_features, n_quantiles, mean_prior,
                               std_prior)
        self.logging = logging
        self.replay_buffer = ReplayBuffer()
        self.batch_size = batch_size
        self.log_folder_details = log_folder_details
        self.epsilon = epsilon
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=learning_rate,
                                    eps=1e-8)
        self.n_quantiles = n_quantiles
        self.train_freq = train_freq
        self.start_train_step = start_train_step
        self.updates_per_train = updates_per_train
        self.verbose = verbose

        self.n_samples = 0

        self.train_parameters = {
            'epsilon': epsilon,
            'n_quantiles': n_quantiles,
            'mean_prior': mean_prior,
            'std_prior': std_prior,
            'train_freq': train_freq,
            'updates_per_train': updates_per_train,
            'batch_size': batch_size,
            'start_train_step': start_train_step,
            'learning_rate': learning_rate
        }
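The fixed epsilon in this constructor suggests plain epsilon-greedy selection over the network's value estimates; a minimal sketch under that assumption:

import random

def epsilon_greedy(q_values, epsilon=0.05):
    # With probability epsilon explore uniformly, otherwise act greedily.
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    return max(range(len(q_values)), key=lambda a: q_values[a])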
Example #7
    def __init__(self,
                 env,
                 network,
                 mean_prior=0,
                 std_prior=0.01,
                 noise_scale=0.01,
                 logging=True,
                 train_freq=10,
                 updates_per_train=100,
                 batch_size=32,
                 start_train_step=10,
                 log_folder_details=None,
                 learning_rate=1e-3,
                 bayesian_sample_size=20,
                 verbose=False):

        self.env = env
        # NOTE: the network argument is not used here; BayesianNetwork is
        # instantiated directly (mean_prior is likewise only logged below).
        self.network = BayesianNetwork(env.n_features, torch.device('cpu'),
                                       std_prior, noise_scale)
        self.logging = logging
        self.replay_buffer = ReplayBuffer()
        self.batch_size = batch_size
        self.log_folder_details = log_folder_details
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=learning_rate,
                                    eps=1e-8)
        self.train_freq = train_freq
        self.start_train_step = start_train_step
        self.updates_per_train = updates_per_train
        self.bayesian_sample_size = bayesian_sample_size
        self.verbose = verbose

        self.n_samples = 0
        self.timestep = 0

        self.train_parameters = {
            'mean_prior': mean_prior,
            'std_prior': std_prior,
            'noise_scale': noise_scale,
            'train_freq': train_freq,
            'updates_per_train': updates_per_train,
            'batch_size': batch_size,
            'start_train_step': start_train_step,
            'learning_rate': learning_rate,
            'bayesian_sample_size': bayesian_sample_size
        }
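bayesian_sample_size presumably sets how many stochastic forward passes of the weight-sampling BayesianNetwork are averaged per prediction; a hedged sketch of that use:

import torch

def bayesian_predict(network, x, bayesian_sample_size=20):
    # Each forward pass of a Bayes-by-backprop style network draws fresh
    # weights, so averaging several passes approximates the posterior mean.
    with torch.no_grad():
        samples = torch.stack([network(x) for _ in range(bayesian_sample_size)])
    return samples.mean(dim=0)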
Example #8
    def __init__(self,
                 env,
                 network,
                 dropout=0.1,
                 std_prior=0.01,
                 logging=True,
                 train_freq=10,
                 updates_per_train=100,
                 weight_decay=1e-5,
                 batch_size=32,
                 start_train_step=10,
                 log_folder_details=None,
                 learning_rate=1e-3,
                 verbose=False):

        self.env = env
        self.network = network(env.n_features, std_prior, dropout=dropout)
        self.logging = logging
        self.replay_buffer = ReplayBuffer()
        self.batch_size = batch_size
        self.log_folder_details = log_folder_details
        self.train_freq = train_freq
        self.start_train_step = start_train_step
        self.updates_per_train = updates_per_train
        self.verbose = verbose
        self.dropout = dropout
        self.weight_decay = weight_decay

        self.n_samples = 0
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=learning_rate,
                                    eps=1e-8,
                                    weight_decay=self.weight_decay)

        self.train_parameters = {
            'dropout': dropout,
            'weight_decay': weight_decay,
            'std_prior': std_prior,
            'train_freq': train_freq,
            'updates_per_train': updates_per_train,
            'batch_size': batch_size,
            'start_train_step': start_train_step,
            'learning_rate': learning_rate
        }
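With dropout plus weight decay, prediction-time uncertainty is presumably obtained MC-dropout style (Gal & Ghahramani); a minimal sketch of that technique, not necessarily the source's exact procedure:

import torch
import torch.nn as nn

def mc_dropout_predict(network, x, n_passes=50):
    # Keep dropout stochastic at inference by staying in train() mode,
    # then average several forward passes.
    network.train()
    with torch.no_grad():
        samples = torch.stack([network(x) for _ in range(n_passes)])
    return samples.mean(dim=0), samples.std(dim=0)  # predictive mean, epistemic std

net = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Dropout(0.1), nn.Linear(32, 1))
mean, std = mc_dropout_predict(net, torch.randn(8, 4))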
Example #9
    def __init__(
        self,
        env,
        network,
        gamma=0.99,
        replay_start_size=50000,
        replay_buffer_size=1000000,
        n_quantiles=50,
        kappa=1,
        weight_scale=3,
        noise_scale=0.1,
        epistemic_factor=1,
        aleatoric_factor=1,
        update_target_frequency=10000,
        minibatch_size=32,
        update_frequency=1,
        learning_rate=1e-3,
        seed=None,
        adam_epsilon=1e-8,
        biased_aleatoric=False,
        logging=False,
        log_folder_details=None,
        save_period=250000,
        notes=None,
        render=False,
    ):

        # Agent parameters
        self.env = env
        self.gamma = gamma
        self.replay_start_size = replay_start_size
        self.replay_buffer_size = replay_buffer_size
        self.n_quantiles = n_quantiles
        self.kappa = kappa
        self.weight_scale = weight_scale
        self.noise_scale = noise_scale
        self.epistemic_factor = epistemic_factor
        self.aleatoric_factor = aleatoric_factor
        self.update_target_frequency = update_target_frequency
        self.minibatch_size = minibatch_size
        self.update_frequency = update_frequency
        self.learning_rate = learning_rate
        self.seed = random.randint(0, 10**6) if seed is None else seed
        self.adam_epsilon = adam_epsilon
        self.biased_aleatoric = biased_aleatoric
        self.logging = logging
        self.log_folder_details = log_folder_details
        self.save_period = save_period
        self.render = render
        self.notes = notes

        # Set global seed before creating network
        set_global_seed(self.seed, self.env)

        # Initialize agent
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.logger = None
        self.loss = quantile_huber_loss
        self.replay_buffer = ReplayBuffer(self.replay_buffer_size)

        # Initialize main Q learning network
        n_outputs = self.env.action_space.n * self.n_quantiles
        self.network = network(self.env.observation_space, n_outputs).to(self.device)
        self.target_network = network(self.env.observation_space, n_outputs).to(self.device)
        self.target_network.load_state_dict(self.network.state_dict())

        # Initialize anchored networks
        self.posterior1 = network(self.env.observation_space, n_outputs, weight_scale=weight_scale).to(self.device)
        self.posterior2 = network(self.env.observation_space, n_outputs, weight_scale=weight_scale).to(self.device)
        self.anchor1 = [p.data.clone() for p in list(self.posterior1.parameters())]
        self.anchor2 = [p.data.clone() for p in list(self.posterior2.parameters())]

        # Initialize optimizer
        params = list(self.network.parameters()) + list(self.posterior1.parameters()) + list(self.posterior2.parameters())
        self.optimizer = optim.Adam(params, lr=self.learning_rate, eps=self.adam_epsilon)

        # Figure out what the scale of the prior is from empirical std of network weights
        with torch.no_grad():
            std_list = []
            for p in self.posterior1.parameters():
                std_list.append(torch.std(p))
        self.prior_scale = torch.stack(std_list).mean().item()

        # Parameters to save to log file
        self.train_parameters = {
            'Notes': notes,
            'env': str(env),
            'network': str(self.network),
            'n_quantiles': n_quantiles,
            'replay_start_size': replay_start_size,
            'replay_buffer_size': replay_buffer_size,
            'gamma': gamma,
            'update_target_frequency': update_target_frequency,
            'minibatch_size': minibatch_size,
            'learning_rate': learning_rate,
            'update_frequency': update_frequency,
            'weight_scale': weight_scale,
            'noise_scale': noise_scale,
            'epistemic_factor': epistemic_factor,
            'aleatoric_factor': aleatoric_factor,
            'biased_aleatoric': biased_aleatoric,
            'adam_epsilon': adam_epsilon,
            'seed': self.seed
        }
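quantile_huber_loss itself is not shown in this snippet; the standard QR-DQN form it presumably follows is sketched below as a hedged reconstruction:

import torch

def quantile_huber_loss_sketch(pred, target, kappa=1.0):
    # pred, target: (batch, n_quantiles) quantile estimates.
    n = pred.shape[-1]
    tau = (torch.arange(n, dtype=pred.dtype) + 0.5) / n
    u = target.unsqueeze(-1) - pred.unsqueeze(-2)  # pairwise TD errors
    huber = torch.where(u.abs() <= kappa,
                        0.5 * u.pow(2),
                        kappa * (u.abs() - 0.5 * kappa))
    return ((tau - (u.detach() < 0).float()).abs() * huber / kappa).mean()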
Example #10
    def __init__(
        self,
        env,
        network,
        replay_start_size=50000,
        replay_buffer_size=1000000,
        gamma=0.99,
        update_target_frequency=10000,
        minibatch_size=32,
        learning_rate=1e-3,
        update_frequency=1,
        initial_exploration_rate=1,
        final_exploration_rate=0.1,
        final_exploration_step=1000000,
        adam_epsilon=1e-8,
        logging=False,
        log_folder_details=None,
        seed=None,
        render=False,
        loss="huber",
        notes=None
    ):

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.replay_start_size = replay_start_size
        self.replay_buffer_size = replay_buffer_size
        self.gamma = gamma
        self.update_target_frequency = update_target_frequency
        self.minibatch_size = minibatch_size
        self.learning_rate = learning_rate
        self.update_frequency = update_frequency
        self.initial_exploration_rate = initial_exploration_rate
        self.epsilon = self.initial_exploration_rate
        self.final_exploration_rate = final_exploration_rate
        self.final_exploration_step = final_exploration_step
        self.adam_epsilon = adam_epsilon
        self.logging = logging
        self.render = render
        self.log_folder_details = log_folder_details
        if callable(loss):
            self.loss = loss
        else:
            try:
                self.loss = {'huber': F.smooth_l1_loss, 'mse': F.mse_loss}[loss]
            except KeyError:
                raise ValueError("loss must be 'huber', 'mse' or a callable")

        self.env = env
        self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
        self.seed = random.randint(0, 10**6) if seed is None else seed
        self.logger = None

        set_global_seed(self.seed, self.env)

        self.network = network(self.env.observation_space, self.env.action_space.n).to(self.device)
        self.target_network = network(self.env.observation_space, self.env.action_space.n).to(self.device)
        self.target_network.load_state_dict(self.network.state_dict())
        self.optimizer = optim.Adam(self.network.parameters(), lr=self.learning_rate, eps=self.adam_epsilon)

        self.train_parameters = {
            'Notes': notes,
            'env': env.unwrapped.spec.id,
            'network': str(self.network),
            'replay_start_size': replay_start_size,
            'replay_buffer_size': replay_buffer_size,
            'gamma': gamma,
            'update_target_frequency': update_target_frequency,
            'minibatch_size': minibatch_size,
            'learning_rate': learning_rate,
            'update_frequency': update_frequency,
            'initial_exploration_rate': initial_exploration_rate,
            'final_exploration_rate': final_exploration_rate,
            'weight_scale': self.network.weight_scale,
            'final_exploration_step': final_exploration_step,
            'adam_epsilon': adam_epsilon,
            'loss': loss,
            'seed': self.seed
        }
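The three exploration arguments imply a linear epsilon anneal; a minimal sketch of the schedule they presumably drive (the scheduling code itself is not part of this snippet):

def linear_epsilon(timestep,
                   initial_exploration_rate=1.0,
                   final_exploration_rate=0.1,
                   final_exploration_step=1000000):
    # Anneal linearly from the initial to the final rate, then hold.
    fraction = min(timestep / final_exploration_step, 1.0)
    return initial_exploration_rate + fraction * (
        final_exploration_rate - initial_exploration_rate)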
Example #11
    def __init__(self,
                 env,
                 network,
                 n_quantiles=20,
                 kappa=1,
                 lamda=0.1,
                 replay_start_size=50000,
                 replay_buffer_size=1000000,
                 gamma=0.99,
                 update_target_frequency=10000,
                 epsilon_12=0.00001,
                 minibatch_size=32,
                 learning_rate=1e-4,
                 update_frequency=1,
                 prior=0.01,
                 adam_epsilon=1e-8,
                 logging=False,
                 log_folder_details=None,
                 seed=None,
                 notes=None):

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.replay_start_size = replay_start_size
        self.replay_buffer_size = replay_buffer_size
        self.gamma = gamma
        self.epsilon_12 = epsilon_12
        self.lamda = lamda
        self.update_target_frequency = update_target_frequency
        self.minibatch_size = minibatch_size
        self.learning_rate = learning_rate
        self.update_frequency = update_frequency
        self.adam_epsilon = adam_epsilon
        self.logging = logging
        self.logger = None
        self.timestep = 0
        self.log_folder_details = log_folder_details

        self.env = env
        self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
        self.seed = random.randint(0, 10**6) if seed is None else seed
        set_global_seed(self.seed, self.env)

        self.n_quantiles = n_quantiles

        self.network = network(self.env.observation_space,
                               self.env.action_space.n * self.n_quantiles,
                               self.env.action_space.n * self.n_quantiles).to(
                                   self.device)
        self.target_network = network(
            self.env.observation_space,
            self.env.action_space.n * self.n_quantiles,
            self.env.action_space.n * self.n_quantiles).to(self.device)
        self.target_network.load_state_dict(self.network.state_dict())
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learning_rate,
                                    eps=self.adam_epsilon)

        self.anchor1 = [
            p.data.clone() for p in list(self.network.output_1.parameters())
        ]
        self.anchor2 = [
            p.data.clone() for p in list(self.network.output_2.parameters())
        ]

        self.loss = quantile_huber_loss
        self.kappa = kappa
        self.prior = prior

        self.train_parameters = {
            'Notes': notes,
            'env': env.unwrapped.spec.id,
            'network': str(self.network),
            'replay_start_size': replay_start_size,
            'replay_buffer_size': replay_buffer_size,
            'gamma': gamma,
            'lambda': lamda,
            'epsilon_1and2': epsilon_12,
            'update_target_frequency': update_target_frequency,
            'minibatch_size': minibatch_size,
            'learning_rate': learning_rate,
            'update_frequency': update_frequency,
            'kappa': kappa,
            'weight_scale': self.network.weight_scale,
            'n_quantiles': n_quantiles,
            'prior': prior,
            'adam_epsilon': adam_epsilon,
            'seed': self.seed
        }

        self.n_greedy_actions = 0
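Both heads are sized action_space.n * n_quantiles, so greedy action selection presumably averages each action's quantiles into a Q-value (standard QR-DQN practice; a hedged sketch):

import torch

def greedy_action(quantile_output, n_actions, n_quantiles):
    # Reshape the flat head into (actions, quantiles), average the quantiles
    # into Q-values, and take the argmax.
    q_values = quantile_output.view(n_actions, n_quantiles).mean(dim=1)
    return int(q_values.argmax())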