Example #1
    def __init__(self,
                 n_states,
                 n_actions,
                 hidden_dim=90,
                 device="cpu",
                 critic_lr=5e-3,
                 actor_lr=5e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128):
        self.device = device
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.actor = Actor(n_states, n_actions, hidden_dim).to(device)
        self.target_critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.target_actor = Actor(n_states, n_actions, hidden_dim).to(device)

        for target_param, param in zip(self.target_critic.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.memory = ReplayBuffer(memory_capacity)
        self.batch_size = batch_size
        self.soft_tau = soft_tau
        self.gamma = gamma
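
Example #1 stores soft_tau for the target networks, but the update step itself is not shown; a minimal sketch of the Polyak soft update that soft_tau typically parameterizes (the method name is an assumption, not taken from this example):

    def soft_update(self, net, target_net):
        # Polyak averaging: target <- tau * online + (1 - tau) * target
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(self.soft_tau * param.data +
                                    (1.0 - self.soft_tau) * target_param.data)
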
Example #2
	def __init__(self, env):
		self.env  = env
		self.num_robots = env.num_robots

		self.learning_rate = 0.0001
		self.epsilon = .9
		self.epsilon_decay = .99995
		self.eps_counter = 0
		self.gamma = .90
		self.tau   = .01


		self.buffer_size = 1000000
		self.batch_size = 512

		self.hyper_parameters_lambda3 = 0.2
		self.hyper_parameters_eps = 0.2
		self.hyper_parameters_eps_d = 0.4

		self.demo_size = 1000
		self.time_str = time.strftime("%Y%m%d-%H%M%S")
		self.parent_dir = HOME + "/catkin_ws/src/Turtlebot3_Pheromone/src/DRLbasedController/weights"
		self.save_dir = HOME + "/catkin_ws/src/Turtlebot3_Pheromone/src/results/trained_weights/exp2/HLERnoisy/" 
		self.path = os.path.join(self.parent_dir, self.time_str)
		os.mkdir(self.path)

		# Replay memory (deque)
		self.memory = deque(maxlen=1000000)
		# HER experience replay buffer
		self.replay_buffer = ExperienceReplayBuffer(total_timesteps=5000*256, type_buffer="HER")
		# File name
		self.file_name = "reward_{}_{}_{}".format(self.time_str, self.num_robots, self.replay_buffer.type_buffer)
		# Hidden Layer list
		self.hid_list = [512, 512, 512]
		# ===================================================================== #
		#                               Actor Model                             #
		# Chain rule: find the gradient of changing the actor network params to #
		# get closest to the final value network predictions, i.e. de/dA.       #
		# Calculate de/dA = de/dC * dC/dA, where e is error, C critic, A actor. #
		# ===================================================================== #

		self.actor_model = Actor(self.env.observation_space.shape, self.env.action_space.shape, self.hid_list)
		self.target_actor_model = Actor(self.env.observation_space.shape, self.env.action_space.shape, self.hid_list)
		self.actor_optim = optim.Adam(self.actor_model.parameters(), lr=self.learning_rate)

		# ===================================================================== #
		#                              Critic Model                             #
		# ===================================================================== #

		self.critic_model = Critic(self.env.observation_space.shape, self.env.action_space.shape, 1, self.hid_list)
		self.target_critic_model = Critic(self.env.observation_space.shape, self.env.action_space.shape, 1, self.hid_list)
		self.critic_optim = optim.Adam(self.critic_model.parameters(), lr=self.learning_rate)
		

		hard_update(self.target_actor_model, self.actor_model) # Make sure the target starts with the same weights
		hard_update(self.target_critic_model, self.critic_model)

		self.cuda()
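
The Actor Model comment above derives the actor gradient as de/dA = de/dC * dC/dA. In PyTorch this is usually obtained by backpropagating a critic-based actor loss rather than forming the gradients explicitly; a minimal sketch under that assumption (the method name and the critic's call signature are illustrative, not taken from this example):

    def update_actor(self, states):
        # Actor loss: maximize Q(s, pi(s)) by minimizing its negative; the
        # backward pass carries de/dC * dC/dA through the critic into the actor.
        actor_loss = -self.critic_model(states, self.actor_model(states)).mean()
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()
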
Example #3
    def __init__(self, hparams):
        '''
        Initializations
        '''
        super().__init__()
        self.hparams = hparams

        # Position of human
        source_position = torch.tensor([[self.hparams.environment.position.end.x],
                                        [self.hparams.environment.position.end.y],
                                        [self.hparams.environment.position.end.z]]).float()

        # Position of agent
        agent_position  = torch.tensor([[self.hparams.environment.position.start.x],
                                        [self.hparams.environment.position.start.y],
                                        [self.hparams.environment.position.start.z]]).float()


        # Initialize Replay buffer
        self.replay_buffer = ReplayBuffer(capacity = self.hparams.model.replay_buffer_size)


        # Initialize drone
        self.agent = Drone(start_position = agent_position,
                           goal_position = source_position,
                           velocity_factor = self.hparams.environment.agent.velocity_factor,
                           hparams = self.hparams,
                           buffer = self.replay_buffer)

        # Actor networks
        self.net = Actor(**self.hparams.model.actor)
        self.target_net = Actor(**self.hparams.model.actor)

        # Critic networks
        self.critic = Critic(**self.hparams.model.critic)
        self.target_critic = Critic(**self.hparams.model.critic)

        # Hard update
        self.target_net.load_state_dict(self.net.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        self.total_reward = -10000
        self.episode_steps = 0.0
        self.max_episode_steps = self.hparams.model.max_episode
        self.episode_reward = 0.0
        self.populate(self.hparams.model.replay_buffer_size)
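
Example #3 ends by calling self.populate(...) to pre-fill the replay buffer before training starts; the method itself is not part of this listing. A hedged sketch of the usual warm-up pattern (the agent's play_step call is hypothetical, only the intent of filling the buffer is taken from the source):

    def populate(self, steps):
        # Warm-up: interact with the environment for `steps` transitions so the
        # replay buffer is not empty when the first optimization step runs.
        for _ in range(steps):
            self.agent.play_step(self.net)  # hypothetical: the agent stores transitions into self.replay_buffer
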
Example #4
    def __init__(self, env, env_obs, gamma=0.99, tau=0.001, lr_actor=1e-3, lr_critic=1e-3,
                 weight_decay=0.1, batch_size=64, subpolicies=1, action_shape=2,
                 replay_buffer_size=5000, replay_buffer_type="rb", noise=0.1, noise_decay=0.999,
                 max_action=1, min_action=-1, teacher=False, alpha=0.1, bc=None):

        self.env = env
        self.subpolicies = subpolicies
        self.total_obs = np.sum(env_obs)
        self.weight_decay = weight_decay
        self.env_obs = env_obs
        self.max_action = max_action
        self.min_action = min_action
        self.action_shape = action_shape
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.replay_buffer_type = replay_buffer_type
        self.replay_buffer_size = replay_buffer_size
        self.init_noise = noise
        self.noise = noise
        self.noise_decay = noise_decay
        self.teacher = teacher
        self.bc = bc
        self.alpha = alpha

        self.mul = 1 if self.teacher is False else 2

        self.actors = [[Actor(self.mul * env_obs[agent], action_shape) for i in range(self.subpolicies)] for agent in range(env.n)]
        self.actors_targets = [[Actor(self.mul * env_obs[agent], action_shape) for i in range(self.subpolicies)] for agent in range(env.n)]
        self.critics = [Critic(self.mul * self.total_obs + action_shape * len(env.agents)) for _ in env.agents]
        self.critics_targets = [Critic(self.mul * self.total_obs + action_shape * len(env.agents)) for _ in env.agents]

        self.actors_optimizers = [[torch.optim.RMSprop(self.actors[agent][i].parameters(), lr=lr_actor, weight_decay=weight_decay) for i in range(self.subpolicies)] for agent in range(len(env.agents))]
        self.critics_optimisers = [torch.optim.RMSprop(self.critics[agent].parameters(), lr=lr_critic, weight_decay=weight_decay) for agent in range(len(env.agents))]

        if self.subpolicies > 1:
            if self.replay_buffer_type == "rb":
                self.replay_buffers = [[ReplayBuffer(self.replay_buffer_size) for _ in range(self.subpolicies)] for _ in range(env.n)]
            else:
                self.replay_buffers = [[PrioritizedReplayBuffer(self.replay_buffer_size) for _ in range(self.subpolicies)] for _ in range(env.n)]
        else:
            if self.replay_buffer_type == "rb":
                self.replay_buffers = ReplayBuffer(self.replay_buffer_size)
            else:
                self.replay_buffers = PrioritizedReplayBuffer(self.replay_buffer_size)
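
Example #4 configures noise, noise_decay and the action bounds but does not show how actions are drawn; a minimal sketch of the usual decaying-Gaussian exploration pattern (the method and its arguments are assumptions, not this agent's API):

    def select_action(self, actor, obs):
        # Deterministic policy output plus decaying Gaussian exploration noise,
        # clipped to the environment's action range.
        with torch.no_grad():
            action = actor(torch.as_tensor(obs, dtype=torch.float32)).numpy()
        action = action + np.random.normal(0.0, self.noise, size=action.shape)
        self.noise *= self.noise_decay
        return np.clip(action, self.min_action, self.max_action)
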
Example #5
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # initialize state
        self.last_state = self.task.reset()
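
The OUNoise class used above is not included in this listing; the Ornstein-Uhlenbeck process it implements for DDPG-style exploration is standard, and a self-contained sketch (interface assumed from the constructor call above) looks like this:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process at its long-running mean.
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): drift toward mu plus Gaussian diffusion.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.mu))
        self.state = self.state + dx
        return self.state
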
Example #6
    def __init__(self, env, GAMMA=0.9):
        self.env = env
        print('obs space shape: {}'.format(self.env.observation_space.shape))
        print('action space shape: {}'.format(self.env.action_space.shape))
        self.states_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        print('states dim: {}\t\t actions dim: {}'.format(
            self.states_dim, self.action_dim))
        self.actor = Actor(self.states_dim, self.action_dim, lr=0.0001)
        self.critic = Critic(self.states_dim, self.action_dim, lr=0.0001)
        self.GAMMA = GAMMA
        self.RANDOM_PROB = 0.025
        self.replay_buffer = ReplayBuffer(1280)
Example #7
import gym
import tensorflow as tf
from ActorCritic import Actor
from ActorCritic import Critic

LR_A = 0.001
LR_C = 0.01

env = gym.make('MountainCar-v0')
env = env.unwrapped

sess = tf.Session()

actor = Actor(sess,
              n_features=env.observation_space.shape[0],
              n_actions=env.action_space.n,
              learning_rate=LR_A)

critic = Critic(sess,
                n_features=env.observation_space.shape[0],
                learning_rate=LR_C)

sess.run(tf.global_variables_initializer())

for i_episode in range(1000):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        # if RENDER: env.render()
        env.render()
Example #8
import gym
import matplotlib.pyplot as plt

DISPLAY_REWARD_THRESHOLD = -90

RENDER = False  # rendering wastes time

env = gym.make('MountainCar-v0')
env.seed(1)  # seed for reproducibility; policy-gradient methods generally have high variance
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

actor = Actor(epsilon=0)
critic = Critic()
Tmax = 1000
for i_episode in range(3000):

    observation = env.reset()
    action = actor.choose_action(observation)

    running_reward = 0
    critic.reset()
    count = 0
    while count < Tmax:
        count += 1
        if RENDER: env.render()

        observation_, reward, done, info = env.step(