def __init__(self, actor_id, config, dev, shared_state, shared_queue, eps):
    # self.env = suite.load(domain_name="walker", task_name="run")
    # self.action_size = self.env.action_spec().shape[0]
    # self.obs_size = get_obs(self.env.reset().observation).shape[1]
    self.env = env_cover(config, dev)
    self.num_env = config['num_envs']
    self.shared_queue = shared_queue
    self.shared_state = shared_state
    self.dev = dev
    self.actor_id = actor_id
    self.burn_in_length = config['burn_in_length']  # 40-80
    self.learning_length = config['learning_length']
    self.sequence_length = self.burn_in_length + self.learning_length
    self.n_step = config['n_step']
    self.sequence = []
    self.recurrent_state = []
    self.priority = []
    self.td_loss = deque(maxlen=self.learning_length)
    # self.memory_sequence_size = 1000
    # self.memory = ReplayMemory(memory_sequence_size=self.memory_sequence_size)
    # self.memory_save_interval = 3
    self.max_frame = config['actor_max_frame']
    self.gamma = config['gamma']
    # self.actor_parameter_update_interval = config['actor_parameter_update_interval']
    self.max_shared_q_size = config['max_shared_q_size']
    self.model_path = './'
    self.memory_path = './'
    self.actor = ActorNet(dev, config).to(self.dev)
    self.target_actor = ActorNet(dev, config).to(self.dev)
    self.critic = CriticNet(dev, config).to(self.dev)
    self.target_critic = CriticNet(dev, config).to(self.dev)
    self.actor.load_state_dict(self.shared_state["actor"].state_dict())
    self.target_actor.load_state_dict(self.shared_state["target_actor"].state_dict())
    self.critic.load_state_dict(self.shared_state["critic"].state_dict())
    self.target_critic.load_state_dict(self.shared_state["target_critic"].state_dict())
    # self.actor.load_state_dict(self.shared_state["actor"])
    # self.target_actor.load_state_dict(self.shared_state["target_actor"])
    # self.critic.load_state_dict(self.shared_state["critic"])
    # self.target_critic.load_state_dict(self.shared_state["target_critic"])
    self.action_argmax = config['action_argmax']
    # self.load_model()
    self.epsilon = eps
def load_model(self):
    if os.path.isfile(self.model_path + 'model.pt'):
        while True:
            try:
                # TODO: Delete
                self.actor = ActorNet(self.obs_size, self.action_size, self.actor_id % 2 + 1).cuda().eval()
                self.target_actor = deepcopy(self.actor)
                self.critic = CriticNet(self.obs_size, self.action_size, self.actor_id % 2 + 1).cuda().eval()
                self.target_critic = deepcopy(self.critic)
                # model_dict = torch.load(self.model_path + 'model.pt', map_location={'cuda:0': 'cuda:{}'.format(self.actor_id % 2 + 1)})
                model_dict = torch.load(self.model_path + 'model.pt')
                self.actor.load_state_dict(model_dict['actor'])
                self.target_actor.load_state_dict(model_dict['target_actor'])
                self.critic.load_state_dict(model_dict['critic'])
                self.target_critic.load_state_dict(model_dict['target_critic'])
                self.actor.cuda(self.actor_id % 2 + 1)
                self.target_actor.cuda(self.actor_id % 2 + 1)
                self.critic.cuda(self.actor_id % 2 + 1)
                self.target_critic.cuda(self.actor_id % 2 + 1)
            except Exception:
                # The checkpoint may still be mid-write; back off and retry.
                sleep(np.random.rand() * 5 + 2)
            else:
                break
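# Hedged sketch (not part of the original snippets): a save_model counterpart that writes
# the checkpoint format load_model above expects -- a dict of state_dicts keyed 'actor',
# 'target_actor', 'critic', 'target_critic'. Attribute and path names mirror the
# surrounding snippets; adjust to the real class.
def save_model(self):
    model_dict = {
        'actor': self.actor.state_dict(),
        'target_actor': self.target_actor.state_dict(),
        'critic': self.critic.state_dict(),
        'target_critic': self.target_critic.state_dict(),
    }
    torch.save(model_dict, self.model_path + 'model.pt')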
def __init__(self, actor_id):
    self.env = suite.load(domain_name="walker", task_name="run")
    self.action_size = self.env.action_spec().shape[0]
    self.obs_size = get_obs(self.env.reset().observation).shape[1]
    self.actor_id = actor_id
    self.burn_in_length = 20  # 40-80
    self.learning_length = 40
    self.sequence_length = self.burn_in_length + self.learning_length
    self.n_step = 5
    self.sequence = []
    self.recurrent_state = []
    self.priority = []
    self.td_loss = deque(maxlen=self.learning_length)
    self.memory_sequence_size = 1000
    self.memory = ReplayMemory(memory_sequence_size=self.memory_sequence_size)
    self.memory_save_interval = 3
    self.gamma = 0.997
    self.actor_parameter_update_interval = 500
    self.model_path = './model_data/'
    self.actor = ActorNet(self.obs_size, self.action_size, cuda_id=self.actor_id % 2 + 1).cuda(self.actor_id % 2 + 1).eval()
    self.target_actor = deepcopy(self.actor)
    self.critic = CriticNet(self.obs_size, self.action_size, cuda_id=self.actor_id % 2 + 1).cuda(self.actor_id % 2 + 1).eval()
    self.target_critic = deepcopy(self.critic)
    self.load_model()
    self.epsilon = 1
    self.last_obs = None
def reset(self):
    self.action_space = self.env.action_space
    obs_space = self.env.observation_space.spaces
    obs_len = obs_space['observation'].shape[0]
    goal_len = obs_space['desired_goal'].shape[0]
    self.state_size = obs_len + goal_len
    self.actions_size = self.action_space.shape[0]
    max_action = float(self.env.action_space.high[0])
    self.actor = ActorNet(self.state_size, *self.config['net_sizes'], self.actions_size, max_action)
    self.critic = CriticNet(self.state_size, *self.config['net_sizes'], self.actions_size)
    self.actor_target = ActorNet(self.state_size, *self.config['net_sizes'], self.actions_size, max_action)
    self.critic_target = CriticNet(self.state_size, *self.config['net_sizes'], self.actions_size)
    self.actor_optim = Adam(self.actor.parameters(), lr=self.config['learning_rate'])
    self.critic_optim = Adam(self.critic.parameters(), lr=self.config['learning_rate'])
    # Hard-copy the online weights into the targets (tau=1).
    self.update(self.critic_target, self.critic, 1)
    self.update(self.actor_target, self.actor, 1)
    self.epsilon = self.config['epsilon']
    self.epsilon_decay = self.config['epsilon_decay']
    self.gamma = self.config['gamma']
    if self.config['PER']:
        self.memory = PrioritizedMemory(self.config['memory_size'],
                                        self.config["memory_alpha"],
                                        self.config["memory_epsilon"],
                                        self.config["memory_beta"],
                                        self.config["memory_beta_increment"])
    else:
        self.memory = ReplayBuffer(self.config['memory_size'])
    self.batch_size = self.config['batch_size']
    self.normalizer = Normalizer(obs_len, goal_len)
    # warm up the normalizer
    self.normalizer.observe(self.env.reset())
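# Hedged sketch (assumption, not shown in the original): the update() helper called above
# is taken to be a standard polyak/soft update; calling it with tau=1 reduces to a hard
# copy of the online weights into the target network.
def update(self, target, source, tau):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)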
def __init__(self, state_size, action_size, num_agents, hidden_actor, hidden_critic,
             lr_actor, lr_critic, buffer_size, agent_id, use_PER=False, seed=0):
    super(DDPGAgent, self).__init__()
    self.seed = torch.manual_seed(seed)
    self.agent_id = agent_id
    # num_agents*action_size
    self.actor_local = ActorNet(state_size, hidden_actor, action_size, seed=seed).to(device)
    self.critic_local = CriticNet(num_agents * state_size, num_agents * action_size, hidden_critic, 1, seed=seed).to(device)
    self.actor_target = ActorNet(state_size, hidden_actor, action_size, seed=seed).to(device)
    self.critic_target = CriticNet(num_agents * state_size, num_agents * action_size, hidden_critic, 1, seed=seed).to(device)
    self.actor_optimizer = Adam(self.actor_local.parameters(), lr=lr_actor)
    self.critic_optimizer = Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=0.)  # weight_decay=1.e-5
    self.memory = ReplayBuffer(buffer_size, num_agents, state_size, action_size, use_PER)
    # initialize targets same as original networks
    hard_update(self.actor_target, self.actor_local)
    hard_update(self.critic_target, self.critic_local)
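# Hedged sketch (assumption): hard_update() as called above is presumed to copy the online
# network's weights into the target network parameter-by-parameter, matching common DDPG
# implementations; it is not defined in the original snippet.
def hard_update(target, source):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)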
def __init__(self, n_actors):
    self.env = suite.load(domain_name="walker", task_name="run")
    self.n_actions = self.env.action_spec().shape[0]
    self.obs_size = get_obs(self.env.reset().observation).shape[1]
    self.n_actors = n_actors
    self.burn_in_length = 20  # 40-80
    self.learning_length = 40
    self.sequence_length = self.burn_in_length + self.learning_length
    self.n_step = 5
    self.memory_sequence_size = 5000000
    self.batch_size = 32
    self.memory = LearnerReplayMemory(memory_sequence_size=self.memory_sequence_size, batch_size=self.batch_size)
    self.model_path = './model_data/'
    self.memory_path = './memory_data/'
    self.actor = ActorNet(self.obs_size, self.n_actions, 0).cuda()
    self.target_actor = deepcopy(self.actor).eval()
    self.critic = CriticNet(self.obs_size, self.n_actions, 0).cuda()
    self.target_critic = deepcopy(self.critic).eval()
    self.model_save_interval = 50  # 50
    self.memory_update_interval = 50  # 50
    self.target_update_interval = 500  # 100
    self.gamma = 0.997
    self.actor_lr = 1e-4
    self.critic_lr = 1e-3
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
    self.actor_criterion = nn.MSELoss()
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
    self.critic_criterion = nn.MSELoss()
    self.save_model()
def __init__(self, learner_id, config, dev, shared_state, shared_queue):
    self.action_size = config['action_space']
    self.obs_size = config['obs_space']
    self.shared_queue = shared_queue
    self.shared_state = shared_state
    self.dev = dev
    self.id = learner_id
    self.burn_in_length = config['burn_in_length']  # 40-80
    self.learning_length = config['learning_length']
    self.sequence_length = self.burn_in_length + self.learning_length
    self.n_step = config['n_step']
    self.sequence = []
    self.recurrent_state = []
    self.priority = []
    self.td_loss = deque(maxlen=self.learning_length)
    self.gamma = config['gamma']
    # self.actor_parameter_update_interval = config['actor_parameter_update_interval']
    self.actor = ActorNet(dev, config).to(self.dev)
    self.target_actor = ActorNet(dev, config).to(self.dev)
    self.critic = CriticNet(dev, config).to(self.dev)
    self.target_critic = CriticNet(dev, config).to(self.dev)
    self.actor.load_state_dict(self.shared_state["actor"].state_dict())
    self.target_actor.load_state_dict(self.shared_state["target_actor"].state_dict())
    self.critic.load_state_dict(self.shared_state["critic"].state_dict())
    self.target_critic.load_state_dict(self.shared_state["target_critic"].state_dict())
    # self.actor.load_state_dict(self.shared_state["actor"])
    # self.target_actor.load_state_dict(self.shared_state["target_actor"])
    # self.critic.load_state_dict(self.shared_state["critic"])
    # self.target_critic.load_state_dict(self.shared_state["target_critic"])
    self.learner_actor_rate = config['learner_actor_rate']
    self.num_actors = learner_id
    self.n_actions = 1
    self.max_frame = config['learner_max_frame']
    self.memory_sequence_size = config['memory_sequence_size']
    self.batch_size = config['batch_size']
    self.memory = LearnerReplayMemory(self.memory_sequence_size, config, dev)
    self.model_path = './'
    # self.memory_path = './memory_data/'
    # self.model_save_interval = 10  # 50
    self.learner_parameter_update_interval = config['learner_parameter_update_interval']  # 50
    self.target_update_interval = config['target_update_interval']  # 100
    self.actor_lr = config['actor_lr']
    self.critic_lr = config['critic_lr']
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
    self.actor_criterion = nn.MSELoss()
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
    self.critic_criterion = nn.MSELoss()
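# Hedged sketch (assumption): one way the learner might publish its updated weights back
# into the shared-memory networks every learner_parameter_update_interval steps so that
# actor processes can re-sync. The method name and update_step argument are hypothetical;
# load_state_dict copies in place, so the shared tensors stay shared.
def publish_parameters(self, update_step):
    if update_step % self.learner_parameter_update_interval == 0:
        self.shared_state["actor"].load_state_dict(self.actor.state_dict())
        self.shared_state["critic"].load_state_dict(self.critic.state_dict())
        self.shared_state["target_actor"].load_state_dict(self.target_actor.state_dict())
        self.shared_state["target_critic"].load_state_dict(self.target_critic.state_dict())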
num_processes = config['num_processes']
use_cuda = torch.cuda.is_available()
dev_cpu = torch.device('cpu')
dev_gpu = torch.device('cuda' if use_cuda else 'cpu')

# manager = mp.Manager()
# shared_state = manager.dict()
# shared_queue = manager.Queue()
shared_queue = mp.Queue()
# shared_queue = queue.Queue()
shared_state = dict()
shared_state["actor"] = ActorNet(dev_cpu, config).share_memory()
shared_state["critic"] = CriticNet(dev_cpu, config).share_memory()
shared_state["target_actor"] = ActorNet(dev_cpu, config).share_memory()
shared_state["target_critic"] = CriticNet(dev_cpu, config).share_memory()
# shared_state["frame"] = mp.Array('i', [0 for i in range(num_processes)])
# shared_state["sleep"] = mp.Array('i', [0 for i in range(num_processes)])
shared_state["update"] = mp.Array('i', [0 for i in range(num_processes)])
# shared_state["actor"] = ActorNet(config['obs_space'], config['action_space'], dev_cpu)
# shared_state["critic"] = CriticNet(config['obs_space'], config['action_space'], dev_cpu)
# shared_state["target_actor"] = ActorNet(config['obs_space'], config['action_space'], dev_cpu)
# shared_state["target_critic"] = CriticNet(config['obs_space'], config['action_space'], dev_cpu)
# shared_state["frame"] = [0 for i in range(num_processes)]
# shared_state["sleep"] = [0 for i in range(num_processes)]
# shared_state["update"] = False
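# Hedged sketch (assumption): one way the shared objects above might be handed to worker
# processes. actor_work and learner_work are hypothetical entry points standing in for the
# Actor/Learner classes in the surrounding snippets.
processes = []
for actor_id in range(num_processes):
    p = mp.Process(target=actor_work, args=(actor_id, config, dev_cpu, shared_state, shared_queue))
    p.start()
    processes.append(p)
learner = mp.Process(target=learner_work, args=(0, config, dev_gpu, shared_state, shared_queue))
learner.start()
processes.append(learner)
for p in processes:
    p.join()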
def __init__(self, env, args, e_decay=1, e_min=0.05, l2_decay=0.0001, update_type="hard"):
    """
    Initialize a D4PG Agent.
    """
    self.device = args.device
    self.framework = "D4PG"
    self.eval = args.eval
    self.agent_count = env.agent_count
    self.actor_learn_rate = args.actor_learn_rate
    self.critic_learn_rate = args.critic_learn_rate
    self.batch_size = args.batch_size
    self.buffer_size = args.buffer_size
    self.action_size = env.action_size
    self.state_size = env.state_size
    self.C = args.C
    self._e = args.e
    self.e_decay = e_decay
    self.e_min = e_min
    self.gamma = args.gamma
    self.rollout = args.rollout
    self.tau = args.tau
    self.update_type = update_type
    self.num_atoms = args.num_atoms
    self.vmin = args.vmin
    self.vmax = args.vmax
    self.atoms = torch.linspace(self.vmin, self.vmax, self.num_atoms).to(self.device)
    self.t_step = 0
    self.episode = 0

    # Set up memory buffers, currently only standard replay is implemented
    self.memory = ReplayBuffer(self.device, self.buffer_size, self.gamma, self.rollout)

    # Initialize ACTOR networks
    self.actor = ActorNet(args.layer_sizes, self.state_size, self.action_size).to(self.device)
    self.actor_target = ActorNet(args.layer_sizes, self.state_size, self.action_size).to(self.device)
    self._hard_update(self.actor, self.actor_target)
    self.actor_optim = optim.Adam(self.actor.parameters(), lr=self.actor_learn_rate, weight_decay=l2_decay)

    # Initialize CRITIC networks
    self.critic = CriticNet(args.layer_sizes, self.state_size, self.action_size, self.num_atoms).to(self.device)
    self.critic_target = CriticNet(args.layer_sizes, self.state_size, self.action_size, self.num_atoms).to(self.device)
    self._hard_update(self.critic, self.critic_target)
    self.critic_optim = optim.Adam(self.critic.parameters(), lr=self.critic_learn_rate, weight_decay=l2_decay)

    self.new_episode()
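# Hedged sketch (assumption): with a distributional C51-style critic that outputs a
# probability mass over self.atoms, the scalar Q-value used for the policy update can be
# recovered as the expectation over the support. The method name is hypothetical and
# probs is assumed to be a (batch, num_atoms) tensor of critic output probabilities.
def _expected_q(self, probs):
    # Expectation of the value distribution: sum_i p_i * z_i, shape (batch,).
    return (probs * self.atoms).sum(dim=-1)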