Example #1
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params
        # create the network
        self.actor_network = actor(env_params)
        self.critic_network = critic(env_params)
        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)
        # build up the target network
        self.actor_target_network = actor(env_params)
        self.critic_target_network = critic(env_params)
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())
        # if using the gpu
        if self.args.cuda and torch.cuda.is_available():
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy,
                                      self.args.replay_k,
                                      self.env.compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions)
        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.args.clip_range)

        # add tensorboardX tool

        # create the directory to store the model
        if MPI.COMM_WORLD.Get_rank() == 0:
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)
            # path to save the model
            self.model_path = os.path.join(self.args.save_dir,
                                           self.args.env_name)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)
            print('model path', self.model_path)

        # self.writer = SummaryWriter('./logs')
        self.reward_list = []
        self.reward_record = []
        self.success_rate_list = []
        self.success_list = []
Example #2
    def __init__(self,
                 observation_space,
                 action_space,
                 discount=0.99,
                 td_lambda=0.95,
                 hidden_size=(128, 64),
                 temp=1.,
                 max_weight=20,
                 action_std=0.4,
                 actor_lr=0.0001,
                 critic_lr=0.01,
                 device='cpu',
                 batch_size=256,
                 pipe=None,
                 optimizer='SGD',
                 activation='relu'):

        self.device = device
        inp_dim = observation_space.shape[0]
        self.actor = actor(inp_dim,
                           action_space.low.shape[0],
                           std=action_std,
                           hidden_size=hidden_size,
                           activation=activation).to(device)
        self.critic = critic(inp_dim,
                             hidden_size=hidden_size,
                             activation=activation).to(device)
        self.normalizer = Normalizer((inp_dim, ),
                                     default_clip_range=5).to(device)
        self.normalizer.count += 1  #unbiased ...
        self.temp = temp
        self.max_weight = max_weight

        # NOTE: optimizer is different
        if optimizer == 'SGD':
            self.optim_actor = torch.optim.SGD(self.actor.parameters(),
                                               actor_lr,
                                               momentum=0.9)
            self.optim_critic = torch.optim.SGD(self.critic.parameters(),
                                                critic_lr,
                                                momentum=0.9)
        else:
            self.optim_actor = torch.optim.Adam(self.actor.parameters(),
                                                actor_lr)
            self.optim_critic = torch.optim.Adam(self.critic.parameters(),
                                                 critic_lr)
        self.pipe = pipe
        self.batch_size = batch_size
        self.mse = nn.MSELoss()

        self.discount = discount
        self.td_lambda = td_lambda
        self.val_norm = 1.0 / (1.0 - self.discount)

        self.action_mean = ((action_space.high + action_space.low) /
                            2)[None, :]
        self.action_std = ((action_space.high - action_space.low) / 2)[None, :]
Example #3
    def __init__(self, obs_dim, act_dim, env, memory_size=50000, batch_size=64,\
                 lr_critic=1e-4, lr_actor=1e-4, gamma=0.99, tau=0.001, n_steps = 1):

        self.gamma = gamma
        self.batch_size = batch_size
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.memory_size = memory_size
        self.tau = tau
        self.env = env
        self.n_steps = n_steps
        self.n_step_gamma = self.gamma**self.n_steps

        # actor
        self.actor = actor(input_size=obs_dim, output_size=act_dim)
        self.actor_target = actor(input_size=obs_dim, output_size=act_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())

        # critic
        self.critic = critic(state_size=obs_dim,
                             action_size=act_dim,
                             output_size=1)
        self.critic_target = critic(state_size=obs_dim,
                                    action_size=act_dim,
                                    output_size=1)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # optimizers
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.optimizer_critic = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic)

        # critic loss
        self.critic_loss = nn.MSELoss()

        # noise
        # self.noise = OrnsteinUhlenbeckProcess(dimension=act_dim, num_steps=5000)
        self.noise = GaussianNoise(dimension=act_dim, num_epochs=5000)

        # replay buffer
        #self.replayBuffer = Replay(self.memory_size, window_length=1)
        self.replayBuffer = Replay(self.memory_size, self.env)
Example #4
    def __init__(self, args, env, env_params, image=True):
        self.args = args
        self.env = env
        self.env_params = env_params
        self.image = image

        # create the network
        if self.image:
            self.actor_network = actor_image(env_params, env_params['obs'])
            self.critic_network = critic_image(env_params, env_params['obs'] + env_params['action'])
        else:
            self.actor_network = actor(env_params, env_params['obs'])
            self.critic_network = critic(env_params, env_params['obs'] + env_params['action'])

        # load a model if load_dir is not empty
        if self.args.load_dir != '':
            actor_load_path = self.args.load_dir + '/actor.pt'
            model = torch.load(actor_load_path)
            self.actor_network.load_state_dict(model)
            critic_load_path = self.args.load_dir + '/critic.pt'
            model = torch.load(critic_load_path)
            self.critic_network.load_state_dict(model)

        # sync the networks across the cpus
        # sync_networks(self.actor_network)
        # sync_networks(self.critic_network)
        # build up the target network
        # if self.image:
        #     self.actor_target_network = actor_image(env_params, env_params['obs'])
        # else:
        #     self.actor_target_network = actor(env_params, env_params['obs'])
        # # load the weights into the target networks
        # self.actor_target_network.load_state_dict(self.actor_network.state_dict())

        # if using the gpu
        if self.args.cuda:
            self.actor_network.cuda()
            # self.actor_target_network.cuda()
            self.critic_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env().compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions, image=self.image)

        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
Example #5
		def createActor(soupobject):
			a = models.actor(ident=soupobject['ident'],
				name=soupobject['name'],
				desc=soupobject['desc']
				)

			lines = soupobject.find_all('line')

			for i in lines:
				a.addLine(i['key'],i['need'],i.text,i['gift'])

			items = soupobject.find_all('item')

			for i in items:
				if i['ident'] in self.allItems.keys():
					a.addItem({i['ident']:self.allItems[i['ident']]})
				else:
					print "Your allItems is incomplete. Ensure that it is defined in your itemsfile and that you ran populateItems first"
			return a
Example #6
 def __init__(self, env, args):
     self.env = env
     self.args = args
     # get the dims and action max of the environment
     obs_dims = self.env.observation_space.shape[0]
     self.action_dims = self.env.action_space.shape[0]
     self.action_max = self.env.action_space.high[0]
     # define the network
     self.actor_net = actor(obs_dims, self.action_dims)
     self.critic_net = critic(obs_dims, self.action_dims)
     # sync the weights across the mpi
     sync_networks(self.actor_net)
     sync_networks(self.critic_net)
     # build the target network
     self.actor_target_net = copy.deepcopy(self.actor_net)
     self.critic_target_net = copy.deepcopy(self.critic_net)
     # create the optimizer
     self.actor_optim = torch.optim.Adam(self.actor_net.parameters(),
                                         self.args.lr_actor)
     self.critic_optim = torch.optim.Adam(
         self.critic_net.parameters(),
         self.args.lr_critic,
         weight_decay=self.args.critic_l2_reg)
     # create the replay buffer
     self.replay_buffer = replay_buffer(self.args.replay_size)
     # create the normalizer
     self.o_norm = normalizer(obs_dims,
                              default_clip_range=self.args.clip_range)
     # create the noise generator
     self.noise_generator = ounoise(std=0.2, action_dim=self.action_dims)
     # create the dir to save models
     if MPI.COMM_WORLD.Get_rank() == 0:
         if not os.path.exists(self.args.save_dir):
             os.mkdir(self.args.save_dir)
         self.model_path = os.path.join(self.args.save_dir,
                                        self.args.env_name)
         if not os.path.exists(self.model_path):
             os.mkdir(self.model_path)
     # create an eval environment
     self.eval_env = gym.make(self.args.env_name)
     # set seeds
     self.eval_env.seed(self.args.seed * 2 + MPI.COMM_WORLD.Get_rank())
Example #7
File: demo.py  Project: sush1996/DDPG_Fetch
    # create the environment
    env = gym.make(args.env_name)
    # reset the environment to get an initial observation
    observation = env.reset()

    # get the environment params
    env_params = {
        'obs': observation['observation'].shape[0],
        'goal': observation['desired_goal'].shape[0],
        'action': env.action_space.shape[0],
        'action_max': env.action_space.high[0],
    }

    # create the actor network
    actor_network = actor(env_params)
    actor_network.load_state_dict(model)
    actor_network.eval()

    for i in range(args.demo_length):
        observation = env.reset()

        # start to do the demo
        obs = observation['observation']
        g = observation['desired_goal']

        for t in range(env._max_episode_steps):
            env.render()
            inputs = process_inputs(obs, g, o_mean, o_std, g_mean, g_std, args)

            with torch.no_grad():
Example #8
    'pool_type': params.pool_type,
    'nonlinear_fc': params.nonlinear_fc,
    'encoder_type': params.encoder_type,
    'use_cuda': True,
}

# model
encoder_types = [
    'InferSent', 'BLSTMprojEncoder', 'BGRUlastEncoder',
    'InnerAttentionMILAEncoder', 'InnerAttentionYANGEncoder',
    'InnerAttentionNAACLEncoder', 'ConvNetEncoder', 'LSTMEncoder'
]
assert params.encoder_type in encoder_types, "encoder_type must be in " + \
                                             str(encoder_types)
nli_net = critic(config_nli_model)
actorModel = actor(params.enc_lstm_dim, params.word_emb_dim)
print(nli_net)
print(actorModel)

for name, x in nli_net.named_parameters():
    print(name)

for name, x in actorModel.named_parameters():
    print(name)

#print(nli_net.target_pred.enc_lstm.weight_ih_l0)
#print(nli_net.target_classifier[4].bias)

# loss
weight = torch.FloatTensor(params.n_classes).fill_(1)
loss_fn = nn.CrossEntropyLoss(weight=weight)
Example #9
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params

        # create the network
        self.actor_network = actor(env_params)
        self.critic_network = critic(env_params)
        # sync the networks across the cpus
        #sync_networks(self.actor_network)
        #sync_networks(self.critic_network)
        # build up the target network
        self.actor_target_network = actor(env_params)
        self.critic_target_network = critic(env_params)

        # Load the model if required
        if args.load_path is not None:
            o_mean, o_std, g_mean, g_std, load_actor_model, load_critic_model = torch.load(
                args.load_path, map_location=lambda storage, loc: storage)
            self.actor_network.load_state_dict(load_actor_model)
            self.critic_network.load_state_dict(load_critic_model)

        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())
        # if using the gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy,
                                      self.args.replay_k,
                                      self.env.compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions)
        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.args.clip_range)
        # create the directory to store the model
        #if MPI.COMM_WORLD.Get_rank() == 0:
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)

        # make up a suffix for the model path to indicate which method was used for training
        #self.folder_siffix = '_' + self.args.replay_strategy + '_' + self.args.env_params.reward_type
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        self.model_path = os.path.join(self.model_path,
                                       'seed_' + str(self.args.seed))
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
Example #10
 def __init__(self, args, env, env_params):
     self.savetime = 0
     self.args = args
     self.env = env
     self.env_params = env_params
     # create the network
     self.actor_network = actor(env_params)
     self.critic_network = critic(env_params)
     # sync the networks across the cpus
     sync_networks(self.actor_network)
     sync_networks(self.critic_network)
     # build up the target network
     self.actor_target_network = actor(env_params)
     self.critic_target_network = critic(env_params)
     # load the weights into the target networks
     self.actor_target_network.load_state_dict(
         self.actor_network.state_dict())
     self.critic_target_network.load_state_dict(
         self.critic_network.state_dict())
     # if using the gpu
     if self.args.cuda:
         self.actor_network.cuda()
         self.critic_network.cuda()
         self.actor_target_network.cuda()
         self.critic_target_network.cuda()
     # create the optimizer
     self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                         lr=self.args.lr_actor)
     self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                          lr=self.args.lr_critic)
     # her sampler
     self.her_module = her_sampler(self.args.replay_strategy,
                                   self.args.replay_k,
                                   self.env.compute_reward)
     # create the replay buffer
     self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                 self.her_module.sample_her_transitions)
     # whether to add demonstration data
     if self.args.add_demo:
         self._init_demo_buffer()  # initialize the replay buffer with demonstrations
     # create the normalizer
     self.o_norm = normalizer(size=env_params['obs'],
                              default_clip_range=self.args.clip_range)
     self.g_norm = normalizer(size=env_params['goal'],
                              default_clip_range=self.args.clip_range)
     # load the data to continue the training
     # model_path = "saved_models/bmirobot-v3/125_True12_model.pt"
     # # # model_path = args.save_dir + args.env_name + '/' + str(args.seed) + '_' + str(args.add_demo) + '_model.pt'
     # # o_mean, o_std, g_mean, g_std, model = torch.load(model_path, map_location=lambda storage, loc: storage)
     # self.actor_network.load_state_dict(model)
     # self.o_norm.mean=o_mean
     # self.o_norm.std=o_std
     # self.g_norm.mean=g_mean
     # self.g_norm.std=g_std
     self.success_rates = []  # record the success rate for each epoch
     # create the directory to store the model
     if MPI.COMM_WORLD.Get_rank() == 0:
         if not os.path.exists(self.args.save_dir):
             os.mkdir(self.args.save_dir)
         # path to save the model
         self.model_path = os.path.join(self.args.save_dir,
                                        self.args.env_name)
         if not os.path.exists(self.model_path):
             os.mkdir(self.model_path)
Example #11
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params

        # create the network
        self.actor_network = actor(env_params)
        self.critic_network = critic(env_params)
        # build up the target network
        self.actor_target_network = actor(env_params)
        self.critic_target_network = critic(env_params)

        # Load the model if required
        if args.load_path is not None:
            o_mean, o_std, g_mean, g_std, load_actor_model, load_critic_model = torch.load(
                args.load_path, map_location=lambda storage, loc: storage)
            self.actor_network.load_state_dict(load_actor_model)
            self.critic_network.load_state_dict(load_critic_model)

        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())
        # if using the gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy,
                                      self.args.replay_k,
                                      self.env.compute_reward)
        # create the replay buffer
        if self.args.replay_strategy == 'future':
            self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                        self.her_module.sample_her_transitions)
        else:
            self.buffer = replay_buffer(
                self.env_params, self.args.buffer_size,
                self.her_module.sample_normal_transitions)
        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.args.clip_range)
        # create the directory to store the model
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)

        # make up a suffix for the model path to indicate which method was used for training
        buffer_len_epochs = int(
            self.args.buffer_size /
            (env_params['max_timesteps'] * self.args.num_rollouts_per_cycle *
             self.args.n_cycles))
        name_add_on = ''
        if self.args.exploration_strategy == 'pgg':
            if self.args.pgg_strategy == 'final':
                if self.args.replay_strategy == 'future':
                    name_add_on = '_final_distance_based_goal_generation_buffer' + str(
                        buffer_len_epochs) + 'epochs'
                else:
                    name_add_on = '_final_distance_based_goal_generation_withoutHER_buffer' + str(
                        buffer_len_epochs) + 'epochs'
            else:
                if self.args.replay_strategy == 'future':
                    name_add_on = '_distance_based_goal_generation_buffer' + str(
                        buffer_len_epochs) + 'epochs'
                else:
                    name_add_on = '_distance_based_goal_generation_withoutHER_buffer' + str(
                        buffer_len_epochs) + 'epochs'
        else:
            if self.args.replay_strategy == 'future':
                name_add_on = '_originalHER_buffer' + str(
                    buffer_len_epochs) + 'epochs'
            else:
                name_add_on = '_originalDDPG_buffer' + str(
                    buffer_len_epochs) + 'epochs'

        # path to save the model
        self.model_path = os.path.join(self.args.save_dir,
                                       self.args.env_name + name_add_on)

        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        self.model_path = os.path.join(self.model_path,
                                       'seed_' + str(self.args.seed))
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
Example #12
    # get the environment params
    env_params = {
        'obs': observation['observation'].shape[0],
        'goal': observation['desired_goal'].shape[0],
        'action': env.action_space.shape[0],
        'action_max': env.action_space.high[0],
        'depth': env.env.depth,
        'two_cam': env.env.two_cam
    }
    # create the actor network
    if use_image:
        actor_network = actor_image(env_params, env_params['obs'])
        critic_network = critic_image(env_params,
                                      env_params['obs'] + env_params['action'])
    else:
        actor_network = actor(env_params, env_params['obs'])
        critic_network = critic_image(env_params,
                                      env_params['obs'] + env_params['action'])
    actor_network.load_state_dict(actor_model)
    actor_network.eval()
    critic_network.load_state_dict(critic_model)
    critic_network.eval()

    total_success_rate = []
    for i in range(args.demo_length):
        path_ind = os.path.join(path, str(i))
        os.makedirs(path_ind)
        env = gym.make(args.env_name,
                       reward_type='sparse',
                       goal_type='random',
                       cam_type='fixed',
Example #13
    def __init__(self,
                 args,
                 envs_lst,
                 env_params,
                 expert_lst_dir,
                 recurrent=True,
                 ee_reward=True,
                 image=True):
        self.args = args
        self.envs_lst = envs_lst
        self.env_params = env_params
        self.recurrent = recurrent
        self.ee_reward = ee_reward
        self.image = image

        # initialize expert
        self.expert_lst = []
        for dir in expert_lst_dir:
            expert_load_path = dir + '/model.pt'
            o_mean, o_std, g_mean, g_std, model = torch.load(expert_load_path)
            expert_model = actor(env_params,
                                 env_params['obs'] + env_params['goal'])
            expert_model.load_state_dict(model)
            self.expert_lst.append({
                "model": expert_model,
                "o_mean": o_mean,
                "o_std": o_std,
                "g_mean": g_mean,
                "g_std": g_std
            })

        # create the network
        if self.recurrent:
            self.actor_network = actor_recurrent(
                env_params,
                env_params['obs'] + env_params['goal'] + env_params['action'],
                env_params['goal'])
            # self.critic_network = critic_recurrent(env_params, env_params['obs'] + env_params['goal'] + 2 * env_params['action'])
        else:
            self.actor_network = actor(
                env_params,
                env_params['obs'] + env_params['goal'] + env_params['action'],
                env_params['goal'])
        self.critic_network = critic(
            env_params,
            env_params['obs'] + 2 * env_params['goal'] + env_params['action'])

        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.args.clip_range)
        self.sg_norm = normalizer(size=env_params['action'],
                                  default_clip_range=self.args.clip_range)

        # load a model if load_dir is not empty
        if self.args.load_dir != '':
            load_path = self.args.load_dir + '/model.pt'
            # o_mean, o_std, g_mean, g_std, sg_mean, sg_std, model = torch.load(load_path)
            o_mean, o_std, g_mean, g_std, model = torch.load(load_path)
            self.o_norm.mean = o_mean
            self.o_norm.std = o_std
            self.g_norm.mean = g_mean
            self.g_norm.std = g_std
            # self.sg_norm.mean = sg_mean
            # self.sg_norm.std = sg_std
            self.actor_network.load_state_dict(model)

        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)
        # build up the target network
        if self.recurrent:
            self.actor_target_network = actor_recurrent(
                env_params,
                env_params['obs'] + env_params['goal'] + env_params['action'],
                env_params['goal'])
            # self.critic_target_network = critic_recurrent(env_params, env_params['obs'] + env_params['goal'] + 2 * env_params['action'])
        else:
            self.actor_target_network = actor(
                env_params,
                env_params['obs'] + env_params['goal'] + env_params['action'],
                env_params['goal'])
        self.critic_target_network = critic(
            env_params,
            env_params['obs'] + 2 * env_params['goal'] + env_params['action'])
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())

        # if using the gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.args.lr_critic)
        # her sampler
        self.her_module_lst = [
            her_sampler(self.args.replay_strategy, self.args.replay_k,
                        env.compute_reward) for env in self.envs_lst
        ]
        # create the replay buffer
        self.buffer_lst = [
            replay_buffer(self.env_params,
                          self.args.buffer_size,
                          her_module.sample_her_transitions,
                          ee_reward=True) for her_module in self.her_module_lst
        ]

        # path to save the model
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
Example #14
    def __init__(self, args, env, env_params):
        self.args = args
        self.env = env
        self.env_params = env_params
        # create the network
        self.actor_network = actor(env_params)
        self.critic_network = critic(env_params)

        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)
        # build up the target network
        self.actor_target_network = actor(env_params)
        self.critic_target_network = critic(env_params)
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())
        # if using the gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy,
                                      self.args.replay_k,
                                      self.env.compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions)
        # create the normalizer
        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.args.clip_range)
        muscle_labels = [
            "m" + str(i) for i in np.array(range(args.num_muscles))
        ]
        env = PointModel(
            verbose=0,
            success_thres=args.success_threshold,
            dof_observation=args.dob,
            include_follow=False,
            port=args.port,
            muscle_labels=muscle_labels,
        )
        self.env = env
        # create the directory to store the model
        if MPI.COMM_WORLD.Get_rank() == 0:
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)
            # path to save the model
            self.model_path = os.path.join(
                self.args.save_dir, self.args.env_name,
                self.args.exp_name)  # self.args.env_name
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)
Example #15
    def __init__(self, obs_dim, act_dim, env = None, memory_size=50000, batch_size=64,\
                 lr_critic=1e-4, lr_actor=1e-4, gamma=0.99, tau=0.001, prioritized_replay=True,\
                 critic_dist_info=None, n_steps=1):

        self.gamma = gamma
        self.n_steps = n_steps
        self.n_step_gamma = self.gamma**self.n_steps
        self.batch_size = batch_size
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.memory_size = memory_size
        self.tau = tau
        self.env = env

        ##   critic_dist_info:
        # dictionary describing the critic's output distribution.
        # keys:
        # 1. 'type': 'categorical' or 'mixture_of_gaussian'
        #    if 'categorical': also requires 'v_min', 'v_max' and 'n_atoms'
        #    if 'mixture_of_gaussian': parameters are still TODO (not handled below)
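        # A minimal sketch of such a dict, assuming a categorical (C51-style) critic;
        # the numeric values below are illustrative assumptions, not taken from the original project:
        # critic_dist_info = {'type': 'categorical', 'v_min': -10.0, 'v_max': 0.0, 'n_atoms': 51}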

        self.dist_type = critic_dist_info['type']
        if critic_dist_info['type'] == 'categorical':
            self.v_min = critic_dist_info['v_min']
            self.v_max = critic_dist_info['v_max']
            self.n_atoms = critic_dist_info['n_atoms']
            self.delta = (self.v_max - self.v_min) / float(self.n_atoms - 1)
            self.bin_centers = np.array([
                self.v_min + i * self.delta for i in range(self.n_atoms)
            ]).reshape(-1, 1)
        elif critic_dist_info['type'] == 'mixture_of_gaussian':
            #TODO
            pass
        else:
            raise ValueError('Unsupported distribution type: ' + str(critic_dist_info['type']))

        # actor
        self.actor = actor(input_size=obs_dim, output_size=act_dim)
        self.actor_target = actor(input_size=obs_dim, output_size=act_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())

        # critic
        self.critic = critic(state_size=obs_dim,
                             action_size=act_dim,
                             dist_info=critic_dist_info)
        self.critic_target = critic(state_size=obs_dim,
                                    action_size=act_dim,
                                    dist_info=critic_dist_info)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # optimizers
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.optimizer_critic = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic)

        # critic loss
        self.critic_loss = nn.CrossEntropyLoss()

        # noise
        #self.noise = OrnsteinUhlenbeckProcess(dimension=act_dim, num_steps=5000)
        self.noise = GaussianNoise(dimension=act_dim, num_epochs=5000)

        # replay buffer
        self.prioritized_replay = prioritized_replay
        if self.prioritized_replay:
            # Open AI prioritized replay memory
            self.replayBuffer = PrioritizedReplayBuffer(self.memory_size,
                                                        alpha=0.6)
            prioritized_replay_beta0 = 0.4  # type: float
            prioritized_replay_beta_iters = 100000
            self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,\
                                                initial_p=prioritized_replay_beta0,\
                                                final_p=1.0)
            self.prioritized_replay_eps = 1e-6
        else:
            self.replayBuffer = Replay(
                self.memory_size,
                self.env,
                n_steps=self.n_steps,
                gamma=self.gamma)  # self-implemented memory buffer