Example #1
    def __init__(self, args, env, env_params, test_env):
        self.args = args
        self.env = env
        self.test_env = test_env
        self.env_params = env_params
        self.device = args.device
        self.resume = args.resume
        self.resume_epoch = args.resume_epoch
        current_time = datetime.now().strftime('%b%d_%H-%M-%S')
        self.writer = SummaryWriter(
            log_dir='runs/ddpg' + current_time + '_' + str(args.env_name) +
                    str(args.lr_critic) + '_' + str(args.gamma) + '_' + str(args.fps))
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir,
                                       self.args.env_name + "_" + current_time)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        self.actor_network = actor(env_params)
        self.actor_target_network = actor(env_params)
        self.critic_network = criticWrapper(self.env_params, self.args)
        self.critic_target_network = criticWrapper(self.env_params, self.args)

        self.start_epoch = 0
        if self.resume:
            self.start_epoch = self.resume_epoch
            # restore the actor/critic weights from the checkpoint directory
            self.actor_network.load_state_dict(
                torch.load(self.args.resume_path + '/actor_model_' + str(self.resume_epoch) + '.pt')[0])
            self.critic_network.load_state_dict(
                torch.load(self.args.resume_path + '/critic_model_' + str(self.resume_epoch) + '.pt')[0])
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())
        # if use gpu
        self.actor_network.to(self.device)
        self.critic_network.to(self.device)
        self.actor_target_network.to(self.device)
        self.critic_target_network.to(self.device)
        # create the optimizer

        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy,
                                      self.args.replay_k, self.args.distance,
                                      self.args.future_step)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions)
        self.planner_policy = Planner(agent=self, replay_buffer=self.buffer, fps=args.fps,
                                      clip_v=args.clip_v, n_landmark=args.landmark,
                                      initial_sample=args.initial_sample)
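
A note on the target networks above: the constructor copies the online weights into the targets once, and DDPG-style training then usually keeps the targets close to the online networks with a Polyak (soft) update. A minimal sketch of that step, assuming an illustrative tau of 0.005 and a hypothetical helper name (neither is taken from the repo):

import torch

def soft_update(target_net, source_net, tau=0.005):
    # target <- tau * source + (1 - tau) * target, applied parameter-wise
    with torch.no_grad():
        for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
            t_param.data.mul_(1.0 - tau)
            t_param.data.add_(tau * s_param.data)
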
Example #2
    def __init__(self, args, env, env_params, test_env):
        self.args = args
        self.device = args.device
        self.env = env
        self.test_env = test_env
        self.env_params = env_params
        self.action_n = env.action_space.n

        self.resume = args.resume
        self.resume_epoch = args.resume_epoch
        self.init_qnets()
        self.start_epoch = 0
        if self.resume:
            self.start_epoch = self.resume_epoch
            print('resume from stored models ...')
            # both the online and the target Q-network start from the same checkpoint
            self.Q_network.load_state_dict(
                torch.load(self.args.path + '/q_model_' +
                           str(self.resume_epoch) + '.pt')[0])
            self.targetQ_network.load_state_dict(
                torch.load(self.args.path + '/q_model_' +
                           str(self.resume_epoch) + '.pt')[0])

        current_time = datetime.now().strftime('%b%d_%H-%M-%S')
        self.writer = SummaryWriter(log_dir='runs/dqn' + current_time + '_mc' +
                                    str(args.gamma) + '_' + str(args.fps))
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir,
                                       self.args.env_name + "_" + current_time)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        self.eps = args.eps
        # load the weights into the target networks
        self.targetQ_network.load_state_dict(self.Q_network.state_dict())
        # create the optimizer
        self.q_optim = torch.optim.Adam(self.Q_network.parameters(),
                                        lr=self.args.lr)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy,
                                      self.args.replay_k, self.args.distance)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions)
        self.planner_policy = Planner(agent=self, replay_buffer=self.buffer, fps=args.fps,
                                      clip_v=args.clip_v, n_landmark=args.landmark,
                                      initial_sample=args.initial_sample)
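
Example #2 keeps self.eps and self.action_n for a discrete-action agent, which points at epsilon-greedy action selection over the Q-network. A standalone sketch of that rule with a hypothetical helper name (the repo's own selection method may differ):

import numpy as np
import torch

def select_action(q_network, obs, eps, action_n, device):
    # with probability eps pick a random action, otherwise the greedy argmax of Q(s, .)
    if np.random.rand() < eps:
        return np.random.randint(action_n)
    with torch.no_grad():
        obs_t = torch.as_tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
        return int(q_network(obs_t).argmax(dim=1).item())
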
Example #3
    def __init__(self,
                 args,
                 env,
                 env_params,
                 test_env,
                 test_env1=None,
                 test_env2=None):
        self.args = args
        self.env = env
        self.test_env = test_env
        self.env_params = env_params
        self.device = args.device
        self.resume = args.resume
        self.resume_epoch = args.resume_epoch
        self.not_train_low = False
        self.test_env1 = test_env1
        self.test_env2 = test_env2
        self.old_sample = args.old_sample

        self.low_dim = env_params['obs']
        self.env_params['low_dim'] = self.low_dim
        self.hi_dim = env_params['obs']
        print("hi_dim", self.hi_dim)

        self.learn_goal_space = True
        self.whole_obs = False  # use whole observation space as subgoal space
        self.abs_range = abs_range = args.abs_range  # absolute goal range
        self.feature_reg = 0.0  # feature l2 regularization
        print("abs_range", abs_range)

        if args.env_name.startswith("Fetch"):
            maze_low = self.env.env.initial_gripper_xpos[:2] - self.env.env.target_range
            maze_high = self.env.env.initial_gripper_xpos[:2] + self.env.env.target_range
            self.hi_act_space = gym.spaces.Box(low=maze_low, high=maze_high)
        else:
            if args.env_name != "NChain-v1":
                self.hi_act_space = self.env.env.maze_space
            else:
                self.hi_act_space = gym.spaces.Box(low=np.array([-1]),
                                                   high=np.array([1]))
        if self.learn_goal_space:
            if args.env_name == "NChain-v1":
                self.hi_act_space = gym.spaces.Box(low=np.array([-abs_range]),
                                                   high=np.array([abs_range]))
            else:
                self.hi_act_space = gym.spaces.Box(
                    low=np.array([-abs_range, -abs_range]),
                    high=np.array([abs_range, abs_range]))
        if self.whole_obs:
            vel_low = [-10.] * 4
            vel_high = [10.] * 4
            maze_low = np.concatenate(
                (self.env.env.maze_low, np.array(vel_low)))
            maze_high = np.concatenate(
                (self.env.env.maze_high, np.array(vel_high)))
            self.hi_act_space = gym.spaces.Box(low=maze_low, high=maze_high)

        dense_low = True
        self.low_use_clip = not dense_low  # only the sparse-reward setting uses clipping
        if args.replay_strategy == "future":
            self.low_forward = True
            assert self.low_use_clip is True
        else:
            self.low_forward = False
            assert self.low_use_clip is False
        self.hi_sparse = (self.env.env.reward_type == "sparse")

        # params of learning phi
        resume_phi = args.resume
        self.not_update_phi = False
        phi_path = args.resume_path

        # resume_phi = True
        # phi_path = 'saved_models/AntMaze1-v1_Jun01_19-26-19'
        # self.not_update_phi = True

        self.save_fig = False
        self.save_model = False
        self.start_update_phi = args.start_update_phi
        self.early_stop = args.early_stop  # after the success rate converges, stop updating the low-level policy and the feature
        if args.env_name in ['AntPush-v1', 'AntFall-v1']:
            if self.not_update_phi:
                self.early_stop_thres = 900
            else:
                self.early_stop_thres = 3500
        elif args.env_name in ["PointMaze1-v1"]:
            self.early_stop_thres = 2000
        elif args.env_name == "AntMaze1-v1":
            self.early_stop_thres = 3000
        else:
            self.early_stop_thres = args.n_epochs
        print("early_stop_threshold", self.early_stop_thres)
        self.success_log = []

        # scaling = self.env.env.env.MAZE_SIZE_SCALING
        # print("scaling", scaling)

        self.count_latent = False
        if self.count_latent:
            self.hash = HashingBonusEvaluator(512, 2)
        self.count_obs = False
        if self.count_obs:
            self.hash = HashingBonusEvaluator(512, env_params['obs'])

        self.high_correct = False
        self.k = args.c
        self.delta_k = 0
        self.prediction_coeff = 0.0
        tanh_output = False
        self.use_prob = False
        print("prediction_coeff", self.prediction_coeff)

        if args.save:
            current_time = datetime.now().strftime('%b%d_%H-%M-%S')
            self.log_dir = 'runs/hier/' + str(args.env_name) + '/RB_Decay_' + current_time + \
                            "_C_" + str(args.c) + "_Image_" + str(args.image) + \
                            "_Seed_" + str(args.seed) + "_Reward_" + str(args.low_reward_coeff) + \
                            "_NoPhi_" + str(self.not_update_phi) + "_LearnG_" + str(self.learn_goal_space) + "_Early_" + str(self.early_stop_thres) + str(args.early_stop)
            self.writer = SummaryWriter(log_dir=self.log_dir)
            if not os.path.exists(self.args.save_dir):
                os.mkdir(self.args.save_dir)
            # path to save the model
            self.model_path = os.path.join(
                self.args.save_dir, self.args.env_name + "_" + current_time)
            if not os.path.exists(self.model_path):
                os.mkdir(self.model_path)
        # init low-level network
        # the low-level goal space and the high-level action space share this dimension
        self.real_goal_dim = self.hi_act_space.shape[0]
        self.init_network()
        # init high-level agent
        self.hi_agent = SAC(self.hi_dim + env_params['goal'],
                            self.hi_act_space, args, False, env_params['goal'],
                            args.gradient_flow_value, args.abs_range,
                            tanh_output)
        self.env_params['real_goal_dim'] = self.real_goal_dim
        self.hi_buffer = ReplayMemory(args.buffer_size)

        # her sampler
        self.c = self.args.c  # interval of high level action
        self.low_her_module = her_sampler(
            args.replay_strategy,
            args.replay_k,
            args.distance,
            args.future_step,
            dense_reward=dense_low,
            direction_reward=False,
            low_reward_coeff=args.low_reward_coeff)
        if args.env_name.startswith("Fetch"):
            self.low_buffer = replay_buffer_energy(
                self.env_params, self.args.buffer_size,
                self.low_her_module.sample_her_energy, args.env_name)
        else:
            self.low_buffer = replay_buffer(
                self.env_params, self.args.buffer_size,
                self.low_her_module.sample_her_transitions)

        # by default restore the high-level policy but skip the replay buffer
        not_load_buffer, not_load_high = True, False
        if self.resume:
            self.start_epoch = self.resume_epoch
            if not not_load_high:
                self.hi_agent.policy.load_state_dict(torch.load(self.args.resume_path + \
                                                              '/hi_actor_model.pt', map_location='cuda:4')[0])
                # self.hi_agent.critic.load_state_dict(torch.load(self.args.resume_path + \
                #                                                '/hi_critic_model.pt', map_location='cuda:4')[0])

            # print("not load low !!!")
            print("load low !!!")
            self.low_actor_network.load_state_dict(torch.load(self.args.resume_path + \
                                                             '/low_actor_model.pt', map_location='cuda:4')[0])
            self.low_critic_network.load_state_dict(torch.load(self.args.resume_path + \
                                                              '/low_critic_model.pt', map_location='cuda:4')[0])

            if not not_load_buffer:
                # self.hi_buffer = torch.load(self.args.resume_path + '/hi_buffer.pt', map_location='cuda:1')
                self.low_buffer = torch.load(self.args.resume_path +
                                             '/low_buffer.pt',
                                             map_location='cuda:1')

        # sync target network of low-level
        self.sync_target()

        if hasattr(self.env.env, 'env'):
            self.animate = self.env.env.env.visualize_goal
        else:
            self.animate = self.args.animate
        self.distance_threshold = self.args.distance

        if not (args.gradient_flow or args.use_prediction
                or args.gradient_flow_value):
            self.representation = RepresentationNetwork(
                env_params, 3, self.abs_range,
                self.real_goal_dim).to(args.device)
            if args.use_target:
                self.target_phi = RepresentationNetwork(
                    env_params, 3, self.abs_range, 2).to(args.device)
                # load the weights into the target networks
                self.target_phi.load_state_dict(
                    self.representation.state_dict())
            self.representation_optim = torch.optim.Adam(
                self.representation.parameters(), lr=0.0001)
            if resume_phi:
                print("load phi from: ", phi_path)
                self.representation.load_state_dict(torch.load(phi_path + \
                                                               '/phi_model_4000.pt', map_location='cuda:4')[0])
        elif args.use_prediction:
            self.representation = DynamicsNetwork(env_params,
                                                  self.abs_range,
                                                  2,
                                                  tanh_output=tanh_output,
                                                  use_prob=self.use_prob,
                                                  device=args.device).to(
                                                      args.device)
            self.representation_optim = torch.optim.Adam(
                self.representation.parameters(), lr=0.0001)
            if resume_phi:
                print("load phi from: ", phi_path)
                self.representation.load_state_dict(torch.load(phi_path + \
                                                               '/phi_model_4000.pt', map_location='cuda:1')[0])

        print("learn goal space", self.learn_goal_space, " update phi",
              not self.not_update_phi)
        self.train_success = 0
        self.furthest_task = 0.
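
The checkpoints in these examples are read as torch.load(path)[0], i.e. the state_dict is stored as the first element of a saved list. A matching save helper, sketched under that assumption (the helper name is hypothetical):

import torch

def save_checkpoint(network, path):
    # keep the state_dict as the first list element so that torch.load(path)[0] recovers it
    torch.save([network.state_dict()], path)
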
Example #4
    def __init__(self,
                 args,
                 env,
                 env_params,
                 test_env,
                 resume=False,
                 resume_epoch_actor=0,
                 resume_epoch_critic=0):
        self.args = args
        self.env = env
        self.test_env = test_env
        self.env_params = env_params
        self.device = args.device
        self.resume = resume
        self.resume_epoch_actor = resume_epoch_actor
        self.resume_epoch_critic = resume_epoch_critic
        current_time = datetime.now().strftime('%b%d_%H-%M-%S')
        self.writer = SummaryWriter(
            log_dir='runs/ddpg' + current_time + '_' + str(args.env_name) +
                    str(args.lr_critic) + '_' + str(args.gamma) + '_' +
                    str(args.plan_rate) + '_' + str(args.fps))
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # path to save the model
        self.model_path = os.path.join(self.args.save_dir,
                                       self.args.env_name + "_" + current_time)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)
        self.actor_network = actor(env_params)
        self.plan_rate = args.plan_rate
        self.init_critics()
        self.actor_target_network = actor(env_params)
        if self.resume:
            self.actor_network.load_state_dict(
                torch.load(self.args.path + '/actor_model_' +
                           str(self.resume_epoch_actor) + '.pt')[0])
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())
        # if use gpu
        self.actor_network.to(self.device)
        self.critic_network.to(self.device)
        self.actor_target_network.to(self.device)
        self.critic_target_network.to(self.device)
        # create the optimizer

        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=self.args.lr_actor)
        if self.args.search:
            print('using a separate learning rate for the critic gamma parameter')
            # the base critic parameters use lr_critic; the learnable gamma gets its own smaller rate
            self.critic_optim = torch.optim.Adam(
                [{'params': self.critic_network.base.parameters()},
                 {'params': self.critic_network.gamma, 'lr': 5e-5}],
                lr=self.args.lr_critic)
        else:
            self.critic_optim = torch.optim.Adam(
                self.critic_network.parameters(), lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy,
                                      self.args.replay_k, self.args.distance)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size,
                                    self.her_module.sample_her_transitions)
        # the planner enables fps only when args.fps == 1
        self.planner_policy = Planner(agent=self, framebuffer=self.buffer, fps=(args.fps == 1),
                                      clip_v=args.clip_v, n_landmark=args.landmark,
                                      initial_sample=args.initial_sample)
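
When args.search is enabled above, the critic optimizer gives critic_network.gamma its own learning rate through Adam parameter groups. A standalone illustration of that mechanism with placeholder tensors and rates (not the repo's actual modules):

import torch

base = torch.nn.Linear(4, 1)
gamma = torch.nn.Parameter(torch.tensor(0.99))
# the first group inherits the default lr, the second overrides it for gamma only
optim = torch.optim.Adam(
    [{'params': base.parameters()},
     {'params': [gamma], 'lr': 5e-5}],
    lr=1e-3)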