示例#1
0
 def __init__(self, load_model=False, model_path=None):
     self.preprocessor = DTPytorchWrapper()
     self.model = DDPG(state_dim=self.preprocessor.shape,
                       action_dim=2,
                       max_action=1,
                       net_type="cnn")
     self.current_image = np.zeros((640, 480, 3))
class PytorchRLBaseline:

    def init(self, context: Context):
        context.info('init()')

        self.image_processor = DTPytorchWrapper()
        self.action_processor = ActionWrapper(FakeWrap())
        from model import DDPG

        self.check_gpu_available(context)

        self.model = DDPG(state_dim=self.image_processor.shape, action_dim=2, max_action=1, net_type="cnn")
        self.current_image = np.zeros((640, 480, 3))
        self.model.load("model", directory="./models")

    def check_gpu_available(self, context: Context):
        import torch
        available = torch.cuda.is_available()
        req = os.environ.get('AIDO_REQUIRE_GPU', None)
        context.info(f'torch.cuda.is_available = {available!r} AIDO_REQUIRE_GPU = {req!r}')
        context.info('init()')
        if available:
            i = torch.cuda.current_device()
            count = torch.cuda.device_count()
            name = torch.cuda.get_device_name(i)
            context.info(f'device {i} of {count}; name = {name!r}')
        else:
            if req is not None:
                msg = 'I need a GPU; bailing.'
                context.error(msg)
                raise RuntimeError(msg)

    def on_received_seed(self, data: int):
        np.random.seed(data)

    def on_received_episode_start(self, context: Context, data: EpisodeStart):
        context.info(f'Starting episode "{data.episode_name}".')

    def on_received_observations(self, data: DB20Observations):
        camera: JPGImage = data.camera
        obs = jpg2rgb(camera.jpg_data)
        self.current_image = self.image_processor.preprocess(obs)

    def compute_action(self, observation):
        action = self.model.predict(observation)

        return self.action_processor.action(action.astype(float))

    def on_received_get_commands(self, context: Context):
        pwm_left, pwm_right = self.compute_action(self.current_image)
        pwm_left = float(np.clip(pwm_left, -1, +1))
        pwm_right = float(np.clip(pwm_right, -1, +1))
        grey = RGB(0.0, 0.0, 0.0)
        led_commands = LEDSCommands(grey, grey, grey, grey, grey)
        pwm_commands = PWMCommands(motor_left=pwm_left, motor_right=pwm_right)
        commands = DB20Commands(pwm_commands, led_commands)
        context.write('commands', commands)

    def finish(self, context: Context):
        context.info('finish()')
示例#3
0
    def __init__(self, load_model=False, model_path=None):
        self.preprocessor = DTPytorchWrapper()
        self.model = DDPG(state_dim=self.preprocessor.shape, action_dim=2, max_action=1, net_type="cnn")
        self.current_image = np.zeros((640, 480, 3))

        if load_model:
            fp = model_path if model_path else "model"
            self.model.load(fp, "models", for_inference=True)
    def init(self, context: Context):
        context.info('init()')

        self.image_processor = DTPytorchWrapper()
        self.action_processor = ActionWrapper(FakeWrap())
        from model import DDPG

        self.check_gpu_available(context)

        self.model = DDPG(state_dim=self.image_processor.shape, action_dim=2, max_action=1, net_type="cnn")
        self.current_image = np.zeros((640, 480, 3))
        self.model.load("model", directory="./models")
示例#5
0
class PytorchRLTemplateAgent:
    def __init__(self, load_model=False, model_path=None):
        logger.info('PytorchRLTemplateAgent init')
        self.preprocessor = DTPytorchWrapper()

        self.model = DDPG(state_dim=self.preprocessor.shape,
                          action_dim=2,
                          max_action=1,
                          net_type="cnn")
        self.current_image = np.zeros((640, 480, 3))

        if load_model:
            logger.info('PytorchRLTemplateAgent loading models')
            fp = model_path if model_path else "model"
            self.model.load(fp, "models", for_inference=True)
        logger.info('PytorchRLTemplateAgent init complete')

    def init(self, context: Context):
        context.info('init()')

    def on_received_seed(self, data: int):
        np.random.seed(data)

    def on_received_episode_start(self, context: Context, data: EpisodeStart):
        context.info(f'Starting episode "{data.episode_name}".')

    def on_received_observations(self, data: Duckiebot1Observations):
        camera: JPGImage = data.camera
        obs = jpg2rgb(camera.jpg_data)
        self.current_image = self.preprocessor.preprocess(obs)

    def compute_action(self, observation):
        #if observation.shape != self.preprocessor.transposed_shape:
        #    observation = self.preprocessor.preprocess(observation)
        action = self.model.predict(observation)
        return action.astype(float)

    def on_received_get_commands(self, context: Context):
        pwm_left, pwm_right = self.compute_action(self.current_image)

        pwm_left = float(np.clip(pwm_left, -1, +1))
        pwm_right = float(np.clip(pwm_right, -1, +1))

        grey = RGB(0.0, 0.0, 0.0)
        led_commands = LEDSCommands(grey, grey, grey, grey, grey)
        pwm_commands = PWMCommands(motor_left=pwm_left, motor_right=pwm_right)
        commands = Duckiebot1Commands(pwm_commands, led_commands)
        context.write('commands', commands)

    def finish(self, context: Context):
        context.info('finish()')
示例#6
0
    def __init__(self, load_model=False, model_path=None):
        logger.info('PytorchRLTemplateAgent init')
        self.preprocessor = DTPytorchWrapper()

        self.model = DDPG(state_dim=self.preprocessor.shape,
                          action_dim=2,
                          max_action=1,
                          net_type="cnn")
        self.current_image = np.zeros((640, 480, 3))

        if load_model:
            logger.info('PytorchRLTemplateAgent loading models')
            fp = model_path if model_path else "model"
            self.model.load(fp, "models", for_inference=True)
        logger.info('PytorchRLTemplateAgent init complete')
    def init(self, context: Context):
        self.check_gpu_available(context)
        logger.info("PytorchRLTemplateAgent init")
        from model import DDPG

        self.preprocessor = DTPytorchWrapper()

        self.model = DDPG(state_dim=self.preprocessor.shape,
                          action_dim=2,
                          max_action=1,
                          net_type="cnn")
        self.current_image = np.zeros((640, 480, 3))

        if self.load_model:
            logger.info("Pytorch Template Agent loading models")
            fp = self.model_path if self.model_path else "model"
            self.model.load(fp, "models", for_inference=True)
        logger.info("PytorchRLTemplateAgent init complete")
示例#8
0
    def __init__(self, obs_space, action_space, ram, writer, device, args):
        """
        :param obs_space: Dimensions of state (int)
        :param action_space: Dimension of action (int)
        :param ram: replay memory buffer object
        :return:
        """
        self.state_dim = obs_space.shape[0]
        self.action_dim = action_space.shape[0]
        self.action_high = action_space.high
        self.action_low = action_space.low
        self.ram = ram
        self.iter = 1
        self.steps = 0
        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.decay_rate = args.decay_rate
        self.eps_start = args.eps_start
        self.eps_end = args.eps_end
        self.eps_decay = args.eps_decay
        self.start_step = args.start_learning
        self.device = device
        self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim)
        self.writer = writer
        self.args = args

        # init network
        target_net = DDPG(obs_space.shape, self.action_dim, args).to(device)
        learn_net = DDPG(obs_space.shape, self.action_dim, args).to(device)
        utils.hard_update(target_net, learn_net)
        self.AC = learn_net
        self.AC_T = target_net
        self.actor_optimizer = torch.optim.Adam(
            self.AC.actor.policyNet.parameters(), args.lr_a)
        self.critic_optimizer = torch.optim.Adam(self.AC.critic.parameters(),
                                                 args.lr_c)
        self.actor = self.AC.actor
        self.target_actor = self.AC_T.actor
        self.critic = self.AC.critic
        self.target_critic = self.AC_T.critic
示例#9
0
    def __init__(self, hparams):
        super(HER, self).__init__()

        self.hparams = hparams

        self.test_env = make_env(hparams, render=self.hparams.render_test)
        sample_obs = self.test_env.observation_space['observation'].sample()
        sample_goal = self.test_env.observation_space['achieved_goal'].sample()

        # HARD CODED VALUES FOR Bullet-HRL
        action_limits, state_limits = get_env_boundaries()
        action_offset, action_bounds, action_clip_low, action_clip_high = action_limits

        state_shape = sample_obs.shape[0]
        action_shape = self.test_env.action_space.shape[0]
        goal_shape = sample_goal.shape[0]
        self.action_clips = (action_clip_low, action_clip_high)

        self.model = DDPG(params=self.hparams,
                          obs_size=state_shape,
                          goal_size=goal_shape,
                          act_size=action_shape,
                          action_clips=(action_clip_low, action_clip_high),
                          action_bounds=action_bounds,
                          action_offset=action_offset)

        self.model.actor.share_memory()
        self.model.critic.share_memory()

        self.state_normalizer = Normalizer(
            state_shape, default_clip_range=self.hparams.clip_range)
        self.goal_normalizer = Normalizer(
            goal_shape, default_clip_range=self.hparams.clip_range)

        self.replay_buffer = SharedReplayBuffer(self.hparams.buffer_size,
                                                state_shape, action_shape,
                                                goal_shape)
示例#10
0
def solve(params, cis):
    # python has dynamic typing, the line below can help IDEs with autocompletion
    assert isinstance(cis, ChallengeInterfaceSolution)
    # after this cis. will provide you with some autocompletion in some IDEs (e.g.: pycharm)
    cis.info('Creating model.')
    # you can have logging capabilties through the solution interface (cis).
    # the info you log can be retrieved from your submission files.

    # We get environment from the Evaluation Engine
    cis.info('Making environment')
    env = gym.make(params['env'])

    # === BEGIN SUBMISSION ===

    # If you created custom wrappers, you also need to copy them into this folder.

    from wrappers import NormalizeWrapper, ImgWrapper, ActionWrapper, ResizeWrapper

    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    # to make the images pytorch-conv-compatible
    env = ImgWrapper(env)
    env = ActionWrapper(env)

    # you ONLY need this wrapper if you trained your policy on [speed,steering angle]
    # instead [left speed, right speed]
    env = SteeringToWheelVelWrapper(env)

    # you have to make sure that you're wrapping at least the actions
    # and observations in the same as during training so that your model
    # receives the same kind of input, because that's what it's trained for
    # (for example if your model is trained on grayscale images and here
    # you _don't_ make it grayscale too, then your model wont work)

    # HERE YOU NEED TO CREATE THE POLICY NETWORK SAME AS YOU DID IN THE TRAINING CODE
    # if you aren't using the DDPG baseline code, then make sure to copy your model
    # into the model.py file and that it has a model.predict(state) method.
    from model import DDPG

    model = DDPG(state_dim=env.observation_space.shape,
                 action_dim=2,
                 max_action=1,
                 net_type="cnn")

    try:
        model.load("model", "models")

        # === END SUBMISSION ===

        # Then we make sure we have a connection with the environment and it is ready to go
        cis.info('Reset environment')
        observation = env.reset()

        # While there are no signal of completion (simulation done)
        # we run the predictions for a number of episodes, don't worry, we have the control on this part
        while True:
            # we passe the observation to our model, and we get an action in return
            action = model.predict(observation)
            # we tell the environment to perform this action and we get some info back in OpenAI Gym style
            observation, reward, done, info = env.step(action)
            # here you may want to compute some stats, like how much reward are you getting
            # notice, this reward may no be associated with the challenge score.

            # it is important to check for this flag, the Evalution Engine will let us know when should we finish
            # if we are not careful with this the Evaluation Engine will kill our container and we will get no score
            # from this submission
            if 'simulation_done' in info:
                cis.info('simulation_done received.')
                break
            if done:
                cis.info('Episode done; calling reset()')
                env.reset()

    finally:
        # release CPU/GPU resources, let's be friendly with other users that may need them
        cis.info('Releasing resources')
        try:
            model.close()
        except:
            msg = 'Could not call model.close():\n%s' % traceback.format_exc()
            cis.error(msg)
    cis.info('Graceful exit of solve()')
gamma = 0.99  # 用多少比例的 critic value來當作target q value
var = 3.0  # 動作搜索變異性

if __name__ == '__main__':
    # Create environment
    env = gym.make('Pendulum-v0').unwrapped
    n_state = env.observation_space.shape[0]  # 提取state的維度
    n_action = env.action_space.shape[0]  # 提取action的維度
    a_limit = env.action_space.high[0]  # 提取action連續動作中,最大的可能數值

    # Create network
    net = DDPG(n_state=n_state,
               n_action=n_action,
               a_limit=a_limit,
               model_folder=model_folder,
               memory_size=memory_size,
               batch_size=batch_size,
               tau=tau,
               gamma=gamma,
               var=var)
    net.load()

    # Train
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        for j in range(max_iter):
            # env.render()
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)
示例#12
0
def master_loop(env):

    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    fileHandler = logging.FileHandler('./log/test.log')
    fileHandler.setFormatter(formatter)

    logger.addHandler(fileHandler)
    logger.setLevel(logging.INFO)

    s_dim = env.get_s_dim()
    a_dim = env.get_a_dim()
    a_high = env.get_a_high()
    a_low = env.get_a_low()
    # print(a_bound)
    print("s_dim: {}, a_dim{}, a_high:{}, a_low:{}".format(
        s_dim, a_dim, a_high, a_low))
    ddpg = DDPG(a_dim,
                s_dim,
                a_high,
                a_low,
                lr_a=LR_A,
                lr_c=LR_C,
                gamma=GAMMA,
                tau=TAU,
                rpm_size=MEMORY_CAPACITY,
                batch_size=BATCH_SIZE)

    status = MPI.Status()
    start_time = time.time()
    reset_time = time.time()

    total_eps = 0
    total_step = 0

    n_step = 0
    n_eps = 0

    max_reward = -9999
    max_reward_rank = 0

    ddpg.load()

    while total_eps < MAX_EPISODES:
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        source = status.Get_source()
        tag = status.Get_tag()

        if tag == REQ_ACTION:
            # action = env.action_space.sample()
            action = ddpg.choose_action(data)
            comm.send((action, total_eps, total_step),
                      dest=source,
                      tag=RSP_ACTION)

        elif tag == OBS_DATA:
            n_step += 1
            total_step += 1
            (s, a, r, s_, done, ep_reward, ep_step) = data
            is_done = 0.0
            if done:
                is_done = 1.0

            ddpg.store_transition(s, a, r, s_, is_done)

            if ddpg.pointer > LEARN_START and total_step % 3 == 0:
                ddpg.learn()

            if done:
                total_eps += 1
                if ep_reward > max_reward:
                    max_reward = ep_reward
                    max_reward_rank = source

                s = "eps: {:>8}, worker: {:>3}, ep_reward:{:7.4f}, max:{:7.4f}/{:>3}, step:{:4}".format(
                    total_eps, source, ep_reward, max_reward, max_reward_rank,
                    ep_step)
                #print(s)
                logging.info(s)

                if total_eps % 500 == 0:
                    ddpg.save(total_eps)
                    interval = time.time() - reset_time
                    s = "# total_step: {:>8} ,total_eps: {:>6} eps/min: {:>6}, frame/sec: {:>6}".format(
                        total_step, total_eps, n_eps / interval * 60,
                        n_step / interval)
                    #print(s)
                    logging.info(s)

                    n_step = 0
                    n_eps = 0
                    reset_time = time.time()
    if not os.path.exists(param_path):
        print("创建参数文件夹")
        os.makedirs(param_path)
    if not os.path.exists(log_path):
        print("创建日志文件夹")
        os.makedirs(log_path)
    env = make_env(env_name)
    obs_ls = env.reset()        # 初始化状态

    global_input_size = 0
    for cv in obs_ls:
        global_input_size += len(cv)
    for action_space in env.action_space:
        global_input_size += action_space.n
    # 初始化模型
    agent_models = [DDPG(str(i), len(obs_ls[i]),  env.action_space[i].n, global_input_size,  MEM_LEN, LEARNING_RATE) for i in range(len(env.world.agents))]
    target_models = [DDPG(str(i), len(obs_ls[i]),  env.action_space[i].n, global_input_size,  MEM_LEN, LEARNING_RATE) for i in range(len(env.world.agents))]
    for idx, model in enumerate(target_models):
        model.load_state_dict(agent_models[idx].state_dict())


    if LOAD_KEY:
        for idx, model in enumerate(agent_models):
            if idx == 0:
                check_point = torch.load('./param/DDPGagent0_listener_5000.pkl')
            else:
                check_point = torch.load('./param/DDPGagent1_listener_5m000.pkl')
            model.load_state_dict(check_point)

    for epo_i in range(MAX_EPOCH):
        obs_ls = env.reset()
示例#14
0
max_iter = 200
model_folder = './model'
var = 0.0                   # 動作搜索變異性

if __name__ == '__main__':
    # Create environment
    env = gym.make('Pendulum-v0').unwrapped
    n_state = env.observation_space.shape[0]        # 提取state的維度
    n_action = env.action_space.shape[0]            # 提取action的維度
    a_limit = env.action_space.high[0]              # 提取action連續動作中,最大的可能數值

    # Create network
    net = DDPG(
        n_state = n_state,
        n_action = n_action,
        a_limit = a_limit,
        model_folder = model_folder,
        var = var
    )
    net.load()

    # Train
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        for j in range(max_iter):
            env.render()
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)
class PytorchRLTemplateAgent:
    def __init__(self):
        pass

    def init(self, context: Context, load_model=False, model_path=None):
        self.check_gpu_available(context)
        logger.info('PytorchRLTemplateAgent init')
        self.preprocessor = DTPytorchWrapper()

        self.model = DDPG(state_dim=self.preprocessor.shape,
                          action_dim=2,
                          max_action=1,
                          net_type="cnn")
        self.current_image = np.zeros((640, 480, 3))

        if load_model:
            logger.info('PytorchRLTemplateAgent loading models')
            fp = model_path if model_path else "model"
            self.model.load(fp, "models", for_inference=True)
        logger.info('PytorchRLTemplateAgent init complete')

    def check_gpu_available(self, context: Context):
        available = torch.cuda.is_available()
        req = os.environ.get('AIDO_REQUIRE_GPU', None)
        context.info(
            f'torch.cuda.is_available = {available!r} AIDO_REQUIRE_GPU = {req!r}'
        )
        context.info('init()')
        if available:
            i = torch.cuda.current_device()
            count = torch.cuda.device_count()
            name = torch.cuda.get_device_name(i)
            context.info(f'device {i} of {count}; name = {name!r}')
        else:
            if req is not None:
                msg = 'I need a GPU; bailing.'
                context.error(msg)
                raise Exception(msg)

    def on_received_seed(self, data: int):
        np.random.seed(data)

    def on_received_episode_start(self, context: Context, data: EpisodeStart):
        context.info(f'Starting episode "{data.episode_name}".')

    def on_received_observations(self, data: DB20Observations):
        camera: JPGImage = data.camera
        obs = jpg2rgb(camera.jpg_data)
        self.current_image = self.preprocessor.preprocess(obs)

    def compute_action(self, observation):
        #if observation.shape != self.preprocessor.transposed_shape:
        #    observation = self.preprocessor.preprocess(observation)
        action = self.model.predict(observation)
        return action.astype(float)

    def on_received_get_commands(self, context: Context):
        pwm_left, pwm_right = self.compute_action(self.current_image)

        pwm_left = float(np.clip(pwm_left, -1, +1))
        pwm_right = float(np.clip(pwm_right, -1, +1))

        grey = RGB(0.0, 0.0, 0.0)
        led_commands = LEDSCommands(grey, grey, grey, grey, grey)
        pwm_commands = PWMCommands(motor_left=pwm_left, motor_right=pwm_right)
        commands = DB20Commands(pwm_commands, led_commands)
        context.write('commands', commands)

    def finish(self, context: Context):
        context.info('finish()')
class PytorchRLTemplateAgent:
    def __init__(self, load_model: bool, model_path: Optional[str]):
        self.load_model = load_model
        self.model_path = model_path

    def init(self, context: Context):
        self.check_gpu_available(context)
        logger.info("PytorchRLTemplateAgent init")
        from model import DDPG

        self.preprocessor = DTPytorchWrapper()

        self.model = DDPG(state_dim=self.preprocessor.shape,
                          action_dim=2,
                          max_action=1,
                          net_type="cnn")
        self.current_image = np.zeros((640, 480, 3))

        if self.load_model:
            logger.info("Pytorch Template Agent loading models")
            fp = self.model_path if self.model_path else "model"
            self.model.load(fp, "models", for_inference=True)
        logger.info("PytorchRLTemplateAgent init complete")

    def check_gpu_available(self, context: Context):
        import torch

        available = torch.cuda.is_available()
        context.info(f"torch.cuda.is_available = {available!r}")
        context.info("init()")
        if available:
            i = torch.cuda.current_device()
            count = torch.cuda.device_count()
            name = torch.cuda.get_device_name(i)
            context.info(f"device {i} of {count}; name = {name!r}")
        else:
            no_hardware_GPU_available(context)

    def on_received_seed(self, data: int):
        np.random.seed(data)

    def on_received_episode_start(self, context: Context, data: EpisodeStart):
        context.info(f'Starting episode "{data.episode_name}".')

    def on_received_observations(self, data: DB20Observations):
        camera: JPGImage = data.camera
        obs = jpg2rgb(camera.jpg_data)
        self.current_image = self.preprocessor.preprocess(obs)

    def compute_action(self, observation):
        # if observation.shape != self.preprocessor.transposed_shape:
        #    observation = self.preprocessor.preprocess(observation)
        action = self.model.predict(observation)
        return action.astype(float)

    def on_received_get_commands(self, context: Context):
        pwm_left, pwm_right = self.compute_action(self.current_image)

        pwm_left = float(np.clip(pwm_left, -1, +1))
        pwm_right = float(np.clip(pwm_right, -1, +1))

        grey = RGB(0.0, 0.0, 0.0)
        led_commands = LEDSCommands(grey, grey, grey, grey, grey)
        pwm_commands = PWMCommands(motor_left=pwm_left, motor_right=pwm_right)
        commands = DB20Commands(pwm_commands, led_commands)
        context.write("commands", commands)

    def finish(self, context: Context):
        context.info("finish()")
示例#17
0
            test_set.append(
                (user, list(test_user.loc[i:i + 9, 'itemId']),
                 test_user.loc[i + 10,
                               'itemId'], test_user.loc[i + 9, 'timestamp'] -
                 test_user.loc[i + 8, 'timestamp'],
                 float(test_user.loc[i + 10, 'reward']),
                 float(test_user.loc[i + 10, 'objective1']),
                 float(test_user.loc[i + 10, 'objective2'])))
train_set = train_set[:len(train_set) // batch_size * batch_size]
test_set = test_set[:len(test_set) // batch_size * batch_size]

start_time = time.time()

gpu_options = tf.GPUOptions(allow_growth=True)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    primary_network = DDPG(hidden_size, 'primary_network')
    target_network = DDPG(hidden_size, 'target_network')
    model = Reinforce_Model(user_count, item_count, hidden_size, batch_size,
                            primary_network, target_network)

    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    print('Objective1_Value: %.4f\t Objective2_Value: %.4f\t' %
          evaluate(sess, model, train_set))
    sys.stdout.flush()
    lr = 1
    start_time = time.time()
    last_auc = 0.0

    for epoch in range(100):
示例#18
0
class HER(pl.LightningModule):
    def __init__(self, hparams):
        super(HER, self).__init__()

        self.hparams = hparams

        self.test_env = make_env(hparams, render=self.hparams.render_test)
        sample_obs = self.test_env.observation_space['observation'].sample()
        sample_goal = self.test_env.observation_space['achieved_goal'].sample()

        # HARD CODED VALUES FOR Bullet-HRL
        action_limits, state_limits = get_env_boundaries()
        action_offset, action_bounds, action_clip_low, action_clip_high = action_limits

        state_shape = sample_obs.shape[0]
        action_shape = self.test_env.action_space.shape[0]
        goal_shape = sample_goal.shape[0]
        self.action_clips = (action_clip_low, action_clip_high)

        self.model = DDPG(params=self.hparams,
                          obs_size=state_shape,
                          goal_size=goal_shape,
                          act_size=action_shape,
                          action_clips=(action_clip_low, action_clip_high),
                          action_bounds=action_bounds,
                          action_offset=action_offset)

        self.model.actor.share_memory()
        self.model.critic.share_memory()

        self.state_normalizer = Normalizer(
            state_shape, default_clip_range=self.hparams.clip_range)
        self.goal_normalizer = Normalizer(
            goal_shape, default_clip_range=self.hparams.clip_range)

        self.replay_buffer = SharedReplayBuffer(self.hparams.buffer_size,
                                                state_shape, action_shape,
                                                goal_shape)

    def log_func(self, d):
        self.log_dict(d, on_step=True, prog_bar=True)

    def collate_fn(self, batch):
        return collate.default_convert(batch)

    def __dataloader(self) -> DataLoader:
        dataset = RLDataset(self.replay_buffer, self.hparams.batch_size,
                            self.hparams.n_batches,
                            self.hparams.replay_initial)
        dataloader = DataLoader(dataset=dataset,
                                collate_fn=self.collate_fn,
                                batch_size=1,
                                num_workers=1,
                                pin_memory=True)
        return dataloader

    def train_dataloader(self):
        return self.__dataloader()

    def __testloader(self):
        testset = TestDataset(hparams=self.hparams,
                              test_env=self.test_env,
                              model=self.model,
                              state_normalizer=self.state_normalizer,
                              goal_normalizer=self.goal_normalizer)
        testloader = DataLoader(dataset=testset, batch_size=1)

        return testloader

    def val_dataloader(self):
        return self.__testloader()

    def configure_optimizers(self):
        return [self.model.crt_opt, self.model.act_opt], []

    def training_step(self, batch, batch_idx, optimizer_idx):
        states_v, actions_v, next_states_v, rewards_v, dones_mask, goals_v = batch[
            0]
        norm_states_v = self.state_normalizer.normalize(states_v)
        norm_goals_v = self.goal_normalizer.normalize(goals_v)
        if optimizer_idx == 0:
            norm_next_states_v = self.state_normalizer.normalize(next_states_v)
            # train critic
            q_v = self.model.critic(norm_states_v, norm_goals_v, actions_v)
            with torch.no_grad():
                next_act_v = self.model.tgt_act_net(norm_next_states_v,
                                                    norm_goals_v)
                q_next_v = self.model.tgt_crt_net(norm_next_states_v,
                                                  norm_goals_v, next_act_v)
                q_next_v[dones_mask] = 0.0
                q_ref_v = rewards_v.unsqueeze(
                    dim=-1) + q_next_v * self.hparams.gamma
                # clip the q value
                clip_return = 1 / (1 - self.hparams.gamma)
                q_ref_v = torch.clamp(q_ref_v, -clip_return, 0)
            critic_loss_v = F.mse_loss(q_v, q_ref_v.detach())
            tqdm_dict = {'critic_loss': critic_loss_v}
            self.log_dict(tqdm_dict, prog_bar=True)

            return critic_loss_v

        elif optimizer_idx == 1:
            # train actor
            self.model.actor.offset.requires_grad = False
            self.model.actor.action_bounds.requires_grad = False

            cur_actions_v = self.model.actor(norm_states_v, norm_goals_v)
            actor_loss_v = -self.model.critic(norm_states_v, norm_goals_v,
                                              cur_actions_v).mean()
            actor_loss_v += ((cur_actions_v - self.model.actor.offset) /
                             self.model.actor.action_bounds).pow(2).mean()
            tqdm_dict = {'actor_loss': actor_loss_v}
            self.log_dict(tqdm_dict, prog_bar=True)

            if batch_idx % self.hparams.sync_batches == 0:
                self.model.alpha_sync(self.hparams.polyak)

            return actor_loss_v

    def validation_step(self, batch, batch_idx):
        to_log = dict()
        for k, v in batch.items():
            to_log[k] = v.detach().cpu().numpy()
        to_log['epoch_nr'] = int(self.current_epoch)
        if self.logger is not None:
            self.logger.experiment.log(to_log)