Example #1
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 explore_size=10000,
                 step_per_iter=3000,
                 lr_q=1e-3,
                 gamma=0.99,
                 batch_size=128,
                 min_update_step=1000,
                 epsilon=0.90,
                 update_target_gap=50,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.render = render
        self.num_process = num_process
        self.memory = Memory(size=memory_size)
        self.explore_size = explore_size
        self.step_per_iter = step_per_iter
        self.lr_q = lr_q
        self.gamma = gamma
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_target_gap = update_target_gap
        self.epsilon = epsilon
        self.seed = seed
        self.model_path = model_path

        self._init_model()
Example #2
    def __init__(self,
                 num_states,
                 num_actions,
                 learning_rate=0.01,
                 gamma=0.90,
                 batch_size=128,
                 epsilon=0.90,
                 update_target_gap=50,
                 enable_gpu=False):

        if enable_gpu:
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device("cpu")

        self.gamma = gamma
        self.batch_size = batch_size
        self.update_target_gap = update_target_gap
        self.epsilon = epsilon

        self.num_learn_step = 0

        self.memory = Memory()
        self.eval_net = MLPPolicy(num_states, num_actions).to(self.device)
        self.target_net = MLPPolicy(num_states, num_actions).to(self.device)
        self.optimizer = optim.Adam(self.eval_net.parameters(),
                                    lr=learning_rate)
        self.loss_func = nn.MSELoss()
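
Every example on this page pushes transitions into a Memory object that is never shown. The following is a minimal sketch of a compatible replay buffer, assuming the six-field transition layout (state, action, reward, next_state, mask, log_prob) implied by the push calls; the repository's actual Memory class may differ.

import random
from collections import deque, namedtuple

Transition = namedtuple(
    "Transition",
    ("state", "action", "reward", "next_state", "mask", "log_prob"))


class Memory:
    """Fixed-size replay buffer holding Transition tuples (hypothetical sketch)."""

    def __init__(self, size=None):
        # size=None gives an unbounded buffer, matching the bare Memory() call above
        self.buffer = deque(maxlen=size)

    def push(self, *args):
        # one transition: (state, action, reward, next_state, mask, log_prob)
        self.buffer.append(Transition(*args))

    def sample(self, batch_size=None):
        # return a Transition whose fields are tuples of batched values
        batch = list(self.buffer) if batch_size is None \
            else random.sample(self.buffer, batch_size)
        return Transition(*zip(*batch))

    def __len__(self):
        return len(self.buffer)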
Example #3
def collect_samples(pid, queue, env, policy, render, running_state,
                    min_batch_size):
    log = dict()
    memory = Memory()
    num_steps = 0
    num_episodes = 0

    min_episode_reward = float("inf")
    max_episode_reward = float("-inf")
    total_reward = 0

    while num_steps < min_batch_size:
        state = env.reset()
        episode_reward = 0
        if running_state:
            state = running_state(state)

        for t in range(10000):
            if render:
                env.render()

            state_tensor = tf.expand_dims(tf.convert_to_tensor(state,
                                                               dtype=TDOUBLE),
                                          axis=0)
            action, log_prob = policy.get_action_log_prob(state_tensor)
            action = action.numpy()[0]
            log_prob = log_prob.numpy()[0]
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward

            if running_state:
                next_state = running_state(next_state)

            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            memory.push(state, action, reward, next_state, mask, log_prob)
            num_steps += 1

            if done or num_steps >= min_batch_size:
                break

            state = next_state

        # num_steps += (t + 1)
        num_episodes += 1
        total_reward += episode_reward
        min_episode_reward = min(episode_reward, min_episode_reward)
        max_episode_reward = max(episode_reward, max_episode_reward)

    log["num_steps"] = num_steps
    log["num_episodes"] = num_episodes
    log["total_reward"] = total_reward
    log["avg_reward"] = total_reward / num_episodes
    log["max_episode_reward"] = max_episode_reward
    log["min_episode_reward"] = min_episode_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
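
The pid and queue arguments exist so several workers can collect samples in parallel. Below is a hedged sketch of how a caller might fan the work out with multiprocessing; this driver is hypothetical (not taken from the repository) and assumes that env and policy can be pickled and that Memory is the deque-backed sketch shown after Example #2.

import multiprocessing


def collect_samples_parallel(env, policy, running_state, min_batch_size,
                             num_process=4, render=False):
    """Hypothetical driver: split the sample budget across workers and merge."""
    queue = multiprocessing.Queue()
    per_worker = min_batch_size // num_process

    workers = []
    for pid in range(1, num_process):
        p = multiprocessing.Process(
            target=collect_samples,
            args=(pid, queue, env, policy, False, running_state, per_worker))
        workers.append(p)
        p.start()

    # the main process collects its own share (and is the only one that renders)
    memory, log = collect_samples(0, None, env, policy, render,
                                  running_state, per_worker)

    logs = [log]
    for _ in workers:
        worker_pid, worker_memory, worker_log = queue.get()
        memory.buffer.extend(worker_memory.buffer)  # assumes the deque-backed Memory
        logs.append(worker_log)
    for p in workers:
        p.join()
    return memory, logs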
Example #4
def collect_samples(pid, queue, env, policy, render, running_state,
                    custom_reward, min_batch_size):
    torch.randn(pid)  # advance the torch RNG per worker, presumably to decorrelate sampling across processes
    log = dict()
    memory = Memory()
    num_steps = 0
    num_episodes = 0

    min_episode_reward = float('inf')
    max_episode_reward = float('-inf')
    total_reward = 0

    while num_steps < min_batch_size:
        state = env.reset()
        episode_reward = 0
        if running_state:
            state = running_state(state)

        for t in range(10000):
            if render:
                env.render()
            state_tensor = FLOAT(state).unsqueeze(0)
            with torch.no_grad():
                action, log_prob = policy.get_action_log_prob(state_tensor)
            action = action.cpu().numpy()[0]
            log_prob = log_prob.cpu().numpy()[0]
            next_state, reward, done, _ = env.step(action)
            if custom_reward:
                reward = custom_reward(state, action)
            episode_reward += reward

            if running_state:
                next_state = running_state(next_state)

            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            memory.push(state, action, reward, next_state, mask, log_prob)
            num_steps += 1
            if done or num_steps >= min_batch_size:
                break

            state = next_state

        # num_steps += (t + 1)
        num_episodes += 1
        total_reward += episode_reward
        min_episode_reward = min(episode_reward, min_episode_reward)
        max_episode_reward = max(episode_reward, max_episode_reward)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_episode_reward'] = max_episode_reward
    log['min_episode_reward'] = min_episode_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
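
Both collectors, and the agent classes below, optionally pass observations through a running_state callable (a ZFilter in the imports that are not shown here). A minimal sketch of such a normalizer follows, assuming all it has to do is standardize and clip observations with running statistics; the original class likely supports more options.

import numpy as np


class ZFilter:
    """Running mean/std observation normalizer with clipping (hypothetical sketch)."""

    def __init__(self, shape, clip=10.0):
        self.clip = clip
        self.n = 0
        self.mean = np.zeros(shape)
        self.var_sum = np.zeros(shape)

    def _update(self, x):
        # Welford's online algorithm for the running mean and variance
        self.n += 1
        if self.n == 1:
            self.mean[...] = x
        else:
            old_mean = self.mean.copy()
            self.mean += (x - old_mean) / self.n
            self.var_sum += (x - old_mean) * (x - self.mean)

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        self._update(x)
        std = np.sqrt(self.var_sum / (self.n - 1)) if self.n > 1 \
            else np.ones_like(self.mean)
        return np.clip((x - self.mean) / (std + 1e-8), -self.clip, self.clip)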
Example #5
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_v=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 explore_size=10000,
                 step_per_iter=3000,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 action_noise=0.1,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.polyak = polyak
        self.memory = Memory(memory_size)
        self.explore_size = explore_size
        self.step_per_iter = step_per_iter
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.action_noise = action_noise
        self.model_path = model_path
        self.seed = seed

        self._init_model()
Example #6
class SAC_Alpha:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_a=3e-4,
                 lr_q=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 explore_size=10000,
                 step_per_iter=3000,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 target_update_delay=1,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.polyak = polyak
        self.memory = Memory(memory_size)
        self.explore_size = explore_size
        self.step_per_iter = step_per_iter
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_a = lr_a
        self.lr_q = lr_q
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.target_update_delay = target_update_delay
        self.model_path = model_path
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert env_continuous, "SAC is only applicable to environments with continuous action spaces"

        self.action_low, self.action_high = self.env.action_space.low[
            0], self.env.action_space.high[0]
        self.target_entropy = -np.prod(self.env.action_space.shape)
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(num_states,
                                 self.num_actions,
                                 max_action=self.action_high,
                                 use_sac=True).to(device)

        self.q_net_1 = QValue(num_states, self.num_actions).to(device)
        self.q_net_target_1 = QValue(num_states, self.num_actions).to(device)
        self.q_net_2 = QValue(num_states, self.num_actions).to(device)
        self.q_net_target_2 = QValue(num_states, self.num_actions).to(device)

        # learnable temperature alpha, initialized to exp(0) = 1
        self.alpha = torch.exp(torch.zeros(1, device=device)).requires_grad_()

        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_sac_alpha.p".format(self.env_id))
            self.policy_net, self.q_net_1, self.q_net_2, self.running_state \
                = pickle.load(open('{}/{}_sac_alpha.p'.format(self.model_path, self.env_id), "rb"))

        self.q_net_target_1.load_state_dict(self.q_net_1.state_dict())
        self.q_net_target_2.load_state_dict(self.q_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a)
        self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(),
                                        lr=self.lr_q)
        self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(),
                                        lr=self.lr_q)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, _ = self.policy_net.rsample(state)
        action = action.cpu().numpy()[0]
        return action, None

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)
            action, _ = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """interact"""
        global_steps = (i_iter - 1) * self.step_per_iter + 1
        log = dict()
        num_steps = 0
        num_episodes = 0
        total_reward = 0
        min_episode_reward = float('inf')
        max_episode_reward = float('-inf')

        while num_steps < self.step_per_iter:
            state = self.env.reset()
            state = self.running_state(state)
            episode_reward = 0

            for t in range(10000):

                if self.render:
                    self.env.render()

                if global_steps < self.explore_size:  # explore
                    action = self.env.action_space.sample()
                else:  # sample an action from the policy
                    action, _ = self.choose_action(state)

                next_state, reward, done, _ = self.env.step(action)
                next_state = self.running_state(next_state)
                mask = 0 if done else 1
                # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
                self.memory.push(state, action, reward, next_state, mask, None)

                episode_reward += reward
                global_steps += 1
                num_steps += 1

                if global_steps >= self.min_update_step and global_steps % self.update_step == 0:
                    for k in range(1, self.update_step + 1):
                        batch = self.memory.sample(
                            self.batch_size)  # random sample batch
                        self.update(batch, k)

                if done or num_steps >= self.step_per_iter:
                    break

                state = next_state

            num_episodes += 1
            total_reward += episode_reward
            min_episode_reward = min(episode_reward, min_episode_reward)
            max_episode_reward = max(episode_reward, max_episode_reward)

        self.env.close()

        log['num_steps'] = num_steps
        log['num_episodes'] = num_episodes
        log['total_reward'] = total_reward
        log['avg_reward'] = total_reward / num_episodes
        log['max_episode_reward'] = max_episode_reward
        log['min_episode_reward'] = min_episode_reward

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}")

        # record reward information
        writer.add_scalar("total reward", log['total_reward'], i_iter)
        writer.add_scalar("average reward", log['avg_reward'], i_iter)
        writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
        writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
        writer.add_scalar("num steps", log['num_steps'], i_iter)

    def update(self, batch, k_iter):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by SAC Alpha
        alg_step_stats = sac_alpha_step(
            self.policy_net, self.q_net_1, self.q_net_2, self.alpha,
            self.q_net_target_1, self.q_net_target_2, self.optimizer_p,
            self.optimizer_q_1, self.optimizer_q_2, self.optimizer_a,
            batch_state, batch_action, batch_reward, batch_next_state,
            batch_mask, self.gamma, self.polyak, self.target_entropy,
            k_iter % self.target_update_delay == 0)

    def save(self, save_path):
        """save model"""
        check_path(save_path)

        pickle.dump(
            (self.policy_net, self.q_net_1, self.q_net_2, self.running_state),
            open('{}/{}_sac_alpha.p'.format(save_path, self.env_id), 'wb'))
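
sac_alpha_step itself is not shown on this page. Given the polyak argument, it presumably ends with a soft update of the two target Q-networks; here is a minimal sketch of that step, under the assumption that it is the standard Polyak average of target and online parameters.

import torch


def soft_update(target_net, source_net, polyak=0.995):
    """Polyak-average the target parameters towards the online parameters:
    theta_target <- polyak * theta_target + (1 - polyak) * theta."""
    with torch.no_grad():
        for t_param, param in zip(target_net.parameters(),
                                  source_net.parameters()):
            t_param.mul_(polyak).add_((1.0 - polyak) * param)

The learnable temperature self.alpha is typically trained by minimizing -alpha * (log_prob + target_entropy) averaged over the batch, which is presumably what optimizer_a minimizes inside sac_alpha_step.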
Example #7
class DQN:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 explore_size=10000,
                 step_per_iter=3000,
                 lr_q=1e-3,
                 gamma=0.99,
                 batch_size=128,
                 min_update_step=1000,
                 epsilon=0.90,
                 update_target_gap=50,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.render = render
        self.num_process = num_process
        self.memory = Memory(size=memory_size)
        self.explore_size = explore_size
        self.step_per_iter = step_per_iter
        self.lr_q = lr_q
        self.gamma = gamma
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_target_gap = update_target_gap
        self.epsilon = epsilon
        self.seed = seed
        self.model_path = model_path

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert not env_continuous, "DQN is only applicable to environments with discrete action spaces"

        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        # initialize networks
        self.value_net = QNet_dqn(num_states, self.num_actions).to(device)
        self.value_net_target = QNet_dqn(num_states,
                                         self.num_actions).to(device)

        self.running_state = ZFilter((num_states, ), clip=5)

        # load model if necessary
        if self.model_path:
            print("Loading Saved Model {}_dqn.p".format(self.env_id))
            self.value_net, self.running_state = pickle.load(
                open('{}/{}_dqn.p'.format(self.model_path, self.env_id), "rb"))

        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer = optim.Adam(self.value_net.parameters(), lr=self.lr_q)

    def choose_action(self, state):
        state = FLOAT(state).unsqueeze(0).to(device)
        if np.random.uniform() <= self.epsilon:
            with torch.no_grad():
                action = self.value_net.get_action(state)
            action = action.cpu().numpy()[0]
        else:  # explore: pick a random action with probability 1 - epsilon
            action = np.random.randint(0, self.num_actions)
        return action

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)
            action = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """interact"""
        global_steps = (i_iter - 1) * self.step_per_iter
        log = dict()
        num_steps = 0
        num_episodes = 0
        total_reward = 0
        min_episode_reward = float('inf')
        max_episode_reward = float('-inf')

        while num_steps < self.step_per_iter:
            state = self.env.reset()
            state = self.running_state(state)
            episode_reward = 0

            for t in range(10000):
                if self.render:
                    self.env.render()

                if global_steps < self.explore_size:  # explore
                    action = self.env.action_space.sample()
                else:  # epsilon-greedy action from the online Q-network
                    action = self.choose_action(state)

                next_state, reward, done, _ = self.env.step(action)
                next_state = self.running_state(next_state)
                mask = 0 if done else 1
                # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
                self.memory.push(state, action, reward, next_state, mask, None)

                episode_reward += reward
                global_steps += 1
                num_steps += 1

                if global_steps >= self.min_update_step:
                    batch = self.memory.sample(
                        self.batch_size)  # random sample batch
                    self.update(batch)

                if global_steps % self.update_target_gap == 0:
                    self.value_net_target.load_state_dict(
                        self.value_net.state_dict())

                if done or num_steps >= self.step_per_iter:
                    break

                state = next_state

            num_episodes += 1
            total_reward += episode_reward
            min_episode_reward = min(episode_reward, min_episode_reward)
            max_episode_reward = max(episode_reward, max_episode_reward)

        self.env.close()

        log['num_steps'] = num_steps
        log['num_episodes'] = num_episodes
        log['total_reward'] = total_reward
        log['avg_reward'] = total_reward / num_episodes
        log['max_episode_reward'] = max_episode_reward
        log['min_episode_reward'] = min_episode_reward

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}")

        # record reward information
        writer.add_scalar("total reward", log['total_reward'], i_iter)
        writer.add_scalar("average reward", log['avg_reward'], i_iter)
        writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
        writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
        writer.add_scalar("num steps", log['num_steps'], i_iter)

    def update(self, batch):
        batch_state = FLOAT(batch.state).to(device)
        batch_action = LONG(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        alg_step_stats = dqn_step(self.value_net, self.optimizer,
                                  self.value_net_target, batch_state,
                                  batch_action, batch_reward, batch_next_state,
                                  batch_mask, self.gamma)

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.value_net, self.running_state),
                    open('{}/{}_dqn.p'.format(save_path, self.env_id), 'wb'))
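
The dqn_step function called in update() is imported from elsewhere in the repository. As a reference, here is a hedged re-implementation of the standard update it presumably performs (TD target from the target network, MSE loss on the online network); the name and details below are assumptions, not the repository's code.

import torch
import torch.nn as nn


def dqn_step_sketch(value_net, optimizer, value_net_target, states, actions,
                    rewards, next_states, masks, gamma):
    """Hypothetical DQN update: y = r + gamma * mask * max_a' Q_target(s', a')."""
    q_values = value_net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        q_next = value_net_target(next_states).max(dim=1)[0]
        q_target = rewards + gamma * masks * q_next

    loss = nn.MSELoss()(q_values, q_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return {"q_loss": loss.item()}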
Example #8
class NaiveDQN:
    def __init__(self,
                 num_states,
                 num_actions,
                 learning_rate=0.01,
                 gamma=0.90,
                 batch_size=128,
                 epsilon=0.90,
                 update_target_gap=50,
                 enable_gpu=False):

        if enable_gpu:
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device("cpu")

        self.gamma = gamma
        self.batch_size = batch_size
        self.update_target_gap = update_target_gap
        self.epsilon = epsilon

        self.num_learn_step = 0

        self.memory = Memory()
        self.eval_net = MLPPolicy(num_states, num_actions).to(self.device)
        self.target_net = MLPPolicy(num_states, num_actions).to(self.device)
        self.optimizer = optim.Adam(self.eval_net.parameters(),
                                    lr=learning_rate)
        self.loss_func = nn.MSELoss()

    # epsilon-greedy action selection
    def choose_action(self, state, num_actions):
        state = torch.tensor(state).unsqueeze(0).to(self.device)
        if np.random.uniform() <= self.epsilon:  # greedy policy
            action_val = self.eval_net(state.float())
            action = torch.max(action_val, 1)[1].cpu().numpy()
            return action[0]
        else:
            action = np.random.randint(0, num_actions)
            return action

    def learn(self):
        # periodically sync the target network (target_net)
        if self.num_learn_step % self.update_target_gap == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.num_learn_step += 1

        # sample a batch of transitions from Memory
        batch = self.memory.sample(self.batch_size)
        batch_state = torch.cat(batch.state).to(self.device)
        batch_action = torch.cat(batch.action).unsqueeze(-1).to(self.device)
        batch_reward = torch.cat(batch.reward).unsqueeze(-1).to(self.device)
        batch_next_state = torch.cat(batch.next_state).to(self.device)

        # train the online network (eval_net)
        q_eval = self.eval_net(batch_state.float()).gather(1, batch_action)
        q_next = self.target_net(batch_next_state.float()).detach()
        # TD target: reward + gamma * max_a Q_target(next_state, a)
        q_target = batch_reward + self.gamma * q_next.max(1)[0].view(
            self.batch_size, 1)
        # compute the MSE loss between q_eval and the TD target
        loss = self.loss_func(q_eval, q_target)

        # backpropagate and update eval_net
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
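
A hypothetical wiring example (not from the repository) showing how NaiveDQN could be driven on CartPole. It assumes the old gym reset/step API used throughout these examples, the repo's MLPPolicy network, and the deque-backed Memory sketched after Example #2 (any buffer with the same push/sample/__len__ interface works). Tensors are pushed pre-batched so the torch.cat calls in learn() concatenate cleanly.

import gym
import torch

env = gym.make("CartPole-v1")
agent = NaiveDQN(num_states=env.observation_space.shape[0],
                 num_actions=env.action_space.n)

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state, env.action_space.n)
        next_state, reward, done, _ = env.step(action)
        # store (1, ...)-shaped tensors so learn() can torch.cat them
        agent.memory.push(torch.FloatTensor([state]),
                          torch.LongTensor([action]),
                          torch.FloatTensor([reward]),
                          torch.FloatTensor([next_state]),
                          None, None)
        state = next_state
        if len(agent.memory) >= agent.batch_size:
            agent.learn()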
Example #9
class DDPG:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_v=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 explore_size=10000,
                 step_per_iter=3000,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 action_noise=0.1,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.polyak = polyak
        self.memory = Memory(memory_size)
        self.explore_size = explore_size
        self.step_per_iter = step_per_iter
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.action_noise = action_noise
        self.model_path = model_path
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert env_continuous, "DDPG is only applicable to environments with continuous action spaces"

        self.action_low, self.action_high = self.env.action_space.low[
            0], self.env.action_space.high[0]
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(num_states, self.num_actions,
                                 self.action_high).to(device)
        self.policy_net_target = Policy(num_states, self.num_actions,
                                        self.action_high).to(device)

        self.value_net = Value(num_states, self.num_actions).to(device)
        self.value_net_target = Value(num_states, self.num_actions).to(device)

        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_ddpg.p".format(self.env_id))
            self.policy_net, self.value_net, self.running_state = pickle.load(
                open('{}/{}_ddpg.p'.format(self.model_path, self.env_id),
                     "rb"))

        self.policy_net_target.load_state_dict(self.policy_net.state_dict())
        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)

    def choose_action(self, state, noise_scale):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, _ = self.policy_net.get_action_log_prob(state)
        action = action.cpu().numpy()[0]
        # add noise
        noise = noise_scale * np.random.randn(self.num_actions)
        action += noise
        action = np.clip(action, -self.action_high, self.action_high)
        return action

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            # state = self.running_state(state)
            action = self.choose_action(state, 0)
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """interact"""
        global_steps = (i_iter - 1) * self.step_per_iter
        log = dict()
        num_steps = 0
        num_episodes = 0
        total_reward = 0
        min_episode_reward = float('inf')
        max_episode_reward = float('-inf')

        while num_steps < self.step_per_iter:
            state = self.env.reset()
            # state = self.running_state(state)
            episode_reward = 0

            for t in range(10000):

                if self.render:
                    self.env.render()

                if global_steps < self.explore_size:  # explore
                    action = self.env.action_space.sample()
                else:  # action with noise
                    action = self.choose_action(state, self.action_noise)

                next_state, reward, done, _ = self.env.step(action)
                # next_state = self.running_state(next_state)
                mask = 0 if done else 1
                # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
                self.memory.push(state, action, reward, next_state, mask, None)

                episode_reward += reward
                global_steps += 1
                num_steps += 1

                if global_steps >= self.min_update_step and global_steps % self.update_step == 0:
                    for _ in range(self.update_step):
                        batch = self.memory.sample(
                            self.batch_size)  # random sample batch
                        self.update(batch)

                if done or num_steps >= self.step_per_iter:
                    break

                state = next_state

            num_episodes += 1
            total_reward += episode_reward
            min_episode_reward = min(episode_reward, min_episode_reward)
            max_episode_reward = max(episode_reward, max_episode_reward)

        self.env.close()

        log['num_steps'] = num_steps
        log['num_episodes'] = num_episodes
        log['total_reward'] = total_reward
        log['avg_reward'] = total_reward / num_episodes
        log['max_episode_reward'] = max_episode_reward
        log['min_episode_reward'] = min_episode_reward

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}")

        # record reward information
        writer.add_scalar("total reward", log['total_reward'], i_iter)
        writer.add_scalar("average reward", log['avg_reward'], i_iter)
        writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
        writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
        writer.add_scalar("num steps", log['num_steps'], i_iter)

    def update(self, batch):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by DDPG
        alg_step_stats = ddpg_step(self.policy_net, self.policy_net_target,
                                   self.value_net, self.value_net_target,
                                   self.optimizer_p, self.optimizer_v,
                                   batch_state, batch_action, batch_reward,
                                   batch_next_state, batch_mask, self.gamma,
                                   self.polyak)

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.policy_net, self.value_net, self.running_state),
                    open('{}/{}_ddpg.p'.format(save_path, self.env_id), 'wb'))
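
As with dqn_step, the ddpg_step function is imported from elsewhere in the repository. Below is a hedged sketch of the standard DDPG update it presumably performs, assuming the policy networks are callable as policy(states) -> actions and the critics as value(states, actions) -> Q-values; the repository's implementation may differ in details.

import torch
import torch.nn as nn


def ddpg_step_sketch(policy_net, policy_net_target, value_net, value_net_target,
                     optimizer_p, optimizer_v, states, actions, rewards,
                     next_states, masks, gamma, polyak):
    """Hypothetical DDPG update: critic TD regression, deterministic policy
    gradient, then polyak-averaged target updates."""
    # critic: regress Q(s, a) towards r + gamma * mask * Q_target(s', pi_target(s'))
    with torch.no_grad():
        next_actions = policy_net_target(next_states)
        q_target = rewards + gamma * masks * value_net_target(
            next_states, next_actions).squeeze(-1)
    value_loss = nn.MSELoss()(value_net(states, actions).squeeze(-1), q_target)
    optimizer_v.zero_grad()
    value_loss.backward()
    optimizer_v.step()

    # actor: maximize Q(s, pi(s)) by minimizing its negative
    policy_loss = -value_net(states, policy_net(states)).mean()
    optimizer_p.zero_grad()
    policy_loss.backward()
    optimizer_p.step()

    # soft (polyak) update of both target networks
    with torch.no_grad():
        for target, source in ((policy_net_target, policy_net),
                               (value_net_target, value_net)):
            for t_param, param in zip(target.parameters(), source.parameters()):
                t_param.mul_(polyak).add_((1.0 - polyak) * param)

    return {"value_loss": value_loss.item(), "policy_loss": policy_loss.item()}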