Example #1
def train_agent__off_policy(class_agent, net_dim, batch_size, repeat_times,
                            gamma, reward_scale, cwd, env_name, max_step,
                            max_memo, max_epoch, **_kwargs):  # 2020-06-01
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(
        env, is_print=False)
    '''init'''
    agent = class_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()
    buffer = BufferArray(max_memo, state_dim,
                         action_dim=1 if is_discrete else action_dim)  # experience replay buffer
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name,
                        **_kwargs)  # not essential to the algorithm; records and evaluates training progress
    '''loop'''
    with torch.no_grad():  # update replay buffer
        # rewards, steps = agent.update_buffer(env, buffer, max_step, max_action, reward_scale, gamma)
        rewards, steps = initial_exploration(env, buffer, max_step, max_action,
                                             reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, loss_a=0, loss_c=0)
    is_solved = False  # avoid a NameError if training is interrupted before the first check
    try:
        for epoch in range(max_epoch):
            # update the replay buffer by interacting with the environment
            with torch.no_grad():  # no gradient tracking, to save GPU memory
                rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                     max_action, reward_scale,
                                                     gamma)

            # update network parameters by randomly sampling the buffer for gradient descent
            buffer.init_before_sample()
            loss_a, loss_c = agent.update_parameters(buffer, max_step,
                                                     batch_size, repeat_times)

            # show/check the reward, save the max reward actor
            with torch.no_grad():  # no gradient tracking, to save GPU memory
                # NOTICE! Recorder saves the agent with max reward automatically.
                recorder.show_reward(rewards, steps, loss_a, loss_c)

                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("| raise KeyboardInterrupt and break training loop")
    # except AssertionError:  # for BipedWalker BUG 2020-03-03
    #     print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")

    train_time = recorder.print_and_save_npy(env_name, cwd)

    if is_solved:
        agent.save_or_load_model(cwd, is_save=True)
    # buffer.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
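# A usage sketch of the trainer above. The agent class name and the hyper-parameter
# values are illustrative assumptions, not taken from this file:
# train_agent__off_policy(class_agent=AgentDDPG, net_dim=2 ** 7, batch_size=2 ** 7,
#                         repeat_times=1, gamma=0.99, reward_scale=1.0, cwd='./train_logs',
#                         env_name='Pendulum-v0', max_step=2 ** 10, max_memo=2 ** 17,
#                         max_epoch=2 ** 10)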
Example #2
def train_agent(  # 2020-11-11
        rl_agent, env_name, gpu_id, cwd,
        net_dim, max_memo, max_step, batch_size, repeat_times, reward_scale, gamma,
        break_step, if_break_early, show_gap, eval_times1, eval_times2, **_kwargs):  # 2020-09-18
    env, state_dim, action_dim, target_reward, if_discrete = build_env(env_name, if_print=False)

    '''init: agent, buffer, recorder'''
    recorder = Recorder(eval_size1=eval_times1, eval_size2=eval_times2)
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    if rl_agent.__name__ in {'AgentPPO'}:
        buffer = BufferTupleOnline(max_memo)
    elif rl_agent.__name__ in {'AgentModPPO', 'AgentInterPPO'}:
        buffer = BufferArray(max_memo + max_step, state_dim, action_dim, if_ppo=True)
    else:
        buffer = BufferArray(max_memo, state_dim, action_dim=1 if if_discrete else action_dim, if_ppo=False)
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step, if_discrete, reward_scale, gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)

        '''pre training and hard update before training loop'''
        buffer.update_pointer_before_sample()
        agent.update_policy(buffer, max_step, batch_size, repeat_times)
        if 'act_target' in dir(agent):
            agent.act_target.load_state_dict(agent.act.state_dict())

    '''loop'''
    if_train = True
    while if_train:
        '''update the replay buffer by interacting with the environment'''
        with torch.no_grad():  # speed up running
            rewards, steps = agent.update_buffer(env, buffer, max_step, reward_scale, gamma)

        '''update network parameters by randomly sampling the buffer for gradient descent'''
        loss_a, loss_c = agent.update_policy(buffer, max_step, batch_size, repeat_times)

        '''saves the agent with max reward'''
        recorder.update__record_explore(steps, rewards, loss_a, loss_c)

        if_save = recorder.update__record_evaluate(env, agent.act, max_step, agent.device, if_discrete)
        if if_save:
            recorder.save_act(cwd, agent.act, gpu_id)

        with torch.no_grad():  # no gradient tracking, to save GPU memory
            if_solve = recorder.check_is_solved(target_reward, gpu_id, show_gap, cwd)

        '''break loop rules'''
        if_train = not ((if_break_early and if_solve)
                        or recorder.total_step > break_step
                        or os.path.exists(f'{cwd}/stop'))

    recorder.save_npy__draw_plot(cwd)
    buffer.print_state_norm(env.neg_state_avg, env.div_state_std)
Example #3
def train_offline_policy(rl_agent, net_dim, batch_size, repeat_times, gamma,
                         reward_scale, cwd, env_name, max_step, max_memo,
                         max_total_step, **_kwargs):  # 2020-06-01
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(
        env, is_print=False)
    assert not is_discrete
    '''init: agent, buffer, recorder'''
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()
    buffer = BufferArray(max_memo, state_dim,
                         action_dim)  # experience replay buffer
    recorder = Recorder(agent, max_step, max_action, target_reward, env_name,
                        **_kwargs)  # not essential to the algorithm; records and evaluates training progress
    '''loop'''
    with torch.no_grad():  # update replay buffer
        # rewards, steps = agent.update_buffer(env, buffer, max_step, max_action, reward_scale, gamma)
        rewards, steps = initial_exploration(env, buffer, max_step, max_action,
                                             reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, loss_a=0, loss_c=0)

    while True:
        # update the replay buffer by interacting with the environment
        with torch.no_grad():  # no gradient tracking, to save GPU memory
            rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                 max_action, reward_scale,
                                                 gamma)

        # update network parameters by randomly sampling the buffer for gradient descent
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size,
                                                 repeat_times)

        # show/check the reward, save the max reward actor
        with torch.no_grad():  # no gradient tracking, to save GPU memory
            # NOTICE! Recorder saves the agent with max reward automatically.
            recorder.show_reward(rewards, steps, loss_a, loss_c)

            is_solved = recorder.check_reward(cwd, loss_a, loss_c)
        if is_solved:
            print('Reach target_reward: ', target_reward, recorder.reward_max)
            break
        if recorder.total_step > max_total_step:
            print('Reach target_step: ', max_total_step, recorder.total_step)
            break

    train_time = recorder.print_and_save_npy(env_name, cwd)

    if is_solved:
        agent.save_or_load_model(cwd, is_save=True)
    draw_plot_with_npy(cwd, train_time)
Example #4
def train_agent_discrete(
        class_agent,
        env_name,
        cwd,
        net_dim,
        max_step,
        max_memo,
        max_epoch,  # env
        batch_size,
        gamma,
        update_gap,
        reward_scale,
        **_kwargs):  # 2020-05-20
    env = gym.make(env_name)
    '''init'''
    state_dim, action_dim, action_max, target_reward = get_env_info(
        env, is_print=True)
    assert isinstance(action_max, int)  # means Discrete action space

    agent = class_agent(env, state_dim, action_dim, net_dim)  # training agent
    buffer = BufferArray(max_memo, state_dim,
                         action_dim=1)  # experience replay buffer
    recorder = Recorder(agent, max_step, action_max, target_reward, env_name,
                        **_kwargs)
    '''loop'''
    with torch.no_grad():  # update replay buffer
        rewards, steps = initial_exploration(env, buffer, max_step, action_max,
                                             reward_scale, gamma, action_dim)
    recorder.show_reward(rewards, steps, 0, 0)
    try:
        for epoch in range(max_epoch):
            '''update the replay buffer by interacting with the environment'''
            with torch.no_grad():  # no gradient tracking, to save GPU memory
                rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                     action_max, reward_scale,
                                                     gamma)
            '''update network parameters by randomly sampling the buffer for stochastic gradient descent'''
            loss_a, loss_c = agent.update_parameters(buffer, max_step,
                                                     batch_size, update_gap)
            '''show/check the reward, save the max reward actor'''
            with torch.no_grad():  # no gradient tracking, to save GPU memory
                '''NOTICE! Recorder saves the agent with max reward automatically. '''
                recorder.show_reward(rewards, steps, loss_a, loss_c)

                is_solved = recorder.check_reward(cwd, loss_a, loss_c)
            if is_solved:
                break
    except KeyboardInterrupt:
        print("raise KeyboardInterrupt while training.")
    # except AssertionError:  # for BipedWalker BUG 2020-03-03
    #     print("AssertionError: OpenAI gym r.LengthSquared() > 0.0f ??? Please run again.")
    #     return False

    train_time = recorder.print_and_save_npy(env_name, cwd)

    # agent.save_or_load_model(cwd, is_save=True)  # save max reward agent in Recorder
    # buffer.save_or_load_memo(cwd, is_save=True)

    draw_plot_with_npy(cwd, train_time)
    return True
Example #5
def process__buffer(q_aggr, qs_dist, args, **_kwargs):
    max_memo = args.max_memo
    env_name = args.env_name
    max_step = args.max_step
    batch_size = args.batch_size
    repeat_times = 2

    # reward_scale = args.reward_scale
    # gamma = args.gamma

    '''init'''
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(env, is_print=False)
    buffer = BufferArray(max_memo, state_dim, action_dim)  # experience replay buffer

    workers_num = len(qs_dist)

    '''loop'''
    is_training = True
    while is_training:
        for i in range(workers_num):
            memo_array, is_solved = q_aggr.get()
            buffer.extend_memo(memo_array)
            if is_solved:
                is_training = False

        buffer.init_before_sample()
        for i in range(max_step * repeat_times):
            # batch_arrays = buffer.random_sample(batch_size, device=None) # faster but worse
            for q_dist in qs_dist:
                batch_arrays = buffer.random_sample(batch_size, device=None)  # slower but better
                q_dist.put(batch_arrays)

    print('|| Exit: process__buffer')
Example #6
def process__buffer(q_aggr, qs_dist, args, **_kwargs):
    max_memo = args.max_memo
    env_name = args.env_name
    max_step = args.max_step
    batch_size = args.batch_size
    repeat_times = 2

    reward_scale = args.reward_scale
    gamma = args.gamma
    '''init'''
    env = gym.make(env_name)
    state_dim, action_dim, max_action, target_reward = get_env_info(
        env, be_quiet=False)
    buffer = BufferArray(max_memo, state_dim,
                         action_dim)  # experience replay buffer

    workers_num = len(qs_dist)
    '''loop'''
    with torch.no_grad():  # update replay buffer
        # rewards, steps = agent.update_buffer(
        #     env, buffer, max_step, max_action, reward_scale, gamma)
        rewards, steps = initial_exploration(env, buffer, max_step, max_action,
                                             reward_scale, gamma, action_dim)

    while True:
        for _ in range(workers_num):
            memo_array = q_aggr.get()
            buffer.extend_memo(memo_array)

        buffer.init_before_sample()
        for _ in range(max_step * repeat_times):
            for q_dist in qs_dist:
                batch_arrays = buffer.random_sample(batch_size, device=None)
                q_dist.put(batch_arrays)
Example #7
def train_agent(rl_agent, net_dim, batch_size, repeat_times, gamma,
                reward_scale, cwd, env_name, max_memo, max_step,
                max_total_step, eval_times1, eval_times2, gpu_id, show_gap,
                if_stop, **_kwargs):  # 2020-06-01
    env, state_dim, action_dim, max_action, target_reward, is_discrete = build_gym_env(
        env_name, is_print=False)
    '''init: agent, buffer, recorder'''
    recorder = Recorder(eval_size1=eval_times1,
                        eval_size2=eval_times2)  # todo eva_size1
    agent = rl_agent(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()

    is_online_policy = rl_agent.__name__ in {
        'AgentPPO', 'AgentGAE', 'AgentInterGAE', 'AgentDiscreteGAE'}
    if is_online_policy:
        buffer = BufferTupleOnline(max_memo)
    else:
        buffer = BufferArray(max_memo, state_dim,
                             1 if is_discrete else action_dim)
        with torch.no_grad():  # update replay buffer
            rewards, steps = initial_exploration(env, buffer, max_step,
                                                 max_action, reward_scale,
                                                 gamma, action_dim)
        recorder.update__record_explore(steps, rewards, loss_a=0, loss_c=0)
    '''loop'''
    if_train = True
    while if_train:
        '''update the replay buffer by interacting with the environment'''
        with torch.no_grad():  # no gradient tracking, to save GPU memory
            rewards, steps = agent.update_buffer(env, buffer, max_step,
                                                 max_action, reward_scale,
                                                 gamma)
        '''update network parameters by randomly sampling the buffer for gradient descent'''
        buffer.init_before_sample()
        loss_a, loss_c = agent.update_parameters(buffer, max_step, batch_size,
                                                 repeat_times)
        # if loss_c > 4:  # todo backtracking
        #     agent.save_or_load_model(cwd, if_save=False)
        '''saves the agent with max reward'''
        with torch.no_grad():  # no gradient tracking, to save GPU memory
            recorder.update__record_explore(steps, rewards, loss_a, loss_c)

            if_save = recorder.update__record_evaluate(env, agent.act,
                                                       max_step, max_action,
                                                       agent.device,
                                                       is_discrete)
            if if_save:
                recorder.save_act(cwd, agent.act, gpu_id)
            recorder.save_npy__plot_png(cwd)

            if_solve = recorder.check_is_solved(target_reward, gpu_id,
                                                show_gap)
        '''break loop rules'''
        if_train = not ((if_stop and if_solve)
                        or recorder.total_step > max_total_step
                        or os.path.exists(f'{cwd}/stop.mark'))
    recorder.save_npy__plot_png(cwd)
Example #8
def run__tutorial_discrete_action():
    """It is a DQN tutorial, we need 1min for training.
    This simplify DQN can't work well on harder task.
    Other RL algorithms can work well on harder task but complicated.
    You can change this code and make the training finish in (10 sec, 10k step) as an execrise.
    """

    env_name = 'CartPole-v0'  # a tutorial RL env. We need 10s for training.
    env = gym.make(env_name)  # an OpenAI standard env
    state_dim = 4
    action_dim = 2
    action_max = int(1)
    target_reward = 195.0
    is_discrete = True
    # from AgentRun import get_env_info
    # state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(env, is_print=True)
    # assert is_discrete is True  # DQN is for discrete action space.
    """ You will see the following:
    | env_name: <CartPoleEnv<CartPole-v0>>, action space: Discrete
    | state_dim: 4, action_dim: 2, action_max: 1, target_reward: 195.0
    """
    ''' I copied the code from AgentDQN into the following for this tutorial.'''
    net_dim = 2**7  # the dimension (or width) of network
    learning_rate = 2e-4  # learning rate for Adam Optimizer (ADAM = RMSProp + Momentum)
    max_buffer = 2**12  # the max number of transitions stored in the replay buffer
    max_epoch = 2**12  # max number of training epochs (episodes)
    max_step = 2**9  # the max steps the actor interacts with the env before training the critic
    gamma = 0.99  # reward discount factor (gamma must be less than 1.0)
    batch_size = 2**6  # batch_size for network training
    criterion = torch.nn.MSELoss()  # criterion for critic's q_value estimate
    device = torch.device("cuda" if torch.cuda.is_available() else
                          "cpu")  # choose GPU or CPU automatically
    ''' Is QNet an actor or a critic? DQN is not an Actor-Critic method.
    AgentDQN chooses the action with the largest q value output by Q_Network, so Q_Network is an actor.
    AgentDQN also outputs q_value through Q_Network, so Q_Network is a critic as well.
    '''
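    # A minimal sketch of what QNet is assumed to look like (the real class lives in the
    # network module of this repo; names and layer sizes here are illustrative only,
    # assuming `import torch.nn as nn`):
    #
    #     class QNet(nn.Module):  # maps a state to one q value per discrete action
    #         def __init__(self, state_dim, action_dim, mid_dim):
    #             super().__init__()
    #             self.net = nn.Sequential(
    #                 nn.Linear(state_dim, mid_dim), nn.ReLU(),
    #                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
    #                 nn.Linear(mid_dim, action_dim))
    #
    #         def forward(self, state):
    #             return self.net(state)  # q values, shape (batch_size, action_dim)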
    act = QNet(state_dim, action_dim, net_dim).to(device)
    act.train()
    act_optim = torch.optim.Adam(act.parameters(), lr=learning_rate)

    act_target = QNet(state_dim, action_dim, net_dim).to(device)
    act_target.load_state_dict(act.state_dict())
    act_target.eval()

    # from AgentRun import BufferList # simpler but slower
    from AgentZoo import BufferArray  # faster but a bit complicated
    buffer = BufferArray(
        max_buffer, state_dim,
        action_dim=1)  # experience replay buffer; a discrete action is stored as an int
    '''training loop'''
    self_state = env.reset()
    self_steps = 0  # steps of an episode
    self_r_sum = 0.0  # sum of rewards of an episode with exploration
    total_step = 0  # total steps before training stops

    evaluator = EvaluateRewardSV(env)  # SV: Simplified Version, for the tutorial
    max_reward = evaluator.get_eva_reward__sv(act, max_step, action_max,
                                              is_discrete)
    # the max r_sum without exploration

    start_time = time.time()
    for epoch in range(max_epoch):
        '''update_buffer'''
        explore_rate = 0.1  # explore rate when update_buffer(), epsilon-greedy
        rewards = list()
        steps = list()
        for _ in range(max_step):
            if rd.rand() < explore_rate:  # epsilon-greedy exploration policy for DQN
                action = rd.randint(action_dim)
            else:
                states = torch.tensor((self_state, ),
                                      dtype=torch.float32,
                                      device=device)
                actions = act_target(states).argmax(
                    dim=1).cpu().data.numpy()  # discrete action space
                action = actions[0]
            next_state, reward, done, _ = env.step(action)

            self_r_sum += reward
            self_steps += 1

            mask = 0.0 if done else gamma
            buffer.add_memo((reward, mask, self_state, action, next_state))

            self_state = next_state
            if done:
                rewards.append(self_r_sum)
                self_r_sum = 0.0

                steps.append(self_steps)
                self_steps = 0

                self_state = env.reset()

        total_step += sum(steps)
        avg_reward = np.average(rewards)
        print(end=f'Reward:{avg_reward:6.1f}    Step:{total_step:8}    ')
        '''update_parameters'''
        loss_c_sum = 0.0
        update_times = max_step
        buffer.init_before_sample()  # update the buffer.now_len
        for _ in range(update_times):
            with torch.no_grad():
                rewards, masks, states, actions, next_states = buffer.random_sample(
                    batch_size, device)

                next_q_target = act_target(next_states).max(dim=1,
                                                            keepdim=True)[0]
                q_target = rewards + masks * next_q_target

            act.train()
            actions = actions.type(torch.long)
            q_eval = act(states).gather(1, actions)
            critic_loss = criterion(q_eval, q_target)
            loss_c_sum += critic_loss.item()

            act_optim.zero_grad()
            critic_loss.backward()
            act_optim.step()

            soft_target_update(act_target, act, tau=5e-2)
            # soft_target_update(act_target, act, tau=5e-3)
            ''' A small tau can stabilize training in harder envs.
            You can reduce tau to 5e-3, but this env is too easy to need it.
            You can try harder envs and other DRL algorithms via run__xx() in AgentRun.py.
            '''
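            # Assumed form of soft_target_update (Polyak averaging); the real helper is
            # defined elsewhere in this repo, this is only a sketch of the standard rule:
            #     for tgt, src in zip(act_target.parameters(), act.parameters()):
            #         tgt.data.copy_(tau * src.data + (1.0 - tau) * tgt.data)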

        # loss_a_avg = 0.0
        loss_c_avg = loss_c_sum / update_times
        print(end=f'Loss:{loss_c_avg:6.1f}    ')

        # evaluate the true reward of this agent without exploration
        eva_reward_list = [
            evaluator.get_eva_reward__sv(act, max_step, action_max,
                                         is_discrete) for _ in range(3)
        ]
        eva_reward = np.average(eva_reward_list)
        print(f'TrueReward:{eva_reward:6.1f}')
        if eva_reward > max_reward:
            max_reward = eva_reward

        if max_reward > target_reward:
            print(
                f"|\tReach target_reward: {max_reward:6.1f} > {target_reward:6.1f}"
            )
            break

    used_time = int(time.time() - start_time)
    print(f"|\tTraining UsedTime: {used_time}s")
Example #9
def run__tutorial_continuous_action():
    """It is a DDPG tutorial, we need about 300s for training.
    I hate OU Process because of its lots of hyper-parameters. So this DDPG has no OU Process.
    This simplify DDPG can't work well on harder task.
    Other RL algorithms can work well on harder task but complicated.
    You can change this code and make the training finish in 100s.
    """

    env_name = 'Pendulum-v0'  # a tutorial RL env. We need 300s for training.
    env = gym.make(env_name)  # an OpenAI standard env
    state_dim = 3
    action_dim = 1
    action_max = 2.0
    target_reward = -100.0
    is_discrete = False
    # from AgentRun import get_env_info
    # state_dim, action_dim, max_action, target_reward, is_discrete = get_env_info(
    #     env, is_print=True, target_reward=-100.0)
    # assert is_discrete is False  # DDPG is for continuous action space.
    """ You will see the following:
    | env_name: <PendulumEnv<Pendulum-v0>>, action space: Continuous
    | state_dim: 3, action_dim: 1, action_max: 2.0, target_reward: 0.0
    """
    ''' I copied the code from AgentDQN into the following for this tutorial.'''
    net_dim = 2**5  # the dimension (or width) of network
    learning_rate = 2e-4  # learning rate for Adam Optimizer (ADAM = RMSProp + Momentum)
    max_buffer = 2**14  # the max number of transitions stored in the replay buffer
    max_epoch = 2**12  # max number of training epochs (episodes)
    max_step = 2**8  # the max steps the actor interacts with the env before training the critic
    gamma = 0.99  # reward discount factor (gamma must be less than 1.0)
    batch_size = 2**7  # batch_size for network training
    update_freq = 2**7
    criterion = torch.nn.SmoothL1Loss()  # criterion for critic's q_value estimate
    device = torch.device("cuda" if torch.cuda.is_available() else
                          "cpu")  # choose GPU or CPU automatically

    act_dim = net_dim  # width (hidden dimension) of the actor network, not the action dimension
    act = Actor(state_dim, action_dim, act_dim).to(device)
    act.train()
    act_optim = torch.optim.Adam(act.parameters(), lr=learning_rate)

    act_target = Actor(state_dim, action_dim, act_dim).to(device)
    act_target.load_state_dict(act.state_dict())
    act_target.eval()

    cri_dim = int(net_dim * 1.25)  # width (hidden dimension) of the critic network
    cri = Critic(state_dim, action_dim, cri_dim).to(device)
    cri.train()
    cri_optim = torch.optim.Adam(cri.parameters(), lr=learning_rate)

    cri_target = Critic(state_dim, action_dim, cri_dim).to(device)
    cri_target.load_state_dict(cri.state_dict())
    cri_target.eval()

    # from AgentRun import BufferList # simpler but slower
    from AgentZoo import BufferArray  # faster but a bit complicated
    buffer = BufferArray(max_buffer, state_dim,
                         action_dim)  # experience replay buffer
    '''training loop'''
    self_state = env.reset()
    self_steps = 0  # the steps of an episode
    self_r_sum = 0.0  # the sum of rewards of an episode with exploration
    total_step = 0
    explore_noise = 0.05

    evaluator = EvaluateRewardSV(env)  # SV: Simplified Version, for the tutorial
    max_reward = evaluator.get_eva_reward__sv(act, max_step, action_max,
                                              is_discrete)
    # the max r_sum without exploration

    start_time = time.time()
    while total_step < max_step:  # warm up the replay buffer with random actions before training
        for _ in range(max_step):
            action = rd.uniform(-1, 1, size=action_dim)
            next_state, reward, done, _ = env.step(action * action_max)
            mask = 0.0 if done else gamma
            buffer.add_memo((reward, mask, self_state, action, next_state))
            total_step += 1
            if done:
                self_state = env.reset()
                break
            self_state = next_state

    for epoch in range(max_epoch):
        '''update_buffer'''
        explore_rate = 0.5  # probability of adding Gaussian exploration noise during update_buffer()
        reward_list = list()
        step_list = list()
        for _ in range(max_step):
            states = torch.tensor((self_state, ),
                                  dtype=torch.float32,
                                  device=device)
            actions = act_target(states).cpu().data.numpy()  # continuous action space
            action = actions[0]
            if rd.rand() < explore_rate:
                action = rd.normal(action, explore_noise).clip(-1, +1)

            next_state, reward, done, _ = env.step(action * action_max)

            self_r_sum += reward
            self_steps += 1

            mask = 0.0 if done else gamma
            buffer.add_memo((reward, mask, self_state, action, next_state))

            self_state = next_state
            if done:
                reward_list.append(self_r_sum)
                self_r_sum = 0.0

                step_list.append(self_steps)
                self_steps = 0

                self_state = env.reset()

        total_step += sum(step_list)
        avg_reward = np.average(reward_list)
        print(end=f'Reward:{avg_reward:8.1f}  Step:{total_step:8}  ')
        '''update_parameters'''
        loss_a_sum = 0.0
        loss_c_sum = 0.0
        update_times = max_step
        buffer.init_before_sample()  # update the buffer.now_len
        for i in range(update_times):
            for _ in range(2):  # Two Time-scale Update Rule (TTUR)
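                # TTUR as implemented here: this inner loop updates the critic twice
                # for each single actor update performed after it.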
                with torch.no_grad():
                    reward, mask, state, action, next_state = buffer.random_sample(
                        batch_size, device)

                    next_action = act_target(next_state)
                    next_q_target = cri_target(next_state, next_action)
                    q_target = reward + mask * next_q_target

                q_eval = cri(state, action)
                critic_loss = criterion(q_eval, q_target)
                loss_c_sum += critic_loss.item()

                cri_optim.zero_grad()
                critic_loss.backward()
                cri_optim.step()

            action_pg = act(state)  # policy gradient
            actor_loss = -cri(state, action_pg).mean()  # policy gradient
            loss_a_sum += actor_loss.item()

            act_optim.zero_grad()
            actor_loss.backward()
            act_optim.step()
            '''soft target update'''
            # soft_target_update(cri_target, cri, tau=5e-3)
            # soft_target_update(act_target, act, tau=5e-3)
            '''hard target update'''
            if i % update_freq == 0:
                cri_target.load_state_dict(cri.state_dict())
                act_target.load_state_dict(act.state_dict())

        loss_c_avg = loss_c_sum / (update_times * 2)
        loss_a_avg = loss_a_sum / update_times
        print(end=f'LossC:{loss_c_avg:6.1f}  LossA:{loss_a_avg:6.1f}  ')

        # evaluate the true reward of this agent without exploration
        eva_reward_list = [
            evaluator.get_eva_reward__sv(act, max_step, action_max,
                                         is_discrete) for _ in range(3)
        ]
        eva_reward = np.average(eva_reward_list)
        print(f'TrueReward:{eva_reward:8.1f}')
        if eva_reward > max_reward:
            max_reward = eva_reward

        if max_reward > target_reward:
            print(
                f"|\tReach target_reward: {max_reward:6.1f} > {target_reward:6.1f}"
            )
            break

    used_time = int(time.time() - start_time)
    print(f"|\tTraining UsedTime: {used_time}s")