Example No. 1
def test(args, shared_model):
    ptitle('Test Agent')
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    env = create_env(args.env)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.model = A3C_MLP(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.eval()
    max_score = 0
    while True:
        if player.done:
            player.model.load_state_dict(shared_model.state_dict())

        player.action_test()
        reward_sum += player.reward

        if player.done:
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                state_to_save = player.model.state_dict()
                torch.save(state_to_save,
                           '{0}{1}.dat'.format(args.save_model_dir, args.env))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()
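This test routine is meant to run in its own process while the training workers keep updating shared_model. A minimal sketch of how it might be launched, assuming a main script along the lines of Examples No. 5 and No. 10:

import torch.multiprocessing as mp

# hypothetical launch of the test agent next to the training workers;
# args and shared_model are assumed to be built as in Examples No. 5 / No. 10
test_process = mp.Process(target=test, args=(args, shared_model))
test_process.start()
test_process.join()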
Example No. 2
    def test(self, iteration, show='none', save_max=False):
        env = create_env(self.args)
 
        player = Agent(None, env, self.args, None)
        player.gpu_id = self.gpu_id
        if self.args.model == 'MLP':
            player.model = A3C_MLP(
                player.env.observation_space.shape[0], player.env.action_space, self.args.stack_frames)
        if self.args.model == 'CONV':
            player.model = A3C_CONV(self.args.stack_frames, player.env.action_space)

        # load the input model
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                player.model.load_state_dict(self.shared_model.state_dict())
        else:
            player.model.load_state_dict(self.shared_model.state_dict())
        
        player.state = player.env.reset(self.args)
        player.state = torch.from_numpy(player.state).float()
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                player.model = player.model.cuda()
                player.state = player.state.cuda()
        player.model.eval()

        while True:
            player.action_test()
            if self.args.show != 'none' or show != 'none':
                player.env.render()

            self.reward_sum += player.reward

            if player.done:
                self.num_tests += 1
                self.reward_total_sum += self.reward_sum
                reward_mean = self.reward_total_sum / self.num_tests
                self.reward_sum = 0
                player.eps_len = 0
                state = player.env.reset(self.args)
                player.state = torch.from_numpy(state).float()
                if self.gpu_id >= 0:
                    with torch.cuda.device(self.gpu_id):
                        player.state = player.state.cuda()
                if self.args.show != 'none' or show != 'none':
                    player.env.close()
                break
        return self.reward_total_sum
Example No. 3
    def _init_model(self,
                    model_type,
                    env,
                    stack_frames=0,
                    load=False,
                    load_file="./model.bin"):
        if model_type == 'MLP':
            model = A3C_MLP(env.observation_space.shape[0], env.action_space,
                            stack_frames)
        elif model_type == 'CONV':
            model = A3C_CONV(stack_frames, env.action_space)
        else:
            raise ValueError('unknown model type: {}'.format(model_type))
        if load:
            saved_state = torch.load(load_file,
                                     map_location=lambda storage, loc: storage)
            model.load_state_dict(saved_state)
        return model
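A brief usage sketch of this helper; the call site is hypothetical and the stack_frames value and checkpoint path are illustrative only:

# inside the owning class, e.g. during its construction (illustrative only)
self.model = self._init_model(model_type='MLP',
                              env=env,
                              stack_frames=1,
                              load=True,
                              load_file='./model.bin')
self.model.eval()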
Example No. 4
def train(rank, args, shared_model, optimizer):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = create_env(args.env, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if args.model == 'MLP':
        player.model = A3C_MLP(player.env.observation_space.shape[0],
                               player.env.action_space, args.stack_frames)
    if args.model == 'CONV':
        player.model = A3C_CONV(args.stack_frames, player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 128).cuda())
                    player.hx = Variable(torch.zeros(1, 128).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 128))
                player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):

            player.action_train()

            if player.done:
                break

        if player.done:
            player.eps_len = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = torch.zeros(1, 1).cuda()
        else:
            R = torch.zeros(1, 1)
        if not player.done:
            state = player.state
            if args.model == 'CONV':
                state = state.unsqueeze(0)
            value, _, _, _ = player.model(
                (Variable(state), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
        else:
            gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation (GAE):
            # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), accumulated into gae
            # with a (gamma * tau) discount.
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                (player.log_probs[i].sum() * Variable(gae)) - \
                (0.01 * player.entropies[i].sum())

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
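ensure_shared_grads is called above but not shown in any of these snippets. A minimal sketch of what such a helper typically does, copying each worker gradient onto the shared model and moving it to the CPU when the worker ran on a GPU; this is an assumption, not necessarily the original implementation:

def ensure_shared_grads(model, shared_model, gpu=False):
    # Point each shared parameter's gradient at the worker's gradient.
    # When the worker ran on a GPU the gradient is moved to the CPU first,
    # since the shared model lives in CPU shared memory.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            return  # shared grads already set for this step
        if gpu:
            shared_param._grad = param.grad.cpu()
        else:
            shared_param._grad = param.grad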
Example No. 5
# https://github.com/pytorch/examples/tree/master/mnist_hogwild
# Training settings
# Implemented multiprocessing using locks but was not beneficial. Hogwild
# training was far superior

if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    env = create_env(args.env, args)
    if args.model == 'MLP':
        shared_model = A3C_MLP(env.observation_space.shape[0],
                               env.action_space, args.stack_frames)
    if args.model == 'CONV':
        shared_model = A3C_CONV(args.stack_frames, env.action_space)
    if args.load:
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
        optimizer.share_memory()
    else:
        optimizer = None
Example No. 6
torch.set_default_tensor_type('torch.FloatTensor')

print("begin loading models")
saved_state = torch.load(
    '{0}{1}.dat'.format(args.load_model_dir, args.env),
    map_location=lambda storage, loc: storage)
print("finished loading models")

torch.manual_seed(args.seed)

env = create_env(args.env, -1)
num_tests = 0
reward_total_sum = 0
player = Agent(None, env, args, None, -1)
player.model = A3C_MLP(env.observation_space, env.action_space, args.stack_frames)

if args.new_gym_eval:
    player.env = gym.wrappers.Monitor(
        player.env, "{}_monitor".format(args.env), force=True)

player.model.load_state_dict(saved_state)

player.model.eval()
for i_episode in range(1):
    speed = []
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.eps_len = 0
    reward_sum = 0
    while True:
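        # The snippet is cut off here; a minimal, assumed continuation of the
        # evaluation loop, modeled on the other test examples above:
        player.action_test()
        reward_sum += player.reward
        if player.done:
            num_tests += 1
            reward_total_sum += reward_sum
            print("episode reward {0}, episode length {1}, reward mean {2:.4f}".format(
                reward_sum, player.eps_len, reward_total_sum / num_tests))
            break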
Example No. 7
def test(args, shared_model):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger(
        '{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = create_env(args.env, args)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if args.model == 'MLP':
        player.model = A3C_MLP(
            player.env.observation_space.shape[0], player.env.action_space, args.stack_frames)
    if args.model == 'CONV':
        player.model = A3C_CONV(args.stack_frames, player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    player.model.eval()
    while True:
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())

        player.action_test()
        reward_sum += player.reward

        if player.done:
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}".
                format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            if reward_sum > args.save_score_level:
                player.model.load_state_dict(shared_model.state_dict())
                state_to_save = player.model.state_dict()
                torch.save(state_to_save, '{0}{1}.dat'.format(
                    args.save_model_dir, args.env))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
Example No. 8
def test(rank, args, shared_model):
    writer = SummaryWriter('8_27_test')
    model_buffer = Model_Buffer(args)
    test_episodes = args.test_episodes
    ptitle('Test Agent')
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    print("logfile check", r'{0} {1}_log'.format(args.log_dir, args.env))

    print("logs in test", args.log_dir)

    log['{}_log'.format(args.env)] = logging.getLogger(  # store the logger in a dict
        '{}_log'.format(args.env))
    d_args = vars(args)  # vars() returns the object's attributes and values as a dict
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(
            k, d_args[k]))  # log every argument value

    # for i in range(100):
    #     log['{}_log'.format(args.env)].info('{0}'.format(i))

    # print('we prefix seed = -1 when testing')
    # args.seed = -1
    torch.manual_seed(args.seed)
    env = create_env(args.env, args.seed)
    # env = gym.make(args.env)
    # env.seed(args.seed)

    start_time = time.time()
    num_tests = 0  # number of test episodes played so far
    player = Agent(None, env, args, None, rank)
    player.model = A3C_MLP(player.env.observation_space,
                           player.env.action_space,
                           args.stack_frames)  # build the model
    player.state = player.env.reset()  # initial state
    player.state = torch.from_numpy(player.state).float()
    player.done = True

    player.model.eval()  # switch to eval mode

    is_model_empty = True
    is_testing = False
    training_steps = 0  # initialize so the exit check at the bottom of the loop is always defined
    while True:
        model_buffer.put(shared_model)
        # after a full test round, re-initialize for the next one
        if player.done and np.mod(num_tests,
                                  test_episodes) == 0 and not is_testing:
            reward_episode = 0
            success_rate = 0
            load_model = model_buffer.get()  # fetch a snapshot of the shared model
            model_queue_size = model_buffer.qsize()
            if load_model:
                is_testing = True
                is_model_empty = False
                training_steps = load_model[1]
                training_episodes = load_model[2]
                # load the shared model's saved parameters (load_model[0]) into the player's model
                player.model.load_state_dict(load_model[0])
            else:
                is_model_empty = True  # no shared model available yet
                time.sleep(10)

        if not is_model_empty:
            player.action_test()
            # log['{}_log'.format(args.env)].info("test steps {}".format(1))
            reward_episode += player.reward
            if 'is_success' in player.info.keys():  # did this episode end in success?
                success_rate += 1
            if player.done:  # done when the goal is reached, the agent crashes, or it strays too far
                # print("crash detected")
                # eps_len_temp = player.eps_len
                num_tests += 1  # one more finished test episode
                player.eps_len = 0  # reset the step count for the new episode
                state = player.env.reset()
                player.state = torch.from_numpy(state).float()

                if np.mod(num_tests, test_episodes) == 0:  # a full test round is done; aggregate statistics
                    is_testing = False
                    reward_episode = reward_episode / test_episodes
                    writer.add_scalar('success_num/Test', success_rate,
                                      training_steps)
                    success_rate = success_rate / test_episodes
                    log['{}_log'.format(args.env)].info(
                        "Time {0}, training episodes {1}, training steps {2}, reward episode {3}, success_rate {4}, "
                        "model cached {5}".format(
                            time.strftime(
                                "%Hh %Mm %Ss",
                                time.gmtime(time.time() - start_time)),
                            training_episodes, training_steps, reward_episode,
                            success_rate, model_queue_size))

                    writer.add_scalar('success_rate/Test', success_rate,
                                      training_steps)
                    # save model:
                    state_to_save = player.model.state_dict()
                    # torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env))
                    # torch.save(state_to_save, '{0}{1}_pre.dat'.format(args.save_model_dir, args.env))

                    torch.save(state_to_save,
                               '{0}{1}.dat'.format(args.log_dir, args.env))
                    torch.save(state_to_save,
                               '{0}{1}_pre.dat'.format(args.log_dir, args.env))
        if training_steps > args.training_steps:
            break
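Model_Buffer is used above but never defined in these snippets. A minimal sketch of such a buffer, assuming the shared model carries the training_steps weight/bias counters that Example No. 12 uses as global step and episode counts:

from collections import deque


class Model_Buffer:
    """Minimal sketch (an assumption, not the original class): queues snapshots
    of the shared model together with the training counters that the test loop
    above reads back as load_model[0], [1] and [2]."""

    def __init__(self, args, maxsize=100):
        self.queue = deque(maxlen=maxsize)

    def put(self, shared_model):
        # snapshot the parameters plus the step/episode counters stored on the model
        self.queue.append((
            {k: v.cpu().clone() for k, v in shared_model.state_dict().items()},
            int(shared_model.training_steps.weight.data.cpu().numpy()[0]),
            int(shared_model.training_steps.bias.data.cpu().numpy()[0]),
        ))

    def get(self):
        return self.queue.popleft() if self.queue else None

    def qsize(self):
        return len(self.queue)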
Example No. 9
def train(rank,
          args,
          input_model=None,
          max_iter=100000,
          step_test=-1,
          log=False):
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = create_env(args)
    env.seed(args.seed + rank)

    if log:
        log = setup_logger("{0}_{1}_log".format(args.scale_legs, rank),
                           "logs/{0}_{1}_log".format(args.scale_legs, rank))

    # player initialization
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if args.model == 'MLP':
        player.model = A3C_MLP(player.env.observation_space.shape[0],
                               player.env.action_space, args.stack_frames)
    if args.model == 'CONV':
        player.model = A3C_CONV(args.stack_frames, player.env.action_space)

    # load the input model to the player
    if input_model is not None:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(input_model.state_dict())
        else:
            player.model.load_state_dict(input_model.state_dict())

    # initialize the player optimizer
    optimizer = None
    if args.optimizer == 'RMSprop':
        optimizer = optim.RMSprop(player.model.dictForOptimizer(), lr=args.lr)
    elif args.optimizer == 'Adam':
        optimizer = optim.Adam(player.model.dictForOptimizer(), lr=args.lr)
    else:
        optimizer = optim.SGD(player.model.dictForOptimizer(), lr=args.lr)

    # reset the environment and initialize the player state
    player.state = player.env.reset(args)
    player.state = torch.from_numpy(player.state).float()

    # If on GPU, do as GPU
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()

    player.model.train()

    last_iter = 0

    mean_buf = Buffer(5)
    # Start looping over episodes
    for iteration in range(max_iter):
        last_iter += iteration

        # reset cx and hx if the environment episode is over.
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 128).cuda())
                    player.hx = Variable(torch.zeros(1, 128).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 128))
                player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        # Roll out up to num_steps actions (or until the episode ends) and collect rewards
        for step in range(args.num_steps):
            player.action_train()

            if player.done:
                break

        if player.done:
            player.eps_len = 0
            # reset state
            state = player.env.reset(args)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = torch.zeros(1, 1).cuda()
        else:
            R = torch.zeros(1, 1)

        if not player.done:
            state = player.state
            if args.model == 'CONV':
                state = state.unsqueeze(0)
            value, _, _, _ = player.model(
                (Variable(state), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
        else:
            gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                (player.log_probs[i].sum() * Variable(gae)) - \
                (0.01 * player.entropies[i].sum())

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        optimizer.step()
        player.clear_actions()

        if step_test > 0 and iteration % step_test == 0:
            tester = Tester(args, player.model)
            score = tester.test(last_iter)
            mean_buf.push(score)
            recent_mean = sum(mean_buf.bf) / mean_buf.current_size
            text = "Iteration {0}, episode reward {1}, recent reward mean {2}".format(
                iteration, score, recent_mean)
            log.info(text)

    tester = Tester(args, player.model)
    fitness = tester.test(last_iter)

    return fitness
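Buffer(5) and its bf / current_size attributes are used above without a definition. A minimal sketch of such a fixed-size score buffer (an assumption, not the original class):

class Buffer:
    """Keeps the most recent `size` scores in self.bf; self.current_size is the
    number of scores currently stored, used by the caller to compute a mean."""

    def __init__(self, size):
        self.size = size
        self.bf = []
        self.current_size = 0

    def push(self, value):
        self.bf.append(value)
        if len(self.bf) > self.size:
            self.bf.pop(0)  # drop the oldest score
        self.current_size = len(self.bf)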
Example No. 10
parser.add_argument('--amsgrad',
                    default=True,
                    metavar='AM',
                    help='Adam optimizer amsgrad parameter')

# Based on
# https://github.com/pytorch/examples/tree/master/mnist_hogwild
# Training settings
# Implemented multiprocessing using locks but was not beneficial. Hogwild
# training was far superior

if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    env = create_env(args.env)
    shared_model = A3C_MLP(env.observation_space.shape[0], env.action_space)
    if args.load:
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        optimizer = SharedAdam(shared_model.parameters(),
                               lr=args.lr,
                               amsgrad=args.amsgrad)
        optimizer.share_memory()
    else:
        optimizer = None
Example No. 11
setup_logger('{}_mon_log'.format(args.env),
             r'{0}{1}_mon_log'.format(args.log_dir, args.env))
log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format(
    args.env))

torch.manual_seed(args.seed)

d_args = vars(args)
for k in d_args.keys():
    log['{}_mon_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

env = create_env("{}".format(args.env))
num_tests = 0
reward_total_sum = 0
player = Agent(None, env, args, None)
player.model = A3C_MLP(env.observation_space.shape[0], env.action_space)

if args.new_gym_eval:
    player.env = gym.wrappers.Monitor(player.env,
                                      "{}_monitor".format(args.env),
                                      force=True)

player.model.load_state_dict(saved_state)

player.model.eval()
best_reward = 0
best_speed = 0
for i_episode in range(args.num_episodes):
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.eps_len = 0
Example No. 12
def train(rank, args, shared_model, optimizer):  # the optimizer belongs to shared_model
    init = True
    ptitle('Training Agent: {}'.format(rank))
    torch.manual_seed(args.seed + rank)
    env = create_env(args.env, args.seed + rank)
    # env = gym.make(args.env)
    # env.seed(args.seed + rank)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    player = Agent(None, env, args, None, rank)
    player.model = A3C_MLP(player.env.observation_space,
                           player.env.action_space, args.stack_frames)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()  # keep the model in train mode

    if rank == 1:
        # file = open(os.path.join(args.log_dir, 'TD_Error.txt'), 'w+')
        writer = SummaryWriter('8_27_train')

    local_step_counter = 0
    while True:
        if init:  # initialize the shared step counters
            shared_model.training_steps.weight.data \
                .copy_(torch.Tensor([0]))
            shared_model.training_steps.bias.data \
                .copy_(torch.Tensor([0]))
            init = False
        player.model.load_state_dict(
            shared_model.state_dict())  # synchronize parameters
        for step in range(args.num_steps):
            # print("thread", rank, local_step_counter, shared_model.training_steps.weight.data.cpu().numpy())
            local_step_counter += 1  # update step counters
            shared_model.training_steps.weight.data \
                .copy_(torch.Tensor([1]) + shared_model.training_steps.weight.data)  # global step counter T (summed over all workers) increases by one

            player.action_train()  # core
            if player.done:
                break

        terminal = False
        if player.done or player.eps_len >= args.max_episode_length:  # episode finished or hit the maximum episode length
            terminal = True
            shared_model.done_nums += 1
            if 'is_success' in player.info.keys():
                shared_model.success_num += 1

        R = torch.zeros(1)
        if not player.done:  # bootstrap the return from the current state
            state = player.state
            # in A3C the value and policy heads share a single network
            value, _, _ = player.model(Variable(state))
            R = value.data

        if terminal:  # reset for the next episode
            shared_model.training_steps.bias.data \
                .copy_(torch.Tensor([1]) + shared_model.training_steps.bias.data)  # increment the global episode counter
            player.eps_len = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.reset_flag = True

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + float(player.rewards[i])  # discounted return
            advantage = R - player.values[i]  # advantage
            value_loss = value_loss + 0.5 * advantage.pow(2)  # value-loss term (Eq. 10), used to update w
            if rank == 1:
                # file.write(str(advantage.pow(2).data.cpu().numpy()[0]))
                # file.write(' ')
                # file.write(
                #     str(int(shared_model.training_steps.weight.data.cpu().numpy()[0])))
                # file.write('\n')
                writer.add_scalar(
                    'TD-error/train',
                    advantage.pow(2).data.cpu().numpy()[0],
                    shared_model.training_steps.weight.data.cpu().numpy()[0])

            player.values[i] = player.values[i].float()
            player.values[i + 1] = player.values[i + 1].float()
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - \
                player.values[i].data  # one-step TD error
            # Generalized Advantage Estimation (GAE)
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                (player.log_probs[i].sum() * Variable(gae)) - \
                (0.01 * player.entropies[i].sum())  # policy-gradient term (Eq. 9), used to update theta
        """
        After interacting with its environment for a number of steps, each worker
        computes the gradients of its own network's loss, but it does not apply
        them to its local network; instead it applies them to the shared network.
        In other words, the n workers independently use their accumulated gradients
        to update the shared model's parameters. Periodically, each worker copies
        the shared network's parameters back into its own network to guide its
        subsequent interaction with the environment.
        """
        player.model.zero_grad()
        # the total loss is policy_loss + 0.5 * value_loss
        if rank == 1:
            writer.add_scalar(
                'VLoss/train', value_loss,
                shared_model.training_steps.weight.data.cpu().numpy()[0])
            writer.add_scalar(
                'PLoss/train', policy_loss,
                shared_model.training_steps.weight.data.cpu().numpy()[0])
        (policy_loss + 0.5 * value_loss).backward()  # compute this worker's loss gradients
        ensure_shared_grads(player.model, shared_model)  # copy this worker's gradients onto the shared model
        optimizer.step()  # the optimizer belongs to shared_model, so step() applies the update to the shared parameters
        player.clear_actions()
        if shared_model.training_steps.weight.data.cpu().numpy() > args.training_steps:
            print('num of success={0},training episodes={1},success_rate={2}'.
                  format(shared_model.success_num, shared_model.done_nums,
                         shared_model.success_num / shared_model.done_nums))
            break