# Shared imports assumed by all of the snippets below; project-specific names
# (gym_env, A3C, Adam_global, save_model, train, test and the hyperparameters
# such as world, stage, version, actions, lr, num_processes, ...) are defined
# elsewhere in the original module and are not shown here.
import pickle
import timeit
from collections import deque
from os import path

import matplotlib.pyplot as plt
import torch
import torch.multiprocessing as _mp
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


def globalTrain():
    torch.manual_seed(123)

    mp = _mp.get_context('spawn')

    env, num_state, num_action = gym_env(world, stage, version,
                                         actions)  # define environment
    #env.seed(123+idx)

    shared_model = A3C(num_state, num_action)
    shared_model.share_memory()

    #optimizer = Adam_global(shared_model.parameters(), lr=Args.lr, betas = Args.betas ,eps = Args.eps, weight_decay = Args.weight_decay)
    optimizer = Adam_global(shared_model.parameters(),
                            lr=lr,
                            betas=betas,
                            eps=eps,
                            weight_decay=weight_decay)

    processes = []
    counter = mp.Value('i', 0)
    lock = mp.Lock()

    for index in range(num_processes):
        process = mp.Process(target=train,
                             args=(index, shared_model, optimizer, counter,
                                   lock))
        process.start()
        processes.append(process)
    process = mp.Process(target=test, args=(num_processes, shared_model))
    process.start()
    processes.append(process)
    for process in processes:
        process.join()
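
Because globalTrain() builds a 'spawn' multiprocessing context, it has to be called from behind an if __name__ == "__main__": guard, since 'spawn' re-imports the main module in every worker. A minimal launch sketch (an assumption, not part of the original examples):

# Hypothetical launcher for globalTrain(); the guard is required by the
# 'spawn' start method used above.
if __name__ == "__main__":
    globalTrain()
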
Example #2
def dummy_test(idx):
    torch.manual_seed(123 + idx)
    env, num_state, num_action = gym_env(world, stage, version, actions)

    done = True
    with open('record_reward_average.txt', 'rb') as fp:
        reward = pickle.load(fp)

    with open('record_acts.txt', 'rb') as fp:
        acts = pickle.load(fp)

    max_id = reward.index(max(reward))
    acts_8 = acts[max_id]
    print(max(reward))
    print(max_id)

    for act in acts_8:
        if done:
            state = env.reset()
        # use a separate name for the per-step reward so the loaded reward
        # history (plotted below) is not overwritten
        state, step_reward, done, info = env.step(act)
        env.render()

    plt.plot(range(1, len(reward) + 1), reward)
    plt.xlabel('Episode')
    plt.ylabel('Episode Rewards Achieved')
    plt.title('Episode Rewards')
    plt.show()
    plt.close()
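
The gym_env helper used throughout these snippets is not shown. A plausible minimal sketch, assuming a gym-super-mario-bros environment (suggested by the world/stage/version arguments and the info['flag_get'] flag used later); the real helper almost certainly also adds frame preprocessing, which is what would determine the actual num_state:

# Hypothetical sketch of gym_env; not the original implementation.
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace

def gym_env(world, stage, version, actions):
    env = gym_super_mario_bros.make(
        'SuperMarioBros-{}-{}-v{}'.format(world, stage, version))
    env = JoypadSpace(env, actions)  # restrict the NES controller to the given action set
    num_state = env.observation_space.shape[0]   # placeholder; a real wrapper would expose frame channels
    num_action = env.action_space.n
    return env, num_state, num_action
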
Example #3
def test(idx, shared_model):
    torch.manual_seed(123 + idx)
    env, num_state, num_action = gym_env(world, stage, version, actions)
    model = A3C(num_state, num_action)
    # model.load_state_dict(torch.load(path.join(path.dirname(path.abspath(__file__)),'trained_model.pth'),map_location='cpu'))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    step_counter = 0
    total_reward = 0
    acts = deque(maxlen=max_actions)

    while True:
        step_counter += 1

        # re-sync with the latest shared weights at the start of each episode
        if done:
            model.load_state_dict(shared_model.state_dict())

        with torch.no_grad():
            if done:
                hx = torch.zeros((1, 512), dtype=torch.float)
                cx = torch.zeros((1, 512), dtype=torch.float)
            else:
                hx = hx.detach()
                cx = cx.detach()

            action, value, hx, cx = model(state, hx, cx)
            prob = F.softmax(action, dim=-1)
            action = prob.max(1, keepdim=True)[1].numpy()
            state, reward, done, _ = env.step(int(action))
            state = torch.from_numpy(state)
            env.render()
            acts.append(action)
            total_reward += reward

        if done:
            break

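The A3C model class is not included in these snippets either. From the calling code we only know that the constructor takes (num_state, num_action), that forward takes (state, hx, cx) with 512-dimensional LSTM hidden/cell states, and that it returns (action_logits, value, hx, cx). A hypothetical sketch under those constraints (the convolutional stack and the 84x84 input size are assumptions):

# Hypothetical A3C network sketch; not the original architecture.
import torch
import torch.nn as nn

class A3C(nn.Module):
    def __init__(self, num_state, num_action):
        super(A3C, self).__init__()
        # num_state is assumed to be the number of input frame channels
        self.conv1 = nn.Conv2d(num_state, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        # assuming 84x84 input frames, four stride-2 convs leave a 6x6 map
        self.lstm = nn.LSTMCell(32 * 6 * 6, 512)
        self.actor = nn.Linear(512, num_action)  # policy logits
        self.critic = nn.Linear(512, 1)          # state value

    def forward(self, x, hx, cx):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = torch.relu(self.conv4(x))
        hx, cx = self.lstm(x.view(x.size(0), -1), (hx, cx))
        return self.actor(hx), self.critic(hx), hx, cx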

# if __name__ == "__main__":
#     torch.manual_seed(123)

#     env,num_state,num_action = gym_env(world,stage,version,actions)    # define environment
#     #env.seed(123+idx)

#     shared_model = A3C(num_state,num_action)
#     shared_model.share_memory()

#     #optimizer = Adam_global(shared_model.parameters(), lr=Args.lr, betas = Args.betas ,eps = Args.eps, weight_decay = Args.weight_decay)
#     optimizer = Adam_global(shared_model.parameters(), lr=lr, betas = betas ,eps = eps, weight_decay = weight_decay)
#     train(0,shared_model,optimizer,0,0)
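
Adam_global is referenced above but its implementation is not included. A plausible sketch, assuming it is the usual A3C "shared Adam": a torch.optim.Adam subclass whose per-parameter state lives in shared memory so that all worker processes update the same moment estimates (written against a recent PyTorch, where Adam keeps the step count as a tensor):

# Hypothetical Adam_global sketch; not the original implementation.
import torch

class Adam_global(torch.optim.Adam):
    def __init__(self, params, lr=1e-4, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super(Adam_global, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                          weight_decay=weight_decay)
        # pre-create Adam's per-parameter state and put the moment estimates in
        # shared memory so every spawned worker reads and writes the same buffers
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.tensor(0.)
                state['exp_avg'] = torch.zeros_like(p.data).share_memory_()
                state['exp_avg_sq'] = torch.zeros_like(p.data).share_memory_()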
Example #4
def test_global(idx):
    torch.manual_seed(123 + idx)
    env, num_state, num_action = gym_env(world, stage, version, actions)
    model = A3C(num_state, num_action)
    model.load_state_dict(
        torch.load(path.join(path.dirname(path.abspath(__file__)),
                             'trained_model.pth'),
                   map_location='cpu'))
    model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    step_counter = 0
    total_reward = 0
    acts = deque(maxlen=max_actions)

    while True:
        step_counter += 1

        with torch.no_grad():
            if done:
                hx = torch.zeros((1, 512), dtype=torch.float)
                cx = torch.zeros((1, 512), dtype=torch.float)
            else:
                hx = hx.detach()
                cx = cx.detach()

            action, value, hx, cx = model(state, hx, cx)
            prob = F.softmax(action, dim=-1)
            action = prob.max(1, keepdim=True)[1].numpy()
            state, reward, done, info = env.step(int(action))
            state = torch.from_numpy(state)
            env.render()
            acts.append(action)
            total_reward += reward

        if done:
            break
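
The save_model helper called from train() in the next example is not shown. A minimal sketch consistent with test_global() above, which loads 'trained_model.pth' from the script's directory (the filename and location are assumptions carried over from that call):

# Hypothetical save_model sketch; writes the state_dict that test_global() loads.
import torch
from os import path

def save_model(model):
    torch.save(model.state_dict(),
               path.join(path.dirname(path.abspath(__file__)),
                         'trained_model.pth'))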
Example #5
def train(idx, shared_model, optimizer, counter, lock):
    '''
    A3C training loop for a single actor-learner worker.

    Inputs:
    idx: a scalar, the index of this worker (used for seeding)
    shared_model: the global model shared across all workers
    optimizer: the shared optimizer used for the asynchronous updates
    counter: a shared global step counter (multiprocessing Value)
    lock: a multiprocessing Lock protecting the counter

    Returns:
    None
    '''
    # initialization
    torch.manual_seed(123 + idx)
    start = timeit.default_timer()

    env, num_state, num_action = gym_env(world, stage, version,
                                         actions)  # define environment
    env.seed(123 + idx)

    # model = A3C(num_state,num_action)
    model = shared_model  # this worker operates directly on the shared model

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    step_counter = 0
    curr_episode = 0
    terminated = 0
    success = 0
    fail = 0
    acts = []
    record_reward = []
    record_reward_average = []
    record_acts = []
    success_acts = []

    while True:
        curr_episode += 1
        # sync with the shared model
        # model.load_state_dict(shared_model.state_dict())

        # save data
        if curr_episode % 50 == 0:
            interval_timer = timeit.default_timer()
            print('Current episode: {}, terminated: {}, success: {}, '
                  'fail: {}, elapsed time: {:.1f}s'.format(
                      curr_episode, terminated, success, fail,
                      interval_timer - start))

            if curr_episode >= 50:
                with open('record_acts.txt', 'wb') as fp:
                    pickle.dump(record_acts, fp)

                with open('record_reward_average.txt', 'wb') as fp:
                    pickle.dump(record_reward_average, fp)
                save_model(model)

        if done:
            hx = torch.zeros((1, 512), dtype=torch.float)
            cx = torch.zeros((1, 512), dtype=torch.float)
            terminated += 1
        else:
            hx = hx.detach()
            cx = cx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        # reset gradient
        action_loss = 0
        critic_loss = 0

        # repeat until a terminal state or the local step limit is reached
        for step in range(num_local_steps):
            step_counter += 1

            # perform action according to policy
            logits, value, hx, cx = model(state, hx, cx)
            prob = F.softmax(logits, dim=1)  # probability of choosing each action
            log_prob = F.log_softmax(logits, dim=1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            m = Categorical(prob)
            # sample an action from the multinomial (categorical) policy distribution
            action = m.sample().item()
            acts.append(action)

            # receive the reward and the new state
            state, reward, done, info = env.step(action)

            with lock:
                counter.value += 1

            if done or step_counter >= num_global_step:
                step_counter = 0
                state = env.reset()
                if info['flag_get']:
                    success = success + 1
                else:
                    fail = fail + 1

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob[0, action])
            rewards.append(reward)
            record_reward.append(reward)

            if done:
                break

        # obtain critic values
        if not done:
            _, R, _, _ = model(state, hx, cx)
            R = R.detach()
        else:
            R = torch.zeros((1, 1), dtype=torch.float)
            record_acts.append(acts)
            episode_reward = sum(record_reward)  # cumulative reward of the finished episode
            record_reward_average.append(episode_reward)

            if info['flag_get']:
                success_acts.append(acts)
                with open('success_acts.txt', 'wb') as fp:
                    pickle.dump(success_acts, fp)

            record_reward = []
            acts = []

        # gradient ascent on the policy objective (implemented below as descent
        # on total_loss)
        values.append(R)
        estimator = torch.zeros((1, 1), dtype=torch.float)
        for i in reversed(range(len(rewards))):
            R = rewards[i] + discount * R
            advantage_fc = rewards[i] + discount * values[i + 1] - values[i]

            # approximate the actor gradient using the Generalized Advantage Estimator
            estimator = discount * tau * estimator + advantage_fc
            # accumulate gradients wrt the actor
            action_loss = action_loss + log_probs[i] * estimator.detach() \
                + beta * entropies[i]
            # accumulate gradients wrt the critic
            critic_loss = critic_loss + (R - values[i]) ** 2 / 2

        # perform asynchronous update
        optimizer.zero_grad()
        total_loss = critic_loss_coef * critic_loss - action_loss
        total_loss.backward()
        # clip gradients only after backward() has populated them
        nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()

        if info['flag_get']:
            with open('success_acts.txt', 'wb') as fp:
                pickle.dump(record_acts, fp)

            save_model(shared_model)

        if curr_episode == int(num_global_step / num_local_steps):
            end = timeit.default_timer()
            print('Training process {} terminated, ran {} episodes,\n'
                  'with {} successes and {} failures, elapsed time {:.1f}s'.format(
                      idx, terminated, success, fail, end - start))

            return
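
For reference, the backward loop over rewards in train() implements an n-step return combined with Generalized Advantage Estimation; writing discount as γ, tau as λ and beta as the entropy weight β, the quantities it accumulates are

\[
R_i = r_i + \gamma R_{i+1}, \qquad
\delta_i = r_i + \gamma V(s_{i+1}) - V(s_i), \qquad
\hat{A}_i = \delta_i + \gamma \lambda \hat{A}_{i+1},
\]
\[
\text{total\_loss} = \text{critic\_loss\_coef} \cdot \sum_i \tfrac{1}{2}\big(R_i - V(s_i)\big)^2
\;-\; \sum_i \Big(\log \pi(a_i \mid s_i)\,\hat{A}_i + \beta\, H\big(\pi(\cdot \mid s_i)\big)\Big).
\]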