Example #1
def train(rank, args, shared_model, counter, lock, optimizer):
    torch.manual_seed(args.seed + rank)
    env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    if args.point_cloud_model:
        model = ActorCritic(env.action_space.n)
    else:
        args.frame_dim = env.config['resolution'][-1]
        model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                            args.frame_dim)

    if args.cuda:
        model = model.cuda()
    model.train()

    state = env.reset()
    done = True

    # monitoring
    total_reward_for_num_steps_list = []
    episode_total_rewards_list = []
    avg_reward_for_num_steps_list = []

    total_length = 0
    episode_length = 0
    n_episode = 0
    total_reward_for_episode = 0
    all_rewards_in_episode = []
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())

        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            total_length += 1
            if args.cuda:
                if args.point_cloud_model:
                    state = (state[0].cuda(), state[1].cuda())
                else:
                    state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state, (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)
            log_probs.append(log_prob)

            action_int = action.cpu().numpy()[0][0].item()

            state, reward, done, _ = env.step(action_int, verbose=False)
            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                total_length -= 1
                total_reward_for_episode = sum(all_rewards_in_episode)
                episode_total_rewards_list.append(total_reward_for_episode)
                all_rewards_in_episode = []
                state = env.reset()
                print(
                    'Process {} Episode {} Over with Length: {} and Reward: {: .2f}. Total Trained Length: {}'
                    .format(rank, n_episode, episode_length,
                            total_reward_for_episode, total_length))
                sys.stdout.flush()
                episode_length = 0
                n_episode += 1

            values.append(value)
            rewards.append(reward)
            all_rewards_in_episode.append(reward)

            if done:
                break

        if args.synchronous:
            if total_reward_for_episode >= args.solved_reward:
                print("Process {} Solved with Reward {}".format(
                    rank, total_reward_for_episode))
                env.close()
                break

        total_reward_for_num_steps = sum(rewards)
        total_reward_for_num_steps_list.append(total_reward_for_num_steps)
        avg_reward_for_num_steps = total_reward_for_num_steps / len(rewards)
        avg_reward_for_num_steps_list.append(avg_reward_for_num_steps)

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        gae = torch.zeros(1, 1)
        if args.cuda:
            if args.point_cloud_model:
                state = (state[0].cuda(), state[1].cuda())
            else:
                state = state.cuda()
            R = R.cuda()
            gae = gae.cuda()
        if not done:  # bootstrap: for a non-terminal state, use the critic's value estimate as R
            value, _, _ = model((state, (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        # import pdb; pdb.set_trace()  # a good place for a breakpoint to inspect one training cycle

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                          args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
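
The train() worker above calls ensure_shared_grads() right before optimizer.step(), but that helper is not part of this listing. Below is a minimal sketch of the helper commonly used in A3C-style PyTorch code; treat it as an assumption about this particular codebase rather than its actual implementation.

def ensure_shared_grads(model, shared_model):
    # Alias the worker's gradients onto the shared model so that the shared
    # optimizer's step() applies this worker's update. Once the alias is set
    # in this process, later calls can return early because autograd keeps
    # accumulating into the same gradient tensors.
    # A CUDA worker paired with a CPU shared model would need
    # `param.grad.cpu()` here instead of a plain alias.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad
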
Example #2
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda:
        print('Using', torch.cuda.get_device_name(0))
        torch.cuda.init()

    torch.manual_seed(args.seed)
    args.config_dict = {
        'max_episode_length': args.max_episode_length,
        'point_cloud_model': args.point_cloud_model
    }
    env = AI2ThorEnv(config_dict=args.config_dict)

    if args.point_cloud_model:
        shared_model = ActorCritic(env.action_space.n)
    else:
        args.frame_dim = env.config['resolution'][-1]
        shared_model = ActorCritic(env.action_space.n,
                                   env.observation_space.shape[0],
                                   args.frame_dim)

    if args.cuda:
        shared_model = shared_model.cuda()
    shared_model.share_memory()

    env.close()  # the env above was initialised only to obtain the params needed to build the model

    optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
    optimizer.share_memory()
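
Example #2 only builds the shared model and optimizer; the sketch below shows how they are typically handed to the train()/test() workers with torch.multiprocessing. The argument lists mirror the train() and test() signatures in Examples #1 and #4, but the actual launch code is not part of this listing, so treat it as an assumption.

    # (continues inside the same routine as Example #2; assumes
    #  `import torch.multiprocessing as mp` at module level.
    #  Sharing CUDA tensors across processes typically requires the 'spawn'
    #  start method: mp.set_start_method('spawn').)
    counter = mp.Value('i', 0)   # shared step counter incremented in train()
    lock = mp.Lock()             # lock guarding the counter

    processes = []

    # One evaluation process (rank chosen only to keep its seed distinct).
    p = mp.Process(target=test,
                   args=(args.num_processes, args, shared_model, counter))
    p.start()
    processes.append(p)

    # args.num_processes asynchronous training workers sharing shared_model.
    for rank in range(args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, counter, lock, optimizer))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
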
Example #3
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.frame_dim)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True  # must start as True so the first iteration syncs the model and resets the LSTM state

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    episodes = 0

    vis = Visdom()
    assert vis.check_connection()
    vis.close()
    win = vis.line(X=[0.],
                   Y=[0.],
                   win='testing_Rewards',
                   opts=dict(title='testing_Rewards'))

    while True:
        episode_length += 1
        if args.atari and args.atari_render:
            env.render()
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).float(), (hx, cx)))

        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck,
        # i.e. in test mode an agent could repeat an action ad infinitum
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            print(
                'In test. Episode over because agent repeated action {} times'.
                format(actions.maxlen))
            done = True

        if done:
            print(
                "In test. Time {}, num steps over all threads {}, FPS {:.0f}, episode reward {}, episode length {}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    counter.value, counter.value / (time.time() - start_time),
                    reward_sum, episode_length))
            vis.line(X=[episodes],
                     Y=[reward_sum],
                     win='testing_Rewards',
                     update='append')
            episodes += 1

            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(args.test_sleep_time)  # throttle evaluation so it does not waste resources
            print('testing...')

        state = torch.from_numpy(state)
Example #4
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    if args.point_cloud_model:
        model = ActorCritic(env.action_space.n)
    else:
        args.frame_dim = env.config['resolution'][-1]
        model = ActorCritic(env.action_space.n, env.observation_space.shape[0],
                            args.frame_dim)

    if args.cuda:
        model = model.cuda()

    model.eval()

    state = env.reset()
    reward_sum = 0
    done = True

    save = '{}-steps{}-process{}-lr{}-entropy_coef{}'.format(
        "point" if args.point_cloud_model else "conv", args.num_steps,
        args.num_processes, args.lr, args.entropy_coef)
    save = os.path.join('logs', save)
    os.makedirs(save, exist_ok=True)

    if args.model:
        shared_model.load_state_dict(
            torch.load(os.path.join(save, "solved_ai2thor.pth")))
    else:
        logger = CSVLogger(os.path.join(save, 'test.csv'))
        fields = ['episode_reward', 'frames_rendered']
        logger.log(fields)

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    # actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(deepcopy(shared_model.state_dict()))
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            if args.cuda:
                if args.point_cloud_model:
                    state = (state[0].cuda(), state[1].cuda())
                else:
                    state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state, (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        # log_prob = F.log_softmax(logit, dim=-1)
        # print(prob)
        # entropy = -(log_prob * prob).sum(1, keepdim=True)
        # print(prob.max(1, keepdim=True)[0].cpu().numpy())
        # print(entropy)

        action = prob.max(1, keepdim=True)[1].cpu().numpy()
        state, reward, done, _ = env.step(action[0, 0], verbose=False)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck,
        # i.e. in test mode an agent could repeat an action ad infinitum
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     print('In test. Episode over because agent repeated action {} times'.format(
        #                                                                         actions.maxlen))
        #     done = True

        if done:
            print(
                "Time {}, num steps over all threads {}, FPS {:.0f}, episode reward {: .2f}, episode length {}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    counter.value, counter.value / (time.time() - start_time),
                    reward_sum, episode_length))
            if not args.model:
                logger.log(["{: .2f}".format(reward_sum), counter.value])

            if reward_sum >= args.solved_reward:
                print("Solved Testing with Reward {}".format(reward_sum))
                torch.save(model.state_dict(),
                           os.path.join(save, "solved_ai2thor.pth"))
                env.close()
                if not args.model:
                    logger.close()
                break

            reward_sum = 0
            episode_length = 0
            # actions.clear()
            state = env.reset()
            time.sleep(args.test_sleep_time)
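
CSVLogger in Example #4 comes from a helper module that is not included in this listing. A minimal stand-in consistent with how it is used above (logger.log(row) and logger.close()) could look like the following; the class and method names are taken from the calls in the example, while the implementation itself is an assumption.

import csv

class CSVLogger:
    # Minimal hypothetical logger: one CSV row per log() call.
    def __init__(self, path):
        self._file = open(path, 'w', newline='')
        self._writer = csv.writer(self._file)

    def log(self, row):
        # `row` is a list, e.g. the header or ["{: .2f}".format(reward_sum), counter.value].
        self._writer.writerow(row)
        self._file.flush()

    def close(self):
        self._file.close()
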
Example #5
def train(rank, args, shared_model, counter, lock, device, optimizer=None):
    torch.manual_seed(args.seed + rank)

    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.frame_dim)
    model = model.to(device)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    # monitoring
    total_reward_for_num_steps_list = []
    episode_total_rewards_list = []
    all_rewards_in_episode = []
    avg_reward_for_num_steps_list = []

    total_length = 0
    episode_length = 0
    episodes = 0

    vis = Visdom()
    assert vis.check_connection()
    vis.close()
    vis.line(X=[0.],
             Y=[0.],
             win='training_Rewards' + str(rank),
             opts=dict(title='training_Rewards' + str(rank)))

    while True:
        episodes += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            total_length += 1
            # keep inputs on the same device as the model (moved to `device`
            # above) instead of hard-coding .cuda()
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).float().to(device),
                 (hx.to(device), cx.to(device))))
            value = value.cpu()
            logit = logit.cpu()
            hx = hx.cpu()
            cx = cx.cpu()
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            action_int = action.numpy()[0][0].item()
            state, reward, done, _ = env.step(action_int)

            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            if done:
                total_length -= 1
                total_reward_for_episode = sum(all_rewards_in_episode)
                episode_total_rewards_list.append(total_reward_for_episode)
                all_rewards_in_episode = []

                vis.line(X=[episodes],
                         Y=[total_reward_for_episode],
                         win='training_Rewards' + str(rank),
                         update='append')
                print(
                    'In Train. Episode Over. Total Length: {}. Total reward for episode: {}'
                    .format(total_length, total_reward_for_episode))
                print('In Train. Step no: {}. total length: {}'.format(
                    episode_length, total_length))

                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            all_rewards_in_episode.append(reward)

            if done:
                break

        # No interaction with environment below.
        # Monitoring
        total_reward_for_num_steps = sum(rewards)  # accumulate at each step
        total_reward_for_num_steps_list.append(total_reward_for_num_steps)
        avg_reward_for_num_steps = total_reward_for_num_steps / len(rewards)
        avg_reward_for_num_steps_list.append(avg_reward_for_num_steps)

        # Backprop and optimisation
        R = torch.zeros(1, 1)
        if not done:  # bootstrap: for a non-terminal state, use the critic's value estimate as R
            value, _, _ = model((state.unsqueeze(0).float().to(device),
                                 (hx.to(device), cx.to(device))))
            R = value.detach().cpu()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        # import pdb; pdb.set_trace()  # a good place for a breakpoint to inspect one training cycle
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - log_probs[i] * gae.detach() - \
                          args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
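
Example #2 builds the optimizer with my_optim.SharedAdam, while the fallback in Example #5 (a plain optim.Adam over the shared parameters) keeps Adam's moment estimates local to each worker process. The usual SharedAdam pattern puts those statistics in shared memory as well, so all workers update one set of moments. The sketch below follows that pattern; it is an assumption about what my_optim provides, not its actual implementation.

import math
import torch
import torch.optim as optim

class SharedAdam(optim.Adam):
    # Adam whose per-parameter statistics live in shared memory (sketch).
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        # Called once in the parent process (see Example #2) so that the
        # moment buffers are visible to all workers.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                # Update biased first and second moment estimates in place.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'].item()
                bias_correction2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss
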
Example #6
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    if args.atari:
        env = create_atari_env(args.atari_env_name)
    else:
        args.config_dict = {'max_episode_length': args.max_episode_length}
        env = AI2ThorEnv(config_dict=args.config_dict)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n, args.frame_dim)
    if args.cuda:
        model = model.cuda()

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if args.atari and args.atari_render:
            env.render()
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 64)
            hx = torch.zeros(1, 64)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            if args.cuda:
                state = state.cuda()
                cx = cx.cuda()
                hx = hx.cuda()
            value, logit, (hx, cx) = model((state.unsqueeze(0).float(), (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        # log_prob = F.log_softmax(logit, dim=-1)
        # print(prob)
        # entropy = -(log_prob * prob).sum(1, keepdim=True)
        # print(prob.max(1, keepdim=True)[0].cpu().numpy())
        # print(entropy)

        action = prob.max(1, keepdim=True)[1].cpu().numpy()
        state, reward, done, _ = env.step(action[0, 0], verbose=False)
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck,
        # i.e. in test mode an agent could repeat an action ad infinitum
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            print('In test. Episode over because agent repeated action {} times'
                  .format(actions.maxlen))
            done = True

        if done:
            print("Time {}, num steps over all threads {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))

            if reward_sum >= args.solved_reward:
                print("Solved Testing with Reward {}".format(reward_sum))
                torch.save(model.state_dict(), "solved_{}.pth".format("atari" if args.atari else "ai2thor"))
                env.close()
                break

            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(args.test_sleep_time)

        state = torch.from_numpy(state)